AUX column scan(MCOL-5021) effectively disables vectorized scanning on

ARM platforms. This patch resolves this issue and unifies AUX column processing on x86 and ARM using the template class SimdProcessor. The patch also replaces the uint16_t mask previously used in column.cpp and the SimdProcessor code with the native masks that each platform uses, e.g. __m128i or __m128 on x86 and a variety of masks on ARM. To unify the processing I introduced a new filtering compare operator, COMPARE_NULLEQ, with 'c1 IS NULL' semantics.
2025-04-18 21:44:02 +03:00 · 2022-08-26 19:37:40 +08:00 · 2022-08-26 19:37:40 +08:00 · 7d76dc4534
commit 7d76dc4534
parent 85a6121f76
7 changed files with 1531 additions and 1407 deletions
--- a/dbcon/joblist/primitivemsg.h
+++ b/dbcon/joblist/primitivemsg.h
@ -42,13 +42,14 @@
 // from blocksize.h
 const int32_t DATA_BLOCK_SIZE = BLOCK_SIZE;

-const int8_t COMPARE_NIL = 0x00;
+const int8_t COMPARE_NIL = 0x00;  // means c = NULL predicate
 const int8_t COMPARE_LT = 0x01;
 const int8_t COMPARE_EQ = 0x02;
 const int8_t COMPARE_LE = (COMPARE_LT | COMPARE_EQ);  // 0x03
 const int8_t COMPARE_GT = 0x04;
 const int8_t COMPARE_NE = (COMPARE_LT | COMPARE_GT);  // 0x05
 const int8_t COMPARE_GE = (COMPARE_GT | COMPARE_EQ);  // 0x06
+// COMPARE_NULLEQ is a separate operator id; it is not meant as the LT | EQ | GT flag
+// combination even though 0x07 equals that combination numerically.
+const int8_t COMPARE_NULLEQ = 0x07;                   // means c IS NULL (see column.cpp for details)
 const int8_t COMPARE_NOT = 0x08;
 const int8_t COMPARE_NLT = (COMPARE_LT | COMPARE_NOT);  // 0x09
 const int8_t COMPARE_NLE = (COMPARE_LE | COMPARE_NOT);  // 0x0b
@ -884,4 +885,3 @@ struct LbidAtVer
 #endif

 #pragma pack(pop)
-
--- a/primitives/linux-port/column.cpp
+++ b/primitives/linux-port/column.cpp
--- a/primitives/linux-port/primitiveprocessor.h
+++ b/primitives/linux-port/primitiveprocessor.h
@ -55,7 +55,7 @@
 class PrimTest;

 // XXX: turn off dictionary range setting during scan.
-#define	XXX_PRIMITIVES_TOKEN_RANGES_XXX
+#define XXX_PRIMITIVES_TOKEN_RANGES_XXX

 namespace primitives
 {
@ -472,6 +472,103 @@ class PrimitiveProcessor
  friend class ::PrimTest;
 };

+// Bit pattern representing NULL value for given column type/width
+// TBD Use TypeHandler
+// 16-byte overload, selected when sizeof(T) == sizeof(int128_t).
+// Always returns datatypes::Decimal128Null; the `type` argument is not consulted.
+template <typename T, typename std::enable_if<sizeof(T) == sizeof(int128_t), T>::type* = nullptr>
+T getNullValue(uint8_t type)
+{
+  return datatypes::Decimal128Null;
+}
+
+// 8-byte overload, selected when sizeof(T) == sizeof(int64_t).
+// Maps the column type to the NULL constant returned for it:
+//   DOUBLE, UDOUBLE                                   -> joblist::DOUBLENULL
+//   CHAR, VARCHAR, DATE, DATETIME, TIMESTAMP, TIME,
+//   VARBINARY, BLOB, TEXT                             -> joblist::CHAR8NULL
+//   UBIGINT                                           -> joblist::UBIGINTNULL
+//   any other type                                    -> joblist::BIGINTNULL
+template <typename T, typename std::enable_if<sizeof(T) == sizeof(int64_t), T>::type* = nullptr>
+T getNullValue(uint8_t type)
+{
+  switch (type)
+  {
+    case execplan::CalpontSystemCatalog::DOUBLE:
+    case execplan::CalpontSystemCatalog::UDOUBLE: return joblist::DOUBLENULL;
+
+    case execplan::CalpontSystemCatalog::CHAR:
+    case execplan::CalpontSystemCatalog::VARCHAR:
+    case execplan::CalpontSystemCatalog::DATE:
+    case execplan::CalpontSystemCatalog::DATETIME:
+    case execplan::CalpontSystemCatalog::TIMESTAMP:
+    case execplan::CalpontSystemCatalog::TIME:
+    case execplan::CalpontSystemCatalog::VARBINARY:
+    case execplan::CalpontSystemCatalog::BLOB:
+    case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR8NULL;
+
+    case execplan::CalpontSystemCatalog::UBIGINT: return joblist::UBIGINTNULL;
+
+    default: return joblist::BIGINTNULL;
+  }
+}
+
+// 4-byte overload, selected when sizeof(T) == sizeof(int32_t).
+// Maps the column type to the NULL constant returned for it:
+//   FLOAT, UFLOAT                    -> joblist::FLOATNULL
+//   CHAR, VARCHAR, BLOB, TEXT        -> joblist::CHAR4NULL
+//   DATE, DATETIME, TIMESTAMP, TIME  -> joblist::DATENULL
+//   UINT, UMEDINT                    -> joblist::UINTNULL
+//   any other type                   -> joblist::INTNULL
+template <typename T, typename std::enable_if<sizeof(T) == sizeof(int32_t), T>::type* = nullptr>
+T getNullValue(uint8_t type)
+{
+  switch (type)
+  {
+    case execplan::CalpontSystemCatalog::FLOAT:
+    case execplan::CalpontSystemCatalog::UFLOAT: return joblist::FLOATNULL;
+
+    case execplan::CalpontSystemCatalog::CHAR:
+    case execplan::CalpontSystemCatalog::VARCHAR:
+    case execplan::CalpontSystemCatalog::BLOB:
+    case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR4NULL;
+
+    case execplan::CalpontSystemCatalog::DATE:
+    case execplan::CalpontSystemCatalog::DATETIME:
+    case execplan::CalpontSystemCatalog::TIMESTAMP:
+    case execplan::CalpontSystemCatalog::TIME: return joblist::DATENULL;
+
+    case execplan::CalpontSystemCatalog::UINT:
+    case execplan::CalpontSystemCatalog::UMEDINT: return joblist::UINTNULL;
+
+    default: return joblist::INTNULL;
+  }
+}
+
+// 2-byte overload, selected when sizeof(T) == sizeof(int16_t).
+// Maps the column type to the NULL constant returned for it:
+//   CHAR, VARCHAR, BLOB, TEXT, DATE, DATETIME, TIMESTAMP, TIME -> joblist::CHAR2NULL
+//   USMALLINT                                                  -> joblist::USMALLINTNULL
+//   any other type                                             -> joblist::SMALLINTNULL
+template <typename T, typename std::enable_if<sizeof(T) == sizeof(int16_t), T>::type* = nullptr>
+T getNullValue(uint8_t type)
+{
+  switch (type)
+  {
+    case execplan::CalpontSystemCatalog::CHAR:
+    case execplan::CalpontSystemCatalog::VARCHAR:
+    case execplan::CalpontSystemCatalog::BLOB:
+    case execplan::CalpontSystemCatalog::TEXT:
+    case execplan::CalpontSystemCatalog::DATE:
+    case execplan::CalpontSystemCatalog::DATETIME:
+    case execplan::CalpontSystemCatalog::TIMESTAMP:
+    case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR2NULL;
+
+    case execplan::CalpontSystemCatalog::USMALLINT: return joblist::USMALLINTNULL;
+
+    default: return joblist::SMALLINTNULL;
+  }
+}
+
+// 1-byte overload, selected when sizeof(T) == sizeof(int8_t).
+// Maps the column type to the NULL constant returned for it:
+//   CHAR, VARCHAR, BLOB, TEXT, DATE, DATETIME, TIMESTAMP, TIME -> joblist::CHAR1NULL
+//   UTINYINT                                                   -> joblist::UTINYINTNULL
+//   any other type                                             -> joblist::TINYINTNULL
+template <typename T, typename std::enable_if<sizeof(T) == sizeof(int8_t), T>::type* = nullptr>
+T getNullValue(uint8_t type)
+{
+  switch (type)
+  {
+    case execplan::CalpontSystemCatalog::CHAR:
+    case execplan::CalpontSystemCatalog::VARCHAR:
+    case execplan::CalpontSystemCatalog::BLOB:
+    case execplan::CalpontSystemCatalog::TEXT:
+    case execplan::CalpontSystemCatalog::DATE:
+    case execplan::CalpontSystemCatalog::DATETIME:
+    case execplan::CalpontSystemCatalog::TIMESTAMP:
+    case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR1NULL;
+
+    case execplan::CalpontSystemCatalog::UTINYINT: return joblist::UTINYINTNULL;
+
+    default: return joblist::TINYINTNULL;
+  }
+}
+
 //
 // COMPILE A COLUMN FILTER
 //
@ -518,13 +615,32 @@ boost::shared_ptr<ParsedColumnFilter> _parseColumnFilter(
    // Pointer to ColArgs structure representing argIndex'th element in the BLOB
    auto args = reinterpret_cast<const ColArgs*>(filterString + (argIndex * filterSize));

-    ret->prestored_cops[argIndex] = args->COP;
    ret->prestored_rfs[argIndex] = args->rf;

-    if (datatypes::isUnsigned((execplan::CalpontSystemCatalog::ColDataType)colType))
-      ret->storeFilterArg(argIndex, reinterpret_cast<const UT*>(args->val));
+    auto colDataType = (execplan::CalpontSystemCatalog::ColDataType)colType;
+    bool isNullEqCmp = false;
+    if (datatypes::isUnsigned(colDataType))
+    {
+      const auto nullValue = getNullValue<UT>(colDataType);
+      const UT* filterValue = reinterpret_cast<const UT*>(args->val);
+      isNullEqCmp =
+          (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false;
+      ret->storeFilterArg(argIndex, filterValue);
+    }
    else
-      ret->storeFilterArg(argIndex, reinterpret_cast<const T*>(args->val));
+    {
+      const auto nullValue = getNullValue<T>(colDataType);
+      const T* filterValue = reinterpret_cast<const T*>(args->val);
+      isNullEqCmp =
+          (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false;
+      ret->storeFilterArg(argIndex, filterValue);
+    }
+
+    // An IS NULL filtering expression is translated into COMPARE_EQ + NULL magic in the filter.
+    // This `if` replaces the operation id once to avoid additional branching in the main loop
+    // of vectorizedFiltering_ in column.cpp.
+    // It would be cleaner to place it into EM though.
+    ret->prestored_cops[argIndex] = (isNullEqCmp) ? COMPARE_NULLEQ : args->COP;
  }

  /*  Decide which structure to use.  I think the only cases where we can use the set
@ -575,4 +691,3 @@ boost::shared_ptr<ParsedColumnFilter> _parseColumnFilter(
 }

 }  // namespace primitives
-
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -55,6 +55,7 @@ if (WITH_UNITTESTS)
    gtest_add_tests(TARGET column_scan_filter_tests TEST_PREFIX columnstore:)

    add_executable(simd_processors simd_processors.cpp)
+    target_compile_options(simd_processors PRIVATE -Wno-error)
    add_dependencies(simd_processors googletest)
    target_link_libraries(simd_processors ${ENGINE_LDFLAGS} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS} ${GTEST_LIBRARIES} processor dbbc)
    gtest_add_tests(TARGET simd_processors TEST_PREFIX columnstore:)
--- a/tests/simd_processors.cpp
+++ b/tests/simd_processors.cpp
@ -15,8 +15,8 @@
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA. */

-
 #include <cstdint>
+#include <functional>
 #include <iostream>
 #include <type_traits>
 #include <gtest/gtest.h>
@ -25,465 +25,496 @@
 #include "simd_sse.h"
 #include "simd_arm.h"
 #if defined(__x86_64__)
-  #define TESTS_USING_SSE 1
-  using float64_t = double;
-  using float32_t = float;
+#define TESTS_USING_SSE 1
+using float64_t = double;
+using float32_t = float;
 #endif
 #ifdef __aarch64__
-  #define TESTS_USING_ARM 1
+#define TESTS_USING_ARM 1
 #endif

 using namespace std;
-
+#if defined(__x86_64__) || __aarch64__
 template <typename T>
-class SimdProcessorTypedTest : public testing::Test {
-public:
+class SimdProcessorTypedTest : public testing::Test
+{
+ public:
  using IntegralType = T;
-  #if TESTS_USING_SSE
-    using SimdType = std::conditional_t<std::is_same<T, float>::value,
-                                        simd::vi128f_wr,
-                                        std::conditional_t<std::is_same<T, double>::value,
-                                                           simd::vi128d_wr,
-                                                           simd::vi128_wr>>;
-    using Proc = typename simd::SimdFilterProcessor<SimdType, T>;
-    #else
-    using Proc = typename simd::SimdFilterProcessor<typename simd::TypeToVecWrapperType<T>::WrapperType, T>;
-  #endif
+#if TESTS_USING_SSE
+  using SimdType =
+      std::conditional_t<std::is_same<T, float>::value, simd::vi128f_wr,
+                         std::conditional_t<std::is_same<T, double>::value, simd::vi128d_wr, simd::vi128_wr>>;
+  using Proc = typename simd::SimdFilterProcessor<SimdType, T>;
+#else
+  using Proc = typename simd::SimdFilterProcessor<typename simd::TypeToVecWrapperType<T>::WrapperType, T>;
+#endif
  void SetUp() override
  {
  }
 };

-using SimdProcessor128TypedTestTypes = ::testing::Types<uint64_t, uint32_t, uint16_t, uint8_t, int64_t, int32_t, int16_t, int8_t>;
+using SimdProcessor128TypedTestTypes =
+    ::testing::Types<uint64_t, uint32_t, uint16_t, uint8_t, int64_t, int32_t, int16_t, int8_t>;
 TYPED_TEST_SUITE(SimdProcessorTypedTest, SimdProcessor128TypedTestTypes);

 TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128)
 {
  using Proc = typename SimdProcessorTypedTest<TypeParam>::Proc;
+
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
+
  using SimdType = typename Proc::SimdType;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  constexpr static simd::MT allFalse = 0x0;
  Proc proc;
+  const typename Proc::MaskType allTrue = proc.trueMask();
+  const typename Proc::MaskType allFalse = proc.falseMask();
+
  SimdType lhs = proc.loadValue((TypeParam)-2);
  SimdType rhs = proc.loadValue((TypeParam)-3);
  EXPECT_GT((uint64_t)-2LL, (uint64_t)-3LL);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), allTrue);
-  EXPECT_EQ(proc.cmpGe(rhs, lhs), allFalse);
-  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
-  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
-  EXPECT_EQ(proc.cmpLt(rhs, lhs), allTrue);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), allFalse);
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
-  EXPECT_EQ(proc.cmpEq(rhs, lhs), allFalse);
-  EXPECT_EQ(proc.cmpNe(rhs, lhs), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allTrue));
  lhs = proc.loadValue((TypeParam)-3);
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue);
-  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse));

  lhs = rhs = proc.loadValue((TypeParam)0);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), allFalse);
-  EXPECT_EQ(proc.cmpGe(rhs, lhs), allTrue);
-  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
-  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
-  EXPECT_EQ(proc.cmpLt(rhs, lhs), allFalse);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), allTrue);
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
-  EXPECT_EQ(proc.cmpEq(rhs, lhs), allTrue);
-  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse));
 }

+// Test helper: builds the expected comparison mask for two arrays of VecSize elements.
+// For every index i where cmp(l[i], r[i]) holds, all bits of the i-th
+// sizeof(IntegralType)-wide lane of the result are set; the other lanes stay zero.
+// The result is filled through two consecutive 64-bit words: the first VecSize / 2 lanes
+// go to word 0, the remaining ones to word 1.
+// NOTE(review): this assumes ResultType is a 16-byte mask initializable from two values -
+// confirm for every Proc::MaskType used on ARM.
+// When printOut is true each matching pair is dumped to stdout for debugging.
+template <typename IntegralType, typename ResultType, int VecSize>
+ResultType bitMaskProducerT(const IntegralType* l, const IntegralType* r,
+                            std::function<bool(IntegralType, IntegralType)> cmp, const bool printOut = false)
+{
+  constexpr size_t laneBits = sizeof(IntegralType) * 8;
+  constexpr size_t halfSize = VecSize >> 1;
+
+  // All-ones pattern that covers exactly one lane.
+  uint64_t allOnes = 0xFFULL;
+  for (size_t i = 1; i < sizeof(IntegralType); ++i)
+  {
+    allOnes |= 0xFFULL << (i * 8);
+  }
+  ResultType result = {0x0, 0x0};
+
+  uint64_t* resultPtr = reinterpret_cast<uint64_t*>(&result);
+  for (size_t i = 0; i < VecSize; ++i)
+  {
+    if (!cmp(l[i], r[i]))
+    {
+      continue;
+    }
+    if (printOut)
+    {
+      // Unary plus promotes 8-bit values to int so they print as numbers; it also keeps
+      // the sign of negative values and does not truncate 64-bit ones.
+      std::cout << "i " << i << " cmp " << cmp.target_type().name() << " l " << +l[i] << " r " << +r[i]
+                << std::endl;
+    }
+    const size_t word = (i < halfSize) ? 0 : 1;
+    const size_t lane = i - word * halfSize;
+    resultPtr[word] |= allOnes << (lane * laneBits);
+  }
+  return result;
+}

 TEST(SimdProcessorTest, Int8)
 {
-    using Proc = typename SimdProcessorTypedTest<int8_t>::Proc;
-    using SimdType = typename Proc::SimdType;
-    Proc proc;
-    constexpr static simd::MT allTrue = 0xFFFF;
-    simd::MT expect = 0x0;
-    int8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
-    int8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
-    int8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
-    int8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
-    SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
-    SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
-    SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
-    SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-    for (int i = 0; i < 16; i++)
-      if (l[i] > r[i])
-        expect |= 1 << i;
-    EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-    EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect);
-    SimdType testmax = proc.max(lhs, rhs);
-    SimdType testmin = proc.min(lhs, rhs);
-    EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-    EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
-
-    expect = 0x0;
-    for (int i = 0; i < 16; i++)
-      if (l[i] == r[i])
-        expect |= 1 << i;
-    EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-    EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
-
-    expect = 0x0;
-    for (int i = 0; i < 16; i++)
-      if (l[i] < r[i])
-        expect |= 1 << i;
-    EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-    EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
-}
-TEST(SimdProcessorTest, Uint8)
-{
-  using Proc = typename SimdProcessorTypedTest<uint8_t>::Proc;
+  using IntegralType = int8_t;
+  IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
+  IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
+  IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  using IntegralType = int8_t;
+  using Proc = typename SimdProcessorTypedTest<IntegralType>::Proc;
+  constexpr const size_t VecSize = Proc::vecByteSize;
  using SimdType = typename Proc::SimdType;
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  uint8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
-  uint8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
-  uint8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
-  uint8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  const typename Proc::MaskType allTrue = proc.trueMask();
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 16; i++)
-    if (l[i] > r[i])
-      expect |= 1 << i;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect);
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 16; i++)
-    if (l[i] == r[i])
-      expect |= 1 << i;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs),(simd::MT) ~expect);
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
+}

-  expect = 0x0;
-  for (int i = 0; i < 16; i++)
-    if (l[i] < r[i])
-      expect |= 1 << i;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs),(simd::MT) ~expect);
+// Verifies the uint8_t SIMD comparison, min and max operations: every mask returned by the
+// processor is compared with the mask bitMaskProducerT builds element by element.
+TEST(SimdProcessorTest, Uint8)
+{
+  using IntegralType = uint8_t;
+  IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
+  IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
+  IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  using Proc = typename SimdProcessorTypedTest<IntegralType>::Proc;
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
+  using SimdType = typename Proc::SimdType;
+  // Masks are compared byte-wise with memcmp.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
+
+  Proc proc;
+  const Proc::MaskType allTrue = proc.trueMask();
+
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
+  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
+
+  // printOut stays false like in the sibling tests; enable it only while debugging locally.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
+  SimdType testmax = proc.max(lhs, rhs);
+  SimdType testmin = proc.min(lhs, rhs);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));
+
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));
+
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }

 TEST(SimdProcessorTest, Int16)
 {
+  using IntegralType = int16_t;
+  IntegralType l[8]{0, 1, 2, -5, 4, 3, -8, 200};
+  IntegralType r[8]{0, 105, -8, 35, 24, 13, 8, 100};
+  IntegralType minlr[8]{0, 1, -8, -5, 4, 3, -8, 100};
+  IntegralType maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200};
  using Proc = typename SimdProcessorTypedTest<int16_t>::Proc;
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  int16_t l[8]{0, 1, 2, -5, 4, 3, -8, 200};
-  int16_t r[8]{0, 105, -8, 35, 24, 13, 8, 100};
-  int16_t minlr[8]{0, 1, -8, -5, 4, 3, -8, 100};
-  int16_t maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200};
+  const Proc::MaskType allTrue = proc.trueMask();
+
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 8; i++)
-    if (l[i] > r[i])
-      expect |= 3 << i * 2;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 8; i++)
-    if (l[i] == r[i])
-      expect |= 3 << i * 2;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 8; i++)
-    if (l[i] < r[i])
-      expect |= 3 << i * 2;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 TEST(SimdProcessorTest, Uint16)
 {
-  using Proc = typename SimdProcessorTypedTest<uint16_t>::Proc;
+  // Verifies the uint16_t SIMD comparison, min and max operations against masks that
+  // bitMaskProducerT builds element by element.
+  using IntegralType = uint16_t;
+  IntegralType l[8]{0, 1, 2, 5, 4, 3, 8, 5};
+  IntegralType r[8]{0, 1, 8, 35, 24, 13, 8, 17};
+  IntegralType minlr[8]{0, 1, 2, 5, 4, 3, 8, 5};
+  IntegralType maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17};
+
+  // The processor must be instantiated for the unsigned type under test; using the
+  // int16_t processor here would silently exercise the signed code path instead.
+  using Proc = typename SimdProcessorTypedTest<uint16_t>::Proc;
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
+  // Masks are compared byte-wise with memcmp.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  uint16_t l[8]{0, 1, 2, 5, 4, 3, 8, 5};
-  uint16_t r[8]{0, 1, 8, 35, 24, 13, 8, 17};
-  uint16_t minlr[8]{0, 1, 2, 5, 4, 3, 8, 5};
-  uint16_t maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17};
+  const Proc::MaskType allTrue = proc.trueMask();
+
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 8; i++)
-    if (l[i] > r[i])
-      expect |= 3 << i*2;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 8; i++)
-    if (l[i] == r[i])
-      expect |= 3 << i * 2;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 8; i++)
-    if (l[i] < r[i])
-      expect |= 3 << i * 2;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }

 TEST(SimdProcessorTest, Int32)
 {
+  using IntegralType = int32_t;
+  IntegralType l[8]{0, 1, 2, -5};
+  IntegralType r[8]{0, 105, -8, 54333};
+  IntegralType minlr[8]{0, 1, -8, -5};
+  IntegralType maxlr[8]{0, 105, 2, 54333};
  using Proc = typename SimdProcessorTypedTest<int32_t>::Proc;
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
+  auto cmpEqFunctor = [](Proc::MaskType left, Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(Proc::MaskType)); };
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  int32_t l[8]{0, 1, 2, -5};
-  int32_t r[8]{0, 105, -8,54333};
-  int32_t minlr[8]{0, 1, -8, -5};
-  int32_t maxlr[8]{0, 105, 2, 54333};
+  const Proc::MaskType allTrue = proc.trueMask();
+
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 4; i++)
-    if (l[i] > r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] == r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] < r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 // SimdProcessorTest.Uint32: checks the lane-wise comparisons (Gt/Le, Eq/Ne, Lt/Ge) and min/max of the
 // uint32_t SimdProcessor. In this diff the '+' lines compare native Proc::MaskType values, while the
 // '-' lines are the earlier expectations that were accumulated into a simd::MT integer bitmask.
 TEST(SimdProcessorTest, Uint32)
 {
+  using IntegralType = uint32_t;
+  IntegralType l[4]{0, 1002, 2, 514};
+  IntegralType r[4]{2, 1, 80555, 35};
  // minlr/maxlr list the expected per-lane min and max of l and r. They are declared with 8 slots,
  // so the slots after the listed values are zero-initialized.
+  IntegralType minlr[8]{0, 1, 2, 35};
+  IntegralType maxlr[8]{2, 1002, 80555, 514};
  using Proc = typename SimdProcessorTypedTest<uint32_t>::Proc;
  // Vector byte size divided by the element size, i.e. how many IntegralType lanes fit in one vector.
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
  // Masks are compared bytewise: the functor is true when both MaskType objects have identical bytes.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
  // bitMaskProducerT is defined outside this hunk; presumably it applies the given predicate to each
  // lane pair and returns the expected mask. NOTE(review): meaning of its trailing bool is not visible here.
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  uint32_t l[4]{0, 1002, 2, 514};
-  uint32_t r[4]{2, 1, 80555, 35};
-  uint32_t minlr[8]{0, 1, 2, 35};
-  uint32_t maxlr[8]{2, 1002, 80555, 514};
  // The reference "all true" mask now comes from the processor instead of the literal 0xFFFF.
+  const Proc::MaskType allTrue = proc.trueMask();
+
  // Load both operands and the expected min/max arrays as SimdType values.
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 4; i++)
-    if (l[i] > r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
  // cmpGt must match the std::greater mask and cmpLe must match its bitwise complement.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  // max/min of the operands must equal the precomputed maxlr/minlr vectors (cmpEq result == allTrue).
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] == r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
  // cmpEq must match the std::equal_to mask and cmpNe must match its bitwise complement.
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] < r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
  // cmpLt must match the std::less mask and cmpGe must match its bitwise complement.
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 // SimdProcessorTest.Int64: checks the lane-wise comparisons (Gt/Le, Eq/Ne, Lt/Ge) and min/max of the
 // int64_t SimdProcessor, including a negative operand. The '+' lines compare native Proc::MaskType
 // values; the '-' lines are the earlier expectations accumulated into a simd::MT integer bitmask.
 TEST(SimdProcessorTest, Int64)
 {
+  using IntegralType = int64_t;
+  IntegralType l[2]{-5, 122020};
+  IntegralType r[2]{0, 105};
  // Expected per-lane min and max of l and r; declared with 8 slots, the unlisted ones are zero-initialized.
+  IntegralType minlr[8]{-5, 105};
+  IntegralType maxlr[8]{0, 122020};
  using Proc = typename SimdProcessorTypedTest<int64_t>::Proc;
  // Vector byte size divided by the element size, i.e. how many IntegralType lanes fit in one vector.
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
  // Bytewise mask comparison: true when both MaskType objects have identical bytes.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
  // bitMaskProducerT is defined outside this hunk; presumably it evaluates the predicate per lane pair
  // and returns the expected mask. NOTE(review): meaning of its trailing bool is not visible here.
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  int64_t l[2]{-5, 122020};
-  int64_t r[2]{0, 105};
-  int64_t minlr[8]{-5, 105};
-  int64_t maxlr[8]{0, 122020};
  // The reference "all true" mask now comes from the processor instead of the literal 0xFFFF.
+  const Proc::MaskType allTrue = proc.trueMask();
+
  // Load both operands and the expected min/max arrays as SimdType values.
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 2; i++)
-    if (l[i] > r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
  // cmpGt must match the std::greater mask and cmpLe must match its bitwise complement.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  // max/min of the operands must equal the precomputed maxlr/minlr vectors (cmpEq result == allTrue).
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] == r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
  // cmpEq must match the std::equal_to mask and cmpNe must match its bitwise complement.
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] < r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
  // cmpLt must match the std::less mask and cmpGe must match its bitwise complement.
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 // SimdProcessorTest.Uint64: checks the lane-wise comparisons (Gt/Le, Eq/Ne, Lt/Ge) and min/max of the
 // uint64_t SimdProcessor. The '+' lines compare native Proc::MaskType values; the '-' lines are the
 // earlier expectations accumulated into a simd::MT integer bitmask.
 TEST(SimdProcessorTest, Uint64)
 {
+  using IntegralType = uint64_t;
+  IntegralType l[2]{822, 1002};
+  IntegralType r[2]{2, 1};
  // Every l lane is larger than its r lane here, so the expected min equals r and the expected max equals l.
  // Both arrays are declared with 8 slots; the unlisted ones are zero-initialized.
+  IntegralType minlr[8]{2, 1};
+  IntegralType maxlr[8]{822, 1002};
  using Proc = typename SimdProcessorTypedTest<uint64_t>::Proc;
  // Vector byte size divided by the element size, i.e. how many IntegralType lanes fit in one vector.
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
  // Bytewise mask comparison: true when both MaskType objects have identical bytes.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
  // bitMaskProducerT is defined outside this hunk; presumably it evaluates the predicate per lane pair
  // and returns the expected mask. NOTE(review): meaning of its trailing bool is not visible here.
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  uint64_t l[2]{822, 1002};
-  uint64_t r[2]{2, 1};
-  uint64_t minlr[8]{2, 1};
-  uint64_t maxlr[8]{822, 1002};
  // The reference "all true" mask now comes from the processor instead of the literal 0xFFFF.
+  const Proc::MaskType allTrue = proc.trueMask();
+
  // Load both operands and the expected min/max arrays as SimdType values.
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 2; i++)
-    if (l[i] > r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
  // cmpGt must match the std::greater mask and cmpLe must match its bitwise complement.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  // max/min of the operands must equal the precomputed maxlr/minlr vectors (cmpEq result == allTrue).
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] == r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
  // cmpEq must match the std::equal_to mask and cmpNe must match its bitwise complement.
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] < r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
  // cmpLt must match the std::less mask and cmpGe must match its bitwise complement.
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 // SimdProcessorTest.Float64: checks the lane-wise comparisons (Gt/Le, Eq/Ne, Lt/Ge) and min/max of the
 // float64_t SimdProcessor. The '+' lines compare native Proc::MaskType values; the '-' lines are the
 // earlier expectations accumulated into a simd::MT integer bitmask.
 TEST(SimdProcessorTest, Float64)
 {
-  using Proc = typename SimdProcessorTypedTest<double>::Proc;
  // NOTE(review): the alias is called IntegralType for symmetry with the integer tests, but here it
  // names a floating-point type.
+  using IntegralType = float64_t;
+  IntegralType l[2]{-5.0, 12.5620};
+  IntegralType r[2]{2.9, 1};
  // Expected per-lane min and max of l and r; declared with 8 slots, the unlisted ones are zero-initialized.
+  IntegralType minlr[8]{-5.0, 1};
+  IntegralType maxlr[8]{2.9, 12.5620};
+  using Proc = typename SimdProcessorTypedTest<IntegralType>::Proc;
  // Vector byte size divided by the element size, i.e. how many IntegralType lanes fit in one vector.
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
  // Bytewise mask comparison: true when both MaskType objects have identical bytes.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
  // bitMaskProducerT is defined outside this hunk; presumably it evaluates the predicate per lane pair
  // and returns the expected mask. NOTE(review): meaning of its trailing bool is not visible here.
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  float64_t l[2]{-5.0, 12.5620};
-  float64_t r[2]{2.9, 1};
-  float64_t minlr[8]{-5.0, 1};
-  float64_t maxlr[8]{2.9, 12.5620};
  // The reference "all true" mask now comes from the processor instead of the literal 0xFFFF.
+  const Proc::MaskType allTrue = proc.trueMask();
+
  // Load both operands and the expected min/max arrays as SimdType values.
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 2; i++)
-    if (l[i] > r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
  // cmpGt must match the std::greater mask and cmpLe must match its bitwise complement.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  // max/min of the operands must equal the precomputed maxlr/minlr vectors (cmpEq result == allTrue).
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] == r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
  // cmpEq must match the std::equal_to mask and cmpNe must match its bitwise complement.
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 2; i++)
-    if (l[i] < r[i])
-      expect |= 0xFF << i * 8;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
  // cmpLt must match the std::less mask and cmpGe must match its bitwise complement.
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
 // SimdProcessorTest.Float32: checks the lane-wise comparisons (Gt/Le, Eq/Ne, Lt/Ge) and min/max of the
 // float32_t SimdProcessor. The '+' lines compare native Proc::MaskType values; the '-' lines are the
 // earlier expectations accumulated into a simd::MT integer bitmask.
 TEST(SimdProcessorTest, Float32)
 {
-  using Proc = typename SimdProcessorTypedTest<float>::Proc;
  // NOTE(review): the alias is called IntegralType for symmetry with the integer tests, but here it
  // names a floating-point type.
+  using IntegralType = float32_t;
+  IntegralType l[4]{82, 102, -5.6, 9.5};
+  IntegralType r[4]{2.0, 1, -5.7, 6};
  // Every l lane is larger than its r lane here, so the expected min equals r and the expected max equals l.
  // Both arrays are declared with 8 slots; the unlisted ones are zero-initialized.
+  IntegralType minlr[8]{2.0, 1, -5.7, 6};
+  IntegralType maxlr[8]{82, 102, -5.6, 9.5};
+  using Proc = typename SimdProcessorTypedTest<IntegralType>::Proc;
  // Vector byte size divided by the element size, i.e. how many IntegralType lanes fit in one vector.
+  constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType);
  // Bytewise mask comparison: true when both MaskType objects have identical bytes.
+  auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right)
+  { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); };
  // bitMaskProducerT is defined outside this hunk; presumably it evaluates the predicate per lane pair
  // and returns the expected mask. NOTE(review): meaning of its trailing bool is not visible here.
+  auto bitMaskProducer = bitMaskProducerT<IntegralType, Proc::MaskType, VecSize>;
  using SimdType = typename Proc::SimdType;
  Proc proc;
-  constexpr static simd::MT allTrue = 0xFFFF;
-  simd::MT expect = 0x0;
-  float32_t l[4]{82, 102,-5.6,9.5};
-  float32_t r[4]{2.0, 1,-5.7,6};
-  float32_t minlr[8]{2.0, 1, -5.7, 6};
-  float32_t maxlr[8]{82, 102, -5.6, 9.5};
  // The reference "all true" mask now comes from the processor instead of the literal 0xFFFF.
+  const Proc::MaskType allTrue = proc.trueMask();
+
  // Load both operands and the expected min/max arrays as SimdType values.
  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
  SimdType min = proc.loadFrom(reinterpret_cast<char*>(minlr));
  SimdType max = proc.loadFrom(reinterpret_cast<char*>(maxlr));
-  for (int i = 0; i < 4; i++)
-    if (l[i] > r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
  // cmpGt must match the std::greater mask and cmpLe must match its bitwise complement.
+  Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt));
  // max/min of the operands must equal the precomputed maxlr/minlr vectors (cmpEq result == allTrue).
  SimdType testmax = proc.max(lhs, rhs);
  SimdType testmin = proc.min(lhs, rhs);
-  EXPECT_EQ(proc.cmpEq(testmax, max), allTrue);
-  EXPECT_EQ(proc.cmpEq(testmin, min), allTrue);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] == r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
  // cmpEq must match the std::equal_to mask and cmpNe must match its bitwise complement.
+  Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq));

-  expect = 0x0;
-  for (int i = 0; i < 4; i++)
-    if (l[i] < r[i])
-      expect |= 15 << i * 4;
-  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
-  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
  // cmpLt must match the std::less mask and cmpGe must match its bitwise complement.
+  Proc::MaskType expectLt = bitMaskProducer(l, r, std::less<IntegralType>(), false);
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt));
+  EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt));
 }
+#endif
--- a/utils/common/simd_arm.h
+++ b/utils/common/simd_arm.h
--- a/utils/common/simd_sse.h
+++ b/utils/common/simd_sse.h