diff --git a/dbcon/joblist/primitivemsg.h b/dbcon/joblist/primitivemsg.h index 93413c1ce..916e4e6ce 100644 --- a/dbcon/joblist/primitivemsg.h +++ b/dbcon/joblist/primitivemsg.h @@ -42,13 +42,14 @@ // from blocksize.h const int32_t DATA_BLOCK_SIZE = BLOCK_SIZE; -const int8_t COMPARE_NIL = 0x00; +const int8_t COMPARE_NIL = 0x00; // means c = NULL predicate const int8_t COMPARE_LT = 0x01; const int8_t COMPARE_EQ = 0x02; const int8_t COMPARE_LE = (COMPARE_LT | COMPARE_EQ); // 0x03 const int8_t COMPARE_GT = 0x04; const int8_t COMPARE_NE = (COMPARE_LT | COMPARE_GT); // 0x05 const int8_t COMPARE_GE = (COMPARE_GT | COMPARE_EQ); // 0x06 +const int8_t COMPARE_NULLEQ = 0x07; // means c IS NULL(see column.cpp for details) const int8_t COMPARE_NOT = 0x08; const int8_t COMPARE_NLT = (COMPARE_LT | COMPARE_NOT); // 0x09 const int8_t COMPARE_NLE = (COMPARE_LE | COMPARE_NOT); // 0x0b @@ -884,4 +885,3 @@ struct LbidAtVer #endif #pragma pack(pop) - diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 355b928ec..dbcc63ab2 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -55,123 +55,34 @@ using namespace execplan; namespace { -using MT = uint16_t; - -const MT nonEmptyMask2Byte[256] = +template +inline typename VT::MaskType getNonEmptyMaskAux(typename VT::MaskType* nonEmptyMaskAux, uint16_t iter) { - 0x0000, 0x0003, 0x000C, 0x000F, 0x0030, 0x0033, 0x003C, 0x003F, - 0x00C0, 0x00C3, 0x00CC, 0x00CF, 0x00F0, 0x00F3, 0x00FC, 0x00FF, - 0x0300, 0x0303, 0x030C, 0x030F, 0x0330, 0x0333, 0x033C, 0x033F, - 0x03C0, 0x03C3, 0x03CC, 0x03CF, 0x03F0, 0x03F3, 0x03FC, 0x03FF, - 0x0C00, 0x0C03, 0x0C0C, 0x0C0F, 0x0C30, 0x0C33, 0x0C3C, 0x0C3F, - 0x0CC0, 0x0CC3, 0x0CCC, 0x0CCF, 0x0CF0, 0x0CF3, 0x0CFC, 0x0CFF, - 0x0F00, 0x0F03, 0x0F0C, 0x0F0F, 0x0F30, 0x0F33, 0x0F3C, 0x0F3F, - 0x0FC0, 0x0FC3, 0x0FCC, 0x0FCF, 0x0FF0, 0x0FF3, 0x0FFC, 0x0FFF, - 0x3000, 0x3003, 0x300C, 0x300F, 0x3030, 0x3033, 0x303C, 0x303F, - 0x30C0, 0x30C3, 0x30CC, 0x30CF, 
0x30F0, 0x30F3, 0x30FC, 0x30FF, - 0x3300, 0x3303, 0x330C, 0x330F, 0x3330, 0x3333, 0x333C, 0x333F, - 0x33C0, 0x33C3, 0x33CC, 0x33CF, 0x33F0, 0x33F3, 0x33FC, 0x33FF, - 0x3C00, 0x3C03, 0x3C0C, 0x3C0F, 0x3C30, 0x3C33, 0x3C3C, 0x3C3F, - 0x3CC0, 0x3CC3, 0x3CCC, 0x3CCF, 0x3CF0, 0x3CF3, 0x3CFC, 0x3CFF, - 0x3F00, 0x3F03, 0x3F0C, 0x3F0F, 0x3F30, 0x3F33, 0x3F3C, 0x3F3F, - 0x3FC0, 0x3FC3, 0x3FCC, 0x3FCF, 0x3FF0, 0x3FF3, 0x3FFC, 0x3FFF, - 0xC000, 0xC003, 0xC00C, 0xC00F, 0xC030, 0xC033, 0xC03C, 0xC03F, - 0xC0C0, 0xC0C3, 0xC0CC, 0xC0CF, 0xC0F0, 0xC0F3, 0xC0FC, 0xC0FF, - 0xC300, 0xC303, 0xC30C, 0xC30F, 0xC330, 0xC333, 0xC33C, 0xC33F, - 0xC3C0, 0xC3C3, 0xC3CC, 0xC3CF, 0xC3F0, 0xC3F3, 0xC3FC, 0xC3FF, - 0xCC00, 0xCC03, 0xCC0C, 0xCC0F, 0xCC30, 0xCC33, 0xCC3C, 0xCC3F, - 0xCCC0, 0xCCC3, 0xCCCC, 0xCCCF, 0xCCF0, 0xCCF3, 0xCCFC, 0xCCFF, - 0xCF00, 0xCF03, 0xCF0C, 0xCF0F, 0xCF30, 0xCF33, 0xCF3C, 0xCF3F, - 0xCFC0, 0xCFC3, 0xCFCC, 0xCFCF, 0xCFF0, 0xCFF3, 0xCFFC, 0xCFFF, - 0xF000, 0xF003, 0xF00C, 0xF00F, 0xF030, 0xF033, 0xF03C, 0xF03F, - 0xF0C0, 0xF0C3, 0xF0CC, 0xF0CF, 0xF0F0, 0xF0F3, 0xF0FC, 0xF0FF, - 0xF300, 0xF303, 0xF30C, 0xF30F, 0xF330, 0xF333, 0xF33C, 0xF33F, - 0xF3C0, 0xF3C3, 0xF3CC, 0xF3CF, 0xF3F0, 0xF3F3, 0xF3FC, 0xF3FF, - 0xFC00, 0xFC03, 0xFC0C, 0xFC0F, 0xFC30, 0xFC33, 0xFC3C, 0xFC3F, - 0xFCC0, 0xFCC3, 0xFCCC, 0xFCCF, 0xFCF0, 0xFCF3, 0xFCFC, 0xFCFF, - 0xFF00, 0xFF03, 0xFF0C, 0xFF0F, 0xFF30, 0xFF33, 0xFF3C, 0xFF3F, - 0xFFC0, 0xFFC3, 0xFFCC, 0xFFCF, 0xFFF0, 0xFFF3, 0xFFFC, 0xFFFF -}; - -const MT nonEmptyMask4Byte[16] = -{ - 0x0000, 0x000F, 0x00F0, 0x00FF, - 0x0F00, 0x0F0F, 0x0FF0, 0x0FFF, - 0xF000, 0xF00F, 0xF0F0, 0xF0FF, - 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF -}; - -const MT nonEmptyMask8Byte[4] = -{ - 0x0000, 0x00FF, 0xFF00, 0xFFFF -}; - -const MT nonEmptyMask16Byte[2] = -{ - 0x0000, 0xFFFF -}; - -inline MT getNonEmptyMask1Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMaskAux[iter]; -} - -inline MT getNonEmptyMask2Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return 
nonEmptyMask2Byte[(nonEmptyMaskAux[iter >> 1] >> ((iter & 0x0001) << 3)) & 0x00FF]; -} - -inline MT getNonEmptyMask4Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask4Byte[(nonEmptyMaskAux[iter >> 2] >> ((iter & 0x0003) << 2)) & 0x000F]; -} - -inline MT getNonEmptyMask8Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask8Byte[(nonEmptyMaskAux[iter >> 3] >> ((iter & 0x0007) << 1)) & 0x0003]; -} - -inline MT getNonEmptyMask16Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask16Byte[(nonEmptyMaskAux[iter >> 4] >> (iter & 0x000F)) & 0x0001]; -} - -typedef MT (*getNonEmptyMaskPtrT)(MT*, uint16_t); - -template -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate() -{ - return nullptr; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<1>() -{ - return getNonEmptyMask1Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<2>() -{ - return getNonEmptyMask2Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<4>() -{ - return getNonEmptyMask4Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<8>() -{ - return getNonEmptyMask8Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<16>() -{ - return getNonEmptyMask16Byte; + VT proc; + if constexpr (sizeof(T) == sizeof(uint8_t)) + { + return nonEmptyMaskAux[iter]; + } + else if constexpr (sizeof(T) == sizeof(uint16_t)) + { + const char* ptr = reinterpret_cast((uint64_t*)nonEmptyMaskAux + iter); + return proc.maskCtor(ptr); + } + else if constexpr (sizeof(T) == sizeof(uint32_t)) + { + const char* ptr = reinterpret_cast((uint32_t*)nonEmptyMaskAux + iter); + return proc.maskCtor(ptr); + } + else if constexpr (sizeof(T) == sizeof(uint64_t)) + { + uint8_t* ptr = reinterpret_cast((uint16_t*)nonEmptyMaskAux + iter); + return typename VT::MaskType{ptr[0], ptr[1]}; + } + else if constexpr ((sizeof(T) == 16)) + { + const char* ptr = (const char*)nonEmptyMaskAux 
+ iter; + return (typename VT::MaskType)proc.loadFrom(ptr); + } } inline uint64_t order_swap(uint64_t x) @@ -183,52 +94,39 @@ inline uint64_t order_swap(uint64_t x) } // Dummy template -template= sizeof(uint128_t), T>::type* = nullptr> +template = sizeof(uint128_t), T>::type* = nullptr> inline T orderSwap(T x) { - return x; + return x; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 56) | - ((x << 40) & 0x00FF000000000000ULL) | - ((x << 24) & 0x0000FF0000000000ULL) | - ((x << 8) & 0x000000FF00000000ULL) | - ((x >> 8) & 0x00000000FF000000ULL) | - ((x >> 24) & 0x0000000000FF0000ULL) | - ((x >> 40) & 0x000000000000FF00ULL) | - (x << 56); - return ret; + T ret = (x >> 56) | ((x << 40) & 0x00FF000000000000ULL) | ((x << 24) & 0x0000FF0000000000ULL) | + ((x << 8) & 0x000000FF00000000ULL) | ((x >> 8) & 0x00000000FF000000ULL) | + ((x >> 24) & 0x0000000000FF0000ULL) | ((x >> 40) & 0x000000000000FF00ULL) | (x << 56); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 24) | - ((x << 8) & 0x00FF0000U) | - ((x >> 8) & 0x0000FF00U) | - (x << 24); - return ret; + T ret = (x >> 24) | ((x << 8) & 0x00FF0000U) | ((x >> 8) & 0x0000FF00U) | (x << 24); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 8) | (x <<8); - return ret; + T ret = (x >> 8) | (x << 8); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - return x; + return x; } template @@ -272,15 +170,14 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP) case COMPARE_GE: return val1 >= val2; + case COMPARE_NULLEQ: return val1 == val2; + default: logIt(34, COP, "colCompare_"); return false; // throw an exception here? 
} } -inline bool colCompareStr(const ColRequestHeaderDataType &type, - uint8_t COP, - const utils::ConstString &val1, - const utils::ConstString &val2, - const bool printOut = false) +inline bool colCompareStr(const ColRequestHeaderDataType& type, uint8_t COP, const utils::ConstString& val1, + const utils::ConstString& val2, const bool printOut = false) { int error = 0; bool rc = primitives::StringComparator(type).op(&error, COP, val1, val2); @@ -311,6 +208,8 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP, uint8_t rf) case COMPARE_GT: return val1 > val2 || (val1 == val2 && (rf & 0x80)); + case COMPARE_NULLEQ: return val1 == val2 && rf == 0; + default: logIt(34, COP, "colCompare_"); return false; // throw an exception here? } } @@ -334,6 +233,8 @@ inline bool colStrCompare_(uint64_t val1, uint64_t val2, uint8_t COP, uint8_t rf case COMPARE_GT: return val1 > val2; + case COMPARE_NULLEQ: return val1 == val2 && rf == 0; + case COMPARE_LIKE: case COMPARE_NLIKE: default: logIt(34, COP, "colStrCompare_"); return false; // throw an exception here? 
@@ -607,103 +508,6 @@ T getEmptyValue(uint8_t type) } } -// Bit pattern representing NULL value for given column type/width -// TBD Use TypeHandler -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - return datatypes::Decimal128Null; -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::DOUBLE: - case CalpontSystemCatalog::UDOUBLE: return joblist::DOUBLENULL; - - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: - case CalpontSystemCatalog::VARBINARY: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: return joblist::CHAR8NULL; - - case CalpontSystemCatalog::UBIGINT: return joblist::UBIGINTNULL; - - default: return joblist::BIGINTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::FLOAT: - case CalpontSystemCatalog::UFLOAT: return joblist::FLOATNULL; - - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: return joblist::CHAR4NULL; - - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::DATENULL; - - case CalpontSystemCatalog::UINT: - case CalpontSystemCatalog::UMEDINT: return joblist::UINTNULL; - - default: return joblist::INTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::CHAR2NULL; - - 
case CalpontSystemCatalog::USMALLINT: return joblist::USMALLINTNULL; - - default: return joblist::SMALLINTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::CHAR1NULL; - - case CalpontSystemCatalog::UTINYINT: return joblist::UTINYINTNULL; - - default: return joblist::TINYINTNULL; - } -} - // Check whether val is NULL (or alternative NULL bit pattern for 64-bit string types) template inline bool isNullValue(const T val, const T NULL_VALUE) @@ -951,15 +755,15 @@ template ::type* = nullptr> inline void vectUpdateMinMax(const bool validMinMax, const bool isNonNullOrEmpty, T& Min, T& Max, T curValue, @@ -1123,61 +926,51 @@ template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, 
// SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; - using SimdType = typename VT::SimdType; - SimdType tmpStorageVector; - T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); - // Saving values based on writeMask into tmp vec. - // Min/Max processing. - // The mask is 16 bit long and it describes N elements. - // N = sizeof(vector type) / WIDTH. + T* tmpDstVecTPtr = reinterpret_cast(dstArray); uint32_t j = 0; + const int8_t* ptrW = reinterpret_cast(&writeMask); for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - MT bitMapPosition = 1 << it; - if (writeMask & bitMapPosition) + if (ptrW[it]) { *tmpDstVecTPtr = dataVecTPtr[j]; ++tmpDstVecTPtr; } } - // Store the whole vector however one level up the stack - // vectorizedFiltering() increases the dstArray by a number of - // actual values written that is the result of this function. 
- simdProcessor.store(dstArray, tmpStorageVector); - return tmpDstVecTPtr - reinterpret_cast(&tmpStorageVector); + return tmpDstVecTPtr - reinterpret_cast(dstArray); } // RIDs no values template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return 0; } @@ -1186,23 +979,22 @@ inline uint16_t vectWriteColValues( template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& 
simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; - using SimdType = typename VT::SimdType; - SimdType tmpStorageVector; - T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); + T* tmpDstVecTPtr = reinterpret_cast(dstArray); + const int8_t* ptrW = reinterpret_cast(&writeMask); // Saving values based on writeMask into tmp vec. // Min/Max processing. 
// The mask is 16 bit long and it describes N elements. @@ -1210,8 +1002,7 @@ inline uint16_t vectWriteColValues( uint32_t j = 0; for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - MT bitMapPosition = 1 << it; - if (writeMask & bitMapPosition) + if (ptrW[it]) { *tmpDstVecTPtr = dataVecTPtr[j]; ++tmpDstVecTPtr; @@ -1219,12 +1010,8 @@ inline uint16_t vectWriteColValues( ++ridDstArray; } } - // Store the whole vector however one level up the stack - // vectorizedFiltering() increases the dstArray by a number of - // actual values written that is the result of this function. - simdProcessor.store(dstArray, tmpStorageVector); - return tmpDstVecTPtr - reinterpret_cast(&tmpStorageVector); + return tmpDstVecTPtr - reinterpret_cast(dstArray); } // RIDs no values @@ -1232,18 +1019,18 @@ template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + 
primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; primitives::RIDType* origRIDDstArray = ridDstArray; @@ -1251,9 +1038,10 @@ inline uint16_t vectWriteRIDValues( // Min/Max processing. // The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH. uint16_t j = 0; + const int8_t* ptrW = reinterpret_cast(&writeMask); for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - if (writeMask & (1 << it)) + if (ptrW[it]) { vectWriteColValuesLoopRIDAsignment(ridDstArray, out, ridOffset + j, ridSrcArray, j); ++ridDstArray; @@ -1267,18 +1055,18 @@ inline uint16_t vectWriteRIDValues( template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD 
processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return valuesWritten; } @@ -1288,22 +1076,21 @@ template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // 
Typed SIMD vector from the input block + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return valuesWritten; } -#endif /***************************************************************************** *** RUN DATA THROUGH A COLUMN FILTER **************************************** @@ -1329,8 +1116,7 @@ void scalarFiltering_( const bool validMinMax, // The flag to store min/max T emptyValue, // Deduced empty value magic T nullValue, // Deduced null value magic - T Min, T Max, const bool isNullValueMatches, - const uint8_t* blockAux) + T Min, T Max, const bool isNullValueMatches, const uint8_t* blockAux) { constexpr int WIDTH = sizeof(T); // Loop-local variables @@ -1343,16 +1129,16 @@ void scalarFiltering_( { if constexpr (IS_AUX_COLUMN) { - if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, srcSize, - ridArray, ridSize, outputType, emptyValue, - blockAux))) + if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, + srcSize, ridArray, ridSize, outputType, + emptyValue, blockAux))) break; } else { - if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, srcSize, - ridArray, ridSize, outputType, emptyValue, - blockAux))) + if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, + srcSize, ridArray, ridSize, + outputType, emptyValue, blockAux))) break; } @@ -1407,32 +1193,28 @@ void scalarFiltering( const bool validMinMax, // The flag to store min/max T emptyValue, // Deduced empty value magic T nullValue, // Deduced null value magic - T Min, T Max, const bool isNullValueMatches, - const uint8_t* blockAux) + T Min, T Max, const bool 
isNullValueMatches, const uint8_t* blockAux) { if (in->hasAuxCol) { - scalarFiltering_(in, out, columnFilterMode, - filterSet, filterCount, filterCOPs, filterValues, filterRFs, - typeHolder, srcArray, srcSize, ridArray, ridSize, initialRID, - outputType, validMinMax, emptyValue, nullValue, Min, Max, - isNullValueMatches, blockAux); + scalarFiltering_(in, out, columnFilterMode, filterSet, filterCount, filterCOPs, + filterValues, filterRFs, typeHolder, srcArray, srcSize, ridArray, + ridSize, initialRID, outputType, validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, blockAux); } else { - scalarFiltering_(in, out, columnFilterMode, - filterSet, filterCount, filterCOPs, filterValues, filterRFs, - typeHolder, srcArray, srcSize, ridArray, ridSize, initialRID, - outputType, validMinMax, emptyValue, nullValue, Min, Max, - isNullValueMatches, blockAux); + scalarFiltering_(in, out, columnFilterMode, filterSet, filterCount, filterCOPs, + filterValues, filterRFs, typeHolder, srcArray, srcSize, ridArray, + ridSize, initialRID, outputType, validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, blockAux); } } -#if defined(__x86_64__)|| defined(__aarch64__) template ::type* = nullptr> inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, - const primitives::RIDType* ridArray, const uint16_t iter) + const primitives::RIDType* ridArray, const uint16_t iter) { return {processor.loadFrom(reinterpret_cast(srcArray))}; } @@ -1442,7 +1224,7 @@ inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* template ::type* = nullptr> inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, - const primitives::RIDType* ridArray, const uint16_t iter) + const primitives::RIDType* ridArray, const uint16_t iter) { constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; @@ -1457,56 +1239,59 @@ inline SIMD_WRAPPER_TYPE 
simdDataLoad(VT& processor, const T* srcArray, const T* return {result}; } -template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, VT& processor, typename VT::SimdType& dataVector) +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType& type, VT& processor, + typename VT::SimdType& dataVector) { - return {dataVector}; + return {dataVector}; } -template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, - VT& processor, typename VT::SimdType& dataVector) +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType& type, VT& processor, + typename VT::SimdType& dataVector) { - constexpr const uint16_t WIDTH = sizeof(T); - constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; - using SimdType = typename VT::SimdType; - SimdType result; - T* resultTypedPtr = reinterpret_cast(&result); - T* srcTypedPtr = reinterpret_cast(&dataVector); - for (uint32_t i = 0; i < VECTOR_SIZE; ++i) - { - utils::ConstString s{reinterpret_cast(&srcTypedPtr[i]), WIDTH}; - resultTypedPtr[i] = orderSwap(type.strnxfrm(s.rtrimZero())); - } - return {result}; + constexpr const uint16_t WIDTH = sizeof(T); + constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; + using SimdType = typename VT::SimdType; + SimdType result; + T* resultTypedPtr = reinterpret_cast(&result); + T* srcTypedPtr = reinterpret_cast(&dataVector); + for (uint32_t i = 0; i < VECTOR_SIZE; ++i) + { + utils::ConstString s{reinterpret_cast(&srcTypedPtr[i]), WIDTH}; + resultTypedPtr[i] = orderSwap(type.strnxfrm(s.rtrimZero())); + } + return {result}; } template -void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, - SimdType& dataVec, SimdType& simdMin, SimdType& simdMax) +void vectorizedUpdateMinMax(const bool validMinMax, const typename VT::MaskType nonNullOrEmptyMask, + VT simdProcessor, SimdType& dataVec, 
SimdType& simdMin, SimdType& simdMax) { if (validMinMax) { - auto byteMask = utils::bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)); - simdMin = simdProcessor.blend( - simdMin, dataVec, simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(simdMin, dataVec), byteMask)); - simdMax = simdProcessor.blend( - simdMax, dataVec, simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(dataVec, simdMax), byteMask)); + { + simdMin = + simdProcessor.blend(simdMin, dataVec, simdProcessor.cmpGt(simdMin, dataVec) & nonNullOrEmptyMask); + simdMax = + simdProcessor.blend(simdMax, dataVec, simdProcessor.cmpGt(dataVec, simdMax) & nonNullOrEmptyMask); + } } } template -void vectorizedTextUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, - SimdType& dataVec, SimdType& simdMin, SimdType& simdMax, +void vectorizedTextUpdateMinMax(const bool validMinMax, const typename VT::MaskType nonNullOrEmptyMask, + VT simdProcessor, SimdType& dataVec, SimdType& simdMin, SimdType& simdMax, SimdType& swapedOrderDataVec, SimdType& weightsMin, SimdType& weightsMax) { + using MT = typename VT::MaskType; if (validMinMax) { - auto byteMask = utils::bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)); - auto minComp = simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(weightsMin, swapedOrderDataVec), byteMask); - auto maxComp = simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(swapedOrderDataVec, weightsMax), byteMask); + MT minComp = simdProcessor.cmpGt(weightsMin, swapedOrderDataVec) & nonNullOrEmptyMask; + MT maxComp = simdProcessor.cmpGt(swapedOrderDataVec, weightsMax) & nonNullOrEmptyMask; + simdMin = simdProcessor.blend(simdMin, dataVec, minComp); weightsMin = simdProcessor.blend(weightsMin, swapedOrderDataVec, minComp); simdMax = simdProcessor.blend(simdMax, dataVec, maxComp); @@ -1514,7 +1299,7 @@ void vectorizedTextUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyM } } -template +template void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, 
T& min, T& max) { constexpr const uint16_t size = VT::vecByteSize / sizeof(T); @@ -1524,7 +1309,7 @@ void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min min = *std::min_element(simdMinVec, simdMinVec + size); } -template +template void extractTextMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, SimdType weightsMin, SimdType weightsMax, T& min, T& max) { @@ -1539,10 +1324,9 @@ void extractTextMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, Si max = simdMaxVec[indMax - weightsMaxVec]; } -template -void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSizeAux, - const uint8_t** blockAux, MT** nonEmptyMaskAux, - primitives::RIDType** ridArray) +template +void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSizeAux, const uint8_t** blockAux, + MT** nonEmptyMaskAux, primitives::RIDType** ridArray) { using SimdTypeTemp = typename simd::IntegralToSIMD::type; using FilterTypeTemp = typename simd::StorageToFiltering::type; @@ -1551,15 +1335,16 @@ void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSize using SimdWrapperTypeAux = typename VTAux::SimdWrapperType; VTAux simdProcessorAux; SimdTypeAux dataVecAux; - SimdTypeAux emptyFilterArgVecAux = simdProcessorAux.emptyNullLoadValue(EMPTY_VALUE_AUX); + SimdTypeAux emptyFilterArgVecAux = simdProcessorAux.loadValue(EMPTY_VALUE_AUX); const uint8_t* origBlockAux = *blockAux; primitives::RIDType* origRidArray = *ridArray; for (uint16_t i = 0; i < iterNumberAux; ++i) { dataVecAux = simdDataLoad(simdProcessorAux, *blockAux, - origBlockAux, *ridArray, i).v; - (*nonEmptyMaskAux)[i] = simdProcessorAux.nullEmptyCmpNe(dataVecAux, emptyFilterArgVecAux); + origBlockAux, *ridArray, i) + .v; + (*nonEmptyMaskAux)[i] = (MT)simdProcessorAux.nullEmptyCmpNe(dataVecAux, emptyFilterArgVecAux); *blockAux += vectorSizeAux; *ridArray += vectorSizeAux; } @@ -1576,9 +1361,8 @@ void buildAuxColEmptyVal(const uint16_t iterNumberAux, 
const uint16_t vectorSize // to glue the masks produced by actual filters. // Then it takes a vector of data, run filters and logical function using pointers. // See the corresponding dispatcher to get more details on vector processing class. -template +template void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const T emptyValue, @@ -1589,16 +1373,22 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T using SimdType = typename VT::SimdType; using SimdWrapperType = typename VT::SimdWrapperType; using FilterType = typename VT::FilterType; - using UT = typename std::conditional::value || datatypes::is_uint128_t::value || std::is_same::value, - FilterType, typename datatypes::make_unsigned::type>::type; + using UT = typename std::conditional::value || + datatypes::is_uint128_t::value || + std::is_same::value, + FilterType, typename datatypes::make_unsigned::type>::type; VT simdProcessor; + using MT = typename VT::MaskType; SimdType dataVec; [[maybe_unused]] SimdType swapedOrderDataVec; [[maybe_unused]] auto typeHolder = in->colType; [[maybe_unused]] SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue); SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue); - MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask; - MT initFilterMask = 0xFFFF; + MT writeMask, nonNullMask, nonNullOrEmptyMask; + MT trueMask = simdProcessor.trueMask(); + MT falseMask = simdProcessor.falseMask(); + MT nonEmptyMask = trueMask; + MT initFilterMask = trueMask; primitives::RIDType rid = 0; primitives::RIDType* origRidArray = ridArray; uint16_t totalValuesWritten = 0; @@ -1620,12 +1410,8 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T #pragma GCC diagnostic push #pragma GCC diagnostic ignored 
"-Wignored-attributes" std::vector filterArgsVectors; - auto ptrA = std::mem_fn(&VT::cmpEq); - using COPType = decltype(ptrA); - std::vector copFunctorVec; + bool isOr = false; #pragma GCC diagnostic pop - using BOPType = std::function; - BOPType bopFunctor; // filter comparators and logical function compilation. if (parsedColumnFilter != nullptr) { @@ -1637,23 +1423,15 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T filterCount = parsedColumnFilter->getFilterCount(); if (iterNumber > 0) { - copFunctorVec.reserve(filterCount); switch (parsedColumnFilter->getBOP()) { case BOP_OR: - bopFunctor = std::bit_or(); - initFilterMask = 0; - break; - case BOP_AND: bopFunctor = std::bit_and(); break; case BOP_XOR: - bopFunctor = std::bit_or(); - initFilterMask = 0; - break; - case BOP_NONE: - // According with the comments in linux-port/primitiveprocessor.h - // there can't be BOP_NONE with filterCount > 0 - bopFunctor = std::bit_and(); + isOr = true; + initFilterMask = falseMask; break; + case BOP_AND: break; + case BOP_NONE: break; default: idbassert(false); } filterArgsVectors.reserve(filterCount); @@ -1677,30 +1455,6 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T FilterType filterValue = *((FilterType*)&filterValues[j]); filterArgsVectors.push_back(simdProcessor.loadValue(filterValue)); } - switch (filterCOPs[j]) - { - case (COMPARE_EQ): - // Filter against NULL value - if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0) - copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq)); - else - copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); - break; - case (COMPARE_GE): copFunctorVec.push_back(std::mem_fn(&VT::cmpGe)); break; - - case (COMPARE_GT): copFunctorVec.push_back(std::mem_fn(&VT::cmpGt)); break; - case (COMPARE_LE): copFunctorVec.push_back(std::mem_fn(&VT::cmpLe)); break; - case (COMPARE_LT): copFunctorVec.push_back(std::mem_fn(&VT::cmpLt)); break; - case (COMPARE_NE): 
copFunctorVec.push_back(std::mem_fn(&VT::cmpNe)); break; - case (COMPARE_NIL): - copFunctorVec.push_back(std::mem_fn(&VT::cmpAlwaysFalse)); - break; - // There are couple other COP, e.g. COMPARE_NOT however they can't be met here - // b/c MCS 6.x uses COMPARE_NOT for strings with OP_LIKE only. See op2num() for - // details. - - default: idbassert(false); - } } } } @@ -1715,20 +1469,17 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T weightsMin = simdSwapedOrderDataLoad(typeHolder, simdProcessor, simdMin).v; weightsMax = simdSwapedOrderDataLoad(typeHolder, simdProcessor, simdMax).v; } - MT* nonEmptyMaskAux; if constexpr (IS_AUX_COLUMN) { constexpr uint16_t vectorSizeAux = VT::vecByteSize; uint16_t iterNumberAux = HAS_INPUT_RIDS ? ridSize / vectorSizeAux : srcSize / vectorSizeAux; - nonEmptyMaskAux = (MT*) alloca(sizeof(MT) * iterNumberAux); - buildAuxColEmptyVal(iterNumberAux, vectorSizeAux, &blockAux, - &nonEmptyMaskAux, &ridArray); + nonEmptyMaskAux = (MT*)alloca(sizeof(MT) * iterNumberAux); + buildAuxColEmptyVal(iterNumberAux, vectorSizeAux, &blockAux, + &nonEmptyMaskAux, &ridArray); } - constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtr = getNonEmptyMaskPtrTemplate(); - // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. 
nonNullMask tells which vector coords @@ -1737,40 +1488,74 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T { primitives::RIDType ridOffset = i * VECTOR_SIZE; assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset)); - dataVec = simdDataLoad(simdProcessor, srcArray, - origSrcArray, ridArray, i).v; + dataVec = simdDataLoad(simdProcessor, srcArray, origSrcArray, + ridArray, i) + .v; - if constexpr(KIND==KIND_TEXT) - swapedOrderDataVec = simdSwapedOrderDataLoad(typeHolder, simdProcessor, dataVec).v; + if constexpr (KIND == KIND_TEXT) + { + swapedOrderDataVec = + simdSwapedOrderDataLoad(typeHolder, simdProcessor, dataVec).v; + } if constexpr (IS_AUX_COLUMN) - nonEmptyMask = (*getNonEmptyMaskPtr)(nonEmptyMaskAux, i); + { + //'Ne' translates AUX vectors of "0xFF" values into the vectors of the corresponding + // width "0xFF...FF" for u16/32/64bits. + nonEmptyMask = simdProcessor.nullEmptyCmpNe( + (SimdType)getNonEmptyMaskAux(nonEmptyMaskAux, i), (SimdType)falseMask); + } else - nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec); + { + nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec); + } writeMask = nonEmptyMask; // NULL check nonNullMask = simdProcessor.nullEmptyCmpNe(dataVec, nullFilterArgVec); // Exclude NULLs from the resulting set if NULL doesn't match the filters. writeMask = isNullValueMatches ? 
writeMask : writeMask & nonNullMask; + nonNullOrEmptyMask = nonNullMask & nonEmptyMask; // filters MT prevFilterMask = initFilterMask; - // TODO name this mask literal - MT filterMask = 0xFFFF; + MT filterMask = trueMask; + for (uint32_t j = 0; j < filterCount; ++j) { - // filter using compiled filter and preloaded filter argument - if constexpr(KIND==KIND_TEXT) - filterMask = copFunctorVec[j](simdProcessor, swapedOrderDataVec, filterArgsVectors[j]); + SimdType l; + if constexpr (KIND == KIND_TEXT) + { + l = swapedOrderDataVec; + } else - filterMask = copFunctorVec[j](simdProcessor, dataVec, filterArgsVectors[j]); + { + l = dataVec; + } - filterMask = bopFunctor(prevFilterMask, filterMask); + // The operator form doesn't work for x86. We need explicit functions here. + switch (filterCOPs[j]) + { + case (COMPARE_NULLEQ): filterMask = simdProcessor.nullEmptyCmpEq(l, filterArgsVectors[j]); break; + case (COMPARE_EQ): filterMask = simdProcessor.cmpEq(l, filterArgsVectors[j]); break; + case (COMPARE_GE): filterMask = simdProcessor.cmpGe(l, filterArgsVectors[j]); break; + case (COMPARE_GT): filterMask = simdProcessor.cmpGt(l, filterArgsVectors[j]); break; + case (COMPARE_LE): filterMask = simdProcessor.cmpLe(l, filterArgsVectors[j]); break; + case (COMPARE_LT): filterMask = simdProcessor.cmpLt(l, filterArgsVectors[j]); break; + case (COMPARE_NE): filterMask = simdProcessor.cmpNe(l, filterArgsVectors[j]); break; + case (COMPARE_NIL): filterMask = falseMask; break; + + default: + idbassert(false); + // There are couple other COP, e.g. COMPARE_NOT however they can't be met here + // b/c MCS 6.x uses COMPARE_NOT for strings with OP_LIKE only. See op2num() for + // details. + } + + filterMask = isOr ? 
prevFilterMask | filterMask : prevFilterMask & filterMask; prevFilterMask = filterMask; } writeMask = writeMask & filterMask; - T* dataVecTPtr = reinterpret_cast(&dataVec); // vectWriteColValues iterates over the values in the source vec @@ -1788,11 +1573,19 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T simdProcessor, valuesWritten, validMinMax, ridOffset, dataVecTPtr, ridDstArray, writeMask, min, max, in, out, nonNullOrEmptyMask, ridArray); - if constexpr (KIND != KIND_TEXT) - vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); - else - vectorizedTextUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax, - swapedOrderDataVec, weightsMin, weightsMax); + if constexpr (KIND == KIND_TEXT) + { + vectorizedTextUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax, + swapedOrderDataVec, weightsMin, weightsMax); + } + else if constexpr (KIND == KIND_FLOAT) + { + // noop for future development + } + else + { + vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); + } // Calculate bytes written uint16_t bytesWritten = valuesWritten * WIDTH; @@ -1828,8 +1621,9 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T min, max, isNullValueMatches, blockAux); } -template +#if defined(__x86_64__) || (__aarch64__) +template void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const T emptyValue, @@ -1838,21 +1632,20 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* { if (in->hasAuxCol) { - vectorizedFiltering_( - in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter, validMinMax, emptyValue, - nullValue, min, max, isNullValueMatches, - 
blockAux); + vectorizedFiltering_(in, out, srcArray, srcSize, ridArray, ridSize, + parsedColumnFilter, validMinMax, emptyValue, nullValue, + min, max, isNullValueMatches, blockAux); } else { - vectorizedFiltering_( - in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter, validMinMax, emptyValue, - nullValue, min, max, isNullValueMatches, - blockAux); + vectorizedFiltering_(in, out, srcArray, srcSize, ridArray, ridSize, + parsedColumnFilter, validMinMax, emptyValue, nullValue, + min, max, isNullValueMatches, blockAux); } } +#endif // This routine dispatches template function calls to reduce branching. template @@ -1861,8 +1654,7 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const STORAGE_TYPE emptyValue, const STORAGE_TYPE nullValue, STORAGE_TYPE Min, STORAGE_TYPE Max, - const bool isNullValueMatches, - const uint8_t* blockAux) + const bool isNullValueMatches, const uint8_t* blockAux) { // Using struct to dispatch SIMD type based on integral type T. using SimdType = typename simd::IntegralToSIMD::type; @@ -1924,7 +1716,6 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out } } } -#endif // TBD Make changes in Command class ancestors to threat BPP::values as buffer. // TBD this will allow to copy values only once from BPP::blockData to the destination. 
@@ -1936,8 +1727,7 @@ template void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* ridArray, const uint16_t ridSize, // Number of values in ridArray int* srcArray16, const uint32_t srcSize, - boost::shared_ptr parsedColumnFilter, - int* blockAux) + boost::shared_ptr parsedColumnFilter, int* blockAux) { using FT = typename IntegralTypeToFilterType::type; using ST = typename IntegralTypeToFilterSetType::type; @@ -1981,10 +1771,10 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r // Syscat queries mustn't follow vectorized processing path b/c PP must return // all values w/o any filter(even empty values filter) applied. -#if defined(__x86_64__)|| defined(__aarch64__) - // Don't use vectorized filtering for text based data types. - if (WIDTH < 16 && - (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid()) )) +#if defined(__x86_64__) || defined(__aarch64__) + // Don't use vectorized filtering for text based data types whose collation translation + // can deliver more than 1 byte for a single input byte of an encoded string.
+ if (WIDTH < 16 && (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid()))) { bool canUseFastFiltering = true; for (uint32_t i = 0; i < filterCount; ++i) @@ -1996,10 +1786,9 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r if (canUseFastFiltering) { - vectorizedFilteringDispatcher(in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter.get(), validMinMax, emptyValue, - nullValue, Min, Max, isNullValueMatches, - reinterpret_cast(blockAux)); + vectorizedFilteringDispatcher( + in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter.get(), validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, reinterpret_cast(blockAux)); return; } } @@ -2043,8 +1832,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); auto dataType = (execplan::CalpontSystemCatalog::ColDataType)in->colType.DataType; @@ -2053,7 +1841,8 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? 
BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } _scanAndFilterTypeDispatcher(in, out); @@ -2069,8 +1858,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); auto dataType = (execplan::CalpontSystemCatalog::ColDataType)in->colType.DataType; @@ -2079,7 +1867,8 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } _scanAndFilterTypeDispatcher(in, out); @@ -2098,8 +1887,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { _scanAndFilterTypeDispatcher(in, out); } @@ -2114,15 +1902,15 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? 
BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); } template ::type* = nullptr> #endif -void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); using UT = typename std::conditional::value || datatypes::is_uint128_t::value, T, - typename datatypes::make_unsigned::type>::type; + typename datatypes::make_unsigned::type>::type; const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W; @@ -2151,16 +1938,19 @@ void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, dataType == execplan::CalpontSystemCatalog::TEXT) && !isDictTokenScan(in)) { - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } if (datatypes::isUnsigned(dataType)) { - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); } // The entrypoint for block scanning and filtering. 
diff --git a/primitives/linux-port/primitiveprocessor.h b/primitives/linux-port/primitiveprocessor.h index 23340f5e3..22cb8a155 100644 --- a/primitives/linux-port/primitiveprocessor.h +++ b/primitives/linux-port/primitiveprocessor.h @@ -55,7 +55,7 @@ class PrimTest; // XXX: turn off dictionary range setting during scan. -#define XXX_PRIMITIVES_TOKEN_RANGES_XXX +#define XXX_PRIMITIVES_TOKEN_RANGES_XXX namespace primitives { @@ -472,6 +472,103 @@ class PrimitiveProcessor friend class ::PrimTest; }; +// Bit pattern representing NULL value for given column type/width +// TBD Use TypeHandler +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + return datatypes::Decimal128Null; +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::DOUBLE: + case execplan::CalpontSystemCatalog::UDOUBLE: return joblist::DOUBLENULL; + + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: + case execplan::CalpontSystemCatalog::VARBINARY: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR8NULL; + + case execplan::CalpontSystemCatalog::UBIGINT: return joblist::UBIGINTNULL; + + default: return joblist::BIGINTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::FLOAT: + case execplan::CalpontSystemCatalog::UFLOAT: return joblist::FLOATNULL; + + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR4NULL; + + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case 
execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::DATENULL; + + case execplan::CalpontSystemCatalog::UINT: + case execplan::CalpontSystemCatalog::UMEDINT: return joblist::UINTNULL; + + default: return joblist::INTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR2NULL; + + case execplan::CalpontSystemCatalog::USMALLINT: return joblist::USMALLINTNULL; + + default: return joblist::SMALLINTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR1NULL; + + case execplan::CalpontSystemCatalog::UTINYINT: return joblist::UTINYINTNULL; + + default: return joblist::TINYINTNULL; + } +} + // // COMPILE A COLUMN FILTER // @@ -518,13 +615,32 @@ boost::shared_ptr _parseColumnFilter( // Pointer to ColArgs structure representing argIndex'th element in the BLOB auto args = reinterpret_cast(filterString + (argIndex * filterSize)); - ret->prestored_cops[argIndex] = args->COP; ret->prestored_rfs[argIndex] = args->rf; - if (datatypes::isUnsigned((execplan::CalpontSystemCatalog::ColDataType)colType)) - ret->storeFilterArg(argIndex, reinterpret_cast(args->val)); + auto 
colDataType = (execplan::CalpontSystemCatalog::ColDataType)colType; + bool isNullEqCmp = false; + if (datatypes::isUnsigned(colDataType)) + { + const auto nullValue = getNullValue(colDataType); + const UT* filterValue = reinterpret_cast(args->val); + isNullEqCmp = + (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false; + ret->storeFilterArg(argIndex, filterValue); + } else - ret->storeFilterArg(argIndex, reinterpret_cast(args->val)); + { + const auto nullValue = getNullValue(colDataType); + const T* filterValue = reinterpret_cast(args->val); + isNullEqCmp = + (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false; + ret->storeFilterArg(argIndex, filterValue); + } + + // IS NULL filtering expression is translated into COMPARE_EQ + NULL magic in the filter. + // This if replaces an operation id once to avoid additional branching in the main loop + // of vectorizedFiltering_ in column.cpp. + // It would be cleaner to place it into EM though. + ret->prestored_cops[argIndex] = (isNullEqCmp) ? COMPARE_NULLEQ : args->COP; }
I think the only cases where we can use the set @@ -575,4 +691,3 @@ boost::shared_ptr _parseColumnFilter( } } // namespace primitives - diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4f760200f..1b2ba7869 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -55,6 +55,7 @@ if (WITH_UNITTESTS) gtest_add_tests(TARGET column_scan_filter_tests TEST_PREFIX columnstore:) add_executable(simd_processors simd_processors.cpp) + target_compile_options(simd_processors PRIVATE -Wno-error) add_dependencies(simd_processors googletest) target_link_libraries(simd_processors ${ENGINE_LDFLAGS} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS} ${GTEST_LIBRARIES} processor dbbc) gtest_add_tests(TARGET simd_processors TEST_PREFIX columnstore:) diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp index 85bcc14f9..0d009aff7 100644 --- a/tests/simd_processors.cpp +++ b/tests/simd_processors.cpp @@ -15,8 +15,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ - #include +#include #include #include #include @@ -25,465 +25,496 @@ #include "simd_sse.h" #include "simd_arm.h" #if defined(__x86_64__) - #define TESTS_USING_SSE 1 - using float64_t = double; - using float32_t = float; +#define TESTS_USING_SSE 1 +using float64_t = double; +using float32_t = float; #endif #ifdef __aarch64__ - #define TESTS_USING_ARM 1 +#define TESTS_USING_ARM 1 #endif using namespace std; - +#if defined(__x86_64__) || __aarch64__ template -class SimdProcessorTypedTest : public testing::Test { -public: +class SimdProcessorTypedTest : public testing::Test +{ + public: using IntegralType = T; - #if TESTS_USING_SSE - using SimdType = std::conditional_t::value, - simd::vi128f_wr, - std::conditional_t::value, - simd::vi128d_wr, - simd::vi128_wr>>; - using Proc = typename simd::SimdFilterProcessor; - #else - using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; - #endif +#if TESTS_USING_SSE + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, simd::vi128_wr>>; + using Proc = typename simd::SimdFilterProcessor; +#else + using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; +#endif void SetUp() override { } }; -using SimdProcessor128TypedTestTypes = ::testing::Types; +using SimdProcessor128TypedTestTypes = + ::testing::Types; TYPED_TEST_SUITE(SimdProcessorTypedTest, SimdProcessor128TypedTestTypes); TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128) { using Proc = typename SimdProcessorTypedTest::Proc; + + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + using SimdType = typename Proc::SimdType; - constexpr static simd::MT allTrue = 0xFFFF; - constexpr static simd::MT allFalse = 0x0; Proc proc; + const typename Proc::MaskType allTrue = proc.trueMask(); + const typename Proc::MaskType allFalse = proc.falseMask(); + SimdType lhs = 
proc.loadValue((TypeParam)-2); SimdType rhs = proc.loadValue((TypeParam)-3); EXPECT_GT((uint64_t)-2LL, (uint64_t)-3LL); - EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGt(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGe(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLt(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLe(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpEq(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allTrue)); lhs = proc.loadValue((TypeParam)-3); - EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse)); lhs = rhs = proc.loadValue((TypeParam)0); - EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpGe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpEq(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue)); + 
EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse)); } +template +ResultType bitMaskProducerT(const IntegralType* l, const IntegralType* r, + std::function cmp, const bool printOut = false) +{ + uint64_t allOnes = 0xFFULL; + for (size_t i = 1; i < sizeof(IntegralType); ++i) + { + allOnes |= 0xFFULL << (i * 8); + } + ResultType result = {0x0, 0x0}; + + uint64_t* resultPtr = reinterpret_cast(&result); + for (size_t i = 0; i < VecSize >> 1; ++i) + { + if (cmp(l[i], r[i])) + { + if (printOut) + { + uint64_t pLeft = l[i]; + uint pRight = r[i]; + std::cout << "i " << i << " l " << cmp.target_type().name() << " r " << pLeft << " " << pRight + << std::endl; + } + resultPtr[0] |= allOnes << i * sizeof(IntegralType) * 8; + } + } + for (size_t i = VecSize >> 1; i < VecSize; ++i) + { + if (cmp(l[i], r[i])) + { + if (printOut) + { + uint64_t pLeft = l[i]; + uint pRight = r[i]; + std::cout << "i " << i << " l " << cmp.target_type().name() << " r " << pLeft << " " << pRight + << std::endl; + } + resultPtr[1] |= allOnes << (i - (VecSize >> 1)) * sizeof(IntegralType) * 8; + } + } + return result; +}; TEST(SimdProcessorTest, Int8) { - using Proc = typename SimdProcessorTypedTest::Proc; - using SimdType = typename Proc::SimdType; - Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; - int8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - int8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 
2, 32, 41, 2, 5}; - int8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - SimdType lhs = proc.loadFrom(reinterpret_cast(l)); - SimdType rhs = proc.loadFrom(reinterpret_cast(r)); - SimdType min = proc.loadFrom(reinterpret_cast(minlr)); - SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 16; i++) - if (l[i] > r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect); - SimdType testmax = proc.max(lhs, rhs); - SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); - - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] == r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); - - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] < r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); -} -TEST(SimdProcessorTest, Uint8) -{ - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = int8_t; + IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; + IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; + IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + using IntegralType = int8_t; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize; using SimdType = typename Proc::SimdType; + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 
2, 32, 41, 2, 5}; - uint8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - uint8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; - uint8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + const typename Proc::MaskType allTrue = proc.trueMask(); SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 16; i++) - if (l[i] > r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect); + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] == r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs),(simd::MT) ~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); +} - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] < r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs),(simd::MT) ~expect); +TEST(SimdProcessorTest, Uint8) +{ + using IntegralType = 
uint8_t; + IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; + IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; + IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + using SimdType = typename Proc::SimdType; + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; + + using Proc = typename SimdProcessorTypedTest::Proc; + using SimdType = typename Proc::SimdType; + Proc proc; + const Proc::MaskType allTrue = proc.trueMask(); + + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); + SimdType rhs = proc.loadFrom(reinterpret_cast(r)); + SimdType min = proc.loadFrom(reinterpret_cast(minlr)); + SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), true); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); + SimdType testmax = proc.max(lhs, rhs); + SimdType testmin = proc.min(lhs, rhs); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); + + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); + + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int16) { + using IntegralType = int16_t; + IntegralType l[8]{0, 1, 2, 
-5, 4, 3, -8, 200}; + IntegralType r[8]{0, 105, -8, 35, 24, 13, 8, 100}; + IntegralType minlr[8]{0, 1, -8, -5, 4, 3, -8, 100}; + IntegralType maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int16_t l[8]{0, 1, 2, -5, 4, 3, -8, 200}; - int16_t r[8]{0, 105, -8, 35, 24, 13, 8, 100}; - int16_t minlr[8]{0, 1, -8, -5, 4, 3, -8, 100}; - int16_t maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 8; i++) - if (l[i] > r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] == r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = 
bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] < r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint16) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = uint16_t; + IntegralType l[8]{0, 1, 2, 5, 4, 3, 8, 5}; + IntegralType r[8]{0, 1, 8, 35, 24, 13, 8, 17}; + IntegralType minlr[8]{0, 1, 2, 5, 4, 3, 8, 5}; + IntegralType maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17}; + + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint16_t l[8]{0, 1, 2, 5, 4, 3, 8, 5}; - uint16_t r[8]{0, 1, 8, 35, 24, 13, 8, 17}; - uint16_t minlr[8]{0, 1, 2, 5, 4, 3, 8, 5}; - uint16_t maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 8; i++) - if (l[i] > r[i]) - expect |= 3 << i*2; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = 
bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] == r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] < r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int32) { + using IntegralType = int32_t; + IntegralType l[8]{0, 1, 2, -5}; + IntegralType r[8]{0, 105, -8, 54333}; + IntegralType minlr[8]{0, 1, -8, -5}; + IntegralType maxlr[8]{0, 105, 2, 54333}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](Proc::MaskType left, Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int32_t l[8]{0, 1, 2, -5}; - int32_t r[8]{0, 105, -8,54333}; - int32_t minlr[8]{0, 1, -8, -5}; 
- int32_t maxlr[8]{0, 105, 2, 54333}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint32) { + using IntegralType = uint32_t; + IntegralType l[4]{0, 1002, 2, 514}; + IntegralType r[4]{2, 1, 80555, 35}; + IntegralType minlr[8]{0, 1, 2, 35}; + IntegralType maxlr[8]{2, 1002, 80555, 514}; using Proc = typename 
SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint32_t l[4]{0, 1002, 2, 514}; - uint32_t r[4]{2, 1, 80555, 35}; - uint32_t minlr[8]{0, 1, 2, 35}; - uint32_t maxlr[8]{2, 1002, 80555, 514}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i 
* 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int64) { + using IntegralType = int64_t; + IntegralType l[2]{-5, 122020}; + IntegralType r[2]{0, 105}; + IntegralType minlr[8]{-5, 105}; + IntegralType maxlr[8]{0, 122020}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int64_t l[2]{-5, 122020}; - int64_t r[2]{0, 105}; - int64_t minlr[8]{-5, 105}; - int64_t maxlr[8]{0, 122020}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, 
min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint64) { + using IntegralType = uint64_t; + IntegralType l[2]{822, 1002}; + IntegralType r[2]{2, 1}; + IntegralType minlr[8]{2, 1}; + IntegralType maxlr[8]{822, 1002}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint64_t l[2]{822, 1002}; - uint64_t r[2]{2, 1}; - uint64_t minlr[8]{2, 1}; - uint64_t maxlr[8]{822, 1002}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), 
(simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Float64) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = float64_t; + IntegralType l[2]{-5.0, 12.5620}; + IntegralType r[2]{2.9, 1}; + IntegralType minlr[8]{-5.0, 1}; + IntegralType maxlr[8]{2.9, 12.5620}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 
0xFFFF; - simd::MT expect = 0x0; - float64_t l[2]{-5.0, 12.5620}; - float64_t r[2]{2.9, 1}; - float64_t minlr[8]{-5.0, 1}; - float64_t maxlr[8]{2.9, 12.5620}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Float32) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = float32_t; + 
IntegralType l[4]{82, 102, -5.6, 9.5}; + IntegralType r[4]{2.0, 1, -5.7, 6}; + IntegralType minlr[8]{2.0, 1, -5.7, 6}; + IntegralType maxlr[8]{82, 102, -5.6, 9.5}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - float32_t l[4]{82, 102,-5.6,9.5}; - float32_t r[4]{2.0, 1,-5.7,6}; - float32_t minlr[8]{2.0, 1, -5.7, 6}; - float32_t maxlr[8]{82, 102, -5.6, 9.5}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + 
EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } +#endif \ No newline at end of file diff --git a/utils/common/simd_arm.h b/utils/common/simd_arm.h index a4387561b..884dcb324 100644 --- a/utils/common/simd_arm.h +++ b/utils/common/simd_arm.h @@ -17,59 +17,35 @@ #pragma once +#include "simd_sse.h" // ENUM_KIND #ifdef __aarch64__ #include "arm_neon.h" #include #include #ifdef __OPTIMIZE__ -#define MCS_FORCE_INLINE __attribute__((__always_inline__)) inline +#define MCS_FORCE_INLINE __attribute__((__always_inline__)) #else #define MCS_FORCE_INLINE inline #endif #include "mcs_datatype.h" - namespace simd { // the type is decided by the basic type -using vi1_t =int8x16_t; -using vi2_t =int16x8_t; -using vi4_t =int32x4_t; -using vi8_t =int64x2_t; -using vi1u_t =uint8x16_t; -using vi2u_t =uint16x8_t; -using vi4u_t =uint32x4_t; -using vi8u_t =uint64x2_t; +using vi1_t = int8x16_t; +using vi2_t = int16x8_t; +using vi4_t = int32x4_t; +using vi8_t = int64x2_t; +using vi1u_t = uint8x16_t; +using vi2u_t = uint16x8_t; +using vi4u_t = uint32x4_t; +using vi8u_t = uint64x2_t; using vi128f_t = float32x4_t; using vi128d_t = float64x2_t; using int128_t = __int128; -using MT = uint16_t; -using MaskSimdType=vi1u_t; -template -static vi8_t constant2i() -{ - static const union - { - int64_t i[2]; - vi8_t xmm; - } u = {{i0, i1}}; - return u.xmm; -} -static inline MaskSimdType bitMaskToByteMask16(MT m) -{ - vi8_t sel = constant2i<(int64_t)0xffffffffffffffff, (int64_t)0x0>(); - vi8_t andop = constant2i<(int64_t)0x8040201008040201, 
(int64_t)0x8040201008040201>(); - vi1u_t op = vreinterpretq_u8_s64( - vandq_s64(vbslq_s64(vreinterpretq_u64_s64(sel), vreinterpretq_s64_u8(vdupq_n_u8(m & 0xff)), - vreinterpretq_s64_u8(vdupq_n_u8((m & 0xff00) >> 8))), - andop)); - vi1u_t zero = vdupq_n_u8(0); - return vcgtq_u8(op, zero); -} -//the type is used by the fun like arm__neon__mm__... -using ArmNeonSSEVecType=uint8x16_t; -//wrapper types +using MaskSimdType = vi1u_t; +// wrapper types struct vi1_wr { int8x16_t v; @@ -116,35 +92,35 @@ struct vi128d_wr float64x2_t v; }; -template +template struct WidthToSVecWrapperType; template <> struct WidthToSVecWrapperType<1> { - using Vectype=int8x16_t; - using WrapperType=struct vi1_wr; + using Vectype = int8x16_t; + using WrapperType = struct vi1_wr; }; template <> struct WidthToSVecWrapperType<2> { using Vectype = int16x8_t; - using WrapperType=struct vi2_wr; + using WrapperType = struct vi2_wr; }; template <> struct WidthToSVecWrapperType<4> { using Vectype = int32x4_t; - using WrapperType=struct vi4_wr; + using WrapperType = struct vi4_wr; }; template <> struct WidthToSVecWrapperType<8> { using Vectype = int64x2_t; - using WrapperType=struct vi8_wr; + using WrapperType = struct vi8_wr; }; template <> struct WidthToSVecWrapperType<16> @@ -183,17 +159,17 @@ struct WidthToVecWrapperType<8> using WrapperType = struct viu8_wr; }; -//We get the simd and wrapper type of basic type by TypeToVecWrapperType. -template +// We get the simd and wrapper type of basic type by TypeToVecWrapperType. 
+template struct TypeToVecWrapperType; template -struct TypeToVecWrapperType::value>::type> +struct TypeToVecWrapperType::value>::type> : WidthToSVecWrapperType { }; template -struct TypeToVecWrapperType>::type> +struct TypeToVecWrapperType>::type> { using Vectype = vi128f_t; using WrapperType = vi128f_wr; @@ -202,22 +178,20 @@ template struct TypeToVecWrapperType>::type> { using Vectype = vi128d_t; - using WrapperType = vi128d_wr; + using WrapperType = vi128d_wr; }; template -struct TypeToVecWrapperType >::type> - : WidthToVecWrapperType +struct TypeToVecWrapperType>::type> + : WidthToVecWrapperType { }; template - struct TypeToVecWrapperType< - T, typename std::enable_if &&!is_floating_point_v>::type> - : WidthToSVecWrapperType +struct TypeToVecWrapperType && !is_floating_point_v>::type> + : WidthToSVecWrapperType { }; - template struct IntegralToSIMD; @@ -264,78 +238,11 @@ struct StorageToFiltering:: using type = T; }; -// these are the x86 instructions that need to be realized by some arm neon instructions -// the implementations of mm_movemask_epi8 for each type are different because of performance - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_64(ArmNeonSSEVecType input) -{ - return static_cast(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8)); -} - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_32(ArmNeonSSEVecType input) -{ - input = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(input), 4)); - return static_cast(vgetq_lane_u8(input, 3) | ((int)vgetq_lane_u8(input, 11) << 8)); -} - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_16(ArmNeonSSEVecType input) -{ - input = vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(input), 14)); - input = vreinterpretq_u8_u32(vsraq_n_u32(vreinterpretq_u32_u8(input), vreinterpretq_u32_u8(input), 14)); - input = vreinterpretq_u8_u64(vsraq_n_u64(vreinterpretq_u64_u8(input), vreinterpretq_u64_u8(input), 28)); - return static_cast(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8)); -} 
-MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8(ArmNeonSSEVecType input) -{ - // Example input (half scale): - // 0x89 FF 1D C0 00 10 99 33 - - // Shift out everything but the sign bits - // 0x01 01 00 01 00 00 01 00 - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with vsra. The '??' bytes are garbage. - // vsri could also be used, but it is slightly slower on aarch64. - // 0x??03 ??02 ??00 ??01 - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - // Repeat with wider lanes. - // 0x??????0B ??????04 - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - // 0x??????????????4B - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - // Extract the low 8 bits from each lane and join. - // 0x4B - return static_cast(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); -} - - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_pd(ArmNeonSSEVecType a) -{ - uint64x2_t input = vreinterpretq_u64_u8(a); - uint64x2_t high_bits = vshrq_n_u64(input, 63); - return static_cast (vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1)); -} -MCS_FORCE_INLINE MT arm_neon_mm_movemask_ps(ArmNeonSSEVecType a) -{ - uint32x4_t input = vreinterpretq_u32_u8(a); - static const int32x4_t shift = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(input, 31); - return static_cast(vaddvq_u32(vshlq_u32(tmp, shift))); -} - -MCS_FORCE_INLINE void arm_neon_mm_maskmoveu_si128(ArmNeonSSEVecType a, ArmNeonSSEVecType mask, char* mem_addr) -{ - int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_u8(mask), 7); - float32x4_t b = vld1q_f32((float*)mem_addr); - int8x16_t masked = - vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_u8(a), vreinterpretq_s8_f32(b)); - vst1q_s8((int8_t*)mem_addr, masked); -} - template class SimdFilterProcessor; -// Dummy class that captures all impossible cases, e.g. 
integer vector as VT and flot as CHECK_T.we use int32_t to do operations +// Dummy class that captures all impossible cases, e.g. integer vector as VT and flot as CHECK_T.we use +// int32_t to do operations template class SimdFilterProcessor< VT, CHECK_T, @@ -352,6 +259,8 @@ class SimdFilterProcessor< using SimdType = int32x4_t; using FilterType = T; using StorageType = T; + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -363,14 +272,18 @@ class SimdFilterProcessor< { return vdupq_n_s32(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s32((uint32x4_t)mask, y,x); + return vbslq_s32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { return (SimdType)vcgtq_s32(x, y); } + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return vmaxvq_s32(x); + } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { return vandq_s32(x, y); @@ -387,7 +300,7 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpDummy(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); } // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) @@ -422,24 +335,28 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } + MCS_FORCE_INLINE 
SimdType min(SimdType x, SimdType y) { return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); @@ -458,17 +375,11 @@ class SimdFilterProcessor< { return vdupq_n_s32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s32(reinterpret_cast(dst), x); } - }; template @@ -485,9 +396,11 @@ class SimdFilterProcessor< using SimdType = simd::vi128d_t; using StorageSimdType = typename WidthToSVecWrapperType::Vectype; using StorageType = typename datatypes::WidthToSIntegralType::type; - using StorageWrapperTypeType =typename WidthToSVecWrapperType::WrapperType; + using StorageWrapperTypeType = typename WidthToSVecWrapperType::WrapperType; using StorageVecProcType = SimdFilterProcessor; constexpr static const uint16_t FilterMaskStep = sizeof(T); + using MT = uint64x2_t; + using MaskType = MT; // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -506,9 +419,9 @@ class SimdFilterProcessor< { return vld1q_f64(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_f64((uint64x2_t)mask,y,x); + return vbslq_f64(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -518,12 +431,12 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y)); + return vceqq_f64(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y)); + return vcgeq_f64(x, y); } MCS_FORCE_INLINE T maxScalar(SimdType x) { @@ -539,40 +452,42 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return 
arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y)); + return vcgtq_f64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y)); + return vcleq_f64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y)); + return vcltq_f64(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y) ^ 0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmpEq(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); } - - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask); + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -619,14 +534,22 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using FilterType = T; - using NullEmptySimdType =typename WidthToSVecWrapperType::Vectype; + using NullEmptySimdType = typename WidthToSVecWrapperType::Vectype; using SimdWrapperType = vi128f_wr; using SimdType = vi128f_t; using StorageSimdType = typename WidthToSVecWrapperType::Vectype; using StorageType = typename datatypes::WidthToSIntegralType::type; - using StorageWrapperTypeType =typename WidthToSVecWrapperType::WrapperType; + using StorageWrapperTypeType = typename WidthToSVecWrapperType::WrapperType; using StorageVecProcType = SimdFilterProcessor; constexpr static const uint16_t FilterMaskStep = sizeof(T); + using MT = uint32x4_t; + using MaskType = MT; + 
MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -634,9 +557,9 @@ class SimdFilterProcessor< // This spec borrows the expr from u-/int64 based proceesor class. return (SimdType)nullEmptyProcessor.loadValue(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_f32((uint32x4_t)mask, y,x); + return vbslq_f32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -667,48 +590,51 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y)); + return vceqq_f32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y)); + return vcgeq_f32(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y)); + return vcgtq_f32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y)); + return vcleq_f32(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y)); + return vcltq_f32(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y))); + return vmvnq_u32(vceqq_f32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return 
vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -762,10 +688,11 @@ class SimdFilterProcessor< constexpr static const uint16_t vecBitSize = 128U; using T = typename datatypes::WidthToSIntegralType::type; using SimdWrapperType = typename WidthToSVecWrapperType::WrapperType; - using SimdType =typename WidthToSVecWrapperType::Vectype; + using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint64x2_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -777,15 +704,22 @@ class SimdFilterProcessor< { return vdupq_n_s64(fill); } - + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return std::max(((int64_t*)(&x))[0], ((int64_t*)(&x))[1]); + } + MCS_FORCE_INLINE T minScalar(SimdType x) + { + return std::min(((int64_t*)(&x))[0], ((int64_t*)(&x))[1]); + } // Load from MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s64(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s64((uint64x2_t)mask, y,x); + return vbslq_s64(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -800,48 +734,56 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType) vcgeq_s64(x,y)); + return vcgeq_s64(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y)); + return 
vcgtq_s64(x, y); } MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y)); + return vceqq_s64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y)); + return vcleq_s64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y)); + return vcltq_s64(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y)^0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmpEq(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } // misc MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask); + return vmask; } MCS_FORCE_INLINE SimdType setToZero() @@ -853,24 +795,25 @@ class SimdFilterProcessor< { return cmpNe(x, y); } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(MaskType x, MaskType y) + { + return cmpNe(x, y); + } + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return vbslq_s64(vcgtq_s64(y,x), x, y); + return vbslq_s64(vcgtq_s64(y, x), x, y); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) { - return vbslq_s64(vcgtq_s64(x,y), x, y); + return vbslq_s64(vcgtq_s64(x, y), x, y); } MCS_FORCE_INLINE void store(char* dst, SimdType x) { @@ -892,7 +835,8 @@ class 
SimdFilterProcessor< using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint64x2_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -904,9 +848,9 @@ class SimdFilterProcessor< { return vdupq_n_u64(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u64((uint64x2_t)mask, y,x); + return vbslq_u64(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -924,30 +868,37 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y)); + return vcgeq_u64(x, y); + } + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return std::max(((uint64_t*)(&x))[0], ((uint64_t*)(&x))[1]); + } + MCS_FORCE_INLINE T minScalar(SimdType x) + { + return std::min(((uint64_t*)(&x))[0], ((uint64_t*)(&x))[1]); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y)); + return vcgtq_u64(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return vbslq_u64(vcgtq_u64(y,x), x, y); + return vbslq_u64(vcgtq_u64(y, x), x, y); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) { - return vbslq_u64(vcgtq_u64(x,y), x, y); + return vbslq_u64(vcgtq_u64(x, y), x, y); } MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)); + return vceqq_u64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y)); + return vcleq_u64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) @@ -957,25 +908,28 @@ class SimdFilterProcessor< MCS_FORCE_INLINE 
MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_u64(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask); + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u64(0); @@ -991,12 +945,6 @@ class SimdFilterProcessor< return cmpEq(x, y); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_u64(reinterpret_cast(dst), x); @@ -1012,12 +960,19 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using T = typename datatypes::WidthToSIntegralType::type; - using SimdWrapperType =typename WidthToSVecWrapperType::WrapperType; + using SimdWrapperType = typename WidthToSVecWrapperType::WrapperType; using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1037,9 +992,9 @@ class SimdFilterProcessor< { return vld1q_s32(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s32((uint32x4_t)mask, y,x); + return vbslq_s32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -1052,22 +1007,22 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType) vceqq_s32(x, y)); + return vceqq_s32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y)); + return vcgeq_s32(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y)); + return vcgtq_s32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y)); + return vcleq_s32(x, y); } MCS_FORCE_INLINE T minScalar(SimdType x) { @@ -1084,28 +1039,31 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y)); + return vcltq_s32(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF; + return vmvnq_u32(vceqq_s32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc 
- MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1123,12 +1081,6 @@ class SimdFilterProcessor< return vdupq_n_s32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s32(reinterpret_cast(dst), x); @@ -1149,8 +1101,15 @@ class SimdFilterProcessor< using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1170,9 +1129,9 @@ class SimdFilterProcessor< { return vld1q_u32(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u32((uint32x4_t)mask, y,x); + return vbslq_u32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -1185,12 +1144,12 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)); + return vceqq_u32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y)); + return vcgeq_u32(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, 
SimdType y) { @@ -1203,12 +1162,12 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y)); + return vcgtq_u32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y)); + return vcleq_u32(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) @@ -1218,12 +1177,12 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF; + return vmvnq_u32(vceqq_u32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE T minScalar(SimdType x) { @@ -1231,13 +1190,16 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1255,12 +1217,6 @@ class SimdFilterProcessor< return vdupq_n_u32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_u32(reinterpret_cast(dst), x); @@ -1280,8 +1236,16 @@ class SimdFilterProcessor< using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint16x8_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for 
little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint16x8_t{ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]}; + } + // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1305,11 +1269,11 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y)); + return vceqq_s16(x, y); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s16((uint16x8_t)mask, y,x); + return vbslq_s16(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1326,12 +1290,12 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y)); + return vcgeq_s16(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y)); + return vcgtq_s16(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { @@ -1344,33 +1308,36 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y)); + return vcleq_s16(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y)); + return vcltq_s16(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y) ^ 0xFFFF; + return vmvnq_u16(cmpEq(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u16(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u16(0xFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u16(0); } - // misc - MCS_FORCE_INLINE MT 
convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask); + return vdupq_n_u16(0xFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1388,12 +1355,6 @@ class SimdFilterProcessor< return vdupq_n_s16(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s16(reinterpret_cast(dst), x); @@ -1401,20 +1362,27 @@ class SimdFilterProcessor< }; template -class SimdFilterProcessor::value && - std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = uint16_t; + using T = uint16_t; using SimdWrapperType = typename WidthToVecWrapperType::WrapperType; using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint16x8_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint16x8_t{ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1454,16 +1422,16 @@ class SimdFilterProcessor(dst), x); @@ -1546,6 +1511,8 @@ class SimdFilterProcessor< using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; + using MT = uint8x16_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value @@ -1584,9 +1551,9 @@ class SimdFilterProcessor< { return vld1q_s8(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s8((uint8x16_t)mask, y, x); + return vbslq_s8(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1596,47 +1563,56 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)); + return vceqq_s8(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y)); + return vcgeq_s8(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y)); + return vcgtq_s8(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y)); + return vcleq_s8(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y)); + return vcltq_s8(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF; + return vmvnq_u8(vceqq_s8(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - 
return 0; + return vdupq_n_u8(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u8(0xff); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u8(0); + } + + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u8(0xff); } // misc MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vmask; } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1654,12 +1630,6 @@ class SimdFilterProcessor< return vdupq_n_s8(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s8(reinterpret_cast(dst), x); @@ -1675,11 +1645,12 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using T = uint8_t; - using SimdWrapperType =typename WidthToVecWrapperType::WrapperType; + using SimdWrapperType = typename WidthToVecWrapperType::WrapperType; using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint8x16_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -1705,9 +1676,9 @@ class SimdFilterProcessor< { return vld1q_u8(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u8((uint8x16_t)mask, y, x); + return vbslq_u8(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1721,17 +1692,17 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)); + 
return vceqq_u8(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y)); + return vcgeq_u8(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y)); + return vcgtq_u8(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { @@ -1744,34 +1715,37 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y)); + return vcleq_u8(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y)); + return vcltq_u8(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF; + return vmvnq_u8(vceqq_u8(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u8(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u8(0xff); } - - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vdupq_n_u8(0); + } + + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u8(0xff); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1779,6 +1753,11 @@ class SimdFilterProcessor< return cmpNe(x, y); } + // MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + // { + // return cmpNe(x, y); + // } + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); @@ -1788,19 +1767,12 @@ class SimdFilterProcessor< { return vdupq_n_u8(0); } - - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, 
SimdType x) { vst1q_u8(reinterpret_cast(dst), x); } }; -}; // namespace simd +}; // namespace simd #endif diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index edd75bcc6..4a8fb42df 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -27,7 +27,6 @@ enum ENUM_KIND KIND_TEXT }; // whitespace-trimmed and then compared as signed integers - #if defined(__x86_64__) #include @@ -54,6 +53,7 @@ using vi128f_t = __m128; using vi128d_t = __m128d; using int128_t = __int128; using MT = uint16_t; + // These ugly wrappers are used to allow to use __m128* as template class parameter argument struct vi128_wr { @@ -117,22 +117,32 @@ struct StorageToFiltering:: }; template -static inline vi128_t constant4i() { - static const union { - int i[4]; - vi128_t xmm; - } u = {{i0,i1,i2,i3}}; - return u.xmm; +static inline vi128_t constant4i() +{ + static const union + { + int i[4]; + vi128_t xmm; + } u = {{i0, i1, i2, i3}}; + return u.xmm; } -static inline vi128_t bitMaskToByteMask16(MT m) { +template +static inline vi128_t constant8i() +{ + static const union + { + int8_t i[16]; + vi128_t xmm; + } u = {{i0, i0, i1, i1, i2, i2, i3, i3, i4, i4, i5, i5, i6, i6, i7, i7}}; + return u.xmm; +} + +static inline vi128_t bitMaskToByteMask16(MT m) +{ vi128_t sel = _mm_set1_epi64x(0x8040201008040201); return _mm_cmpeq_epi8( - _mm_and_si128( - _mm_shuffle_epi8(_mm_cvtsi32_si128(m), - _mm_set_epi64x(0x0101010101010101, 0)), - sel), - sel); + _mm_and_si128(_mm_shuffle_epi8(_mm_cvtsi32_si128(m), _mm_set_epi64x(0x0101010101010101, 0)), sel), sel); } template @@ -155,6 +165,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -173,49 +184,49 @@ class SimdFilterProcessor< return _mm_loadu_si128(reinterpret_cast(from)); } - MCS_FORCE_INLINE MT cmpDummy(SimdType x, 
SimdType y) + MCS_FORCE_INLINE MaskType cmpDummy(SimdType x, SimdType y) { - return 0xFFFF; + return MaskType{0x0, 0x0}; } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); // ???? 
} // misc @@ -224,12 +235,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } @@ -274,6 +285,14 @@ class SimdFilterProcessor< { return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template @@ -290,6 +309,7 @@ class SimdFilterProcessor< using SimdType = simd::vi128d_t; using StorageSimdType = simd::vi128_t; using StorageType = typename datatypes::WidthToSIntegralType::type; + using MaskType = vi128_t; using StorageVecProcType = SimdFilterProcessor; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
@@ -314,44 +334,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y)); + return (MaskType)_mm_cmpeq_pd(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x, y)); + return (MaskType)_mm_cmpge_pd(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y)); + return (MaskType)_mm_cmpgt_pd(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y)); + return (MaskType)_mm_cmple_pd(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y)); + return (MaskType)_mm_cmplt_pd(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y)); + return (MaskType)_mm_cmpneq_pd(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } // misc @@ -360,7 +380,8 @@ class SimdFilterProcessor< return _mm_movemask_pd(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + // Maybe unused + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { 
StorageVecProcType nullEmptyProcessor; NullEmptySimdType* xAsIntVecPtr = reinterpret_cast(&x); @@ -369,7 +390,13 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + { + StorageVecProcType nullEmptyProcessor; + return nullEmptyProcessor.cmpNe(x, y); + } + + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -413,6 +440,14 @@ class SimdFilterProcessor< { return _mm_and_pd(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template @@ -428,10 +463,18 @@ class SimdFilterProcessor< using SimdType = vi128f_t; using StorageSimdType = simd::vi128_t; using StorageType = typename datatypes::WidthToSIntegralType::type; + using MaskType = vi128_t; using StorageVecProcType = SimdFilterProcessor; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const MaskType byteMaskVec = + constant4i<(int32_t)0x000000FF, (int32_t)0x0000FF00, (int32_t)0x00FF0000, (int32_t)0xFF000000>(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -452,44 +495,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_ps(x, y)); + return (MaskType)_mm_cmpeq_ps(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_ps(x, y)); + return (MaskType)_mm_cmpge_ps(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_ps(x, y)); + return (MaskType)_mm_cmpgt_ps(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmple_ps(x, y)); + return (MaskType)_mm_cmple_ps(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_ps(x, y)); + return (MaskType)_mm_cmplt_ps(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_ps(x, y)); + return (MaskType)_mm_cmpneq_ps(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return 
_mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } // misc @@ -498,7 +541,8 @@ class SimdFilterProcessor< return _mm_movemask_ps(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + // WIP maybe unused + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -508,7 +552,7 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -518,6 +562,12 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr); } + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + { + StorageVecProcType nullEmptyProcessor; + return nullEmptyProcessor.cmpNe(x, y); + } + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_ps(); @@ -552,12 +602,21 @@ class SimdFilterProcessor< { return _mm_and_ps(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -567,6 +626,7 @@ class SimdFilterProcessor -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -699,6 +768,7 @@ class SimdFilterProcessor(); + SimdType signVec = constant4i<0, (int32_t)0x80000000, 0, (int32_t)0x80000000>(); SimdType 
xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_movemask_epi8(_mm_cmpgt_epi64(xFlip, yFlip)); + return _mm_cmpgt_epi64(xFlip, yFlip); } - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)); + return _mm_cmpeq_epi64(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFFFFFFFFFFFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpGt(y, x); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi64(x, y) ^ loadValue(0xFFFFFFFFFFFFFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFFFFFFFFFFFFFF); } // misc @@ -774,12 +844,12 @@ class SimdFilterProcessor(); + SimdType signVec = constant4i<0, (int32_t)0x80000000, 0, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); return _mm_cmpgt_epi64(xFlip, yFlip); } - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return blend(x, y, cmpGtSimdType(x,y)); + return blend(x, y, cmpGt(x, y)); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { - return blend(x, y, cmpGtSimdType(y,x)); + return blend(x, y, cmpGt(y, x)); + } + MCS_FORCE_INLINE MaskType falseMask() const + { + return MaskType{0x0, 0x0}; 
+ } + MCS_FORCE_INLINE MaskType trueMask() const + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } }; template -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -837,9 +916,18 @@ class SimdFilterProcessor(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -858,44 +946,44 @@ class SimdFilterProcessor -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -969,9 +1066,17 @@ class SimdFilterProcessor(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -990,47 +1095,48 @@ class SimdFilterProcessor(); + SimdType signVec = + constant4i<(int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_movemask_epi8(_mm_cmpgt_epi32(xFlip, yFlip)); + return _mm_cmpgt_epi32(xFlip, yFlip); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFFFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpGt(y, x); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return 
_mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi32(x, y) ^ loadValue(0xFFFFFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFFFFFF); } // misc @@ -1039,12 +1145,12 @@ class SimdFilterProcessor(); + SimdType signVec = + constant4i<(int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); return _mm_cmpgt_epi32(xFlip, yFlip); @@ -1092,11 +1199,20 @@ class SimdFilterProcessor class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1106,9 +1222,45 @@ class SimdFilterProcessor< using SimdType = simd::vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // const CHECK_T value1 = inputArray[0]; + // const CHECK_T value2 = inputArray[1]; + // const CHECK_T value3 = inputArray[2]; + // const CHECK_T value4 = inputArray[3]; + // const CHECK_T value5 = inputArray[4]; + // const CHECK_T value6 = inputArray[5]; + // const CHECK_T value7 = inputArray[6]; + // const CHECK_T value8 = inputArray[7]; + // union + // { + // CHECK_T i[vecByteSize / sizeof(CHECK_T)]; + // vi128_t xmm; + // } u = {{value1, value2, value3, value4, value5, value6, value7, value8}}; + // return u.xmm; + // std::cout << " maskCtor ptr " << std::hex << (uint64_t)inputArray << " val " << *(int64_t*)inputArray + // << std::endl; + const SimdType byteMaskVec = constant8i<0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07>(); + // auto a1 = _mm_set1_epi64x(*(int64_t*)inputArray); + // auto a2 = _mm_shuffle_epi8(a1, byteMaskVec); + // { + // std::cout << " maskCtor ptr byteMaskVec " << std::hex << ((uint64_t*)(&byteMaskVec))[0] << " " + // << ((uint64_t*)(&byteMaskVec))[1] << std::endl; + // } + // { + // std::cout << " maskCtor ptr a1 " << std::hex << ((uint64_t*)(&a1))[0] << " " << ((uint64_t*)(&a1))[1] + // << std::endl; + // } + // { + // std::cout << " maskCtor ptr a2 " << std::hex << ((uint64_t*)(&a2))[0] << " " << ((uint64_t*)(&a2))[1] + // << std::endl; + // } + return _mm_shuffle_epi8(_mm_set1_epi64x(*(int64_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1127,44 +1279,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); + return _mm_cmpeq_epi16(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return cmpLt(x, y) ^ 0xFFFF; + return 
cmpLt(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y)); + return _mm_cmpgt_epi16(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmplt_epi16(x, y)); + return _mm_cmplt_epi16(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi16(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFF); } // misc @@ -1173,12 +1325,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1223,11 +1375,20 @@ class SimdFilterProcessor< { return _mm_max_epi16(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template -class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> +class 
SimdFilterProcessor::value && + std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1237,9 +1398,15 @@ class SimdFilterProcessor< using SimdType = simd::vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + const SimdType byteMaskVec = constant8i<0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07>(); + return _mm_shuffle_epi8(_mm_set1_epi64x(*(int64_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1258,45 +1425,65 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); + return _mm_cmpeq_epi16(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, maxOfTwo)); + SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned + return _mm_cmpeq_epi16(x, maxOfTwo); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + // MCS_FORCE_INLINE MaskType cmpGE(SimdType x, SimdType y) + // { + // SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned + // return _mm_cmpeq_epi16(x, maxOfTwo); + // } + + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return cmpGe(y, x) ^ 0xFFFF; + return cmpGe(y, x) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + // MCS_FORCE_INLINE MaskType cmpLE(SimdType x, SimdType y) + // { + // 
return cmpGE(y, x); + // } + + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return cmpGe(x, y) ^ 0xFFFF; + // auto a = cmpGe(x, y); + // uint64_t* aRef = (uint64_t*)&a; + // auto b = loadValue(0xFF); + // uint64_t* bRef = (uint64_t*)&b; + // auto c = cmpGe(x, y) ^ loadValue(0xFF); + // uint64_t* cRef = (uint64_t*)&c; + // std::cout << " cmpLt cmpGe " << std::hex << aRef[0] << " " << aRef[1] << " loadValue " << bRef[0] << " + // " + // << bRef[1] << " result " << cRef[0] << " " << cRef[1] << std::endl; + return cmpGe(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi16(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFF); } // misc @@ -1305,12 +1492,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1358,11 +1545,20 @@ class SimdFilterProcessor< { return _mm_max_epu16(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && 
std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1372,6 +1568,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); @@ -1393,44 +1590,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); + return _mm_cmpeq_epi8(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return cmpLt(x, y) ^ 0xFFFF; + return cmpLt(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); + return _mm_cmpgt_epi8(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); + return _mm_cmplt_epi8(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi8(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFF); } // permute 
@@ -1446,12 +1643,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1496,11 +1693,20 @@ class SimdFilterProcessor< { return _mm_max_epi8(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1510,6 +1716,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
constexpr static const uint16_t FilterMaskStep = sizeof(T); @@ -1531,45 +1738,45 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); + return _mm_cmpeq_epi8(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - SimdType maxOfTwo = _mm_max_epu8(x, y); // max(x, y), unsigned - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, maxOfTwo)); + SimdType maxOfTwo = _mm_max_epu8(x, y); // max(x, y), unsigned + return _mm_cmpeq_epi8(x, maxOfTwo); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return cmpGe(y, x) ^ 0xFFFF; + return cmpGe(y, x) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return cmpGe(x, y) ^ 0xFFFF; + return cmpGe(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi8(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFF); } // permute @@ -1585,12 +1792,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE 
MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1638,6 +1845,14 @@ class SimdFilterProcessor< { return _mm_max_epu8(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; } // namespace simd