MCOL-4809 This patch adds support for float data types filtering and scanning vectorization

2025-04-18 21:44:02 +03:00 · 2022-02-03 16:38:06 +00:00 · 2022-02-03 16:38:06 +00:00 · c79dfc4925
commit c79dfc4925
parent 5f948bce52
5 changed files with 596 additions and 174 deletions
--- a/primitives/linux-port/column.cpp
+++ b/primitives/linux-port/column.cpp
@@ -50,19 +50,8 @@ using namespace execplan;

 namespace
 {
-// WIP Move this
 using MT = uint16_t;

-// Column filtering is dispatched 4-way based on the column type,
-// which defines implementation of comparison operations for the column values
-enum ENUM_KIND
-{
-  KIND_DEFAULT,   // compared as signed integers
-  KIND_UNSIGNED,  // compared as unsigned integers
-  KIND_FLOAT,     // compared as floating-point numbers
-  KIND_TEXT
-};  // whitespace-trimmed and then compared as signed integers
-
 inline uint64_t order_swap(uint64_t x)
 {
  uint64_t ret = (x >> 56) | ((x << 40) & 0x00FF000000000000ULL) | ((x << 24) & 0x0000FF0000000000ULL) |
@@ -950,16 +939,16 @@ inline uint16_t vectWriteColValues(
    primitives::RIDType* ridDstArray,     // The actual dst arrray ptr to start writing RIDs
    primitives::RIDType* ridSrcArray)     // The actual src array ptr to read RIDs
 {
-  constexpr const uint16_t WIDTH = sizeof(T);
-  using SIMD_TYPE = typename VT::SIMD_TYPE;
-  SIMD_TYPE tmpStorageVector;
+  constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
+  using SimdType = typename VT::SimdType;
+  SimdType tmpStorageVector;
  T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
  // Saving values based on writeMask into tmp vec.
  // Min/Max processing.
  // The mask is 16 bit long and it describes N elements.
  // N = sizeof(vector type) / WIDTH.
  uint32_t j = 0;
-  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)
+  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
  {
    MT bitMapPosition = 1 << it;
    if (writeMask & bitMapPosition)
@@ -1016,16 +1005,16 @@ inline uint16_t vectWriteColValues(
    primitives::RIDType* ridDstArray,     // The actual dst arrray ptr to start writing RIDs
    primitives::RIDType* ridSrcArray)     // The actual src array ptr to read RIDs
 {
-  constexpr const uint16_t WIDTH = sizeof(T);
-  using SIMD_TYPE = typename VT::SIMD_TYPE;
-  SIMD_TYPE tmpStorageVector;
+  constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
+  using SimdType = typename VT::SimdType;
+  SimdType tmpStorageVector;
  T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
  // Saving values based on writeMask into tmp vec.
  // Min/Max processing.
  // The mask is 16 bit long and it describes N elements.
  // N = sizeof(vector type) / WIDTH.
  uint32_t j = 0;
-  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)
+  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
  {
    MT bitMapPosition = 1 << it;
    if (writeMask & bitMapPosition)
@@ -1064,13 +1053,13 @@ inline uint16_t vectWriteRIDValues(
    MT nonNullOrEmptyMask,                // SIMD intrinsics inverce bitmask for NULL/EMPTY values
    primitives::RIDType* ridSrcArray)     // The actual src array ptr to read RIDs
 {
-  constexpr const uint16_t WIDTH = sizeof(T);
+  constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
  primitives::RIDType* origRIDDstArray = ridDstArray;
  // Saving values based on writeMask into tmp vec.
  // Min/Max processing.
  // The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH.
  uint16_t j = 0;
-  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)
+  for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
  {
    MT bitMapPosition = 1 << it;
    if (writeMask & (1 << it))
@@ -1213,13 +1202,11 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray,
 {
  constexpr const uint16_t WIDTH = sizeof(T);
  constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;
-  using SIMD_TYPE = typename VT::SIMD_TYPE;
-  SIMD_TYPE result;
+  using SimdType = typename VT::SimdType;
+  SimdType result;
  T* resultTypedPtr = reinterpret_cast<T*>(&result);
  for (uint32_t i = 0; i < VECTOR_SIZE; ++i)
  {
-    // std::cout << " simdDataLoadTemplate ridArray[ridArrayOffset] " << (int8_t) origSrcArray[ridArray[i]] <<
-    // " ridArray[i] " << ridArray[i] << "\n";
    resultTypedPtr[i] = origSrcArray[ridArray[i]];
  }

@@ -1243,12 +1230,13 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
                         const T nullValue, T Min, T Max, const bool isNullValueMatches)
 {
  constexpr const uint16_t WIDTH = sizeof(T);
-  using SIMD_TYPE = typename VT::SIMD_TYPE;
-  using SIMD_WRAPPER_TYPE = typename VT::SIMD_WRAPPER_TYPE;
+  using SimdType = typename VT::SimdType;
+  using SimdWrapperType = typename VT::SimdWrapperType;
+  using FilterType = typename VT::FilterType;
  VT simdProcessor;
-  SIMD_TYPE dataVec;
-  SIMD_TYPE emptyFilterArgVec = simdProcessor.loadValue(emptyValue);
-  SIMD_TYPE nullFilterArgVec = simdProcessor.loadValue(nullValue);
+  SimdType dataVec;
+  SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue);
+  SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue);
  MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask;
  MT initFilterMask = 0xFFFF;
  primitives::RIDType rid = 0;
@@ -1262,18 +1250,16 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
  ColumnFilterMode columnFilterMode = ALWAYS_TRUE;
  const ST* filterSet = nullptr;
  const ParsedColumnFilter::RFsType* filterRFs = nullptr;
-
  uint8_t outputType = in->OutputType;
-
  constexpr uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;
  // If there are RIDs use its number to get a number of vectorized iterations.
  uint16_t iterNumber = HAS_INPUT_RIDS ? ridSize / VECTOR_SIZE : srcSize / VECTOR_SIZE;
  uint32_t filterCount = 0;
  // These pragmas are to silence GCC warnings
-  //  warning: ignoring attributes on template argument
+  // warning: ignoring attributes on template argument
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
-  std::vector<SIMD_TYPE> filterArgsVectors;
+  std::vector<SimdType> filterArgsVectors;
  auto ptrA = std::mem_fn(&VT::cmpEq);
  using COPType = decltype(ptrA);
  std::vector<COPType> copFunctorVec;
@@ -1314,11 +1300,18 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
      for (uint32_t j = 0; j < filterCount; ++j)
      {
        // Preload filter argument values only once.
-        filterArgsVectors[j] = simdProcessor.loadValue(filterValues[j]);
+        filterArgsVectors[j] = simdProcessor.loadValue(*((FilterType*)&filterValues[j]));
        switch (filterCOPs[j])
        {
-          case (COMPARE_EQ): copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); break;
+          case (COMPARE_EQ):
+            // Skipping extra filter pass generated by IS NULL
+            if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0)
+              copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq));
+            else
+              copFunctorVec.push_back(std::mem_fn(&VT::cmpEq));
+            break;
          case (COMPARE_GE): copFunctorVec.push_back(std::mem_fn(&VT::cmpGe)); break;
+
          case (COMPARE_GT): copFunctorVec.push_back(std::mem_fn(&VT::cmpGt)); break;
          case (COMPARE_LE): copFunctorVec.push_back(std::mem_fn(&VT::cmpLe)); break;
          case (COMPARE_LT): copFunctorVec.push_back(std::mem_fn(&VT::cmpLt)); break;
@@ -1344,14 +1337,13 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
  {
    primitives::RIDType ridOffset = i * VECTOR_SIZE;
    assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset));
-    dataVec = simdDataLoadTemplate<VT, SIMD_WRAPPER_TYPE, HAS_INPUT_RIDS, T>(simdProcessor, srcArray,
-                                                                             origSrcArray, ridArray, i)
+    dataVec = simdDataLoadTemplate<VT, SimdWrapperType, HAS_INPUT_RIDS, T>(simdProcessor, srcArray,
+                                                                           origSrcArray, ridArray, i)
                  .v;
-    // empty check
-    nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec);
+    nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec);
    writeMask = nonEmptyMask;
    // NULL check
-    nonNullMask = simdProcessor.cmpNe(dataVec, nullFilterArgVec);
+    nonNullMask = simdProcessor.nullEmptyCmpNe(dataVec, nullFilterArgVec);
    // Exclude NULLs from the resulting set if NULL doesn't match the filters.
    writeMask = isNullValueMatches ? writeMask : writeMask & nonNullMask;
    nonNullOrEmptyMask = nonNullMask & nonEmptyMask;
@@ -1397,6 +1389,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*

  // Set the number of output values here b/c tail processing can skip this operation.
  out->NVALS = totalValuesWritten;
+
  // Write captured Min/Max values to *out
  out->ValidMinMax = validMinMax;
  if (validMinMax)
@@ -1415,17 +1408,18 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
 }

 // This routine dispatches template function calls to reduce branching.
-template <typename T, ENUM_KIND KIND, typename FT, typename ST>
-void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray,
-                                   const uint32_t srcSize, uint16_t* ridArray, const uint16_t ridSize,
-                                   ParsedColumnFilter* parsedColumnFilter, const bool validMinMax,
-                                   const T emptyValue, const T nullValue, T Min, T Max,
+template <typename STORAGE_TYPE, ENUM_KIND KIND, typename FT, typename ST>
+void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out,
+                                   const STORAGE_TYPE* srcArray, const uint32_t srcSize, uint16_t* ridArray,
+                                   const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter,
+                                   const bool validMinMax, const STORAGE_TYPE emptyValue,
+                                   const STORAGE_TYPE nullValue, STORAGE_TYPE Min, STORAGE_TYPE Max,
                                   const bool isNullValueMatches)
 {
-  constexpr const uint8_t WIDTH = sizeof(T);
-  // TODO make a SFINAE template switch for the class template spec.
-  using SIMD_TYPE = simd::vi128_wr;
-  using VT = typename simd::SimdFilterProcessor<SIMD_TYPE, WIDTH>;
+  // Use trait structs to pick the SIMD wrapper type and the filter value type from STORAGE_TYPE and KIND.
+  using SimdType = typename simd::IntegralToSIMD<STORAGE_TYPE, KIND>::type;
+  using FilterType = typename simd::StorageToFiltering<STORAGE_TYPE, KIND>::type;
+  using VT = typename simd::SimdFilterProcessor<SimdType, FilterType>;
  bool hasInputRIDs = (in->NVALS > 0) ? true : false;
  if (hasInputRIDs)
  {
@@ -1433,22 +1427,22 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out
    switch (in->OutputType)
    {
      case OT_RID:
-        vectorizedFiltering<T, VT, hasInput, OT_RID, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_RID, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_BOTH:
-        vectorizedFiltering<T, VT, hasInput, OT_BOTH, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_BOTH, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_TOKEN:
-        vectorizedFiltering<T, VT, hasInput, OT_TOKEN, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_TOKEN, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_DATAVALUE:
-        vectorizedFiltering<T, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
@@ -1460,22 +1454,22 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out
    switch (in->OutputType)
    {
      case OT_RID:
-        vectorizedFiltering<T, VT, hasInput, OT_RID, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_RID, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_BOTH:
-        vectorizedFiltering<T, VT, hasInput, OT_BOTH, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_BOTH, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_TOKEN:
-        vectorizedFiltering<T, VT, hasInput, OT_TOKEN, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_TOKEN, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
      case OT_DATAVALUE:
-        vectorizedFiltering<T, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(
+        vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(
            in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue,
            nullValue, Min, Max, isNullValueMatches);
        break;
@@ -1539,8 +1533,8 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r
  // all values w/o any filter(even empty values filter) applied.

 #if defined(__x86_64__)
-  // Don't use vectorized filtering for non-integer based data types wider than 16 bytes.
-  if (KIND < KIND_FLOAT && WIDTH < 16)
+  // Don't use vectorized filtering for text-based data types or for values 16 bytes wide or wider.
+  if (KIND <= KIND_FLOAT && WIDTH < 16)
  {
    bool canUseFastFiltering = true;
    for (uint32_t i = 0; i < filterCount; ++i)
@@ -1601,7 +1595,6 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, Co
  auto dataType = (execplan::CalpontSystemCatalog::ColDataType)in->colType.DataType;
  if (dataType == execplan::CalpontSystemCatalog::FLOAT)
  {
-    // WIP make this inline function
    const uint16_t ridSize = in->NVALS;
    uint16_t* ridArray = in->getRIDArrayPtr(W);
    const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W;
@@ -1771,4 +1764,4 @@ template void primitives::PrimitiveProcessor::columnScanAndFilter<int128_t>(NewC
                                                                            ColResultHeader*);

 }  // namespace primitives
-// vim:ts=4 sw=4:
+// vim:ts=2 sw=2:
--- a/primitives/linux-port/primitiveprocessor.h
+++ b/primitives/linux-port/primitiveprocessor.h
@@ -167,6 +167,7 @@ class ParsedColumnFilter
  using RFsType = uint8_t;
  static constexpr uint32_t noSetFilterThreshold = 8;
  ColumnFilterMode columnFilterMode;
+  // Very unfortunately prestored_argVals can also be used to store double/float values.
  boost::shared_array<int64_t> prestored_argVals;
  boost::shared_array<int128_t> prestored_argVals128;
  boost::shared_array<CopsType> prestored_cops;
@@ -181,7 +182,7 @@ class ParsedColumnFilter
  template <typename T, typename std::enable_if<std::is_same<T, int64_t>::value, T>::type* = nullptr>
  T* getFilterVals()
  {
-    return prestored_argVals.get();
+    return reinterpret_cast<T*>(prestored_argVals.get());
  }

  template <typename T, typename std::enable_if<std::is_same<T, int128_t>::value, T>::type* = nullptr>
@@ -561,4 +562,4 @@ boost::shared_ptr<ParsedColumnFilter> _parseColumnFilter(

 }  // namespace primitives

-// vim:ts=4 sw=4:
+// vim:ts=2 sw=2:
--- a/tests/col_double_block.h
+++ b/tests/col_double_block.h
@@ -17,7 +17,7 @@

 #pragma once
 unsigned char ___bin_col_double_block_cdf[] = {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00,
+    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xfa, 0xff, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xfa, 0xff, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x0c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x40, 0x00, 0x00, 0x00, 0x00,
--- a/tests/primitives_column_scan_and_filter.cpp
+++ b/tests/primitives_column_scan_and_filter.cpp
@@ -428,7 +428,9 @@ TEST_F(ColumnScanFilterTest, ColumnScan4Bytes2Filters)
  ASSERT_EQ(out->NVALS, 9);

  for (i = 0; i < out->NVALS; i++)
+  {
    ASSERT_EQ(results[i], 11 + (uint32_t)i);
+  }

  EXPECT_EQ(out->Max, __col4block_cdf_umax);
  EXPECT_EQ(out->Min, __col4block_cdf_umin);
@@ -867,7 +869,7 @@ TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegFloat2CompFiltersOutputBoth)
 }

 // void p_Col_neg_double_1()
-TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegDouble2CompFilters)
+TEST_F(ColumnScanFilterTest, ColumnScan8BytesNegDouble2CompFilters)
 {
  constexpr const uint8_t W = 8;
  using IntegralType = double;
--- a/utils/common/simd_sse.h
+++ b/utils/common/simd_sse.h
@@ -36,437 +36,863 @@

 #include <mcs_datatype.h>

+// Column filtering is dispatched 4-way based on the column type,
+// which defines implementation of comparison operations for the column values
+enum ENUM_KIND
+{
+  KIND_DEFAULT,   // compared as signed integers
+  KIND_UNSIGNED,  // compared as unsigned integers
+  KIND_FLOAT,     // compared as floating-point numbers
+  KIND_TEXT
+};  // whitespace-trimmed and then compared as signed integers
+
 namespace simd
 {
 using vi128_t = __m128i;
-using msk128_t = uint16_t;
+using vi128f_t = __m128;
+using vi128d_t = __m128d;
 using int128_t = __int128;
 using MT = uint16_t;
-// This ugly wrapper used to allow to use __m128i as a template class parameter argument
+// These ugly wrappers allow the __m128* vector types to be used as template class parameter arguments.
 struct vi128_wr
 {
  __m128i v;
 };

-template <typename VT, int WIDTH>
-class SimdFilterProcessor
+struct vi128f_wr
 {
+  __m128 v;
 };

-template <>
-class SimdFilterProcessor<vi128_wr, 16>
+struct vi128d_wr
+{
+  __m128d v;
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct IntegralToSIMD;
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND,
+                      typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
+{
+  using type = vi128d_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND,
+                      typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = vi128f_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
+{
+  using type = vi128_wr;
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct StorageToFiltering;
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND,
+                          typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
+{
+  using type = double;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND,
+                          typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = float;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
+{
+  using type = T;
+};
+
+template <typename VT, typename T, typename ENABLE = void>
+class SimdFilterProcessor;
+
+// Dummy class that captures all impossible cases, e.g. integer vector as VT and float as CHECK_T.
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<(std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 16) ||
+                            (std::is_same<VT, vi128f_wr>::value && !std::is_same<CHECK_T, float>::value &&
+                             !std::is_same<CHECK_T, double>::value)>::type>
 {
  // This is a dummy class that is not currently used.
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
-  using T = int128_t;
-  using SIMD_WRAPPER_TYPE = simd::vi128_wr;
-  using SIMD_TYPE = simd::vi128_t;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = vi128_wr;
+  using SimdType = vi128_t;
+  using FilterType = T;
+  using StorageType = T;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
  // Load value
-  MCS_FORCE_INLINE vi128_t loadValue(const T fill)
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(&fill));
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(&fill));
  }

  // Load from
-  MCS_FORCE_INLINE vi128_t loadFrom(const char* from)
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from));
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
  }

-  MCS_FORCE_INLINE MT cmpDummy(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y)
  {
    return 0xFFFF;
  }
  // Compare
-  MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return cmpDummy(x, y);
  }

-  MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
  // misc
-  MCS_FORCE_INLINE uint16_t convertVectorToBitMask(vi128_t& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_epi8(vmask);
  }

-  MCS_FORCE_INLINE vi128_t setToZero()
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_si128();
  }

  // store
-  MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    _mm_maskmoveu_si128(x, vmask, dst);
  }

-  MCS_FORCE_INLINE void store(char* dst, vi128_t& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
-    _mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x);
+    _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
  }
 };

-template <>
-class SimdFilterProcessor<vi128_wr, 8>
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T,
+    typename std::enable_if<std::is_same<VT, vi128d_wr>::value && std::is_same<T, double>::value>::type>
 {
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
-  using T = datatypes::WidthToSIntegralType<8>::type;
-  using SIMD_WRAPPER_TYPE = simd::vi128_wr;
-  using SIMD_TYPE = simd::vi128_t;
+  using FilterType = T;
+  using NullEmptySimdType = vi128_t;
+  using SimdWrapperType = simd::vi128d_wr;
+  using SimdType = simd::vi128d_t;
+  using StorageSimdType = simd::vi128_t;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageVecProcType = SimdFilterProcessor<simd::vi128_wr, StorageType>;
+  // Mask calculation for int and float types differs.
+  // See corresponding intrinsics algos for details.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
  // Load value
-  MCS_FORCE_INLINE vi128_t loadValue(const T fill)
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expr from u-/int64 based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return _mm_set1_pd(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return _mm_loadu_pd(reinterpret_cast<const T*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return _mm_movemask_pd(vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expr from u-/int64 based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expr from u-/int64 based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return _mm_setzero_pd();
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    _mm_storeu_pd(reinterpret_cast<T*>(dst), x);
+  }
+};
+
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T, typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using FilterType = T;
+  using NullEmptySimdType = vi128_t;
+  using SimdWrapperType = vi128f_wr;
+  using SimdType = vi128f_t;
+  using StorageSimdType = simd::vi128_t;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageVecProcType = SimdFilterProcessor<simd::vi128_wr, StorageType>;
+  // Mask calculation for int and float types differs.
+  // See corresponding intrinsics algos for details.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expr from the same-width integer-based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return _mm_set1_ps(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return _mm_loadu_ps(reinterpret_cast<const T*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmple_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_ps(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return _mm_movemask_ps(vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expr from the same-width integer-based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expr from the same-width integer-based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return _mm_setzero_ps();
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    _mm_storeu_ps(reinterpret_cast<T*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>  // 128-bit (vi128_wr) filter processor for 8-byte CHECK_T except double.
+class SimdFilterProcessor<VT, CHECK_T,
+                          typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 8 &&
+                                                  !std::is_same<CHECK_T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = vi128_wr;
+  using SimdType = vi128_t;
+  using FilterType = T;
+  using StorageType = T;
+  // Bit stride between adjacent elements in a filter mask. Integer masks come from
+  // _mm_movemask_epi8 (one bit per byte), hence sizeof(T); float masks are built differently.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);  // Same broadcast as loadValue() in this integer specialization.
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
  {
    return _mm_set_epi64x(fill, fill);
  }

  // Load from
-  MCS_FORCE_INLINE vi128_t loadFrom(const char* from)
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from));
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
  }

  // Compare
-  MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi64(x, y), _mm_cmpeq_epi64(x, y)));
  }

-  MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpgt_epi64(x, y));
  }

-  MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y));
  }

-  MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return cmpGt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return cmpNe(x, y) ^ cmpGt(x, y);
  }

-  MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;  // Sixteen low mask bits set; both operands are ignored.
+  }
+
  // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_epi8(vmask);
  }

-  MCS_FORCE_INLINE vi128_t setToZero()
+  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_si128();
  }

+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);  // Delegates to the regular integer cmpNe.
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);  // Delegates to the regular integer cmpEq.
+  }
+
  // store
-  MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    _mm_maskmoveu_si128(x, vmask, dst);
  }

-  MCS_FORCE_INLINE void store(char* dst, vi128_t& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
-    _mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x);
+    _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
  }
 };

-template <>
-class SimdFilterProcessor<vi128_wr, 4>
+template <typename VT, typename CHECK_T>  // 128-bit (vi128_wr) filter processor for 4-byte CHECK_T except float.
+class SimdFilterProcessor<VT, CHECK_T,
+                          typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 4 &&
+                                                  !std::is_same<CHECK_T, float>::value>::type>
 {
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
-  using T = datatypes::WidthToSIntegralType<4>::type;
-  using SIMD_WRAPPER_TYPE = simd::vi128_wr;
-  using SIMD_TYPE = simd::vi128_t;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = vi128_wr;
+  using SimdType = vi128_t;
+  using FilterType = T;
+  using StorageType = T;
+  // Bit stride between adjacent elements in a filter mask. Integer masks come from
+  // _mm_movemask_epi8 (one bit per byte), hence sizeof(T); float masks are built differently.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
  // Load value
-  MCS_FORCE_INLINE vi128_t loadValue(const T fill)
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);  // Same broadcast as loadValue() in this integer specialization.
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
  {
    return _mm_set1_epi32(fill);
  }

  // Load from
-  MCS_FORCE_INLINE vi128_t loadFrom(const char* from)
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from));
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
  }

  // Compare
-  MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y));
  }

-  MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return cmpLt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpgt_epi32(x, y));
  }

-  MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return cmpGt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmplt_epi32(x, y));
  }

-  MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;  // Sixteen low mask bits set; both operands are ignored.
+  }
+
  // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_epi8(vmask);
  }

-  MCS_FORCE_INLINE vi128_t setToZero()
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);  // Delegates to the regular integer cmpNe.
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);  // Delegates to the regular integer cmpEq.
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_si128();
  }

  // store
-  MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    _mm_maskmoveu_si128(x, vmask, dst);
  }

-  MCS_FORCE_INLINE void store(char* dst, vi128_t& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
-    _mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x);
+    _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
  }
 };

-template <>
-class SimdFilterProcessor<vi128_wr, 2>
+template <typename VT, typename CHECK_T>  // 128-bit (vi128_wr) filter processor for 2-byte CHECK_T.
+class SimdFilterProcessor<
+    VT, CHECK_T, typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 2>::type>
 {
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
-  using T = datatypes::WidthToSIntegralType<2>::type;
-  using SIMD_WRAPPER_TYPE = simd::vi128_wr;
-  using SIMD_TYPE = simd::vi128_t;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = simd::vi128_wr;
+  using SimdType = simd::vi128_t;
+  using FilterType = T;
+  using StorageType = T;
+  // Bit stride between adjacent elements in a filter mask. Integer masks come from
+  // _mm_movemask_epi8 (one bit per byte), hence sizeof(T); float masks are built differently.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
  // Load value
-  MCS_FORCE_INLINE vi128_t loadValue(const T fill)
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);  // Same broadcast as loadValue() in this integer specialization.
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
  {
    return _mm_set1_epi16(fill);
  }

  // Load from
-  MCS_FORCE_INLINE vi128_t loadFrom(const char* from)
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from));
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
  }

  // Compare
-  MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y));
  }

-  MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return cmpLt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y));
  }

-  MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return cmpGt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmplt_epi16(x, y));
  }

-  MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;  // Sixteen low mask bits set; both operands are ignored.
+  }
+
  // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_epi8(vmask);
  }

-  MCS_FORCE_INLINE vi128_t setToZero()
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);  // Delegates to the regular integer cmpNe.
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);  // Delegates to the regular integer cmpEq.
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_si128();
  }

  // store
-  MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    _mm_maskmoveu_si128(x, vmask, dst);
  }

-  MCS_FORCE_INLINE void store(char* dst, vi128_t& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
-    _mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x);
+    _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
  }
 };

-template <>
-class SimdFilterProcessor<vi128_wr, 1>
+template <typename VT, typename CHECK_T>  // 128-bit (vi128_wr) filter processor for 1-byte CHECK_T.
+class SimdFilterProcessor<
+    VT, CHECK_T, typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 1>::type>
 {
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
-  using T = datatypes::WidthToSIntegralType<1>::type;
-  using SIMD_WRAPPER_TYPE = simd::vi128_wr;
-  using SIMD_TYPE = simd::vi128_t;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = vi128_wr;
+  using SimdType = vi128_t;
+  using FilterType = T;
+  using StorageType = T;
+  // Bit stride between adjacent elements in a filter mask. Integer masks come from
+  // _mm_movemask_epi8 (one bit per byte), hence sizeof(T); float masks are built differently.
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
  // Load value
-  MCS_FORCE_INLINE vi128_t loadValue(const T fill)
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);  // Same broadcast as loadValue() in this integer specialization.
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
  {
    return _mm_set1_epi8(fill);
  }

  // Load from
-  MCS_FORCE_INLINE vi128_t loadFrom(const char* from)
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
-    return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from));
+    return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
  }

  // Compare
-  MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
  }

-  MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return cmpLt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y));
  }

-  MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return cmpGt(x, y) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmplt_epi8(x, y));
  }

-  MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF;
  }

-  MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;  // Sixteen low mask bits set; both operands are ignored.
+  }
+
  // permute
  /* TODO Available in AVX-512
-      MCS_FORCE_INLINE vi128_t perm8Bits(vi128_t& x, vi128_t& idx)
+      MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx)
      {
        return _mm_permutexvar_epi8(x, idx);
      }
  */
  // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_epi8(vmask);
  }

-  MCS_FORCE_INLINE vi128_t setToZero()
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);  // Delegates to the regular integer cmpNe.
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);  // Delegates to the regular integer cmpEq.
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_si128();
  }

  // store
-  MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    _mm_maskmoveu_si128(x, vmask, dst);
  }

-  MCS_FORCE_INLINE void store(char* dst, vi128_t& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
-    _mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x);
+    _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
  }
 };

 }  // namespace simd

 #endif  // if defined(__x86_64__ )
-
 // vim:ts=2 sw=2: