From 4c0b8fd829586ad1fe838600f97670652322d03e Mon Sep 17 00:00:00 2001
From: NTH19 <3310288189@qq.com>
Date: Thu, 16 Jun 2022 12:08:42 +0800
Subject: [PATCH] SIMD support for ARM NEON

- unit tests pass for the SimdFilterProcessor classes
- add test cases
- implement a specific _mm_movemask emulation per element width
- change the float movemask
- renames
---
 tests/simd_processors.cpp | 375 ++++++++-
 utils/common/simd_arm.h   | 1503 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1874 insertions(+), 4 deletions(-)
 create mode 100644 utils/common/simd_arm.h

diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp
index 8883a8743..864c93071 100644
--- a/tests/simd_processors.cpp
+++ b/tests/simd_processors.cpp
@@ -15,14 +15,15 @@
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
    MA 02110-1301, USA. */
 
-#if defined(__x86_64__)
+
 #include <gtest/gtest.h>
 #include <vector>
-
-#include "simd_sse.h"
 #include "datatypes/mcs_datatype.h"
 #include "datatypes/mcs_int128.h"
 
+#if defined(__x86_64__)
+#include "simd_sse.h"
+
 using namespace std;
 
 template <typename T>
@@ -63,4 +64,370 @@ TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128)
   EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue);
   EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
 }
-#endif
\ No newline at end of file
+#endif
+#ifdef __aarch64__
+#include "simd_arm.h"
+
+using namespace std;
+
+template <typename T>
+class SimdProcessorTypedTest : public testing::Test
+{
+  using IntegralType = T;
+
+ public:
+  void SetUp() override
+  {
+  }
+};
+
+using SimdProcessor128TypedTestTypes =
+    ::testing::Types<int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t, int8_t, uint8_t>;
+TYPED_TEST_SUITE(SimdProcessorTypedTest, SimdProcessor128TypedTestTypes);
+
+TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<TypeParam>::WrapperType, TypeParam>;
+  using SimdType = typename Proc::SimdType;
+  constexpr static simd::MT allTrue = 0xFFFF;
+  constexpr static simd::MT allFalse = 0x0;
+  Proc proc;
+  SimdType lhs = proc.loadValue((TypeParam)-2);
+  SimdType rhs = proc.loadValue((TypeParam)-3);
+  EXPECT_GT((uint64_t)-2LL, (uint64_t)-3LL);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGe(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpEq(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allTrue);
+  lhs = proc.loadValue((TypeParam)-3);
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+
+  lhs = rhs = proc.loadValue((TypeParam)0);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpGe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpEq(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+}
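+// Every per-type test below derives its expected value the same way: a
+// comparison yields a 16-bit mask with one bit per byte of the vector, so a
+// lane of type T contributes sizeof(T) mask bits (1 bit for int8, 2 for
+// int16, 4 for int32, 8 for int64).  A hypothetical helper (illustration
+// only, not used by the tests) would capture the pattern:
+//
+//   template <typename T, size_t N, typename Cmp>
+//   simd::MT expectedMask(const T (&l)[N], const T (&r)[N], Cmp cmp)
+//   {
+//     simd::MT expect = 0;
+//     for (size_t i = 0; i < N; i++)
+//       if (cmp(l[i], r[i]))
+//         expect |= ((1 << sizeof(T)) - 1) << (i * sizeof(T));
+//     return expect;
+//   }
+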
+TEST(SimdProcessorTest, Int8)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int8_t>::WrapperType, int8_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
+  int8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 16; i++)
+    if (l[i] > r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] == r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] < r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint8)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint8_t>::WrapperType, uint8_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
+  uint8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 16; i++)
+    if (l[i] > r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] == r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] < r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Int16)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int16_t>::WrapperType, int16_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int16_t l[8]{0, 1, 2, -5, 4, 3, -8, 200};
+  int16_t r[8]{0, 105, -8, 35, 24, 13, 8};  // r[7] is zero-initialized
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 8; i++)
+    if (l[i] > r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] == r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] < r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint16)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint16_t>::WrapperType, uint16_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint16_t l[8]{0, 1, 2, 5, 4, 3, 8, 5};
+  uint16_t r[8]{0, 1, 8, 35, 24, 13, 8};  // r[7] is zero-initialized
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 8; i++)
+    if (l[i] > r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] == r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] < r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+
+TEST(SimdProcessorTest, Int32)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int32_t>::WrapperType, int32_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int32_t l[4]{0, 1, 2, -5};
+  int32_t r[4]{0, 105, -8, 54333};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint32)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint32_t>::WrapperType, uint32_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint32_t l[4]{0, 1002, 2, 514};
+  uint32_t r[4]{2, 1, 80555, 35};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Int64)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int64_t>::WrapperType, int64_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int64_t l[2]{-5, 122020};
+  int64_t r[2]{0, 105};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint64)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint64_t>::WrapperType, uint64_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint64_t l[2]{822, 1002};
+  uint64_t r[2]{2, 1};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Float64)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128d_wr, double>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  float64_t l[2]{-5.0, 12.5620};
+  float64_t r[2]{2.9, 1};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Float32)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128f_wr, float>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  float32_t l[4]{82, 102, -5.6, 9.5};
+  float32_t r[4]{2.0, 1, -5.7, 6};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
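+// A minimal round-trip sketch (hypothetical test added for illustration,
+// relying only on the processors exercised above): an all-equal comparison
+// must collapse to the all-true 16-bit mask, whatever the lane width.
+TEST(SimdProcessorTest, Float32MaskSemanticsSketch)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128f_wr, float>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  SimdType z = proc.setToZero();
+  EXPECT_EQ(proc.cmpEq(z, z), (simd::MT)0xFFFF);  // 0 == 0 in all 4 lanes
+  EXPECT_EQ(proc.cmpAlwaysTrue(z, z), (simd::MT)0xFFFF);
+  EXPECT_EQ(proc.cmpAlwaysFalse(z, z), (simd::MT)0x0);
+}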
+#endif

diff --git a/utils/common/simd_arm.h b/utils/common/simd_arm.h
new file mode 100644
index 000000000..96795800f
--- /dev/null
+++ b/utils/common/simd_arm.h
@@ -0,0 +1,1503 @@
+/* Copyright (C) 2021-2022 MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#pragma once
+
+#ifdef __aarch64__
+#include "arm_neon.h"
+#include <cstdint>
+#include <type_traits>
+
+#ifdef __OPTIMIZE__
+#define MCS_FORCE_INLINE __attribute__((__always_inline__))
+#else
+#define MCS_FORCE_INLINE inline
+#endif
+
+#include "mcs_datatype.h"
+
+// Column filtering is dispatched 4-way based on the column type,
+// which defines the implementation of comparison operations for the column values
+enum ENUM_KIND
+{
+  KIND_DEFAULT,   // compared as signed integers
+  KIND_UNSIGNED,  // compared as unsigned integers
+  KIND_FLOAT,     // compared as floating-point numbers
+  KIND_TEXT       // whitespace-trimmed and then compared as signed integers
+};
+
+namespace simd
+{
+// the vector type is decided by the width of the basic type
+using vi1_t = int8x16_t;
+using vi2_t = int16x8_t;
+using vi4_t = int32x4_t;
+using vi8_t = int64x2_t;
+using vi1u_t = uint8x16_t;
+using vi2u_t = uint16x8_t;
+using vi4u_t = uint32x4_t;
+using vi8u_t = uint64x2_t;
+using vi128f_t = float32x4_t;
+using vi128d_t = float64x2_t;
+using int128_t = __int128;
+using MT = uint16_t;
+// the type used by the arm_neon_mm_... emulation functions below
+using ArmNeonSSEVecType = uint8x16_t;
+
+// wrapper types
+struct vi1_wr
+{
+  int8x16_t v;
+};
+struct vi2_wr
+{
+  int16x8_t v;
+};
+struct vi4_wr
+{
+  int32x4_t v;
+};
+struct vi8_wr
+{
+  int64x2_t v;
+};
+struct viu1_wr
+{
+  uint8x16_t v;
+};
+struct viu2_wr
+{
+  uint16x8_t v;
+};
+struct viu4_wr
+{
+  uint32x4_t v;
+};
+struct viu8_wr
+{
+  uint64x2_t v;
+};
+struct vi128f_wr
+{
+  float32x4_t v;
+};
+struct vi128d_wr
+{
+  float64x2_t v;
+};
+
+template <int W>
+struct WidthToSVecWrapperType;
+
+template <>
+struct WidthToSVecWrapperType<1>
+{
+  using Vectype = int8x16_t;
+  using WrapperType = struct vi1_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<2>
+{
+  using Vectype = int16x8_t;
+  using WrapperType = struct vi2_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<4>
+{
+  using Vectype = int32x4_t;
+  using WrapperType = struct vi4_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<8>
+{
+  using Vectype = int64x2_t;
+  using WrapperType = struct vi8_wr;
+};
+
+template <int W>
+struct WidthToVecWrapperType;
+
+template <>
+struct WidthToVecWrapperType<1>
+{
+  using Vectype = uint8x16_t;
+  using WrapperType = struct viu1_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<2>
+{
+  using Vectype = uint16x8_t;
+  using WrapperType = struct viu2_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<4>
+{
+  using Vectype = uint32x4_t;
+  using WrapperType = struct viu4_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<8>
+{
+  using Vectype = uint64x2_t;
+  using WrapperType = struct viu8_wr;
+};
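+
+// For illustration (hypothetical checks, not part of the original header),
+// the width traits resolve like this:
+static_assert(std::is_same<WidthToSVecWrapperType<4>::Vectype, int32x4_t>::value,
+              "signed width 4 maps to int32x4_t");
+static_assert(std::is_same<WidthToVecWrapperType<2>::Vectype, uint16x8_t>::value,
+              "unsigned width 2 maps to uint16x8_t");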
+
+// We get the vector and wrapper types of a basic type through TypeToVecWrapperType.
+template <typename T, typename ENABLE = void>
+struct TypeToVecWrapperType;
+
+template <typename T>
+struct TypeToVecWrapperType<T, typename std::enable_if<std::is_unsigned<T>::value>::type>
+ : WidthToVecWrapperType<sizeof(T)>
+{
+};
+
+template <typename T>
+struct TypeToVecWrapperType<T, typename std::enable_if<std::is_signed<T>::value>::type>
+ : WidthToSVecWrapperType<sizeof(T)>
+{
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct IntegralToSIMD;
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
+{
+  using type = vi128d_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = vi128f_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 1>::type>
+{
+  using type = vi1_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 2>::type>
+{
+  using type = vi2_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 4>::type>
+{
+  using type = vi4_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 8>::type>
+{
+  using type = vi8_wr;
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct StorageToFiltering;
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = float;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
+{
+  using type = T;
+};
+
+// These are the x86 instructions that have to be emulated with ARM NEON
+// instructions.  The _mm_movemask_epi8 implementations differ per element
+// width for performance: when every W-byte lane of the input is homogeneous
+// (all 0x00 or all 0xFF, as the NEON compare intrinsics produce), it is
+// enough to sample one byte per group instead of packing all 16 sign bits.
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_64(ArmNeonSSEVecType input)
+{
+  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_32(ArmNeonSSEVecType input)
+{
+  input = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(input), 4));
+  return static_cast<MT>(vgetq_lane_u8(input, 3) | ((int)vgetq_lane_u8(input, 11) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_16(ArmNeonSSEVecType input)
+{
+  input = vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(input), 14));
+  input = vreinterpretq_u8_u32(vsraq_n_u32(vreinterpretq_u32_u8(input), vreinterpretq_u32_u8(input), 14));
+  input = vreinterpretq_u8_u64(vsraq_n_u64(vreinterpretq_u64_u8(input), vreinterpretq_u64_u8(input), 28));
+  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
+}
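+
+// For reference, the generic semantics that all of the variants above reduce
+// to (a scalar sketch added for illustration; the general helper below does
+// the same with shifts): bit i of the result is the most significant bit of
+// byte i of the register, exactly like SSE2 _mm_movemask_epi8.
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_scalar(ArmNeonSSEVecType input)
+{
+  uint8_t bytes[16];
+  vst1q_u8(bytes, input);  // spill the vector to memory
+  MT mask = 0;
+  for (int i = 0; i < 16; i++)
+    mask |= static_cast<MT>((bytes[i] >> 7) << i);  // collect each sign bit
+  return mask;
+}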
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8(ArmNeonSSEVecType input)
+{
+  // Example input (half scale):
+  // 0x89 FF 1D C0 00 10 99 33
+
+  // Shift out everything but the sign bits
+  // 0x01 01 00 01 00 00 01 00
+  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+  // Merge the even lanes together with vsra. The '??' bytes are garbage.
+  // vsri could also be used, but it is slightly slower on aarch64.
+  // 0x??03 ??02 ??00 ??01
+  uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+  // Repeat with wider lanes.
+  // 0x??????0B ??????04
+  uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+  // 0x??????????????4B
+  uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+  // Extract the low 8 bits from each half and join.
+  // 0x4B
+  return static_cast<MT>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_pd(ArmNeonSSEVecType a)
+{
+  uint64x2_t input = vreinterpretq_u64_u8(a);
+  uint64x2_t high_bits = vshrq_n_u64(input, 63);
+  return static_cast<MT>(vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_ps(ArmNeonSSEVecType a)
+{
+  uint32x4_t input = vreinterpretq_u32_u8(a);
+  static const int32x4_t shift = {0, 1, 2, 3};
+  uint32x4_t tmp = vshrq_n_u32(input, 31);
+  return static_cast<MT>(vaddvq_u32(vshlq_u32(tmp, shift)));
+}
+
+MCS_FORCE_INLINE void arm_neon_mm_maskmoveu_si128(ArmNeonSSEVecType a, ArmNeonSSEVecType mask, char* mem_addr)
+{
+  int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_u8(mask), 7);
+  float32x4_t b = vld1q_f32((float*)mem_addr);
+  int8x16_t masked =
+      vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_u8(a), vreinterpretq_s8_f32(b));
+  vst1q_s8((int8_t*)mem_addr, masked);
+}
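+
+// arm_neon_mm_maskmoveu_si128 mirrors SSE2 _mm_maskmoveu_si128: bytes of `a`
+// whose mask byte has its MSB set are written to mem_addr, while the other
+// bytes keep their previous memory contents (emulated here as a
+// load-blend-store rather than a non-temporal masked store).  A usage sketch
+// with hypothetical values:
+//
+//   char buf[16] = {0};
+//   uint8x16_t data = vdupq_n_u8(0x55);
+//   uint8x16_t mask = vdupq_n_u8(0x80);            // MSB set in every byte
+//   arm_neon_mm_maskmoveu_si128(data, mask, buf);  // buf becomes all 0x55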
+
+template <typename VT, typename T, typename ENABLE = void>
+class SimdFilterProcessor;
+
+// Dummy class that captures all impossible cases, e.g. a float-typed vector
+// as VT with a non-float CHECK_T; we use int32_t to do the operations.
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<(std::is_same<VT, vi128f_wr>::value && !std::is_same<CHECK_T, float>::value &&
+                             !std::is_same<CHECK_T, double>::value)>::type>
+{
+  // This is a dummy class that is not currently used.
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = int32_t;
+  using SimdWrapperType = vi4_wr;
+  using SimdType = int32x4_t;
+  using FilterType = T;
+  using StorageType = T;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s32(reinterpret_cast<const int32_t*>(from));
+  }
+
+  MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
+  }
+};
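+
+// Dispatch works purely on the VT/CHECK_T pair: each specialization below
+// enables itself for one wrapper/type combination.  For example (an
+// illustration of the resolution, not extra functionality),
+//
+//   SimdFilterProcessor<TypeToVecWrapperType<int32_t>::WrapperType, int32_t>
+//
+// resolves vi4_wr + int32_t to the 32-bit signed specialization further down.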
+
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T,
+    typename std::enable_if<std::is_same<VT, vi128d_wr>::value && std::is_same<T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using FilterType = T;
+  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using SimdWrapperType = simd::vi128d_wr;
+  using SimdType = simd::vi128d_t;
+  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_f64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_f64(reinterpret_cast<const double*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_f64(0);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_f64(reinterpret_cast<double*>(dst), x);
+  }
+};
+
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T, typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using FilterType = T;
+  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using SimdWrapperType = vi128f_wr;
+  using SimdType = vi128f_t;
+  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_f32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_f32(reinterpret_cast<const float*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y)));
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_f32(0);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_f32(reinterpret_cast<float*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi8_wr>::value && std::is_same<CHECK_T, int64_t>::value &&
+                            !std::is_same<CHECK_T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s64(reinterpret_cast<const int64_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s64(0);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s64(reinterpret_cast<int64_t*>(dst), x);
+  }
+};
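+
+// Note on cmpNe above: AArch64 NEON has no 64-bit vector NOT of a compare
+// result (there is no vmvnq_u64 intrinsic), so inverting the already
+// collapsed 16-bit mask with `^ 0xFFFF` is the cheaper option.  For example
+// (hypothetical lanes), if lane 0 compares equal and lane 1 does not, cmpEq
+// yields 0x00FF and cmpNe yields 0xFF00.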
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu8_wr>::value && std::is_same<CHECK_T, uint64_t>::value &&
+                            !std::is_same<CHECK_T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint64_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u64(reinterpret_cast<const uint64_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpGt(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u64(0);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u64(reinterpret_cast<uint64_t*>(dst), x);
+  }
+};
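+
+// Unlike SSE2, which only provides signed integer compares (unsigned
+// comparisons there need a sign-bias or min/max trick), NEON offers native
+// unsigned compares such as vcgtq_u64/vcgeq_u64, so the unsigned processors
+// map one-to-one onto intrinsics; cmpLt simply reuses cmpGt with swapped
+// operands.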
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi4_wr>::value && std::is_same<CHECK_T, int32_t>::value &&
+                            !std::is_same<CHECK_T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s32(reinterpret_cast<const int32_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu4_wr>::value && std::is_same<CHECK_T, uint32_t>::value &&
+                            !std::is_same<CHECK_T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint32_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u32(reinterpret_cast<const uint32_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpGt(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u32(reinterpret_cast<uint32_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi2_wr>::value && std::is_same<CHECK_T, int16_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s16(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s16(reinterpret_cast<const int16_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s16(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s16(reinterpret_cast<int16_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu2_wr>::value && std::is_same<CHECK_T, uint16_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint16_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u16(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u16(reinterpret_cast<const uint16_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return cmpGe(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u16(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u16(reinterpret_cast<uint16_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi1_wr>::value && std::is_same<CHECK_T, int8_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s8(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s8(reinterpret_cast<const int8_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType
setToZero()
+  {
+    return vdupq_n_s8(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s8(reinterpret_cast<int8_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu1_wr>::value && std::is_same<CHECK_T, uint8_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint8_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u8(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u8(reinterpret_cast<const uint8_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u8(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u8(reinterpret_cast<uint8_t*>(dst), x);
+  }
+};
+
+};  // namespace simd
+
+#endif
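
For reference, a minimal caller-side sketch of the new header (a hypothetical
snippet mirroring how the tests instantiate the processors; it makes no claim
beyond what the patch defines):

    #include "simd_arm.h"

    int32_t column[4]{1, 2, 3, 4};
    using Proc = simd::SimdFilterProcessor<
        simd::TypeToVecWrapperType<int32_t>::WrapperType, int32_t>;
    Proc proc;
    Proc::SimdType values = proc.loadFrom(reinterpret_cast<char*>(column));
    Proc::SimdType pivot = proc.loadValue(2);
    simd::MT gtMask = proc.cmpGt(values, pivot);  // 0xFF00: lanes 2 and 3 match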