From 4c0b8fd829586ad1fe838600f97670652322d03e Mon Sep 17 00:00:00 2001
From: NTH19 <3310288189@qq.com>
Date: Thu, 16 Jun 2022 12:08:42 +0800
Subject: [PATCH] SIMD support for ARM NEON

- unit tests pass for the SimdFilterProcessor classes
- add test cases
- implement a specific _mm_movemask emulation per element width
- change the float movemask
- renames
---
 tests/simd_processors.cpp | 375 ++++++++-
 utils/common/simd_arm.h   | 1503 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1874 insertions(+), 4 deletions(-)
 create mode 100644 utils/common/simd_arm.h

diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp
index 8883a8743..864c93071 100644
--- a/tests/simd_processors.cpp
+++ b/tests/simd_processors.cpp
@@ -15,14 +15,15 @@
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
    MA 02110-1301, USA. */
 
-#if defined(__x86_64__)
+
 #include <gtest/gtest.h>
 #include <vector>
-
-#include "simd_sse.h"
 #include "datatypes/mcs_datatype.h"
 #include "datatypes/mcs_int128.h"
 
+#if defined(__x86_64__)
+#include "simd_sse.h"
+
 using namespace std;
 
 template <typename T>
@@ -63,4 +64,370 @@ TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128)
   EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue);
   EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
 }
-#endif
\ No newline at end of file
+#endif
+#ifdef __aarch64__
+#include "simd_arm.h"
+
+using namespace std;
+
+template <typename T>
+class SimdProcessorTypedTest : public testing::Test
+{
+  using IntegralType = T;
+
+ public:
+  void SetUp() override
+  {
+  }
+};
+
+using SimdProcessor128TypedTestTypes =
+    ::testing::Types<int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t, int8_t, uint8_t>;
+TYPED_TEST_SUITE(SimdProcessorTypedTest, SimdProcessor128TypedTestTypes);
+
+TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<TypeParam>::WrapperType, TypeParam>;
+  using SimdType = typename Proc::SimdType;
+  constexpr static simd::MT allTrue = 0xFFFF;
+  constexpr static simd::MT allFalse = 0x0;
+  Proc proc;
+  SimdType lhs = proc.loadValue((TypeParam)-2);
+  SimdType rhs = proc.loadValue((TypeParam)-3);
+  EXPECT_GT((uint64_t)-2LL, (uint64_t)-3LL);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGe(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpEq(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allTrue);
+  lhs = proc.loadValue((TypeParam)-3);
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+
+  lhs = rhs = proc.loadValue((TypeParam)0);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpGe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(rhs, lhs), allFalse);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), allTrue);
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse);
+  EXPECT_EQ(proc.cmpEq(rhs, lhs), allTrue);
+  EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse);
+}
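+// Every per-type test below derives its expected value the same way: a
+// comparison yields a 16-bit mask with one bit per byte of the vector, so a
+// lane of type T contributes sizeof(T) mask bits (1 bit for int8, 2 for
+// int16, 4 for int32, 8 for int64).  A hypothetical helper (illustration
+// only, not used by the tests) would capture the pattern:
+//
+//   template <typename T, size_t N, typename Cmp>
+//   simd::MT expectedMask(const T (&l)[N], const T (&r)[N], Cmp cmp)
+//   {
+//     simd::MT expect = 0;
+//     for (size_t i = 0; i < N; i++)
+//       if (cmp(l[i], r[i]))
+//         expect |= ((1 << sizeof(T)) - 1) << (i * sizeof(T));
+//     return expect;
+//   }
+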
+TEST(SimdProcessorTest, Int8)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int8_t>::WrapperType, int8_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5};
+  int8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 16; i++)
+    if (l[i] > r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] == r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] < r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint8)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint8_t>::WrapperType, uint8_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5};
+  uint8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 16; i++)
+    if (l[i] > r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] == r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 16; i++)
+    if (l[i] < r[i])
+      expect |= 1 << i;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Int16)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int16_t>::WrapperType, int16_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int16_t l[8]{0, 1, 2, -5, 4, 3, -8, 200};
+  int16_t r[8]{0, 105, -8, 35, 24, 13, 8};  // r[7] is zero-initialized
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 8; i++)
+    if (l[i] > r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] == r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] < r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint16)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint16_t>::WrapperType, uint16_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint16_t l[8]{0, 1, 2, 5, 4, 3, 8, 5};
+  uint16_t r[8]{0, 1, 8, 35, 24, 13, 8};  // r[7] is zero-initialized
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 8; i++)
+    if (l[i] > r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] == r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 8; i++)
+    if (l[i] < r[i])
+      expect |= 3 << i * 2;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+
+TEST(SimdProcessorTest, Int32)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int32_t>::WrapperType, int32_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int32_t l[4]{0, 1, 2, -5};
+  int32_t r[4]{0, 105, -8, 54333};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint32)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint32_t>::WrapperType, uint32_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint32_t l[4]{0, 1002, 2, 514};
+  uint32_t r[4]{2, 1, 80555, 35};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Int64)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<int64_t>::WrapperType, int64_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  int64_t l[2]{-5, 122020};
+  int64_t r[2]{0, 105};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Uint64)
+{
+  using Proc = typename simd::SimdFilterProcessor<
+      typename simd::TypeToVecWrapperType<uint64_t>::WrapperType, uint64_t>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  uint64_t l[2]{822, 1002};
+  uint64_t r[2]{2, 1};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Float64)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128d_wr, double>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  float64_t l[2]{-5.0, 12.5620};
+  float64_t r[2]{2.9, 1};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 2; i++)
+    if (l[i] > r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] == r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 2; i++)
+    if (l[i] < r[i])
+      expect |= 0xFF << i * 8;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
+TEST(SimdProcessorTest, Float32)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128f_wr, float>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  simd::MT expect = 0x0;
+  float32_t l[4]{82, 102, -5.6, 9.5};
+  float32_t r[4]{2.0, 1, -5.7, 6};
+  SimdType lhs = proc.loadFrom(reinterpret_cast<char*>(l));
+  SimdType rhs = proc.loadFrom(reinterpret_cast<char*>(r));
+  for (int i = 0; i < 4; i++)
+    if (l[i] > r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpGt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] == r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpEq(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect);
+
+  expect = 0x0;
+  for (int i = 0; i < 4; i++)
+    if (l[i] < r[i])
+      expect |= 15 << i * 4;
+  EXPECT_EQ(proc.cmpLt(lhs, rhs), expect);
+  EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect);
+}
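+// A minimal round-trip sketch (hypothetical test added for illustration,
+// relying only on the processors exercised above): an all-equal comparison
+// must collapse to the all-true 16-bit mask, whatever the lane width.
+TEST(SimdProcessorTest, Float32MaskSemanticsSketch)
+{
+  using Proc = typename simd::SimdFilterProcessor<simd::vi128f_wr, float>;
+  using SimdType = typename Proc::SimdType;
+  Proc proc;
+  SimdType z = proc.setToZero();
+  EXPECT_EQ(proc.cmpEq(z, z), (simd::MT)0xFFFF);  // 0 == 0 in all 4 lanes
+  EXPECT_EQ(proc.cmpAlwaysTrue(z, z), (simd::MT)0xFFFF);
+  EXPECT_EQ(proc.cmpAlwaysFalse(z, z), (simd::MT)0x0);
+}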
+#endif

diff --git a/utils/common/simd_arm.h b/utils/common/simd_arm.h
new file mode 100644
index 000000000..96795800f
--- /dev/null
+++ b/utils/common/simd_arm.h
@@ -0,0 +1,1503 @@
+/* Copyright (C) 2021-2022 MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#pragma once
+
+#ifdef __aarch64__
+#include "arm_neon.h"
+#include <cstdint>
+#include <type_traits>
+
+#ifdef __OPTIMIZE__
+#define MCS_FORCE_INLINE __attribute__((__always_inline__))
+#else
+#define MCS_FORCE_INLINE inline
+#endif
+
+#include "mcs_datatype.h"
+
+// Column filtering is dispatched 4-way based on the column type,
+// which defines the implementation of comparison operations for the column values
+enum ENUM_KIND
+{
+  KIND_DEFAULT,   // compared as signed integers
+  KIND_UNSIGNED,  // compared as unsigned integers
+  KIND_FLOAT,     // compared as floating-point numbers
+  KIND_TEXT       // whitespace-trimmed and then compared as signed integers
+};
+
+namespace simd
+{
+// the vector type is decided by the width of the basic type
+using vi1_t = int8x16_t;
+using vi2_t = int16x8_t;
+using vi4_t = int32x4_t;
+using vi8_t = int64x2_t;
+using vi1u_t = uint8x16_t;
+using vi2u_t = uint16x8_t;
+using vi4u_t = uint32x4_t;
+using vi8u_t = uint64x2_t;
+using vi128f_t = float32x4_t;
+using vi128d_t = float64x2_t;
+using int128_t = __int128;
+using MT = uint16_t;
+// the type used by the arm_neon_mm_... emulation functions below
+using ArmNeonSSEVecType = uint8x16_t;
+
+// wrapper types
+struct vi1_wr
+{
+  int8x16_t v;
+};
+struct vi2_wr
+{
+  int16x8_t v;
+};
+struct vi4_wr
+{
+  int32x4_t v;
+};
+struct vi8_wr
+{
+  int64x2_t v;
+};
+struct viu1_wr
+{
+  uint8x16_t v;
+};
+struct viu2_wr
+{
+  uint16x8_t v;
+};
+struct viu4_wr
+{
+  uint32x4_t v;
+};
+struct viu8_wr
+{
+  uint64x2_t v;
+};
+struct vi128f_wr
+{
+  float32x4_t v;
+};
+struct vi128d_wr
+{
+  float64x2_t v;
+};
+
+template <int W>
+struct WidthToSVecWrapperType;
+
+template <>
+struct WidthToSVecWrapperType<1>
+{
+  using Vectype = int8x16_t;
+  using WrapperType = struct vi1_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<2>
+{
+  using Vectype = int16x8_t;
+  using WrapperType = struct vi2_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<4>
+{
+  using Vectype = int32x4_t;
+  using WrapperType = struct vi4_wr;
+};
+
+template <>
+struct WidthToSVecWrapperType<8>
+{
+  using Vectype = int64x2_t;
+  using WrapperType = struct vi8_wr;
+};
+
+template <int W>
+struct WidthToVecWrapperType;
+
+template <>
+struct WidthToVecWrapperType<1>
+{
+  using Vectype = uint8x16_t;
+  using WrapperType = struct viu1_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<2>
+{
+  using Vectype = uint16x8_t;
+  using WrapperType = struct viu2_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<4>
+{
+  using Vectype = uint32x4_t;
+  using WrapperType = struct viu4_wr;
+};
+
+template <>
+struct WidthToVecWrapperType<8>
+{
+  using Vectype = uint64x2_t;
+  using WrapperType = struct viu8_wr;
+};
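+
+// For illustration (hypothetical checks, not part of the original header),
+// the width traits resolve like this:
+static_assert(std::is_same<WidthToSVecWrapperType<4>::Vectype, int32x4_t>::value,
+              "signed width 4 maps to int32x4_t");
+static_assert(std::is_same<WidthToVecWrapperType<2>::Vectype, uint16x8_t>::value,
+              "unsigned width 2 maps to uint16x8_t");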
+
+// We get the vector and wrapper types of a basic type through TypeToVecWrapperType.
+template <typename T, typename ENABLE = void>
+struct TypeToVecWrapperType;
+
+template <typename T>
+struct TypeToVecWrapperType<T, typename std::enable_if<std::is_unsigned<T>::value>::type>
+ : WidthToVecWrapperType<sizeof(T)>
+{
+};
+
+template <typename T>
+struct TypeToVecWrapperType<T, typename std::enable_if<std::is_signed<T>::value>::type>
+ : WidthToSVecWrapperType<sizeof(T)>
+{
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct IntegralToSIMD;
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
+{
+  using type = vi128d_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = vi128f_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 1>::type>
+{
+  using type = vi1_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 2>::type>
+{
+  using type = vi2_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 4>::type>
+{
+  using type = vi4_wr;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(T) == 8>::type>
+{
+  using type = vi8_wr;
+};
+
+template <typename T, ENUM_KIND KIND, typename ENABLE = void>
+struct StorageToFiltering;
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
+{
+  using type = float;
+};
+
+template <typename T, ENUM_KIND KIND>
+struct StorageToFiltering<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
+{
+  using type = T;
+};
+
+// These are the x86 instructions that have to be emulated with ARM NEON
+// instructions.  The _mm_movemask_epi8 implementations differ per element
+// width for performance: when every W-byte lane of the input is homogeneous
+// (all 0x00 or all 0xFF, as the NEON compare intrinsics produce), it is
+// enough to sample one byte per group instead of packing all 16 sign bits.
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_64(ArmNeonSSEVecType input)
+{
+  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_32(ArmNeonSSEVecType input)
+{
+  input = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(input), 4));
+  return static_cast<MT>(vgetq_lane_u8(input, 3) | ((int)vgetq_lane_u8(input, 11) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_16(ArmNeonSSEVecType input)
+{
+  input = vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(input), 14));
+  input = vreinterpretq_u8_u32(vsraq_n_u32(vreinterpretq_u32_u8(input), vreinterpretq_u32_u8(input), 14));
+  input = vreinterpretq_u8_u64(vsraq_n_u64(vreinterpretq_u64_u8(input), vreinterpretq_u64_u8(input), 28));
+  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
+}
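+
+// For reference, the generic semantics that all of the variants above reduce
+// to (a scalar sketch added for illustration; the general helper below does
+// the same with shifts): bit i of the result is the most significant bit of
+// byte i of the register, exactly like SSE2 _mm_movemask_epi8.
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_scalar(ArmNeonSSEVecType input)
+{
+  uint8_t bytes[16];
+  vst1q_u8(bytes, input);  // spill the vector to memory
+  MT mask = 0;
+  for (int i = 0; i < 16; i++)
+    mask |= static_cast<MT>((bytes[i] >> 7) << i);  // collect each sign bit
+  return mask;
+}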
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8(ArmNeonSSEVecType input)
+{
+  // Example input (half scale):
+  // 0x89 FF 1D C0 00 10 99 33
+
+  // Shift out everything but the sign bits
+  // 0x01 01 00 01 00 00 01 00
+  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+  // Merge the even lanes together with vsra. The '??' bytes are garbage.
+  // vsri could also be used, but it is slightly slower on aarch64.
+  // 0x??03 ??02 ??00 ??01
+  uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+  // Repeat with wider lanes.
+  // 0x??????0B ??????04
+  uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+  // 0x??????????????4B
+  uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+  // Extract the low 8 bits from each half and join.
+  // 0x4B
+  return static_cast<MT>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_pd(ArmNeonSSEVecType a)
+{
+  uint64x2_t input = vreinterpretq_u64_u8(a);
+  uint64x2_t high_bits = vshrq_n_u64(input, 63);
+  return static_cast<MT>(vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1));
+}
+
+MCS_FORCE_INLINE MT arm_neon_mm_movemask_ps(ArmNeonSSEVecType a)
+{
+  uint32x4_t input = vreinterpretq_u32_u8(a);
+  static const int32x4_t shift = {0, 1, 2, 3};
+  uint32x4_t tmp = vshrq_n_u32(input, 31);
+  return static_cast<MT>(vaddvq_u32(vshlq_u32(tmp, shift)));
+}
+
+MCS_FORCE_INLINE void arm_neon_mm_maskmoveu_si128(ArmNeonSSEVecType a, ArmNeonSSEVecType mask, char* mem_addr)
+{
+  int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_u8(mask), 7);
+  float32x4_t b = vld1q_f32((float*)mem_addr);
+  int8x16_t masked =
+      vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_u8(a), vreinterpretq_s8_f32(b));
+  vst1q_s8((int8_t*)mem_addr, masked);
+}
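+
+// arm_neon_mm_maskmoveu_si128 mirrors SSE2 _mm_maskmoveu_si128: bytes of `a`
+// whose mask byte has its MSB set are written to mem_addr, while the other
+// bytes keep their previous memory contents (emulated here as a
+// load-blend-store rather than a non-temporal masked store).  A usage sketch
+// with hypothetical values:
+//
+//   char buf[16] = {0};
+//   uint8x16_t data = vdupq_n_u8(0x55);
+//   uint8x16_t mask = vdupq_n_u8(0x80);            // MSB set in every byte
+//   arm_neon_mm_maskmoveu_si128(data, mask, buf);  // buf becomes all 0x55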
+
+template <typename VT, typename T, typename ENABLE = void>
+class SimdFilterProcessor;
+
+// Dummy class that captures all impossible cases, e.g. a float-typed vector
+// as VT with a non-float CHECK_T; we use int32_t to do the operations.
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<(std::is_same<VT, vi128f_wr>::value && !std::is_same<CHECK_T, float>::value &&
+                             !std::is_same<CHECK_T, double>::value)>::type>
+{
+  // This is a dummy class that is not currently used.
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = int32_t;
+  using SimdWrapperType = vi4_wr;
+  using SimdType = int32x4_t;
+  using FilterType = T;
+  using StorageType = T;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s32(reinterpret_cast<const int32_t*>(from));
+  }
+
+  MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpDummy(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
+  }
+};
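+
+// Dispatch works purely on the VT/CHECK_T pair: each specialization below
+// enables itself for one wrapper/type combination.  For example (an
+// illustration of the resolution, not extra functionality),
+//
+//   SimdFilterProcessor<TypeToVecWrapperType<int32_t>::WrapperType, int32_t>
+//
+// resolves vi4_wr + int32_t to the 32-bit signed specialization further down.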
+
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T,
+    typename std::enable_if<std::is_same<VT, vi128d_wr>::value && std::is_same<T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using FilterType = T;
+  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using SimdWrapperType = simd::vi128d_wr;
+  using SimdType = simd::vi128d_t;
+  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_f64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_f64(reinterpret_cast<const double*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int64-based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_f64(0);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_f64(reinterpret_cast<double*>(dst), x);
+  }
+};
+
+template <typename VT, typename T>
+class SimdFilterProcessor<
+    VT, T, typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using FilterType = T;
+  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using SimdWrapperType = vi128f_wr;
+  using SimdType = vi128f_t;
+  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
+  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    StorageVecProcType nullEmptyProcessor;
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return (SimdType)nullEmptyProcessor.loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_f32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_f32(reinterpret_cast<const float*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y)));
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    StorageVecProcType nullEmptyProcessor;
+
+    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
+    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
+    // This spec borrows the expression from the u-/int32-based processor class.
+    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_f32(0);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_f32(reinterpret_cast<float*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi8_wr>::value && std::is_same<CHECK_T, int64_t>::value &&
+                            !std::is_same<CHECK_T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s64(reinterpret_cast<const int64_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s64(0);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s64(reinterpret_cast<int64_t*>(dst), x);
+  }
+};
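+
+// Note on cmpNe above: AArch64 NEON has no 64-bit vector NOT of a compare
+// result (there is no vmvnq_u64 intrinsic), so inverting the already
+// collapsed 16-bit mask with `^ 0xFFFF` is the cheaper option.  For example
+// (hypothetical lanes), if lane 0 compares equal and lane 1 does not, cmpEq
+// yields 0x00FF and cmpNe yields 0xFF00.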
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu8_wr>::value && std::is_same<CHECK_T, uint64_t>::value &&
+                            !std::is_same<CHECK_T, double>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint64_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u64(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u64(reinterpret_cast<const uint64_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpGt(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u64(0);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u64(reinterpret_cast<uint64_t*>(dst), x);
+  }
+};
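+
+// Unlike SSE2, which only provides signed integer compares (unsigned
+// comparisons there need a sign-bias or min/max trick), NEON offers native
+// unsigned compares such as vcgtq_u64/vcgeq_u64, so the unsigned processors
+// map one-to-one onto intrinsics; cmpLt simply reuses cmpGt with swapped
+// operands.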
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi4_wr>::value && std::is_same<CHECK_T, int32_t>::value &&
+                            !std::is_same<CHECK_T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s32(reinterpret_cast<const int32_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu4_wr>::value && std::is_same<CHECK_T, uint32_t>::value &&
+                            !std::is_same<CHECK_T, float>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint32_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u32(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u32(reinterpret_cast<const uint32_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return cmpGt(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u32(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u32(reinterpret_cast<uint32_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi2_wr>::value && std::is_same<CHECK_T, int16_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s16(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s16(reinterpret_cast<const int16_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_s16(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s16(reinterpret_cast<int16_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu2_wr>::value && std::is_same<CHECK_T, uint16_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint16_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u16(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u16(reinterpret_cast<const uint16_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return cmpGe(y, x);
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_u16(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u16(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u16(reinterpret_cast<uint16_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, vi1_wr>::value && std::is_same<CHECK_T, int8_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
+  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_s8(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_s8(reinterpret_cast<const int8_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType
setToZero()
+  {
+    return vdupq_n_s8(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_s8(reinterpret_cast<int8_t*>(dst), x);
+  }
+};
+
+template <typename VT, typename CHECK_T>
+class SimdFilterProcessor<
+    VT, CHECK_T,
+    typename std::enable_if<std::is_same<VT, viu1_wr>::value && std::is_same<CHECK_T, uint8_t>::value>::type>
+{
+ public:
+  constexpr static const uint16_t vecByteSize = 16U;
+  constexpr static const uint16_t vecBitSize = 128U;
+  using T = uint8_t;
+  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
+  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
+  using FilterType = T;
+  using StorageType = T;
+
+  constexpr static const uint16_t FilterMaskStep = sizeof(T);
+  // Load value
+  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
+  {
+    return loadValue(fill);
+  }
+
+  MCS_FORCE_INLINE SimdType loadValue(const T fill)
+  {
+    return vdupq_n_u8(fill);
+  }
+
+  // Load from
+  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
+  {
+    return vld1q_u8(reinterpret_cast<const uint8_t*>(from));
+  }
+
+  // Compare
+  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y));
+  }
+
+  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  {
+    return 0;
+  }
+
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  {
+    return 0xFFFF;
+  }
+
+  // misc
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  {
+    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  {
+    return cmpNe(x, y);
+  }
+
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  {
+    return cmpEq(x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType setToZero()
+  {
+    return vdupq_n_u8(0);
+  }
+
+  // store
+  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  {
+    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
+  }
+
+  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  {
+    vst1q_u8(reinterpret_cast<uint8_t*>(dst), x);
+  }
+};
+
+};  // namespace simd
+
+#endif
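
For reference, a minimal caller-side sketch of the new header (a hypothetical
snippet mirroring how the tests instantiate the processors; it makes no claim
beyond what the patch defines):

    #include "simd_arm.h"

    int32_t column[4]{1, 2, 3, 4};
    using Proc = simd::SimdFilterProcessor<
        simd::TypeToVecWrapperType<int32_t>::WrapperType, int32_t>;
    Proc proc;
    Proc::SimdType values = proc.loadFrom(reinterpret_cast<char*>(column));
    Proc::SimdType pivot = proc.loadValue(2);
    simd::MT gtMask = proc.cmpGt(values, pivot);  // 0xFF00: lanes 2 and 3 match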