/* Copyright (C) 2021-2022 MariaDB Corporation.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; version 2 of
   the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   MA 02110-1301, USA. */

#pragma once

#ifdef __aarch64__
#include "arm_neon.h"
#include <cstdint>
#include <type_traits>

#ifdef __OPTIMIZE__
#define MCS_FORCE_INLINE __attribute__((__always_inline__)) inline
#else
#define MCS_FORCE_INLINE inline
#endif

#include "mcs_datatype.h"

// Column filtering is dispatched 4-way based on the column type,
// which defines the implementation of comparison operations for the column values.
enum ENUM_KIND
{
  KIND_DEFAULT,   // compared as signed integers
  KIND_UNSIGNED,  // compared as unsigned integers
  KIND_FLOAT,     // compared as floating-point numbers
  KIND_TEXT       // whitespace-trimmed and then compared as signed integers
};

namespace simd
{
// The vector type is decided by the width and signedness of the basic type.
using vi1_t = int8x16_t;
using vi2_t = int16x8_t;
using vi4_t = int32x4_t;
using vi8_t = int64x2_t;
using vi1u_t = uint8x16_t;
using vi2u_t = uint16x8_t;
using vi4u_t = uint32x4_t;
using vi8u_t = uint64x2_t;
using vi128f_t = float32x4_t;
using vi128d_t = float64x2_t;
using int128_t = __int128;
using MT = uint16_t;
// The type taken and returned by the arm_neon_mm_* emulation functions below.
using ArmNeonSSEVecType = uint8x16_t;

// Wrapper types
struct vi1_wr
{
  int8x16_t v;
};
struct vi2_wr
{
  int16x8_t v;
};
struct vi4_wr
{
  int32x4_t v;
};
struct vi8_wr
{
  int64x2_t v;
};
struct viu1_wr
{
  uint8x16_t v;
};
struct viu2_wr
{
  uint16x8_t v;
};
struct viu4_wr
{
  uint32x4_t v;
};
struct viu8_wr
{
  uint64x2_t v;
};
struct vi128f_wr
{
  float32x4_t v;
};
struct vi128d_wr
{
  float64x2_t v;
};

template <int W>
struct WidthToSVecWrapperType;

template <>
struct WidthToSVecWrapperType<1>
{
  using Vectype = int8x16_t;
  using WrapperType = struct vi1_wr;
};
template <>
struct WidthToSVecWrapperType<2>
{
  using Vectype = int16x8_t;
  using WrapperType = struct vi2_wr;
};
template <>
struct WidthToSVecWrapperType<4>
{
  using Vectype = int32x4_t;
  using WrapperType = struct vi4_wr;
};
template <>
struct WidthToSVecWrapperType<8>
{
  using Vectype = int64x2_t;
  using WrapperType = struct vi8_wr;
};

template <int W>
struct WidthToVecWrapperType;

template <>
struct WidthToVecWrapperType<1>
{
  using Vectype = uint8x16_t;
  using WrapperType = struct viu1_wr;
};
template <>
struct WidthToVecWrapperType<2>
{
  using Vectype = uint16x8_t;
  using WrapperType = struct viu2_wr;
};
template <>
struct WidthToVecWrapperType<4>
{
  using Vectype = uint32x4_t;
  using WrapperType = struct viu4_wr;
};
template <>
struct WidthToVecWrapperType<8>
{
  using Vectype = uint64x2_t;
  using WrapperType = struct viu8_wr;
};
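// A minimal sanity sketch (illustrative only, not part of the original header):
// the width-to-vector traits map an element width in bytes to the 128-bit NEON
// type that holds 16/width lanes of that element.
//
//   static_assert(std::is_same<WidthToSVecWrapperType<4>::Vectype, int32x4_t>::value, "");
//   static_assert(std::is_same<WidthToVecWrapperType<2>::Vectype, uint16x8_t>::value, "");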
// We obtain the SIMD and wrapper types of a basic type via TypeToVecWrapperType.
template <typename T, typename ENABLE = void>
struct TypeToVecWrapperType;

template <typename T>
struct TypeToVecWrapperType<T, typename std::enable_if<std::is_unsigned<T>::value>::type>
 : WidthToVecWrapperType<sizeof(T)>
{
};

template <typename T>
struct TypeToVecWrapperType<T, typename std::enable_if<std::is_signed<T>::value>::type>
 : WidthToSVecWrapperType<sizeof(T)>
{
};

template <typename T, ENUM_KIND KIND, typename ENABLE = void>
struct IntegralToSIMD;

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
{
  using type = vi128d_wr;
};

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
{
  using type = vi128f_wr;
};

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(int8_t) == sizeof(T)>::type>
{
  using type = vi1_wr;
};

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(int16_t) == sizeof(T)>::type>
{
  using type = vi2_wr;
};

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(int32_t) == sizeof(T)>::type>
{
  using type = vi4_wr;
};

template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT && sizeof(int64_t) == sizeof(T)>::type>
{
  using type = vi8_wr;
};

template <typename T, ENUM_KIND KIND, typename ENABLE = void>
struct StorageToFiltering;

template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(double) == sizeof(T)>::type>
{
  using type = double;
};

template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<T, KIND, typename std::enable_if<KIND == KIND_FLOAT && sizeof(float) == sizeof(T)>::type>
{
  using type = float;
};

template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
{
  using type = T;
};

// These are the x86 instructions that have to be emulated with ARM NEON instructions.
// The implementations of mm_movemask_epi8 differ per element width for performance reasons.
MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_64(ArmNeonSSEVecType input)
{
  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
}

MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_32(ArmNeonSSEVecType input)
{
  input = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(input), 4));
  return static_cast<MT>(vgetq_lane_u8(input, 3) | ((int)vgetq_lane_u8(input, 11) << 8));
}

MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_16(ArmNeonSSEVecType input)
{
  input = vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(input), 14));
  input = vreinterpretq_u8_u32(vsraq_n_u32(vreinterpretq_u32_u8(input), vreinterpretq_u32_u8(input), 14));
  input = vreinterpretq_u8_u64(vsraq_n_u64(vreinterpretq_u64_u8(input), vreinterpretq_u64_u8(input), 28));
  return static_cast<MT>(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8));
}

MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8(ArmNeonSSEVecType input)
{
  // Example input (half scale):
  // 0x89 FF 1D C0 00 10 99 33
  // Shift out everything but the sign bits
  // 0x01 01 00 01 00 00 01 00
  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
  // Merge the even lanes together with vsra. The '??' bytes are garbage.
  // vsri could also be used, but it is slightly slower on aarch64.
  // 0x??03 ??02 ??00 ??01
  uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
  // Repeat with wider lanes.
  // 0x??????0B ??????04
  uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
  // 0x??????????????4B
  uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
  // Extract the low 8 bits from each lane and join.
  // 0x4B
  return static_cast<MT>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
}
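// Usage sketch (illustrative only, not part of the original header): a NEON
// compare yields all-ones or all-zero lanes, and the helpers above fold those
// lanes into a 16-bit mask laid out like the result of SSE's _mm_movemask_epi8
// on a compare of the same element width. For 32-bit lanes:
//
//   int32x4_t a = {1, 2, 3, 4};
//   int32x4_t b = {1, 0, 3, 0};
//   uint32x4_t eq = vceqq_s32(a, b);  // {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}
//   MT mask = arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)eq);
//   // mask == 0x0F0F: four mask bits per 32-bit lane; lanes 0 and 2 matched.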
MCS_FORCE_INLINE MT arm_neon_mm_movemask_pd(ArmNeonSSEVecType a)
{
  uint64x2_t input = vreinterpretq_u64_u8(a);
  uint64x2_t high_bits = vshrq_n_u64(input, 63);
  return static_cast<MT>(vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1));
}

MCS_FORCE_INLINE MT arm_neon_mm_movemask_ps(ArmNeonSSEVecType a)
{
  uint32x4_t input = vreinterpretq_u32_u8(a);
  static const int32x4_t shift = {0, 1, 2, 3};
  uint32x4_t tmp = vshrq_n_u32(input, 31);
  return static_cast<MT>(vaddvq_u32(vshlq_u32(tmp, shift)));
}

MCS_FORCE_INLINE void arm_neon_mm_maskmoveu_si128(ArmNeonSSEVecType a, ArmNeonSSEVecType mask, char* mem_addr)
{
  int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_u8(mask), 7);
  float32x4_t b = vld1q_f32((float*)mem_addr);
  int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_u8(a), vreinterpretq_s8_f32(b));
  vst1q_s8((int8_t*)mem_addr, masked);
}
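// Usage sketch (illustrative only, not part of the original header): the
// maskmoveu emulation writes byte i of `a` to memory only where the sign bit of
// mask byte i is set, preserving the other destination bytes, like SSE2's
// _mm_maskmoveu_si128. The buffer name `dst` is hypothetical.
//
//   alignas(16) char dst[16] = {0};
//   uint8x16_t src = vdupq_n_u8(0xAA);
//   uint8x16_t mask = vdupq_n_u8(0x80);  // every sign bit set: full 16-byte store
//   arm_neon_mm_maskmoveu_si128(src, mask, dst);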
template <typename VT, typename T, typename ENABLE = void>
class SimdFilterProcessor;

// Dummy class that captures all impossible cases, e.g. a float vector as VT with a
// non-float CHECK_T. It uses int32_t to implement its operations.
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<(std::is_same<VT, vi128f_wr>::value && !std::is_same<CHECK_T, float>::value &&
                             !std::is_same<CHECK_T, double>::value)>::type>
{
  // This is a dummy class that is not currently used.
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = int32_t;
  using SimdWrapperType = vi4_wr;
  using SimdType = int32x4_t;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_s32(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s32(reinterpret_cast<const int32_t*>(from)); }
  MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y) { return 0xFFFF; }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpDummy(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_s32(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_s32(reinterpret_cast<int32_t*>(dst), x); }
};

template <typename VT, typename T>
class SimdFilterProcessor<
    VT, T, typename std::enable_if<std::is_same<VT, vi128d_wr>::value && std::is_same<T, double>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using FilterType = T;
  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
  using SimdWrapperType = simd::vi128d_wr;
  using SimdType = simd::vi128d_t;
  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
  {
    StorageVecProcType nullEmptyProcessor;
    // This spec borrows the expr from the u-/int64-based processor class.
    return (SimdType)nullEmptyProcessor.loadValue(fill);
  }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_f64(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_f64(reinterpret_cast<const double*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return cmpEq(x, y) ^ 0xFFFF; }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
    // This spec borrows the expr from the u-/int64-based processor class.
    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
    // This spec borrows the expr from the u-/int64-based processor class.
    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
  }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_f64(0); }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_f64(reinterpret_cast<double*>(dst), x); }
};
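// Note on the nullEmpty* methods above (explanatory, based on this file's own
// comments): the double processor reinterprets its operands as 64-bit integer
// vectors and delegates to the int64-based processor, so NULL/EMPTY marker
// values are compared bit-exactly. Presumably a plain vceqq_f64 would be
// unreliable here, e.g. a NaN-patterned marker never compares equal to itself
// as a double.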
template <typename VT, typename T>
class SimdFilterProcessor<
    VT, T, typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using FilterType = T;
  using NullEmptySimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
  using SimdWrapperType = vi128f_wr;
  using SimdType = vi128f_t;
  using StorageSimdType = typename WidthToSVecWrapperType<sizeof(T)>::Vectype;
  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
  using StorageWrapperTypeType = typename WidthToSVecWrapperType<sizeof(T)>::WrapperType;
  using StorageVecProcType = SimdFilterProcessor<StorageWrapperTypeType, StorageType>;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
  {
    StorageVecProcType nullEmptyProcessor;
    // This spec borrows the expr from the u-/int64-based processor class.
    return (SimdType)nullEmptyProcessor.loadValue(fill);
  }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_f32(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_f32(reinterpret_cast<const float*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y)));
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
    // This spec borrows the expr from the u-/int64-based processor class.
    return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
    NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
    // This spec borrows the expr from the u-/int64-based processor class.
    return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
  }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_f32(0); }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_f32(reinterpret_cast<float*>(dst), x); }
};
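// Typical call pattern (an illustrative sketch, not part of the original header;
// `srcPtr` and the loop body are hypothetical). A scan loads one vector of column
// values, compares it against a filter constant, and walks the resulting bitmask,
// FilterMaskStep bits per value:
//
//   using Proc = SimdFilterProcessor<vi4_wr, int32_t>;
//   Proc proc;
//   Proc::SimdType filterVal = proc.loadValue(42);
//   Proc::SimdType values = proc.loadFrom(srcPtr);  // 4 x int32_t
//   MT mask = proc.cmpEq(values, filterVal);
//   for (uint32_t i = 0; i < Proc::vecByteSize / sizeof(int32_t); ++i, mask >>= Proc::FilterMaskStep)
//     if (mask & 1) { /* value i matched the filter */ }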
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, vi8_wr>::value && std::is_same<CHECK_T, int64_t>::value &&
                            !std::is_same<CHECK_T, double>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::WrapperType;
  using SimdType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_s64(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s64(reinterpret_cast<const int64_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_s64(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y));
  }
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return cmpEq(x, y) ^ 0xFFFF; }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_s64(0); }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_s64(reinterpret_cast<int64_t*>(dst), x); }
};

template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, viu8_wr>::value && std::is_same<CHECK_T, uint64_t>::value &&
                            !std::is_same<CHECK_T, double>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = uint64_t;
  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_u64(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_u64(reinterpret_cast<const uint64_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y));
  }
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return cmpGt(y, x); }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u64(0); }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_u64(reinterpret_cast<uint64_t*>(dst), x); }
};
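// Note (explanatory): the unsigned processors sometimes derive one comparison
// from its mirror via the identity x < y <=> y > x (see cmpLt above), which
// reuses an existing lane-wise compare instead of adding a separate vclt-based
// path; the resulting bitmask is identical either way.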
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, vi4_wr>::value && std::is_same<CHECK_T, int32_t>::value &&
                            !std::is_same<CHECK_T, float>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::WrapperType;
  using SimdType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_s32(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s32(reinterpret_cast<const int32_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_s32(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_s32(reinterpret_cast<int32_t*>(dst), x); }
};
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, viu4_wr>::value && std::is_same<CHECK_T, uint32_t>::value &&
                            !std::is_same<CHECK_T, float>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = uint32_t;
  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_u32(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_u32(reinterpret_cast<const uint32_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return cmpGt(y, x); }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u32(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_u32(reinterpret_cast<uint32_t*>(dst), x); }
};
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, vi2_wr>::value && std::is_same<CHECK_T, int16_t>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::WrapperType;
  using SimdType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_s16(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s16(reinterpret_cast<const int16_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return cmpEq(x, y) ^ 0xFFFF; }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_s16(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_s16(reinterpret_cast<int16_t*>(dst), x); }
};
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, viu2_wr>::value && std::is_same<CHECK_T, uint16_t>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = uint16_t;
  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_u16(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_u16(reinterpret_cast<const uint16_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_u16(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_u16(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpGe(y, x); }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_u16(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u16(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_u16(reinterpret_cast<uint16_t*>(dst), x); }
};

template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, vi1_wr>::value && std::is_same<CHECK_T, int8_t>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
  using SimdWrapperType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::WrapperType;
  using SimdType = typename WidthToSVecWrapperType<sizeof(CHECK_T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_s8(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s8(reinterpret_cast<const int8_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_s8(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_s8(reinterpret_cast<int8_t*>(dst), x); }
};
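// Note (explanatory): for 8-bit elements the mask is exactly one bit per lane,
// i.e. arm_neon_mm_movemask_epi8 reproduces the SSE2 _mm_movemask_epi8 layout
// bit for bit, and FilterMaskStep is 1, so a scan advances the mask one bit
// per value.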
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<
    VT, CHECK_T,
    typename std::enable_if<std::is_same<VT, viu1_wr>::value && std::is_same<CHECK_T, uint8_t>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using T = uint8_t;
  using SimdWrapperType = typename WidthToVecWrapperType<sizeof(T)>::WrapperType;
  using SimdType = typename WidthToVecWrapperType<sizeof(T)>::Vectype;
  using FilterType = T;
  using StorageType = T;
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Load value
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { return loadValue(fill); }
  MCS_FORCE_INLINE SimdType loadValue(const T fill) { return vdupq_n_u8(fill); }
  // Load from
  MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_u8(reinterpret_cast<const uint8_t*>(from)); }
  // Compare
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y));
  }
  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y));
  }
  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y));
  }
  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y));
  }
  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y));
  }
  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF;
  }
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; }
  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) { return 0xFFFF; }
  // misc
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
  }
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) { return cmpNe(x, y); }
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) { return cmpEq(x, y); }
  MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u8(0); }
  // store
  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
  {
    arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
  }
  MCS_FORCE_INLINE void store(char* dst, SimdType& x) { vst1q_u8(reinterpret_cast<uint8_t*>(dst), x); }
};
}  // namespace simd

#endif  // __aarch64__