diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp
index a3ac1e4e5..40c610edf 100644
--- a/primitives/linux-port/column.cpp
+++ b/primitives/linux-port/column.cpp
@@ -41,6 +41,7 @@ using namespace boost;
 #include "dataconvert.h"
 #include "mcs_decimal.h"
 #include "simd_sse.h"
+#include "simd_arm.h"
 #include "utils/common/columnwidth.h"
 #include "exceptclasses.h"
@@ -118,24 +119,6 @@ inline int compareBlock(const void* a, const void* b)
   return ((*(T*)a) - (*(T*)b));
 }
-template <class To, class From>
-std::enable_if_t<
-    sizeof(To) == sizeof(From) &&
-    std::is_trivially_copyable_v<From> &&
-    std::is_trivially_copyable_v<To>,
-    To>
-// constexpr support needs compiler magic
-bitCast(const From& src) noexcept
-{
-  static_assert(std::is_trivially_constructible_v<To>,
-                "This implementation additionally requires "
-                "destination type to be trivially constructible");
-
-  To dst;
-  std::memcpy(&dst, &src, sizeof(To));
-  return dst;
-}
-
 // this function is out-of-band, we don't need to inline it
 void logIt(int mid, int arg1, const string& arg2 = string())
 {
@@ -942,7 +925,7 @@ inline void writeColValue(uint8_t OutputType, ColResultHeader* out, uint16_t rid
   }
 }
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
 template ::type* = nullptr>
 inline void vectUpdateMinMax(const bool validMinMax, const bool isNonNullOrEmpty, T& Min, T& Max, T curValue,
@@ -1239,7 +1222,7 @@ void scalarFiltering(
   }
 }
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
 template ::type* = nullptr>
 inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray,
@@ -1295,19 +1278,13 @@ inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType
 template
-void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor,
+void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdProcessor,
                             SimdType& dataVec, SimdType& simdMin, SimdType& simdMax)
 {
-  if (validMinMax)
+  if (validMinMax && nonNullOrEmptyMask)
   {
-    simdMin = simdProcessor.blend(
-        simdMin, dataVec,
-        simdProcessor.bwAnd(simdProcessor.cmpGt2(simdMin, dataVec),
-                            bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask))));
-    simdMax = simdProcessor.blend(
-        simdMax, dataVec,
-        simdProcessor.bwAnd(simdProcessor.cmpGt2(dataVec, simdMax),
-                            bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask))));
+    simdMin = simdProcessor.min(simdMin, dataVec);
+    simdMax = simdProcessor.max(simdMax, dataVec);
   }
 }
@@ -1328,7 +1305,7 @@ void scalarUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT&
 }
 template
-void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min, T& max)
+void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& min, T& max)
 {
   constexpr const uint16_t size = VT::vecByteSize / sizeof(T);
   T* simdMinVec = reinterpret_cast<T*>(&simdMin);
@@ -1336,13 +1313,6 @@ void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min
   max = *std::max_element(simdMaxVec, simdMaxVec + size);
   min = *std::min_element(simdMinVec, simdMinVec + size);
 }
-
-template
-void getInitialSimdMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T min, T max)
-{
-  simdMin = simdProcessor.loadValue(min);
-  simdMax = simdProcessor.loadValue(max);
-}
 // This routine filters input block in a vectorized manner.
 // It supports all output types, all input types.
 // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand.
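
[Editor's note] The rewritten vectorizedUpdateMinMax() above drops the masked blend/cmpGt2/bwAnd sequence in favour of plain vector min/max, and (in the next hunk) seeds both accumulators from the first loaded vector instead of from scalar min/max via the removed getInitialSimdMinMax(). What it implements is the standard running vector-min/max pattern with a horizontal reduction at the end. A minimal self-contained sketch of that pattern on NEON — the helper name blockMinMaxI32 is hypothetical, lanes are int32, and, unlike the patched code, it assumes no NULL/EMPTY lanes need masking out:

#include <arm_neon.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Running per-lane min/max over a block of int32 values, reduced to scalars.
// Assumes n >= 4; like the patched code, the accumulators are seeded from
// real input data rather than from type-extreme constants.
static void blockMinMaxI32(const int32_t* src, size_t n, int32_t& outMin, int32_t& outMax)
{
  int32x4_t vmin = vld1q_s32(src);  // seed from the first vector
  int32x4_t vmax = vmin;
  size_t i = 4;
  for (; i + 4 <= n; i += 4)
  {
    int32x4_t v = vld1q_s32(src + i);
    vmin = vminq_s32(vmin, v);  // per-lane minima
    vmax = vmaxq_s32(vmax, v);  // per-lane maxima
  }
  outMin = vminvq_s32(vmin);  // horizontal reductions (AArch64-only intrinsics)
  outMax = vmaxvq_s32(vmax);
  for (; i < n; ++i)  // scalar tail
  {
    outMin = std::min(outMin, src[i]);
    outMax = std::max(outMax, src[i]);
  }
}

Seeding from loaded data rather than from type extremes is what lets the per-lane mask go away, at the cost of requiring at least one full vector of input.
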
@@ -1478,12 +1448,9 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T*
       }
     }
   }
-  [[maybe_unused]] SimdType simdMin;
-  [[maybe_unused]] SimdType simdMax;
-  if constexpr (KIND != KIND_TEXT)
-  {
-    getInitialSimdMinMax(simdProcessor, simdMin, simdMax, min, max);
-  }
+  [[maybe_unused]] SimdType simdMin = simdDataLoad(simdProcessor, srcArray,
+                                                   origSrcArray, ridArray, 0).v;
+  [[maybe_unused]] SimdType simdMax = simdMin;
   // main loop
   // writeMask tells which values must get into the result. Includes values that matches filters. Can have
   // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. nonNullMask tells which vector coords
@@ -1704,7 +1671,7 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r
   // Syscat queries mustn't follow vectorized processing path b/c PP must return
   // all values w/o any filter(even empty values filter) applied.
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(__aarch64__)
   // Don't use vectorized filtering for text based data types.
   if (WIDTH < 16 &&
       (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid())))
diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp
index 0c8f145d7..85bcc14f9 100644
--- a/tests/simd_processors.cpp
+++ b/tests/simd_processors.cpp
@@ -22,37 +22,33 @@
 #include <gtest/gtest.h>
 #include "datatypes/mcs_datatype.h"
 #include "datatypes/mcs_int128.h"
-
+#include "simd_sse.h"
+#include "simd_arm.h"
 #if defined(__x86_64__)
-  #include "simd_sse.h"
 #define TESTS_USING_SSE 1
 using float64_t = double;
 using float32_t = float;
 #endif
 #ifdef __aarch64__
-  #include "simd_arm.h"
 #define TESTS_USING_ARM 1
 #endif
 using namespace std;
 
 template <typename T>
-class SimdProcessorTypedTest : public testing::Test
-{
- public:
+class SimdProcessorTypedTest : public testing::Test {
+public:
   using IntegralType = T;
-#if TESTS_USING_SSE
-  using SimdType =
-      std::conditional_t<std::is_same<T, float>::value, simd::vi128f_wr,
-                         std::conditional_t<std::is_same<T, double>::value, simd::vi128d_wr, simd::vi128_wr>>;
-  using Proc = typename simd::SimdFilterProcessor<SimdType, T>;
-#else
-  using SimdType =
-      std::conditional_t<std::is_same<T, float>::value, simd::vi128f_wr,
-                         std::conditional_t<std::is_same<T, double>::value, simd::vi128d_wr,
-                                            typename simd::TypeToVecWrapperType<T>::WrapperType>>;
-  using Proc = typename simd::SimdFilterProcessor<SimdType, T>;
-#endif
+  #if TESTS_USING_SSE
+  using SimdType = std::conditional_t<std::is_same<T, float>::value,
+                                      simd::vi128f_wr,
+                                      std::conditional_t<std::is_same<T, double>::value,
+                                                         simd::vi128d_wr,
+                                                         simd::vi128_wr>>;
+  using Proc = typename simd::SimdFilterProcessor<SimdType, T>;
+  #else
+  using Proc = typename simd::SimdFilterProcessor<typename simd::TypeToVecWrapperType<T>::WrapperType, T>;
+  #endif
   void SetUp() override
   {
   }
diff --git a/utils/common/simd_arm.h b/utils/common/simd_arm.h
index 304849125..f157b4475 100644
--- a/utils/common/simd_arm.h
+++ b/utils/common/simd_arm.h
@@ -17,7 +17,6 @@
 #pragma once
-
 #ifdef __aarch64__
 #include "arm_neon.h"
 #include
@@ -30,15 +29,6 @@
 #include "mcs_datatype.h"
-// Column filtering is dispatched 4-way based on the column type,
-// which defines implementation of comparison operations for the column values
-enum ENUM_KIND
-{
-  KIND_DEFAULT,   // compared as signed integers
-  KIND_UNSIGNED,  // compared as unsigned integers
-  KIND_FLOAT,     // compared as floating-point numbers
-  KIND_TEXT
-};  // whitespace-trimmed and then compared as signed integers
 namespace simd
 {
@@ -55,6 +45,28 @@
 using vi128f_t = float32x4_t;
 using vi128d_t = float64x2_t;
 using int128_t = __int128;
 using MT = uint16_t;
+using MaskSimdType = vi1u_t;
+template <int64_t i0, int64_t i1>
+static vi8_t constant2i()
+{
+  static const union
+  {
+    int64_t i[2];
+    vi8_t xmm;
+  } u = {{i0, i1}};
+  return u.xmm;
+}
+static inline MaskSimdType bitMaskToByteMask16(MT m)
+{
+  vi8_t sel = constant2i<(int64_t)0xffffffffffffffff, (int64_t)0x0>();
+  vi8_t andop = constant2i<(int64_t)0x8040201008040201, (int64_t)0x8040201008040201>();
+  vi1u_t op = vreinterpretq_u8_s64(
+      vandq_s64(vbslq_s64(vreinterpretq_u64_s64(sel), vreinterpretq_s64_u8(vdupq_n_u8(m & 0xff)),
+                          vreinterpretq_s64_u8(vdupq_n_u8((m & 0xff00) >> 8))),
+                andop));
+  vi1u_t zero = vdupq_n_u8(0);
+  return vcgtq_u8(op, zero);
+}
 // the type is used by the fun like arm__neon__mm__...
 using ArmNeonSSEVecType = uint8x16_t;
 // wrapper types
@@ -64,7 +76,7 @@ struct vi1_wr
 {
   int8x16_t v;
 };
 struct vi2_wr
 {
-  int8x16_t v;
+  int16x8_t v;
 };
 struct vi4_wr
 {
@@ -74,13 +86,17 @@ struct vi8_wr
 {
   int64x2_t v;
 };
+struct vi16_wr
+{
+  int128_t v;
+};
 struct viu1_wr
 {
   uint8x16_t v;
 };
 struct viu2_wr
 {
-  uint8x16_t v;
+  uint16x8_t v;
 };
 struct viu4_wr
 {
@@ -90,6 +106,7 @@ struct viu8_wr
 {
   uint64x2_t v;
 };
+
 struct vi128f_wr
 {
   float32x4_t v;
@@ -129,7 +146,12 @@ struct WidthToSVecWrapperType<8>
 {
   using Vectype = int64x2_t;
   using WrapperType = struct vi8_wr;
 };
-
+template <>
+struct WidthToSVecWrapperType<16>
+{
+  using Vectype = int128_t;
+  using WrapperType = struct vi16_wr;
+};
 template
 struct WidthToVecWrapperType;
@@ -160,18 +182,37 @@ struct WidthToVecWrapperType<8>
 {
   using Vectype = uint64x2_t;
   using WrapperType = struct viu8_wr;
 };
+
 // We get the simd and wrapper type of basic type by TypeToVecWrapperType.
 template
 struct TypeToVecWrapperType;
 template
-struct TypeToVecWrapperType::value>::type>
+struct TypeToVecWrapperType::value>::type>
+    : WidthToSVecWrapperType
+{
+};
+template
+struct TypeToVecWrapperType>::type>
+{
+  using Vectype = vi128f_t;
+  using WrapperType = vi128f_wr;
+};
+template
+struct TypeToVecWrapperType>::type>
+{
+  using Vectype = vi128d_t;
+  using WrapperType = vi128d_wr;
+};
+template
+struct TypeToVecWrapperType >::type>
+    : WidthToVecWrapperType
+{
+};
 template
-struct TypeToVecWrapperType::value>::type>
+struct TypeToVecWrapperType<
+    T, typename std::enable_if && !is_floating_point_v>::type>
     : WidthToSVecWrapperType
 {
 };
@@ -195,35 +236,21 @@ struct IntegralToSIMD
-struct IntegralToSIMD::type>
+struct IntegralToSIMD::type>
 {
-  using type = vi1_wr;
-};
-
-template
-struct IntegralToSIMD::type>
-{
-  using type = vi2_wr;
-};
-
-template
-struct IntegralToSIMD::type>
-{
-  using type = vi4_wr;
-};
-
-template
-struct IntegralToSIMD::type>
-{
-  using type = vi8_wr;
+  using type = TypeToVecWrapperType::WrapperType;
 };
 template
 struct StorageToFiltering;
+template
+struct StorageToFiltering::type>
+{
+  using type = double;
+};
 template
 struct StorageToFiltering::type>
@@ -336,70 +363,93 @@ class SimdFilterProcessor<
 {
   return vdupq_n_s32(fill);
 }
-
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_s32((uint32x4_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_s32(x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_s32(x, y);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_s32(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
     return vld1q_s32(reinterpret_cast<const int32_t*>(from));
   }
-  MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpDummy(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y)));
+  }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y)));
+  }
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpDummy(x, y);
   }
@@ -408,27 +458,17 @@ class SimdFilterProcessor<
 {
   return vdupq_n_s32(0);
 }
-  // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
   }
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y)));
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y)));
-  }
 };
 template
@@ -466,55 +506,74 @@ class SimdFilterProcessor<
 {
   return vld1q_f64(reinterpret_cast<const double*>(from));
 }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_f64((uint64x2_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return (SimdType)vandq_s64((StorageSimdType)x, (StorageSimdType)y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y));
   }
-
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_f64(x);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_f64(x);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_f64(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return cmpEq(x,y) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     StorageVecProcType nullEmptyProcessor;
     NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
@@ -523,7 +582,7 @@ class SimdFilterProcessor<
     return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     StorageVecProcType nullEmptyProcessor;
@@ -537,21 +596,19 @@ class SimdFilterProcessor<
 {
   return vdupq_n_f64(0);
 }
-
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
-  {
-    vst1q_f64(reinterpret_cast<double*>(dst), x);
-  }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
   {
     return vminq_f64(x, y);
   }
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
   {
     return vmaxq_f64(x, y);
   }
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
+  {
+    vst1q_f64(reinterpret_cast<double*>(dst), x);
+  }
 };
 template
@@ -577,12 +634,30 @@ class SimdFilterProcessor<
   // This spec borrows the expr from u-/int64 based processor class.
     return (SimdType)nullEmptyProcessor.loadValue(fill);
   }
-
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_f32((uint32x4_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_f32(x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return (SimdType)vandq_s32((StorageSimdType)x, (StorageSimdType)y);
+  }
   MCS_FORCE_INLINE SimdType loadValue(const T fill)
   {
     return vdupq_n_f32(fill);
   }
-
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_f32(x);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_f32(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
@@ -590,53 +665,53 @@
   }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
  {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y)));
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     StorageVecProcType nullEmptyProcessor;
@@ -646,7 +721,7 @@
     return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     StorageVecProcType nullEmptyProcessor;
@@ -661,17 +736,16 @@
     return vdupq_n_f32(0);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_f32(reinterpret_cast<float*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
   {
     return vminq_f32(x, y);
   }
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
   {
     return vmaxq_f32(x, y);
   }
@@ -709,50 +783,63 @@ class SimdFilterProcessor<
 {
   return vld1q_s64(reinterpret_cast<const int64_t*>(from));
 }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_s64((uint64x2_t)mask, x, y);
+  }
+
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_s64(x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_s64(x, y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType) vcgeq_s64(x,y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return cmpEq(x,y)^0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
   }
@@ -762,35 +849,32 @@
 {
   return vdupq_n_s64(0);
 }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
   {
-    vst1q_s64(reinterpret_cast<int64_t*>(dst), x);
+    return vbslq_s64(vcgtq_s64(y,x), x, y);
   }
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
   {
     return vbslq_s64(vcgtq_s64(x,y), x, y);
   }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
-    return vbslq_s64(vcgtq_s64(y,x), x, y);
+    vst1q_s64(reinterpret_cast<int64_t*>(dst), x);
   }
 };
 template
@@ -820,56 +904,74 @@ class SimdFilterProcessor<
 {
   return vdupq_n_u64(fill);
 }
-
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_u64((uint64x2_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_u64(x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_u64(x, y);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
     return vld1q_u64(reinterpret_cast<const uint64_t*>(from));
   }
-  // Compare
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y));
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vbslq_u64(vcgtq_u64(y,x), x, y);
+  }
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vbslq_u64(vcgtq_u64(x,y), x, y);
+  }
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return cmpGt(y, x);
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask);
   }
@@ -879,36 +981,26 @@
 {
   return vdupq_n_u64(0);
 }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_u64(reinterpret_cast<uint64_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vbslq_u64(vcgtq_u64(x,y), x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vbslq_u64(vcgtq_u64(y,x), x, y);
-  }
 };
 template
 class SimdFilterProcessor<
@@ -936,66 +1028,92 @@
 {
   return vdupq_n_s32(fill);
 }
-
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_s32(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
     return vld1q_s32(reinterpret_cast<const int32_t*>(from));
   }
-
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_s32((uint32x4_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_s32(x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_s32(x, y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType) vceqq_s32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y));
   }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_s32(x);
+  }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_s32(x, y);
+  }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_s32(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1006,25 +1124,15 @@
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_s32(reinterpret_cast<int32_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_s32(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_s32(x, y);
-  }
 };
 template
@@ -1053,66 +1161,91 @@ class SimdFilterProcessor<
 {
   return vdupq_n_u32(fill);
 }
-
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_u32(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
     return vld1q_u32(reinterpret_cast<const uint32_t*>(from));
   }
-
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_u32((uint32x4_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_u32(x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_u32(x, y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y));
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_u32(x, y);
+  }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_u32(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return cmpGt(y, x);
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_u32(x);
+  }
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1123,25 +1256,15 @@
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_u32(reinterpret_cast<uint32_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_u32(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_u32(x, y);
-  }
 };
 template
@@ -1169,7 +1292,10 @@ class SimdFilterProcessor<
 {
   return vdupq_n_s16(fill);
 }
-
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_s16(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
@@ -1177,58 +1303,82 @@
   }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y));
   }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_s16((uint16x8_t)mask, x, y);
+  }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_s16(x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_s16(x, y);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_s16(x);
+  }
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y));
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_s16(x, y);
+  }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_s16(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return cmpEq(x,y) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1239,25 +1389,15 @@
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_s16(reinterpret_cast<int16_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_s16(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_s16(x, y);
-  }
 };
 template
@@ -1285,66 +1425,93 @@ class SimdFilterProcessor<
     return vld1q_u16(reinterpret_cast<const uint16_t*>(from));
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_u16(x, y);
+  }
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_u16(x, y);
+  }
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_u16(x);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_u16(x);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_u16(x, y));
   }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_u16((uint16x8_t)mask, x, y);
+  }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_u16(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_u16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return cmpGe(y, x);
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_u16(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_u16(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1355,25 +1522,15 @@ class SimdFilterProcessor<
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_u16(reinterpret_cast<uint16_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_u16(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_u16(x, y);
-  }
 };
 template
@@ -1396,7 +1553,27 @@ class SimdFilterProcessor<
 {
   return loadValue(fill);
 }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_s8(x, y);
+  }
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_s8(x, y);
+  }
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_s8(x);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_s8(x, y);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_s8(x);
+  }
   MCS_FORCE_INLINE SimdType loadValue(const T fill)
   {
     return vdupq_n_s8(fill);
@@ -1407,59 +1584,67 @@
   {
     return vld1q_s8(reinterpret_cast<const int8_t*>(from));
   }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_s8((uint8x16_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_s8(x, y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1470,25 +1655,15 @@
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_s8(reinterpret_cast<int8_t*>(dst), x);
   }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_s8(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_s8(x, y);
-  }
 };
 template
@@ -1516,67 +1691,95 @@ class SimdFilterProcessor<
 {
   return vdupq_n_u8(fill);
 }
+  MCS_FORCE_INLINE T maxScalar(SimdType x)
+  {
+    return vmaxvq_u8(x);
+  }
+  MCS_FORCE_INLINE T minScalar(SimdType x)
+  {
+    return vminvq_u8(x);
+  }
   // Load from
   MCS_FORCE_INLINE SimdType loadFrom(const char* from)
   {
     return vld1q_u8(reinterpret_cast<const uint8_t*>(from));
   }
+  MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const
+  {
+    return vbslq_u8((uint8x16_t)mask, x, y);
+  }
+  MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const
+  {
+    return vandq_u8(x, y);
+  }
+  MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const
+  {
+    return (SimdType)vcgtq_u8(x, y);
+  }
   // Compare
-  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y));
   }
+  MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y)
+  {
+    return vminq_u8(x, y);
+  }
-  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y)
+  {
+    return vmaxq_u8(x, y);
+  }
+  MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y));
   }
-  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y)
   {
     return 0;
   }
-  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y)
   {
     return 0xFFFF;
   }
   // misc
-  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
+  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask)
   {
     return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y)
   {
     return cmpNe(x, y);
   }
-  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
+  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y)
   {
     return cmpEq(x, y);
   }
@@ -1587,25 +1790,15 @@
   }
   // store
-  MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
+  MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst)
   {
     arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst);
   }
-  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
+  MCS_FORCE_INLINE void store(char* dst, SimdType x)
   {
     vst1q_u8(reinterpret_cast<uint8_t*>(dst), x);
  }
-
-  MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y)
-  {
-    return vminq_u8(x, y);
-  }
-
-  MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y)
-  {
-    return vmaxq_u8(x, y);
-  }
 };
 };  // namespace simd
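
[Editor's note] Unlike SSE, AArch64 NEON has no movemask instruction, so the arm_neon_mm_movemask_* shims used throughout this header, and the new bitMaskToByteMask16() above, have to synthesize bitmask/bytemask conversions by hand. A hedged sketch of both directions using only standard intrinsics — the helper names are illustrative, not the shims the patch calls, and the per-lane bit table is the same one the patch encodes as two little-endian int64s, 0x8040201008040201:

#include <arm_neon.h>
#include <cstdint>

// Collapse a per-byte 0x00/0xFF predicate vector into a 16-bit scalar mask,
// one bit per byte lane (the role arm_neon_mm_movemask_epi8() plays here).
static inline uint16_t byteMaskToBitMask16(uint8x16_t m)
{
  // Lane i contributes bit (i % 8): {1, 2, 4, ..., 128} repeated twice.
  const uint8x16_t bits = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
  uint8x16_t masked = vandq_u8(m, bits);
  uint8_t lo = vaddv_u8(vget_low_u8(masked));   // horizontal add, AArch64 only
  uint8_t hi = vaddv_u8(vget_high_u8(masked));  // each half sums to one byte
  return (uint16_t)(lo | (hi << 8));
}

// Expand a 16-bit scalar mask back into a per-byte 0x00/0xFF vector,
// which is what bitMaskToByteMask16() computes via constant2i().
static inline uint8x16_t bitMaskToByteMask16Sketch(uint16_t m)
{
  // Broadcast the low byte to lanes 0..7 and the high byte to lanes 8..15,
  // isolate each lane's own bit, then turn nonzero into all-ones.
  uint8x16_t bcast = vcombine_u8(vdup_n_u8(m & 0xff), vdup_n_u8(m >> 8));
  const uint8x16_t bits = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
  return vcgtq_u8(vandq_u8(bcast, bits), vdupq_n_u8(0));
}

The same gap explains the 64-bit comparators: NEON has no vminq_s64/vmaxq_s64, so the int64/uint64 processors build min/max from vcgtq_s64/vcgtq_u64 plus vbslq, exactly as the reordered member functions above now do (the patch also swaps the previously inverted select operands).
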