diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 4f5f88641..feb139f60 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #ifndef _MSC_VER #include @@ -118,6 +119,24 @@ inline int compareBlock(const void* a, const void* b) return ((*(T*)a) - (*(T*)b)); } +template +std::enable_if_t< + sizeof(To) == sizeof(From) && + std::is_trivially_copyable_v && + std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bitCast(const From& src) noexcept +{ + static_assert(std::is_trivially_constructible_v, + "This implementation additionally requires " + "destination type to be trivially constructible"); + + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} + // this function is out-of-band, we don't need to inline it void logIt(int mid, int arg1, const string& arg2 = string()) { @@ -1277,13 +1296,19 @@ inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType } template -void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdProcessor, +void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, SimdType& dataVec, SimdType& simdMin, SimdType& simdMax) { - if (validMinMax && nonNullOrEmptyMask) + if (validMinMax) { - simdMin = simdProcessor.min(simdMin, dataVec); - simdMax = simdProcessor.max(simdMax, dataVec); + simdMin = simdProcessor.blend( + simdMin, dataVec, + simdProcessor.bwAnd(simdProcessor.cmpGt2(simdMin, dataVec), + bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); + simdMax = simdProcessor.blend( + simdMax, dataVec, + simdProcessor.bwAnd(simdProcessor.cmpGt2(dataVec, simdMax), + bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); } } @@ -1304,7 +1329,7 @@ void scalarUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& } template -void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& min, T& max) +void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min, T& max) { constexpr const uint16_t size = VT::vecByteSize / sizeof(T); T* simdMinVec = reinterpret_cast(&simdMin); @@ -1312,6 +1337,13 @@ void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& m max = *std::max_element(simdMaxVec, simdMaxVec + size); min = *std::min_element(simdMinVec, simdMinVec + size); } + +template +void getInitialSimdMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T min, T max) +{ + simdMin = simdProcessor.loadValue(min); + simdMax = simdProcessor.loadValue(max); +} // This routine filters input block in a vectorized manner. // It supports all output types, all input types. // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand. @@ -1447,9 +1479,12 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* } } } - [[maybe_unused]] SimdType simdMin = simdDataLoad(simdProcessor, srcArray, - origSrcArray, ridArray, 0).v;; - [[maybe_unused]] SimdType simdMax = simdMin; + [[maybe_unused]] SimdType simdMin; + [[maybe_unused]] SimdType simdMax; + if constexpr (KIND != KIND_TEXT) + { + getInitialSimdMinMax(simdProcessor, simdMin, simdMax, min, max); + } // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. nonNullMask tells which vector coords diff --git a/tests/col1block.h b/tests/col1block.h index 9346e4055..1b79a2dce 100644 --- a/tests/col1block.h +++ b/tests/col1block.h @@ -500,5 +500,5 @@ unsigned char __col1block_cdf[] = { 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}; unsigned int __col1block_cdf_len = 8192; -constexpr int __col1block_cdf_umin = -128; +constexpr int __col1block_cdf_umin = -126; constexpr int __col1block_cdf_umax = 127; diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp index 8e24a8fcd..0c8f145d7 100644 --- a/tests/simd_processors.cpp +++ b/tests/simd_processors.cpp @@ -37,19 +37,22 @@ using namespace std; template -class SimdProcessorTypedTest : public testing::Test { -public: +class SimdProcessorTypedTest : public testing::Test +{ + public: using IntegralType = T; - #if TESTS_USING_SSE - using SimdType = std::conditional_t::value, - simd::vi128f_wr, - std::conditional_t::value, - simd::vi128d_wr, - simd::vi128_wr>>; - using Proc = typename simd::SimdFilterProcessor; - #else - using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; - #endif +#if TESTS_USING_SSE + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, simd::vi128_wr>>; + using Proc = typename simd::SimdFilterProcessor; +#else + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, + typename simd::TypeToVecWrapperType::WrapperType>>; + using Proc = typename simd::SimdFilterProcessor; +#endif void SetUp() override { } diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index f9277593c..fe755d0c9 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -125,6 +125,16 @@ static inline vi128_t constant4i() { return u.xmm; } +static inline vi128_t bitMaskToByteMask16(MT m) { + vi128_t sel = _mm_set1_epi64x(0x8040201008040201); + return _mm_cmpeq_epi8( + _mm_and_si128( + _mm_shuffle_epi8(_mm_cvtsi32_si128(m), + _mm_set_epi64x(0x0101010101010101, 0)), + sel), + sel); +} + template class SimdFilterProcessor; @@ -163,63 +173,63 @@ class SimdFilterProcessor< return _mm_loadu_si128(reinterpret_cast(from)); } - MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpDummy(SimdType x, SimdType y) { return 0xFFFF; } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } @@ -230,27 +240,40 @@ class SimdFilterProcessor< } // store - MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return x; + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return x; + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return x; + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); } - - }; template @@ -291,53 +314,53 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x, y)); } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y)); } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y)); } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y)); } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y)); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_pd(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; NullEmptySimdType* xAsIntVecPtr = reinterpret_cast(&x); @@ -346,7 +369,7 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -361,20 +384,35 @@ class SimdFilterProcessor< return _mm_setzero_pd(); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_pd(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_pd(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_pd(x, y); } + + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_pd(x, y, mask); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_pd(x, y); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_pd(x, y); + } }; template @@ -414,53 +452,53 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_ps(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_ps(x, y)); } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_ps(x, y)); } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmple_ps(x, y)); } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_ps(x, y)); } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_ps(x, y)); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_ps(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -470,7 +508,7 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -485,20 +523,35 @@ class SimdFilterProcessor< return _mm_setzero_ps(); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_ps(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_ps(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_ps(x, y); } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_ps(x, y); + } + + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_ps(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_ps(x, y); + } }; template @@ -535,48 +588,48 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + return _mm_blendv_epi8(x, y, mask); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi64(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(y,x)); } }; @@ -652,12 +720,12 @@ class SimdFilterProcessor(); SimdType xFlip = _mm_xor_si128(x, signVec); @@ -665,38 +733,38 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + return _mm_blendv_epi8(x, y, mask); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + SimdType signVec = constant4i<0,(int32_t)0x80000000,0,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_cmpgt_epi64(xFlip, yFlip); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(y,x)); } }; @@ -772,58 +858,58 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi32(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi32(x, y); } @@ -889,17 +990,17 @@ class SimdFilterProcessor(); SimdType xFlip = _mm_xor_si128(x, signVec); @@ -907,43 +1008,43 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + SimdType signVec = constant4i<(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_cmpgt_epi32(xFlip, yFlip); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu32(x, y); } @@ -1008,58 +1127,58 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { return cmpLt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmplt_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1070,22 +1189,37 @@ class SimdFilterProcessor< } // store - MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi16(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi16(x, y); } @@ -1124,59 +1258,59 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned return _mm_movemask_epi8(_mm_cmpeq_epi16(x, maxOfTwo)); } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return cmpGe(y, x) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return cmpGe(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1187,22 +1321,40 @@ class SimdFilterProcessor< } // store - MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) + { + SimdType ones = + constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); + SimdType maxOfTwo = _mm_max_epu16(x, y); + return _mm_xor_si128(_mm_cmpeq_epi16(y, maxOfTwo), ones); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu16(x, y); } @@ -1241,65 +1393,65 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { return cmpLt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // permute /* TODO Available in AVX-512 - MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx) + MCS_FORCE_INLINE SimdType perm8Bits(SimdType x, SimdType idx) { return _mm_permutexvar_epi8(x, idx); } */ // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1310,22 +1462,37 @@ class SimdFilterProcessor< } // store - MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi8(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi8(x, y); } @@ -1364,65 +1531,66 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return cmpLt(x, y) ^ 0xFFFF; + SimdType maxOfTwo = _mm_max_epu8(x, y); // max(x, y), unsigned + return _mm_movemask_epi8(_mm_cmpeq_epi8(x, maxOfTwo)); } - MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); + return cmpGe(y, x) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); + return cmpGe(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { return 0; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { return 0xFFFF; } // permute /* TODO Available in AVX-512 - MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx) + MCS_FORCE_INLINE SimdType perm8Bits(SimdType x, SimdType idx) { return _mm_permutexvar_epi8(x, idx); } */ // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1433,22 +1601,40 @@ class SimdFilterProcessor< } // store - MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, SimdType& x) + MCS_FORCE_INLINE void store(char* dst, SimdType x) { _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) + { + SimdType ones = + constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); + SimdType maxOfTwo = _mm_max_epu8(x, y); + return _mm_xor_si128(_mm_cmpeq_epi8(y, maxOfTwo), ones); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu8(x, y); }