diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 4fb629b4f..feb139f60 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #ifndef _MSC_VER #include @@ -1295,13 +1296,19 @@ inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType } template -void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdProcessor, +void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, SimdType& dataVec, SimdType& simdMin, SimdType& simdMax) { - if (validMinMax && nonNullOrEmptyMask) + if (validMinMax) { - simdMin = simdProcessor.min(simdMin, dataVec); - simdMax = simdProcessor.max(simdMax, dataVec); + simdMin = simdProcessor.blend( + simdMin, dataVec, + simdProcessor.bwAnd(simdProcessor.cmpGt2(simdMin, dataVec), + bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); + simdMax = simdProcessor.blend( + simdMax, dataVec, + simdProcessor.bwAnd(simdProcessor.cmpGt2(dataVec, simdMax), + bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); } } @@ -1322,7 +1329,7 @@ void scalarUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& } template -void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& min, T& max) +void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min, T& max) { constexpr const uint16_t size = VT::vecByteSize / sizeof(T); T* simdMinVec = reinterpret_cast(&simdMin); @@ -1330,6 +1337,13 @@ void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& m max = *std::max_element(simdMaxVec, simdMaxVec + size); min = *std::min_element(simdMinVec, simdMinVec + size); } + +template +void getInitialSimdMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T min, T max) +{ + simdMin = simdProcessor.loadValue(min); + simdMax = simdProcessor.loadValue(max); +} // This routine filters input block in a vectorized manner. // It supports all output types, all input types. // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand. @@ -1465,9 +1479,12 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* } } } - [[maybe_unused]] SimdType simdMin = simdDataLoad(simdProcessor, srcArray, - origSrcArray, ridArray, 0).v;; - [[maybe_unused]] SimdType simdMax = simdMin; + [[maybe_unused]] SimdType simdMin; + [[maybe_unused]] SimdType simdMax; + if constexpr (KIND != KIND_TEXT) + { + getInitialSimdMinMax(simdProcessor, simdMin, simdMax, min, max); + } // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. nonNullMask tells which vector coords diff --git a/tests/col1block.h b/tests/col1block.h index 9346e4055..1b79a2dce 100644 --- a/tests/col1block.h +++ b/tests/col1block.h @@ -500,5 +500,5 @@ unsigned char __col1block_cdf[] = { 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}; unsigned int __col1block_cdf_len = 8192; -constexpr int __col1block_cdf_umin = -128; +constexpr int __col1block_cdf_umin = -126; constexpr int __col1block_cdf_umax = 127; diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp index 8e24a8fcd..0c8f145d7 100644 --- a/tests/simd_processors.cpp +++ b/tests/simd_processors.cpp @@ -37,19 +37,22 @@ using namespace std; template -class SimdProcessorTypedTest : public testing::Test { -public: +class SimdProcessorTypedTest : public testing::Test +{ + public: using IntegralType = T; - #if TESTS_USING_SSE - using SimdType = std::conditional_t::value, - simd::vi128f_wr, - std::conditional_t::value, - simd::vi128d_wr, - simd::vi128_wr>>; - using Proc = typename simd::SimdFilterProcessor; - #else - using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; - #endif +#if TESTS_USING_SSE + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, simd::vi128_wr>>; + using Proc = typename simd::SimdFilterProcessor; +#else + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, + typename simd::TypeToVecWrapperType::WrapperType>>; + using Proc = typename simd::SimdFilterProcessor; +#endif void SetUp() override { } diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index e8f8bc2f2..fe755d0c9 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -250,17 +250,30 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return x; + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return x; + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return x; + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); } - - }; template @@ -376,15 +389,30 @@ class SimdFilterProcessor< _mm_storeu_pd(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_pd(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_pd(x, y); } + + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_pd(x, y, mask); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_pd(x, y); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_pd(x, y); + } }; template @@ -500,15 +528,30 @@ class SimdFilterProcessor< _mm_storeu_ps(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_ps(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_ps(x, y); } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_ps(x, y); + } + + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_ps(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_ps(x, y); + } }; template @@ -617,14 +660,29 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + return _mm_blendv_epi8(x, y, mask); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi64(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(y,x)); } }; @@ -737,14 +795,32 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + return _mm_blendv_epi8(x, y, mask); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { - return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + SimdType signVec = constant4i<0,(int32_t)0x80000000,0,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_cmpgt_epi64(xFlip, yFlip); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + { + return blend(x, y, cmpGt2(y,x)); } }; @@ -854,12 +930,27 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi32(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi32(x, y); } @@ -974,12 +1065,30 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + SimdType signVec = constant4i<(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_cmpgt_epi32(xFlip, yFlip); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu32(x, y); } @@ -1090,12 +1199,27 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi16(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi16(x, y); } @@ -1207,12 +1331,30 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) + { + SimdType ones = + constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); + SimdType maxOfTwo = _mm_max_epu16(x, y); + return _mm_xor_si128(_mm_cmpeq_epi16(y, maxOfTwo), ones); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu16(x, y); } @@ -1330,12 +1472,27 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const + { + return _mm_cmpgt_epi8(x, y); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epi8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epi8(x, y); } @@ -1454,12 +1611,30 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + { + return _mm_blendv_epi8(x, y, mask); + } + + MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + { + return _mm_and_si128(x, y); + } + + MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) + { + SimdType ones = + constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); + SimdType maxOfTwo = _mm_max_epu8(x, y); + return _mm_xor_si128(_mm_cmpeq_epi8(y, maxOfTwo), ones); + } + + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const { return _mm_min_epu8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { return _mm_max_epu8(x, y); }