diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index a3ac1e4e5..9bfaa66d1 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -1295,19 +1295,13 @@ inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType } template -void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, - SimdType& dataVec, SimdType& simdMin, SimdType& simdMax) +void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdProcessor, + SimdType dataVec, SimdType simdMin, SimdType simdMax) { - if (validMinMax) + if (validMinMax && nonNullOrEmptyMask) { - simdMin = simdProcessor.blend( - simdMin, dataVec, - simdProcessor.bwAnd(simdProcessor.cmpGt2(simdMin, dataVec), - bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); - simdMax = simdProcessor.blend( - simdMax, dataVec, - simdProcessor.bwAnd(simdProcessor.cmpGt2(dataVec, simdMax), - bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)))); + simdMin = simdProcessor.min(simdMin, dataVec); + simdMax = simdProcessor.max(simdMax, dataVec); } } @@ -1328,7 +1322,7 @@ void scalarUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& } template -void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min, T& max) +void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& min, T& max) { constexpr const uint16_t size = VT::vecByteSize / sizeof(T); T* simdMinVec = reinterpret_cast(&simdMin); @@ -1336,13 +1330,6 @@ void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min max = *std::max_element(simdMaxVec, simdMaxVec + size); min = *std::min_element(simdMinVec, simdMinVec + size); } - -template -void getInitialSimdMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T min, T max) -{ - simdMin = simdProcessor.loadValue(min); - simdMax = simdProcessor.loadValue(max); -} // This routine filters input block in a vectorized manner. // It supports all output types, all input types. // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand. @@ -1478,12 +1465,9 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* } } } - [[maybe_unused]] SimdType simdMin; - [[maybe_unused]] SimdType simdMax; - if constexpr (KIND != KIND_TEXT) - { - getInitialSimdMinMax(simdProcessor, simdMin, simdMax, min, max); - } + [[maybe_unused]] SimdType simdMin = simdDataLoad(simdProcessor, srcArray, + origSrcArray, ridArray, 0).v;; + [[maybe_unused]] SimdType simdMax = simdMin; // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. nonNullMask tells which vector coords @@ -1537,7 +1521,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* simdProcessor, valuesWritten, validMinMax, ridOffset, dataVecTPtr, ridDstArray, writeMask, min, max, in, out, nonNullOrEmptyMask, ridArray); - if constexpr (KIND != KIND_TEXT) + if constexpr (HAS_INPUT_RIDS && KIND != KIND_TEXT) { vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); } @@ -1556,7 +1540,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray += VECTOR_SIZE; ridArray += VECTOR_SIZE; } - if constexpr(KIND != KIND_TEXT) + if constexpr(HAS_INPUT_RIDS && KIND != KIND_TEXT) { extractMinMax(simdProcessor, simdMin, simdMax, min, max); } diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index fe755d0c9..e8f8bc2f2 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -250,30 +250,17 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return x; - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return x; - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return x; - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); } + + }; template @@ -389,30 +376,15 @@ class SimdFilterProcessor< _mm_storeu_pd(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_pd(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_pd(x, y); } - - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_pd(x, y, mask); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_pd(x, y); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_pd(x, y); - } }; template @@ -528,30 +500,15 @@ class SimdFilterProcessor< _mm_storeu_ps(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_ps(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_ps(x, y); } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_ps(x, y); - } - - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_ps(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_ps(x, y); - } }; template @@ -660,29 +617,14 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { - return _mm_blendv_epi8(x, y, mask); + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); } - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_epi64(x, y); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const - { - return blend(x, y, cmpGt2(x,y)); - } - - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const - { - return blend(x, y, cmpGt2(y,x)); + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); } }; @@ -795,32 +737,14 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { - return _mm_blendv_epi8(x, y, mask); + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); } - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - SimdType signVec = constant4i<0,(int32_t)0x80000000,0,(int32_t)0x80000000>(); - SimdType xFlip = _mm_xor_si128(x, signVec); - SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_cmpgt_epi64(xFlip, yFlip); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const - { - return blend(x, y, cmpGt2(x,y)); - } - - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const - { - return blend(x, y, cmpGt2(y,x)); + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); } }; @@ -930,27 +854,12 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_epi32(x, y); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epi32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epi32(x, y); } @@ -1065,30 +974,12 @@ class SimdFilterProcessor(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - SimdType signVec = constant4i<(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000>(); - SimdType xFlip = _mm_xor_si128(x, signVec); - SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_cmpgt_epi32(xFlip, yFlip); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epu32(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epu32(x, y); } @@ -1199,27 +1090,12 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_epi16(x, y); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epi16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epi16(x, y); } @@ -1331,30 +1207,12 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) - { - SimdType ones = - constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); - SimdType maxOfTwo = _mm_max_epu16(x, y); - return _mm_xor_si128(_mm_cmpeq_epi16(y, maxOfTwo), ones); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epu16(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epu16(x, y); } @@ -1472,27 +1330,12 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) const - { - return _mm_cmpgt_epi8(x, y); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epi8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epi8(x, y); } @@ -1611,30 +1454,12 @@ class SimdFilterProcessor< _mm_storeu_si128(reinterpret_cast(dst), x); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const - { - return _mm_blendv_epi8(x, y, mask); - } - - MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const - { - return _mm_and_si128(x, y); - } - - MCS_FORCE_INLINE SimdType cmpGt2(SimdType x, SimdType y) - { - SimdType ones = - constant4i<(int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF, (int32_t)0xFFFFFFFF>(); - SimdType maxOfTwo = _mm_max_epu8(x, y); - return _mm_xor_si128(_mm_cmpeq_epi8(y, maxOfTwo), ones); - } - - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) { return _mm_min_epu8(x, y); } - MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) { return _mm_max_epu8(x, y); }