From 9930d0deddbae26b0158d44a632394aeee323129 Mon Sep 17 00:00:00 2001 From: Andrey Piskunov Date: Wed, 29 Jun 2022 14:06:43 +0300 Subject: [PATCH] Vectorized update min max --- primitives/linux-port/column.cpp | 75 ++++++++++++++++----- utils/common/simd_sse.h | 112 +++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 16 deletions(-) diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 01be9e39d..a777722e0 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -1001,9 +1001,6 @@ inline uint16_t vectWriteColValues( *tmpDstVecTPtr = dataVecTPtr[j]; ++tmpDstVecTPtr; } - - vectUpdateMinMax(validMinMax, nonNullOrEmptyMask & bitMapPosition, Min, Max, - dataVecTPtr[j], in); } // Store the whole vector however one level up the stack // vectorizedFiltering() increases the dstArray by a number of @@ -1069,8 +1066,6 @@ inline uint16_t vectWriteColValues( vectWriteColValuesLoopRIDAsignment(ridDstArray, out, ridOffset + j, ridSrcArray, j); ++ridDstArray; } - vectUpdateMinMax(validMinMax, nonNullOrEmptyMask & bitMapPosition, Min, Max, - dataVecTPtr[j], in); } // Store the whole vector however one level up the stack // vectorizedFiltering() increases the dstArray by a number of @@ -1106,14 +1101,11 @@ inline uint16_t vectWriteRIDValues( uint16_t j = 0; for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - MT bitMapPosition = 1 << it; if (writeMask & (1 << it)) { vectWriteColValuesLoopRIDAsignment(ridDstArray, out, ridOffset + j, ridSrcArray, j); ++ridDstArray; } - vectUpdateMinMax(validMinMax, nonNullOrEmptyMask & bitMapPosition, Min, Max, - dataVecTPtr[j], in); } return ridDstArray - origRIDDstArray; } @@ -1284,6 +1276,42 @@ inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType return {result}; } +template +void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdProcessor, + SimdType dataVec, SimdType simdMin, SimdType simdMax) +{ + if (validMinMax && nonNullOrEmptyMask) + { + simdMin = simdProcessor.min(simdMin, dataVec); + simdMax = simdProcessor.max(simdMax, dataVec); + } +} + +template +void scalarUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT& simdPRocessor, + T* dataVecTPtr, T& min, T& max, NewColRequestHeader* in) +{ + constexpr const uint16_t filterMaskStep = VT::FilterMaskStep; + uint16_t j = 0; + for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += filterMaskStep) + { + MT bitMapPosition = 1 << it; + if (validMinMax && (nonNullOrEmptyMask & bitMapPosition)) + { + updateMinMax(min, max, dataVecTPtr[j], in); + } + } +} + +template +void extractMinMax(VT& simdProcessor, SimdType& simdMin, SimdType& simdMax, T& min, T& max) +{ + constexpr const uint16_t size = VT::vecByteSize / sizeof(T); + T* simdMinVec = reinterpret_cast(&simdMin); + T* simdMaxVec = reinterpret_cast(&simdMax); + max = *std::max_element(simdMaxVec, simdMaxVec + size); + min = *std::min_element(simdMinVec, simdMinVec + size); +} // This routine filters input block in a vectorized manner. // It supports all output types, all input types. // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand. @@ -1298,7 +1326,7 @@ template(simdProcessor, srcArray, + origSrcArray, ridArray, 0).v;; + [[maybe_unused]] SimdType simdMax = simdMin; // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. nonNullMask tells which vector coords @@ -1465,14 +1495,24 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* // outside the scope of the memory allocated to out msg. // vectWriteColValues is empty if outputMode == OT_RID. uint16_t valuesWritten = vectWriteColValues( - simdProcessor, writeMask, nonNullOrEmptyMask, validMinMax, ridOffset, dataVecTPtr, dstArray, Min, Max, + simdProcessor, writeMask, nonNullOrEmptyMask, validMinMax, ridOffset, dataVecTPtr, dstArray, min, max, in, out, ridDstArray, ridArray); // Some outputType modes saves RIDs also. vectWriteRIDValues is empty for // OT_DATAVALUE, OT_BOTH(vectWriteColValues takes care about RIDs). valuesWritten = vectWriteRIDValues( - simdProcessor, valuesWritten, validMinMax, ridOffset, dataVecTPtr, ridDstArray, writeMask, Min, Max, + simdProcessor, valuesWritten, validMinMax, ridOffset, dataVecTPtr, ridDstArray, writeMask, min, max, in, out, nonNullOrEmptyMask, ridArray); + if constexpr (HAS_INPUT_RIDS && KIND != KIND_TEXT) + { + vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); + } + else + { + scalarUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVecTPtr, min, max, + in); + } + // Calculate bytes written uint16_t bytesWritten = valuesWritten * WIDTH; totalValuesWritten += valuesWritten; @@ -1482,7 +1522,10 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray += VECTOR_SIZE; ridArray += VECTOR_SIZE; } - + if constexpr(HAS_INPUT_RIDS && KIND != KIND_TEXT) + { + extractMinMax(simdProcessor, simdMin, simdMax, min, max); + } // Set the number of output values here b/c tail processing can skip this operation. out->NVALS = totalValuesWritten; @@ -1490,8 +1533,8 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* out->ValidMinMax = validMinMax; if (validMinMax) { - out->Min = Min; - out->Max = Max; + out->Min = min; + out->Max = max; } // process the tail. scalarFiltering changes out contents, e.g. Min/Max, NVALS, RIDs and values array // This tail also sets out::Min/Max, out::validMinMax if validMinMax is set. @@ -1499,7 +1542,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* scalarFiltering(in, out, columnFilterMode, filterSet, filterCount, filterCOPs, filterValues, filterRFs, in->colType, origSrcArray, srcSize, origRidArray, ridSize, processedSoFar, outputType, validMinMax, emptyValue, nullValue, - Min, Max, isNullValueMatches); + min, max, isNullValueMatches); } // This routine dispatches template function calls to reduce branching. diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index d6407f58e..f9277593c 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -239,6 +239,18 @@ class SimdFilterProcessor< { _mm_storeu_si128(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); + } + + }; template @@ -353,6 +365,16 @@ class SimdFilterProcessor< { _mm_storeu_pd(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_pd(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_pd(x, y); + } }; template @@ -467,6 +489,16 @@ class SimdFilterProcessor< { _mm_storeu_ps(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_ps(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_ps(x, y); + } }; template @@ -574,6 +606,16 @@ class SimdFilterProcessor(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + } }; template @@ -684,6 +726,16 @@ class SimdFilterProcessor(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(x,y)); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_blendv_epi8(x, y, _mm_cmpgt_epi64(y,x)); + } }; template @@ -791,6 +843,16 @@ class SimdFilterProcessor(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epi32(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epi32(x, y); + } }; template @@ -901,6 +963,16 @@ class SimdFilterProcessor(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epu32(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epu32(x, y); + } }; template @@ -1007,6 +1079,16 @@ class SimdFilterProcessor< { _mm_storeu_si128(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epi16(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epi16(x, y); + } }; template @@ -1114,6 +1196,16 @@ class SimdFilterProcessor< { _mm_storeu_si128(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epu16(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epu16(x, y); + } }; template @@ -1227,6 +1319,16 @@ class SimdFilterProcessor< { _mm_storeu_si128(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epi8(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epi8(x, y); + } }; template @@ -1340,6 +1442,16 @@ class SimdFilterProcessor< { _mm_storeu_si128(reinterpret_cast(dst), x); } + + MCS_FORCE_INLINE SimdType min(SimdType& x, SimdType& y) + { + return _mm_min_epu8(x, y); + } + + MCS_FORCE_INLINE SimdType max(SimdType& x, SimdType& y) + { + return _mm_max_epu8(x, y); + } }; } // namespace simd