diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 974a277da..01be9e39d 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -1,5 +1,5 @@ /* Copyright (C) 2014 InfiniDB, Inc. - Copyright (C) 2016-2021 MariaDB Corporation + Copyright (C) 2016-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef _MSC_VER #include #else @@ -62,6 +63,55 @@ inline uint64_t order_swap(uint64_t x) return ret; } +// Dummy template +template= sizeof(uint128_t), T>::type* = nullptr> +inline T orderSwap(T x) +{ + return x; +} + +template::type* = nullptr> +inline T orderSwap(T x) +{ + T ret = (x >> 56) | + ((x << 40) & 0x00FF000000000000ULL) | + ((x << 24) & 0x0000FF0000000000ULL) | + ((x << 8) & 0x000000FF00000000ULL) | + ((x >> 8) & 0x00000000FF000000ULL) | + ((x >> 24) & 0x0000000000FF0000ULL) | + ((x >> 40) & 0x000000000000FF00ULL) | + (x << 56); + return ret; +} + +template::type* = nullptr> +inline T orderSwap(T x) +{ + T ret = (x >> 24) | + ((x << 8) & 0x00FF0000U) | + ((x >> 8) & 0x0000FF00U) | + (x << 24); + return ret; +} + +template::type* = nullptr> +inline T orderSwap(T x) +{ + T ret = (x >> 8) | (x <<8); + return ret; +} + +template::type* = nullptr> +inline T orderSwap(T x) +{ + return x; +} + template inline int compareBlock(const void* a, const void* b) { @@ -107,8 +157,11 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP) } } -inline bool colCompareStr(const ColRequestHeaderDataType& type, uint8_t COP, const utils::ConstString& val1, - const utils::ConstString& val2) +inline bool colCompareStr(const ColRequestHeaderDataType &type, + uint8_t COP, + const utils::ConstString &val1, + const utils::ConstString &val2, + const bool printOut = false) { int error = 0; bool rc = primitives::StringComparator(type).op(&error, COP, val1, val2); @@ -1179,7 +1232,7 @@ void scalarFiltering( #if defined(__x86_64__) template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, const T* origSrcArray, +inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, const primitives::RIDType* ridArray, const uint16_t iter) { return {processor.loadFrom(reinterpret_cast(srcArray))}; @@ -1189,7 +1242,7 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, // TODO Move the logic into simd namespace class methods and use intrinsics template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, const T* origSrcArray, +inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, const primitives::RIDType* ridArray, const uint16_t iter) { constexpr const uint16_t WIDTH = sizeof(T); @@ -1205,6 +1258,32 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, return {result}; } +template ::type* = nullptr> +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, VT& processor, typename VT::SimdType& dataVector) +{ + return {dataVector}; +} + +template ::type* = nullptr> +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, + VT& processor, typename VT::SimdType& dataVector) +{ + constexpr const uint16_t WIDTH = sizeof(T); + constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; + using SimdType = typename VT::SimdType; + SimdType result; + T* resultTypedPtr = reinterpret_cast(&result); + T* srcTypedPtr = reinterpret_cast(&dataVector); + for (uint32_t i = 0; i < VECTOR_SIZE; ++i) + { + utils::ConstString s{reinterpret_cast(&srcTypedPtr[i]), WIDTH}; + resultTypedPtr[i] = orderSwap(type.strnxfrm(s.rtrimZero())); + } + return {result}; +} + // This routine filters input block in a vectorized manner. // It supports all output types, all input types. // It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand. @@ -1214,8 +1293,8 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, // to glue the masks produced by actual filters. // Then it takes a vector of data, run filters and logical function using pointers. // See the corresponding dispatcher to get more details on vector processing class. -template +template void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const T emptyValue, @@ -1225,8 +1304,12 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* using SimdType = typename VT::SimdType; using SimdWrapperType = typename VT::SimdWrapperType; using FilterType = typename VT::FilterType; + using UT = typename std::conditional::value || datatypes::is_uint128_t::value || std::is_same::value, + FilterType, typename datatypes::make_unsigned::type>::type; VT simdProcessor; SimdType dataVec; + [[maybe_unused]] SimdType swapedOrderDataVec; + [[maybe_unused]] auto typeHolder = in->colType; SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue); SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue); MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask; @@ -1292,11 +1375,27 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* for (uint32_t j = 0; j < filterCount; ++j) { // Preload filter argument values only once. - filterArgsVectors.push_back(simdProcessor.loadValue(*((FilterType*)&filterValues[j]))); + if constexpr (KIND == KIND_TEXT) + { + // Preload filter argument values only once. + // First cast filter value as the corresponding unsigned int value + UT filterValue = *((UT*)&filterValues[j]); + // Cast to ConstString to preprocess the string + utils::ConstString s{reinterpret_cast(&filterValue), sizeof(UT)}; + // Strip all 0 bytes on the right, convert byte into collation weights array + // and swap bytes order. + UT bigEndianFilterWeights = orderSwap(typeHolder.strnxfrm(s.rtrimZero())); + filterArgsVectors.push_back(simdProcessor.loadValue(bigEndianFilterWeights)); + } + else + { + FilterType filterValue = *((FilterType*)&filterValues[j]); + filterArgsVectors.push_back(simdProcessor.loadValue(filterValue)); + } switch (filterCOPs[j]) { case (COMPARE_EQ): - // Skipping extra filter pass generated by IS NULL + // Filter against NULL value if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0) copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq)); else @@ -1329,9 +1428,10 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* { primitives::RIDType ridOffset = i * VECTOR_SIZE; assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset)); - dataVec = simdDataLoadTemplate(simdProcessor, srcArray, - origSrcArray, ridArray, i) - .v; + dataVec = simdDataLoad(simdProcessor, srcArray, + origSrcArray, ridArray, i).v; + if constexpr(KIND==KIND_TEXT) + swapedOrderDataVec = simdSwapedOrderDataLoad(typeHolder, simdProcessor, dataVec).v; nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec); writeMask = nonEmptyMask; // NULL check @@ -1346,7 +1446,11 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* for (uint32_t j = 0; j < filterCount; ++j) { // filter using compiled filter and preloaded filter argument - filterMask = copFunctorVec[j](simdProcessor, dataVec, filterArgsVectors[j]); + if constexpr(KIND==KIND_TEXT) + filterMask = copFunctorVec[j](simdProcessor, swapedOrderDataVec, filterArgsVectors[j]); + else + filterMask = copFunctorVec[j](simdProcessor, dataVec, filterArgsVectors[j]); + filterMask = bopFunctor(prevFilterMask, filterMask); prevFilterMask = filterMask; } @@ -1389,7 +1493,6 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* out->Min = Min; out->Max = Max; } - // process the tail. scalarFiltering changes out contents, e.g. Min/Max, NVALS, RIDs and values array // This tail also sets out::Min/Max, out::validMinMax if validMinMax is set. uint32_t processedSoFar = rid; @@ -1526,7 +1629,8 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r #if defined(__x86_64__) // Don't use vectorized filtering for text based data types. - if (KIND <= KIND_FLOAT && WIDTH < 16) + if (WIDTH < 16 && + (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid()) )) { bool canUseFastFiltering = true; for (uint32_t i = 0; i < filterCount; ++i) @@ -1672,6 +1776,8 @@ template ::value || datatypes::is_uint128_t::value, T, + typename datatypes::make_unsigned::type>::type; const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W; @@ -1682,16 +1788,12 @@ void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, C dataType == execplan::CalpontSystemCatalog::TEXT) && !isDictTokenScan(in)) { - using UT = typename std::conditional::value, T, - typename datatypes::make_unsigned::type>::type; filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter); return; } if (datatypes::isUnsigned(dataType)) { - using UT = typename std::conditional::value || datatypes::is_uint128_t::value, T, - typename datatypes::make_unsigned::type>::type; filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter); return; } diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp index 519d02845..8883a8743 100644 --- a/tests/simd_processors.cpp +++ b/tests/simd_processors.cpp @@ -30,6 +30,7 @@ class SimdProcessorTypedTest : public testing::Test { using IntegralType = T; public: + void SetUp() override { } diff --git a/utils/common/collation.h b/utils/common/collation.h index 5d43df31f..721c98d16 100644 --- a/utils/common/collation.h +++ b/utils/common/collation.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2020 MariaDB Corporation + Copyright (C) 2020-2022 MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -135,6 +135,8 @@ class Charset { protected: const struct charset_info_st* mCharset; + private: + static constexpr uint flags_ = MY_STRXFRM_PAD_WITH_SPACE | MY_STRXFRM_PAD_TO_MAXLEN; public: Charset(CHARSET_INFO& cs) : mCharset(&cs) @@ -182,9 +184,31 @@ class Charset size_t strnxfrm(uchar* dst, size_t dstlen, uint nweights, const uchar* src, size_t srclen, uint flags) { idbassert(mCharset->coll); - return mCharset->coll->strnxfrm(mCharset, dst, dstlen, nweights, src, srclen, flags); } + // The magic check that tells that bytes are mapped to weights as 1:1 + bool strnxfrmIsValid() const + { + return (mCharset->state & MY_CS_NON1TO1) == 0; + } + template + T strnxfrm(const char* src) const + { + T ret = 0; + size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), + src, sizeof(T), flags_); + assert(len <= sizeof(T)); + return ret; + } + template + T strnxfrm(const utils::ConstString &src) const + { + T ret = 0; + size_t len __attribute__((unused)) = mCharset->strnxfrm((char*)&ret, sizeof(T), sizeof(T), + (char*)src.str(), src.length(), flags_); + assert(len <= sizeof(T)); + return ret; + } }; class CollationAwareHasher : public Charset diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index ccf2ff2b7..d6407f58e 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2021 Mariadb Corporation. +/* Copyright (C) 2021-2022 Mariadb Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -116,6 +116,15 @@ struct StorageToFiltering:: using type = T; }; +template +static inline vi128_t constant4i() { + static const union { + int i[4]; + vi128_t xmm; + } u = {{i0,i1,i2,i3}}; + return u.xmm; +} + template class SimdFilterProcessor; @@ -462,7 +471,7 @@ class SimdFilterProcessor< template class SimdFilterProcessor::value && sizeof(CHECK_T) == 8 && + typename std::enable_if::value && std::is_same::value && !std::is_same::value>::type> { public: @@ -569,7 +578,117 @@ class SimdFilterProcessor class SimdFilterProcessor::value && sizeof(CHECK_T) == 4 && + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> +{ + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. + constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set_epi64x(fill, fill); + } + + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_si128(reinterpret_cast(from)); + } + + // Compare + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + return cmpGt(y, x) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + SimdType signVec = constant4i<0,(int32_t)0x80000000,0,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_movemask_epi8(_mm_cmpgt_epi64(xFlip, yFlip)); + } + + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)); + } + + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return cmpGt(x, y) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return cmpGt(y, x); + } + + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + } + + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_epi8(vmask); + } + + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_si128(); + } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + // store + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + { + _mm_maskmoveu_si128(x, vmask, dst); + } + + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_si128(reinterpret_cast(dst), x); + } +}; + +template +class SimdFilterProcessor::value && std::is_same::value && !std::is_same::value>::type> { public: @@ -674,9 +793,119 @@ class SimdFilterProcessor +class SimdFilterProcessor::value && std::is_same::value && + !std::is_same::value>::type> +{ + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. + constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set1_epi32(fill); + } + + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_si128(reinterpret_cast(from)); + } + + // Compare + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)); + } + + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + return cmpGt(y, x) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + SimdType signVec = constant4i<(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000,(int32_t)0x80000000>(); + SimdType xFlip = _mm_xor_si128(x, signVec); + SimdType yFlip = _mm_xor_si128(y, signVec); + return _mm_movemask_epi8(_mm_cmpgt_epi32(xFlip, yFlip)); + } + + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return cmpGt(x, y) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return cmpGt(y, x); + } + + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + } + + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_epi8(vmask); + } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_si128(); + } + + // store + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + { + _mm_maskmoveu_si128(x, vmask, dst); + } + + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_si128(reinterpret_cast(dst), x); + } +}; + template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && sizeof(CHECK_T) == 2>::type> + VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -782,7 +1011,227 @@ class SimdFilterProcessor< template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && sizeof(CHECK_T) == 1>::type> + VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> +{ + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = simd::vi128_wr; + using SimdType = simd::vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. + constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set1_epi16(fill); + } + + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_si128(reinterpret_cast(from)); + } + + // Compare + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); + } + + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned + return _mm_movemask_epi8(_mm_cmpeq_epi16(x, maxOfTwo)); + } + + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + return cmpGe(y, x) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return cmpGe(y, x); + } + + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return cmpGe(x, y) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + } + + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_epi8(vmask); + } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_si128(); + } + + // store + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + { + _mm_maskmoveu_si128(x, vmask, dst); + } + + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_si128(reinterpret_cast(dst), x); + } +}; + +template +class SimdFilterProcessor< + VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> +{ + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. + constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set1_epi8(fill); + } + + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_si128(reinterpret_cast(from)); + } + + // Compare + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); + } + + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + return cmpLt(x, y) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); + } + + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return cmpGt(x, y) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); + } + + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; + } + + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + } + + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // permute + /* TODO Available in AVX-512 + MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx) + { + return _mm_permutexvar_epi8(x, idx); + } + */ + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_epi8(vmask); + } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_si128(); + } + + // store + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) + { + _mm_maskmoveu_si128(x, vmask, dst); + } + + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_si128(reinterpret_cast(dst), x); + } +}; + +template +class SimdFilterProcessor< + VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U;