diff --git a/dbcon/joblist/primitivemsg.h b/dbcon/joblist/primitivemsg.h index 93413c1ce..916e4e6ce 100644 --- a/dbcon/joblist/primitivemsg.h +++ b/dbcon/joblist/primitivemsg.h @@ -42,13 +42,14 @@ // from blocksize.h const int32_t DATA_BLOCK_SIZE = BLOCK_SIZE; -const int8_t COMPARE_NIL = 0x00; +const int8_t COMPARE_NIL = 0x00; // means c = NULL predicate const int8_t COMPARE_LT = 0x01; const int8_t COMPARE_EQ = 0x02; const int8_t COMPARE_LE = (COMPARE_LT | COMPARE_EQ); // 0x03 const int8_t COMPARE_GT = 0x04; const int8_t COMPARE_NE = (COMPARE_LT | COMPARE_GT); // 0x05 const int8_t COMPARE_GE = (COMPARE_GT | COMPARE_EQ); // 0x06 +const int8_t COMPARE_NULLEQ = 0x07; // means c IS NULL(see column.cpp for details) const int8_t COMPARE_NOT = 0x08; const int8_t COMPARE_NLT = (COMPARE_LT | COMPARE_NOT); // 0x09 const int8_t COMPARE_NLE = (COMPARE_LE | COMPARE_NOT); // 0x0b @@ -884,4 +885,3 @@ struct LbidAtVer #endif #pragma pack(pop) - diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 355b928ec..dbcc63ab2 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -55,123 +55,34 @@ using namespace execplan; namespace { -using MT = uint16_t; - -const MT nonEmptyMask2Byte[256] = +template +inline typename VT::MaskType getNonEmptyMaskAux(typename VT::MaskType* nonEmptyMaskAux, uint16_t iter) { - 0x0000, 0x0003, 0x000C, 0x000F, 0x0030, 0x0033, 0x003C, 0x003F, - 0x00C0, 0x00C3, 0x00CC, 0x00CF, 0x00F0, 0x00F3, 0x00FC, 0x00FF, - 0x0300, 0x0303, 0x030C, 0x030F, 0x0330, 0x0333, 0x033C, 0x033F, - 0x03C0, 0x03C3, 0x03CC, 0x03CF, 0x03F0, 0x03F3, 0x03FC, 0x03FF, - 0x0C00, 0x0C03, 0x0C0C, 0x0C0F, 0x0C30, 0x0C33, 0x0C3C, 0x0C3F, - 0x0CC0, 0x0CC3, 0x0CCC, 0x0CCF, 0x0CF0, 0x0CF3, 0x0CFC, 0x0CFF, - 0x0F00, 0x0F03, 0x0F0C, 0x0F0F, 0x0F30, 0x0F33, 0x0F3C, 0x0F3F, - 0x0FC0, 0x0FC3, 0x0FCC, 0x0FCF, 0x0FF0, 0x0FF3, 0x0FFC, 0x0FFF, - 0x3000, 0x3003, 0x300C, 0x300F, 0x3030, 0x3033, 0x303C, 0x303F, - 0x30C0, 0x30C3, 0x30CC, 0x30CF, 
0x30F0, 0x30F3, 0x30FC, 0x30FF, - 0x3300, 0x3303, 0x330C, 0x330F, 0x3330, 0x3333, 0x333C, 0x333F, - 0x33C0, 0x33C3, 0x33CC, 0x33CF, 0x33F0, 0x33F3, 0x33FC, 0x33FF, - 0x3C00, 0x3C03, 0x3C0C, 0x3C0F, 0x3C30, 0x3C33, 0x3C3C, 0x3C3F, - 0x3CC0, 0x3CC3, 0x3CCC, 0x3CCF, 0x3CF0, 0x3CF3, 0x3CFC, 0x3CFF, - 0x3F00, 0x3F03, 0x3F0C, 0x3F0F, 0x3F30, 0x3F33, 0x3F3C, 0x3F3F, - 0x3FC0, 0x3FC3, 0x3FCC, 0x3FCF, 0x3FF0, 0x3FF3, 0x3FFC, 0x3FFF, - 0xC000, 0xC003, 0xC00C, 0xC00F, 0xC030, 0xC033, 0xC03C, 0xC03F, - 0xC0C0, 0xC0C3, 0xC0CC, 0xC0CF, 0xC0F0, 0xC0F3, 0xC0FC, 0xC0FF, - 0xC300, 0xC303, 0xC30C, 0xC30F, 0xC330, 0xC333, 0xC33C, 0xC33F, - 0xC3C0, 0xC3C3, 0xC3CC, 0xC3CF, 0xC3F0, 0xC3F3, 0xC3FC, 0xC3FF, - 0xCC00, 0xCC03, 0xCC0C, 0xCC0F, 0xCC30, 0xCC33, 0xCC3C, 0xCC3F, - 0xCCC0, 0xCCC3, 0xCCCC, 0xCCCF, 0xCCF0, 0xCCF3, 0xCCFC, 0xCCFF, - 0xCF00, 0xCF03, 0xCF0C, 0xCF0F, 0xCF30, 0xCF33, 0xCF3C, 0xCF3F, - 0xCFC0, 0xCFC3, 0xCFCC, 0xCFCF, 0xCFF0, 0xCFF3, 0xCFFC, 0xCFFF, - 0xF000, 0xF003, 0xF00C, 0xF00F, 0xF030, 0xF033, 0xF03C, 0xF03F, - 0xF0C0, 0xF0C3, 0xF0CC, 0xF0CF, 0xF0F0, 0xF0F3, 0xF0FC, 0xF0FF, - 0xF300, 0xF303, 0xF30C, 0xF30F, 0xF330, 0xF333, 0xF33C, 0xF33F, - 0xF3C0, 0xF3C3, 0xF3CC, 0xF3CF, 0xF3F0, 0xF3F3, 0xF3FC, 0xF3FF, - 0xFC00, 0xFC03, 0xFC0C, 0xFC0F, 0xFC30, 0xFC33, 0xFC3C, 0xFC3F, - 0xFCC0, 0xFCC3, 0xFCCC, 0xFCCF, 0xFCF0, 0xFCF3, 0xFCFC, 0xFCFF, - 0xFF00, 0xFF03, 0xFF0C, 0xFF0F, 0xFF30, 0xFF33, 0xFF3C, 0xFF3F, - 0xFFC0, 0xFFC3, 0xFFCC, 0xFFCF, 0xFFF0, 0xFFF3, 0xFFFC, 0xFFFF -}; - -const MT nonEmptyMask4Byte[16] = -{ - 0x0000, 0x000F, 0x00F0, 0x00FF, - 0x0F00, 0x0F0F, 0x0FF0, 0x0FFF, - 0xF000, 0xF00F, 0xF0F0, 0xF0FF, - 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF -}; - -const MT nonEmptyMask8Byte[4] = -{ - 0x0000, 0x00FF, 0xFF00, 0xFFFF -}; - -const MT nonEmptyMask16Byte[2] = -{ - 0x0000, 0xFFFF -}; - -inline MT getNonEmptyMask1Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMaskAux[iter]; -} - -inline MT getNonEmptyMask2Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return 
nonEmptyMask2Byte[(nonEmptyMaskAux[iter >> 1] >> ((iter & 0x0001) << 3)) & 0x00FF]; -} - -inline MT getNonEmptyMask4Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask4Byte[(nonEmptyMaskAux[iter >> 2] >> ((iter & 0x0003) << 2)) & 0x000F]; -} - -inline MT getNonEmptyMask8Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask8Byte[(nonEmptyMaskAux[iter >> 3] >> ((iter & 0x0007) << 1)) & 0x0003]; -} - -inline MT getNonEmptyMask16Byte(MT* nonEmptyMaskAux, uint16_t iter) -{ - return nonEmptyMask16Byte[(nonEmptyMaskAux[iter >> 4] >> (iter & 0x000F)) & 0x0001]; -} - -typedef MT (*getNonEmptyMaskPtrT)(MT*, uint16_t); - -template -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate() -{ - return nullptr; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<1>() -{ - return getNonEmptyMask1Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<2>() -{ - return getNonEmptyMask2Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<4>() -{ - return getNonEmptyMask4Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<8>() -{ - return getNonEmptyMask8Byte; -} - -template<> -constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtrTemplate<16>() -{ - return getNonEmptyMask16Byte; + VT proc; + if constexpr (sizeof(T) == sizeof(uint8_t)) + { + return nonEmptyMaskAux[iter]; + } + else if constexpr (sizeof(T) == sizeof(uint16_t)) + { + const char* ptr = reinterpret_cast((uint64_t*)nonEmptyMaskAux + iter); + return proc.maskCtor(ptr); + } + else if constexpr (sizeof(T) == sizeof(uint32_t)) + { + const char* ptr = reinterpret_cast((uint32_t*)nonEmptyMaskAux + iter); + return proc.maskCtor(ptr); + } + else if constexpr (sizeof(T) == sizeof(uint64_t)) + { + uint8_t* ptr = reinterpret_cast((uint16_t*)nonEmptyMaskAux + iter); + return typename VT::MaskType{ptr[0], ptr[1]}; + } + else if constexpr ((sizeof(T) == 16)) + { + const char* ptr = (const char*)nonEmptyMaskAux 
+ iter; + return (typename VT::MaskType)proc.loadFrom(ptr); + } } inline uint64_t order_swap(uint64_t x) @@ -183,52 +94,39 @@ inline uint64_t order_swap(uint64_t x) } // Dummy template -template= sizeof(uint128_t), T>::type* = nullptr> +template = sizeof(uint128_t), T>::type* = nullptr> inline T orderSwap(T x) { - return x; + return x; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 56) | - ((x << 40) & 0x00FF000000000000ULL) | - ((x << 24) & 0x0000FF0000000000ULL) | - ((x << 8) & 0x000000FF00000000ULL) | - ((x >> 8) & 0x00000000FF000000ULL) | - ((x >> 24) & 0x0000000000FF0000ULL) | - ((x >> 40) & 0x000000000000FF00ULL) | - (x << 56); - return ret; + T ret = (x >> 56) | ((x << 40) & 0x00FF000000000000ULL) | ((x << 24) & 0x0000FF0000000000ULL) | + ((x << 8) & 0x000000FF00000000ULL) | ((x >> 8) & 0x00000000FF000000ULL) | + ((x >> 24) & 0x0000000000FF0000ULL) | ((x >> 40) & 0x000000000000FF00ULL) | (x << 56); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 24) | - ((x << 8) & 0x00FF0000U) | - ((x >> 8) & 0x0000FF00U) | - (x << 24); - return ret; + T ret = (x >> 24) | ((x << 8) & 0x00FF0000U) | ((x >> 8) & 0x0000FF00U) | (x << 24); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - T ret = (x >> 8) | (x <<8); - return ret; + T ret = (x >> 8) | (x << 8); + return ret; } -template::type* = nullptr> +template ::type* = nullptr> inline T orderSwap(T x) { - return x; + return x; } template @@ -272,15 +170,14 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP) case COMPARE_GE: return val1 >= val2; + case COMPARE_NULLEQ: return val1 == val2; + default: logIt(34, COP, "colCompare_"); return false; // throw an exception here? 
} } -inline bool colCompareStr(const ColRequestHeaderDataType &type, - uint8_t COP, - const utils::ConstString &val1, - const utils::ConstString &val2, - const bool printOut = false) +inline bool colCompareStr(const ColRequestHeaderDataType& type, uint8_t COP, const utils::ConstString& val1, + const utils::ConstString& val2, const bool printOut = false) { int error = 0; bool rc = primitives::StringComparator(type).op(&error, COP, val1, val2); @@ -311,6 +208,8 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP, uint8_t rf) case COMPARE_GT: return val1 > val2 || (val1 == val2 && (rf & 0x80)); + case COMPARE_NULLEQ: return val1 == val2 && rf == 0; + default: logIt(34, COP, "colCompare_"); return false; // throw an exception here? } } @@ -334,6 +233,8 @@ inline bool colStrCompare_(uint64_t val1, uint64_t val2, uint8_t COP, uint8_t rf case COMPARE_GT: return val1 > val2; + case COMPARE_NULLEQ: return val1 == val2 && rf == 0; + case COMPARE_LIKE: case COMPARE_NLIKE: default: logIt(34, COP, "colStrCompare_"); return false; // throw an exception here? 
@@ -607,103 +508,6 @@ T getEmptyValue(uint8_t type) } } -// Bit pattern representing NULL value for given column type/width -// TBD Use TypeHandler -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - return datatypes::Decimal128Null; -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::DOUBLE: - case CalpontSystemCatalog::UDOUBLE: return joblist::DOUBLENULL; - - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: - case CalpontSystemCatalog::VARBINARY: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: return joblist::CHAR8NULL; - - case CalpontSystemCatalog::UBIGINT: return joblist::UBIGINTNULL; - - default: return joblist::BIGINTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::FLOAT: - case CalpontSystemCatalog::UFLOAT: return joblist::FLOATNULL; - - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: return joblist::CHAR4NULL; - - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::DATENULL; - - case CalpontSystemCatalog::UINT: - case CalpontSystemCatalog::UMEDINT: return joblist::UINTNULL; - - default: return joblist::INTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::CHAR2NULL; - - 
case CalpontSystemCatalog::USMALLINT: return joblist::USMALLINTNULL; - - default: return joblist::SMALLINTNULL; - } -} - -template ::type* = nullptr> -T getNullValue(uint8_t type) -{ - switch (type) - { - case CalpontSystemCatalog::CHAR: - case CalpontSystemCatalog::VARCHAR: - case CalpontSystemCatalog::BLOB: - case CalpontSystemCatalog::TEXT: - case CalpontSystemCatalog::DATE: - case CalpontSystemCatalog::DATETIME: - case CalpontSystemCatalog::TIMESTAMP: - case CalpontSystemCatalog::TIME: return joblist::CHAR1NULL; - - case CalpontSystemCatalog::UTINYINT: return joblist::UTINYINTNULL; - - default: return joblist::TINYINTNULL; - } -} - // Check whether val is NULL (or alternative NULL bit pattern for 64-bit string types) template inline bool isNullValue(const T val, const T NULL_VALUE) @@ -951,15 +755,15 @@ template ::type* = nullptr> inline void vectUpdateMinMax(const bool validMinMax, const bool isNonNullOrEmpty, T& Min, T& Max, T curValue, @@ -1123,61 +926,51 @@ template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, 
// SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; - using SimdType = typename VT::SimdType; - SimdType tmpStorageVector; - T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); - // Saving values based on writeMask into tmp vec. - // Min/Max processing. - // The mask is 16 bit long and it describes N elements. - // N = sizeof(vector type) / WIDTH. + T* tmpDstVecTPtr = reinterpret_cast(dstArray); uint32_t j = 0; + const int8_t* ptrW = reinterpret_cast(&writeMask); for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - MT bitMapPosition = 1 << it; - if (writeMask & bitMapPosition) + if (ptrW[it]) { *tmpDstVecTPtr = dataVecTPtr[j]; ++tmpDstVecTPtr; } } - // Store the whole vector however one level up the stack - // vectorizedFiltering() increases the dstArray by a number of - // actual values written that is the result of this function. 
- simdProcessor.store(dstArray, tmpStorageVector); - return tmpDstVecTPtr - reinterpret_cast(&tmpStorageVector); + return tmpDstVecTPtr - reinterpret_cast(dstArray); } // RIDs no values template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return 0; } @@ -1186,23 +979,22 @@ inline uint16_t vectWriteColValues( template ::type* = nullptr> inline uint16_t vectWriteColValues( - VT& 
simdProcessor, // SIMD processor - const MT writeMask, // SIMD intrinsics bitmask for values to write - const MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - char* dstArray, // the actual char dst array ptr to start writing values - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& simdProcessor, // SIMD processor + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + char* dstArray, // the actual char dst array ptr to start writing values + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; - using SimdType = typename VT::SimdType; - SimdType tmpStorageVector; - T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); + T* tmpDstVecTPtr = reinterpret_cast(dstArray); + const int8_t* ptrW = reinterpret_cast(&writeMask); // Saving values based on writeMask into tmp vec. // Min/Max processing. 
// The mask is 16 bit long and it describes N elements. @@ -1210,8 +1002,7 @@ inline uint16_t vectWriteColValues( uint32_t j = 0; for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - MT bitMapPosition = 1 << it; - if (writeMask & bitMapPosition) + if (ptrW[it]) { *tmpDstVecTPtr = dataVecTPtr[j]; ++tmpDstVecTPtr; @@ -1219,12 +1010,8 @@ inline uint16_t vectWriteColValues( ++ridDstArray; } } - // Store the whole vector however one level up the stack - // vectorizedFiltering() increases the dstArray by a number of - // actual values written that is the result of this function. - simdProcessor.store(dstArray, tmpStorageVector); - return tmpDstVecTPtr - reinterpret_cast(&tmpStorageVector); + return tmpDstVecTPtr - reinterpret_cast(dstArray); } // RIDs no values @@ -1232,18 +1019,18 @@ template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + 
primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; primitives::RIDType* origRIDDstArray = ridDstArray; @@ -1251,9 +1038,10 @@ inline uint16_t vectWriteRIDValues( // Min/Max processing. // The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH. uint16_t j = 0; + const int8_t* ptrW = reinterpret_cast(&writeMask); for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { - if (writeMask & (1 << it)) + if (ptrW[it]) { vectWriteColValuesLoopRIDAsignment(ridDstArray, out, ridOffset + j, ridSrcArray, j); ++ridDstArray; @@ -1267,18 +1055,18 @@ inline uint16_t vectWriteRIDValues( template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD 
processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // Typed SIMD vector from the input block + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return valuesWritten; } @@ -1288,22 +1076,21 @@ template ::type* = nullptr> inline uint16_t vectWriteRIDValues( - VT& processor, // SIMD processor - const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases - const bool validMinMax, // The flag to update Min/Max for a block or not - const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr - T* dataVecTPtr, // Typed SIMD vector from the input block - primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs - MT writeMask, // SIMD intrinsics bitmask for values to write - T& Min, T& Max, // Min/Max of the extent - NewColRequestHeader* in, // Proto message - ColResultHeader* out, // Proto message - MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values - primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs + VT& processor, // SIMD processor + const uint16_t valuesWritten, // The number of values written to in certain SFINAE cases + const bool validMinMax, // The flag to update Min/Max for a block or not + const primitives::RIDType ridOffset, // The first RID value of the dataVecTPtr + T* dataVecTPtr, // 
Typed SIMD vector from the input block + primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs + const typename VT::MaskType writeMask, // SIMD intrinsics bitmask for values to write + T& Min, T& Max, // Min/Max of the extent + NewColRequestHeader* in, // Proto message + ColResultHeader* out, // Proto message + const typename VT::MaskType nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values + primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { return valuesWritten; } -#endif /***************************************************************************** *** RUN DATA THROUGH A COLUMN FILTER **************************************** @@ -1329,8 +1116,7 @@ void scalarFiltering_( const bool validMinMax, // The flag to store min/max T emptyValue, // Deduced empty value magic T nullValue, // Deduced null value magic - T Min, T Max, const bool isNullValueMatches, - const uint8_t* blockAux) + T Min, T Max, const bool isNullValueMatches, const uint8_t* blockAux) { constexpr int WIDTH = sizeof(T); // Loop-local variables @@ -1343,16 +1129,16 @@ void scalarFiltering_( { if constexpr (IS_AUX_COLUMN) { - if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, srcSize, - ridArray, ridSize, outputType, emptyValue, - blockAux))) + if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, + srcSize, ridArray, ridSize, outputType, + emptyValue, blockAux))) break; } else { - if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, srcSize, - ridArray, ridSize, outputType, emptyValue, - blockAux))) + if (!(nextColValue(curValue, isEmpty, i, rid, srcArray, + srcSize, ridArray, ridSize, + outputType, emptyValue, blockAux))) break; } @@ -1407,32 +1193,28 @@ void scalarFiltering( const bool validMinMax, // The flag to store min/max T emptyValue, // Deduced empty value magic T nullValue, // Deduced null value magic - T Min, T Max, const bool isNullValueMatches, - const uint8_t* blockAux) + T Min, T Max, const bool 
isNullValueMatches, const uint8_t* blockAux) { if (in->hasAuxCol) { - scalarFiltering_(in, out, columnFilterMode, - filterSet, filterCount, filterCOPs, filterValues, filterRFs, - typeHolder, srcArray, srcSize, ridArray, ridSize, initialRID, - outputType, validMinMax, emptyValue, nullValue, Min, Max, - isNullValueMatches, blockAux); + scalarFiltering_(in, out, columnFilterMode, filterSet, filterCount, filterCOPs, + filterValues, filterRFs, typeHolder, srcArray, srcSize, ridArray, + ridSize, initialRID, outputType, validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, blockAux); } else { - scalarFiltering_(in, out, columnFilterMode, - filterSet, filterCount, filterCOPs, filterValues, filterRFs, - typeHolder, srcArray, srcSize, ridArray, ridSize, initialRID, - outputType, validMinMax, emptyValue, nullValue, Min, Max, - isNullValueMatches, blockAux); + scalarFiltering_(in, out, columnFilterMode, filterSet, filterCount, filterCOPs, + filterValues, filterRFs, typeHolder, srcArray, srcSize, ridArray, + ridSize, initialRID, outputType, validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, blockAux); } } -#if defined(__x86_64__)|| defined(__aarch64__) template ::type* = nullptr> inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, - const primitives::RIDType* ridArray, const uint16_t iter) + const primitives::RIDType* ridArray, const uint16_t iter) { return {processor.loadFrom(reinterpret_cast(srcArray))}; } @@ -1442,7 +1224,7 @@ inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* template ::type* = nullptr> inline SIMD_WRAPPER_TYPE simdDataLoad(VT& processor, const T* srcArray, const T* origSrcArray, - const primitives::RIDType* ridArray, const uint16_t iter) + const primitives::RIDType* ridArray, const uint16_t iter) { constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; @@ -1457,56 +1239,59 @@ inline SIMD_WRAPPER_TYPE 
simdDataLoad(VT& processor, const T* srcArray, const T* return {result}; } -template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, VT& processor, typename VT::SimdType& dataVector) +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType& type, VT& processor, + typename VT::SimdType& dataVector) { - return {dataVector}; + return {dataVector}; } -template ::type* = nullptr> -inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType &type, - VT& processor, typename VT::SimdType& dataVector) +inline SIMD_WRAPPER_TYPE simdSwapedOrderDataLoad(const ColRequestHeaderDataType& type, VT& processor, + typename VT::SimdType& dataVector) { - constexpr const uint16_t WIDTH = sizeof(T); - constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; - using SimdType = typename VT::SimdType; - SimdType result; - T* resultTypedPtr = reinterpret_cast(&result); - T* srcTypedPtr = reinterpret_cast(&dataVector); - for (uint32_t i = 0; i < VECTOR_SIZE; ++i) - { - utils::ConstString s{reinterpret_cast(&srcTypedPtr[i]), WIDTH}; - resultTypedPtr[i] = orderSwap(type.strnxfrm(s.rtrimZero())); - } - return {result}; + constexpr const uint16_t WIDTH = sizeof(T); + constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; + using SimdType = typename VT::SimdType; + SimdType result; + T* resultTypedPtr = reinterpret_cast(&result); + T* srcTypedPtr = reinterpret_cast(&dataVector); + for (uint32_t i = 0; i < VECTOR_SIZE; ++i) + { + utils::ConstString s{reinterpret_cast(&srcTypedPtr[i]), WIDTH}; + resultTypedPtr[i] = orderSwap(type.strnxfrm(s.rtrimZero())); + } + return {result}; } template -void vectorizedUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, - SimdType& dataVec, SimdType& simdMin, SimdType& simdMax) +void vectorizedUpdateMinMax(const bool validMinMax, const typename VT::MaskType nonNullOrEmptyMask, + VT simdProcessor, SimdType& dataVec, 
SimdType& simdMin, SimdType& simdMax) { if (validMinMax) { - auto byteMask = utils::bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)); - simdMin = simdProcessor.blend( - simdMin, dataVec, simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(simdMin, dataVec), byteMask)); - simdMax = simdProcessor.blend( - simdMax, dataVec, simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(dataVec, simdMax), byteMask)); + { + simdMin = + simdProcessor.blend(simdMin, dataVec, simdProcessor.cmpGt(simdMin, dataVec) & nonNullOrEmptyMask); + simdMax = + simdProcessor.blend(simdMax, dataVec, simdProcessor.cmpGt(dataVec, simdMax) & nonNullOrEmptyMask); + } } } template -void vectorizedTextUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyMask, VT simdProcessor, - SimdType& dataVec, SimdType& simdMin, SimdType& simdMax, +void vectorizedTextUpdateMinMax(const bool validMinMax, const typename VT::MaskType nonNullOrEmptyMask, + VT simdProcessor, SimdType& dataVec, SimdType& simdMin, SimdType& simdMax, SimdType& swapedOrderDataVec, SimdType& weightsMin, SimdType& weightsMax) { + using MT = typename VT::MaskType; if (validMinMax) { - auto byteMask = utils::bitCast(simd::bitMaskToByteMask16(nonNullOrEmptyMask)); - auto minComp = simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(weightsMin, swapedOrderDataVec), byteMask); - auto maxComp = simdProcessor.bwAnd(simdProcessor.cmpGtSimdType(swapedOrderDataVec, weightsMax), byteMask); + MT minComp = simdProcessor.cmpGt(weightsMin, swapedOrderDataVec) & nonNullOrEmptyMask; + MT maxComp = simdProcessor.cmpGt(swapedOrderDataVec, weightsMax) & nonNullOrEmptyMask; + simdMin = simdProcessor.blend(simdMin, dataVec, minComp); weightsMin = simdProcessor.blend(weightsMin, swapedOrderDataVec, minComp); simdMax = simdProcessor.blend(simdMax, dataVec, maxComp); @@ -1514,7 +1299,7 @@ void vectorizedTextUpdateMinMax(const bool validMinMax, const MT nonNullOrEmptyM } } -template +template void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, 
T& min, T& max) { constexpr const uint16_t size = VT::vecByteSize / sizeof(T); @@ -1524,7 +1309,7 @@ void extractMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, T& min min = *std::min_element(simdMinVec, simdMinVec + size); } -template +template void extractTextMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, SimdType weightsMin, SimdType weightsMax, T& min, T& max) { @@ -1539,10 +1324,9 @@ void extractTextMinMax(VT& simdProcessor, SimdType simdMin, SimdType simdMax, Si max = simdMaxVec[indMax - weightsMaxVec]; } -template -void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSizeAux, - const uint8_t** blockAux, MT** nonEmptyMaskAux, - primitives::RIDType** ridArray) +template +void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSizeAux, const uint8_t** blockAux, + MT** nonEmptyMaskAux, primitives::RIDType** ridArray) { using SimdTypeTemp = typename simd::IntegralToSIMD::type; using FilterTypeTemp = typename simd::StorageToFiltering::type; @@ -1551,15 +1335,16 @@ void buildAuxColEmptyVal(const uint16_t iterNumberAux, const uint16_t vectorSize using SimdWrapperTypeAux = typename VTAux::SimdWrapperType; VTAux simdProcessorAux; SimdTypeAux dataVecAux; - SimdTypeAux emptyFilterArgVecAux = simdProcessorAux.emptyNullLoadValue(EMPTY_VALUE_AUX); + SimdTypeAux emptyFilterArgVecAux = simdProcessorAux.loadValue(EMPTY_VALUE_AUX); const uint8_t* origBlockAux = *blockAux; primitives::RIDType* origRidArray = *ridArray; for (uint16_t i = 0; i < iterNumberAux; ++i) { dataVecAux = simdDataLoad(simdProcessorAux, *blockAux, - origBlockAux, *ridArray, i).v; - (*nonEmptyMaskAux)[i] = simdProcessorAux.nullEmptyCmpNe(dataVecAux, emptyFilterArgVecAux); + origBlockAux, *ridArray, i) + .v; + (*nonEmptyMaskAux)[i] = (MT)simdProcessorAux.nullEmptyCmpNe(dataVecAux, emptyFilterArgVecAux); *blockAux += vectorSizeAux; *ridArray += vectorSizeAux; } @@ -1576,9 +1361,8 @@ void buildAuxColEmptyVal(const uint16_t iterNumberAux, 
const uint16_t vectorSize // to glue the masks produced by actual filters. // Then it takes a vector of data, run filters and logical function using pointers. // See the corresponding dispatcher to get more details on vector processing class. -template +template void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const T emptyValue, @@ -1589,16 +1373,22 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T using SimdType = typename VT::SimdType; using SimdWrapperType = typename VT::SimdWrapperType; using FilterType = typename VT::FilterType; - using UT = typename std::conditional::value || datatypes::is_uint128_t::value || std::is_same::value, - FilterType, typename datatypes::make_unsigned::type>::type; + using UT = typename std::conditional::value || + datatypes::is_uint128_t::value || + std::is_same::value, + FilterType, typename datatypes::make_unsigned::type>::type; VT simdProcessor; + using MT = typename VT::MaskType; SimdType dataVec; [[maybe_unused]] SimdType swapedOrderDataVec; [[maybe_unused]] auto typeHolder = in->colType; [[maybe_unused]] SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue); SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue); - MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask; - MT initFilterMask = 0xFFFF; + MT writeMask, nonNullMask, nonNullOrEmptyMask; + MT trueMask = simdProcessor.trueMask(); + MT falseMask = simdProcessor.falseMask(); + MT nonEmptyMask = trueMask; + MT initFilterMask = trueMask; primitives::RIDType rid = 0; primitives::RIDType* origRidArray = ridArray; uint16_t totalValuesWritten = 0; @@ -1620,12 +1410,8 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T #pragma GCC diagnostic push #pragma GCC diagnostic ignored 
"-Wignored-attributes" std::vector filterArgsVectors; - auto ptrA = std::mem_fn(&VT::cmpEq); - using COPType = decltype(ptrA); - std::vector copFunctorVec; + bool isOr = false; #pragma GCC diagnostic pop - using BOPType = std::function; - BOPType bopFunctor; // filter comparators and logical function compilation. if (parsedColumnFilter != nullptr) { @@ -1637,23 +1423,15 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T filterCount = parsedColumnFilter->getFilterCount(); if (iterNumber > 0) { - copFunctorVec.reserve(filterCount); switch (parsedColumnFilter->getBOP()) { case BOP_OR: - bopFunctor = std::bit_or(); - initFilterMask = 0; - break; - case BOP_AND: bopFunctor = std::bit_and(); break; case BOP_XOR: - bopFunctor = std::bit_or(); - initFilterMask = 0; - break; - case BOP_NONE: - // According with the comments in linux-port/primitiveprocessor.h - // there can't be BOP_NONE with filterCount > 0 - bopFunctor = std::bit_and(); + isOr = true; + initFilterMask = falseMask; break; + case BOP_AND: break; + case BOP_NONE: break; default: idbassert(false); } filterArgsVectors.reserve(filterCount); @@ -1677,30 +1455,6 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T FilterType filterValue = *((FilterType*)&filterValues[j]); filterArgsVectors.push_back(simdProcessor.loadValue(filterValue)); } - switch (filterCOPs[j]) - { - case (COMPARE_EQ): - // Filter against NULL value - if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0) - copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq)); - else - copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); - break; - case (COMPARE_GE): copFunctorVec.push_back(std::mem_fn(&VT::cmpGe)); break; - - case (COMPARE_GT): copFunctorVec.push_back(std::mem_fn(&VT::cmpGt)); break; - case (COMPARE_LE): copFunctorVec.push_back(std::mem_fn(&VT::cmpLe)); break; - case (COMPARE_LT): copFunctorVec.push_back(std::mem_fn(&VT::cmpLt)); break; - case (COMPARE_NE): 
copFunctorVec.push_back(std::mem_fn(&VT::cmpNe)); break; - case (COMPARE_NIL): - copFunctorVec.push_back(std::mem_fn(&VT::cmpAlwaysFalse)); - break; - // There are couple other COP, e.g. COMPARE_NOT however they can't be met here - // b/c MCS 6.x uses COMPARE_NOT for strings with OP_LIKE only. See op2num() for - // details. - - default: idbassert(false); - } } } } @@ -1715,20 +1469,17 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T weightsMin = simdSwapedOrderDataLoad(typeHolder, simdProcessor, simdMin).v; weightsMax = simdSwapedOrderDataLoad(typeHolder, simdProcessor, simdMax).v; } - MT* nonEmptyMaskAux; if constexpr (IS_AUX_COLUMN) { constexpr uint16_t vectorSizeAux = VT::vecByteSize; uint16_t iterNumberAux = HAS_INPUT_RIDS ? ridSize / vectorSizeAux : srcSize / vectorSizeAux; - nonEmptyMaskAux = (MT*) alloca(sizeof(MT) * iterNumberAux); - buildAuxColEmptyVal(iterNumberAux, vectorSizeAux, &blockAux, - &nonEmptyMaskAux, &ridArray); + nonEmptyMaskAux = (MT*)alloca(sizeof(MT) * iterNumberAux); + buildAuxColEmptyVal(iterNumberAux, vectorSizeAux, &blockAux, + &nonEmptyMaskAux, &ridArray); } - constexpr getNonEmptyMaskPtrT getNonEmptyMaskPtr = getNonEmptyMaskPtrTemplate(); - // main loop // writeMask tells which values must get into the result. Includes values that matches filters. Can have // NULLs. nonEmptyMask tells which vector coords are not EMPTY magics. 
nonNullMask tells which vector coords @@ -1737,40 +1488,74 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T { primitives::RIDType ridOffset = i * VECTOR_SIZE; assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset)); - dataVec = simdDataLoad(simdProcessor, srcArray, - origSrcArray, ridArray, i).v; + dataVec = simdDataLoad(simdProcessor, srcArray, origSrcArray, + ridArray, i) + .v; - if constexpr(KIND==KIND_TEXT) - swapedOrderDataVec = simdSwapedOrderDataLoad(typeHolder, simdProcessor, dataVec).v; + if constexpr (KIND == KIND_TEXT) + { + swapedOrderDataVec = + simdSwapedOrderDataLoad(typeHolder, simdProcessor, dataVec).v; + } if constexpr (IS_AUX_COLUMN) - nonEmptyMask = (*getNonEmptyMaskPtr)(nonEmptyMaskAux, i); + { + //'Ne' translates AUX vectors of "0xFF" values into the vectors of the corresponding + // width "0xFF...FF" for u16/32/64bits. + nonEmptyMask = simdProcessor.nullEmptyCmpNe( + (SimdType)getNonEmptyMaskAux(nonEmptyMaskAux, i), (SimdType)falseMask); + } else - nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec); + { + nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec); + } writeMask = nonEmptyMask; // NULL check nonNullMask = simdProcessor.nullEmptyCmpNe(dataVec, nullFilterArgVec); // Exclude NULLs from the resulting set if NULL doesn't match the filters. writeMask = isNullValueMatches ? 
writeMask : writeMask & nonNullMask; + nonNullOrEmptyMask = nonNullMask & nonEmptyMask; // filters MT prevFilterMask = initFilterMask; - // TODO name this mask literal - MT filterMask = 0xFFFF; + MT filterMask = trueMask; + for (uint32_t j = 0; j < filterCount; ++j) { - // filter using compiled filter and preloaded filter argument - if constexpr(KIND==KIND_TEXT) - filterMask = copFunctorVec[j](simdProcessor, swapedOrderDataVec, filterArgsVectors[j]); + SimdType l; + if constexpr (KIND == KIND_TEXT) + { + l = swapedOrderDataVec; + } else - filterMask = copFunctorVec[j](simdProcessor, dataVec, filterArgsVectors[j]); + { + l = dataVec; + } - filterMask = bopFunctor(prevFilterMask, filterMask); + // The operator form doesn't work for x86. We need explicit functions here. + switch (filterCOPs[j]) + { + case (COMPARE_NULLEQ): filterMask = simdProcessor.nullEmptyCmpEq(l, filterArgsVectors[j]); break; + case (COMPARE_EQ): filterMask = simdProcessor.cmpEq(l, filterArgsVectors[j]); break; + case (COMPARE_GE): filterMask = simdProcessor.cmpGe(l, filterArgsVectors[j]); break; + case (COMPARE_GT): filterMask = simdProcessor.cmpGt(l, filterArgsVectors[j]); break; + case (COMPARE_LE): filterMask = simdProcessor.cmpLe(l, filterArgsVectors[j]); break; + case (COMPARE_LT): filterMask = simdProcessor.cmpLt(l, filterArgsVectors[j]); break; + case (COMPARE_NE): filterMask = simdProcessor.cmpNe(l, filterArgsVectors[j]); break; + case (COMPARE_NIL): filterMask = falseMask; break; + + default: + idbassert(false); + // There are couple other COP, e.g. COMPARE_NOT however they can't be met here + // b/c MCS 6.x uses COMPARE_NOT for strings with OP_LIKE only. See op2num() for + // details. + } + + filterMask = isOr ? 
prevFilterMask | filterMask : prevFilterMask & filterMask; prevFilterMask = filterMask; } writeMask = writeMask & filterMask; - T* dataVecTPtr = reinterpret_cast(&dataVec); // vectWriteColValues iterates over the values in the source vec @@ -1788,11 +1573,19 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T simdProcessor, valuesWritten, validMinMax, ridOffset, dataVecTPtr, ridDstArray, writeMask, min, max, in, out, nonNullOrEmptyMask, ridArray); - if constexpr (KIND != KIND_TEXT) - vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); - else - vectorizedTextUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax, - swapedOrderDataVec, weightsMin, weightsMax); + if constexpr (KIND == KIND_TEXT) + { + vectorizedTextUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax, + swapedOrderDataVec, weightsMin, weightsMax); + } + else if constexpr (KIND == KIND_FLOAT) + { + // noop for future development + } + else + { + vectorizedUpdateMinMax(validMinMax, nonNullOrEmptyMask, simdProcessor, dataVec, simdMin, simdMax); + } // Calculate bytes written uint16_t bytesWritten = valuesWritten * WIDTH; @@ -1828,8 +1621,9 @@ void vectorizedFiltering_(NewColRequestHeader* in, ColResultHeader* out, const T min, max, isNullValueMatches, blockAux); } -template +#if defined(__x86_64__) || (__aarch64__) +template void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const T emptyValue, @@ -1838,21 +1632,20 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, const T* { if (in->hasAuxCol) { - vectorizedFiltering_( - in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter, validMinMax, emptyValue, - nullValue, min, max, isNullValueMatches, - 
blockAux); + vectorizedFiltering_(in, out, srcArray, srcSize, ridArray, ridSize, + parsedColumnFilter, validMinMax, emptyValue, nullValue, + min, max, isNullValueMatches, blockAux); } else { - vectorizedFiltering_( - in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter, validMinMax, emptyValue, - nullValue, min, max, isNullValueMatches, - blockAux); + vectorizedFiltering_(in, out, srcArray, srcSize, ridArray, ridSize, + parsedColumnFilter, validMinMax, emptyValue, nullValue, + min, max, isNullValueMatches, blockAux); } } +#endif // This routine dispatches template function calls to reduce branching. template @@ -1861,8 +1654,7 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const bool validMinMax, const STORAGE_TYPE emptyValue, const STORAGE_TYPE nullValue, STORAGE_TYPE Min, STORAGE_TYPE Max, - const bool isNullValueMatches, - const uint8_t* blockAux) + const bool isNullValueMatches, const uint8_t* blockAux) { // Using struct to dispatch SIMD type based on integral type T. using SimdType = typename simd::IntegralToSIMD::type; @@ -1924,7 +1716,6 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out } } } -#endif // TBD Make changes in Command class ancestors to threat BPP::values as buffer. // TBD this will allow to copy values only once from BPP::blockData to the destination. 
@@ -1936,8 +1727,7 @@ template void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* ridArray, const uint16_t ridSize, // Number of values in ridArray int* srcArray16, const uint32_t srcSize, - boost::shared_ptr parsedColumnFilter, - int* blockAux) + boost::shared_ptr parsedColumnFilter, int* blockAux) { using FT = typename IntegralTypeToFilterType::type; using ST = typename IntegralTypeToFilterSetType::type; @@ -1981,10 +1771,10 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r // Syscat queries mustn't follow vectorized processing path b/c PP must return // all values w/o any filter(even empty values filter) applied. -#if defined(__x86_64__)|| defined(__aarch64__) - // Don't use vectorized filtering for text based data types. - if (WIDTH < 16 && - (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid()) )) +#if defined(__x86_64__) || defined(__aarch64__) + // Don't use vectorized filtering for text based data types whose collation translation + // can deliver more than 1 byte for a single input byte of an encoded string.
+ if (WIDTH < 16 && (KIND != KIND_TEXT || (KIND == KIND_TEXT && in->colType.strnxfrmIsValid()))) { bool canUseFastFiltering = true; for (uint32_t i = 0; i < filterCount; ++i) @@ -1996,10 +1786,9 @@ void filterColumnData(NewColRequestHeader* in, ColResultHeader* out, uint16_t* r if (canUseFastFiltering) { - vectorizedFilteringDispatcher(in, out, srcArray, srcSize, ridArray, ridSize, - parsedColumnFilter.get(), validMinMax, emptyValue, - nullValue, Min, Max, isNullValueMatches, - reinterpret_cast(blockAux)); + vectorizedFilteringDispatcher( + in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter.get(), validMinMax, emptyValue, + nullValue, Min, Max, isNullValueMatches, reinterpret_cast(blockAux)); return; } } @@ -2043,8 +1832,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); auto dataType = (execplan::CalpontSystemCatalog::ColDataType)in->colType.DataType; @@ -2053,7 +1841,8 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? 
BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } _scanAndFilterTypeDispatcher(in, out); @@ -2069,8 +1858,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); auto dataType = (execplan::CalpontSystemCatalog::ColDataType)in->colType.DataType; @@ -2079,7 +1867,8 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } _scanAndFilterTypeDispatcher(in, out); @@ -2098,8 +1887,7 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { _scanAndFilterTypeDispatcher(in, out); } @@ -2114,15 +1902,15 @@ template ::type* = nullptr> #endif -void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? 
BLOCK_SIZE : BLOCK_SIZE / W; - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); } template ::type* = nullptr> #endif -void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, - ColResultHeader* out) +void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out) { constexpr int W = sizeof(T); using UT = typename std::conditional::value || datatypes::is_uint128_t::value, T, - typename datatypes::make_unsigned::type>::type; + typename datatypes::make_unsigned::type>::type; const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE : BLOCK_SIZE / W; @@ -2151,16 +1938,19 @@ void PrimitiveProcessor::_scanAndFilterTypeDispatcher(NewColRequestHeader* in, dataType == execplan::CalpontSystemCatalog::TEXT) && !isDictTokenScan(in)) { - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } if (datatypes::isUnsigned(dataType)) { - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); return; } - filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, blockAux); + filterColumnData(in, out, ridArray, ridSize, block, itemsPerBlock, parsedColumnFilter, + blockAux); } // The entrypoint for block scanning and filtering. 
diff --git a/primitives/linux-port/primitiveprocessor.h b/primitives/linux-port/primitiveprocessor.h index 23340f5e3..22cb8a155 100644 --- a/primitives/linux-port/primitiveprocessor.h +++ b/primitives/linux-port/primitiveprocessor.h @@ -55,7 +55,7 @@ class PrimTest; // XXX: turn off dictionary range setting during scan. -#define XXX_PRIMITIVES_TOKEN_RANGES_XXX +#define XXX_PRIMITIVES_TOKEN_RANGES_XXX namespace primitives { @@ -472,6 +472,103 @@ class PrimitiveProcessor friend class ::PrimTest; }; +// Bit pattern representing NULL value for given column type/width +// TBD Use TypeHandler +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + return datatypes::Decimal128Null; +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::DOUBLE: + case execplan::CalpontSystemCatalog::UDOUBLE: return joblist::DOUBLENULL; + + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: + case execplan::CalpontSystemCatalog::VARBINARY: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR8NULL; + + case execplan::CalpontSystemCatalog::UBIGINT: return joblist::UBIGINTNULL; + + default: return joblist::BIGINTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::FLOAT: + case execplan::CalpontSystemCatalog::UFLOAT: return joblist::FLOATNULL; + + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: return joblist::CHAR4NULL; + + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case 
execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::DATENULL; + + case execplan::CalpontSystemCatalog::UINT: + case execplan::CalpontSystemCatalog::UMEDINT: return joblist::UINTNULL; + + default: return joblist::INTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR2NULL; + + case execplan::CalpontSystemCatalog::USMALLINT: return joblist::USMALLINTNULL; + + default: return joblist::SMALLINTNULL; + } +} + +template ::type* = nullptr> +T getNullValue(uint8_t type) +{ + switch (type) + { + case execplan::CalpontSystemCatalog::CHAR: + case execplan::CalpontSystemCatalog::VARCHAR: + case execplan::CalpontSystemCatalog::BLOB: + case execplan::CalpontSystemCatalog::TEXT: + case execplan::CalpontSystemCatalog::DATE: + case execplan::CalpontSystemCatalog::DATETIME: + case execplan::CalpontSystemCatalog::TIMESTAMP: + case execplan::CalpontSystemCatalog::TIME: return joblist::CHAR1NULL; + + case execplan::CalpontSystemCatalog::UTINYINT: return joblist::UTINYINTNULL; + + default: return joblist::TINYINTNULL; + } +} + // // COMPILE A COLUMN FILTER // @@ -518,13 +615,32 @@ boost::shared_ptr _parseColumnFilter( // Pointer to ColArgs structure representing argIndex'th element in the BLOB auto args = reinterpret_cast(filterString + (argIndex * filterSize)); - ret->prestored_cops[argIndex] = args->COP; ret->prestored_rfs[argIndex] = args->rf; - if (datatypes::isUnsigned((execplan::CalpontSystemCatalog::ColDataType)colType)) - ret->storeFilterArg(argIndex, reinterpret_cast(args->val)); + auto 
colDataType = (execplan::CalpontSystemCatalog::ColDataType)colType; + bool isNullEqCmp = false; + if (datatypes::isUnsigned(colDataType)) + { + const auto nullValue = getNullValue(colDataType); + const UT* filterValue = reinterpret_cast(args->val); + isNullEqCmp = + (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false; + ret->storeFilterArg(argIndex, filterValue); + } else - ret->storeFilterArg(argIndex, reinterpret_cast(args->val)); + { + const auto nullValue = getNullValue(colDataType); + const T* filterValue = reinterpret_cast(args->val); + isNullEqCmp = + (args->COP == COMPARE_EQ && memcmp(filterValue, &nullValue, sizeof(nullValue)) == 0) ? true : false; + ret->storeFilterArg(argIndex, filterValue); + } + + // IS NULL filtering expression is translated into COMPARE_EQ + NULL magic in the filter. + // This if replaces an operation id once to avoid additional branching in the main loop + // of vectorizedFiltering_ in column.cpp. + // It would be cleaner to place it into EM though. + ret->prestored_cops[argIndex] = (isNullEqCmp) ? COMPARE_NULLEQ : args->COP; }
I think the only cases where we can use the set @@ -575,4 +691,3 @@ boost::shared_ptr _parseColumnFilter( } } // namespace primitives - diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4f760200f..1b2ba7869 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -55,6 +55,7 @@ if (WITH_UNITTESTS) gtest_add_tests(TARGET column_scan_filter_tests TEST_PREFIX columnstore:) add_executable(simd_processors simd_processors.cpp) + target_compile_options(simd_processors PRIVATE -Wno-error) add_dependencies(simd_processors googletest) target_link_libraries(simd_processors ${ENGINE_LDFLAGS} ${MARIADB_CLIENT_LIBS} ${ENGINE_WRITE_LIBS} ${GTEST_LIBRARIES} processor dbbc) gtest_add_tests(TARGET simd_processors TEST_PREFIX columnstore:) diff --git a/tests/simd_processors.cpp b/tests/simd_processors.cpp index 85bcc14f9..0d009aff7 100644 --- a/tests/simd_processors.cpp +++ b/tests/simd_processors.cpp @@ -15,8 +15,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ - #include +#include #include #include #include @@ -25,465 +25,496 @@ #include "simd_sse.h" #include "simd_arm.h" #if defined(__x86_64__) - #define TESTS_USING_SSE 1 - using float64_t = double; - using float32_t = float; +#define TESTS_USING_SSE 1 +using float64_t = double; +using float32_t = float; #endif #ifdef __aarch64__ - #define TESTS_USING_ARM 1 +#define TESTS_USING_ARM 1 #endif using namespace std; - +#if defined(__x86_64__) || __aarch64__ template -class SimdProcessorTypedTest : public testing::Test { -public: +class SimdProcessorTypedTest : public testing::Test +{ + public: using IntegralType = T; - #if TESTS_USING_SSE - using SimdType = std::conditional_t::value, - simd::vi128f_wr, - std::conditional_t::value, - simd::vi128d_wr, - simd::vi128_wr>>; - using Proc = typename simd::SimdFilterProcessor; - #else - using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; - #endif +#if TESTS_USING_SSE + using SimdType = + std::conditional_t::value, simd::vi128f_wr, + std::conditional_t::value, simd::vi128d_wr, simd::vi128_wr>>; + using Proc = typename simd::SimdFilterProcessor; +#else + using Proc = typename simd::SimdFilterProcessor::WrapperType, T>; +#endif void SetUp() override { } }; -using SimdProcessor128TypedTestTypes = ::testing::Types; +using SimdProcessor128TypedTestTypes = + ::testing::Types; TYPED_TEST_SUITE(SimdProcessorTypedTest, SimdProcessor128TypedTestTypes); TYPED_TEST(SimdProcessorTypedTest, SimdFilterProcessor_simd128) { using Proc = typename SimdProcessorTypedTest::Proc; + + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + using SimdType = typename Proc::SimdType; - constexpr static simd::MT allTrue = 0xFFFF; - constexpr static simd::MT allFalse = 0x0; Proc proc; + const typename Proc::MaskType allTrue = proc.trueMask(); + const typename Proc::MaskType allFalse = proc.falseMask(); + SimdType lhs = 
proc.loadValue((TypeParam)-2); SimdType rhs = proc.loadValue((TypeParam)-3); EXPECT_GT((uint64_t)-2LL, (uint64_t)-3LL); - EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGt(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGe(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLt(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLe(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpEq(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allTrue)); lhs = proc.loadValue((TypeParam)-3); - EXPECT_EQ(proc.cmpEq(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse)); lhs = rhs = proc.loadValue((TypeParam)0); - EXPECT_EQ(proc.cmpGe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpGt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpGe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpGt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpLt(rhs, lhs), allFalse); - EXPECT_EQ(proc.cmpLe(lhs, rhs), allTrue); - EXPECT_EQ(proc.cmpLt(lhs, rhs), allFalse); - EXPECT_EQ(proc.cmpEq(rhs, lhs), allTrue); - EXPECT_EQ(proc.cmpNe(rhs, lhs), allFalse); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), allTrue)); + 
EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(rhs, lhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), allFalse)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(rhs, lhs), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(rhs, lhs), allFalse)); } +template +ResultType bitMaskProducerT(const IntegralType* l, const IntegralType* r, + std::function cmp, const bool printOut = false) +{ + uint64_t allOnes = 0xFFULL; + for (size_t i = 1; i < sizeof(IntegralType); ++i) + { + allOnes |= 0xFFULL << (i * 8); + } + ResultType result = {0x0, 0x0}; + + uint64_t* resultPtr = reinterpret_cast(&result); + for (size_t i = 0; i < VecSize >> 1; ++i) + { + if (cmp(l[i], r[i])) + { + if (printOut) + { + uint64_t pLeft = l[i]; + uint pRight = r[i]; + std::cout << "i " << i << " l " << cmp.target_type().name() << " r " << pLeft << " " << pRight + << std::endl; + } + resultPtr[0] |= allOnes << i * sizeof(IntegralType) * 8; + } + } + for (size_t i = VecSize >> 1; i < VecSize; ++i) + { + if (cmp(l[i], r[i])) + { + if (printOut) + { + uint64_t pLeft = l[i]; + uint pRight = r[i]; + std::cout << "i " << i << " l " << cmp.target_type().name() << " r " << pLeft << " " << pRight + << std::endl; + } + resultPtr[1] |= allOnes << (i - (VecSize >> 1)) * sizeof(IntegralType) * 8; + } + } + return result; +}; TEST(SimdProcessorTest, Int8) { - using Proc = typename SimdProcessorTypedTest::Proc; - using SimdType = typename Proc::SimdType; - Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; - int8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - int8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 
2, 32, 41, 2, 5}; - int8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - SimdType lhs = proc.loadFrom(reinterpret_cast(l)); - SimdType rhs = proc.loadFrom(reinterpret_cast(r)); - SimdType min = proc.loadFrom(reinterpret_cast(minlr)); - SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 16; i++) - if (l[i] > r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect); - SimdType testmax = proc.max(lhs, rhs); - SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); - - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] == r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); - - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] < r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); -} -TEST(SimdProcessorTest, Uint8) -{ - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = int8_t; + IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; + IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 58, 2, 32, 41, 2, 5}; + IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + using IntegralType = int8_t; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize; using SimdType = typename Proc::SimdType; + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint8_t l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 
2, 32, 41, 2, 5}; - uint8_t r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; - uint8_t minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; - uint8_t maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + const typename Proc::MaskType allTrue = proc.trueMask(); SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 16; i++) - if (l[i] > r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs),(simd::MT) ~expect); + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] == r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs),(simd::MT) ~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); +} - expect = 0x0; - for (int i = 0; i < 16; i++) - if (l[i] < r[i]) - expect |= 1 << i; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs),(simd::MT) ~expect); +TEST(SimdProcessorTest, Uint8) +{ + using IntegralType = 
uint8_t; + IntegralType l[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; + IntegralType r[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + IntegralType minlr[16]{0, 1, 2, 5, 4, 3, 8, 5, 6, 10, 5, 2, 32, 41, 2, 5}; + IntegralType maxlr[16]{0, 1, 8, 35, 24, 13, 8, 25, 16, 10, 58, 2, 32, 41, 2, 5}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + using SimdType = typename Proc::SimdType; + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; + + using Proc = typename SimdProcessorTypedTest::Proc; + using SimdType = typename Proc::SimdType; + Proc proc; + const Proc::MaskType allTrue = proc.trueMask(); + + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); + SimdType rhs = proc.loadFrom(reinterpret_cast(r)); + SimdType min = proc.loadFrom(reinterpret_cast(minlr)); + SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), true); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); + SimdType testmax = proc.max(lhs, rhs); + SimdType testmin = proc.min(lhs, rhs); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); + + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); + + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int16) { + using IntegralType = int16_t; + IntegralType l[8]{0, 1, 2, 
-5, 4, 3, -8, 200}; + IntegralType r[8]{0, 105, -8, 35, 24, 13, 8, 100}; + IntegralType minlr[8]{0, 1, -8, -5, 4, 3, -8, 100}; + IntegralType maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int16_t l[8]{0, 1, 2, -5, 4, 3, -8, 200}; - int16_t r[8]{0, 105, -8, 35, 24, 13, 8, 100}; - int16_t minlr[8]{0, 1, -8, -5, 4, 3, -8, 100}; - int16_t maxlr[8]{0, 105, 2, 35, 24, 13, 8, 200}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 8; i++) - if (l[i] > r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] == r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = 
bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] < r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint16) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = uint16_t; + IntegralType l[8]{0, 1, 2, 5, 4, 3, 8, 5}; + IntegralType r[8]{0, 1, 8, 35, 24, 13, 8, 17}; + IntegralType minlr[8]{0, 1, 2, 5, 4, 3, 8, 5}; + IntegralType maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17}; + + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint16_t l[8]{0, 1, 2, 5, 4, 3, 8, 5}; - uint16_t r[8]{0, 1, 8, 35, 24, 13, 8, 17}; - uint16_t minlr[8]{0, 1, 2, 5, 4, 3, 8, 5}; - uint16_t maxlr[8]{0, 1, 8, 35, 24, 13, 8, 17}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 8; i++) - if (l[i] > r[i]) - expect |= 3 << i*2; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = 
bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] == r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 8; i++) - if (l[i] < r[i]) - expect |= 3 << i * 2; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int32) { + using IntegralType = int32_t; + IntegralType l[8]{0, 1, 2, -5}; + IntegralType r[8]{0, 105, -8, 54333}; + IntegralType minlr[8]{0, 1, -8, -5}; + IntegralType maxlr[8]{0, 105, 2, 54333}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](Proc::MaskType left, Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int32_t l[8]{0, 1, 2, -5}; - int32_t r[8]{0, 105, -8,54333}; - int32_t minlr[8]{0, 1, -8, -5}; 
- int32_t maxlr[8]{0, 105, 2, 54333}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint32) { + using IntegralType = uint32_t; + IntegralType l[4]{0, 1002, 2, 514}; + IntegralType r[4]{2, 1, 80555, 35}; + IntegralType minlr[8]{0, 1, 2, 35}; + IntegralType maxlr[8]{2, 1002, 80555, 514}; using Proc = typename 
SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint32_t l[4]{0, 1002, 2, 514}; - uint32_t r[4]{2, 1, 80555, 35}; - uint32_t minlr[8]{0, 1, 2, 35}; - uint32_t maxlr[8]{2, 1002, 80555, 514}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i 
* 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Int64) { + using IntegralType = int64_t; + IntegralType l[2]{-5, 122020}; + IntegralType r[2]{0, 105}; + IntegralType minlr[8]{-5, 105}; + IntegralType maxlr[8]{0, 122020}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - int64_t l[2]{-5, 122020}; - int64_t r[2]{0, 105}; - int64_t minlr[8]{-5, 105}; - int64_t maxlr[8]{0, 122020}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, 
min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Uint64) { + using IntegralType = uint64_t; + IntegralType l[2]{822, 1002}; + IntegralType r[2]{2, 1}; + IntegralType minlr[8]{2, 1}; + IntegralType maxlr[8]{822, 1002}; using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - uint64_t l[2]{822, 1002}; - uint64_t r[2]{2, 1}; - uint64_t minlr[8]{2, 1}; - uint64_t maxlr[8]{822, 1002}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), 
(simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Float64) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = float64_t; + IntegralType l[2]{-5.0, 12.5620}; + IntegralType r[2]{2.9, 1}; + IntegralType minlr[8]{-5.0, 1}; + IntegralType maxlr[8]{2.9, 12.5620}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 
0xFFFF; - simd::MT expect = 0x0; - float64_t l[2]{-5.0, 12.5620}; - float64_t r[2]{2.9, 1}; - float64_t minlr[8]{-5.0, 1}; - float64_t maxlr[8]{2.9, 12.5620}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 2; i++) - if (l[i] > r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] == r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 2; i++) - if (l[i] < r[i]) - expect |= 0xFF << i * 8; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } TEST(SimdProcessorTest, Float32) { - using Proc = typename SimdProcessorTypedTest::Proc; + using IntegralType = float32_t; + 
IntegralType l[4]{82, 102, -5.6, 9.5}; + IntegralType r[4]{2.0, 1, -5.7, 6}; + IntegralType minlr[8]{2.0, 1, -5.7, 6}; + IntegralType maxlr[8]{82, 102, -5.6, 9.5}; + using Proc = typename SimdProcessorTypedTest::Proc; + constexpr const size_t VecSize = Proc::vecByteSize / sizeof(IntegralType); + auto cmpEqFunctor = [](typename Proc::MaskType left, typename Proc::MaskType right) + { return !memcmp((void*)(&left), (void*)(&right), sizeof(typename Proc::MaskType)); }; + auto bitMaskProducer = bitMaskProducerT; using SimdType = typename Proc::SimdType; Proc proc; - constexpr static simd::MT allTrue = 0xFFFF; - simd::MT expect = 0x0; - float32_t l[4]{82, 102,-5.6,9.5}; - float32_t r[4]{2.0, 1,-5.7,6}; - float32_t minlr[8]{2.0, 1, -5.7, 6}; - float32_t maxlr[8]{82, 102, -5.6, 9.5}; + const Proc::MaskType allTrue = proc.trueMask(); + SimdType lhs = proc.loadFrom(reinterpret_cast(l)); SimdType rhs = proc.loadFrom(reinterpret_cast(r)); SimdType min = proc.loadFrom(reinterpret_cast(minlr)); SimdType max = proc.loadFrom(reinterpret_cast(maxlr)); - for (int i = 0; i < 4; i++) - if (l[i] > r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpGt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpLe(lhs, rhs), (simd::MT)~expect); + + Proc::MaskType expectGt = bitMaskProducer(l, r, std::greater(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGt(lhs, rhs), expectGt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLe(lhs, rhs), ~expectGt)); SimdType testmax = proc.max(lhs, rhs); SimdType testmin = proc.min(lhs, rhs); - EXPECT_EQ(proc.cmpEq(testmax, max), allTrue); - EXPECT_EQ(proc.cmpEq(testmin, min), allTrue); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmax, max), allTrue)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(testmin, min), allTrue)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] == r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpEq(lhs, rhs), expect); - EXPECT_EQ(proc.cmpNe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectEq = bitMaskProducer(l, r, std::equal_to(), false); + 
EXPECT_TRUE(cmpEqFunctor(proc.cmpEq(lhs, rhs), expectEq)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpNe(lhs, rhs), ~expectEq)); - expect = 0x0; - for (int i = 0; i < 4; i++) - if (l[i] < r[i]) - expect |= 15 << i * 4; - EXPECT_EQ(proc.cmpLt(lhs, rhs), expect); - EXPECT_EQ(proc.cmpGe(lhs, rhs), (simd::MT)~expect); + Proc::MaskType expectLt = bitMaskProducer(l, r, std::less(), false); + EXPECT_TRUE(cmpEqFunctor(proc.cmpLt(lhs, rhs), expectLt)); + EXPECT_TRUE(cmpEqFunctor(proc.cmpGe(lhs, rhs), ~expectLt)); } +#endif \ No newline at end of file diff --git a/utils/common/simd_arm.h b/utils/common/simd_arm.h index a4387561b..884dcb324 100644 --- a/utils/common/simd_arm.h +++ b/utils/common/simd_arm.h @@ -17,59 +17,35 @@ #pragma once +#include "simd_sse.h" // ENUM_KIND #ifdef __aarch64__ #include "arm_neon.h" #include #include #ifdef __OPTIMIZE__ -#define MCS_FORCE_INLINE __attribute__((__always_inline__)) inline +#define MCS_FORCE_INLINE __attribute__((__always_inline__)) #else #define MCS_FORCE_INLINE inline #endif #include "mcs_datatype.h" - namespace simd { // the type is decided by the basic type -using vi1_t =int8x16_t; -using vi2_t =int16x8_t; -using vi4_t =int32x4_t; -using vi8_t =int64x2_t; -using vi1u_t =uint8x16_t; -using vi2u_t =uint16x8_t; -using vi4u_t =uint32x4_t; -using vi8u_t =uint64x2_t; +using vi1_t = int8x16_t; +using vi2_t = int16x8_t; +using vi4_t = int32x4_t; +using vi8_t = int64x2_t; +using vi1u_t = uint8x16_t; +using vi2u_t = uint16x8_t; +using vi4u_t = uint32x4_t; +using vi8u_t = uint64x2_t; using vi128f_t = float32x4_t; using vi128d_t = float64x2_t; using int128_t = __int128; -using MT = uint16_t; -using MaskSimdType=vi1u_t; -template -static vi8_t constant2i() -{ - static const union - { - int64_t i[2]; - vi8_t xmm; - } u = {{i0, i1}}; - return u.xmm; -} -static inline MaskSimdType bitMaskToByteMask16(MT m) -{ - vi8_t sel = constant2i<(int64_t)0xffffffffffffffff, (int64_t)0x0>(); - vi8_t andop = constant2i<(int64_t)0x8040201008040201, 
(int64_t)0x8040201008040201>(); - vi1u_t op = vreinterpretq_u8_s64( - vandq_s64(vbslq_s64(vreinterpretq_u64_s64(sel), vreinterpretq_s64_u8(vdupq_n_u8(m & 0xff)), - vreinterpretq_s64_u8(vdupq_n_u8((m & 0xff00) >> 8))), - andop)); - vi1u_t zero = vdupq_n_u8(0); - return vcgtq_u8(op, zero); -} -//the type is used by the fun like arm__neon__mm__... -using ArmNeonSSEVecType=uint8x16_t; -//wrapper types +using MaskSimdType = vi1u_t; +// wrapper types struct vi1_wr { int8x16_t v; @@ -116,35 +92,35 @@ struct vi128d_wr float64x2_t v; }; -template +template struct WidthToSVecWrapperType; template <> struct WidthToSVecWrapperType<1> { - using Vectype=int8x16_t; - using WrapperType=struct vi1_wr; + using Vectype = int8x16_t; + using WrapperType = struct vi1_wr; }; template <> struct WidthToSVecWrapperType<2> { using Vectype = int16x8_t; - using WrapperType=struct vi2_wr; + using WrapperType = struct vi2_wr; }; template <> struct WidthToSVecWrapperType<4> { using Vectype = int32x4_t; - using WrapperType=struct vi4_wr; + using WrapperType = struct vi4_wr; }; template <> struct WidthToSVecWrapperType<8> { using Vectype = int64x2_t; - using WrapperType=struct vi8_wr; + using WrapperType = struct vi8_wr; }; template <> struct WidthToSVecWrapperType<16> @@ -183,17 +159,17 @@ struct WidthToVecWrapperType<8> using WrapperType = struct viu8_wr; }; -//We get the simd and wrapper type of basic type by TypeToVecWrapperType. -template +// We get the simd and wrapper type of basic type by TypeToVecWrapperType. 
+template struct TypeToVecWrapperType; template -struct TypeToVecWrapperType::value>::type> +struct TypeToVecWrapperType::value>::type> : WidthToSVecWrapperType { }; template -struct TypeToVecWrapperType>::type> +struct TypeToVecWrapperType>::type> { using Vectype = vi128f_t; using WrapperType = vi128f_wr; @@ -202,22 +178,20 @@ template struct TypeToVecWrapperType>::type> { using Vectype = vi128d_t; - using WrapperType = vi128d_wr; + using WrapperType = vi128d_wr; }; template -struct TypeToVecWrapperType >::type> - : WidthToVecWrapperType +struct TypeToVecWrapperType>::type> + : WidthToVecWrapperType { }; template - struct TypeToVecWrapperType< - T, typename std::enable_if &&!is_floating_point_v>::type> - : WidthToSVecWrapperType +struct TypeToVecWrapperType && !is_floating_point_v>::type> + : WidthToSVecWrapperType { }; - template struct IntegralToSIMD; @@ -264,78 +238,11 @@ struct StorageToFiltering:: using type = T; }; -// these are the x86 instructions that need to be realized by some arm neon instructions -// the implementations of mm_movemask_epi8 for each type are different because of performance - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_64(ArmNeonSSEVecType input) -{ - return static_cast(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8)); -} - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_32(ArmNeonSSEVecType input) -{ - input = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(input), 4)); - return static_cast(vgetq_lane_u8(input, 3) | ((int)vgetq_lane_u8(input, 11) << 8)); -} - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8_16(ArmNeonSSEVecType input) -{ - input = vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(input), 14)); - input = vreinterpretq_u8_u32(vsraq_n_u32(vreinterpretq_u32_u8(input), vreinterpretq_u32_u8(input), 14)); - input = vreinterpretq_u8_u64(vsraq_n_u64(vreinterpretq_u64_u8(input), vreinterpretq_u64_u8(input), 28)); - return static_cast(vgetq_lane_u8(input, 0) | ((int)vgetq_lane_u8(input, 8) << 8)); -} 
-MCS_FORCE_INLINE MT arm_neon_mm_movemask_epi8(ArmNeonSSEVecType input) -{ - // Example input (half scale): - // 0x89 FF 1D C0 00 10 99 33 - - // Shift out everything but the sign bits - // 0x01 01 00 01 00 00 01 00 - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with vsra. The '??' bytes are garbage. - // vsri could also be used, but it is slightly slower on aarch64. - // 0x??03 ??02 ??00 ??01 - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - // Repeat with wider lanes. - // 0x??????0B ??????04 - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - // 0x??????????????4B - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - // Extract the low 8 bits from each lane and join. - // 0x4B - return static_cast(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); -} - - -MCS_FORCE_INLINE MT arm_neon_mm_movemask_pd(ArmNeonSSEVecType a) -{ - uint64x2_t input = vreinterpretq_u64_u8(a); - uint64x2_t high_bits = vshrq_n_u64(input, 63); - return static_cast (vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1)); -} -MCS_FORCE_INLINE MT arm_neon_mm_movemask_ps(ArmNeonSSEVecType a) -{ - uint32x4_t input = vreinterpretq_u32_u8(a); - static const int32x4_t shift = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(input, 31); - return static_cast(vaddvq_u32(vshlq_u32(tmp, shift))); -} - -MCS_FORCE_INLINE void arm_neon_mm_maskmoveu_si128(ArmNeonSSEVecType a, ArmNeonSSEVecType mask, char* mem_addr) -{ - int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_u8(mask), 7); - float32x4_t b = vld1q_f32((float*)mem_addr); - int8x16_t masked = - vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_u8(a), vreinterpretq_s8_f32(b)); - vst1q_s8((int8_t*)mem_addr, masked); -} - template class SimdFilterProcessor; -// Dummy class that captures all impossible cases, e.g. 
integer vector as VT and flot as CHECK_T.we use int32_t to do operations +// Dummy class that captures all impossible cases, e.g. integer vector as VT and flot as CHECK_T.we use +// int32_t to do operations template class SimdFilterProcessor< VT, CHECK_T, @@ -352,6 +259,8 @@ class SimdFilterProcessor< using SimdType = int32x4_t; using FilterType = T; using StorageType = T; + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -363,14 +272,18 @@ class SimdFilterProcessor< { return vdupq_n_s32(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s32((uint32x4_t)mask, y,x); + return vbslq_s32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { return (SimdType)vcgtq_s32(x, y); } + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return vmaxvq_s32(x); + } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const { return vandq_s32(x, y); @@ -387,7 +300,7 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpDummy(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); } // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) @@ -422,24 +335,28 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } + MCS_FORCE_INLINE 
SimdType min(SimdType x, SimdType y) { return reinterpret_cast(std::min(reinterpret_cast(x), reinterpret_cast(y))); @@ -458,17 +375,11 @@ class SimdFilterProcessor< { return vdupq_n_s32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s32(reinterpret_cast(dst), x); } - }; template @@ -485,9 +396,11 @@ class SimdFilterProcessor< using SimdType = simd::vi128d_t; using StorageSimdType = typename WidthToSVecWrapperType::Vectype; using StorageType = typename datatypes::WidthToSIntegralType::type; - using StorageWrapperTypeType =typename WidthToSVecWrapperType::WrapperType; + using StorageWrapperTypeType = typename WidthToSVecWrapperType::WrapperType; using StorageVecProcType = SimdFilterProcessor; constexpr static const uint16_t FilterMaskStep = sizeof(T); + using MT = uint64x2_t; + using MaskType = MT; // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -506,9 +419,9 @@ class SimdFilterProcessor< { return vld1q_f64(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_f64((uint64x2_t)mask,y,x); + return vbslq_f64(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -518,12 +431,12 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_f64(x, y)); + return vceqq_f64(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_f64(x, y)); + return vcgeq_f64(x, y); } MCS_FORCE_INLINE T maxScalar(SimdType x) { @@ -539,40 +452,42 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return 
arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_f64(x, y)); + return vcgtq_f64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_f64(x, y)); + return vcleq_f64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_f64(x, y)); + return vcltq_f64(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y) ^ 0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmpEq(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); } - - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_pd((ArmNeonSSEVecType)vmask); + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -619,14 +534,22 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using FilterType = T; - using NullEmptySimdType =typename WidthToSVecWrapperType::Vectype; + using NullEmptySimdType = typename WidthToSVecWrapperType::Vectype; using SimdWrapperType = vi128f_wr; using SimdType = vi128f_t; using StorageSimdType = typename WidthToSVecWrapperType::Vectype; using StorageType = typename datatypes::WidthToSIntegralType::type; - using StorageWrapperTypeType =typename WidthToSVecWrapperType::WrapperType; + using StorageWrapperTypeType = typename WidthToSVecWrapperType::WrapperType; using StorageVecProcType = SimdFilterProcessor; constexpr static const uint16_t FilterMaskStep = sizeof(T); + using MT = uint32x4_t; + using MaskType = MT; + 
MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -634,9 +557,9 @@ class SimdFilterProcessor< // This spec borrows the expr from u-/int64 based proceesor class. return (SimdType)nullEmptyProcessor.loadValue(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_f32((uint32x4_t)mask, y,x); + return vbslq_f32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -667,48 +590,51 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_f32(x, y)); + return vceqq_f32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_f32(x, y)); + return vcgeq_f32(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_f32(x, y)); + return vcgtq_f32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_f32(x, y)); + return vcleq_f32(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_f32(x, y)); + return vcltq_f32(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmvnq_u32(vceqq_f32(x, y))); + return vmvnq_u32(vceqq_f32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return 
vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_ps((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -762,10 +688,11 @@ class SimdFilterProcessor< constexpr static const uint16_t vecBitSize = 128U; using T = typename datatypes::WidthToSIntegralType::type; using SimdWrapperType = typename WidthToSVecWrapperType::WrapperType; - using SimdType =typename WidthToSVecWrapperType::Vectype; + using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint64x2_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -777,15 +704,22 @@ class SimdFilterProcessor< { return vdupq_n_s64(fill); } - + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return std::max(((int64_t*)(&x))[0], ((int64_t*)(&x))[1]); + } + MCS_FORCE_INLINE T minScalar(SimdType x) + { + return std::min(((int64_t*)(&x))[0], ((int64_t*)(&x))[1]); + } // Load from MCS_FORCE_INLINE SimdType loadFrom(const char* from) { return vld1q_s64(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s64((uint64x2_t)mask, y,x); + return vbslq_s64(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -800,48 +734,56 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType) vcgeq_s64(x,y)); + return vcgeq_s64(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_s64(x, y)); + return 
vcgtq_s64(x, y); } MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_s64(x, y)); + return vceqq_s64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_s64(x, y)); + return vcleq_s64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcltq_s64(x, y)); + return vcltq_s64(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y)^0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(cmpEq(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } // misc MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask); + return vmask; } MCS_FORCE_INLINE SimdType setToZero() @@ -853,24 +795,25 @@ class SimdFilterProcessor< { return cmpNe(x, y); } + + MCS_FORCE_INLINE MT nullEmptyCmpNe(MaskType x, MaskType y) + { + return cmpNe(x, y); + } + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return vbslq_s64(vcgtq_s64(y,x), x, y); + return vbslq_s64(vcgtq_s64(y, x), x, y); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) { - return vbslq_s64(vcgtq_s64(x,y), x, y); + return vbslq_s64(vcgtq_s64(x, y), x, y); } MCS_FORCE_INLINE void store(char* dst, SimdType x) { @@ -892,7 +835,8 @@ class 
SimdFilterProcessor< using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint64x2_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -904,9 +848,9 @@ class SimdFilterProcessor< { return vdupq_n_u64(fill); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u64((uint64x2_t)mask, y,x); + return vbslq_u64(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -924,30 +868,37 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgeq_u64(x, y)); + return vcgeq_u64(x, y); + } + MCS_FORCE_INLINE T maxScalar(SimdType x) + { + return std::max(((uint64_t*)(&x))[0], ((uint64_t*)(&x))[1]); + } + MCS_FORCE_INLINE T minScalar(SimdType x) + { + return std::min(((uint64_t*)(&x))[0], ((uint64_t*)(&x))[1]); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcgtq_u64(x, y)); + return vcgtq_u64(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return vbslq_u64(vcgtq_u64(y,x), x, y); + return vbslq_u64(vcgtq_u64(y, x), x, y); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) { - return vbslq_u64(vcgtq_u64(x,y), x, y); + return vbslq_u64(vcgtq_u64(x, y), x, y); } MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)); + return vceqq_u64(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vcleq_u64(x, y)); + return vcleq_u64(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) @@ -957,25 +908,28 @@ class SimdFilterProcessor< MCS_FORCE_INLINE 
MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vceqq_u64(x, y)) ^ 0xFFFF; + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_u64(x, y)))); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u64(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_epi8_64((ArmNeonSSEVecType)vmask); + return vdupq_n_u64(0); } + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u64(0xFFFFFFFFFFFFFFFF); + } MCS_FORCE_INLINE SimdType setToZero() { return vdupq_n_u64(0); @@ -991,12 +945,6 @@ class SimdFilterProcessor< return cmpEq(x, y); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_u64(reinterpret_cast(dst), x); @@ -1012,12 +960,19 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using T = typename datatypes::WidthToSIntegralType::type; - using SimdWrapperType =typename WidthToSVecWrapperType::WrapperType; + using SimdWrapperType = typename WidthToSVecWrapperType::WrapperType; using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1037,9 +992,9 @@ class SimdFilterProcessor< { return vld1q_s32(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s32((uint32x4_t)mask, y,x); + return vbslq_s32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -1052,22 +1007,22 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType) vceqq_s32(x, y)); + return vceqq_s32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_s32(x, y)); + return vcgeq_s32(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_s32(x, y)); + return vcgtq_s32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_s32(x, y)); + return vcleq_s32(x, y); } MCS_FORCE_INLINE T minScalar(SimdType x) { @@ -1084,28 +1039,31 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcltq_s32(x, y)); + return vcltq_s32(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_s32(x, y)) ^ 0xFFFF; + return vmvnq_u32(vceqq_s32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc 
- MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1123,12 +1081,6 @@ class SimdFilterProcessor< return vdupq_n_s32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s32(reinterpret_cast(dst), x); @@ -1149,8 +1101,15 @@ class SimdFilterProcessor< using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint32x4_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint32x4_t{ptr[0], ptr[1], ptr[2], ptr[3]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1170,9 +1129,9 @@ class SimdFilterProcessor< { return vld1q_u32(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u32((uint32x4_t)mask, y,x); + return vbslq_u32(mask, y, x); } MCS_FORCE_INLINE SimdType cmpGtSimdType(SimdType x, SimdType y) const { @@ -1185,12 +1144,12 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)); + return vceqq_u32(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgeq_u32(x, y)); + return vcgeq_u32(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, 
SimdType y) { @@ -1203,12 +1162,12 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcgtq_u32(x, y)); + return vcgtq_u32(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vcleq_u32(x, y)); + return vcleq_u32(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) @@ -1218,12 +1177,12 @@ class SimdFilterProcessor< MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vceqq_u32(x, y)) ^ 0xFFFF; + return vmvnq_u32(vceqq_u32(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u32(0); } MCS_FORCE_INLINE T minScalar(SimdType x) { @@ -1231,13 +1190,16 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u32(0xFFFFFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u32(0); } - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_32((ArmNeonSSEVecType)vmask); + return vdupq_n_u32(0xFFFFFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1255,12 +1217,6 @@ class SimdFilterProcessor< return vdupq_n_u32(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_u32(reinterpret_cast(dst), x); @@ -1280,8 +1236,16 @@ class SimdFilterProcessor< using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint16x8_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for 
little-endian archs. + const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint16x8_t{ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]}; + } + // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1305,11 +1269,11 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vceqq_s16(x, y)); + return vceqq_s16(x, y); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s16((uint16x8_t)mask, y,x); + return vbslq_s16(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1326,12 +1290,12 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgeq_s16(x, y)); + return vcgeq_s16(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcgtq_s16(x, y)); + return vcgtq_s16(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { @@ -1344,33 +1308,36 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcleq_s16(x, y)); + return vcleq_s16(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vcltq_s16(x, y)); + return vcltq_s16(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return cmpEq(x,y) ^ 0xFFFF; + return vmvnq_u16(cmpEq(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u16(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u16(0xFFFF); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u16(0); } - // misc - MCS_FORCE_INLINE MT 
convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT trueMask() { - return arm_neon_mm_movemask_epi8_16((ArmNeonSSEVecType)vmask); + return vdupq_n_u16(0xFFFF); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1388,12 +1355,6 @@ class SimdFilterProcessor< return vdupq_n_s16(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s16(reinterpret_cast(dst), x); @@ -1401,20 +1362,27 @@ class SimdFilterProcessor< }; template -class SimdFilterProcessor::value && - std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = uint16_t; + using T = uint16_t; using SimdWrapperType = typename WidthToVecWrapperType::WrapperType; using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint16x8_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const uint8_t* ptr = reinterpret_cast(reinterpret_cast(inputArray)); + return uint16x8_t{ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]}; + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1454,16 +1422,16 @@ class SimdFilterProcessor(dst), x); @@ -1546,6 +1511,8 @@ class SimdFilterProcessor< using SimdType = typename WidthToSVecWrapperType::Vectype; using FilterType = T; using StorageType = T; + using MT = uint8x16_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value @@ -1584,9 +1551,9 @@ class SimdFilterProcessor< { return vld1q_s8(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_s8((uint8x16_t)mask, y, x); + return vbslq_s8(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1596,47 +1563,56 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)); + return vceqq_s8(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_s8(x, y)); + return vcgeq_s8(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_s8(x, y)); + return vcgtq_s8(x, y); } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_s8(x, y)); + return vcleq_s8(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_s8(x, y)); + return vcltq_s8(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_s8(x, y)) ^ 0xFFFF; + return vmvnq_u8(vceqq_s8(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - 
return 0; + return vdupq_n_u8(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u8(0xff); + } + MCS_FORCE_INLINE MT falseMask() + { + return vdupq_n_u8(0); + } + + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u8(0xff); } // misc MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vmask; } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1654,12 +1630,6 @@ class SimdFilterProcessor< return vdupq_n_s8(0); } - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, SimdType x) { vst1q_s8(reinterpret_cast(dst), x); @@ -1675,11 +1645,12 @@ class SimdFilterProcessor< constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; using T = uint8_t; - using SimdWrapperType =typename WidthToVecWrapperType::WrapperType; + using SimdWrapperType = typename WidthToVecWrapperType::WrapperType; using SimdType = typename WidthToVecWrapperType::Vectype; using FilterType = T; using StorageType = T; - + using MT = uint8x16_t; + using MaskType = MT; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -1705,9 +1676,9 @@ class SimdFilterProcessor< { return vld1q_u8(reinterpret_cast(from)); } - MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, SimdType mask) const + MCS_FORCE_INLINE SimdType blend(SimdType x, SimdType y, MT mask) const { - return vbslq_u8((uint8x16_t)mask, y, x); + return vbslq_u8(mask, y, x); } MCS_FORCE_INLINE SimdType bwAnd(SimdType x, SimdType y) const @@ -1721,17 +1692,17 @@ class SimdFilterProcessor< // Compare MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)); + 
return vceqq_u8(x, y); } MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgeq_u8(x, y)); + return vcgeq_u8(x, y); } MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcgtq_u8(x, y)); + return vcgtq_u8(x, y); } MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { @@ -1744,34 +1715,37 @@ class SimdFilterProcessor< } MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcleq_u8(x, y)); + return vcleq_u8(x, y); } MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vcltq_u8(x, y)); + return vcltq_u8(x, y); } MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vceqq_u8(x, y)) ^ 0xFFFF; + return vmvnq_u8(vceqq_u8(x, y)); } MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return vdupq_n_u8(0); } MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return vdupq_n_u8(0xff); } - - // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType vmask) + MCS_FORCE_INLINE MT falseMask() { - return arm_neon_mm_movemask_epi8((ArmNeonSSEVecType)vmask); + return vdupq_n_u8(0); + } + + MCS_FORCE_INLINE MT trueMask() + { + return vdupq_n_u8(0xff); } MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) @@ -1779,6 +1753,11 @@ class SimdFilterProcessor< return cmpNe(x, y); } + // MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + // { + // return cmpNe(x, y); + // } + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); @@ -1788,19 +1767,12 @@ class SimdFilterProcessor< { return vdupq_n_u8(0); } - - // store - MCS_FORCE_INLINE void storeWMask(SimdType x, SimdType vmask, char* dst) - { - arm_neon_mm_maskmoveu_si128((ArmNeonSSEVecType)x, (ArmNeonSSEVecType)vmask, dst); - } - MCS_FORCE_INLINE void store(char* dst, 
SimdType x) { vst1q_u8(reinterpret_cast(dst), x); } }; -}; // namespace simd +}; // namespace simd #endif diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index edd75bcc6..4a8fb42df 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -27,7 +27,6 @@ enum ENUM_KIND KIND_TEXT }; // whitespace-trimmed and then compared as signed integers - #if defined(__x86_64__) #include @@ -54,6 +53,7 @@ using vi128f_t = __m128; using vi128d_t = __m128d; using int128_t = __int128; using MT = uint16_t; + // These ugly wrappers are used to allow to use __m128* as template class parameter argument struct vi128_wr { @@ -117,22 +117,32 @@ struct StorageToFiltering:: }; template -static inline vi128_t constant4i() { - static const union { - int i[4]; - vi128_t xmm; - } u = {{i0,i1,i2,i3}}; - return u.xmm; +static inline vi128_t constant4i() +{ + static const union + { + int i[4]; + vi128_t xmm; + } u = {{i0, i1, i2, i3}}; + return u.xmm; } -static inline vi128_t bitMaskToByteMask16(MT m) { +template +static inline vi128_t constant8i() +{ + static const union + { + int8_t i[16]; + vi128_t xmm; + } u = {{i0, i0, i1, i1, i2, i2, i3, i3, i4, i4, i5, i5, i6, i6, i7, i7}}; + return u.xmm; +} + +static inline vi128_t bitMaskToByteMask16(MT m) +{ vi128_t sel = _mm_set1_epi64x(0x8040201008040201); return _mm_cmpeq_epi8( - _mm_and_si128( - _mm_shuffle_epi8(_mm_cvtsi32_si128(m), - _mm_set_epi64x(0x0101010101010101, 0)), - sel), - sel); + _mm_and_si128(_mm_shuffle_epi8(_mm_cvtsi32_si128(m), _mm_set_epi64x(0x0101010101010101, 0)), sel), sel); } template @@ -155,6 +165,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) @@ -173,49 +184,49 @@ class SimdFilterProcessor< return _mm_loadu_si128(reinterpret_cast(from)); } - MCS_FORCE_INLINE MT cmpDummy(SimdType x, 
SimdType y) + MCS_FORCE_INLINE MaskType cmpDummy(SimdType x, SimdType y) { - return 0xFFFF; + return MaskType{0x0, 0x0}; } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); // ???? 
} // misc @@ -224,12 +235,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpDummy(x, y); } @@ -274,6 +285,14 @@ class SimdFilterProcessor< { return reinterpret_cast(std::max(reinterpret_cast(x), reinterpret_cast(y))); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template @@ -290,6 +309,7 @@ class SimdFilterProcessor< using SimdType = simd::vi128d_t; using StorageSimdType = simd::vi128_t; using StorageType = typename datatypes::WidthToSIntegralType::type; + using MaskType = vi128_t; using StorageVecProcType = SimdFilterProcessor; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
@@ -314,44 +334,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y)); + return (MaskType)_mm_cmpeq_pd(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x, y)); + return (MaskType)_mm_cmpge_pd(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y)); + return (MaskType)_mm_cmpgt_pd(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y)); + return (MaskType)_mm_cmple_pd(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y)); + return (MaskType)_mm_cmplt_pd(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y)); + return (MaskType)_mm_cmpneq_pd(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } // misc @@ -360,7 +380,8 @@ class SimdFilterProcessor< return _mm_movemask_pd(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + // Maybe unused + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { 
StorageVecProcType nullEmptyProcessor; NullEmptySimdType* xAsIntVecPtr = reinterpret_cast(&x); @@ -369,7 +390,13 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + { + StorageVecProcType nullEmptyProcessor; + return nullEmptyProcessor.cmpNe(x, y); + } + + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -413,6 +440,14 @@ class SimdFilterProcessor< { return _mm_and_pd(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template @@ -428,10 +463,18 @@ class SimdFilterProcessor< using SimdType = vi128f_t; using StorageSimdType = simd::vi128_t; using StorageType = typename datatypes::WidthToSIntegralType::type; + using MaskType = vi128_t; using StorageVecProcType = SimdFilterProcessor; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // These masks are valid for little-endian archs. 
+ const MaskType byteMaskVec = + constant4i<(int32_t)0x000000FF, (int32_t)0x0000FF00, (int32_t)0x00FF0000, (int32_t)0xFF000000>(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -452,44 +495,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_ps(x, y)); + return (MaskType)_mm_cmpeq_ps(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_ps(x, y)); + return (MaskType)_mm_cmpge_ps(x, y); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_ps(x, y)); + return (MaskType)_mm_cmpgt_ps(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmple_ps(x, y)); + return (MaskType)_mm_cmple_ps(x, y); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_ps(x, y)); + return (MaskType)_mm_cmplt_ps(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_ps(x, y)); + return (MaskType)_mm_cmpneq_ps(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return 
_mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } // misc @@ -498,7 +541,8 @@ class SimdFilterProcessor< return _mm_movemask_ps(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + // WIP maybe unused + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -508,7 +552,7 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { StorageVecProcType nullEmptyProcessor; @@ -518,6 +562,12 @@ class SimdFilterProcessor< return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr); } + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(MaskType x, MaskType y) + { + StorageVecProcType nullEmptyProcessor; + return nullEmptyProcessor.cmpNe(x, y); + } + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_ps(); @@ -552,12 +602,21 @@ class SimdFilterProcessor< { return _mm_and_ps(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -567,6 +626,7 @@ class SimdFilterProcessor -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -699,6 +768,7 @@ class SimdFilterProcessor(); + SimdType signVec = constant4i<0, (int32_t)0x80000000, 0, (int32_t)0x80000000>(); SimdType 
xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_movemask_epi8(_mm_cmpgt_epi64(xFlip, yFlip)); + return _mm_cmpgt_epi64(xFlip, yFlip); } - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)); + return _mm_cmpeq_epi64(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFFFFFFFFFFFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpGt(y, x); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi64(x, y) ^ loadValue(0xFFFFFFFFFFFFFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFFFFFFFFFFFFFF); } // misc @@ -774,12 +844,12 @@ class SimdFilterProcessor(); + SimdType signVec = constant4i<0, (int32_t)0x80000000, 0, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); return _mm_cmpgt_epi64(xFlip, yFlip); } - MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) const + MCS_FORCE_INLINE SimdType min(SimdType x, SimdType y) { - return blend(x, y, cmpGtSimdType(x,y)); + return blend(x, y, cmpGt(x, y)); } MCS_FORCE_INLINE SimdType max(SimdType x, SimdType y) const { - return blend(x, y, cmpGtSimdType(y,x)); + return blend(x, y, cmpGt(y, x)); + } + MCS_FORCE_INLINE MaskType falseMask() const + { + return MaskType{0x0, 0x0}; 
+ } + MCS_FORCE_INLINE MaskType trueMask() const + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); } }; template -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -837,9 +916,18 @@ class SimdFilterProcessor(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -858,44 +946,44 @@ class SimdFilterProcessor -class SimdFilterProcessor::value && std::is_same::value && - !std::is_same::value>::type> +class SimdFilterProcessor< + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value && + !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -969,9 +1066,17 @@ class SimdFilterProcessor(); + return _mm_and_si128(_mm_set1_epi32(*(const int32_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -990,47 +1095,48 @@ class SimdFilterProcessor(); + SimdType signVec = + constant4i<(int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); - return _mm_movemask_epi8(_mm_cmpgt_epi32(xFlip, yFlip)); + return _mm_cmpgt_epi32(xFlip, yFlip); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFFFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { return cmpGt(y, x); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return 
_mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi32(x, y) ^ loadValue(0xFFFFFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFFFFFF); } // misc @@ -1039,12 +1145,12 @@ class SimdFilterProcessor(); + SimdType signVec = + constant4i<(int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000>(); SimdType xFlip = _mm_xor_si128(x, signVec); SimdType yFlip = _mm_xor_si128(y, signVec); return _mm_cmpgt_epi32(xFlip, yFlip); @@ -1092,11 +1199,20 @@ class SimdFilterProcessor class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1106,9 +1222,45 @@ class SimdFilterProcessor< using SimdType = simd::vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + // const CHECK_T value1 = inputArray[0]; + // const CHECK_T value2 = inputArray[1]; + // const CHECK_T value3 = inputArray[2]; + // const CHECK_T value4 = inputArray[3]; + // const CHECK_T value5 = inputArray[4]; + // const CHECK_T value6 = inputArray[5]; + // const CHECK_T value7 = inputArray[6]; + // const CHECK_T value8 = inputArray[7]; + // union + // { + // CHECK_T i[vecByteSize / sizeof(CHECK_T)]; + // vi128_t xmm; + // } u = {{value1, value2, value3, value4, value5, value6, value7, value8}}; + // return u.xmm; + // std::cout << " maskCtor ptr " << std::hex << (uint64_t)inputArray << " val " << *(int64_t*)inputArray + // << std::endl; + const SimdType byteMaskVec = constant8i<0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07>(); + // auto a1 = _mm_set1_epi64x(*(int64_t*)inputArray); + // auto a2 = _mm_shuffle_epi8(a1, byteMaskVec); + // { + // std::cout << " maskCtor ptr byteMaskVec " << std::hex << ((uint64_t*)(&byteMaskVec))[0] << " " + // << ((uint64_t*)(&byteMaskVec))[1] << std::endl; + // } + // { + // std::cout << " maskCtor ptr a1 " << std::hex << ((uint64_t*)(&a1))[0] << " " << ((uint64_t*)(&a1))[1] + // << std::endl; + // } + // { + // std::cout << " maskCtor ptr a2 " << std::hex << ((uint64_t*)(&a2))[0] << " " << ((uint64_t*)(&a2))[1] + // << std::endl; + // } + return _mm_shuffle_epi8(_mm_set1_epi64x(*(int64_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1127,44 +1279,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); + return _mm_cmpeq_epi16(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return cmpLt(x, y) ^ 0xFFFF; + return 
cmpLt(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y)); + return _mm_cmpgt_epi16(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmplt_epi16(x, y)); + return _mm_cmplt_epi16(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi16(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFF); } // misc @@ -1173,12 +1325,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1223,11 +1375,20 @@ class SimdFilterProcessor< { return _mm_max_epi16(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template -class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> +class 
SimdFilterProcessor::value && + std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1237,9 +1398,15 @@ class SimdFilterProcessor< using SimdType = simd::vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); + MCS_FORCE_INLINE MaskType maskCtor(const char* inputArray) + { + const SimdType byteMaskVec = constant8i<0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07>(); + return _mm_shuffle_epi8(_mm_set1_epi64x(*(int64_t*)inputArray), byteMaskVec); + } // Load value MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { @@ -1258,45 +1425,65 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); + return _mm_cmpeq_epi16(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, maxOfTwo)); + SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned + return _mm_cmpeq_epi16(x, maxOfTwo); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + // MCS_FORCE_INLINE MaskType cmpGE(SimdType x, SimdType y) + // { + // SimdType maxOfTwo = _mm_max_epu16(x, y); // max(x, y), unsigned + // return _mm_cmpeq_epi16(x, maxOfTwo); + // } + + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return cmpGe(y, x) ^ 0xFFFF; + return cmpGe(y, x) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + // MCS_FORCE_INLINE MaskType cmpLE(SimdType x, SimdType y) + // { + // 
return cmpGE(y, x); + // } + + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return cmpGe(x, y) ^ 0xFFFF; + // auto a = cmpGe(x, y); + // uint64_t* aRef = (uint64_t*)&a; + // auto b = loadValue(0xFF); + // uint64_t* bRef = (uint64_t*)&b; + // auto c = cmpGe(x, y) ^ loadValue(0xFF); + // uint64_t* cRef = (uint64_t*)&c; + // std::cout << " cmpLt cmpGe " << std::hex << aRef[0] << " " << aRef[1] << " loadValue " << bRef[0] << " + // " + // << bRef[1] << " result " << cRef[0] << " " << cRef[1] << std::endl; + return cmpGe(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi16(x, y) ^ loadValue(0xFFFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFFFF); } // misc @@ -1305,12 +1492,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1358,11 +1545,20 @@ class SimdFilterProcessor< { return _mm_max_epu16(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && 
std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1372,6 +1568,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. constexpr static const uint16_t FilterMaskStep = sizeof(T); @@ -1393,44 +1590,44 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); + return _mm_cmpeq_epi8(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - return cmpLt(x, y) ^ 0xFFFF; + return cmpLt(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); + return _mm_cmpgt_epi8(x, y); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { - return cmpGt(x, y) ^ 0xFFFF; + return cmpGt(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); + return _mm_cmplt_epi8(x, y); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi8(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFF); } // permute 
@@ -1446,12 +1643,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1496,11 +1693,20 @@ class SimdFilterProcessor< { return _mm_max_epi8(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; template class SimdFilterProcessor< - VT, CHECK_T, typename std::enable_if::value && std::is_same::value>::type> + VT, CHECK_T, + typename std::enable_if::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; @@ -1510,6 +1716,7 @@ class SimdFilterProcessor< using SimdType = vi128_t; using FilterType = T; using StorageType = T; + using MaskType = vi128_t; // Mask calculation for int and float types differs. // See corresponding intrinsics algos for details. 
constexpr static const uint16_t FilterMaskStep = sizeof(T); @@ -1531,45 +1738,45 @@ class SimdFilterProcessor< } // Compare - MCS_FORCE_INLINE MT cmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpEq(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); + return _mm_cmpeq_epi8(x, y); } - MCS_FORCE_INLINE MT cmpGe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGe(SimdType x, SimdType y) { - SimdType maxOfTwo = _mm_max_epu8(x, y); // max(x, y), unsigned - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, maxOfTwo)); + SimdType maxOfTwo = _mm_max_epu8(x, y); // max(x, y), unsigned + return _mm_cmpeq_epi8(x, maxOfTwo); } - MCS_FORCE_INLINE MT cmpGt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpGt(SimdType x, SimdType y) { - return cmpGe(y, x) ^ 0xFFFF; + return cmpGe(y, x) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpLe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLe(SimdType x, SimdType y) { return cmpGe(y, x); } - MCS_FORCE_INLINE MT cmpLt(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpLt(SimdType x, SimdType y) { - return cmpGe(x, y) ^ 0xFFFF; + return cmpGe(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpNe(SimdType x, SimdType y) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; + return _mm_cmpeq_epi8(x, y) ^ loadValue(0xFF); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysFalse(SimdType x, SimdType y) { - return 0; + return MaskType{0x0, 0x0}; } - MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType cmpAlwaysTrue(SimdType x, SimdType y) { - return 0xFFFF; + return loadValue(0xFF); } // permute @@ -1585,12 +1792,12 @@ class SimdFilterProcessor< return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpNe(SimdType x, SimdType y) { return cmpNe(x, y); } - MCS_FORCE_INLINE 
MT nullEmptyCmpEq(SimdType x, SimdType y) + MCS_FORCE_INLINE MaskType nullEmptyCmpEq(SimdType x, SimdType y) { return cmpEq(x, y); } @@ -1638,6 +1845,14 @@ class SimdFilterProcessor< { return _mm_max_epu8(x, y); } + MCS_FORCE_INLINE MaskType falseMask() + { + return MaskType{0x0, 0x0}; + } + MCS_FORCE_INLINE MaskType trueMask() + { + return _mm_set_epi64x(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL); + } }; } // namespace simd