diff --git a/mysql-test/columnstore/basic/r/double_float.result b/mysql-test/columnstore/basic/r/double_float.result new file mode 100644 index 000000000..25997a3d4 --- /dev/null +++ b/mysql-test/columnstore/basic/r/double_float.result @@ -0,0 +1,176 @@ +DROP DATABASE IF EXISTS `double_float`; +CREATE DATABASE `double_float`; +USE `double_float`; +SET default_storage_engine=Columnstore; +SELECT @@default_storage_engine; +@@default_storage_engine +Columnstore +set autocommit=0; +CREATE TABLE test1 (dkey int); +INSERT INTO test1 VALUES (1), (2), (3); +SELECT test1.dkey FROM test1 ORDER BY test1.dkey; +dkey +1 +2 +3 +CREATE TABLE qatabledouble (col DOUBLE) ; +CREATE TABLE qatablefloat (col float) ; +INSERT INTO qatabledouble VALUES (-2.225073858507201E-307); +INSERT INTO qatabledouble VALUES (-1.807302187774382E-127); +INSERT INTO qatabledouble VALUES (0); +INSERT INTO qatabledouble VALUES (1.993777023789432E+21); +INSERT INTO qatabledouble VALUES (1.797693134862315E+38); +INSERT INTO qatabledouble VALUES (-19937770237894323221); +INSERT INTO qatabledouble VALUES (17976931348623158); +SELECT * FROM qatabledouble; +col +-2.225073858507201e-307 +-1.807302187774382e-127 +0 +1.993777023789432e21 +1.797693134862315e38 +-1.9937770237894324e19 +1.7976931348623158e16 +INSERT INTO qatabledouble VALUES (null); +INSERT INTO qatabledouble VALUES (null); +INSERT INTO qatabledouble VALUES (null); +SELECT * FROM qatabledouble; +col +-2.225073858507201e-307 +-1.807302187774382e-127 +0 +1.993777023789432e21 +1.797693134862315e38 +-1.9937770237894324e19 +1.7976931348623158e16 +NULL +NULL +NULL +INSERT INTO qatablefloat VALUES (null); +INSERT INTO qatablefloat VALUES (null); +INSERT INTO qatablefloat VALUES (null); +SELECT * FROM qatablefloat; +col +NULL +NULL +NULL +DELETE FROM qatabledouble WHERE col IS NULL; +SELECT * FROM qatabledouble; +col +-2.225073858507201e-307 +-1.807302187774382e-127 +0 +1.993777023789432e21 +1.797693134862315e38 +-1.9937770237894324e19 +1.7976931348623158e16 +delete FROM qatablefloat WHERE col IS NULL; +SELECT * FROM qatablefloat; +col +CREATE TABLE qatabledouble_v2 (col1 DOUBLE, col2 DOUBLE, col3 DOUBLE) ; +INSERT INTO qatabledouble_v2 VALUES (-0.50, -0.50, -0.50); +INSERT INTO qatabledouble_v2 VALUES (-0.49, -0.49, -0.49); +INSERT INTO qatabledouble_v2 VALUES (0.49, 0.49, 0.49); +INSERT INTO qatabledouble_v2 VALUES (0.50, 0.50, 0.50); +INSERT INTO qatabledouble_v2 VALUES (+8,+8,+8); +INSERT INTO qatabledouble_v2 VALUES (+0.50,+0.50,+0.50); +INSERT INTO qatabledouble_v2 VALUES (+0.49,+0.49,+0.49); +INSERT INTO qatabledouble_v2 VALUES (+0.0,+0.0,+0.0); +INSERT INTO qatabledouble_v2 VALUES (+.50,+.50,+.50); +INSERT INTO qatabledouble_v2 VALUES (+.49,+.49,+.49); +INSERT INTO qatabledouble_v2 VALUES (+.0,+.0,+.0); +INSERT INTO qatabledouble_v2 VALUES (-.0,-.0,-.0); +INSERT INTO qatabledouble_v2 VALUES (-.49,-.49,-.49); +INSERT INTO qatabledouble_v2 VALUES (-.50,-.50,-.50); +INSERT INTO qatabledouble_v2 VALUES (-0.0,-0.0,-0.0); +INSERT INTO qatabledouble_v2 VALUES (-0.49,-0.49,-0.49); +INSERT INTO qatabledouble_v2 VALUES (-0.50,-0.50,-0.50); +INSERT INTO qatabledouble_v2 VALUES (-8,-8,-8); +INSERT INTO qatabledouble_v2 VALUES (8,8,8); +INSERT INTO qatabledouble_v2 VALUES (0.50,0.50,0.50); +INSERT INTO qatabledouble_v2 VALUES (0.49,0.49,0.49); +INSERT INTO qatabledouble_v2 VALUES (0.0,0.0,0.0); +INSERT INTO qatabledouble_v2 VALUES (.50,.50,.50); +INSERT INTO qatabledouble_v2 VALUES (.49,.49,.49); +INSERT INTO qatabledouble_v2 VALUES (.0,.0,.0); +SELECT * FROM qatabledouble_v2; +col1 col2 
col3 +-0.5 -0.5 -0.5 +-0.49 -0.49 -0.49 +0.49 0.49 0.49 +0.5 0.5 0.5 +8 8 8 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0 0 0 +-0.49 -0.49 -0.49 +-0.5 -0.5 -0.5 +0 0 0 +-0.49 -0.49 -0.49 +-0.5 -0.5 -0.5 +-8 -8 -8 +8 8 8 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +CREATE TABLE qatablefloat_v3 (col1 float, col2 float, col3 float) ; +INSERT INTO qatablefloat_v3 VALUES (-0.50, -0.50, -0.50); +INSERT INTO qatablefloat_v3 VALUES (-0.49, -0.49, -0.49); +INSERT INTO qatablefloat_v3 VALUES (0.49, 0.49, 0.49); +INSERT INTO qatablefloat_v3 VALUES (0.50, 0.50, 0.50); +INSERT INTO qatablefloat_v3 VALUES (+8,+8,+8); +INSERT INTO qatablefloat_v3 VALUES (+0.50,+0.50,+0.50); +INSERT INTO qatablefloat_v3 VALUES (+0.49,+0.49,+0.49); +INSERT INTO qatablefloat_v3 VALUES (+0.0,+0.0,+0.0); +INSERT INTO qatablefloat_v3 VALUES (+.50,+.50,+.50); +INSERT INTO qatablefloat_v3 VALUES (+.49,+.49,+.49); +INSERT INTO qatablefloat_v3 VALUES (+.0,+.0,+.0); +INSERT INTO qatablefloat_v3 VALUES (-.0,-.0,-.0); +INSERT INTO qatablefloat_v3 VALUES (-.49,-.49,-.49); +INSERT INTO qatablefloat_v3 VALUES (-.50,-.50,-.50); +INSERT INTO qatablefloat_v3 VALUES (-0.0,-0.0,-0.0); +INSERT INTO qatablefloat_v3 VALUES (-0.49,-0.49,-0.49); +INSERT INTO qatablefloat_v3 VALUES (-0.50,-0.50,-0.50); +INSERT INTO qatablefloat_v3 VALUES (-8,-8,-8); +INSERT INTO qatablefloat_v3 VALUES (8,8,8); +INSERT INTO qatablefloat_v3 VALUES (0.50,0.50,0.50); +INSERT INTO qatablefloat_v3 VALUES (0.49,0.49,0.49); +INSERT INTO qatablefloat_v3 VALUES (0.0,0.0,0.0); +INSERT INTO qatablefloat_v3 VALUES (.50,.50,.50); +INSERT INTO qatablefloat_v3 VALUES (.49,.49,.49); +INSERT INTO qatablefloat_v3 VALUES (.0,.0,.0); +SELECT * FROM qatablefloat_v3; +col1 col2 col3 +-0.5 -0.5 -0.5 +-0.49 -0.49 -0.49 +0.49 0.49 0.49 +0.5 0.5 0.5 +8 8 8 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0 0 0 +-0.49 -0.49 -0.49 +-0.5 -0.5 -0.5 +0 0 0 +-0.49 -0.49 -0.49 +-0.5 -0.5 -0.5 +-8 -8 -8 +8 8 8 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +0.5 0.5 0.5 +0.49 0.49 0.49 +0 0 0 +DROP DATABASE `double_float`; diff --git a/mysql-test/columnstore/basic/t/double_float.test b/mysql-test/columnstore/basic/t/double_float.test new file mode 100644 index 000000000..e49719256 --- /dev/null +++ b/mysql-test/columnstore/basic/t/double_float.test @@ -0,0 +1,100 @@ +# +# Some double/float tests moved from regr. +# DML, DQL w/without filters +# + +-- source ../include/have_columnstore.inc + +--disable_warnings +DROP DATABASE IF EXISTS `double_float`; +--enable_warnings +CREATE DATABASE `double_float`; +USE `double_float`; + +SET default_storage_engine=Columnstore; +SELECT @@default_storage_engine; + +set autocommit=0; + +CREATE TABLE test1 (dkey int); +INSERT INTO test1 VALUES (1), (2), (3); +SELECT test1.dkey FROM test1 ORDER BY test1.dkey; + +CREATE TABLE qatabledouble (col DOUBLE) ; +CREATE TABLE qatablefloat (col float) ; +INSERT INTO qatabledouble VALUES (-2.225073858507201E-307); # Changed -308 to -307, -308 errors on qaftest7. 
+INSERT INTO qatabledouble VALUES (-1.807302187774382E-127); +INSERT INTO qatabledouble VALUES (0); +INSERT INTO qatabledouble VALUES (1.993777023789432E+21); +INSERT INTO qatabledouble VALUES (1.797693134862315E+38); +INSERT INTO qatabledouble VALUES (-19937770237894323221); +INSERT INTO qatabledouble VALUES (17976931348623158); +SELECT * FROM qatabledouble; +INSERT INTO qatabledouble VALUES (null); +INSERT INTO qatabledouble VALUES (null); +INSERT INTO qatabledouble VALUES (null); +SELECT * FROM qatabledouble; +INSERT INTO qatablefloat VALUES (null); +INSERT INTO qatablefloat VALUES (null); +INSERT INTO qatablefloat VALUES (null); +SELECT * FROM qatablefloat; +DELETE FROM qatabledouble WHERE col IS NULL; +SELECT * FROM qatabledouble; +delete FROM qatablefloat WHERE col IS NULL; +SELECT * FROM qatablefloat; +CREATE TABLE qatabledouble_v2 (col1 DOUBLE, col2 DOUBLE, col3 DOUBLE) ; +INSERT INTO qatabledouble_v2 VALUES (-0.50, -0.50, -0.50); +INSERT INTO qatabledouble_v2 VALUES (-0.49, -0.49, -0.49); +INSERT INTO qatabledouble_v2 VALUES (0.49, 0.49, 0.49); +INSERT INTO qatabledouble_v2 VALUES (0.50, 0.50, 0.50); +INSERT INTO qatabledouble_v2 VALUES (+8,+8,+8); +INSERT INTO qatabledouble_v2 VALUES (+0.50,+0.50,+0.50); +INSERT INTO qatabledouble_v2 VALUES (+0.49,+0.49,+0.49); +INSERT INTO qatabledouble_v2 VALUES (+0.0,+0.0,+0.0); +INSERT INTO qatabledouble_v2 VALUES (+.50,+.50,+.50); +INSERT INTO qatabledouble_v2 VALUES (+.49,+.49,+.49); +INSERT INTO qatabledouble_v2 VALUES (+.0,+.0,+.0); +INSERT INTO qatabledouble_v2 VALUES (-.0,-.0,-.0); +INSERT INTO qatabledouble_v2 VALUES (-.49,-.49,-.49); +INSERT INTO qatabledouble_v2 VALUES (-.50,-.50,-.50); +INSERT INTO qatabledouble_v2 VALUES (-0.0,-0.0,-0.0); +INSERT INTO qatabledouble_v2 VALUES (-0.49,-0.49,-0.49); +INSERT INTO qatabledouble_v2 VALUES (-0.50,-0.50,-0.50); +INSERT INTO qatabledouble_v2 VALUES (-8,-8,-8); +INSERT INTO qatabledouble_v2 VALUES (8,8,8); +INSERT INTO qatabledouble_v2 VALUES (0.50,0.50,0.50); +INSERT INTO qatabledouble_v2 VALUES (0.49,0.49,0.49); +INSERT INTO qatabledouble_v2 VALUES (0.0,0.0,0.0); +INSERT INTO qatabledouble_v2 VALUES (.50,.50,.50); +INSERT INTO qatabledouble_v2 VALUES (.49,.49,.49); +INSERT INTO qatabledouble_v2 VALUES (.0,.0,.0); +SELECT * FROM qatabledouble_v2; +CREATE TABLE qatablefloat_v3 (col1 float, col2 float, col3 float) ; +INSERT INTO qatablefloat_v3 VALUES (-0.50, -0.50, -0.50); +INSERT INTO qatablefloat_v3 VALUES (-0.49, -0.49, -0.49); +INSERT INTO qatablefloat_v3 VALUES (0.49, 0.49, 0.49); +INSERT INTO qatablefloat_v3 VALUES (0.50, 0.50, 0.50); +INSERT INTO qatablefloat_v3 VALUES (+8,+8,+8); +INSERT INTO qatablefloat_v3 VALUES (+0.50,+0.50,+0.50); +INSERT INTO qatablefloat_v3 VALUES (+0.49,+0.49,+0.49); +INSERT INTO qatablefloat_v3 VALUES (+0.0,+0.0,+0.0); +INSERT INTO qatablefloat_v3 VALUES (+.50,+.50,+.50); +INSERT INTO qatablefloat_v3 VALUES (+.49,+.49,+.49); +INSERT INTO qatablefloat_v3 VALUES (+.0,+.0,+.0); +INSERT INTO qatablefloat_v3 VALUES (-.0,-.0,-.0); +INSERT INTO qatablefloat_v3 VALUES (-.49,-.49,-.49); +INSERT INTO qatablefloat_v3 VALUES (-.50,-.50,-.50); +INSERT INTO qatablefloat_v3 VALUES (-0.0,-0.0,-0.0); +INSERT INTO qatablefloat_v3 VALUES (-0.49,-0.49,-0.49); +INSERT INTO qatablefloat_v3 VALUES (-0.50,-0.50,-0.50); +INSERT INTO qatablefloat_v3 VALUES (-8,-8,-8); +INSERT INTO qatablefloat_v3 VALUES (8,8,8); +INSERT INTO qatablefloat_v3 VALUES (0.50,0.50,0.50); +INSERT INTO qatablefloat_v3 VALUES (0.49,0.49,0.49); +INSERT INTO qatablefloat_v3 VALUES (0.0,0.0,0.0); +INSERT INTO 
qatablefloat_v3 VALUES (.50,.50,.50); +INSERT INTO qatablefloat_v3 VALUES (.49,.49,.49); +INSERT INTO qatablefloat_v3 VALUES (.0,.0,.0); +SELECT * FROM qatablefloat_v3; + +DROP DATABASE `double_float`; \ No newline at end of file diff --git a/primitives/linux-port/column.cpp b/primitives/linux-port/column.cpp index 64e4da1a9..f1533fe7e 100644 --- a/primitives/linux-port/column.cpp +++ b/primitives/linux-port/column.cpp @@ -50,16 +50,8 @@ using namespace execplan; namespace { -// WIP Move this using MT = uint16_t; -// Column filtering is dispatched 4-way based on the column type, -// which defines implementation of comparison operations for the column values -enum ENUM_KIND {KIND_DEFAULT, // compared as signed integers - KIND_UNSIGNED, // compared as unsigned integers - KIND_FLOAT, // compared as floating-point numbers - KIND_TEXT}; // whitespace-trimmed and then compared as signed integers - inline uint64_t order_swap(uint64_t x) { uint64_t ret = (x >> 56) | @@ -1086,16 +1078,16 @@ inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { - constexpr const uint16_t WIDTH = sizeof(T); - using SIMD_TYPE = typename VT::SIMD_TYPE; - SIMD_TYPE tmpStorageVector; + constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; + using SimdType = typename VT::SimdType; + SimdType tmpStorageVector; T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); // Saving values based on writeMask into tmp vec. // Min/Max processing. // The mask is 16 bit long and it describes N elements. // N = sizeof(vector type) / WIDTH. uint32_t j = 0; - for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) + for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { MT bitMapPosition = 1 << it; if (writeMask & bitMapPosition) @@ -1150,16 +1142,16 @@ inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { - constexpr const uint16_t WIDTH = sizeof(T); - using SIMD_TYPE = typename VT::SIMD_TYPE; - SIMD_TYPE tmpStorageVector; + constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; + using SimdType = typename VT::SimdType; + SimdType tmpStorageVector; T* tmpDstVecTPtr = reinterpret_cast(&tmpStorageVector); // Saving values based on writeMask into tmp vec. // Min/Max processing. // The mask is 16 bit long and it describes N elements. // N = sizeof(vector type) / WIDTH. uint32_t j = 0; - for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) + for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { MT bitMapPosition = 1 << it; if (writeMask & bitMapPosition) @@ -1197,13 +1189,13 @@ inline uint16_t vectWriteRIDValues(VT& processor, // SIMD processor MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs { - constexpr const uint16_t WIDTH = sizeof(T); + constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep; primitives::RIDType* origRIDDstArray = ridDstArray; // Saving values based on writeMask into tmp vec. // Min/Max processing. // The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH. 
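// Illustrative sketch (not part of the patch): how the loop below walks the
// 16-bit movemask. _mm_movemask_epi8 yields one bit per byte, so stepping the
// bit index by FilterMaskStep (sizeof(T) for integral lanes) tests the low
// byte of each T-wide lane exactly once. countSelectedLanes is a hypothetical name.
#include <cstdint>
template <typename T>
int countSelectedLanes(uint16_t writeMask)
{
  constexpr uint16_t step = sizeof(T);        // stands in for VT::FilterMaskStep
  int selected = 0;
  for (uint32_t it = 0; it < 16; it += step)  // 16 == vector byte size
    if (writeMask & (1u << it))
      ++selected;                             // lane it/step passed the filter
  return selected;
}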
uint16_t j = 0; - for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) + for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep) { MT bitMapPosition = 1 << it; if (writeMask & (1 << it)) @@ -1348,12 +1340,11 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray, { constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; - using SIMD_TYPE = typename VT::SIMD_TYPE; - SIMD_TYPE result; + using SimdType = typename VT::SimdType; + SimdType result; T* resultTypedPtr = reinterpret_cast(&result); for (uint32_t i = 0; i < VECTOR_SIZE; ++i) { - //std::cout << " simdDataLoadTemplate ridArray[ridArrayOffset] " << (int8_t) origSrcArray[ridArray[i]] << " ridArray[i] " << ridArray[i] << "\n"; resultTypedPtr[i] = origSrcArray[ridArray[i]]; } @@ -1378,12 +1369,13 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, T Min, T Max, const bool isNullValueMatches) { constexpr const uint16_t WIDTH = sizeof(T); - using SIMD_TYPE = typename VT::SIMD_TYPE; - using SIMD_WRAPPER_TYPE = typename VT::SIMD_WRAPPER_TYPE; + using SimdType = typename VT::SimdType; + using SimdWrapperType = typename VT::SimdWrapperType; + using FilterType = typename VT::FilterType; VT simdProcessor; - SIMD_TYPE dataVec; - SIMD_TYPE emptyFilterArgVec = simdProcessor.loadValue(emptyValue); - SIMD_TYPE nullFilterArgVec = simdProcessor.loadValue(nullValue); + SimdType dataVec; + SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue); + SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue); MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask; MT initFilterMask = 0xFFFF; primitives::RIDType rid = 0; @@ -1397,18 +1389,16 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, ColumnFilterMode columnFilterMode = ALWAYS_TRUE; const ST* filterSet = nullptr; const ParsedColumnFilter::RFsType* filterRFs = nullptr; - uint8_t outputType = in->OutputType; - constexpr uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; // If there are RIDs use its number to get a number of vectorized iterations. uint16_t iterNumber = HAS_INPUT_RIDS ? ridSize / VECTOR_SIZE : srcSize / VECTOR_SIZE; uint32_t filterCount = 0; // These pragmas are to silence GCC warnings - // warning: ignoring attributes on template argument + // warning: ignoring attributes on template argument #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" - std::vector filterArgsVectors; + std::vector filterArgsVectors; auto ptrA = std::mem_fn(&VT::cmpEq); using COPType = decltype(ptrA); std::vector copFunctorVec; @@ -1452,15 +1442,20 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, for (uint32_t j = 0; j < filterCount; ++j) { // Preload filter argument values only once. 
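// A hedged note on the cast in the next hunk: for FLOAT/DOUBLE columns the
// 64-bit filter slots already hold raw IEEE-754 bit patterns, so the value
// must be reinterpreted bit-for-bit rather than converted. A minimal sketch,
// with a hypothetical helper name; static_cast<double>(slot) would be the
// wrong operation here because it converts the integer's numeric value.
#include <cstdint>
#include <cstring>
inline double filterSlotAsDouble(int64_t slot)
{
  double d;
  std::memcpy(&d, &slot, sizeof(d)); // bit reinterpretation, no int-to-double conversion
  return d;
}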
- filterArgsVectors[j] = simdProcessor.loadValue(filterValues[j]); + filterArgsVectors[j] = simdProcessor.loadValue(*((FilterType*)&filterValues[j])); switch(filterCOPs[j]) { case(COMPARE_EQ): - copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); + // Skipping extra filter pass generated by IS NULL + if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0) + copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq)); + else + copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); break; case(COMPARE_GE): copFunctorVec.push_back(std::mem_fn(&VT::cmpGe)); break; + case(COMPARE_GT): copFunctorVec.push_back(std::mem_fn(&VT::cmpGt)); break; @@ -1495,12 +1490,11 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, { primitives::RIDType ridOffset = i * VECTOR_SIZE; assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset)); - dataVec = simdDataLoadTemplate(simdProcessor, srcArray, origSrcArray, ridArray, i).v; - // empty check - nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec); + dataVec = simdDataLoadTemplate(simdProcessor, srcArray, origSrcArray, ridArray, i).v; + nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec); writeMask = nonEmptyMask; // NULL check - nonNullMask = simdProcessor.cmpNe(dataVec, nullFilterArgVec); + nonNullMask = simdProcessor.nullEmptyCmpNe(dataVec, nullFilterArgVec); // Exclude NULLs from the resulting set if NULL doesn't match the filters. writeMask = isNullValueMatches ? writeMask : writeMask & nonNullMask; nonNullOrEmptyMask = nonNullMask & nonEmptyMask; @@ -1526,7 +1520,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, // outside the scope of the memory allocated to out msg. // vectWriteColValues is empty if outputMode == OT_RID. uint16_t valuesWritten = - vectWriteColValues(simdProcessor, + vectWriteColValues(simdProcessor, writeMask, nonNullOrEmptyMask, validMinMax, @@ -1563,6 +1557,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, // Set the number of output values here b/c tail processing can skip this operation. out->NVALS = totalValuesWritten; + // WIP Remove this block // Write captured Min/Max values to *out out->ValidMinMax = validMinMax; if (validMinMax) @@ -1581,17 +1576,17 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out, } // This routine dispatches template function calls to reduce branching. -template +template void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out, - const T* srcArray, const uint32_t srcSize, uint16_t* ridArray, + const STORAGE_TYPE* srcArray, const uint32_t srcSize, uint16_t* ridArray, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, - const bool validMinMax, const T emptyValue, const T nullValue, - T Min, T Max, const bool isNullValueMatches) + const bool validMinMax, const STORAGE_TYPE emptyValue, const STORAGE_TYPE nullValue, + STORAGE_TYPE Min, STORAGE_TYPE Max, const bool isNullValueMatches) { - constexpr const uint8_t WIDTH = sizeof(T); - // TODO make a SFINAE template switch for the class template spec. - using SIMD_TYPE = simd::vi128_wr; - using VT = typename simd::SimdFilterProcessor; + // Using struct to dispatch SIMD type based on integral type T. + using SimdType = typename simd::IntegralToSIMD::type; + using FilterType = typename simd::StorageToFiltering::type; + using VT = typename simd::SimdFilterProcessor; bool hasInputRIDs = (in->NVALS > 0) ? 
true : false; if (hasInputRIDs) { @@ -1599,25 +1594,25 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out switch (in->OutputType) { case OT_RID: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_BOTH: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_TOKEN: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_DATAVALUE: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); @@ -1630,25 +1625,25 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out switch (in->OutputType) { case OT_RID: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_BOTH: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_TOKEN: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); break; case OT_DATAVALUE: - vectorizedFiltering(in, out, + vectorizedFiltering(in, out, srcArray, srcSize, ridArray, ridSize, parsedColumnFilter, validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); @@ -1718,8 +1713,8 @@ void filterColumnData( // all values w/o any filter(even empty values filter) applied. #if defined(__x86_64__ ) - // Don't use vectorized filtering for non-integer based data types wider than 16 bytes. - if (KIND < KIND_FLOAT && WIDTH < 16) + // Don't use vectorized filtering for text based data types. + if (KIND <= KIND_FLOAT && WIDTH < 16) { bool canUseFastFiltering = true; for (uint32_t i = 0; i < filterCount; ++i) @@ -1784,7 +1779,6 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in, auto dataType = (execplan::CalpontSystemCatalog::ColDataType) in->colType.DataType; if (dataType == execplan::CalpontSystemCatalog::FLOAT) { -// WIP make this inline function const uint16_t ridSize = in->NVALS; uint16_t* ridArray = in->getRIDArrayPtr(W); const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE diff --git a/primitives/linux-port/primitiveprocessor.h b/primitives/linux-port/primitiveprocessor.h index fc54d5371..aaa4f9d00 100644 --- a/primitives/linux-port/primitiveprocessor.h +++ b/primitives/linux-port/primitiveprocessor.h @@ -169,6 +169,7 @@ class ParsedColumnFilter using RFsType = uint8_t; static constexpr uint32_t noSetFilterThreshold = 8; ColumnFilterMode columnFilterMode; + // Very unfortunately prestored_argVals can also be used to store double/float values. 
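// A hedged sketch of the compile-time dispatch that getFilterVals() below
// appears to use: enable_if selects the overload, and floating-point requests
// reinterpret the int64 slots in place. ArgStore/vals are illustrative
// stand-ins, not names from this header.
#include <cstdint>
#include <type_traits>
struct ArgStore
{
  int64_t* slots; // mirrors the role of prestored_argVals
  template <typename T, typename std::enable_if<std::is_floating_point<T>::value, T>::type* = nullptr>
  T* vals() { return reinterpret_cast<T*>(slots); } // view the raw bits as float/double
  template <typename T, typename std::enable_if<std::is_integral<T>::value, T>::type* = nullptr>
  T* vals() { return reinterpret_cast<T*>(slots); } // integral callers get the slots directly
};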
boost::shared_array prestored_argVals; boost::shared_array prestored_argVals128; boost::shared_array prestored_cops; @@ -184,7 +185,7 @@ class ParsedColumnFilter typename std::enable_if::value, T>::type* = nullptr> T* getFilterVals() { - return prestored_argVals.get(); + return reinterpret_cast(prestored_argVals.get()); } templateNVALS, 9); for (i = 0; i < out->NVALS; i++) - ASSERT_EQ(results[i], 11 + (uint32_t)i); + { + ASSERT_EQ(results[i], 11 + (uint32_t)i); + } EXPECT_EQ(out->Max, __col4block_cdf_umax); EXPECT_EQ(out->Min, __col4block_cdf_umin); @@ -868,7 +870,7 @@ TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegFloat2CompFiltersOutputBoth) } //void p_Col_neg_double_1() -TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegDouble2CompFilters) +TEST_F(ColumnScanFilterTest, ColumnScan8BytesNegDouble2CompFilters) { constexpr const uint8_t W = 8; using IntegralType = double; diff --git a/utils/common/simd_sse.h b/utils/common/simd_sse.h index 494deaa2e..f7411dc4f 100644 --- a/utils/common/simd_sse.h +++ b/utils/common/simd_sse.h @@ -15,10 +15,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#ifndef UTILS_SIMD_SSE_H -#define UTILS_SIMD_SSE_H +#pragma once -#if defined(__x86_64__ ) +#if defined(__x86_64__) #include #include @@ -37,437 +36,867 @@ #include +// Column filtering is dispatched 4-way based on the column type, +// which defines implementation of comparison operations for the column values +enum ENUM_KIND {KIND_DEFAULT, // compared as signed integers + KIND_UNSIGNED, // compared as unsigned integers + KIND_FLOAT, // compared as floating-point numbers + KIND_TEXT}; // whitespace-trimmed and then compared as signed integers + namespace simd { using vi128_t = __m128i; - using msk128_t = uint16_t; + using vi128f_t = __m128; + using vi128d_t = __m128d; using int128_t = __int128; using MT = uint16_t; - // This ugly wrapper used to allow to use __m128i as a template class parameter argument + // These ugly wrappers are used to allow to use __m128* as template class parameter argument struct vi128_wr { __m128i v; }; - template - class SimdFilterProcessor - { }; + struct vi128f_wr + { + __m128 v; + }; - template<> - class SimdFilterProcessor + struct vi128d_wr + { + __m128d v; + }; + + template + struct IntegralToSIMD; + + template + struct IntegralToSIMD::type> + { + using type = vi128d_wr; + }; + + template + struct IntegralToSIMD::type> + { + using type = vi128f_wr; + }; + + template + struct IntegralToSIMD::type> + { + using type = vi128_wr; + }; + + template + struct StorageToFiltering; + + template + struct StorageToFiltering::type> + { + using type = double; + }; + + template + struct StorageToFiltering::type> + { + using type = float; + }; + + template + struct StorageToFiltering::type> + { + using type = T; + }; + + template + class SimdFilterProcessor; + + // Dummy class that captures all impossible cases, e.g. integer vector as VT and flot as CHECK_T. + template + class SimdFilterProcessor::value && sizeof(CHECK_T) == 16) || + (std::is_same::value && !std::is_same::value && !std::is_same::value)>::type> { // This is a dummy class that is not currently used. 
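// A simplified mirror (hypothetical names) of the IntegralToSIMD /
// StorageToFiltering traits introduced above: a primary template plus an
// enable_if partial specialization map a storage type to its SIMD wrapper.
#include <type_traits>
struct I128Wrapper {};  // stand-in for vi128_wr
struct D128Wrapper {};  // stand-in for vi128d_wr
template <typename T, typename Enable = void>
struct ToSIMDDemo { using type = I128Wrapper; };  // integral widths default here
template <typename T>
struct ToSIMDDemo<T, typename std::enable_if<std::is_same<T, double>::value>::type>
{ using type = D128Wrapper; };
static_assert(std::is_same<ToSIMDDemo<double>::type, D128Wrapper>::value,
              "double dispatches to the packed-double wrapper");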
public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = int128_t; - using SIMD_WRAPPER_TYPE = simd::vi128_wr; - using SIMD_TYPE = simd::vi128_t; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value - MCS_FORCE_INLINE vi128_t loadValue(const T fill) + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) { - return _mm_loadu_si128(reinterpret_cast(&fill)); + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_loadu_si128(reinterpret_cast(&fill)); } // Load from - MCS_FORCE_INLINE vi128_t loadFrom(const char* from) + MCS_FORCE_INLINE SimdType loadFrom(const char* from) { - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); } - MCS_FORCE_INLINE MT cmpDummy(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y) { return 0xFFFF; } // Compare - MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return cmpDummy(x, y); } - MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; } + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + // misc - MCS_FORCE_INLINE uint16_t convertVectorToBitMask(vi128_t& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE vi128_t setToZero() + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpDummy(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpDummy(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_si128(); } // store - MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, vi128_t& x) + MCS_FORCE_INLINE void store(char* dst, SimdType& x) { - _mm_storeu_si128(reinterpret_cast(dst), x); + _mm_storeu_si128(reinterpret_cast(dst), x); } }; - template<> - class SimdFilterProcessor + template + class SimdFilterProcessor::value && std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = datatypes::WidthToSIntegralType<8>::type; - using SIMD_WRAPPER_TYPE = simd::vi128_wr; - using SIMD_TYPE = simd::vi128_t; + using FilterType = T; + using NullEmptySimdType = vi128_t; + using SimdWrapperType = simd::vi128d_wr; + using SimdType = simd::vi128d_t; + using 
StorageSimdType = simd::vi128_t; + using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type; + using StorageVecProcType = SimdFilterProcessor<vi128_wr, StorageType>; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. + constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value - MCS_FORCE_INLINE vi128_t loadValue(const T fill) + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + StorageVecProcType nullEmptyProcessor; + // This spec borrows the expression from the u-/int64-based processor class. + return (SimdType) nullEmptyProcessor.loadValue(fill); + }
+ + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set1_pd(fill); + }
+ + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_pd(reinterpret_cast<const double*>(from)); + }
+ + // Compare + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + }
+ + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + }
+ + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_pd(vmask); + }
+ + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + StorageVecProcType nullEmptyProcessor; + NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x); + NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y); + // This spec borrows the expression from the u-/int64-based processor class. + return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); + }
+ + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + StorageVecProcType nullEmptyProcessor; + NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x); + NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y); + // This spec borrows the expression from the u-/int64-based processor class. + return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr); + }
+ + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_pd(); + }
+ + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_pd(reinterpret_cast<double*>(dst), x); + }
+ };
+ + template <typename VT, typename T> + class SimdFilterProcessor<VT, T, typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type> + { + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using FilterType = T; + using NullEmptySimdType = vi128_t; + using SimdWrapperType = vi128f_wr; + using SimdType = vi128f_t; + using StorageSimdType = simd::vi128_t; + using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type; + using StorageVecProcType = SimdFilterProcessor<vi128_wr, StorageType>; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details.
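// Why nullEmptyCmpEq/Ne above reinterpret the register as integers, restated
// as a hedged standalone sketch: the NULL/EMPTY markers are magic bit
// patterns, and comparing them with cmp*_pd would obey IEEE semantics (a NaN
// encoding never compares equal), so the bits are compared exactly instead.
#include <immintrin.h>
#include <cstdint>
inline uint16_t bitwiseEqMask(__m128d x, __m128d y)
{
  __m128i xi = _mm_castpd_si128(x); // reinterpret lanes, no value conversion
  __m128i yi = _mm_castpd_si128(y);
  return (uint16_t)_mm_movemask_epi8(_mm_cmpeq_epi64(xi, yi));
}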
+ constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + StorageVecProcType nullEmptyProcessor; + // This spec borrows the expression from the u-/int64-based processor class. + return (SimdType) nullEmptyProcessor.loadValue(fill); + }
+ + MCS_FORCE_INLINE SimdType loadValue(const T fill) + { + return _mm_set1_ps(fill); + }
+ + // Load from + MCS_FORCE_INLINE SimdType loadFrom(const char* from) + { + return _mm_loadu_ps(reinterpret_cast<const float*>(from)); + }
+ + // Compare + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmple_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) + { + return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_ps(x, y)); + }
+ + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) + { + return 0; + }
+ + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + }
+ + // misc + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) + { + return _mm_movemask_ps(vmask); + }
+ + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + StorageVecProcType nullEmptyProcessor; + NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x); + NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y); + // This spec borrows the expression from the u-/int64-based processor class. + return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr); + }
+ + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + StorageVecProcType nullEmptyProcessor; + NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x); + NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y); + // This spec borrows the expression from the u-/int64-based processor class. + return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr); + }
+ + MCS_FORCE_INLINE SimdType setToZero() + { + return _mm_setzero_ps(); + }
+ + MCS_FORCE_INLINE void store(char* dst, SimdType& x) + { + _mm_storeu_ps(reinterpret_cast<float*>(dst), x); + }
+ };
+ + template <typename VT, typename CHECK_T> + class SimdFilterProcessor<VT, CHECK_T, typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 8 && !std::is_same<CHECK_T, double>::value>::type> + { + public: + constexpr static const uint16_t vecByteSize = 16U; + constexpr static const uint16_t vecBitSize = 128U; + using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details.
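// The "mask calculation differs" comment above, made concrete in a small
// hedged demo: movemask_epi8 emits one bit per byte (16 bits), while
// movemask_ps emits one bit per float lane (4 bits). FilterMaskStep encodes
// the stride with which a given processor's masks must be walked.
#include <immintrin.h>
#include <cstdint>
inline void maskGranularityDemo()
{
  __m128 a = _mm_set1_ps(1.0f);
  __m128 b = _mm_set1_ps(1.0f);
  __m128 eq = _mm_cmpeq_ps(a, b);                        // all four lanes equal
  int perLane = _mm_movemask_ps(eq);                     // 0x000F: 4 lane bits
  int perByte = _mm_movemask_epi8(_mm_castps_si128(eq)); // 0xFFFF: 16 byte bits
  (void)perLane;
  (void)perByte;
}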
+ constexpr static const uint16_t FilterMaskStep = sizeof(T); + // Load value + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) { return _mm_set_epi64x(fill, fill); } // Load from - MCS_FORCE_INLINE vi128_t loadFrom(const char* from) + MCS_FORCE_INLINE SimdType loadFrom(const char* from) { - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); } // Compare - MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi64(x, y),_mm_cmpeq_epi64(x, y))); } - MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpgt_epi64(x, y)); } - MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)); } - MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return cmpNe(x, y) ^ cmpGt(x, y); } - MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; } + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE vi128_t setToZero() + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_si128(); } + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + // store - MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, vi128_t& x) + MCS_FORCE_INLINE void store(char* dst, SimdType& x) { - _mm_storeu_si128(reinterpret_cast(dst), x); + _mm_storeu_si128(reinterpret_cast(dst), x); } }; - template<> - class SimdFilterProcessor + template + class SimdFilterProcessor::value && + sizeof(CHECK_T) == 4 && !std::is_same::value>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = datatypes::WidthToSIntegralType<4>::type; - using SIMD_WRAPPER_TYPE = simd::vi128_wr; - using SIMD_TYPE = simd::vi128_t; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. 
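// Mask algebra used by the 64-bit lane comparisons above, restated as a
// hedged sketch: SSE4 offers cmpgt/cmpeq for epi64 but no cmplt/cmple, so the
// remaining predicates are derived over the 16-bit byte mask.
#include <immintrin.h>
#include <cstdint>
inline uint16_t cmpLtEpi64Mask(__m128i x, __m128i y)
{
  uint16_t gt = (uint16_t)_mm_movemask_epi8(_mm_cmpgt_epi64(x, y));
  uint16_t ne = (uint16_t)(_mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF);
  return (uint16_t)(ne ^ gt); // lt == ne AND NOT gt; gt implies ne, so XOR suffices
}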
+ constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value - MCS_FORCE_INLINE vi128_t loadValue(const T fill) + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) { return _mm_set1_epi32(fill); } // Load from - MCS_FORCE_INLINE vi128_t loadFrom(const char* from) + MCS_FORCE_INLINE SimdType loadFrom(const char* from) { - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); } // Compare - MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)); } - MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return cmpLt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpgt_epi32(x, y)); } - MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmplt_epi32(x, y)); } - MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; } + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + + // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE vi128_t setToZero() + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_si128(); } // store - MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, vi128_t& x) + MCS_FORCE_INLINE void store(char* dst, SimdType& x) { - _mm_storeu_si128(reinterpret_cast(dst), x); + _mm_storeu_si128(reinterpret_cast(dst), x); } }; - template<> - class SimdFilterProcessor + template + class SimdFilterProcessor::value && sizeof(CHECK_T) == 2>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = datatypes::WidthToSIntegralType<2>::type; - using SIMD_WRAPPER_TYPE = simd::vi128_wr; - using SIMD_TYPE = simd::vi128_t; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = simd::vi128_wr; + using SimdType = simd::vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. 
+ constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value - MCS_FORCE_INLINE vi128_t loadValue(const T fill) + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) { return _mm_set1_epi16(fill); } // Load from - MCS_FORCE_INLINE vi128_t loadFrom(const char* from) + MCS_FORCE_INLINE SimdType loadFrom(const char* from) { - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); } // Compare - MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return cmpLt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmplt_epi16(x, y)); } - MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; } + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE vi128_t setToZero() + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_si128(); } // store - MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, vi128_t& x) + MCS_FORCE_INLINE void store(char* dst, SimdType& x) { - _mm_storeu_si128(reinterpret_cast(dst), x); + _mm_storeu_si128(reinterpret_cast(dst), x); } }; - template<> - class SimdFilterProcessor + template + class SimdFilterProcessor::value && sizeof(CHECK_T) == 1>::type> { public: constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecBitSize = 128U; - using T = datatypes::WidthToSIntegralType<1>::type; - using SIMD_WRAPPER_TYPE = simd::vi128_wr; - using SIMD_TYPE = simd::vi128_t; + using T = typename datatypes::WidthToSIntegralType::type; + using SimdWrapperType = vi128_wr; + using SimdType = vi128_t; + using FilterType = T; + using StorageType = T; + // Mask calculation for int and float types differs. + // See corresponding intrinsics algos for details. 
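// Hedged sketch of the load-side counterpart to the integer NULL checks: for
// float/double lanes the NULL/EMPTY magic value must be broadcast as raw bits
// (its numeric interpretation is irrelevant), which is what emptyNullLoadValue
// does in the float/double processors earlier in this header.
#include <immintrin.h>
#include <cstdint>
inline __m128d broadcastNullPattern(int64_t nullBits)
{
  // integer broadcast first, then reinterpret the lanes as doubles
  return _mm_castsi128_pd(_mm_set_epi64x(nullBits, nullBits));
}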
+ constexpr static const uint16_t FilterMaskStep = sizeof(T); // Load value - MCS_FORCE_INLINE vi128_t loadValue(const T fill) + MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill) + { + return loadValue(fill); + } + + MCS_FORCE_INLINE SimdType loadValue(const T fill) { return _mm_set1_epi8(fill); } // Load from - MCS_FORCE_INLINE vi128_t loadFrom(const char* from) + MCS_FORCE_INLINE SimdType loadFrom(const char* from) { - return _mm_loadu_si128(reinterpret_cast(from)); + return _mm_loadu_si128(reinterpret_cast(from)); } // Compare - MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y) { return cmpLt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y) { return cmpGt(x, y) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); } - MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y) { return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; } - MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) + MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y) { return 0; } + MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y) + { + return 0xFFFF; + } + // permute /* TODO Available in AVX-512 - MCS_FORCE_INLINE vi128_t perm8Bits(vi128_t& x, vi128_t& idx) + MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx) { return _mm_permutexvar_epi8(x, idx); } */ // misc - MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) + MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask) { return _mm_movemask_epi8(vmask); } - MCS_FORCE_INLINE vi128_t setToZero() + MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y) + { + return cmpNe(x, y); + } + + MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y) + { + return cmpEq(x, y); + } + + MCS_FORCE_INLINE SimdType setToZero() { return _mm_setzero_si128(); } // store - MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) + MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst) { _mm_maskmoveu_si128(x, vmask, dst); } - MCS_FORCE_INLINE void store(char* dst, vi128_t& x) + MCS_FORCE_INLINE void store(char* dst, SimdType& x) { - _mm_storeu_si128(reinterpret_cast(dst), x); + _mm_storeu_si128(reinterpret_cast(dst), x); } }; } // end of simd #endif // if defined(__x86_64__ ) - -#endif // vim:ts=2 sw=2:
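// End-to-end shape of the vectorized scan this patch enables for an 8-byte
// float column, as a hedged, self-contained sketch (NULL/EMPTY handling, RID
// arrays, and the scalar tail are omitted; filterGtScalar is an illustrative
// name, not an engine entry point):
#include <immintrin.h>
#include <cstdint>
inline int filterGtScalar(const double* src, int n, double bound, double* dst)
{
  int written = 0;
  __m128d arg = _mm_set1_pd(bound);                // preload the filter argument once
  for (int i = 0; i + 2 <= n; i += 2)
  {
    __m128d v = _mm_loadu_pd(src + i);             // load two lanes
    int m = _mm_movemask_pd(_mm_cmpgt_pd(v, arg)); // one bit per double lane
    for (int lane = 0; lane < 2; ++lane)
      if (m & (1 << lane))
        dst[written++] = src[i + lane];            // scatter the survivors
  }
  return written;
}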