1
0
mirror of https://github.com/mariadb-corporation/mariadb-columnstore-engine.git synced 2025-08-07 03:22:57 +03:00

MCOL-4809 This patch adds vectorized filtering and scanning support for float data types

This commit is contained in:
Roman Nozdrin
2021-11-01 15:19:19 +00:00
committed by Roman Nozdrin
parent b1beb631c1
commit f7417c0b10
7 changed files with 876 additions and 174 deletions

View File

@@ -0,0 +1,176 @@
DROP DATABASE IF EXISTS `double_float`;
CREATE DATABASE `double_float`;
USE `double_float`;
SET default_storage_engine=Columnstore;
SELECT @@default_storage_engine;
@@default_storage_engine
Columnstore
set autocommit=0;
CREATE TABLE test1 (dkey int);
INSERT INTO test1 VALUES (1), (2), (3);
SELECT test1.dkey FROM test1 ORDER BY test1.dkey;
dkey
1
2
3
CREATE TABLE qatabledouble (col DOUBLE) ;
CREATE TABLE qatablefloat (col float) ;
INSERT INTO qatabledouble VALUES (-2.225073858507201E-307);
INSERT INTO qatabledouble VALUES (-1.807302187774382E-127);
INSERT INTO qatabledouble VALUES (0);
INSERT INTO qatabledouble VALUES (1.993777023789432E+21);
INSERT INTO qatabledouble VALUES (1.797693134862315E+38);
INSERT INTO qatabledouble VALUES (-19937770237894323221);
INSERT INTO qatabledouble VALUES (17976931348623158);
SELECT * FROM qatabledouble;
col
-2.225073858507201e-307
-1.807302187774382e-127
0
1.993777023789432e21
1.797693134862315e38
-1.9937770237894324e19
1.7976931348623158e16
INSERT INTO qatabledouble VALUES (null);
INSERT INTO qatabledouble VALUES (null);
INSERT INTO qatabledouble VALUES (null);
SELECT * FROM qatabledouble;
col
-2.225073858507201e-307
-1.807302187774382e-127
0
1.993777023789432e21
1.797693134862315e38
-1.9937770237894324e19
1.7976931348623158e16
NULL
NULL
NULL
INSERT INTO qatablefloat VALUES (null);
INSERT INTO qatablefloat VALUES (null);
INSERT INTO qatablefloat VALUES (null);
SELECT * FROM qatablefloat;
col
NULL
NULL
NULL
DELETE FROM qatabledouble WHERE col IS NULL;
SELECT * FROM qatabledouble;
col
-2.225073858507201e-307
-1.807302187774382e-127
0
1.993777023789432e21
1.797693134862315e38
-1.9937770237894324e19
1.7976931348623158e16
delete FROM qatablefloat WHERE col IS NULL;
SELECT * FROM qatablefloat;
col
CREATE TABLE qatabledouble_v2 (col1 DOUBLE, col2 DOUBLE, col3 DOUBLE) ;
INSERT INTO qatabledouble_v2 VALUES (-0.50, -0.50, -0.50);
INSERT INTO qatabledouble_v2 VALUES (-0.49, -0.49, -0.49);
INSERT INTO qatabledouble_v2 VALUES (0.49, 0.49, 0.49);
INSERT INTO qatabledouble_v2 VALUES (0.50, 0.50, 0.50);
INSERT INTO qatabledouble_v2 VALUES (+8,+8,+8);
INSERT INTO qatabledouble_v2 VALUES (+0.50,+0.50,+0.50);
INSERT INTO qatabledouble_v2 VALUES (+0.49,+0.49,+0.49);
INSERT INTO qatabledouble_v2 VALUES (+0.0,+0.0,+0.0);
INSERT INTO qatabledouble_v2 VALUES (+.50,+.50,+.50);
INSERT INTO qatabledouble_v2 VALUES (+.49,+.49,+.49);
INSERT INTO qatabledouble_v2 VALUES (+.0,+.0,+.0);
INSERT INTO qatabledouble_v2 VALUES (-.0,-.0,-.0);
INSERT INTO qatabledouble_v2 VALUES (-.49,-.49,-.49);
INSERT INTO qatabledouble_v2 VALUES (-.50,-.50,-.50);
INSERT INTO qatabledouble_v2 VALUES (-0.0,-0.0,-0.0);
INSERT INTO qatabledouble_v2 VALUES (-0.49,-0.49,-0.49);
INSERT INTO qatabledouble_v2 VALUES (-0.50,-0.50,-0.50);
INSERT INTO qatabledouble_v2 VALUES (-8,-8,-8);
INSERT INTO qatabledouble_v2 VALUES (8,8,8);
INSERT INTO qatabledouble_v2 VALUES (0.50,0.50,0.50);
INSERT INTO qatabledouble_v2 VALUES (0.49,0.49,0.49);
INSERT INTO qatabledouble_v2 VALUES (0.0,0.0,0.0);
INSERT INTO qatabledouble_v2 VALUES (.50,.50,.50);
INSERT INTO qatabledouble_v2 VALUES (.49,.49,.49);
INSERT INTO qatabledouble_v2 VALUES (.0,.0,.0);
SELECT * FROM qatabledouble_v2;
col1 col2 col3
-0.5 -0.5 -0.5
-0.49 -0.49 -0.49
0.49 0.49 0.49
0.5 0.5 0.5
8 8 8
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0 0 0
-0.49 -0.49 -0.49
-0.5 -0.5 -0.5
0 0 0
-0.49 -0.49 -0.49
-0.5 -0.5 -0.5
-8 -8 -8
8 8 8
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
CREATE TABLE qatablefloat_v3 (col1 float, col2 float, col3 float) ;
INSERT INTO qatablefloat_v3 VALUES (-0.50, -0.50, -0.50);
INSERT INTO qatablefloat_v3 VALUES (-0.49, -0.49, -0.49);
INSERT INTO qatablefloat_v3 VALUES (0.49, 0.49, 0.49);
INSERT INTO qatablefloat_v3 VALUES (0.50, 0.50, 0.50);
INSERT INTO qatablefloat_v3 VALUES (+8,+8,+8);
INSERT INTO qatablefloat_v3 VALUES (+0.50,+0.50,+0.50);
INSERT INTO qatablefloat_v3 VALUES (+0.49,+0.49,+0.49);
INSERT INTO qatablefloat_v3 VALUES (+0.0,+0.0,+0.0);
INSERT INTO qatablefloat_v3 VALUES (+.50,+.50,+.50);
INSERT INTO qatablefloat_v3 VALUES (+.49,+.49,+.49);
INSERT INTO qatablefloat_v3 VALUES (+.0,+.0,+.0);
INSERT INTO qatablefloat_v3 VALUES (-.0,-.0,-.0);
INSERT INTO qatablefloat_v3 VALUES (-.49,-.49,-.49);
INSERT INTO qatablefloat_v3 VALUES (-.50,-.50,-.50);
INSERT INTO qatablefloat_v3 VALUES (-0.0,-0.0,-0.0);
INSERT INTO qatablefloat_v3 VALUES (-0.49,-0.49,-0.49);
INSERT INTO qatablefloat_v3 VALUES (-0.50,-0.50,-0.50);
INSERT INTO qatablefloat_v3 VALUES (-8,-8,-8);
INSERT INTO qatablefloat_v3 VALUES (8,8,8);
INSERT INTO qatablefloat_v3 VALUES (0.50,0.50,0.50);
INSERT INTO qatablefloat_v3 VALUES (0.49,0.49,0.49);
INSERT INTO qatablefloat_v3 VALUES (0.0,0.0,0.0);
INSERT INTO qatablefloat_v3 VALUES (.50,.50,.50);
INSERT INTO qatablefloat_v3 VALUES (.49,.49,.49);
INSERT INTO qatablefloat_v3 VALUES (.0,.0,.0);
SELECT * FROM qatablefloat_v3;
col1 col2 col3
-0.5 -0.5 -0.5
-0.49 -0.49 -0.49
0.49 0.49 0.49
0.5 0.5 0.5
8 8 8
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0 0 0
-0.49 -0.49 -0.49
-0.5 -0.5 -0.5
0 0 0
-0.49 -0.49 -0.49
-0.5 -0.5 -0.5
-8 -8 -8
8 8 8
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
0.5 0.5 0.5
0.49 0.49 0.49
0 0 0
DROP DATABASE `double_float`;

View File

@@ -0,0 +1,100 @@
#
# DOUBLE/FLOAT column tests moved from regr.
# Covers DML (INSERT, DELETE) and DQL (SELECT), with and without filters.
#
# Every statement below is echoed into the recorded result file, so statement
# text must not change without re-recording the expected output.
#
# NOTE(review): presumably this include skips the test when the ColumnStore
# engine is unavailable - confirm against the include file.
-- source ../include/have_columnstore.inc
# Start from a clean database. Warnings are silenced around the DROP,
# presumably so that a missing database does not add a note to the output.
--disable_warnings
DROP DATABASE IF EXISTS `double_float`;
--enable_warnings
CREATE DATABASE `double_float`;
USE `double_float`;
# Make ColumnStore the default engine so the CREATE TABLE statements below
# need no ENGINE clause; the SELECT records the setting in the result file.
SET default_storage_engine=Columnstore;
SELECT @@default_storage_engine;
# NOTE(review): autocommit is switched off here and no COMMIT is issued
# anywhere in this script before the final DROP DATABASE.
set autocommit=0;
# Smoke test on an INT column before the floating-point cases.
CREATE TABLE test1 (dkey int);
INSERT INTO test1 VALUES (1), (2), (3);
SELECT test1.dkey FROM test1 ORDER BY test1.dkey;
# Single-column tables used for extreme magnitudes and NULL handling.
CREATE TABLE qatabledouble (col DOUBLE) ;
CREATE TABLE qatablefloat (col float) ;
# DOUBLE values: very small negative exponents, zero, large exponents, and
# long integer literals (17 and 20 digits) stored into a DOUBLE column.
INSERT INTO qatabledouble VALUES (-2.225073858507201E-307); # Changed -308 to -307, -308 errors on qaftest7.
INSERT INTO qatabledouble VALUES (-1.807302187774382E-127);
INSERT INTO qatabledouble VALUES (0);
INSERT INTO qatabledouble VALUES (1.993777023789432E+21);
INSERT INTO qatabledouble VALUES (1.797693134862315E+38);
INSERT INTO qatabledouble VALUES (-19937770237894323221);
INSERT INTO qatabledouble VALUES (17976931348623158);
# NOTE(review): this and the later "SELECT *" statements have no ORDER BY, so
# the recorded result relies on rows coming back in insertion order - confirm
# that the engine guarantees this.
SELECT * FROM qatabledouble;
# Append NULL rows to the DOUBLE table and read everything back.
INSERT INTO qatabledouble VALUES (null);
INSERT INTO qatabledouble VALUES (null);
INSERT INTO qatabledouble VALUES (null);
SELECT * FROM qatabledouble;
# The FLOAT table receives only NULL rows.
INSERT INTO qatablefloat VALUES (null);
INSERT INTO qatablefloat VALUES (null);
INSERT INTO qatablefloat VALUES (null);
SELECT * FROM qatablefloat;
# Filtered DML: remove the NULL rows with an IS NULL predicate. The DOUBLE
# table must keep its seven non-NULL rows.
DELETE FROM qatabledouble WHERE col IS NULL;
SELECT * FROM qatabledouble;
# The FLOAT table held only NULLs, so it must be empty after the DELETE.
delete FROM qatablefloat WHERE col IS NULL;
SELECT * FROM qatablefloat;
# Three DOUBLE columns loaded with the values 0, 0.49, 0.5 and 8 written in
# several literal spellings: explicit '+', explicit '-', no sign, and with
# or without the leading zero before the decimal point.
CREATE TABLE qatabledouble_v2 (col1 DOUBLE, col2 DOUBLE, col3 DOUBLE) ;
INSERT INTO qatabledouble_v2 VALUES (-0.50, -0.50, -0.50);
INSERT INTO qatabledouble_v2 VALUES (-0.49, -0.49, -0.49);
INSERT INTO qatabledouble_v2 VALUES (0.49, 0.49, 0.49);
INSERT INTO qatabledouble_v2 VALUES (0.50, 0.50, 0.50);
INSERT INTO qatabledouble_v2 VALUES (+8,+8,+8);
INSERT INTO qatabledouble_v2 VALUES (+0.50,+0.50,+0.50);
INSERT INTO qatabledouble_v2 VALUES (+0.49,+0.49,+0.49);
INSERT INTO qatabledouble_v2 VALUES (+0.0,+0.0,+0.0);
INSERT INTO qatabledouble_v2 VALUES (+.50,+.50,+.50);
INSERT INTO qatabledouble_v2 VALUES (+.49,+.49,+.49);
INSERT INTO qatabledouble_v2 VALUES (+.0,+.0,+.0);
INSERT INTO qatabledouble_v2 VALUES (-.0,-.0,-.0);
INSERT INTO qatabledouble_v2 VALUES (-.49,-.49,-.49);
INSERT INTO qatabledouble_v2 VALUES (-.50,-.50,-.50);
INSERT INTO qatabledouble_v2 VALUES (-0.0,-0.0,-0.0);
INSERT INTO qatabledouble_v2 VALUES (-0.49,-0.49,-0.49);
INSERT INTO qatabledouble_v2 VALUES (-0.50,-0.50,-0.50);
INSERT INTO qatabledouble_v2 VALUES (-8,-8,-8);
INSERT INTO qatabledouble_v2 VALUES (8,8,8);
INSERT INTO qatabledouble_v2 VALUES (0.50,0.50,0.50);
INSERT INTO qatabledouble_v2 VALUES (0.49,0.49,0.49);
INSERT INTO qatabledouble_v2 VALUES (0.0,0.0,0.0);
INSERT INTO qatabledouble_v2 VALUES (.50,.50,.50);
INSERT INTO qatabledouble_v2 VALUES (.49,.49,.49);
INSERT INTO qatabledouble_v2 VALUES (.0,.0,.0);
# Unfiltered scan of all 25 rows; negative-zero literals are expected to
# read back as plain 0 in the recorded result.
SELECT * FROM qatabledouble_v2;
# Same literal matrix as above, this time against three FLOAT columns.
CREATE TABLE qatablefloat_v3 (col1 float, col2 float, col3 float) ;
INSERT INTO qatablefloat_v3 VALUES (-0.50, -0.50, -0.50);
INSERT INTO qatablefloat_v3 VALUES (-0.49, -0.49, -0.49);
INSERT INTO qatablefloat_v3 VALUES (0.49, 0.49, 0.49);
INSERT INTO qatablefloat_v3 VALUES (0.50, 0.50, 0.50);
INSERT INTO qatablefloat_v3 VALUES (+8,+8,+8);
INSERT INTO qatablefloat_v3 VALUES (+0.50,+0.50,+0.50);
INSERT INTO qatablefloat_v3 VALUES (+0.49,+0.49,+0.49);
INSERT INTO qatablefloat_v3 VALUES (+0.0,+0.0,+0.0);
INSERT INTO qatablefloat_v3 VALUES (+.50,+.50,+.50);
INSERT INTO qatablefloat_v3 VALUES (+.49,+.49,+.49);
INSERT INTO qatablefloat_v3 VALUES (+.0,+.0,+.0);
INSERT INTO qatablefloat_v3 VALUES (-.0,-.0,-.0);
INSERT INTO qatablefloat_v3 VALUES (-.49,-.49,-.49);
INSERT INTO qatablefloat_v3 VALUES (-.50,-.50,-.50);
INSERT INTO qatablefloat_v3 VALUES (-0.0,-0.0,-0.0);
INSERT INTO qatablefloat_v3 VALUES (-0.49,-0.49,-0.49);
INSERT INTO qatablefloat_v3 VALUES (-0.50,-0.50,-0.50);
INSERT INTO qatablefloat_v3 VALUES (-8,-8,-8);
INSERT INTO qatablefloat_v3 VALUES (8,8,8);
INSERT INTO qatablefloat_v3 VALUES (0.50,0.50,0.50);
INSERT INTO qatablefloat_v3 VALUES (0.49,0.49,0.49);
INSERT INTO qatablefloat_v3 VALUES (0.0,0.0,0.0);
INSERT INTO qatablefloat_v3 VALUES (.50,.50,.50);
INSERT INTO qatablefloat_v3 VALUES (.49,.49,.49);
INSERT INTO qatablefloat_v3 VALUES (.0,.0,.0);
SELECT * FROM qatablefloat_v3;
# Clean up: dropping the database removes every table created above.
DROP DATABASE `double_float`;

View File

@@ -50,16 +50,8 @@ using namespace execplan;
namespace namespace
{ {
// WIP Move this
using MT = uint16_t; using MT = uint16_t;
// Column filtering is dispatched 4-way based on the column type,
// which defines implementation of comparison operations for the column values
enum ENUM_KIND {KIND_DEFAULT, // compared as signed integers
KIND_UNSIGNED, // compared as unsigned integers
KIND_FLOAT, // compared as floating-point numbers
KIND_TEXT}; // whitespace-trimmed and then compared as signed integers
inline uint64_t order_swap(uint64_t x) inline uint64_t order_swap(uint64_t x)
{ {
uint64_t ret = (x >> 56) | uint64_t ret = (x >> 56) |
@@ -1086,16 +1078,16 @@ inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor
primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs
primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs
{ {
constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
using SIMD_TYPE = typename VT::SIMD_TYPE; using SimdType = typename VT::SimdType;
SIMD_TYPE tmpStorageVector; SimdType tmpStorageVector;
T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector); T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
// Saving values based on writeMask into tmp vec. // Saving values based on writeMask into tmp vec.
// Min/Max processing. // Min/Max processing.
// The mask is 16 bit long and it describes N elements. // The mask is 16 bit long and it describes N elements.
// N = sizeof(vector type) / WIDTH. // N = sizeof(vector type) / WIDTH.
uint32_t j = 0; uint32_t j = 0;
for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
{ {
MT bitMapPosition = 1 << it; MT bitMapPosition = 1 << it;
if (writeMask & bitMapPosition) if (writeMask & bitMapPosition)
@@ -1150,16 +1142,16 @@ inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor
primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs primitives::RIDType* ridDstArray, // The actual dst arrray ptr to start writing RIDs
primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs
{ {
constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
using SIMD_TYPE = typename VT::SIMD_TYPE; using SimdType = typename VT::SimdType;
SIMD_TYPE tmpStorageVector; SimdType tmpStorageVector;
T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector); T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
// Saving values based on writeMask into tmp vec. // Saving values based on writeMask into tmp vec.
// Min/Max processing. // Min/Max processing.
// The mask is 16 bit long and it describes N elements. // The mask is 16 bit long and it describes N elements.
// N = sizeof(vector type) / WIDTH. // N = sizeof(vector type) / WIDTH.
uint32_t j = 0; uint32_t j = 0;
for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
{ {
MT bitMapPosition = 1 << it; MT bitMapPosition = 1 << it;
if (writeMask & bitMapPosition) if (writeMask & bitMapPosition)
@@ -1197,13 +1189,13 @@ inline uint16_t vectWriteRIDValues(VT& processor, // SIMD processor
MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values MT nonNullOrEmptyMask, // SIMD intrinsics inverce bitmask for NULL/EMPTY values
primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs primitives::RIDType* ridSrcArray) // The actual src array ptr to read RIDs
{ {
constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t FilterMaskStep = VT::FilterMaskStep;
primitives::RIDType* origRIDDstArray = ridDstArray; primitives::RIDType* origRIDDstArray = ridDstArray;
// Saving values based on writeMask into tmp vec. // Saving values based on writeMask into tmp vec.
// Min/Max processing. // Min/Max processing.
// The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH. // The mask is 16 bit long and it describes N elements where N = sizeof(vector type) / WIDTH.
uint16_t j = 0; uint16_t j = 0;
for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH) for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += FilterMaskStep)
{ {
MT bitMapPosition = 1 << it; MT bitMapPosition = 1 << it;
if (writeMask & (1 << it)) if (writeMask & (1 << it))
@@ -1348,12 +1340,11 @@ inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray,
{ {
constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t WIDTH = sizeof(T);
constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;
using SIMD_TYPE = typename VT::SIMD_TYPE; using SimdType = typename VT::SimdType;
SIMD_TYPE result; SimdType result;
T* resultTypedPtr = reinterpret_cast<T*>(&result); T* resultTypedPtr = reinterpret_cast<T*>(&result);
for (uint32_t i = 0; i < VECTOR_SIZE; ++i) for (uint32_t i = 0; i < VECTOR_SIZE; ++i)
{ {
//std::cout << " simdDataLoadTemplate ridArray[ridArrayOffset] " << (int8_t) origSrcArray[ridArray[i]] << " ridArray[i] " << ridArray[i] << "\n";
resultTypedPtr[i] = origSrcArray[ridArray[i]]; resultTypedPtr[i] = origSrcArray[ridArray[i]];
} }
@@ -1378,12 +1369,13 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
T Min, T Max, const bool isNullValueMatches) T Min, T Max, const bool isNullValueMatches)
{ {
constexpr const uint16_t WIDTH = sizeof(T); constexpr const uint16_t WIDTH = sizeof(T);
using SIMD_TYPE = typename VT::SIMD_TYPE; using SimdType = typename VT::SimdType;
using SIMD_WRAPPER_TYPE = typename VT::SIMD_WRAPPER_TYPE; using SimdWrapperType = typename VT::SimdWrapperType;
using FilterType = typename VT::FilterType;
VT simdProcessor; VT simdProcessor;
SIMD_TYPE dataVec; SimdType dataVec;
SIMD_TYPE emptyFilterArgVec = simdProcessor.loadValue(emptyValue); SimdType emptyFilterArgVec = simdProcessor.emptyNullLoadValue(emptyValue);
SIMD_TYPE nullFilterArgVec = simdProcessor.loadValue(nullValue); SimdType nullFilterArgVec = simdProcessor.emptyNullLoadValue(nullValue);
MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask; MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask;
MT initFilterMask = 0xFFFF; MT initFilterMask = 0xFFFF;
primitives::RIDType rid = 0; primitives::RIDType rid = 0;
@@ -1397,18 +1389,16 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
ColumnFilterMode columnFilterMode = ALWAYS_TRUE; ColumnFilterMode columnFilterMode = ALWAYS_TRUE;
const ST* filterSet = nullptr; const ST* filterSet = nullptr;
const ParsedColumnFilter::RFsType* filterRFs = nullptr; const ParsedColumnFilter::RFsType* filterRFs = nullptr;
uint8_t outputType = in->OutputType; uint8_t outputType = in->OutputType;
constexpr uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH; constexpr uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;
// If there are RIDs use its number to get a number of vectorized iterations. // If there are RIDs use its number to get a number of vectorized iterations.
uint16_t iterNumber = HAS_INPUT_RIDS ? ridSize / VECTOR_SIZE : srcSize / VECTOR_SIZE; uint16_t iterNumber = HAS_INPUT_RIDS ? ridSize / VECTOR_SIZE : srcSize / VECTOR_SIZE;
uint32_t filterCount = 0; uint32_t filterCount = 0;
// These pragmas are to silence GCC warnings // These pragmas are to silence GCC warnings
// warning: ignoring attributes on template argument // warning: ignoring attributes on template argument
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes" #pragma GCC diagnostic ignored "-Wignored-attributes"
std::vector<SIMD_TYPE> filterArgsVectors; std::vector<SimdType> filterArgsVectors;
auto ptrA = std::mem_fn(&VT::cmpEq); auto ptrA = std::mem_fn(&VT::cmpEq);
using COPType = decltype(ptrA); using COPType = decltype(ptrA);
std::vector<COPType> copFunctorVec; std::vector<COPType> copFunctorVec;
@@ -1452,15 +1442,20 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
for (uint32_t j = 0; j < filterCount; ++j) for (uint32_t j = 0; j < filterCount; ++j)
{ {
// Preload filter argument values only once. // Preload filter argument values only once.
filterArgsVectors[j] = simdProcessor.loadValue(filterValues[j]); filterArgsVectors[j] = simdProcessor.loadValue(*((FilterType*)&filterValues[j]));
switch(filterCOPs[j]) switch(filterCOPs[j])
{ {
case(COMPARE_EQ): case(COMPARE_EQ):
copFunctorVec.push_back(std::mem_fn(&VT::cmpEq)); // Skipping extra filter pass generated by IS NULL
if (memcmp(&filterValues[j], &nullValue, sizeof(nullValue)) == 0)
copFunctorVec.push_back(std::mem_fn(&VT::nullEmptyCmpEq));
else
copFunctorVec.push_back(std::mem_fn(&VT::cmpEq));
break; break;
case(COMPARE_GE): case(COMPARE_GE):
copFunctorVec.push_back(std::mem_fn(&VT::cmpGe)); copFunctorVec.push_back(std::mem_fn(&VT::cmpGe));
break; break;
case(COMPARE_GT): case(COMPARE_GT):
copFunctorVec.push_back(std::mem_fn(&VT::cmpGt)); copFunctorVec.push_back(std::mem_fn(&VT::cmpGt));
break; break;
@@ -1495,12 +1490,11 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
{ {
primitives::RIDType ridOffset = i * VECTOR_SIZE; primitives::RIDType ridOffset = i * VECTOR_SIZE;
assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset)); assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset));
dataVec = simdDataLoadTemplate<VT, SIMD_WRAPPER_TYPE, HAS_INPUT_RIDS, T>(simdProcessor, srcArray, origSrcArray, ridArray, i).v; dataVec = simdDataLoadTemplate<VT, SimdWrapperType, HAS_INPUT_RIDS, T>(simdProcessor, srcArray, origSrcArray, ridArray, i).v;
// empty check nonEmptyMask = simdProcessor.nullEmptyCmpNe(dataVec, emptyFilterArgVec);
nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec);
writeMask = nonEmptyMask; writeMask = nonEmptyMask;
// NULL check // NULL check
nonNullMask = simdProcessor.cmpNe(dataVec, nullFilterArgVec); nonNullMask = simdProcessor.nullEmptyCmpNe(dataVec, nullFilterArgVec);
// Exclude NULLs from the resulting set if NULL doesn't match the filters. // Exclude NULLs from the resulting set if NULL doesn't match the filters.
writeMask = isNullValueMatches ? writeMask : writeMask & nonNullMask; writeMask = isNullValueMatches ? writeMask : writeMask & nonNullMask;
nonNullOrEmptyMask = nonNullMask & nonEmptyMask; nonNullOrEmptyMask = nonNullMask & nonEmptyMask;
@@ -1526,7 +1520,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
// outside the scope of the memory allocated to out msg. // outside the scope of the memory allocated to out msg.
// vectWriteColValues is empty if outputMode == OT_RID. // vectWriteColValues is empty if outputMode == OT_RID.
uint16_t valuesWritten = uint16_t valuesWritten =
vectWriteColValues<T, VT, OUTPUT_TYPE, KIND, HAS_INPUT_RIDS>(simdProcessor, vectWriteColValues<T, VT, OUTPUT_TYPE, KIND, HAS_INPUT_RIDS>(simdProcessor,
writeMask, writeMask,
nonNullOrEmptyMask, nonNullOrEmptyMask,
validMinMax, validMinMax,
@@ -1563,6 +1557,7 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
// Set the number of output values here b/c tail processing can skip this operation. // Set the number of output values here b/c tail processing can skip this operation.
out->NVALS = totalValuesWritten; out->NVALS = totalValuesWritten;
// WIP Remove this block
// Write captured Min/Max values to *out // Write captured Min/Max values to *out
out->ValidMinMax = validMinMax; out->ValidMinMax = validMinMax;
if (validMinMax) if (validMinMax)
@@ -1581,17 +1576,17 @@ void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
} }
// This routine dispatches template function calls to reduce branching. // This routine dispatches template function calls to reduce branching.
template<typename T, ENUM_KIND KIND, typename FT, typename ST> template<typename STORAGE_TYPE, ENUM_KIND KIND, typename FT, typename ST>
void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out, void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out,
const T* srcArray, const uint32_t srcSize, uint16_t* ridArray, const STORAGE_TYPE* srcArray, const uint32_t srcSize, uint16_t* ridArray,
const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter, const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter,
const bool validMinMax, const T emptyValue, const T nullValue, const bool validMinMax, const STORAGE_TYPE emptyValue, const STORAGE_TYPE nullValue,
T Min, T Max, const bool isNullValueMatches) STORAGE_TYPE Min, STORAGE_TYPE Max, const bool isNullValueMatches)
{ {
constexpr const uint8_t WIDTH = sizeof(T); // Using struct to dispatch SIMD type based on integral type T.
// TODO make a SFINAE template switch for the class template spec. using SimdType = typename simd::IntegralToSIMD<STORAGE_TYPE, KIND>::type;
using SIMD_TYPE = simd::vi128_wr; using FilterType = typename simd::StorageToFiltering<STORAGE_TYPE, KIND>::type;
using VT = typename simd::SimdFilterProcessor<SIMD_TYPE, WIDTH>; using VT = typename simd::SimdFilterProcessor<SimdType, FilterType>;
bool hasInputRIDs = (in->NVALS > 0) ? true : false; bool hasInputRIDs = (in->NVALS > 0) ? true : false;
if (hasInputRIDs) if (hasInputRIDs)
{ {
@@ -1599,25 +1594,25 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out
switch (in->OutputType) switch (in->OutputType)
{ {
case OT_RID: case OT_RID:
vectorizedFiltering<T, VT, hasInput, OT_RID, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_RID, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_BOTH: case OT_BOTH:
vectorizedFiltering<T, VT, hasInput, OT_BOTH, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_BOTH, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_TOKEN: case OT_TOKEN:
vectorizedFiltering<T, VT, hasInput, OT_TOKEN, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_TOKEN, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_DATAVALUE: case OT_DATAVALUE:
vectorizedFiltering<T, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
@@ -1630,25 +1625,25 @@ void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out
switch (in->OutputType) switch (in->OutputType)
{ {
case OT_RID: case OT_RID:
vectorizedFiltering<T, VT, hasInput, OT_RID, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_RID, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_BOTH: case OT_BOTH:
vectorizedFiltering<T, VT, hasInput, OT_BOTH, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_BOTH, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_TOKEN: case OT_TOKEN:
vectorizedFiltering<T, VT, hasInput, OT_TOKEN, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_TOKEN, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
break; break;
case OT_DATAVALUE: case OT_DATAVALUE:
vectorizedFiltering<T, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(in, out, vectorizedFiltering<STORAGE_TYPE, VT, hasInput, OT_DATAVALUE, KIND, FT, ST>(in, out,
srcArray, srcSize, ridArray, ridSize, srcArray, srcSize, ridArray, ridSize,
parsedColumnFilter, parsedColumnFilter,
validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches); validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
@@ -1718,8 +1713,8 @@ void filterColumnData(
// all values w/o any filter(even empty values filter) applied. // all values w/o any filter(even empty values filter) applied.
#if defined(__x86_64__ ) #if defined(__x86_64__ )
// Don't use vectorized filtering for non-integer based data types wider than 16 bytes. // Don't use vectorized filtering for text based data types.
if (KIND < KIND_FLOAT && WIDTH < 16) if (KIND <= KIND_FLOAT && WIDTH < 16)
{ {
bool canUseFastFiltering = true; bool canUseFastFiltering = true;
for (uint32_t i = 0; i < filterCount; ++i) for (uint32_t i = 0; i < filterCount; ++i)
@@ -1784,7 +1779,6 @@ void PrimitiveProcessor::scanAndFilterTypeDispatcher(NewColRequestHeader* in,
auto dataType = (execplan::CalpontSystemCatalog::ColDataType) in->colType.DataType; auto dataType = (execplan::CalpontSystemCatalog::ColDataType) in->colType.DataType;
if (dataType == execplan::CalpontSystemCatalog::FLOAT) if (dataType == execplan::CalpontSystemCatalog::FLOAT)
{ {
// WIP make this inline function
const uint16_t ridSize = in->NVALS; const uint16_t ridSize = in->NVALS;
uint16_t* ridArray = in->getRIDArrayPtr(W); uint16_t* ridArray = in->getRIDArrayPtr(W);
const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE const uint32_t itemsPerBlock = logicalBlockMode ? BLOCK_SIZE

View File

@@ -169,6 +169,7 @@ class ParsedColumnFilter
using RFsType = uint8_t; using RFsType = uint8_t;
static constexpr uint32_t noSetFilterThreshold = 8; static constexpr uint32_t noSetFilterThreshold = 8;
ColumnFilterMode columnFilterMode; ColumnFilterMode columnFilterMode;
// Very unfortunately prestored_argVals can also be used to store double/float values.
boost::shared_array<int64_t> prestored_argVals; boost::shared_array<int64_t> prestored_argVals;
boost::shared_array<int128_t> prestored_argVals128; boost::shared_array<int128_t> prestored_argVals128;
boost::shared_array<CopsType> prestored_cops; boost::shared_array<CopsType> prestored_cops;
@@ -184,7 +185,7 @@ class ParsedColumnFilter
typename std::enable_if<std::is_same<T, int64_t>::value, T>::type* = nullptr> typename std::enable_if<std::is_same<T, int64_t>::value, T>::type* = nullptr>
T* getFilterVals() T* getFilterVals()
{ {
return prestored_argVals.get(); return reinterpret_cast<T*>(prestored_argVals.get());
} }
template<typename T, template<typename T,

View File

@@ -18,8 +18,8 @@
#ifndef HAVE_COL_DOUBLE_BLOCK #ifndef HAVE_COL_DOUBLE_BLOCK
#define HAVE_COL_DOUBLE_BLOCK #define HAVE_COL_DOUBLE_BLOCK
unsigned char ___bin_col_double_block_cdf[] = { unsigned char ___bin_col_double_block_cdf[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf8, 0x3f, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xfa, 0xff, 0xaa, 0xaa, 0xaa, 0xaa,
0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40, 0xaa, 0xaa, 0xfa, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x40,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x0c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x0c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x40, 0x00, 0x00, 0x00, 0x00,

View File

@@ -429,7 +429,9 @@ TEST_F(ColumnScanFilterTest, ColumnScan4Bytes2Filters)
ASSERT_EQ(out->NVALS, 9); ASSERT_EQ(out->NVALS, 9);
for (i = 0; i < out->NVALS; i++) for (i = 0; i < out->NVALS; i++)
ASSERT_EQ(results[i], 11 + (uint32_t)i); {
ASSERT_EQ(results[i], 11 + (uint32_t)i);
}
EXPECT_EQ(out->Max, __col4block_cdf_umax); EXPECT_EQ(out->Max, __col4block_cdf_umax);
EXPECT_EQ(out->Min, __col4block_cdf_umin); EXPECT_EQ(out->Min, __col4block_cdf_umin);
@@ -868,7 +870,7 @@ TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegFloat2CompFiltersOutputBoth)
} }
//void p_Col_neg_double_1() //void p_Col_neg_double_1()
TEST_F(ColumnScanFilterTest, ColumnScan4BytesNegDouble2CompFilters) TEST_F(ColumnScanFilterTest, ColumnScan8BytesNegDouble2CompFilters)
{ {
constexpr const uint8_t W = 8; constexpr const uint8_t W = 8;
using IntegralType = double; using IntegralType = double;

View File

@@ -15,10 +15,9 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA. */ MA 02110-1301, USA. */
#ifndef UTILS_SIMD_SSE_H #pragma once
#define UTILS_SIMD_SSE_H
#if defined(__x86_64__ ) #if defined(__x86_64__)
#include <cstdint> #include <cstdint>
#include <type_traits> #include <type_traits>
@@ -37,437 +36,867 @@
#include <mcs_datatype.h> #include <mcs_datatype.h>
// The kind of a column selects one of four comparison strategies that the
// column filtering code dispatches on.
enum ENUM_KIND
{
  KIND_DEFAULT,   // values are compared as signed integers
  KIND_UNSIGNED,  // values are compared as unsigned integers
  KIND_FLOAT,     // values are compared as floating-point numbers
  KIND_TEXT       // values are whitespace-trimmed, then compared as signed integers
};
namespace simd namespace simd
{ {
using vi128_t = __m128i; using vi128_t = __m128i;
using msk128_t = uint16_t; using vi128f_t = __m128;
using vi128d_t = __m128d;
using int128_t = __int128; using int128_t = __int128;
using MT = uint16_t; using MT = uint16_t;
// This ugly wrapper used to allow to use __m128i as a template class parameter argument // These ugly wrappers are used to allow to use __m128* as template class parameter argument
struct vi128_wr struct vi128_wr
{ {
__m128i v; __m128i v;
}; };
template<typename VT, int WIDTH> struct vi128f_wr
class SimdFilterProcessor {
{ }; __m128 v;
};
template<> struct vi128d_wr
class SimdFilterProcessor<vi128_wr, 16> {
__m128d v;
};
// Type trait: maps a column value type T together with its ENUM_KIND onto the
// wrapper struct of the 128-bit vector type used to process it.
// The primary template is declared but never defined, so a KIND_FLOAT type
// whose size matches neither float nor double has no usable `type`.
template <typename T, ENUM_KIND KIND, typename ENABLE = void>
struct IntegralToSIMD;

// Every kind except KIND_FLOAT is processed with the integer vector wrapper.
template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
{
  using type = vi128_wr;
};

// KIND_FLOAT with a T as wide as float: single-precision vector wrapper.
template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<
    T, KIND, typename std::enable_if<sizeof(T) == sizeof(float) && KIND == KIND_FLOAT>::type>
{
  using type = vi128f_wr;
};

// KIND_FLOAT with a T as wide as double: double-precision vector wrapper.
template <typename T, ENUM_KIND KIND>
struct IntegralToSIMD<
    T, KIND, typename std::enable_if<sizeof(T) == sizeof(double) && KIND == KIND_FLOAT>::type>
{
  using type = vi128d_wr;
};
// Type trait: maps the storage type T of a column together with its ENUM_KIND
// onto the scalar type that is used while filtering its values.
// The primary template is declared but never defined, so a KIND_FLOAT type
// whose size matches neither float nor double has no usable `type`.
template <typename T, ENUM_KIND KIND, typename ENABLE = void>
struct StorageToFiltering;

// Every kind except KIND_FLOAT is filtered using the storage type itself.
template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<T, KIND, typename std::enable_if<KIND != KIND_FLOAT>::type>
{
  using type = T;
};

// KIND_FLOAT with a T as wide as float is filtered as float.
template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<
    T, KIND, typename std::enable_if<sizeof(T) == sizeof(float) && KIND == KIND_FLOAT>::type>
{
  using type = float;
};

// KIND_FLOAT with a T as wide as double is filtered as double.
template <typename T, ENUM_KIND KIND>
struct StorageToFiltering<
    T, KIND, typename std::enable_if<sizeof(T) == sizeof(double) && KIND == KIND_FLOAT>::type>
{
  using type = double;
};
// Primary template of the SIMD filter processor. It is only declared here:
// VT is a vector wrapper type, T is the value type, and the defaulted ENABLE
// parameter is the hook the std::enable_if-based partial specializations use
// to select themselves.
template <typename VT, typename T, typename ENABLE = void>
class SimdFilterProcessor;
// Dummy class that captures all impossible cases, e.g. integer vector as VT and float as CHECK_T.
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<VT, CHECK_T,
typename std::enable_if<(std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 16) ||
(std::is_same<VT, vi128f_wr>::value && !std::is_same<CHECK_T, float>::value && !std::is_same<CHECK_T, double>::value)>::type>
{ {
// This is a dummy class that is not currently used. // This is a dummy class that is not currently used.
public: public:
constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U; constexpr static const uint16_t vecBitSize = 128U;
using T = int128_t; using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
using SIMD_WRAPPER_TYPE = simd::vi128_wr; using SimdWrapperType = vi128_wr;
using SIMD_TYPE = simd::vi128_t; using SimdType = vi128_t;
using FilterType = T;
using StorageType = T;
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value // Load value
MCS_FORCE_INLINE vi128_t loadValue(const T fill) MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(&fill)); return loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{
return _mm_loadu_si128(reinterpret_cast<const SimdType*>(&fill));
} }
// Load from // Load from
MCS_FORCE_INLINE vi128_t loadFrom(const char* from) MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from)); return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
} }
MCS_FORCE_INLINE MT cmpDummy(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpDummy(SimdType& x, SimdType& y)
{ {
return 0xFFFF; return 0xFFFF;
} }
// Compare // Compare
MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{ {
return cmpDummy(x, y); return cmpDummy(x, y);
} }
MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{ {
return 0; return 0;
} }
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// misc // misc
MCS_FORCE_INLINE uint16_t convertVectorToBitMask(vi128_t& vmask) MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{ {
return _mm_movemask_epi8(vmask); return _mm_movemask_epi8(vmask);
} }
MCS_FORCE_INLINE vi128_t setToZero() MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
return cmpDummy(x, y);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
return cmpDummy(x, y);
}
MCS_FORCE_INLINE SimdType setToZero()
{ {
return _mm_setzero_si128(); return _mm_setzero_si128();
} }
// store // store
MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
{ {
_mm_maskmoveu_si128(x, vmask, dst); _mm_maskmoveu_si128(x, vmask, dst);
} }
MCS_FORCE_INLINE void store(char* dst, vi128_t& x) MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{ {
_mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x); _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
} }
}; };
template<> template <typename VT, typename T>
class SimdFilterProcessor<vi128_wr, 8> class SimdFilterProcessor<VT, T,
typename std::enable_if<std::is_same<VT, vi128d_wr>::value && std::is_same<T, double>::value>::type>
{ {
public: public:
constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U; constexpr static const uint16_t vecBitSize = 128U;
using T = datatypes::WidthToSIntegralType<8>::type; using FilterType = T;
using SIMD_WRAPPER_TYPE = simd::vi128_wr; using NullEmptySimdType = vi128_t;
using SIMD_TYPE = simd::vi128_t; using SimdWrapperType = simd::vi128d_wr;
using SimdType = simd::vi128d_t;
using StorageSimdType = simd::vi128_t;
using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
using StorageVecProcType = SimdFilterProcessor<simd::vi128_wr, StorageType>;
// Mask calculation for int and float types differs.
// See corresponding intrinsics algos for details.
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value // Load value
MCS_FORCE_INLINE vi128_t loadValue(const T fill) MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{
StorageVecProcType nullEmptyProcessor;
// This spec borrows the expr from u-/int64 based processor class.
return (SimdType) nullEmptyProcessor.loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{
return _mm_set1_pd(fill);
}
// Load from
MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{
return _mm_loadu_pd(reinterpret_cast<const T*>(from));
}
// Compare
MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmpeq_pd(x, y));
}
MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmpge_pd(x,y));
}
MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmpgt_pd(x, y));
}
MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmple_pd(x, y));
}
MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmplt_pd(x, y));
}
MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{
return _mm_movemask_epi8((StorageSimdType)_mm_cmpneq_pd(x, y));
}
MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{
return 0;
}
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// misc
MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{
return _mm_movemask_pd(vmask);
}
MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
StorageVecProcType nullEmptyProcessor;
NullEmptySimdType* xAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&x);
NullEmptySimdType* yAsIntVecPtr = reinterpret_cast<NullEmptySimdType*>(&y);
// This spec borrows the expr from u-/int64 based processor class.
return nullEmptyProcessor.cmpNe(*xAsIntVecPtr, *yAsIntVecPtr);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
StorageVecProcType nullEmptyProcessor;
NullEmptySimdType* xAsIntVecPtr
= reinterpret_cast<NullEmptySimdType*>(&x);
NullEmptySimdType* yAsIntVecPtr
= reinterpret_cast<NullEmptySimdType*>(&y);
// This spec borrows the expr from u-/int64 based processor class.
return nullEmptyProcessor.cmpEq(*xAsIntVecPtr, *yAsIntVecPtr);
}
MCS_FORCE_INLINE SimdType setToZero()
{
return _mm_setzero_pd();
}
MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{
_mm_storeu_pd(reinterpret_cast<T*>(dst), x);
}
};
// SimdFilterProcessor specialization for single-precision columns. It is
// selected when VT is vi128f_wr and T is float, and works on 128-bit SSE
// registers holding float lanes.
//
// The cmp* members return the byte-wise mask produced by _mm_movemask_epi8,
// so a matching float lane contributes sizeof(T) consecutive set bits (hence
// FilterMaskStep). convertVectorToBitMask() differs: it uses _mm_movemask_ps,
// which yields a single bit per lane.
//
// Conversions between the float vector type and the integer vector type are
// done with the SSE2 cast intrinsics (_mm_castps_si128 / _mm_castsi128_ps).
// They reinterpret the register bits without changing them and, unlike
// C-style casts between vector types or pointer type-punning, do not depend
// on GNU vector extensions.
template <typename VT, typename T>
class SimdFilterProcessor<VT, T,
    typename std::enable_if<std::is_same<VT, vi128f_wr>::value && std::is_same<T, float>::value>::type>
{
 public:
  constexpr static const uint16_t vecByteSize = 16U;
  constexpr static const uint16_t vecBitSize = 128U;
  using FilterType = T;
  using NullEmptySimdType = vi128_t;
  using SimdWrapperType = vi128f_wr;
  using SimdType = vi128f_t;
  using StorageSimdType = simd::vi128_t;
  using StorageType = typename datatypes::WidthToSIntegralType<sizeof(T)>::type;
  // Integer processor of the same element width; used for NULL/EMPTY handling.
  using StorageVecProcType = SimdFilterProcessor<simd::vi128_wr, StorageType>;
  // Mask calculation for int and float types differs.
  // See corresponding intrinsics algos for details.
  constexpr static const uint16_t FilterMaskStep = sizeof(T);

  // Builds the vector used for NULL/EMPTY comparisons by delegating to the
  // integer processor of the same width and reinterpreting its result as a
  // float vector.
  // NOTE(review): fill is a float here and is implicitly converted to the
  // integer processor's argument type by the call below; presumably callers
  // pass the integer NULL/EMPTY magic value -- confirm against the call sites.
  MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
  {
    StorageVecProcType nullEmptyProcessor;
    return _mm_castsi128_ps(nullEmptyProcessor.loadValue(fill));
  }

  // Broadcasts fill into every float lane.
  MCS_FORCE_INLINE SimdType loadValue(const T fill)
  {
    return _mm_set1_ps(fill);
  }

  // Unaligned load of one vector from the bytes at from.
  MCS_FORCE_INLINE SimdType loadFrom(const char* from)
  {
    return _mm_loadu_ps(reinterpret_cast<const T*>(from));
  }

  // Lane-wise floating-point comparisons of x against y. Each returns the
  // byte-wise mask of the comparison result (see the class comment).
  MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmpeq_ps(x, y)));
  }

  MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmpge_ps(x, y)));
  }

  MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmpgt_ps(x, y)));
  }

  MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmple_ps(x, y)));
  }

  MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmplt_ps(x, y)));
  }

  MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
  {
    return _mm_movemask_epi8(_mm_castps_si128(_mm_cmpneq_ps(x, y)));
  }

  // Constant results: no byte selected / all 16 bytes selected.
  MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
  {
    return 0;
  }

  MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
  {
    return 0xFFFF;
  }

  // misc
  // Collects the sign bit of every float lane: one bit per lane, unlike the
  // byte-wise masks returned by the cmp* members.
  MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
  {
    return _mm_movemask_ps(vmask);
  }

  // NULL/EMPTY inequality check: compares the raw lane bits as integers using
  // the integer processor of the same width, not as floating-point values.
  MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    // Local copies are needed because the integer cmpNe takes lvalue references.
    NullEmptySimdType xAsIntVec = _mm_castps_si128(x);
    NullEmptySimdType yAsIntVec = _mm_castps_si128(y);
    return nullEmptyProcessor.cmpNe(xAsIntVec, yAsIntVec);
  }

  // NULL/EMPTY equality check: same bit-pattern comparison as nullEmptyCmpNe,
  // using the integer processor's cmpEq.
  MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
  {
    StorageVecProcType nullEmptyProcessor;
    // Local copies are needed because the integer cmpEq takes lvalue references.
    NullEmptySimdType xAsIntVec = _mm_castps_si128(x);
    NullEmptySimdType yAsIntVec = _mm_castps_si128(y);
    return nullEmptyProcessor.cmpEq(xAsIntVec, yAsIntVec);
  }

  // Returns a vector with every lane set to 0.0f.
  MCS_FORCE_INLINE SimdType setToZero()
  {
    return _mm_setzero_ps();
  }

  // Unaligned store of x to the bytes at dst.
  MCS_FORCE_INLINE void store(char* dst, SimdType& x)
  {
    _mm_storeu_ps(reinterpret_cast<T*>(dst), x);
  }
};
template <typename VT, typename CHECK_T>
class SimdFilterProcessor<VT, CHECK_T,
typename std::enable_if<std::is_same<VT, vi128_wr>::value &&
sizeof(CHECK_T) == 8 && !std::is_same<CHECK_T, double>::value>::type>
{
public:
constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U;
using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
using SimdWrapperType = vi128_wr;
using SimdType = vi128_t;
using FilterType = T;
using StorageType = T;
// Mask calculation for int and float types differs.
// See corresponding intrinsics algos for details.
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value
MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{
return loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{ {
return _mm_set_epi64x(fill, fill); return _mm_set_epi64x(fill, fill);
} }
// Load from // Load from
MCS_FORCE_INLINE vi128_t loadFrom(const char* from) MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from)); return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
} }
// Compare // Compare
MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi64(x, y),_mm_cmpeq_epi64(x, y))); return _mm_movemask_epi8(_mm_or_si128(_mm_cmpgt_epi64(x, y),_mm_cmpeq_epi64(x, y)));
} }
MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpgt_epi64(x, y)); return _mm_movemask_epi8(_mm_cmpgt_epi64(x, y));
} }
MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)); return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y));
} }
MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{ {
return cmpGt(x, y) ^ 0xFFFF; return cmpGt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{ {
return cmpNe(x, y) ^ cmpGt(x, y); return cmpNe(x, y) ^ cmpGt(x, y);
} }
MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF; return _mm_movemask_epi8(_mm_cmpeq_epi64(x, y)) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{ {
return 0; return 0;
} }
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// misc // misc
MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{ {
return _mm_movemask_epi8(vmask); return _mm_movemask_epi8(vmask);
} }
MCS_FORCE_INLINE vi128_t setToZero() MCS_FORCE_INLINE SimdType setToZero()
{ {
return _mm_setzero_si128(); return _mm_setzero_si128();
} }
MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
return cmpNe(x, y);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
return cmpEq(x, y);
}
// store // store
MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
{ {
_mm_maskmoveu_si128(x, vmask, dst); _mm_maskmoveu_si128(x, vmask, dst);
} }
MCS_FORCE_INLINE void store(char* dst, vi128_t& x) MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{ {
_mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x); _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
} }
}; };
template<> template <typename VT, typename CHECK_T>
class SimdFilterProcessor<vi128_wr, 4> class SimdFilterProcessor<VT, CHECK_T,
typename std::enable_if<std::is_same<VT, vi128_wr>::value &&
sizeof(CHECK_T) == 4 && !std::is_same<CHECK_T, float>::value>::type>
{ {
public: public:
constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U; constexpr static const uint16_t vecBitSize = 128U;
using T = datatypes::WidthToSIntegralType<4>::type; using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
using SIMD_WRAPPER_TYPE = simd::vi128_wr; using SimdWrapperType = vi128_wr;
using SIMD_TYPE = simd::vi128_t; using SimdType = vi128_t;
using FilterType = T;
using StorageType = T;
// Mask calculation for int and float types differs.
// See corresponding intrinsics algos for details.
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value // Load value
MCS_FORCE_INLINE vi128_t loadValue(const T fill) MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{
return loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{ {
return _mm_set1_epi32(fill); return _mm_set1_epi32(fill);
} }
// Load from // Load from
MCS_FORCE_INLINE vi128_t loadFrom(const char* from) MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from)); return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
} }
// Compare // Compare
MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)); return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y));
} }
MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{ {
return cmpLt(x, y) ^ 0xFFFF; return cmpLt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpgt_epi32(x, y)); return _mm_movemask_epi8(_mm_cmpgt_epi32(x, y));
} }
MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{ {
return cmpGt(x, y) ^ 0xFFFF; return cmpGt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmplt_epi32(x, y)); return _mm_movemask_epi8(_mm_cmplt_epi32(x, y));
} }
MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF; return _mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{ {
return 0; return 0;
} }
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// misc // misc
MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{ {
return _mm_movemask_epi8(vmask); return _mm_movemask_epi8(vmask);
} }
MCS_FORCE_INLINE vi128_t setToZero() MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
return cmpNe(x, y);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
return cmpEq(x, y);
}
MCS_FORCE_INLINE SimdType setToZero()
{ {
return _mm_setzero_si128(); return _mm_setzero_si128();
} }
// store // store
MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
{ {
_mm_maskmoveu_si128(x, vmask, dst); _mm_maskmoveu_si128(x, vmask, dst);
} }
MCS_FORCE_INLINE void store(char* dst, vi128_t& x) MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{ {
_mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x); _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
} }
}; };
template<> template <typename VT, typename CHECK_T>
class SimdFilterProcessor<vi128_wr, 2> class SimdFilterProcessor<VT, CHECK_T,
typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 2>::type>
{ {
public: public:
constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U; constexpr static const uint16_t vecBitSize = 128U;
using T = datatypes::WidthToSIntegralType<2>::type; using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
using SIMD_WRAPPER_TYPE = simd::vi128_wr; using SimdWrapperType = simd::vi128_wr;
using SIMD_TYPE = simd::vi128_t; using SimdType = simd::vi128_t;
using FilterType = T;
using StorageType = T;
// Mask calculation for int and float types differs.
// See corresponding intrinsics algos for details.
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value // Load value
MCS_FORCE_INLINE vi128_t loadValue(const T fill) MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{
return loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{ {
return _mm_set1_epi16(fill); return _mm_set1_epi16(fill);
} }
// Load from // Load from
MCS_FORCE_INLINE vi128_t loadFrom(const char* from) MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from)); return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
} }
// Compare // Compare
MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)); return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y));
} }
MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{ {
return cmpLt(x, y) ^ 0xFFFF; return cmpLt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y)); return _mm_movemask_epi8(_mm_cmpgt_epi16(x, y));
} }
MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{ {
return cmpGt(x, y) ^ 0xFFFF; return cmpGt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmplt_epi16(x, y)); return _mm_movemask_epi8(_mm_cmplt_epi16(x, y));
} }
MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF; return _mm_movemask_epi8(_mm_cmpeq_epi16(x, y)) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{ {
return 0; return 0;
} }
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// misc // misc
MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{ {
return _mm_movemask_epi8(vmask); return _mm_movemask_epi8(vmask);
} }
MCS_FORCE_INLINE vi128_t setToZero() MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
return cmpNe(x, y);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
return cmpEq(x, y);
}
MCS_FORCE_INLINE SimdType setToZero()
{ {
return _mm_setzero_si128(); return _mm_setzero_si128();
} }
// store // store
MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
{ {
_mm_maskmoveu_si128(x, vmask, dst); _mm_maskmoveu_si128(x, vmask, dst);
} }
MCS_FORCE_INLINE void store(char* dst, vi128_t& x) MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{ {
_mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x); _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
} }
}; };
template<> template <typename VT, typename CHECK_T>
class SimdFilterProcessor<vi128_wr, 1> class SimdFilterProcessor<VT, CHECK_T,
typename std::enable_if<std::is_same<VT, vi128_wr>::value && sizeof(CHECK_T) == 1>::type>
{ {
public: public:
constexpr static const uint16_t vecByteSize = 16U; constexpr static const uint16_t vecByteSize = 16U;
constexpr static const uint16_t vecBitSize = 128U; constexpr static const uint16_t vecBitSize = 128U;
using T = datatypes::WidthToSIntegralType<1>::type; using T = typename datatypes::WidthToSIntegralType<sizeof(CHECK_T)>::type;
using SIMD_WRAPPER_TYPE = simd::vi128_wr; using SimdWrapperType = vi128_wr;
using SIMD_TYPE = simd::vi128_t; using SimdType = vi128_t;
using FilterType = T;
using StorageType = T;
// Mask calculation for int and float types differs.
// See corresponding intrinsics algos for details.
constexpr static const uint16_t FilterMaskStep = sizeof(T);
// Load value // Load value
MCS_FORCE_INLINE vi128_t loadValue(const T fill) MCS_FORCE_INLINE SimdType emptyNullLoadValue(const T fill)
{
return loadValue(fill);
}
MCS_FORCE_INLINE SimdType loadValue(const T fill)
{ {
return _mm_set1_epi8(fill); return _mm_set1_epi8(fill);
} }
// Load from // Load from
MCS_FORCE_INLINE vi128_t loadFrom(const char* from) MCS_FORCE_INLINE SimdType loadFrom(const char* from)
{ {
return _mm_loadu_si128(reinterpret_cast<const vi128_t*>(from)); return _mm_loadu_si128(reinterpret_cast<const SimdType*>(from));
} }
// Compare // Compare
MCS_FORCE_INLINE MT cmpEq(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpEq(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
} }
MCS_FORCE_INLINE MT cmpGe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGe(SimdType& x, SimdType& y)
{ {
return cmpLt(x, y) ^ 0xFFFF; return cmpLt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpGt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpGt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y)); return _mm_movemask_epi8(_mm_cmpgt_epi8(x, y));
} }
MCS_FORCE_INLINE MT cmpLe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLe(SimdType& x, SimdType& y)
{ {
return cmpGt(x, y) ^ 0xFFFF; return cmpGt(x, y) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpLt(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpLt(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmplt_epi8(x, y)); return _mm_movemask_epi8(_mm_cmplt_epi8(x, y));
} }
MCS_FORCE_INLINE MT cmpNe(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpNe(SimdType& x, SimdType& y)
{ {
return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF; return _mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xFFFF;
} }
MCS_FORCE_INLINE MT cmpAlwaysFalse(vi128_t& x, vi128_t& y) MCS_FORCE_INLINE MT cmpAlwaysFalse(SimdType& x, SimdType& y)
{ {
return 0; return 0;
} }
MCS_FORCE_INLINE MT cmpAlwaysTrue(SimdType& x, SimdType& y)
{
return 0xFFFF;
}
// permute // permute
/* TODO Available in AVX-512 /* TODO Available in AVX-512
MCS_FORCE_INLINE vi128_t perm8Bits(vi128_t& x, vi128_t& idx) MCS_FORCE_INLINE SimdType perm8Bits(SimdType& x, SimdType& idx)
{ {
return _mm_permutexvar_epi8(x, idx); return _mm_permutexvar_epi8(x, idx);
} }
*/ */
// misc // misc
MCS_FORCE_INLINE MT convertVectorToBitMask(vi128_t& vmask) MCS_FORCE_INLINE MT convertVectorToBitMask(SimdType& vmask)
{ {
return _mm_movemask_epi8(vmask); return _mm_movemask_epi8(vmask);
} }
MCS_FORCE_INLINE vi128_t setToZero() MCS_FORCE_INLINE MT nullEmptyCmpNe(SimdType& x, SimdType& y)
{
return cmpNe(x, y);
}
MCS_FORCE_INLINE MT nullEmptyCmpEq(SimdType& x, SimdType& y)
{
return cmpEq(x, y);
}
MCS_FORCE_INLINE SimdType setToZero()
{ {
return _mm_setzero_si128(); return _mm_setzero_si128();
} }
// store // store
MCS_FORCE_INLINE void storeWMask(vi128_t& x, vi128_t& vmask, char* dst) MCS_FORCE_INLINE void storeWMask(SimdType& x, SimdType& vmask, char* dst)
{ {
_mm_maskmoveu_si128(x, vmask, dst); _mm_maskmoveu_si128(x, vmask, dst);
} }
MCS_FORCE_INLINE void store(char* dst, vi128_t& x) MCS_FORCE_INLINE void store(char* dst, SimdType& x)
{ {
_mm_storeu_si128(reinterpret_cast<vi128_t*>(dst), x); _mm_storeu_si128(reinterpret_cast<SimdType*>(dst), x);
} }
}; };
} // end of simd } // end of simd
#endif // if defined(__x86_64__ ) #endif // if defined(__x86_64__ )
#endif
// vim:ts=2 sw=2: // vim:ts=2 sw=2: