This patch introduces support for vectorized execution of scanning/filtering for numeric-based data types

TEXT, CHAR, VARCHAR, FLOAT and DOUBLE are not yet supported by the vectorized path. This patch also introduces an example for the Google benchmarking suite to measure the performance difference between the legacy scan/filtering code and the templated version.
2025-07-29 08:21:15 +03:00 · 2021-09-08 17:59:20 +00:00
parent cac23b0afc
commit af36f9940f
22 changed files with 2720 additions and 143 deletions
--- a/primitives/linux-port/column.cpp
+++ b/primitives/linux-port/column.cpp
@ -21,6 +21,7 @@
 //#define NDEBUG
 #include <cassert>
 #include <cmath>
+#include <functional>
 #ifndef _MSC_VER
 #include <pthread.h>
 #else
@ -38,6 +39,8 @@ using namespace boost;
 #include "primproc.h"
 #include "dataconvert.h"
 #include "mcs_decimal.h"
+#include "simd_sse.h"
+#include "utils/common/columnwidth.h"

 using namespace logging;
 using namespace dbbc;
@ -47,7 +50,8 @@ using namespace execplan;

 namespace
 {
-using RID_T = uint16_t;  // Row index type, as used in rid arrays
+// WIP Move this. MT is the 16-bit mask type used below for the SIMD comparison bitmasks.
+using MT = uint16_t;

 // Column filtering is dispatched 4-way based on the column type,
 // which defines implementation of comparison operations for the column values
@ -118,7 +122,7 @@ inline bool colCompare_(const T& val1, const T& val2, uint8_t COP)
            return val1 >= val2;

        default:
-            logIt(34, COP, "colCompare");
+            logIt(34, COP, "colCompare_");
            return false;						// throw an exception here?
    }
 }
@ -848,6 +852,34 @@ inline bool matchingColValue(const T curValue,
    }
 }

+/*****************************************************************************
+ *** MISC FUNCS **************************************************************
+ *****************************************************************************/
+// These two templates update the min/max values in the loop iterating over the values in filterColumnData.
+template<ENUM_KIND KIND, typename T,
+         typename std::enable_if<KIND == KIND_TEXT, T>::type* = nullptr>  // this overload is selected for KIND_TEXT only
+inline void updateMinMax(T& Min, T& Max, const T curValue, NewColRequestHeader* in)
+{
+    constexpr int COL_WIDTH = sizeof(T);
+    if (colCompare<KIND_TEXT, COL_WIDTH>(Min, curValue, COMPARE_GT, false, in->colType))  // Min is lowered when colCompare reports Min > curValue
+        Min = curValue;
+
+    if (colCompare<KIND_TEXT, COL_WIDTH>(Max, curValue, COMPARE_LT, false, in->colType))  // Max is raised when colCompare reports Max < curValue
+        Max = curValue;
+}
+
+template<ENUM_KIND KIND, typename T,
+         typename std::enable_if<KIND != KIND_TEXT, T>::type* = nullptr>  // this overload is selected for every KIND except KIND_TEXT
+inline void updateMinMax(T& Min, T& Max, const T curValue, NewColRequestHeader* in)  // `in` is not read by this overload
+{
+    if (Min > curValue)  // plain operator> / operator< on T, no collation involved
+        Min = curValue;
+
+    if (Max < curValue)
+        Max = curValue;
+}
+
+
 /*****************************************************************************
 *** READ COLUMN VALUES ******************************************************
 *****************************************************************************/
@ -936,6 +968,7 @@ inline void writeColValue(
    uint16_t rid,
    const T* srcArray)
 {
+    // TODO move base ptr calculation one level up.
    uint8_t* outPtr = reinterpret_cast<uint8_t*>(&out[1]);
    auto idx = out->NVALS++;
    if (OutputType & OT_RID)
@ -947,42 +980,650 @@ inline void writeColValue(

    if (OutputType & (OT_TOKEN | OT_DATAVALUE))
    {
+        // TODO move base ptr calculation one level up.
        T* outPos = getValuesArrayPosition<T>(primitives::getFirstValueArrayPosition(out), idx);
        // TODO check bytecode for the 16 byte type
        *outPos = srcArray[rid];
    }
 }

-// These two are templates update min/max values in the loop iterating the values in filterColumnData.
-template<ENUM_KIND KIND, typename T,
-         typename std::enable_if<KIND == KIND_TEXT, T>::type* = nullptr>
-inline void updateMinMax(T& Min, T& Max, T& curValue, NewColRequestHeader* in)
+#if defined(__x86_64__ )
+template<typename T, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<HAS_INPUT_RIDS == false, T>::type* = nullptr>  // overload used when there are no input RIDs
+inline void vectUpdateMinMax(const bool validMinMax, const bool isNonNullOrEmpty,
+                             T& Min, T& Max, T curValue, NewColRequestHeader* in)
 {
-    constexpr int COL_WIDTH = sizeof(T);
-    if (colCompare<KIND_TEXT, COL_WIDTH>(Min, curValue, COMPARE_GT, false, in->colType))
-        Min = curValue;
-
-    if (colCompare<KIND_TEXT, COL_WIDTH>(Max, curValue, COMPARE_LT, false, in->colType))
-        Max = curValue;
+    if (validMinMax && isNonNullOrEmpty)  // only values flagged as neither NULL nor EMPTY take part, and only if Min/Max is tracked
+        updateMinMax<KIND>(Min, Max, curValue, in);
 }

-template<ENUM_KIND KIND, typename T,
-         typename std::enable_if<KIND != KIND_TEXT, T>::type* = nullptr>
-inline void updateMinMax(T& Min, T& Max, T& curValue, NewColRequestHeader* in)
+// MCS won't update Min/Max for a block if it doesn't read all values in the block.
+// This happens if in->NVALS > 0 (HAS_INPUT_RIDS is set).
+template<typename T, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<HAS_INPUT_RIDS == true, T>::type* = nullptr>
+inline void vectUpdateMinMax(const bool validMinMax, const bool isNonNullOrEmpty,
+                             T& Min, T& Max, T curValue, NewColRequestHeader* in)
 {
-    if (Min > curValue)
-        Min = curValue;
-
-    if (Max < curValue)
-        Max = curValue;
+    // Intentionally a no-op: Min/Max are left untouched when input RIDs are supplied.
 }

-// TBD Check if MCS really needs to copy values from in into out msgs or
-// it is possible to copy from in msg into BPP::values directly.
+template<typename T, bool HAS_INPUT_RIDS,
+         typename std::enable_if<HAS_INPUT_RIDS == false, T>::type* = nullptr>  // no input RIDs: the calculated RID is emitted
+void vectWriteColValuesLoopRIDAsignment(primitives::RIDType* ridDstArray, ColResultHeader* out,
+                                        const primitives::RIDType calculatedRID,
+                                        const primitives::RIDType* ridSrcArray, const uint32_t srcRIDIdx)  // ridSrcArray and srcRIDIdx are not read here
+{
+    *ridDstArray = calculatedRID;
+    out->RidFlags |= (1 << (calculatedRID >> 9)); // set the (row/512)'th bit
+}
+
+template<typename T, bool HAS_INPUT_RIDS,
+         typename std::enable_if<HAS_INPUT_RIDS == true, T>::type* = nullptr>  // input RIDs given: the RID is copied from ridSrcArray
+void vectWriteColValuesLoopRIDAsignment(primitives::RIDType* ridDstArray, ColResultHeader* out,
+                                        const primitives::RIDType calculatedRID,
+                                        const primitives::RIDType* ridSrcArray, const uint32_t srcRIDIdx)  // calculatedRID is not read here
+{
+    *ridDstArray = ridSrcArray[srcRIDIdx];
+    out->RidFlags |= (1 << (ridSrcArray[srcRIDIdx] >> 9)); // set the (row/512)'th bit
+}
+
+// This set of SFINAE templates is used to write values/RIDs into the output buffer based on
+// a number of template parameters.
+// No RIDs, only values. ridOffset, out, ridDstArray and ridSrcArray are not read by this overload.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<OUTPUT_TYPE & (OT_TOKEN | OT_DATAVALUE) && !(OUTPUT_TYPE & OT_RID), T>::type* = nullptr>
+inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor
+    const MT writeMask,                               // SIMD intrinsics bitmask for values to write
+    const MT nonNullOrEmptyMask,                      // SIMD intrinsics inverse bitmask for NULL/EMPTY values (set bit: neither NULL nor EMPTY)
+    const bool validMinMax,                           // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,              // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                   // Typed SIMD vector from the input block
+    char* dstArray,                                   // The actual char dst array ptr to start writing values
+    T& Min, T&Max,                                    // Min/Max of the extent
+    NewColRequestHeader* in,                          // Proto message
+    ColResultHeader* out,                             // Proto message
+    primitives::RIDType* ridDstArray,                 // The actual dst array ptr to start writing RIDs
+    primitives::RIDType* ridSrcArray)                 // The actual src array ptr to read RIDs
+{
+    constexpr const uint16_t WIDTH = sizeof(T);
+    using SIMD_TYPE = typename VT::SIMD_TYPE;
+    SIMD_TYPE tmpStorageVector;  // NOTE(review): not initialized here; only the packed leading elements get assigned below
+    T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
+    // Saving values based on writeMask into tmp vec.
+    // Min/Max processing.
+    // The mask is 16 bits long and it describes N elements.
+    // N = sizeof(vector type) / WIDTH.
+    uint32_t j = 0;
+    for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)  // `it` steps by WIDTH bytes, `j` is the element index
+    {
+        MT bitMapPosition = 1 << it;  // mask bit tested for element j
+        if (writeMask & bitMapPosition)
+        {
+            *tmpDstVecTPtr = dataVecTPtr[j];
+            ++tmpDstVecTPtr;
+        }
+
+        vectUpdateMinMax<T, KIND, HAS_INPUT_RIDS>(validMinMax, nonNullOrEmptyMask & bitMapPosition,
+                                                  Min, Max, dataVecTPtr[j], in);
+    }
+    // The whole vector is stored; however, one level up the stack
+    // vectorizedFiltering() advances dstArray only by the number of
+    // values actually written, which is the result of this function.
+    simdProcessor.store(dstArray, tmpStorageVector);
+
+    return tmpDstVecTPtr - reinterpret_cast<T*>(&tmpStorageVector);  // count of values packed into the tmp vector
+}
+
+// RIDs, no values: this overload writes nothing and reports 0 values written.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<OUTPUT_TYPE & OT_RID && !(OUTPUT_TYPE & OT_TOKEN), T>::type* = nullptr>
+inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor
+    const MT writeMask,                               // SIMD intrinsics bitmask for values to write
+    const MT nonNullOrEmptyMask,                      // SIMD intrinsics inverse bitmask for NULL/EMPTY values
+    const bool validMinMax,                           // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,              // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                   // Typed SIMD vector from the input block
+    char* dstArray,                                   // The actual char dst array ptr to start writing values
+    T& Min, T&Max,                                    // Min/Max of the extent
+    NewColRequestHeader* in,                          // Proto message
+    ColResultHeader* out,                             // Proto message
+    primitives::RIDType* ridDstArray,                 // The actual dst array ptr to start writing RIDs
+    primitives::RIDType* ridSrcArray)                 // The actual src array ptr to read RIDs
+{
+    return 0;  // none of the parameters is read
+}
+
+// Both RIDs and values: packs the selected values and writes the matching RID for each of them.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<OUTPUT_TYPE == OT_BOTH, T>::type* = nullptr>
+inline uint16_t vectWriteColValues(VT& simdProcessor, // SIMD processor
+    const MT writeMask,                               // SIMD intrinsics bitmask for values to write
+    const MT nonNullOrEmptyMask,                      // SIMD intrinsics inverse bitmask for NULL/EMPTY values (set bit: neither NULL nor EMPTY)
+    const bool validMinMax,                           // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,              // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                   // Typed SIMD vector from the input block
+    char* dstArray,                                   // The actual char dst array ptr to start writing values
+    T& Min, T&Max,                                    // Min/Max of the extent
+    NewColRequestHeader* in,                          // Proto message
+    ColResultHeader* out,                             // Proto message
+    primitives::RIDType* ridDstArray,                 // The actual dst array ptr to start writing RIDs
+    primitives::RIDType* ridSrcArray)                 // The actual src array ptr to read RIDs
+{
+    constexpr const uint16_t WIDTH = sizeof(T);
+    using SIMD_TYPE = typename VT::SIMD_TYPE;
+    SIMD_TYPE tmpStorageVector;  // NOTE(review): not initialized here; only the packed leading elements get assigned below
+    T* tmpDstVecTPtr = reinterpret_cast<T*>(&tmpStorageVector);
+    // Saving values based on writeMask into tmp vec.
+    // Min/Max processing.
+    // The mask is 16 bits long and it describes N elements.
+    // N = sizeof(vector type) / WIDTH.
+    uint32_t j = 0;
+    for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)  // `it` steps by WIDTH bytes, `j` is the element index
+    {
+        MT bitMapPosition = 1 << it;  // mask bit tested for element j
+        if (writeMask & bitMapPosition)
+        {
+            *tmpDstVecTPtr = dataVecTPtr[j];
+            ++tmpDstVecTPtr;
+           vectWriteColValuesLoopRIDAsignment<T, HAS_INPUT_RIDS>(ridDstArray, out, ridOffset + j,
+                                                                 ridSrcArray, j);
+            ++ridDstArray;  // advances the local copy of the pointer only
+        }
+        vectUpdateMinMax<T, KIND, HAS_INPUT_RIDS>(validMinMax, nonNullOrEmptyMask & bitMapPosition,
+                                                  Min, Max, dataVecTPtr[j], in);
+    }
+    // The whole vector is stored; however, one level up the stack
+    // vectorizedFiltering() advances dstArray only by the number of
+    // values actually written, which is the result of this function.
+    simdProcessor.store(dstArray, tmpStorageVector);
+
+    return tmpDstVecTPtr - reinterpret_cast<T*>(&tmpStorageVector);  // count of values packed into the tmp vector
+}
+
+// RIDs, no values: writes one RID per set writeMask position and updates Min/Max.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<!(OUTPUT_TYPE & (OT_TOKEN | OT_DATAVALUE)) && OUTPUT_TYPE & OT_RID, T>::type* = nullptr>
+inline uint16_t vectWriteRIDValues(VT& processor,   // SIMD processor (not used by this overload)
+    const uint16_t valuesWritten,                   // The number of values written in certain SFINAE cases (not used by this overload)
+    const bool validMinMax,                         // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,            // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                 // Typed SIMD vector from the input block
+    primitives::RIDType* ridDstArray,               // The actual dst array ptr to start writing RIDs
+    MT writeMask,                                   // SIMD intrinsics bitmask for values to write
+    T& Min, T&Max,                                  // Min/Max of the extent
+    NewColRequestHeader* in,                        // Proto message
+    ColResultHeader* out,                           // Proto message
+    MT nonNullOrEmptyMask,                          // SIMD intrinsics inverse bitmask for NULL/EMPTY values (set bit: neither NULL nor EMPTY)
+    primitives::RIDType* ridSrcArray)               // The actual src array ptr to read RIDs
+{
+    constexpr const uint16_t WIDTH = sizeof(T);
+    primitives::RIDType* origRIDDstArray = ridDstArray;
+    // Writing RIDs based on writeMask.
+    // Min/Max processing.
+    // The mask is 16 bits long and it describes N elements where N = sizeof(vector type) / WIDTH.
+    uint16_t j = 0;
+    for (uint32_t it = 0; it < VT::vecByteSize; ++j, it += WIDTH)  // `it` steps by WIDTH bytes, `j` is the element index
+    {
+        MT bitMapPosition = 1 << it;
+        if (writeMask & (1 << it))  // tests the same bit as bitMapPosition
+        {
+            vectWriteColValuesLoopRIDAsignment<T, HAS_INPUT_RIDS>(ridDstArray, out, ridOffset + j,
+                                                                  ridSrcArray, j);
+            ++ridDstArray;
+        }
+        vectUpdateMinMax<T, KIND, HAS_INPUT_RIDS>(validMinMax, nonNullOrEmptyMask & bitMapPosition,
+                                                  Min, Max, dataVecTPtr[j], in);
+    }
+    return ridDstArray - origRIDDstArray;  // number of RIDs written by this call
+}
+
+// Both RIDs and values.
+// vectWriteColValues writes the RIDs while traversing the writeMask, so this overload only passes valuesWritten through.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<OUTPUT_TYPE == OT_BOTH, T>::type* = nullptr>
+inline uint16_t vectWriteRIDValues(VT& processor,   // SIMD processor
+    const uint16_t valuesWritten,                   // The number of values written in certain SFINAE cases
+    const bool validMinMax,                         // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,            // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                 // Typed SIMD vector from the input block
+    primitives::RIDType* ridDstArray,               // The actual dst array ptr to start writing RIDs
+    MT writeMask,                                   // SIMD intrinsics bitmask for values to write
+    T& Min, T&Max,                                  // Min/Max of the extent
+    NewColRequestHeader* in,                        // Proto message
+    ColResultHeader* out,                           // Proto message
+    MT nonNullOrEmptyMask,                          // SIMD intrinsics inverse bitmask for NULL/EMPTY values
+    primitives::RIDType* ridSrcArray)               // The actual src array ptr to read RIDs
+{
+    return valuesWritten;  // the only parameter that is read
+}
+
+// No RIDs, only values: there is nothing to write here, so valuesWritten is passed through unchanged.
+template<typename T, typename VT, int OUTPUT_TYPE, ENUM_KIND KIND, bool HAS_INPUT_RIDS,
+         typename std::enable_if<OUTPUT_TYPE & (OT_TOKEN | OT_DATAVALUE) && !(OUTPUT_TYPE & OT_RID), T>::type* = nullptr>
+inline uint16_t vectWriteRIDValues(VT& processor,   // SIMD processor
+    const uint16_t valuesWritten,                   // The number of values written in certain SFINAE cases
+    const bool validMinMax,                         // The flag to update Min/Max for a block or not
+    const primitives::RIDType ridOffset,            // The first RID value of the dataVecTPtr
+    T* dataVecTPtr,                                 // Typed SIMD vector from the input block
+    primitives::RIDType* ridDstArray,               // The actual dst array ptr to start writing RIDs
+    MT writeMask,                                   // SIMD intrinsics bitmask for values to write
+    T& Min, T&Max,                                  // Min/Max of the extent
+    NewColRequestHeader* in,                        // Proto message
+    ColResultHeader* out,                           // Proto message
+    MT nonNullOrEmptyMask,                          // SIMD intrinsics inverse bitmask for NULL/EMPTY values
+    primitives::RIDType* ridSrcArray)               // The actual src array ptr to read RIDs
+{
+    return valuesWritten;  // the only parameter that is read
+}
+#endif
+
+/*****************************************************************************
+ *** RUN DATA THROUGH A COLUMN FILTER ****************************************
+ *****************************************************************************/
+// TODO turn columnFilterMode into a template param to use it in matchingColValue
+// This routine filters values in a columnar block, processing one scalar at a time.
+template<typename T, typename FT, typename ST, ENUM_KIND KIND>
+void scalarFiltering(NewColRequestHeader* in, ColResultHeader* out,
+    const ColumnFilterMode columnFilterMode,
+    const ST* filterSet,        // Set of values for simple filters (any of values / none of them)
+    const uint32_t filterCount, // Number of filter elements, each described by one entry in the following arrays:
+    const uint8_t* filterCOPs,  //   comparison operation
+    const FT* filterValues,     //   value to compare to
+    const uint8_t* filterRFs,   //   forwarded to matchingColValue as is
+    const ColRequestHeaderDataType& typeHolder, // TypeHolder to use collation-aware ops for char/text. NOTE(review): not referenced in this body, in->colType is passed on instead
+    const T* srcArray,          // Input array
+    const uint32_t srcSize,     // ... and its size
+    const uint16_t* ridArray,   // Optional array of indexes into srcArray, that defines the read order
+    const uint16_t ridSize,     // ... and its size
+    const uint32_t initialRID,  // The input block idx to start scanning/filtering at.
+    const uint8_t outputType,   // Used to decide whether to skip EMPTY values
+    const bool validMinMax,     // The flag to store min/max
+    T emptyValue,               // Deduced empty value magic
+    T nullValue,                // Deduced null value magic
+    T Min,                      // Passed by value: local running minimum, copied to out->Min at the end
+    T Max,                      // Passed by value: local running maximum, copied to out->Max at the end
+    const bool isNullValueMatches)
+{
+    constexpr int WIDTH = sizeof(T);
+    // Loop-local variables
+    T curValue = 0;
+    primitives::RIDType rid = 0;
+    bool isEmpty = false;
+
+    // Loop over the column values, storing those matching the filter, and updating the min..max range
+    for (uint32_t i = initialRID;
+         nextColValue<T, WIDTH>(curValue, &isEmpty,  // presumably fills curValue/isEmpty/rid and advances i; defined outside this view
+                                &i, &rid,
+                                srcArray, srcSize, ridArray, ridSize,
+                                outputType, emptyValue); )
+    {
+        if (isEmpty)
+            continue;
+        else if (isNullValue<KIND,T>(curValue, nullValue))
+        {
+            // If NULL values match the filter, write curValue to the output buffer
+            if (isNullValueMatches)
+                writeColValue<T>(outputType, out, rid, srcArray);
+        }
+        else
+        {
+            // If curValue matches the filter, write it to the output buffer
+            if (matchingColValue<KIND, WIDTH, false>(curValue, columnFilterMode, filterSet, filterCount,
+                                                     filterCOPs, filterValues, filterRFs, in->colType, nullValue))
+            {
+                writeColValue<T>(outputType, out, rid, srcArray);
+            }
+
+            // Update Min and Max if necessary.  EMPTY/NULL values are processed in other branches.
+            if (validMinMax)
+                updateMinMax<KIND>(Min, Max, curValue, in);
+        }
+    }
+
+    // Write captured Min/Max values to *out
+    out->ValidMinMax = validMinMax;
+    if (validMinMax)
+    {
+        out->Min = Min;
+        out->Max = Max;
+    }
+}
+
+#if defined(__x86_64__ )
+template <typename VT, typename SIMD_WRAPPER_TYPE, bool HAS_INPUT_RIDS, typename T,
+          typename std::enable_if<HAS_INPUT_RIDS == false, T>::type* = nullptr>  // no input RIDs: load straight from srcArray
+inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray,
+    const T* origSrcArray, const primitives::RIDType* ridArray, const uint16_t iter)  // origSrcArray, ridArray and iter are not read here
+{
+    return {processor.loadFrom(reinterpret_cast<const char*>(srcArray))};
+}
+
+// Scatter-gather implementation: builds a vector from the origSrcArray elements addressed by ridArray[0..VECTOR_SIZE).
+// TODO Move the logic into simd namespace class methods and use intrinsics
+template <typename VT, typename SIMD_WRAPPER_TYPE, bool HAS_INPUT_RIDS, typename T,
+          typename std::enable_if<HAS_INPUT_RIDS == true, T>::type* = nullptr>
+inline SIMD_WRAPPER_TYPE simdDataLoadTemplate(VT& processor, const T* srcArray,
+    const T* origSrcArray, const primitives::RIDType* ridArray, const uint16_t iter)  // processor, srcArray and iter are not read here
+{
+    constexpr const uint16_t WIDTH = sizeof(T);
+    constexpr const uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;  // number of T elements per vector
+    using SIMD_TYPE = typename VT::SIMD_TYPE;
+    SIMD_TYPE result;
+    T* resultTypedPtr = reinterpret_cast<T*>(&result);
+    for (uint32_t i = 0; i < VECTOR_SIZE; ++i)
+    {
+        //std::cout << " simdDataLoadTemplate ridArray[ridArrayOffset] " << (int8_t) origSrcArray[ridArray[i]] << " ridArray[i] " << ridArray[i] << "\n";
+        resultTypedPtr[i] = origSrcArray[ridArray[i]];  // each RID is used as an index into the original block
+    }
+
+    return {result};
+}
+
+// This routine filters input block in a vectorized manner.
+// It supports all output types, all input types.
+// It doesn't support KIND==TEXT so upper layers filters this KIND out beforehand.
+// It doesn't support KIND==FLOAT yet also.
+// To reduce branching it first compiles the filter to produce a vector of
+// vector processing class methods(actual filters) pointers and a logical function pointer
+// to glue the masks produced by actual filters.
+// Then it takes a vector of data, run filters and logical function using pointers.
+// See the corresponding dispatcher to get more details on vector processing class.
+template<typename T, typename VT, bool HAS_INPUT_RIDS, int OUTPUT_TYPE,
+         ENUM_KIND KIND, typename FT, typename ST>
+void vectorizedFiltering(NewColRequestHeader* in, ColResultHeader* out,
+     const T* srcArray, const uint32_t srcSize, primitives::RIDType* ridArray,
+     const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter,
+     const bool validMinMax, const T emptyValue, const T nullValue,
+     T Min, T Max, const bool isNullValueMatches)
+{
+    constexpr const uint16_t WIDTH = sizeof(T);
+    using SIMD_TYPE = typename VT::SIMD_TYPE;
+    using SIMD_WRAPPER_TYPE = typename VT::SIMD_WRAPPER_TYPE;
+    VT simdProcessor;
+    SIMD_TYPE dataVec;
+    SIMD_TYPE emptyFilterArgVec = simdProcessor.loadValue(emptyValue);
+    SIMD_TYPE nullFilterArgVec = simdProcessor.loadValue(nullValue);
+    MT writeMask, nonEmptyMask, nonNullMask, nonNullOrEmptyMask;
+    MT initFilterMask = 0xFFFF;
+    primitives::RIDType rid = 0;
+    primitives::RIDType* origRidArray = ridArray;
+    uint16_t totalValuesWritten = 0;
+    char* dstArray = reinterpret_cast<char*>(primitives::getFirstValueArrayPosition(out));
+    primitives::RIDType* ridDstArray = reinterpret_cast<primitives::RIDType*>(getFirstRIDArrayPosition(out));
+    const T* origSrcArray = srcArray;
+    const FT* filterValues = nullptr;
+    const ParsedColumnFilter::CopsType* filterCOPs = nullptr;
+    ColumnFilterMode columnFilterMode = ALWAYS_TRUE;
+    const ST* filterSet = nullptr;
+    const ParsedColumnFilter::RFsType* filterRFs = nullptr;
+
+    uint8_t  outputType  = in->OutputType;
+
+    constexpr uint16_t VECTOR_SIZE = VT::vecByteSize / WIDTH;
+    // If there are RIDs use its number to get a number of vectorized iterations.
+    uint16_t iterNumber = HAS_INPUT_RIDS ? ridSize / VECTOR_SIZE : srcSize / VECTOR_SIZE;
+    uint32_t filterCount = 0;
+    // These pragmas are to silence GCC warnings
+    //  warning: ignoring attributes on template argument
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+    std::vector<SIMD_TYPE> filterArgsVectors;
+    auto ptrA = std::mem_fn(&VT::cmpEq);
+    using COPType = decltype(ptrA);
+    std::vector<COPType> copFunctorVec;
+#pragma GCC diagnostic pop
+    using BOPType = std::function<MT(MT, MT)>;
+    BOPType bopFunctor;
+    // filter comparators and logical function compilation.
+    if (parsedColumnFilter != nullptr)
+    {
+        filterValues = parsedColumnFilter->getFilterVals<FT>();
+        filterCOPs = parsedColumnFilter->prestored_cops.get();
+        columnFilterMode = parsedColumnFilter->columnFilterMode;
+        filterSet = parsedColumnFilter->getFilterSet<ST>();
+        filterRFs = parsedColumnFilter->prestored_rfs.get();
+        filterCount = parsedColumnFilter->getFilterCount();
+        if (iterNumber > 0)
+        {
+            copFunctorVec.reserve(filterCount);
+            switch(parsedColumnFilter->getBOP())
+            {
+                case BOP_OR:
+                    bopFunctor = std::bit_or<MT>();
+                    initFilterMask = 0;
+                    break;
+                case BOP_AND:
+                    bopFunctor = std::bit_and<MT>();
+                    break;
+                case BOP_XOR:
+                    bopFunctor = std::bit_or<MT>();
+                    initFilterMask = 0;
+                    break;
+                case BOP_NONE:
+                    // According with the comments in linux-port/primitiveprocessor.h
+                    // there can't be BOP_NONE with filterCount > 0
+                    bopFunctor = std::bit_and<MT>();
+                    break;
+                default:
+                    idbassert(false);
+            }
+            filterArgsVectors.reserve(filterCount);
+            for (uint32_t j = 0; j < filterCount; ++j)
+            {
+                // Preload filter argument values only once.
+                filterArgsVectors[j] = simdProcessor.loadValue(filterValues[j]);
+                switch(filterCOPs[j])
+                {
+                    case(COMPARE_EQ):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpEq));
+                        break;
+                    case(COMPARE_GE):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpGe));
+                        break;
+                    case(COMPARE_GT):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpGt));
+                        break;
+                    case(COMPARE_LE):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpLe));
+                        break;
+                    case(COMPARE_LT):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpLt));
+                        break;
+                    case(COMPARE_NE):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpNe));
+                        break;
+                    case(COMPARE_NIL):
+                        copFunctorVec.push_back(std::mem_fn(&VT::cmpAlwaysFalse));
+                        break;
+                    // There are couple other COP, e.g. COMPARE_NOT however they can't be met here
+                    // b/c MCS 6.x uses COMPARE_NOT for strings with OP_LIKE only. See op2num() for
+                    // details.
+
+                    default:
+                        idbassert(false);
+                }
+            }
+        }
+    }
+
+    // main loop
+    // writeMask tells which values must get into the result. Includes values that matches filters. Can have NULLs.
+    // nonEmptyMask tells which vector coords are not EMPTY magics.
+    // nonNullMask tells which vector coords are not NULL magics.
+    for (uint16_t i = 0; i < iterNumber; ++i)
+    {
+        primitives::RIDType ridOffset = i * VECTOR_SIZE;
+        assert(!HAS_INPUT_RIDS || (HAS_INPUT_RIDS && ridSize >= ridOffset));
+        dataVec = simdDataLoadTemplate<VT, SIMD_WRAPPER_TYPE, HAS_INPUT_RIDS, T>(simdProcessor, srcArray, origSrcArray, ridArray, i).v;
+        // empty check
+        nonEmptyMask = simdProcessor.cmpNe(dataVec, emptyFilterArgVec);
+        writeMask = nonEmptyMask;
+        // NULL check
+        nonNullMask = simdProcessor.cmpNe(dataVec, nullFilterArgVec);
+        // Exclude NULLs from the resulting set if NULL doesn't match the filters.
+        writeMask = isNullValueMatches ? writeMask : writeMask & nonNullMask;
+        nonNullOrEmptyMask = nonNullMask & nonEmptyMask;
+        // filters
+        MT prevFilterMask = initFilterMask;
+        // TODO name this mask literal
+        MT filterMask = 0xFFFF;
+        for (uint32_t j = 0; j < filterCount; ++j)
+        {
+            // filter using compiled filter and preloaded filter argument
+            filterMask = copFunctorVec[j](simdProcessor, dataVec, filterArgsVectors[j]);
+            filterMask = bopFunctor(prevFilterMask, filterMask);
+            prevFilterMask = filterMask;
+        }
+        writeMask = writeMask & filterMask;
+
+        T* dataVecTPtr = reinterpret_cast<T*>(&dataVec);
+
+        // vectWriteColValues iterates over the values in the source vec
+        // to store values/RIDs into dstArray/ridDstArray.
+        // It also sets Min/Max values for the block if eligible.
+        // !!! vectWriteColValues increases ridDstArray internally but it doesn't go
+        // outside the scope of the memory allocated to out msg.
+        // vectWriteColValues is empty if outputMode == OT_RID.
+        uint16_t valuesWritten =
+            vectWriteColValues<T, VT, OUTPUT_TYPE, KIND, HAS_INPUT_RIDS>(simdProcessor,
+                                                                         writeMask,
+                                                                         nonNullOrEmptyMask,
+                                                                         validMinMax,
+                                                                         ridOffset,
+                                                                         dataVecTPtr,
+                                                                         dstArray,
+                                                                         Min, Max,
+                                                                         in, out, ridDstArray,
+                                                                         ridArray);
+        // Some outputType modes saves RIDs also. vectWriteRIDValues is empty for
+        // OT_DATAVALUE, OT_BOTH(vectWriteColValues takes care about RIDs).
+        valuesWritten =
+            vectWriteRIDValues<T, VT, OUTPUT_TYPE, KIND, HAS_INPUT_RIDS>(simdProcessor,
+                                                                         valuesWritten,
+                                                                         validMinMax,
+                                                                         ridOffset,
+                                                                         dataVecTPtr,
+                                                                         ridDstArray,
+                                                                         writeMask,
+                                                                         Min, Max,
+                                                                         in, out,
+                                                                         nonNullOrEmptyMask,
+                                                                         ridArray);
+
+        // Calculate bytes written
+        uint16_t bytesWritten = valuesWritten * WIDTH;
+        totalValuesWritten += valuesWritten;
+        ridDstArray += valuesWritten;
+        dstArray += bytesWritten;
+        rid += VECTOR_SIZE;
+        srcArray += VECTOR_SIZE;
+        ridArray += VECTOR_SIZE;
+    }
+
+    // Set the number of output values here b/c tail processing can skip this operation.
+    out->NVALS = totalValuesWritten;
+    // Write captured Min/Max values to *out
+    out->ValidMinMax = validMinMax;
+    if (validMinMax)
+    {
+        out->Min = Min;
+        out->Max = Max;
+    }
+
+    // process the tail. scalarFiltering changes out contents, e.g. Min/Max, NVALS, RIDs and values array
+    // This tail also sets out::Min/Max, out::validMinMax if validMinMax is set.
+    uint32_t processedSoFar = rid;
+    scalarFiltering<T, FT, ST, KIND>(in, out, columnFilterMode, filterSet, filterCount, filterCOPs,
+                                     filterValues, filterRFs, in->colType, origSrcArray, srcSize, origRidArray,
+                                     ridSize, processedSoFar, outputType, validMinMax, emptyValue, nullValue,
+                                     Min, Max, isNullValueMatches);
+}
+
+// Second dispatch stage: picks the vectorizedFiltering instantiation that matches
+// the run-time value of in->OutputType. HAS_INPUT_RIDS is forwarded unchanged as a
+// compile-time flag and all run-time arguments are passed through as is.
+// NOTE(review): the switch has no default label, so an OutputType other than
+// OT_RID, OT_BOTH, OT_TOKEN or OT_DATAVALUE results in no filtering call at all.
+template<typename T, typename VT, bool HAS_INPUT_RIDS, ENUM_KIND KIND, typename FT, typename ST>
+void vectorizedFilteringOutputTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out,
+    const T* srcArray, const uint32_t srcSize, uint16_t* ridArray,
+    const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter,
+    const bool validMinMax, const T emptyValue, const T nullValue,
+    T Min, T Max, const bool isNullValueMatches)
+{
+    switch (in->OutputType)
+    {
+        case OT_RID:
+            vectorizedFiltering<T, VT, HAS_INPUT_RIDS, OT_RID, KIND, FT, ST>(in, out,
+                srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+                validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+            break;
+        case OT_BOTH:
+            vectorizedFiltering<T, VT, HAS_INPUT_RIDS, OT_BOTH, KIND, FT, ST>(in, out,
+                srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+                validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+            break;
+        case OT_TOKEN:
+            vectorizedFiltering<T, VT, HAS_INPUT_RIDS, OT_TOKEN, KIND, FT, ST>(in, out,
+                srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+                validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+            break;
+        case OT_DATAVALUE:
+            vectorizedFiltering<T, VT, HAS_INPUT_RIDS, OT_DATAVALUE, KIND, FT, ST>(in, out,
+                srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+                validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+            break;
+    }
+}
+
+// This routine dispatches template function calls to reduce branching.
+// It turns two run-time properties of the request into template arguments of
+// vectorizedFiltering:
+//   - whether input RIDs are supplied (in->NVALS > 0);
+//   - the requested output type (in->OutputType), resolved by
+//     vectorizedFilteringOutputTypeDispatcher.
+// The SIMD processor type VT is simd::SimdFilterProcessor over simd::vi128_wr
+// with the column width taken from sizeof(T).
+template<typename T, ENUM_KIND KIND, typename FT, typename ST>
+void vectorizedFilteringDispatcher(NewColRequestHeader* in, ColResultHeader* out,
+    const T* srcArray, const uint32_t srcSize, uint16_t* ridArray,
+    const uint16_t ridSize, ParsedColumnFilter* parsedColumnFilter,
+    const bool validMinMax, const T emptyValue, const T nullValue,
+    T Min, T Max, const bool isNullValueMatches)
+{
+    constexpr const uint8_t WIDTH = sizeof(T);
+    // TODO make a SFINAE template switch for the class template spec.
+    using SIMD_TYPE = simd::vi128_wr;
+    using VT = typename simd::SimdFilterProcessor<SIMD_TYPE, WIDTH>;
+    const bool hasInputRIDs = in->NVALS > 0;
+    if (hasInputRIDs)
+    {
+        vectorizedFilteringOutputTypeDispatcher<T, VT, true, KIND, FT, ST>(in, out,
+            srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+            validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+    }
+    else
+    {
+        vectorizedFilteringOutputTypeDispatcher<T, VT, false, KIND, FT, ST>(in, out,
+            srcArray, srcSize, ridArray, ridSize, parsedColumnFilter,
+            validMinMax, emptyValue, nullValue, Min, Max, isNullValueMatches);
+    }
+}
+#endif
+
+// TBD Make changes in Command class ancestors to treat BPP::values as a buffer.
+// TBD This will allow copying values only once from BPP::blockData to the destination.
 // This template contains the main scanning/filtering loop.
 // Copy data matching parsedColumnFilter from input to output.
 // Input is srcArray[srcSize], optionally accessed in the order defined by ridArray[ridSize].
-// Output is BLOB out[outSize], written starting at offset *written, which is updated afterward.
+// Output is buf: ColResponseHeader, RIDType[BLOCK_SIZE], T[BLOCK_SIZE].
 template<typename T, ENUM_KIND KIND>
 void filterColumnData(
    NewColRequestHeader* in,
@ -995,7 +1636,7 @@ void filterColumnData(
 {
    using FT = typename IntegralTypeToFilterType<T>::type;
    using ST = typename IntegralTypeToFilterSetType<T>::type;
-    constexpr int COL_WIDTH = sizeof(T);
+    constexpr int WIDTH = sizeof(T);
    const T* srcArray = reinterpret_cast<const T*>(srcArray16);

    // Cache some structure fields in local vars
@ -1015,90 +1656,49 @@ void filterColumnData(
    auto filterRFs   = filterCount==0 ? nullptr : parsedColumnFilter->prestored_rfs.get();
    ST* filterSet    = filterCount==0 ? nullptr : parsedColumnFilter->getFilterSet<ST>();

-    // ###########################
    // Bit patterns in srcArray[i] representing EMPTY and NULL values
-    T EMPTY_VALUE = getEmptyValue<T>(dataType);
-    T NULL_VALUE  = getNullValue<T>(dataType);
+    T emptyValue = getEmptyValue<T>(dataType);
+    T nullValue  = getNullValue<T>(dataType);

    // Precompute filter results for NULL values
-    bool isNullValueMatches = matchingColValue<KIND, COL_WIDTH, true>(NULL_VALUE, columnFilterMode,
-        filterSet, filterCount, filterCOPs, filterValues, filterRFs, in->colType, NULL_VALUE);
+    bool isNullValueMatches = matchingColValue<KIND, WIDTH, true>(nullValue, columnFilterMode,
+        filterSet, filterCount, filterCOPs, filterValues, filterRFs, in->colType, nullValue);

+    // ###########################
    // Boolean indicating whether to capture the min and max values
-    bool ValidMinMax = isMinMaxValid(in);
-    // Local vars to capture the min and max values
+    bool validMinMax = isMinMaxValid(in);
    T Min = datatypes::numeric_limits<T>::max();
    T Max = (KIND == KIND_UNSIGNED) ? 0 : datatypes::numeric_limits<T>::min();

-/* WIP add vertical processing
-    // If possible, use faster "vertical" filtering approach
-    if (KIND != KIND_TEXT)
+    // Vectorized scanning/filtering for all numerics except float/double types.
+    // If the total number of input values can't fill a vector the vector path
+    // applies scalar filtering.
+    // Syscat queries mustn't follow vectorized processing path b/c PP must return
+    // all values w/o any filter(even empty values filter) applied.
+
+#if defined(__x86_64__ )
+    // Use vectorized filtering only for integer-based data types narrower than 16 bytes.
+    if (KIND < KIND_FLOAT && WIDTH < 16)
    {
        bool canUseFastFiltering = true;
-        for (int i = 0; i < filterCount; ++i)
+        for (uint32_t i = 0; i < filterCount; ++i)
            if (filterRFs[i] != 0)
            canUseFastFiltering = false;

        if (canUseFastFiltering)
        {
-            processArray<T, KIND, T>(srcArray, srcSize, ridArray, ridSize,
-                         in->BOP, filterSet, filterCount, filterCOPs, filterValues,
-                         reinterpret_cast<uint8_t*>(out) + *written,
-                         written, & out->NVALS, & out->RidFlags,
-                         (outputType & OT_RID) != 0,
-                         (outputType & (OT_TOKEN | OT_DATAVALUE)) != 0,
-                         (outputType & OT_RID) != 0,  //TODO: check correctness of this condition for SKIP_EMPTY_VALUES
-                         EMPTY_VALUE,
-                         isNullValueMatches, NULL_VALUE,
-                         ValidMinMax, &Min, &Max);
+            vectorizedFilteringDispatcher<T, KIND, FT, ST>(in, out, srcArray, srcSize, ridArray, ridSize,
+                                                           parsedColumnFilter.get(), validMinMax, emptyValue,
+                                                           nullValue, Min, Max, isNullValueMatches);
            return;
        }
    }
-*/
-
-    // Loop-local variables
-    T curValue = 0;
-    uint16_t rid = 0;
-    bool isEmpty = false;
-
-    // Loop over the column values, storing those matching the filter, and updating the min..max range
-    for (uint32_t i = 0;
-         nextColValue<T, COL_WIDTH>(curValue, &isEmpty,
-                                    &i, &rid,
-                                    srcArray, srcSize, ridArray, ridSize,
-                                    outputType, EMPTY_VALUE); )
-    {
-        if (isEmpty)
-            continue;
-        else if (isNullValue<KIND,T>(curValue, NULL_VALUE))
-        {
-            // If NULL values match the filter, write curValue to the output buffer
-            if (isNullValueMatches)
-                writeColValue<T>(outputType, out, rid, srcArray);
-        }
-        else
-        {
-            // If curValue matches the filter, write it to the output buffer
-            if (matchingColValue<KIND, COL_WIDTH, false>(curValue, columnFilterMode, filterSet, filterCount,
-                                filterCOPs, filterValues, filterRFs, in->colType, NULL_VALUE))
-            {
-                writeColValue<T>(outputType, out, rid, srcArray);
-            }
-
-            // Update Min and Max if necessary.  EMPTY/NULL values are processed in other branches.
-            if (ValidMinMax)
-                updateMinMax<KIND>(Min, Max, curValue, in);
-        }
-    }
-
-
-    // Write captured Min/Max values to *out
-    out->ValidMinMax = ValidMinMax;
-    if (ValidMinMax)
-    {
-        out->Min = Min;
-        out->Max = Max;
-    }
+#endif
+    uint32_t initialRID = 0;
+    scalarFiltering<T, FT, ST, KIND>(in, out, columnFilterMode, filterSet, filterCount, filterCOPs,
+                                     filterValues, filterRFs, in->colType, srcArray, srcSize, ridArray,
+                                     ridSize, initialRID, outputType, validMinMax, emptyValue, nullValue,
+                                     Min, Max, isNullValueMatches);
 } // end of filterColumnData

 } //namespace anon
--- a/primitives/linux-port/primitiveprocessor.cpp
+++ b/primitives/linux-port/primitiveprocessor.cpp
@ -62,8 +62,8 @@ ParsedColumnFilter::ParsedColumnFilter() : columnFilterMode(ALWAYS_TRUE), mFilte
 {
 }

-ParsedColumnFilter::ParsedColumnFilter(const uint32_t aFilterCount)
-    : columnFilterMode(ALWAYS_TRUE), mFilterCount(aFilterCount)
+ParsedColumnFilter::ParsedColumnFilter(const uint32_t aFilterCount, const int BOP)
+    : columnFilterMode(ALWAYS_TRUE), mFilterCount(aFilterCount), mBOP(BOP)
 {
    prestored_rfs.reset(new uint8_t[mFilterCount]);
    prestored_cops.reset(new uint8_t[mFilterCount]);
--- a/primitives/linux-port/primitiveprocessor.h
+++ b/primitives/linux-port/primitiveprocessor.h
@ -165,17 +165,19 @@ struct IntegralTypeToFilterSetType<int128_t>
 class ParsedColumnFilter
 {
  public:
-    static constexpr uint32_t noSetFilterThreshold = 8; 
+    using CopsType = uint8_t;
+    using RFsType = uint8_t;
+    static constexpr uint32_t noSetFilterThreshold = 8;
    ColumnFilterMode columnFilterMode;
    boost::shared_array<int64_t> prestored_argVals;
    boost::shared_array<int128_t> prestored_argVals128;
-    boost::shared_array<uint8_t> prestored_cops;
+    boost::shared_array<CopsType> prestored_cops;
    boost::shared_array<uint8_t> prestored_rfs;
    boost::shared_ptr<prestored_set_t> prestored_set;
    boost::shared_ptr<prestored_set_t_128> prestored_set_128;

    ParsedColumnFilter();
-    ParsedColumnFilter(const uint32_t aFilterCount);
+    ParsedColumnFilter(const uint32_t aFilterCount, const int BOP);
    ~ParsedColumnFilter();

    template<typename T,
@ -259,8 +261,19 @@ class ParsedColumnFilter
                prestored_set->insert(prestored_argVals[argIndex]);
    }

+    // Accessor for the mBOP member.
+    inline int getBOP() const
+    {
+        return this->mBOP;
+    }
+
+    // Accessor for the mFilterCount member. The member is a uint32_t, so the
+    // conversion to the int return type is spelled out explicitly.
+    inline int getFilterCount() const
+    {
+        return static_cast<int>(mFilterCount);
+    }
+
  private:
    uint32_t mFilterCount;
+    int mBOP;
 };

 //@bug 1828 These need to be public so that column operations can use it for 'like'
@ -400,7 +413,6 @@ public:
    template<typename T,
             typename std::enable_if<sizeof(T) == sizeof(int64_t), T>::type* = nullptr>
    void scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out);
-    
    template<typename T,
             typename std::enable_if<sizeof(T) <= sizeof(int64_t), T>::type* = nullptr>
    void _scanAndFilterTypeDispatcher(NewColRequestHeader* in, ColResultHeader* out);
@ -433,7 +445,7 @@ public:
 //	void p_ColAggregate(const NewColAggRequestHeader *in, NewColAggResultHeader *out);

    void p_Dictionary(const DictInput* in, std::vector<uint8_t>* out,
-                      bool skipNulls, uint32_t charsetNumber, 
+                      bool skipNulls, uint32_t charsetNumber,
                      boost::shared_ptr<DictEqualityFilter> eqFilter,
                      uint8_t eqOp);

@ -492,7 +504,7 @@ boost::shared_ptr<ParsedColumnFilter> _parseColumnFilter(

    // Allocate the compiled filter structure with space for filterCount filters.
    // No need to init arrays since they will be filled on the fly.
-    ret.reset(new ParsedColumnFilter(filterCount));
+    ret.reset(new ParsedColumnFilter(filterCount, BOP));
    ret->allocateSpaceForFilterArgs<T>();

    // Choose initial filter mode based on operation and number of filter elements
--- a/primitives/primproc/columncommand.cpp
+++ b/primitives/primproc/columncommand.cpp
@ -168,12 +168,11 @@ void ColumnCommand::_loadData()


    _mask = mask;
-// 	primMsg->RidFlags = 0xffff;   // disables selective block loading
-    //cout <<__FILE__ << "::issuePrimitive() o: " << getOID() << " l:" << primMsg->LBID << " ll: " << oidLastLbid << endl;
+ 	//primMsg->RidFlags = 0xffff;   // disables selective block loading
+    //cerr << "::ColumnCommand::_loadData OID " << getOID() << " l:" << primMsg->LBID << " ll: " << oidLastLbid << " primMsg->RidFlags " << primMsg->RidFlags << endl;

    for (i = 0; i < W; ++i, _mask <<= shift)
    {
-
        if ((!lastBlockReached && _isScan) || (!_isScan && primMsg->RidFlags & _mask))
        {
            lbids[blocksToLoad] = primMsg->LBID + i;
@ -397,7 +396,6 @@ void ColumnCommand::_process_OT_BOTH()
 {
    using T = typename datatypes::WidthToSIntegralType<W>::type;
    bpp->ridCount = outMsg->NVALS;
-    bpp->ridCount = outMsg->NVALS;
    bpp->ridMap = outMsg->RidFlags;
    uint8_t* outPtr = reinterpret_cast<uint8_t*>(&outMsg[1]);
    auto* ridPos = primitives::getRIDArrayPosition(outPtr, 0);
--- a/primitives/primproc/dictstep.cpp
+++ b/primitives/primproc/dictstep.cpp
@ -154,7 +154,7 @@ void DictStep::issuePrimitive(bool isFilter)

    if (!(primMsg->LBID & 0x8000000000000000LL))
    {
-        //cout << "DS issuePrimitive lbid: " << (uint64_t)primMsg->LBID << endl;
+        //std::cerr << "DS issuePrimitive lbid: " << (uint64_t)primMsg->LBID << endl;
        primitiveprocessor::loadBlock(primMsg->LBID,
                                      bpp->versionInfo,
                                      bpp->txnID,
@ -577,7 +577,7 @@ void DictStep::_projectToRG(RowGroup& rg, uint32_t col)
            for (i = curResultCounter; i < tmpResultCounter; i++)
            {
                rg.getRow(newRidList[i].pos, &r);
-                //cout << "serializing " << tmpStrings[i] << endl;
+                //std::cerr << "serializing " << tmpStrings[i] << endl;
                r.setStringField(tmpStrings[i].getConstString(), col);
            }
        }