You've already forked mariadb-columnstore-engine
							
							
				mirror of
				https://github.com/mariadb-corporation/mariadb-columnstore-engine.git
				synced 2025-11-03 17:13:17 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			379 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			379 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/* Copyright (C) 2014 InfiniDB, Inc.
 | 
						|
 | 
						|
   This program is free software; you can redistribute it and/or
 | 
						|
   modify it under the terms of the GNU General Public License
 | 
						|
   as published by the Free Software Foundation; version 2 of
 | 
						|
   the License.
 | 
						|
 | 
						|
   This program is distributed in the hope that it will be useful,
 | 
						|
   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
   GNU General Public License for more details.
 | 
						|
 | 
						|
   You should have received a copy of the GNU General Public License
 | 
						|
   along with this program; if not, write to the Free Software
 | 
						|
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 | 
						|
   MA 02110-1301, USA. */
 | 
						|
 | 
						|
#ifndef TJOINER_H_
 | 
						|
#define TJOINER_H_
 | 
						|
 | 
						|
#include <iostream>
 | 
						|
#include <vector>
 | 
						|
#include <boost/shared_ptr.hpp>
 | 
						|
#include <boost/scoped_ptr.hpp>
 | 
						|
#include <boost/shared_array.hpp>
 | 
						|
#include <boost/scoped_array.hpp>
 | 
						|
#ifdef _MSC_VER
 | 
						|
#include <unordered_map>
 | 
						|
#else
 | 
						|
#include <tr1/unordered_map>
 | 
						|
#endif
 | 
						|
 | 
						|
#include "rowgroup.h"
 | 
						|
#include "joiner.h"
 | 
						|
#include "fixedallocator.h"
 | 
						|
#include "joblisttypes.h"
 | 
						|
#include "../funcexp/funcexpwrapper.h"
 | 
						|
#include "stlpoolallocator.h"
 | 
						|
#include "hasher.h"
 | 
						|
 | 
						|
namespace joiner
 | 
						|
{
 | 
						|
 | 
						|
/* Reverses the byte order of a 64-bit value: the least significant byte of
 * x becomes the most significant byte of the result, and so on.
 *
 * Done in three stages, each swapping adjacent groups of half the previous
 * width: the two 32-bit halves, then the 16-bit words inside each half, then
 * the bytes inside each word.
 */
inline uint64_t order_swap(uint64_t x)
{
    // swap the two 32-bit halves
    x = (x >> 32) | (x << 32);

    // swap the 16-bit words within each 32-bit half
    x = ((x & 0xFFFF0000FFFF0000ULL) >> 16) |
        ((x & 0x0000FFFF0000FFFFULL) << 16);

    // swap the bytes within each 16-bit word
    x = ((x & 0xFF00FF00FF00FF00ULL) >> 8) |
        ((x & 0x00FF00FF00FF00FFULL) << 8);

    return x;
}
 | 
						|
 | 
						|
class TypelessData
 | 
						|
{
 | 
						|
public:
 | 
						|
    uint8_t* data;
 | 
						|
    uint32_t len;
 | 
						|
 | 
						|
    TypelessData() : data(NULL), len(0) { }
 | 
						|
    inline bool operator==(const TypelessData&) const;
 | 
						|
    void serialize(messageqcpp::ByteStream&) const;
 | 
						|
    void deserialize(messageqcpp::ByteStream&, utils::FixedAllocator&);
 | 
						|
    void deserialize(messageqcpp::ByteStream&, utils::PoolAllocator&);
 | 
						|
    std::string toString() const;
 | 
						|
};
 | 
						|
 | 
						|
inline bool TypelessData::operator==(const TypelessData& t) const
 | 
						|
{
 | 
						|
    if (len != t.len)
 | 
						|
        return false;
 | 
						|
 | 
						|
    if (len == 0)  // special value to force mismatches
 | 
						|
        return false;
 | 
						|
 | 
						|
    return (memcmp(data, t.data, len) == 0);
 | 
						|
}
 | 
						|
 | 
						|
/* This function makes the keys for string & compound joins.  The length of the
 | 
						|
 * key is limited by keylen.  Keys that are longer are assigned a length of 0 on return,
 | 
						|
 * signifying that it shouldn't match anything.
 | 
						|
 */
 | 
						|
extern TypelessData makeTypelessKey(const rowgroup::Row&,
 | 
						|
                                    const std::vector<uint32_t>&, uint32_t keylen, utils::FixedAllocator* fa);
 | 
						|
extern TypelessData makeTypelessKey(const rowgroup::Row&,
 | 
						|
                                    const std::vector<uint32_t>&, utils::PoolAllocator* fa);
 | 
						|
extern uint64_t getHashOfTypelessKey(const rowgroup::Row&, const std::vector<uint32_t>&,
 | 
						|
                                     uint32_t seed = 0);
 | 
						|
 | 
						|
 | 
						|
class TupleJoiner
 | 
						|
{
 | 
						|
public:
 | 
						|
    struct hasher
 | 
						|
    {
 | 
						|
        inline size_t operator()(int64_t val) const
 | 
						|
        {
 | 
						|
            return fHasher((char*) &val, 8);
 | 
						|
        }
 | 
						|
        inline size_t operator()(const TypelessData& e) const
 | 
						|
        {
 | 
						|
            return fHasher((char*) e.data, e.len);
 | 
						|
        }
 | 
						|
 | 
						|
    private:
 | 
						|
        utils::Hasher fHasher;
 | 
						|
    };
 | 
						|
 | 
						|
    /* ctor to use for numeric join */
 | 
						|
    TupleJoiner(
 | 
						|
        const rowgroup::RowGroup& smallInput,
 | 
						|
        const rowgroup::RowGroup& largeInput,
 | 
						|
        uint32_t smallJoinColumn,
 | 
						|
        uint32_t largeJoinColumn,
 | 
						|
        joblist::JoinType jt);
 | 
						|
 | 
						|
    /* ctor to use for string & compound join */
 | 
						|
    TupleJoiner(
 | 
						|
        const rowgroup::RowGroup& smallInput,
 | 
						|
        const rowgroup::RowGroup& largeInput,
 | 
						|
        const std::vector<uint32_t>& smallJoinColumns,
 | 
						|
        const std::vector<uint32_t>& largeJoinColumns,
 | 
						|
        joblist::JoinType jt);
 | 
						|
 | 
						|
    ~TupleJoiner();
 | 
						|
 | 
						|
    size_t size() const;
 | 
						|
    void insert(rowgroup::Row& r, bool zeroTheRid = true);
 | 
						|
    void doneInserting();
 | 
						|
 | 
						|
    /* match() returns the small-side rows that match the large-side row.
 | 
						|
    	On a UM join, it uses largeSideRow,
 | 
						|
    	on a PM join, it uses index and threadID.
 | 
						|
    */
 | 
						|
    void match(rowgroup::Row& largeSideRow, uint32_t index, uint32_t threadID,
 | 
						|
               std::vector<rowgroup::Row::Pointer>* matches);
 | 
						|
 | 
						|
    /* On a PM left outer join + aggregation, the result is already complete.
 | 
						|
    	No need to match, just mark.
 | 
						|
    */
 | 
						|
    void markMatches(uint32_t threadID, uint32_t rowCount);
 | 
						|
 | 
						|
    /* For small outer joins, this is how matches are marked now. */
 | 
						|
    void markMatches(uint32_t threadID, const std::vector<rowgroup::Row::Pointer>& matches);
 | 
						|
 | 
						|
    /* Some accessors */
 | 
						|
    inline bool inPM() const
 | 
						|
    {
 | 
						|
        return joinAlg == PM;
 | 
						|
    }
 | 
						|
    inline bool inUM() const
 | 
						|
    {
 | 
						|
        return joinAlg == UM;
 | 
						|
    }
 | 
						|
    void setInPM();
 | 
						|
    void setInUM();
 | 
						|
    void setThreadCount(uint32_t cnt);
 | 
						|
    void setPMJoinResults(boost::shared_array<std::vector<uint32_t> >,
 | 
						|
                          uint32_t threadID);
 | 
						|
    boost::shared_array<std::vector<uint32_t> > getPMJoinArrays(uint32_t threadID);
 | 
						|
    std::vector<rowgroup::Row::Pointer>* getSmallSide()
 | 
						|
    {
 | 
						|
        return &rows;
 | 
						|
    }
 | 
						|
    inline bool smallOuterJoin()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::SMALLOUTER) != 0);
 | 
						|
    }
 | 
						|
    inline bool largeOuterJoin()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::LARGEOUTER) != 0);
 | 
						|
    }
 | 
						|
    inline bool innerJoin()
 | 
						|
    {
 | 
						|
        return joinType == joblist::INNER;
 | 
						|
    }
 | 
						|
    inline bool fullOuterJoin()
 | 
						|
    {
 | 
						|
        return (smallOuterJoin() && largeOuterJoin());
 | 
						|
    }
 | 
						|
    inline joblist::JoinType getJoinType()
 | 
						|
    {
 | 
						|
        return joinType;
 | 
						|
    }
 | 
						|
    inline const rowgroup::RowGroup& getSmallRG()
 | 
						|
    {
 | 
						|
        return smallRG;
 | 
						|
    }
 | 
						|
    inline const rowgroup::RowGroup& getLargeRG()
 | 
						|
    {
 | 
						|
        return largeRG;
 | 
						|
    }
 | 
						|
    inline uint32_t getSmallKeyColumn()
 | 
						|
    {
 | 
						|
        return smallKeyColumns[0];
 | 
						|
    }
 | 
						|
    inline uint32_t getLargeKeyColumn()
 | 
						|
    {
 | 
						|
        return largeKeyColumns[0];
 | 
						|
    }
 | 
						|
    bool hasNullJoinColumn(const rowgroup::Row& largeRow) const;
 | 
						|
    void getUnmarkedRows(std::vector<rowgroup::Row::Pointer>* out);
 | 
						|
    std::string getTableName() const;
 | 
						|
    void setTableName(const std::string& tname);
 | 
						|
 | 
						|
    /* To allow sorting */
 | 
						|
    bool operator<(const TupleJoiner&) const;
 | 
						|
 | 
						|
    uint64_t getMemUsage() const;
 | 
						|
 | 
						|
    /* Typeless join interface */
 | 
						|
    inline bool isTypelessJoin()
 | 
						|
    {
 | 
						|
        return typelessJoin;
 | 
						|
    }
 | 
						|
    inline bool isSignedUnsignedJoin()
 | 
						|
    {
 | 
						|
        return bSignedUnsignedJoin;
 | 
						|
    }
 | 
						|
    inline const std::vector<uint32_t>& getSmallKeyColumns()
 | 
						|
    {
 | 
						|
        return smallKeyColumns;
 | 
						|
    }
 | 
						|
    inline const std::vector<uint32_t>& getLargeKeyColumns()
 | 
						|
    {
 | 
						|
        return largeKeyColumns;
 | 
						|
    }
 | 
						|
    inline uint32_t getKeyLength()
 | 
						|
    {
 | 
						|
        return keyLength;
 | 
						|
    }
 | 
						|
 | 
						|
    /* Runtime casual partitioning support */
 | 
						|
    inline const boost::scoped_array<bool>& discreteCPValues()
 | 
						|
    {
 | 
						|
        return discreteValues;
 | 
						|
    }
 | 
						|
    inline const boost::scoped_array<std::vector<int64_t> >& getCPData()
 | 
						|
    {
 | 
						|
        return cpValues;
 | 
						|
    }
 | 
						|
    inline void setUniqueLimit(uint32_t limit)
 | 
						|
    {
 | 
						|
        uniqueLimit = limit;
 | 
						|
    }
 | 
						|
 | 
						|
    /* Semi-join interface */
 | 
						|
    inline bool semiJoin()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::SEMI) != 0);
 | 
						|
    }
 | 
						|
    inline bool antiJoin()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::ANTI) != 0);
 | 
						|
    }
 | 
						|
    inline bool scalar()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::SCALAR) != 0);
 | 
						|
    }
 | 
						|
    inline bool matchnulls()
 | 
						|
    {
 | 
						|
        return ((joinType & joblist::MATCHNULLS) != 0);
 | 
						|
    }
 | 
						|
    inline bool hasFEFilter()
 | 
						|
    {
 | 
						|
        return fe.get();
 | 
						|
    }
 | 
						|
    inline boost::shared_ptr<funcexp::FuncExpWrapper> getFcnExpFilter()
 | 
						|
    {
 | 
						|
        return fe;
 | 
						|
    }
 | 
						|
    void setFcnExpFilter(boost::shared_ptr<funcexp::FuncExpWrapper> fe);
 | 
						|
    inline bool evaluateFilter(rowgroup::Row& r, uint32_t index)
 | 
						|
    {
 | 
						|
        return fes[index].evaluate(&r);
 | 
						|
    }
 | 
						|
    inline uint64_t getJoinNullValue()
 | 
						|
    {
 | 
						|
        return joblist::BIGINTNULL;    // a normalized NULL value
 | 
						|
    }
 | 
						|
    inline uint64_t smallNullValue()
 | 
						|
    {
 | 
						|
        return nullValueForJoinColumn;
 | 
						|
    }
 | 
						|
 | 
						|
    // Disk-based join support
 | 
						|
    void clearData();
 | 
						|
    boost::shared_ptr<TupleJoiner> copyForDiskJoin();
 | 
						|
    bool isFinished()
 | 
						|
    {
 | 
						|
        return finished;
 | 
						|
    }
 | 
						|
 | 
						|
private:
 | 
						|
    typedef std::tr1::unordered_multimap<int64_t, uint8_t*, hasher, std::equal_to<int64_t>,
 | 
						|
            utils::STLPoolAllocator<std::pair<const int64_t, uint8_t*> > > hash_t;
 | 
						|
    typedef std::tr1::unordered_multimap<int64_t, rowgroup::Row::Pointer, hasher, std::equal_to<int64_t>,
 | 
						|
            utils::STLPoolAllocator<std::pair<const int64_t, rowgroup::Row::Pointer> > > sthash_t;
 | 
						|
    typedef std::tr1::unordered_multimap<TypelessData, rowgroup::Row::Pointer, hasher, std::equal_to<TypelessData>,
 | 
						|
            utils::STLPoolAllocator<std::pair<const TypelessData, rowgroup::Row::Pointer> > > typelesshash_t;
 | 
						|
 | 
						|
    typedef hash_t::iterator iterator;
 | 
						|
    typedef typelesshash_t::iterator thIterator;
 | 
						|
 | 
						|
    TupleJoiner();
 | 
						|
    TupleJoiner(const TupleJoiner&);
 | 
						|
    TupleJoiner& operator=(const TupleJoiner&);
 | 
						|
 | 
						|
    iterator begin()
 | 
						|
    {
 | 
						|
        return h->begin();
 | 
						|
    }
 | 
						|
    iterator end()
 | 
						|
    {
 | 
						|
        return h->end();
 | 
						|
    }
 | 
						|
 | 
						|
 | 
						|
    rowgroup::RGData smallNullMemory;
 | 
						|
 | 
						|
 | 
						|
    boost::scoped_ptr<hash_t> h;  // used for UM joins on ints
 | 
						|
    boost::scoped_ptr<sthash_t> sth;  // used for UM join on ints where the backing table uses a string table
 | 
						|
    std::vector<rowgroup::Row::Pointer> rows;   // used for PM join
 | 
						|
 | 
						|
    /* This struct is rough.  The BPP-JL stores the parsed results for
 | 
						|
    the logical block being processed.  There are X threads at once, so
 | 
						|
    up to X logical blocks being processed.  For each of those there's a vector
 | 
						|
    of matches.  Each match is an index into 'rows'. */
 | 
						|
    boost::shared_array<boost::shared_array<std::vector<uint32_t> > > pmJoinResults;
 | 
						|
    rowgroup::RowGroup smallRG, largeRG;
 | 
						|
    boost::scoped_array<rowgroup::Row> smallRow;
 | 
						|
    //boost::shared_array<uint8_t> smallNullMemory;
 | 
						|
    rowgroup::Row smallNullRow;
 | 
						|
 | 
						|
    enum JoinAlg
 | 
						|
    {
 | 
						|
        INSERTING,
 | 
						|
        PM,
 | 
						|
        UM,
 | 
						|
        LARGE
 | 
						|
    };
 | 
						|
    JoinAlg joinAlg;
 | 
						|
    joblist::JoinType joinType;
 | 
						|
    boost::shared_ptr<utils::PoolAllocator> _pool;	// pool for the table and nodes
 | 
						|
    uint32_t threadCount;
 | 
						|
    std::string tableName;
 | 
						|
 | 
						|
    /* vars, & fcns for typeless join */
 | 
						|
    bool typelessJoin;
 | 
						|
    std::vector<uint32_t> smallKeyColumns, largeKeyColumns;
 | 
						|
    boost::scoped_ptr<typelesshash_t> ht;  // used for UM join on strings
 | 
						|
    uint32_t keyLength;
 | 
						|
    utils::FixedAllocator storedKeyAlloc;
 | 
						|
    boost::scoped_array<utils::FixedAllocator> tmpKeyAlloc;
 | 
						|
    bool bSignedUnsignedJoin; // Set if we have a signed vs unsigned compare in a join. When not set, we can save checking for the signed bit.
 | 
						|
 | 
						|
    /* semi-join vars & fcns */
 | 
						|
    boost::shared_ptr<funcexp::FuncExpWrapper> fe;
 | 
						|
    boost::scoped_array<funcexp::FuncExpWrapper> fes;  // holds X copies of fe, one per thread
 | 
						|
    // this var is only used to normalize the NULL values for single-column joins,
 | 
						|
    // will have to change when/if we need to support that for compound or string joins
 | 
						|
    int64_t nullValueForJoinColumn;
 | 
						|
 | 
						|
    /* Runtime casual partitioning support */
 | 
						|
    void updateCPData(const rowgroup::Row& r);
 | 
						|
    boost::scoped_array<bool> discreteValues;
 | 
						|
    boost::scoped_array<std::vector<int64_t> > cpValues;    // if !discreteValues, [0] has min, [1] has max
 | 
						|
    uint32_t uniqueLimit;
 | 
						|
    bool finished;
 | 
						|
};
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
#endif
 | 
						|
 |