MCOL-987 Add LZ4 compression.

* Adds CompressInterfaceLZ4, which uses the LZ4 API for compress/uncompress.
* Adds CMake machinery to search for LZ4 on the running host.
* All methods which use static data and do not modify any internal data become `static`, so we can use them without creating a specific object. This is possible because the header specification has not been modified. We still use 2 sections in the header: the first one with file metadata, the second one with pointers to compressed chunks.
* Methods `compress`, `uncompress`, `maxCompressedSize`, `getUncompressedSize` become pure virtual, so we can override them for other compression algorithms.
* Adds the method `getChunkMagicNumber`, so we can verify the chunk magic number for each compression algorithm.
* Renames `IDBCompressInterface` to `CompressInterface` ("s/IDBCompressInterface/CompressInterface/g"), as required.
2025-08-08 14:22:09 +03:00 · 2021-04-01 17:26:38 +03:00
parent dd12bd3cd0
commit cc1c3629c5
45 changed files with 1311 additions and 549 deletions
--- a/utils/joiner/joinpartition.cpp
+++ b/utils/joiner/joinpartition.cpp
@@ -50,7 +50,10 @@ namespace joiner

 uint64_t uniqueNums = 0;

-JoinPartition::JoinPartition() { }
+JoinPartition::JoinPartition()
+{
+    compressor.reset(new compress::CompressInterfaceSnappy());
+}

 /* This is the ctor used by THJS */
 JoinPartition::JoinPartition(const RowGroup& lRG,
@@ -103,6 +106,22 @@ JoinPartition::JoinPartition(const RowGroup& lRG,

    for (int i = 0; i < (int) bucketCount; i++)
        buckets.push_back(boost::shared_ptr<JoinPartition>(new JoinPartition(*this, false)));
+
+    string compressionType;
+    try
+    {
+        compressionType =
+            config->getConfig("HashJoin", "TempFileCompressionType");
+    } catch (...) {}
+
+    if (compressionType == "LZ4")
+    {
+        compressor.reset(new compress::CompressInterfaceLZ4());
+    }
+    else
+    {
+        compressor.reset(new compress::CompressInterfaceSnappy());
+    }
 }

 /* Ctor used by JoinPartition on expansion, creates JP's in filemode */
@@ -151,6 +170,8 @@ JoinPartition::JoinPartition(const JoinPartition& jp, bool splitMode) :
    smallRG.setData(&buffer);
    smallRG.resetRowGroup(0);
    smallRG.getRow(0, &smallRow);
+
+    compressor = jp.compressor;
 }


@@ -694,6 +715,7 @@ void JoinPartition::readByteStream(int which, ByteStream* bs)

    fs.seekg(offset);
    fs.read((char*) &len, sizeof(len));
+
    saveErrno = errno;

    if (!fs)
@@ -735,12 +757,14 @@ void JoinPartition::readByteStream(int which, ByteStream* bs)
    else
    {
        size_t uncompressedSize;
+        fs.read((char*) &uncompressedSize, sizeof(uncompressedSize));
+
        boost::scoped_array<char> buf(new char[len]);

        fs.read(buf.get(), len);
        saveErrno = errno;

-        if (!fs)
+        if (!fs || !uncompressedSize)
        {
            fs.close();
            ostringstream os;
@@ -749,9 +773,9 @@ void JoinPartition::readByteStream(int which, ByteStream* bs)
        }

        totalBytesRead += len;
-        compressor.getUncompressedSize(buf.get(), len, &uncompressedSize);
        bs->needAtLeast(uncompressedSize);
-        compressor.uncompress(buf.get(), len, (char*) bs->getInputPtr());
+        compressor->uncompress(buf.get(), len, (char*) bs->getInputPtr(),
+                               &uncompressedSize);
        bs->advanceInputPtr(uncompressedSize);
    }

@@ -801,13 +825,15 @@ uint64_t JoinPartition::writeByteStream(int which, ByteStream& bs)
    }
    else
    {
-        uint64_t maxSize = compressor.maxCompressedSize(len);
-        size_t actualSize;
+        size_t maxSize = compressor->maxCompressedSize(len);
+        size_t actualSize = maxSize;
        boost::scoped_array<uint8_t> compressed(new uint8_t[maxSize]);

-        compressor.compress((char*) bs.buf(), len, (char*) compressed.get(), &actualSize);
-        ret = actualSize + 4;
+        compressor->compress((char*) bs.buf(), len, (char*) compressed.get(), &actualSize);
+        ret = actualSize + 4 + 8; // sizeof (size_t) == 8. Why 4?
        fs.write((char*) &actualSize, sizeof(actualSize));
+        // Save uncompressed len.
+        fs.write((char*) &len, sizeof(len));
        fs.write((char*) compressed.get(), actualSize);
        saveErrno = errno;