
MCOL-641 Work of Ivan Zuniga on basic read and write support for Binary16

Gagan Goel
2019-10-24 14:01:47 -04:00
committed by Roman Nozdrin
parent d943beb445
commit 32f6167067
48 changed files with 1114 additions and 75 deletions

View File

@@ -91,7 +91,8 @@ CPPUNIT_TEST(setUp);
// Extent & dict related testing
CPPUNIT_TEST( testExtensionWOPrealloc );
CPPUNIT_TEST( testDictExtensionWOPrealloc );
CPPUNIT_TEST( testExtentCrWOPreallocBin );
// Semaphore related testing
// CPPUNIT_TEST( testSem );
// Log related testing
@@ -1542,7 +1543,180 @@ public:
}
*/
template<uint8_t W> struct binary;
typedef binary<16> binary16;
typedef binary<32> binary32;
template<uint8_t W>
struct binary {
unsigned char data[W]; // May be ok for empty value ?
void operator=(uint64_t v) {*((uint64_t *) data) = v; memset(data + 8, 0, W - 8);}
inline uint8_t& operator[](const int index) {return *((uint8_t*) (data + index));}
inline uint64_t& uint64(const int index) {return *((uint64_t*) (data + (index << 3)));}
};
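// Usage sketch (assumed values, little-endian host): the 16 bytes of a
// binary16 are addressed as two 64-bit lanes, e.g.
//   binary16 b;
//   b = 42;          // operator= fills bytes 0..7 with 42 and zeroes bytes 8..15
//   b.uint64(0);     // low lane  -> 42
//   b.uint64(1);     // high lane -> 0
//   b[0];            // byte-level view via operator[] -> 0x2A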
void testExtentCrWOPreallocBin() {
IDBDataFile* pFile = NULL;
ColumnOpCompress1 fileOp;
BlockOp blockOp;
char fileName[20];
int rc;
char hdrs[ IDBCompressInterface::HDR_BUF_LEN * 2 ];
int dbRoot = 1;
idbdatafile::IDBPolicy::init(true, false, "", 0);
// Set to versionbuffer to satisfy IDBPolicy::getType
strcpy(fileName, "versionbuffer");
fileOp.compressionType(1);
fileOp.deleteFile(fileName);
CPPUNIT_ASSERT(fileOp.exists(fileName) == false);
//binary16 emptyVal = blockOp.getEmptyBinRowValue( execplan::CalpontSystemCatalog::BINARY, 16 );
uint64_t emptyVal = blockOp.getEmptyRowValue(execplan::CalpontSystemCatalog::BIGINT, 8);
int width = blockOp.getCorrectRowWidth(execplan::CalpontSystemCatalog::BINARY, sizeof (binary16));
int nBlocks = INITIAL_EXTENT_ROWS_TO_DISK / BYTE_PER_BLOCK * width;
// createFile runs IDBDataFile::open + initAbrevCompColumnExtent
// under the hood
// bigint column file
rc = fileOp.createFile(fileName,
nBlocks, // number of blocks
emptyVal, // NULL value
width, // width
dbRoot); // dbroot
CPPUNIT_ASSERT(rc == NO_ERROR);
fileOp.closeFile(pFile);
// open created compressed file and check its header
pFile = IDBDataFile::open(IDBPolicy::getType(fileName,
IDBPolicy::WRITEENG), fileName, "rb", dbRoot);
rc = pFile->seek(0, 0);
CPPUNIT_ASSERT(rc == NO_ERROR);
rc = fileOp.readHeaders(pFile, hdrs);
CPPUNIT_ASSERT(rc == NO_ERROR);
// Couldn't use IDBDataFile->close() here w/o explicit cast
fileOp.closeFile(pFile);
// Extend the extent up to 64MB
pFile = IDBDataFile::open(IDBPolicy::getType(fileName,
IDBPolicy::WRITEENG), fileName, "rb", dbRoot);
// disable disk space preallocation
idbdatafile::IDBPolicy::setPreallocSpace(dbRoot);
rc = fileOp.initColumnExtent(pFile,
dbRoot,
BYTE_PER_BLOCK - nBlocks, // number of blocks
emptyVal,
width,
false, // use existing file
false, // don't expand; new extent
false, // add full (not abbreviated) extent
true); // optimize extension
CPPUNIT_ASSERT(rc == NO_ERROR);
fileOp.closeFile(pFile);
// file has been extended
cout << endl << "file has been extended";
// write up to INITIAL_EXTENT_ROWS_TO_DISK + 1 rows into the file
Column curCol;
binary16 valArray[INITIAL_EXTENT_ROWS_TO_DISK + 1];
RID rowIdArray[INITIAL_EXTENT_ROWS_TO_DISK + 1];
// This is the magic for the stub in FileOp::oid2FileName
int fid = 42;
for (uint64_t it = 0; it <= INITIAL_EXTENT_ROWS_TO_DISK; it++) {
rowIdArray[it] = it;
valArray[it].uint64(0) = it + 3;
valArray[it].uint64(1) = it + 5;
}
fileOp.initColumn(curCol);
fileOp.setColParam(curCol,
1, // column number
width,
execplan::CalpontSystemCatalog::BINARY,
WriteEngine::WR_BINARY,
fid,
1); //compression type
string segFile;
// openColumnFile uses DBRM's oid server but we
// have to get the chunks' pointers from the header.
curCol.dataFile.pFile = fileOp.openFile(
curCol,
dbRoot,
0,
0,
segFile,
false,
"r+b",
BYTE_PER_BLOCK * BYTE_PER_BLOCK); // buffer size is 64MB
CPPUNIT_ASSERT(curCol.dataFile.pFile != NULL);
rc = fileOp.writeRow(curCol, INITIAL_EXTENT_ROWS_TO_DISK + 1,
(RID*) rowIdArray, valArray);
CPPUNIT_ASSERT_EQUAL(NO_ERROR, rc); // I prefer this way as it prints values
// flush and close the file used for reading
fileOp.clearColumn(curCol);
std::map<uint32_t, uint32_t> oids;
oids[fid] = fid;
// flush changed chunks from the Manager
int rtn1 = fileOp.chunkManager()->flushChunks(rc, oids);
// read back the file
cout << endl << "Read file ";
DataBlock block;
binary16* bin16 = (binary16*) block.data;
fileOp.initColumn(curCol);
fileOp.setColParam(curCol,
1, // column number
width,
execplan::CalpontSystemCatalog::BINARY,
WriteEngine::WR_BINARY,
fid,
1); //compression type
curCol.dataFile.pFile = fileOp.openFile(
curCol,
dbRoot,
0,
0,
segFile,
false,
"r+b",
BYTE_PER_BLOCK * BYTE_PER_BLOCK); // buffer size is 64MB
CPPUNIT_ASSERT(curCol.dataFile.pFile != NULL);
int blocks = fileOp.blocksInFile(curCol.dataFile.pFile);
for (int b = 0; b < blocks; b++) {
rc = fileOp.chunkManager()->readBlock(curCol.dataFile.pFile, block.data, b); // ColumnOpCompress1.readBlock() is protected so ...
CPPUNIT_ASSERT_EQUAL(NO_ERROR, rc);
//cout << endl << bin16[0].uint64(0);
CPPUNIT_ASSERT_EQUAL(b * 512UL + 3, bin16[0].uint64(0)); // Checking just first value of each block as it was written before
CPPUNIT_ASSERT_EQUAL(b * 512UL + 5, bin16[0].uint64(1));
}
fileOp.clearColumn(curCol);
fileOp.closeFile(curCol.dataFile.pFile); // Seems done by clearColumn, but anyways...
cout << endl << "Delete file ";
fileOp.deleteFile(fileName);
CPPUNIT_ASSERT(fileOp.exists(fileName) == false);
cout << endl << "End of test";
}
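// For reference, with BYTE_PER_BLOCK assumed to be 8192 and
// INITIAL_EXTENT_ROWS_TO_DISK assumed to be 256 * 1024 (the usual write engine
// defaults), the numbers behind the assertions above work out as:
//   rows per block = BYTE_PER_BLOCK / sizeof(binary16) = 8192 / 16 = 512
//   nBlocks        = INITIAL_EXTENT_ROWS_TO_DISK / BYTE_PER_BLOCK * width
//                  = (262144 / 8192) * 16 = 512
// so block b starts at row b * 512, whose two lanes were written as
// b * 512 + 3 and b * 512 + 5, which is exactly what the read-back loop checks.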
void testCleanup()
{
// shutdown

View File

@@ -159,6 +159,10 @@ uint64_t BlockOp::getEmptyRowValue(
case CalpontSystemCatalog::UBIGINT :
emptyVal = joblist::UBIGINTEMPTYROW;
break;
case CalpontSystemCatalog::BINARY :
emptyVal = joblist::BINARYEMPTYROW;
break;
case CalpontSystemCatalog::CHAR :
case CalpontSystemCatalog::VARCHAR :
@@ -267,9 +271,11 @@ void BlockOp::setEmptyBuf(
// Optimize buffer initialization by constructing and copying in an array
// instead of individual values. This reduces the number of calls to
// memcpy().
int w = width > 8 ? 8 : width;
for (uint8_t* pos = emptyValArray, * end = pos + NBYTES_IN_ARRAY; pos < end; pos += w) //FIXME for no loop
{
    memcpy(pos, &emptyVal, w);
}
int countFull128 = (bufSize / width) / ARRAY_COUNT;
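
The rewritten loop steps through the staging array in chunks of w = min(width, 8) bytes, because emptyVal is still a single 64-bit value even when the column itself is wider; a binary16 column simply gets tiled with the 8-byte pattern twice per row. A minimal standalone sketch of that fill pattern, using stand-in values for NBYTES_IN_ARRAY and the empty-value magic (the real constants live in the write engine headers):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    const int NBYTES_IN_ARRAY = 128;                 // stand-in for the real constant
    const int width = 16;                            // e.g. a binary16 column
    std::uint64_t emptyVal = 0xFEFEFEFEFEFEFEFEULL;  // stand-in 8-byte empty-value pattern

    std::uint8_t emptyValArray[NBYTES_IN_ARRAY];
    int w = width > 8 ? 8 : width;                   // emptyVal is only 8 bytes wide

    // Same stepping as the new loop in BlockOp::setEmptyBuf: tile the staging
    // array with w-byte copies of the empty value.
    for (std::uint8_t* pos = emptyValArray, * end = pos + NBYTES_IN_ARRAY; pos < end; pos += w)
        std::memcpy(pos, &emptyVal, w);

    std::printf("%02x ... %02x\n", (unsigned) emptyValArray[0],
                (unsigned) emptyValArray[NBYTES_IN_ARRAY - 1]);
    return 0;
}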

View File

@@ -434,6 +434,11 @@ void Convertor::convertColType(CalpontSystemCatalog::ColDataType dataType,
case CalpontSystemCatalog::UBIGINT:
internalType = WriteEngine::WR_ULONGLONG;
break;
// Map BINARY to WR_BINARY
case CalpontSystemCatalog::BINARY:
internalType = WriteEngine::WR_BINARY;
break;
default:
internalType = WriteEngine::WR_CHAR;
@@ -682,6 +687,11 @@ void Convertor::convertColType(ColStruct* curStruct)
case CalpontSystemCatalog::UBIGINT:
*internalType = WriteEngine::WR_ULONGLONG;
break;
// Map BINARY to WR_BINARY
case CalpontSystemCatalog::BINARY:
*internalType = WriteEngine::WR_BINARY;
break;
default:
*internalType = WriteEngine::WR_CHAR;
@@ -772,7 +782,11 @@ int Convertor::getCorrectRowWidth(CalpontSystemCatalog::ColDataType dataType, in
case CalpontSystemCatalog::TIMESTAMP:
newWidth = 8;
break;
case CalpontSystemCatalog::BINARY:
newWidth = width;
break;
case CalpontSystemCatalog::CHAR:
case CalpontSystemCatalog::VARCHAR:
case CalpontSystemCatalog::VARBINARY: // treat same as varchar for now

View File

@@ -108,9 +108,10 @@ enum ColType /** @brief Column type enumeration*/
WR_USHORT = 14, /** @brief Unsigned Short */
WR_UINT = 15, /** @brief Unsigned Int */
WR_ULONGLONG = 16, /** @brief Unsigned Long long*/
WR_TEXT = 17, /** @brief TEXT */
WR_MEDINT = 18, /** @brief Medium Int */
WR_UMEDINT = 19, /** @brief Unsigned Medium Int */
WR_BINARY = 20 /** @brief BINARY */
};
// Describes relation of field to column for a bulk load