From e9161637b28f3b9fb26398d5e85d74e8959ca2c8 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Tue, 25 Jul 2017 18:13:27 -0700 Subject: [PATCH] Allow parameters to be modified from a separate file --- contrib/long_distance_matching/Makefile | 8 +- contrib/long_distance_matching/ldm.c | 539 -------------------- contrib/long_distance_matching/ldm.h | 30 +- contrib/long_distance_matching/ldm_common.c | 5 +- contrib/long_distance_matching/ldm_hash32.c | 12 +- contrib/long_distance_matching/ldm_hash64.c | 164 +----- contrib/long_distance_matching/ldm_params.h | 10 + contrib/long_distance_matching/main.c | 8 +- 8 files changed, 47 insertions(+), 729 deletions(-) delete mode 100644 contrib/long_distance_matching/ldm.c create mode 100644 contrib/long_distance_matching/ldm_params.h diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 292ce8517..b1fd3a1ee 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,16 +25,16 @@ LDFLAGS += -lzstd default: all -all: main-hash32 main-hash64 +all: main-64 main-integrated -main-hash64: ldm_common.c ldm_hash64.c main.c +main-64: ldm_common.c ldm_hash64.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-hash32: ldm_common.c ldm_hash32.c main.c +main-integrated: ldm_common.c ldm_hash32.c main.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-hash64 main-hash32 + main-hash64 main-hash32 main-64 main-integrated @echo Cleaning completed diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c deleted file mode 100644 index fae35f9e4..000000000 --- a/contrib/long_distance_matching/ldm.c +++ /dev/null @@ -1,539 +0,0 @@ -#include -#include -#include -#include -#include - -#include "ldm.h" -#include "ldm_hashtable.h" - -#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) -#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) - -#define 
LDM_HASH_ENTRY_SIZE_LOG 3 - -//#define HASH_ONLY_EVERY_LOG 7 -#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG))) - -#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) - - -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 10 - -//#define RUN_CHECKS - -typedef U32 checksum_t; - -struct LDM_compressStats { - U32 windowSizeLog, hashTableSizeLog; - U32 numMatches; - U64 totalMatchLength; - U64 totalLiteralLength; - U64 totalOffset; - - U32 minOffset, maxOffset; - - U32 offsetHistogram[32]; -}; - -struct LDM_CCtx { - U64 isize; /* Input size */ - U64 maxOSize; /* Maximum output size */ - - const BYTE *ibase; /* Base of input */ - const BYTE *ip; /* Current input position */ - const BYTE *iend; /* End of input */ - - // Maximum input position such that hashing at the position does not exceed - // end of input. - const BYTE *ihashLimit; - - // Maximum input position such that finding a match of at least the minimum - // match length does not exceed end of input. - const BYTE *imatchLimit; - - const BYTE *obase; /* Base of output */ - BYTE *op; /* Output */ - - const BYTE *anchor; /* Anchor to start of current (match) block */ - - LDM_compressStats stats; /* Compression statistics */ - - LDM_hashTable *hashTable; - - const BYTE *lastPosHashed; /* Last position hashed */ - hash_t lastHash; /* Hash corresponding to lastPosHashed */ - checksum_t lastSum; - - const BYTE *nextIp; // TODO: this is redundant (ip + step) - const BYTE *nextPosHashed; - hash_t nextHash; /* Hash corresponding to nextPosHashed */ - checksum_t nextSum; - - unsigned step; // ip step, should be 1. - - const BYTE *lagIp; - hash_t lagHash; - checksum_t lagSum; - - // DEBUG - const BYTE *DEBUG_setNextHash; -}; - -// TODO: This can be done more efficiently (but it is not that important as it -// is only used for computing stats). 
-static int intLog2(U32 x) { - int ret = 0; - while (x >>= 1) { - ret++; - } - return ret; -} - -void LDM_printCompressStats(const LDM_compressStats *stats) { - int i = 0; - printf("=====================\n"); - printf("Compression statistics\n"); - printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", - stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", - stats->numMatches, - stats->totalMatchLength, - 100.0 * (double)stats->totalMatchLength / - (double)(stats->totalMatchLength + stats->totalLiteralLength)); - printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / - (double)stats->numMatches); - printf("avg literal length, total literalLength: %.1f, %llu\n", - ((double)stats->totalLiteralLength) / (double)stats->numMatches, - stats->totalLiteralLength); - printf("avg offset length: %.1f\n", - ((double)stats->totalOffset) / (double)stats->numMatches); - printf("min offset, max offset: %u, %u\n", - stats->minOffset, stats->maxOffset); - - printf("\n"); - printf("offset histogram: offset, num matches, %% of matches\n"); - - for (; i <= intLog2(stats->maxOffset); i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - stats->offsetHistogram[i], - 100.0 * (double) stats->offsetHistogram[i] / - (double) stats->numMatches); - } - printf("\n"); - printf("=====================\n"); -} - -/** - * Convert a sum computed from getChecksum to a hash value in the range - * of the hash table. - */ -static hash_t checksumToHash(U32 sum) { - return HASH_hashU32(sum); -} - -/** - * Computes a 32-bit checksum based on rsync's checksum. 
- * - * a(k,l) = \sum_{i = k}^l x_i (mod M) - * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) - * checksum(k,l) = a(k,l) + 2^{16} * b(k,l) - */ -static checksum_t getChecksum(const BYTE *buf, U32 len) { - U32 i; - checksum_t s1, s2; - - s1 = s2 = 0; - for (i = 0; i < (len - 4); i += 4) { - s2 += (4 * (s1 + buf[i])) + (3 * buf[i + 1]) + - (2 * buf[i + 2]) + (buf[i + 3]) + - (10 * CHECKSUM_CHAR_OFFSET); - s1 += buf[i] + buf[i + 1] + buf[i + 2] + buf[i + 3] + - + (4 * CHECKSUM_CHAR_OFFSET); - - } - for(; i < len; i++) { - s1 += buf[i] + CHECKSUM_CHAR_OFFSET; - s2 += s1; - } - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update a checksum computed from getChecksum(data, len). - * - * The checksum can be updated along its ends as follows: - * a(k+1, l+1) = (a(k,l) - x_k + x_{l+1}) (mod M) - * b(k+1, l+1) = (b(k,l) - (l-k+1)*x_k + (a(k+1,l+1)) (mod M) - * - * Thus toRemove should correspond to data[0]. - */ -static checksum_t updateChecksum(checksum_t sum, U32 len, - BYTE toRemove, BYTE toAdd) { - U32 s1 = (sum & 0xffff) - toRemove + toAdd; - U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; - - return (s1 & 0xffff) + (s2 << 16); -} - -/** - * Update cctx->nextSum, cctx->nextHash, and cctx->nextPosHashed - * based on cctx->lastSum and cctx->lastPosHashed. - * - * This uses a rolling hash and requires that the last position hashed - * corresponds to cctx->nextIp - step. 
- */ -static void setNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - U32 check; - if ((cctx->nextIp - cctx->ibase != 1) && - (cctx->nextIp - cctx->DEBUG_setNextHash != 1)) { - printf("CHECK debug fail: %zu %zu\n", cctx->nextIp - cctx->ibase, - cctx->DEBUG_setNextHash - cctx->ibase); - } - - cctx->DEBUG_setNextHash = cctx->nextIp; -#endif - - cctx->nextSum = updateChecksum( - cctx->lastSum, LDM_HASH_LENGTH, - cctx->lastPosHashed[0], - cctx->lastPosHashed[LDM_HASH_LENGTH]); - cctx->nextPosHashed = cctx->nextIp; - cctx->nextHash = checksumToHash(cctx->nextSum); - -#if LDM_LAG - if (cctx->ip - cctx->ibase > LDM_LAG) { - cctx->lagSum = updateChecksum( - cctx->lagSum, LDM_HASH_LENGTH, - cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); - cctx->lagIp++; - cctx->lagHash = checksumToHash(cctx->lagSum); - } -#endif - -#ifdef RUN_CHECKS - check = getChecksum(cctx->nextIp, LDM_HASH_LENGTH); - - if (check != cctx->nextSum) { - printf("CHECK: setNextHash failed %u %u\n", check, cctx->nextSum); - } - - if ((cctx->nextIp - cctx->lastPosHashed) != 1) { - printf("setNextHash: nextIp != lastPosHashed + 1. %zu %zu %zu\n", - cctx->nextIp - cctx->ibase, cctx->lastPosHashed - cctx->ibase, - cctx->ip - cctx->ibase); - } -#endif -} - -static void putHashOfCurrentPositionFromHash( - LDM_CCtx *cctx, hash_t hash, U32 checksum) { - // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. - // Note: this works only when cctx->step is 1. 
- if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#if LDM_LAG - // Off by 1, but whatever - if (cctx->lagIp - cctx->ibase > 0) { - const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; - HASH_insert(cctx->hashTable, cctx->lagHash, entry); - } else { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; - HASH_insert(cctx->hashTable, hash, entry); - } -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; - HASH_insert(cctx->hashTable, hash, entry); -#endif - } - - cctx->lastPosHashed = cctx->ip; - cctx->lastHash = hash; - cctx->lastSum = checksum; -} - -/** - * Copy over the cctx->lastHash, cctx->lastSum, and cctx->lastPosHashed - * fields from the "next" fields. - * - * This requires that cctx->ip == cctx->nextPosHashed. - */ -static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) { -#ifdef RUN_CHECKS - if (cctx->ip != cctx->nextPosHashed) { - printf("CHECK failed: updateLastHashFromNextHash %zu\n", - cctx->ip - cctx->ibase); - } -#endif - putHashOfCurrentPositionFromHash(cctx, cctx->nextHash, cctx->nextSum); -} - -/** - * Insert hash of the current position into the hash table. 
- */ -static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { - checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); - hash_t hash = checksumToHash(sum); - -#ifdef RUN_CHECKS - if (cctx->nextPosHashed != cctx->ip && (cctx->ip != cctx->ibase)) { - printf("CHECK failed: putHashOfCurrentPosition %zu\n", - cctx->ip - cctx->ibase); - } -#endif - - putHashOfCurrentPositionFromHash(cctx, hash, sum); -} - -void LDM_initializeCCtx(LDM_CCtx *cctx, - const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - cctx->isize = srcSize; - cctx->maxOSize = maxDstSize; - - cctx->ibase = (const BYTE *)src; - cctx->ip = cctx->ibase; - cctx->iend = cctx->ibase + srcSize; - - cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH; - cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH; - - cctx->obase = (BYTE *)dst; - cctx->op = (BYTE *)dst; - - cctx->anchor = cctx->ibase; - - memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase, - LDM_MIN_MATCH_LENGTH, LDM_WINDOW_SIZE); - - cctx->stats.minOffset = UINT_MAX; - cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; - cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; - - - cctx->lastPosHashed = NULL; - - cctx->step = 1; // Fixed to be 1 for now. Changing may break things. - cctx->nextIp = cctx->ip + cctx->step; - cctx->nextPosHashed = 0; - - cctx->DEBUG_setNextHash = 0; -} - -void LDM_destroyCCtx(LDM_CCtx *cctx) { - HASH_destroyTable(cctx->hashTable); -} - -/** - * Finds the "best" match. - * - * Returns 0 if successful and 1 otherwise (i.e. no match can be found - * in the remaining input that is long enough). - * - * matchLength contains the forward length of the match. 
- */ -static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, - U64 *matchLength, U64 *backwardMatchLength) { - - LDM_hashEntry *entry = NULL; - cctx->nextIp = cctx->ip + cctx->step; - - while (entry == NULL) { - hash_t h; - checksum_t sum; - setNextHash(cctx); - h = cctx->nextHash; - sum = cctx->nextSum; - cctx->ip = cctx->nextIp; - cctx->nextIp += cctx->step; - - if (cctx->ip > cctx->imatchLimit) { - return 1; - } - - entry = HASH_getBestEntry(cctx->hashTable, h, sum, - cctx->ip, cctx->iend, - cctx->anchor, - matchLength, backwardMatchLength); - - if (entry != NULL) { - *match = entry->offset + cctx->ibase; - } - putHashOfCurrentPositionFromHash(cctx, h, sum); - } - setNextHash(cctx); - return 0; -} - -void LDM_encodeLiteralLengthAndLiterals( - LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) { - /* Encode the literal length. */ - if (literalLength >= RUN_MASK) { - int len = (int)literalLength - RUN_MASK; - *pToken = (RUN_MASK << ML_BITS); - for (; len >= 255; len -= 255) { - *(cctx->op)++ = 255; - } - *(cctx->op)++ = (BYTE)len; - } else { - *pToken = (BYTE)(literalLength << ML_BITS); - } - - /* Encode the literals. */ - memcpy(cctx->op, cctx->anchor, literalLength); - cctx->op += literalLength; -} - -void LDM_outputBlock(LDM_CCtx *cctx, - const U64 literalLength, - const U32 offset, - const U64 matchLength) { - BYTE *pToken = cctx->op++; - - /* Encode the literal length and literals. */ - LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength); - - /* Encode the offset. */ - MEM_write32(cctx->op, offset); - cctx->op += LDM_OFFSET_SIZE; - - /* Encode the match length. 
*/ - if (matchLength >= ML_MASK) { - U64 matchLengthRemaining = matchLength; - *pToken += ML_MASK; - matchLengthRemaining -= ML_MASK; - MEM_write32(cctx->op, 0xFFFFFFFF); - while (matchLengthRemaining >= 4*0xFF) { - cctx->op += 4; - MEM_write32(cctx->op, 0xffffffff); - matchLengthRemaining -= 4*0xFF; - } - cctx->op += matchLengthRemaining / 255; - *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255); - } else { - *pToken += (BYTE)(matchLength); - } -} - -// TODO: maxDstSize is unused. This function may seg fault when writing -// beyond the size of dst, as it does not check maxDstSize. Writing to -// a buffer and performing checks is a possible solution. -// -// This is based upon lz4. -size_t LDM_compress(const void *src, size_t srcSize, - void *dst, size_t maxDstSize) { - LDM_CCtx cctx; - const BYTE *match = NULL; - U64 forwardMatchLength = 0; - U64 backwardsMatchLength = 0; - - LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); - LDM_outputConfiguration(); - - /* Hash the first position and put it into the hash table. */ - LDM_putHashOfCurrentPosition(&cctx); - -#if LDM_LAG - cctx.lagIp = cctx.ip; - cctx.lagHash = cctx.lastHash; - cctx.lagSum = cctx.lastSum; -#endif - /** - * Find a match. - * If no more matches can be found (i.e. the length of the remaining input - * is less than the minimum match length), then stop searching for matches - * and encode the final literals. - */ - while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, - &backwardsMatchLength) == 0) { -#ifdef COMPUTE_STATS - cctx.stats.numMatches++; -#endif - - cctx.ip -= backwardsMatchLength; - match -= backwardsMatchLength; - - /** - * Write current block (literals, literal length, match offset, match - * length) and update pointers and hashes. 
- */ - { - const U32 literalLength = cctx.ip - cctx.anchor; - const U32 offset = cctx.ip - match; - const U32 matchLength = forwardMatchLength + - backwardsMatchLength - - LDM_MIN_MATCH_LENGTH; - - LDM_outputBlock(&cctx, literalLength, offset, matchLength); - -#ifdef COMPUTE_STATS - cctx.stats.totalLiteralLength += literalLength; - cctx.stats.totalOffset += offset; - cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH; - cctx.stats.minOffset = - offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset; - cctx.stats.maxOffset = - offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; - cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; -#endif - - // Move ip to end of block, inserting hashes at each position. - cctx.nextIp = cctx.ip + cctx.step; - while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH + - matchLength + literalLength) { - if (cctx.ip > cctx.lastPosHashed) { - // TODO: Simplify. - LDM_updateLastHashFromNextHash(&cctx); - setNextHash(&cctx); - } - cctx.ip++; - cctx.nextIp++; - } - } - - // Set start of next block to current input pointer. - cctx.anchor = cctx.ip; - LDM_updateLastHashFromNextHash(&cctx); - } - - /* Encode the last literals (no more matches). 
*/ - { - const U32 lastRun = cctx.iend - cctx.anchor; - BYTE *pToken = cctx.op++; - LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun); - } - -#ifdef COMPUTE_STATS - LDM_printCompressStats(&cctx.stats); - HASH_outputTableOccupancy(cctx.hashTable); -#endif - - { - const size_t ret = cctx.op - cctx.obase; - LDM_destroyCCtx(&cctx); - return ret; - } -} - -void LDM_outputConfiguration(void) { - printf("=====================\n"); - printf("Configuration\n"); - printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG); - printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n", - LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); - printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); - printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG); - printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); - printf("LDM_LAG %d\n", LDM_LAG); - printf("=====================\n"); -} - - - -void LDM_test(const BYTE *src) { - (void)src; -} - diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index f9ad383e1..b87a57bc8 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -2,6 +2,7 @@ #define LDM_H #include "mem.h" // from /lib/common/mem.h +#include "ldm_params.h" // The number of bytes storing the compressed and decompressed size // in the header. @@ -18,35 +19,38 @@ #define LDM_OFFSET_SIZE 4 // ============================================================================= -// User parameters. +// Modify parameters in ldm_params.h if "ldm_params.h" is included. // ============================================================================= +#ifndef LDM_PARAMS_H // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 25 + #define LDM_MEMORY_USAGE 25 // The number of entries in a hash bucket. -#define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. 
+ #define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now. // Defines the lag in inserting elements into the hash table. -#define LDM_LAG 0 + #define LDM_LAG 0 // The maximum window size. -#define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 -#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) + #define LDM_WINDOW_SIZE_LOG 28 // Max value is 30 //These should be multiples of four (and perhaps set to the same value?). -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 + #define LDM_MIN_MATCH_LENGTH 64 -// Experimental. -//#define TMP_EVICTION // Experiment with eviction policies. -#define INSERT_BY_TAG // Insertion policy based on hash. + #define INSERT_BY_TAG 1 // Insertion policy based on hash. -#define USE_CHECKSUM 1 + #define USE_CHECKSUM 1 +#endif // ============================================================================= +#define COMPUTE_STATS +#define OUTPUT_CONFIGURATION + +#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) +#define LDM_HASH_LENGTH LDM_MIN_MATCH_LENGTH typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -164,6 +168,4 @@ void LDM_writeHeader(void *memPtr, U64 compressedSize, */ void LDM_outputConfiguration(void); -void LDM_test(const BYTE *src); - #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/ldm_common.c b/contrib/long_distance_matching/ldm_common.c index 1953656e3..26b716a1b 100644 --- a/contrib/long_distance_matching/ldm_common.c +++ b/contrib/long_distance_matching/ldm_common.c @@ -70,7 +70,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize, dctx.ip += length; dctx.op = cpy; - //TODO : dynamic offset size + //TODO: dynamic offset size? + /* Decode the offset. */ offset = MEM_read32(dctx.ip); dctx.ip += LDM_OFFSET_SIZE; match = dctx.op - offset; @@ -89,7 +90,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize, /* Copy match. */ cpy = dctx.op + length; - // Inefficient for now. + // TODO: this can be made more efficient.
while (match < cpy - offset && dctx.op < dctx.oend) { *(dctx.op)++ = *match++; } diff --git a/contrib/long_distance_matching/ldm_hash32.c b/contrib/long_distance_matching/ldm_hash32.c index b80a5c017..94fa5e928 100644 --- a/contrib/long_distance_matching/ldm_hash32.c +++ b/contrib/long_distance_matching/ldm_hash32.c @@ -21,9 +21,7 @@ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION -#define CHECKSUM_CHAR_OFFSET 1 +#define CHECKSUM_CHAR_OFFSET 10 // Take first match only. //#define ZSTD_SKIP @@ -779,8 +777,6 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // HASH_outputTableOffsetHistogram(&cctx); - /* Encode the last literals (no more matches). */ { const U32 lastRun = cctx.iend - cctx.anchor; @@ -815,9 +811,3 @@ void LDM_outputConfiguration(void) { -// TODO: implement and test hash function -void LDM_test(const BYTE *src) { - (void)src; -} - - diff --git a/contrib/long_distance_matching/ldm_hash64.c b/contrib/long_distance_matching/ldm_hash64.c index e51ac57d3..884f7b724 100644 --- a/contrib/long_distance_matching/ldm_hash64.c +++ b/contrib/long_distance_matching/ldm_hash64.c @@ -24,8 +24,6 @@ #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) -#define COMPUTE_STATS -#define OUTPUT_CONFIGURATION #define HASH_CHAR_OFFSET 10 // Take first match only. 
@@ -63,11 +61,6 @@ struct LDM_compressStats { U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; U64 TMP_totalHashCount; - - U64 TMP_totalInWindow; - U64 TMP_totalInserts; - - U64 TMP_matchCount; }; typedef struct LDM_hashTable LDM_hashTable; @@ -328,91 +321,12 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, return NULL; } -#ifdef TMP_EVICTION -void HASH_insert(LDM_hashTable *table, - const hash_t hash, const LDM_hashEntry entry, - LDM_CCtx *cctx) { - // Overwrite based on part of checksum. - /* - LDM_hashEntry *toOverwrite = - getBucket(table, hash) + table->bucketOffsets[hash]; - const BYTE *pMatch = toOverwrite->offset + cctx->ibase; - if (toOverwrite->offset != 0 && - cctx->ip - pMatch <= LDM_WINDOW_SIZE) { - cctx->stats.TMP_totalInWindow++; - } - - cctx->stats.TMP_totalInserts++; - *(toOverwrite) = entry; - */ - - /* - int i; - LDM_hashEntry *bucket = getBucket(table, hash); - for (i = 0; i < HASH_BUCKET_SIZE; i++) { - if (bucket[i].checksum == entry.checksum) { - bucket[i] = entry; - cctx->stats.TMP_matchCount++; - return; - } - } - */ - - // Find entry beyond window size, replace. Else, random. - int i; - LDM_hashEntry *bucket = getBucket(table, hash); - for (i = 0; i < HASH_BUCKET_SIZE; i++) { - if (cctx->ip - cctx->ibase - bucket[i].offset > LDM_WINDOW_SIZE) { - bucket[i] = entry; - return; - } - } - - i = rand() & (HASH_BUCKET_SIZE - 1); - *(bucket + i) = entry; - - - /** - * Sliding buffer style pointer - * Keep old entry as temporary. If the old entry is outside the window, - * overwrite and we are done. - * - * Backwards (insert at x): - * x, a, b b, c c c c, d d d d d d d d - * x, d d d d d d d d, c c c c, b b, a - * - * Else, find something to evict. - * If old entry has more ones, it takes - * the next spot. <-- reversed order? - * - * If window size > LDM_WINDOW_SIZE, - * overwrite, - * - * Insert forwards. If > tag, keep. Else evict. 
- * - */ - - - /* - *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; - table->bucketOffsets[hash]++; - table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; - */ - -// U16 mask = entry.checksum & (HASH_BUCKET_SIZE - 1); -// *(getBucket(table, hash) + mask) = entry; -} - -#else - void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; } -#endif // TMP_EVICTION - U32 HASH_getSize(const LDM_hashTable *table) { return table->numBuckets; @@ -489,7 +403,7 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG /* printf("Lower bit distribution\n"); for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) { @@ -500,13 +414,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { */ #endif -#ifdef TMP_EVICTION - printf("Evicted something in window: %llu %6.3f\n", - stats->TMP_totalInWindow, - 100.0 * (double)stats->TMP_totalInWindow / - (double)stats->TMP_totalInserts); - printf("Match count: %llu\n", stats->TMP_matchCount); -#endif printf("=====================\n"); } @@ -524,7 +431,7 @@ static U32 getChecksum(U64 hash) { return (hash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF; } -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG static U32 lowerBitsFromHfHash(U64 hash) { // The number of bits used so far is LDM_HASHLOG + 32. // So there are 32 - LDM_HASHLOG bits left. @@ -611,7 +518,7 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->lastPosHashed[LDM_HASH_LENGTH]); cctx->nextPosHashed = cctx->nextIp; -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG { U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextHash); cctx->stats.TMP_totalHashCount++; @@ -648,7 +555,7 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { // Note: this works only when cctx->step is 1. 
#if LDM_LAG if (cctx -> lagIp - cctx->ibase > 0) { -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG U32 hashEveryMask = lowerBitsFromHfHash(cctx->lagHash); if (hashEveryMask == HASH_ONLY_EVERY) { #else @@ -663,14 +570,11 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase }; # endif -# ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -# else HASH_insert(cctx->hashTable, smallHash, entry); -# endif } } else { -#ifdef INSERT_BY_TAG +#endif // LDM_LAG +#if INSERT_BY_TAG U32 hashEveryMask = lowerBitsFromHfHash(hash); if (hashEveryMask == HASH_ONLY_EVERY) { #else @@ -684,33 +588,9 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) { #else const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; #endif - -#ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else HASH_insert(cctx->hashTable, smallHash, entry); -#endif } - } -#else -#ifdef INSERT_BY_TAG - U32 hashEveryMask = lowerBitsFromHfHash(hash); - if (hashEveryMask == HASH_ONLY_EVERY) { -#else - if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { -#endif - U32 smallHash = getSmallHash(hash); -#if USE_CHECKSUM - U32 checksum = getChecksum(hash); - const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; -#else - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; -#endif -#ifdef TMP_EVICTION - HASH_insert(cctx->hashTable, smallHash, entry, cctx); -#else - HASH_insert(cctx->hashTable, smallHash, entry); -#endif +#if LDM_LAG } #endif @@ -812,7 +692,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, U64 hash; hash_t smallHash; U32 checksum; -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG U32 hashEveryMask; #endif setNextHash(cctx); @@ -820,7 +700,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, hash = cctx->nextHash; smallHash = getSmallHash(hash); checksum = getChecksum(hash); -#ifdef INSERT_BY_TAG +#if 
INSERT_BY_TAG hashEveryMask = lowerBitsFromHfHash(hash); #endif @@ -830,7 +710,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match, if (cctx->ip > cctx->imatchLimit) { return 1; } -#ifdef INSERT_BY_TAG +#if INSERT_BY_TAG if (hashEveryMask == HASH_ONLY_EVERY) { entry = HASH_getBestEntry(cctx, smallHash, checksum, @@ -923,10 +803,8 @@ size_t LDM_compress(const void *src, size_t srcSize, /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); -#if LDM_LAG cctx.lagIp = cctx.ip; cctx.lagHash = cctx.lastHash; -#endif /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -1018,28 +896,8 @@ void LDM_outputConfiguration(void) { printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG); printf("LDM_LAG: %d\n", LDM_LAG); printf("USE_CHECKSUM: %d\n", USE_CHECKSUM); -#ifdef INSERT_BY_TAG - printf("INSERT_BY_TAG: %d\n", 1); -#else - printf("INSERT_BY_TAG: %d\n", 0); -#endif + printf("INSERT_BY_TAG: %d\n", INSERT_BY_TAG); printf("HASH_CHAR_OFFSET: %d\n", HASH_CHAR_OFFSET); printf("=====================\n"); } -// TODO: implement and test hash function -void LDM_test(const BYTE *src) { - const U32 diff = 100; - const BYTE *pCur = src + diff; - U64 hash = getHash(pCur, LDM_HASH_LENGTH); - - for (; pCur < src + diff + 60; ++pCur) { - U64 nextHash = getHash(pCur + 1, LDM_HASH_LENGTH); - U64 updatedHash = updateHash(hash, LDM_HASH_LENGTH, - pCur[0], pCur[LDM_HASH_LENGTH]); - hash = nextHash; - printf("%llu %llu\n", nextHash, updatedHash); - } -} - - diff --git a/contrib/long_distance_matching/ldm_params.h b/contrib/long_distance_matching/ldm_params.h new file mode 100644 index 000000000..0fcd30bd1 --- /dev/null +++ b/contrib/long_distance_matching/ldm_params.h @@ -0,0 +1,10 @@ +#ifndef LDM_PARAMS_H +#define LDM_PARAMS_H +#define LDM_MEMORY_USAGE 23 +#define HASH_BUCKET_SIZE_LOG 3 +#define LDM_LAG 0 +#define LDM_WINDOW_SIZE_LOG 28 +#define LDM_MIN_MATCH_LENGTH 64 +#define 
INSERT_BY_TAG 1 +#define USE_CHECKSUM 1 +#endif diff --git a/contrib/long_distance_matching/main.c b/contrib/long_distance_matching/main.c index cee5edbae..bdd385cea 100644 --- a/contrib/long_distance_matching/main.c +++ b/contrib/long_distance_matching/main.c @@ -12,7 +12,7 @@ #include "ldm.h" #include "zstd.h" -//#define DECOMPRESS_AND_VERIFY +#define DECOMPRESS_AND_VERIFY /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. @@ -71,10 +71,6 @@ static int compress(const char *fname, const char *oname) { return 1; } -#ifdef TEST - LDM_test((const BYTE *)src); -#endif - gettimeofday(&tv1, NULL); compressedSize = LDM_HEADER_SIZE + @@ -111,7 +107,7 @@ static int compress(const char *fname, const char *oname) { return 0; } -#ifdef DECOMPRESS +#ifdef DECOMPRESS_AND_VERIFY /* Decompress file compressed using LDM_compress. * The input file should have the LDM_HEADER followed by payload. * Returns 0 if succesful, and an error code otherwise.