diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md index a18311973..559776e2b 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -18,109 +18,109 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" - Fourth column is chosen d and fifth column is chosen k github: -NODICT 0.000005 2.999642 -RANDOM 0.036114 8.791189 -LEGACY 1.111024 8.173529 -COVER 57.856477 10.652243 8 1298 -COVER 5.769965 10.652243 8 1298 -FAST15 9.965877 10.555630 8 1874 -FAST15 0.140285 10.555630 8 1874 -FAST16 10.337194 10.701698 8 1106 -FAST16 0.114887 10.701698 8 1106 -FAST17 10.207121 10.650652 8 1106 -FAST17 0.135424 10.650652 8 1106 -FAST18 11.463120 10.499142 8 1826 -FAST18 0.154287 10.499142 8 1826 -FAST19 12.143020 10.527140 8 1826 -FAST19 0.158889 10.527140 8 1826 -FAST20 12.510857 10.494710 8 1826 -FAST20 0.171334 10.494710 8 1826 -FAST21 13.201432 10.503488 8 1778 -FAST21 0.192867 10.503488 8 1778 -FAST22 13.754560 10.509284 8 1826 -FAST22 0.206276 10.509284 8 1826 -FAST23 14.708633 10.509284 8 1826 -FAST23 0.221751 10.509284 8 1826 -FAST24 15.134848 10.512369 8 1826 -FAST24 0.234242 10.512369 8 1826 +NODICT 0.000025 2.999642 +RANDOM 0.030101 8.791189 +LEGACY 0.913108 8.173529 +COVER 59.234160 10.652243 8 1298 +COVER 6.258459 10.652243 8 1298 +FAST15 9.959246 10.555630 8 1874 +FAST15 0.077719 10.555630 8 1874 +FAST16 10.028343 10.701698 8 1106 +FAST16 0.078117 10.701698 8 1106 +FAST17 10.567355 10.650652 8 1106 +FAST17 0.124833 10.650652 8 1106 +FAST18 11.795287 10.499142 8 1826 +FAST18 0.086992 10.499142 8 1826 +FAST19 13.132451 10.527140 8 1826 +FAST19 0.134716 10.527140 8 1826 +FAST20 14.366314 10.494710 8 1826 +FAST20 0.128844 10.494710 8 1826 +FAST21 14.941238 10.503488 8 1778 +FAST21 0.134975 10.503488 8 1778 +FAST22 15.146226 10.509284 8 1826 +FAST22 0.146918 10.509284 8 1826 +FAST23 16.260552 10.509284 8 1826 +FAST23 0.158494 10.509284 8 1826 +FAST24 16.806037 10.512369 8 1826 +FAST24 0.190464 10.512369 8 1826 hg-commands: -NODICT 0.000004 2.425291 -RANDOM 0.055073 3.490331 -LEGACY 0.927414 3.911682 -COVER 72.749028 4.132653 8 386 -COVER 3.391066 4.132653 8 386 -FAST15 10.910989 3.920720 6 1106 -FAST15 0.130480 3.920720 6 1106 -FAST16 10.565224 4.033306 8 674 -FAST16 0.146228 4.033306 8 674 -FAST17 11.394137 4.064132 8 1490 -FAST17 0.175567 4.064132 8 1490 -FAST18 11.040248 4.086714 8 290 -FAST18 0.132692 4.086714 8 290 -FAST19 11.335856 4.097947 8 578 -FAST19 0.181441 4.097947 8 578 -FAST20 14.166272 4.102851 8 434 -FAST20 0.203632 4.102851 8 434 -FAST21 15.848896 4.105350 8 530 -FAST21 0.269518 4.105350 8 530 -FAST22 15.570995 4.104100 8 530 -FAST22 0.238512 4.104100 8 530 -FAST23 17.437566 4.098110 8 914 -FAST23 0.270788 4.098110 8 914 -FAST24 18.836604 4.117367 8 722 -FAST24 0.323618 4.117367 8 722 +NODICT 0.000026 2.425291 +RANDOM 0.046270 3.490331 +LEGACY 0.847904 3.911682 +COVER 71.691804 4.132653 8 386 +COVER 3.187085 4.132653 8 386 +FAST15 11.593687 3.920720 6 1106 +FAST15 0.082431 3.920720 6 1106 +FAST16 11.775958 4.033306 8 674 +FAST16 0.092587 4.033306 8 674 +FAST17 11.965064 4.064132 8 1490 +FAST17 0.106382 4.064132 8 1490 +FAST18 11.438197 4.086714 8 290 +FAST18 0.097293 4.086714 8 290 +FAST19 12.292512 4.097947 8 578 +FAST19 0.104406 4.097947 8 578 +FAST20 13.857857 4.102851 8 434 +FAST20 0.139467 4.102851 8 434 +FAST21 14.599613 4.105350 8 530 +FAST21 0.189416 4.105350 8 530 +FAST22 15.966109 4.104100 8 530 +FAST22 0.183817 4.104100 8 530 +FAST23 18.033645 4.098110 8 914 +FAST23 0.246641 4.098110 8 914 +FAST24 22.992891 4.117367 8 722 +FAST24 0.285994 4.117367 8 722 hg-changelog: -NODICT 0.000006 1.377613 -RANDOM 0.253393 2.097487 -LEGACY 2.410568 2.058907 -COVER 203.550681 2.189685 8 98 -COVER 7.381697 2.189685 8 98 -FAST15 45.960609 2.130794 6 386 -FAST15 0.512057 2.130794 6 386 -FAST16 44.594817 2.144845 8 194 -FAST16 0.601258 2.144845 8 194 -FAST17 45.852992 2.156099 8 242 -FAST17 0.500844 2.156099 8 242 -FAST18 46.624930 2.172439 6 98 -FAST18 0.680501 2.172439 6 98 -FAST19 47.754905 2.180321 6 98 -FAST19 0.606180 2.180321 6 98 -FAST20 56.733632 2.187431 6 98 -FAST20 0.710149 2.187431 6 98 -FAST21 59.723173 2.184185 6 146 -FAST21 0.875562 2.184185 6 146 -FAST22 66.570788 2.182830 6 98 -FAST22 1.061013 2.182830 6 98 -FAST23 73.817645 2.186399 8 98 -FAST23 0.838496 2.186399 8 98 -FAST24 78.059933 2.185608 6 98 -FAST24 0.843158 2.185608 6 98 +NODICT 0.000007 1.377613 +RANDOM 0.297345 2.097487 +LEGACY 2.633992 2.058907 +COVER 219.179786 2.189685 8 98 +COVER 6.620852 2.189685 8 98 +FAST15 47.635082 2.130794 6 386 +FAST15 0.321297 2.130794 6 386 +FAST16 43.837676 2.144845 8 194 +FAST16 0.312640 2.144845 8 194 +FAST17 49.349017 2.156099 8 242 +FAST17 0.348459 2.156099 8 242 +FAST18 51.153784 2.172439 6 98 +FAST18 0.353106 2.172439 6 98 +FAST19 52.627045 2.180321 6 98 +FAST19 0.390612 2.180321 6 98 +FAST20 63.748782 2.187431 6 98 +FAST20 0.489544 2.187431 6 98 +FAST21 68.709198 2.184185 6 146 +FAST21 0.530852 2.184185 6 146 +FAST22 68.491639 2.182830 6 98 +FAST22 0.645699 2.182830 6 98 +FAST23 72.558688 2.186399 8 98 +FAST23 0.593539 2.186399 8 98 +FAST24 76.137195 2.185608 6 98 +FAST24 0.680132 2.185608 6 98 hg-manifest: -NODICT 0.000005 1.866385 -RANDOM 0.735840 2.309436 -LEGACY 9.322081 2.506977 -COVER 885.961515 2.582528 8 434 -COVER 32.678552 2.582528 8 434 -FAST15 114.414413 2.392920 6 1826 -FAST15 1.412690 2.392920 6 1826 -FAST16 113.869718 2.480762 6 1922 -FAST16 1.539424 2.480762 6 1922 -FAST17 113.333636 2.548285 6 1682 -FAST17 1.473196 2.548285 6 1682 -FAST18 111.717871 2.567634 6 386 -FAST18 1.421200 2.567634 6 386 -FAST19 112.428344 2.581653 8 338 -FAST19 1.412185 2.581653 8 338 -FAST20 128.897480 2.586881 8 194 -FAST20 1.586570 2.586881 8 194 -FAST21 168.465684 2.590051 6 242 -FAST21 2.190732 2.590051 6 242 -FAST22 202.320435 2.591376 6 194 -FAST22 2.667877 2.591376 6 194 -FAST23 228.952201 2.591131 8 434 -FAST23 3.315501 2.591131 8 434 -FAST24 327.320020 2.591548 6 290 -FAST24 5.048348 2.591548 6 290 +NODICT 0.000026 1.866385 +RANDOM 0.784554 2.309436 +LEGACY 10.193714 2.506977 +COVER 988.206583 2.582528 8 434 +COVER 39.726199 2.582528 8 434 +FAST15 168.388819 2.392920 6 1826 +FAST15 1.272178 2.392920 6 1826 +FAST16 161.822607 2.480762 6 1922 +FAST16 1.164908 2.480762 6 1922 +FAST17 157.688544 2.548285 6 1682 +FAST17 1.222439 2.548285 6 1682 +FAST18 154.529585 2.567634 6 386 +FAST18 1.217596 2.567634 6 386 +FAST19 160.244979 2.581653 8 338 +FAST19 1.282450 2.581653 8 338 +FAST20 191.503297 2.586881 8 194 +FAST20 2.009748 2.586881 8 194 +FAST21 226.389709 2.590051 6 242 +FAST21 2.494543 2.590051 6 242 +FAST22 217.859055 2.591376 6 194 +FAST22 2.295693 2.591376 6 194 +FAST23 236.819791 2.591131 8 434 +FAST23 2.744711 2.591131 8 434 +FAST24 269.187800 2.591548 6 290 +FAST24 2.923671 2.591548 6 290 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c index 75008a087..d92e8d5cb 100644 --- a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -91,14 +91,26 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, info->samplesSizes, info->nbSamples, *randomParams); }else if(coverParams) { - dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, - info->samplesSizes, info->nbSamples, coverParams); + /* Run the optimize version if either k or d is not provided */ + if (!coverParams->d || !coverParams->k){ + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, coverParams); + } else { + dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *coverParams); + } } else if(legacyParams) { dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, info->samplesSizes, info->nbSamples, *legacyParams); } else if(fastParams) { - dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, - info->samplesSizes, info->nbSamples, fastParams); + /* Run the optimize version if either k or d is not provided */ + if (!fastParams->d || !fastParams->k) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, fastParams); + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *fastParams); + } } else { dictSize = 0; } @@ -403,7 +415,6 @@ int main(int argCount, const char* argv[]) goto _cleanup; } - /* for fastCover (with k and d provided) */ const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); @@ -411,7 +422,6 @@ int main(int argCount, const char* argv[]) result = 1; goto _cleanup; } - } diff --git a/contrib/experimental_dict_builders/fastCover/README.md b/contrib/experimental_dict_builders/fastCover/README.md index 66e00ee04..ad377743f 100644 --- a/contrib/experimental_dict_builders/fastCover/README.md +++ b/contrib/experimental_dict_builders/fastCover/README.md @@ -16,8 +16,8 @@ make test ###Usage: -To build a random dictionary with the provided arguments: make ARG= followed by arguments - +To build a FASTCOVER dictionary with the provided arguments: make ARG= followed by arguments +If k or d is not provided, the optimize version of FASTCOVER is run. ### Examples: make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520" diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.c b/contrib/experimental_dict_builders/fastCover/fastCover.c index cf71075ab..84d841b10 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.c +++ b/contrib/experimental_dict_builders/fastCover/fastCover.c @@ -629,6 +629,55 @@ _cleanup: } } +ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) { + BYTE* const dict = (BYTE*)dictBuffer; + FASTCOVER_ctx_t ctx; + parameters.splitPoint = 1.0; + /* Initialize global data */ + g_displayLevel = parameters.zParams.notificationLevel; + /* Checks */ + if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + /* Initialize context */ + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + parameters.d, parameters.splitPoint, parameters.f)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return ERROR(GENERIC); + } + /* Build the dictionary */ + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer, + dictBufferCapacity, parameters); + + const size_t dictionarySize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples, + parameters.zParams); + if (!ZSTD_isError(dictionarySize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictionarySize); + } + FASTCOVER_ctx_destroy(&ctx); + return dictionarySize; + } +} + + + ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, @@ -657,15 +706,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( /* Checks */ if (splitPoint <= 0 || splitPoint > 1) { - LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n"); return ERROR(GENERIC); } if (kMinK < kMaxD || kMaxK < kMinK) { - LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n"); return ERROR(GENERIC); } if (nbSamples == 0) { - DISPLAYLEVEL(1, "fast must have at least one input file\n"); + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); return ERROR(GENERIC); } if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.h b/contrib/experimental_dict_builders/fastCover/fastCover.h index eca04baab..958e9f423 100644 --- a/contrib/experimental_dict_builders/fastCover/fastCover.h +++ b/contrib/experimental_dict_builders/fastCover/fastCover.h @@ -12,9 +12,6 @@ #include "zdict.h" - - - typedef struct { unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ @@ -26,7 +23,6 @@ typedef struct { } ZDICT_fastCover_params_t; - /*! ZDICT_optimizeTrainFromBuffer_fastCover(): * Train a dictionary from an array of samples using a modified version of the COVER algorithm. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, @@ -41,7 +37,21 @@ typedef struct { * or an error code, which can be tested with ZDICT_isError(). * On success `*parameters` contains the parameters selected. */ -ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t *parameters); + + +/*! ZDICT_trainFromBuffer_fastCover(): + * Train a dictionary from an array of samples using a modified version of the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * d, k, and f are required. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_fastCover_params_t *parameters); + const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters); diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c index f286b0506..df7d91812 100644 --- a/contrib/experimental_dict_builders/fastCover/main.c +++ b/contrib/experimental_dict_builders/fastCover/main.c @@ -64,8 +64,14 @@ int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info, EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ { size_t dictSize; - dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, - info->samplesSizes, info->nbSamples, params); + /* Run the optimize version if either k or d is not provided */ + if (!params->d || !params->k) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, params); + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + } DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100)); if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ @@ -92,8 +98,8 @@ int main(int argCount, const char* argv[]) int operationResult = 0; /* Initialize arguments to default values */ - unsigned k = 200; - unsigned d = 8; + unsigned k = 0; + unsigned d = 0; unsigned f = 23; unsigned steps = 32; unsigned nbThreads = 1; diff --git a/contrib/experimental_dict_builders/fastCover/test.sh b/contrib/experimental_dict_builders/fastCover/test.sh index 91d4f4923..f86915b59 100644 --- a/contrib/experimental_dict_builders/fastCover/test.sh +++ b/contrib/experimental_dict_builders/fastCover/test.sh @@ -1,8 +1,8 @@ -echo "Building fastCover dictionary with in=../../lib/common k=200 f=20 out=dict1" -./main in=../../../lib/common k=200 f=20 out=dict1 +echo "Building fastCover dictionary with in=../../lib/common f=20 out=dict1" +./main in=../../../lib/common f=20 out=dict1 zstd -be3 -D dict1 -r ../../../lib/common -q -echo "Building fastCover dictionary with in=../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000" -./main in=../../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000 +echo "Building fastCover dictionary with in=../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000" +./main in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000 zstd -be3 -D dict2 -r ../../../lib/common -q echo "Building fastCover dictionary with 2 sample sources" ./main in=../../../lib/common in=../../../lib/compress out=dict3