1
0
mirror of https://github.com/facebook/zstd.git synced 2025-08-05 19:15:58 +03:00

Add non-optimize FASTCOVER (#1260)

* Add non-optimize FASTCOVER

* Minor fix

* Pass param as value instead of pointer
This commit is contained in:
Jennifer Liu
2018-08-01 11:06:16 -07:00
committed by Nick Terrell
parent 1420129fda
commit 0acb0abd1e
7 changed files with 201 additions and 126 deletions

View File

@@ -18,109 +18,109 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
- Fourth column is chosen d and fifth column is chosen k - Fourth column is chosen d and fifth column is chosen k
github: github:
NODICT 0.000005 2.999642 NODICT 0.000025 2.999642
RANDOM 0.036114 8.791189 RANDOM 0.030101 8.791189
LEGACY 1.111024 8.173529 LEGACY 0.913108 8.173529
COVER 57.856477 10.652243 8 1298 COVER 59.234160 10.652243 8 1298
COVER 5.769965 10.652243 8 1298 COVER 6.258459 10.652243 8 1298
FAST15 9.965877 10.555630 8 1874 FAST15 9.959246 10.555630 8 1874
FAST15 0.140285 10.555630 8 1874 FAST15 0.077719 10.555630 8 1874
FAST16 10.337194 10.701698 8 1106 FAST16 10.028343 10.701698 8 1106
FAST16 0.114887 10.701698 8 1106 FAST16 0.078117 10.701698 8 1106
FAST17 10.207121 10.650652 8 1106 FAST17 10.567355 10.650652 8 1106
FAST17 0.135424 10.650652 8 1106 FAST17 0.124833 10.650652 8 1106
FAST18 11.463120 10.499142 8 1826 FAST18 11.795287 10.499142 8 1826
FAST18 0.154287 10.499142 8 1826 FAST18 0.086992 10.499142 8 1826
FAST19 12.143020 10.527140 8 1826 FAST19 13.132451 10.527140 8 1826
FAST19 0.158889 10.527140 8 1826 FAST19 0.134716 10.527140 8 1826
FAST20 12.510857 10.494710 8 1826 FAST20 14.366314 10.494710 8 1826
FAST20 0.171334 10.494710 8 1826 FAST20 0.128844 10.494710 8 1826
FAST21 13.201432 10.503488 8 1778 FAST21 14.941238 10.503488 8 1778
FAST21 0.192867 10.503488 8 1778 FAST21 0.134975 10.503488 8 1778
FAST22 13.754560 10.509284 8 1826 FAST22 15.146226 10.509284 8 1826
FAST22 0.206276 10.509284 8 1826 FAST22 0.146918 10.509284 8 1826
FAST23 14.708633 10.509284 8 1826 FAST23 16.260552 10.509284 8 1826
FAST23 0.221751 10.509284 8 1826 FAST23 0.158494 10.509284 8 1826
FAST24 15.134848 10.512369 8 1826 FAST24 16.806037 10.512369 8 1826
FAST24 0.234242 10.512369 8 1826 FAST24 0.190464 10.512369 8 1826
hg-commands: hg-commands:
NODICT 0.000004 2.425291 NODICT 0.000026 2.425291
RANDOM 0.055073 3.490331 RANDOM 0.046270 3.490331
LEGACY 0.927414 3.911682 LEGACY 0.847904 3.911682
COVER 72.749028 4.132653 8 386 COVER 71.691804 4.132653 8 386
COVER 3.391066 4.132653 8 386 COVER 3.187085 4.132653 8 386
FAST15 10.910989 3.920720 6 1106 FAST15 11.593687 3.920720 6 1106
FAST15 0.130480 3.920720 6 1106 FAST15 0.082431 3.920720 6 1106
FAST16 10.565224 4.033306 8 674 FAST16 11.775958 4.033306 8 674
FAST16 0.146228 4.033306 8 674 FAST16 0.092587 4.033306 8 674
FAST17 11.394137 4.064132 8 1490 FAST17 11.965064 4.064132 8 1490
FAST17 0.175567 4.064132 8 1490 FAST17 0.106382 4.064132 8 1490
FAST18 11.040248 4.086714 8 290 FAST18 11.438197 4.086714 8 290
FAST18 0.132692 4.086714 8 290 FAST18 0.097293 4.086714 8 290
FAST19 11.335856 4.097947 8 578 FAST19 12.292512 4.097947 8 578
FAST19 0.181441 4.097947 8 578 FAST19 0.104406 4.097947 8 578
FAST20 14.166272 4.102851 8 434 FAST20 13.857857 4.102851 8 434
FAST20 0.203632 4.102851 8 434 FAST20 0.139467 4.102851 8 434
FAST21 15.848896 4.105350 8 530 FAST21 14.599613 4.105350 8 530
FAST21 0.269518 4.105350 8 530 FAST21 0.189416 4.105350 8 530
FAST22 15.570995 4.104100 8 530 FAST22 15.966109 4.104100 8 530
FAST22 0.238512 4.104100 8 530 FAST22 0.183817 4.104100 8 530
FAST23 17.437566 4.098110 8 914 FAST23 18.033645 4.098110 8 914
FAST23 0.270788 4.098110 8 914 FAST23 0.246641 4.098110 8 914
FAST24 18.836604 4.117367 8 722 FAST24 22.992891 4.117367 8 722
FAST24 0.323618 4.117367 8 722 FAST24 0.285994 4.117367 8 722
hg-changelog: hg-changelog:
NODICT 0.000006 1.377613 NODICT 0.000007 1.377613
RANDOM 0.253393 2.097487 RANDOM 0.297345 2.097487
LEGACY 2.410568 2.058907 LEGACY 2.633992 2.058907
COVER 203.550681 2.189685 8 98 COVER 219.179786 2.189685 8 98
COVER 7.381697 2.189685 8 98 COVER 6.620852 2.189685 8 98
FAST15 45.960609 2.130794 6 386 FAST15 47.635082 2.130794 6 386
FAST15 0.512057 2.130794 6 386 FAST15 0.321297 2.130794 6 386
FAST16 44.594817 2.144845 8 194 FAST16 43.837676 2.144845 8 194
FAST16 0.601258 2.144845 8 194 FAST16 0.312640 2.144845 8 194
FAST17 45.852992 2.156099 8 242 FAST17 49.349017 2.156099 8 242
FAST17 0.500844 2.156099 8 242 FAST17 0.348459 2.156099 8 242
FAST18 46.624930 2.172439 6 98 FAST18 51.153784 2.172439 6 98
FAST18 0.680501 2.172439 6 98 FAST18 0.353106 2.172439 6 98
FAST19 47.754905 2.180321 6 98 FAST19 52.627045 2.180321 6 98
FAST19 0.606180 2.180321 6 98 FAST19 0.390612 2.180321 6 98
FAST20 56.733632 2.187431 6 98 FAST20 63.748782 2.187431 6 98
FAST20 0.710149 2.187431 6 98 FAST20 0.489544 2.187431 6 98
FAST21 59.723173 2.184185 6 146 FAST21 68.709198 2.184185 6 146
FAST21 0.875562 2.184185 6 146 FAST21 0.530852 2.184185 6 146
FAST22 66.570788 2.182830 6 98 FAST22 68.491639 2.182830 6 98
FAST22 1.061013 2.182830 6 98 FAST22 0.645699 2.182830 6 98
FAST23 73.817645 2.186399 8 98 FAST23 72.558688 2.186399 8 98
FAST23 0.838496 2.186399 8 98 FAST23 0.593539 2.186399 8 98
FAST24 78.059933 2.185608 6 98 FAST24 76.137195 2.185608 6 98
FAST24 0.843158 2.185608 6 98 FAST24 0.680132 2.185608 6 98
hg-manifest: hg-manifest:
NODICT 0.000005 1.866385 NODICT 0.000026 1.866385
RANDOM 0.735840 2.309436 RANDOM 0.784554 2.309436
LEGACY 9.322081 2.506977 LEGACY 10.193714 2.506977
COVER 885.961515 2.582528 8 434 COVER 988.206583 2.582528 8 434
COVER 32.678552 2.582528 8 434 COVER 39.726199 2.582528 8 434
FAST15 114.414413 2.392920 6 1826 FAST15 168.388819 2.392920 6 1826
FAST15 1.412690 2.392920 6 1826 FAST15 1.272178 2.392920 6 1826
FAST16 113.869718 2.480762 6 1922 FAST16 161.822607 2.480762 6 1922
FAST16 1.539424 2.480762 6 1922 FAST16 1.164908 2.480762 6 1922
FAST17 113.333636 2.548285 6 1682 FAST17 157.688544 2.548285 6 1682
FAST17 1.473196 2.548285 6 1682 FAST17 1.222439 2.548285 6 1682
FAST18 111.717871 2.567634 6 386 FAST18 154.529585 2.567634 6 386
FAST18 1.421200 2.567634 6 386 FAST18 1.217596 2.567634 6 386
FAST19 112.428344 2.581653 8 338 FAST19 160.244979 2.581653 8 338
FAST19 1.412185 2.581653 8 338 FAST19 1.282450 2.581653 8 338
FAST20 128.897480 2.586881 8 194 FAST20 191.503297 2.586881 8 194
FAST20 1.586570 2.586881 8 194 FAST20 2.009748 2.586881 8 194
FAST21 168.465684 2.590051 6 242 FAST21 226.389709 2.590051 6 242
FAST21 2.190732 2.590051 6 242 FAST21 2.494543 2.590051 6 242
FAST22 202.320435 2.591376 6 194 FAST22 217.859055 2.591376 6 194
FAST22 2.667877 2.591376 6 194 FAST22 2.295693 2.591376 6 194
FAST23 228.952201 2.591131 8 434 FAST23 236.819791 2.591131 8 434
FAST23 3.315501 2.591131 8 434 FAST23 2.744711 2.591131 8 434
FAST24 327.320020 2.591548 6 290 FAST24 269.187800 2.591548 6 290
FAST24 5.048348 2.591548 6 290 FAST24 2.923671 2.591548 6 290

View File

@@ -91,14 +91,26 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *randomParams); info->samplesSizes, info->nbSamples, *randomParams);
}else if(coverParams) { }else if(coverParams) {
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, /* Run the optimize version if either k or d is not provided */
info->samplesSizes, info->nbSamples, coverParams); if (!coverParams->d || !coverParams->k){
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, coverParams);
} else {
dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *coverParams);
}
} else if(legacyParams) { } else if(legacyParams) {
dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *legacyParams); info->samplesSizes, info->nbSamples, *legacyParams);
} else if(fastParams) { } else if(fastParams) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, /* Run the optimize version if either k or d is not provided */
info->samplesSizes, info->nbSamples, fastParams); if (!fastParams->d || !fastParams->k) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, fastParams);
} else {
dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *fastParams);
}
} else { } else {
dictSize = 0; dictSize = 0;
} }
@@ -403,7 +415,6 @@ int main(int argCount, const char* argv[])
goto _cleanup; goto _cleanup;
} }
/* for fastCover (with k and d provided) */ /* for fastCover (with k and d provided) */
const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam); const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100)); DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
@@ -411,7 +422,6 @@ int main(int argCount, const char* argv[])
result = 1; result = 1;
goto _cleanup; goto _cleanup;
} }
} }

View File

@@ -16,8 +16,8 @@ make test
###Usage: ###Usage:
To build a random dictionary with the provided arguments: make ARG= followed by arguments To build a FASTCOVER dictionary with the provided arguments: make ARG= followed by arguments
If k or d is not provided, the optimize version of FASTCOVER is run.
### Examples: ### Examples:
make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520" make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"

View File

@@ -629,6 +629,55 @@ _cleanup:
} }
} }
/*! ZDICT_trainFromBuffer_fastCover():
 *  Build a dictionary with the FASTCOVER algorithm using the caller-supplied
 *  parameters as-is (no parameter optimization pass is run here).
 *
 *  dictBuffer / dictBufferCapacity : destination buffer and its size; the
 *      capacity must be at least ZDICT_DICTSIZE_MIN.
 *  samplesBuffer / samplesSizes / nbSamples : the training samples and their
 *      sizes; nbSamples must be non-zero.
 *  parameters : passed by value, so the caller's copy is never modified.
 *      Its splitPoint field is overwritten with 1.0 before any validation.
 *
 *  @return : the value returned by ZDICT_finalizeDictionary() (dictionary size
 *            on success, or an error code), or an error code when one of the
 *            preliminary checks or the context initialization fails.
 */
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters)
{
    BYTE* const dictStart = (BYTE*)dictBuffer;
    FASTCOVER_ctx_t ctx;
    size_t tailOffset;
    size_t finalSize;

    /* Any splitPoint supplied by the caller is ignored in this entry point */
    parameters.splitPoint = 1.0;

    /* The display level is global state; set it before anything is logged */
    g_displayLevel = parameters.zParams.notificationLevel;

    /* Validate the request before allocating anything */
    if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
        return ERROR(GENERIC);
    }
    if (nbSamples == 0) {
        DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
        return ERROR(GENERIC);
    }
    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
        DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                     ZDICT_DICTSIZE_MIN);
        return ERROR(dstSize_tooSmall);
    }

    /* Set up the training context */
    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                            parameters.d, parameters.splitPoint, parameters.f)) {
        DISPLAYLEVEL(1, "Failed to initialize context\n");
        return ERROR(GENERIC);
    }

    /* Select the dictionary content, then let ZDICT finalize it in place.
     * The content handed to the finalizer is the region
     * [dictStart + tailOffset, dictStart + dictBufferCapacity). */
    DISPLAYLEVEL(2, "Building dictionary\n");
    tailOffset = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
                                           dictBufferCapacity, parameters);
    finalSize = ZDICT_finalizeDictionary(
        dictStart, dictBufferCapacity,
        dictStart + tailOffset, dictBufferCapacity - tailOffset,
        samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples,
        parameters.zParams);
    if (!ZSTD_isError(finalSize)) {
        DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                     (U32)finalSize);
    }

    /* The context is released on both the success and the error path */
    FASTCOVER_ctx_destroy(&ctx);
    return finalSize;
}
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples, const size_t *samplesSizes, unsigned nbSamples,
@@ -657,15 +706,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
/* Checks */ /* Checks */
if (splitPoint <= 0 || splitPoint > 1) { if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
return ERROR(GENERIC); return ERROR(GENERIC);
} }
if (kMinK < kMaxD || kMaxK < kMinK) { if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
return ERROR(GENERIC); return ERROR(GENERIC);
} }
if (nbSamples == 0) { if (nbSamples == 0) {
DISPLAYLEVEL(1, "fast must have at least one input file\n"); DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
return ERROR(GENERIC); return ERROR(GENERIC);
} }
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {

View File

@@ -12,9 +12,6 @@
#include "zdict.h" #include "zdict.h"
typedef struct { typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
@@ -26,7 +23,6 @@ typedef struct {
} ZDICT_fastCover_params_t; } ZDICT_fastCover_params_t;
/*! ZDICT_optimizeTrainFromBuffer_fastCover(): /*! ZDICT_optimizeTrainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of the COVER algorithm. * Train a dictionary from an array of samples using a modified version of the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
@@ -41,7 +37,21 @@ typedef struct {
* or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
* On success `*parameters` contains the parameters selected. * On success `*parameters` contains the parameters selected.
*/ */
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t *parameters);
/*! ZDICT_trainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* d, k, and f are required.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples, const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters);
ZDICT_fastCover_params_t *parameters);

View File

@@ -64,8 +64,14 @@ int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info,
EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */
{ size_t dictSize; { size_t dictSize;
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, /* Run the optimize version if either k or d is not provided */
info->samplesSizes, info->nbSamples, params); if (!params->d || !params->k) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, params);
} else {
dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *params);
}
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100)); DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100));
if (ZDICT_isError(dictSize)) { if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
@@ -92,8 +98,8 @@ int main(int argCount, const char* argv[])
int operationResult = 0; int operationResult = 0;
/* Initialize arguments to default values */ /* Initialize arguments to default values */
unsigned k = 200; unsigned k = 0;
unsigned d = 8; unsigned d = 0;
unsigned f = 23; unsigned f = 23;
unsigned steps = 32; unsigned steps = 32;
unsigned nbThreads = 1; unsigned nbThreads = 1;

View File

@@ -1,8 +1,8 @@
echo "Building fastCover dictionary with in=../../lib/common k=200 f=20 out=dict1" echo "Building fastCover dictionary with in=../../lib/common f=20 out=dict1"
./main in=../../../lib/common k=200 f=20 out=dict1 ./main in=../../../lib/common f=20 out=dict1
zstd -be3 -D dict1 -r ../../../lib/common -q zstd -be3 -D dict1 -r ../../../lib/common -q
echo "Building fastCover dictionary with in=../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000" echo "Building fastCover dictionary with in=../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000"
./main in=../../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000 ./main in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000
zstd -be3 -D dict2 -r ../../../lib/common -q zstd -be3 -D dict2 -r ../../../lib/common -q
echo "Building fastCover dictionary with 2 sample sources" echo "Building fastCover dictionary with 2 sample sources"
./main in=../../../lib/common in=../../../lib/compress out=dict3 ./main in=../../../lib/common in=../../../lib/compress out=dict3