1
0
mirror of https://github.com/facebook/zstd.git synced 2025-08-01 09:47:01 +03:00

Add non-optimize FASTCOVER (#1260)

* Add non-optimize FASTCOVER

* Minor fix

* Pass param as value instead of pointer
This commit is contained in:
Jennifer Liu
2018-08-01 11:06:16 -07:00
committed by Nick Terrell
parent 1420129fda
commit 0acb0abd1e
7 changed files with 201 additions and 126 deletions

View File

@ -18,109 +18,109 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
- Fourth column is chosen d and fifth column is chosen k
github:
NODICT 0.000005 2.999642
RANDOM 0.036114 8.791189
LEGACY 1.111024 8.173529
COVER 57.856477 10.652243 8 1298
COVER 5.769965 10.652243 8 1298
FAST15 9.965877 10.555630 8 1874
FAST15 0.140285 10.555630 8 1874
FAST16 10.337194 10.701698 8 1106
FAST16 0.114887 10.701698 8 1106
FAST17 10.207121 10.650652 8 1106
FAST17 0.135424 10.650652 8 1106
FAST18 11.463120 10.499142 8 1826
FAST18 0.154287 10.499142 8 1826
FAST19 12.143020 10.527140 8 1826
FAST19 0.158889 10.527140 8 1826
FAST20 12.510857 10.494710 8 1826
FAST20 0.171334 10.494710 8 1826
FAST21 13.201432 10.503488 8 1778
FAST21 0.192867 10.503488 8 1778
FAST22 13.754560 10.509284 8 1826
FAST22 0.206276 10.509284 8 1826
FAST23 14.708633 10.509284 8 1826
FAST23 0.221751 10.509284 8 1826
FAST24 15.134848 10.512369 8 1826
FAST24 0.234242 10.512369 8 1826
NODICT 0.000025 2.999642
RANDOM 0.030101 8.791189
LEGACY 0.913108 8.173529
COVER 59.234160 10.652243 8 1298
COVER 6.258459 10.652243 8 1298
FAST15 9.959246 10.555630 8 1874
FAST15 0.077719 10.555630 8 1874
FAST16 10.028343 10.701698 8 1106
FAST16 0.078117 10.701698 8 1106
FAST17 10.567355 10.650652 8 1106
FAST17 0.124833 10.650652 8 1106
FAST18 11.795287 10.499142 8 1826
FAST18 0.086992 10.499142 8 1826
FAST19 13.132451 10.527140 8 1826
FAST19 0.134716 10.527140 8 1826
FAST20 14.366314 10.494710 8 1826
FAST20 0.128844 10.494710 8 1826
FAST21 14.941238 10.503488 8 1778
FAST21 0.134975 10.503488 8 1778
FAST22 15.146226 10.509284 8 1826
FAST22 0.146918 10.509284 8 1826
FAST23 16.260552 10.509284 8 1826
FAST23 0.158494 10.509284 8 1826
FAST24 16.806037 10.512369 8 1826
FAST24 0.190464 10.512369 8 1826
hg-commands:
NODICT 0.000004 2.425291
RANDOM 0.055073 3.490331
LEGACY 0.927414 3.911682
COVER 72.749028 4.132653 8 386
COVER 3.391066 4.132653 8 386
FAST15 10.910989 3.920720 6 1106
FAST15 0.130480 3.920720 6 1106
FAST16 10.565224 4.033306 8 674
FAST16 0.146228 4.033306 8 674
FAST17 11.394137 4.064132 8 1490
FAST17 0.175567 4.064132 8 1490
FAST18 11.040248 4.086714 8 290
FAST18 0.132692 4.086714 8 290
FAST19 11.335856 4.097947 8 578
FAST19 0.181441 4.097947 8 578
FAST20 14.166272 4.102851 8 434
FAST20 0.203632 4.102851 8 434
FAST21 15.848896 4.105350 8 530
FAST21 0.269518 4.105350 8 530
FAST22 15.570995 4.104100 8 530
FAST22 0.238512 4.104100 8 530
FAST23 17.437566 4.098110 8 914
FAST23 0.270788 4.098110 8 914
FAST24 18.836604 4.117367 8 722
FAST24 0.323618 4.117367 8 722
NODICT 0.000026 2.425291
RANDOM 0.046270 3.490331
LEGACY 0.847904 3.911682
COVER 71.691804 4.132653 8 386
COVER 3.187085 4.132653 8 386
FAST15 11.593687 3.920720 6 1106
FAST15 0.082431 3.920720 6 1106
FAST16 11.775958 4.033306 8 674
FAST16 0.092587 4.033306 8 674
FAST17 11.965064 4.064132 8 1490
FAST17 0.106382 4.064132 8 1490
FAST18 11.438197 4.086714 8 290
FAST18 0.097293 4.086714 8 290
FAST19 12.292512 4.097947 8 578
FAST19 0.104406 4.097947 8 578
FAST20 13.857857 4.102851 8 434
FAST20 0.139467 4.102851 8 434
FAST21 14.599613 4.105350 8 530
FAST21 0.189416 4.105350 8 530
FAST22 15.966109 4.104100 8 530
FAST22 0.183817 4.104100 8 530
FAST23 18.033645 4.098110 8 914
FAST23 0.246641 4.098110 8 914
FAST24 22.992891 4.117367 8 722
FAST24 0.285994 4.117367 8 722
hg-changelog:
NODICT 0.000006 1.377613
RANDOM 0.253393 2.097487
LEGACY 2.410568 2.058907
COVER 203.550681 2.189685 8 98
COVER 7.381697 2.189685 8 98
FAST15 45.960609 2.130794 6 386
FAST15 0.512057 2.130794 6 386
FAST16 44.594817 2.144845 8 194
FAST16 0.601258 2.144845 8 194
FAST17 45.852992 2.156099 8 242
FAST17 0.500844 2.156099 8 242
FAST18 46.624930 2.172439 6 98
FAST18 0.680501 2.172439 6 98
FAST19 47.754905 2.180321 6 98
FAST19 0.606180 2.180321 6 98
FAST20 56.733632 2.187431 6 98
FAST20 0.710149 2.187431 6 98
FAST21 59.723173 2.184185 6 146
FAST21 0.875562 2.184185 6 146
FAST22 66.570788 2.182830 6 98
FAST22 1.061013 2.182830 6 98
FAST23 73.817645 2.186399 8 98
FAST23 0.838496 2.186399 8 98
FAST24 78.059933 2.185608 6 98
FAST24 0.843158 2.185608 6 98
NODICT 0.000007 1.377613
RANDOM 0.297345 2.097487
LEGACY 2.633992 2.058907
COVER 219.179786 2.189685 8 98
COVER 6.620852 2.189685 8 98
FAST15 47.635082 2.130794 6 386
FAST15 0.321297 2.130794 6 386
FAST16 43.837676 2.144845 8 194
FAST16 0.312640 2.144845 8 194
FAST17 49.349017 2.156099 8 242
FAST17 0.348459 2.156099 8 242
FAST18 51.153784 2.172439 6 98
FAST18 0.353106 2.172439 6 98
FAST19 52.627045 2.180321 6 98
FAST19 0.390612 2.180321 6 98
FAST20 63.748782 2.187431 6 98
FAST20 0.489544 2.187431 6 98
FAST21 68.709198 2.184185 6 146
FAST21 0.530852 2.184185 6 146
FAST22 68.491639 2.182830 6 98
FAST22 0.645699 2.182830 6 98
FAST23 72.558688 2.186399 8 98
FAST23 0.593539 2.186399 8 98
FAST24 76.137195 2.185608 6 98
FAST24 0.680132 2.185608 6 98
hg-manifest:
NODICT 0.000005 1.866385
RANDOM 0.735840 2.309436
LEGACY 9.322081 2.506977
COVER 885.961515 2.582528 8 434
COVER 32.678552 2.582528 8 434
FAST15 114.414413 2.392920 6 1826
FAST15 1.412690 2.392920 6 1826
FAST16 113.869718 2.480762 6 1922
FAST16 1.539424 2.480762 6 1922
FAST17 113.333636 2.548285 6 1682
FAST17 1.473196 2.548285 6 1682
FAST18 111.717871 2.567634 6 386
FAST18 1.421200 2.567634 6 386
FAST19 112.428344 2.581653 8 338
FAST19 1.412185 2.581653 8 338
FAST20 128.897480 2.586881 8 194
FAST20 1.586570 2.586881 8 194
FAST21 168.465684 2.590051 6 242
FAST21 2.190732 2.590051 6 242
FAST22 202.320435 2.591376 6 194
FAST22 2.667877 2.591376 6 194
FAST23 228.952201 2.591131 8 434
FAST23 3.315501 2.591131 8 434
FAST24 327.320020 2.591548 6 290
FAST24 5.048348 2.591548 6 290
NODICT 0.000026 1.866385
RANDOM 0.784554 2.309436
LEGACY 10.193714 2.506977
COVER 988.206583 2.582528 8 434
COVER 39.726199 2.582528 8 434
FAST15 168.388819 2.392920 6 1826
FAST15 1.272178 2.392920 6 1826
FAST16 161.822607 2.480762 6 1922
FAST16 1.164908 2.480762 6 1922
FAST17 157.688544 2.548285 6 1682
FAST17 1.222439 2.548285 6 1682
FAST18 154.529585 2.567634 6 386
FAST18 1.217596 2.567634 6 386
FAST19 160.244979 2.581653 8 338
FAST19 1.282450 2.581653 8 338
FAST20 191.503297 2.586881 8 194
FAST20 2.009748 2.586881 8 194
FAST21 226.389709 2.590051 6 242
FAST21 2.494543 2.590051 6 242
FAST22 217.859055 2.591376 6 194
FAST22 2.295693 2.591376 6 194
FAST23 236.819791 2.591131 8 434
FAST23 2.744711 2.591131 8 434
FAST24 269.187800 2.591548 6 290
FAST24 2.923671 2.591548 6 290

View File

@ -91,14 +91,26 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *randomParams);
}else if(coverParams) {
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, coverParams);
/* Run the optimize version if either k or d is not provided */
if (!coverParams->d || !coverParams->k){
dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, coverParams);
} else {
dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *coverParams);
}
} else if(legacyParams) {
dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *legacyParams);
} else if(fastParams) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, fastParams);
/* Run the optimize version if either k or d is not provided */
if (!fastParams->d || !fastParams->k) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, fastParams);
} else {
dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *fastParams);
}
} else {
dictSize = 0;
}
@ -403,7 +415,6 @@ int main(int argCount, const char* argv[])
goto _cleanup;
}
/* for fastCover (with k and d provided) */
const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
@ -411,7 +422,6 @@ int main(int argCount, const char* argv[])
result = 1;
goto _cleanup;
}
}

View File

@ -16,8 +16,8 @@ make test
### Usage:
To build a random dictionary with the provided arguments: make ARG= followed by arguments
To build a FASTCOVER dictionary with the provided arguments: make ARG= followed by arguments
If k or d is not provided, the optimize version of FASTCOVER is run.
### Examples:
make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"

View File

@ -629,6 +629,55 @@ _cleanup:
}
}
/*! ZDICT_trainFromBuffer_fastCover():
 *  Builds one FASTCOVER dictionary into `dictBuffer` from the supplied samples,
 *  using the caller's parameters as given (a single build, no loop over
 *  candidate parameters).
 *
 *  `parameters` is received by value, so forcing `splitPoint` to 1.0 below only
 *  modifies this function's local copy.
 *
 *  @return : the result of ZDICT_finalizeDictionary() (a dictionary size or an
 *            error code) once the build has run;
 *            ERROR(GENERIC) if the parameter check fails, if `nbSamples` is 0,
 *            or if the context cannot be initialized;
 *            ERROR(dstSize_tooSmall) if `dictBufferCapacity` is below
 *            ZDICT_DICTSIZE_MIN.
 */
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters)
{
    BYTE* const dictStart = (BYTE*)dictBuffer;
    FASTCOVER_ctx_t ctx;
    size_t tailOffset;
    size_t finalSize;

    /* Override the split point on the local copy of the parameters
     * (presumably so that every sample is used for training -- confirm
     * against FASTCOVER_ctx_init). */
    parameters.splitPoint = 1.0;

    /* Publish the requested verbosity to the file-level display level */
    g_displayLevel = parameters.zParams.notificationLevel;

    /* Validate arguments before touching any sample data */
    if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
        return ERROR(GENERIC);
    }
    if (!nbSamples) {
        DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
        return ERROR(GENERIC);
    }
    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
        DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                     ZDICT_DICTSIZE_MIN);
        return ERROR(dstSize_tooSmall);
    }

    /* Set up the training context */
    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                            parameters.d, parameters.splitPoint, parameters.f)) {
        DISPLAYLEVEL(1, "Failed to initialize context\n");
        return ERROR(GENERIC);
    }

    /* Select the dictionary content, then finalize it in place.
     * `tailOffset` is the offset inside `dictBuffer` handed to
     * ZDICT_finalizeDictionary() as the start of the selected content. */
    DISPLAYLEVEL(2, "Building dictionary\n");
    tailOffset = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
                                           dictBufferCapacity, parameters);
    finalSize = ZDICT_finalizeDictionary(
                    dictStart, dictBufferCapacity,
                    dictStart + tailOffset, dictBufferCapacity - tailOffset,
                    samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples,
                    parameters.zParams);
    if (!ZSTD_isError(finalSize)) {
        DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                     (U32)finalSize);
    }

    /* Release the context on both the success and the error outcome */
    FASTCOVER_ctx_destroy(&ctx);
    return finalSize;
}
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
@ -657,15 +706,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
/* Checks */
if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
return ERROR(GENERIC);
}
if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
return ERROR(GENERIC);
}
if (nbSamples == 0) {
DISPLAYLEVEL(1, "fast must have at least one input file\n");
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
return ERROR(GENERIC);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {

View File

@ -12,9 +12,6 @@
#include "zdict.h"
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
@ -26,7 +23,6 @@ typedef struct {
} ZDICT_fastCover_params_t;
/*! ZDICT_optimizeTrainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
@ -41,7 +37,21 @@ typedef struct {
* or an error code, which can be tested with ZDICT_isError().
* On success `*parameters` contains the parameters selected.
*/
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t *parameters);
/*! ZDICT_trainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* d, k, and f are required.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t *parameters);
const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters);

View File

@ -64,8 +64,14 @@ int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info,
EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */
{ size_t dictSize;
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, params);
/* Run the optimize version if either k or d is not provided */
if (!params->d || !params->k) {
dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, params);
} else {
dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
info->samplesSizes, info->nbSamples, *params);
}
DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100));
if (ZDICT_isError(dictSize)) {
DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
@ -92,8 +98,8 @@ int main(int argCount, const char* argv[])
int operationResult = 0;
/* Initialize arguments to default values */
unsigned k = 200;
unsigned d = 8;
unsigned k = 0;
unsigned d = 0;
unsigned f = 23;
unsigned steps = 32;
unsigned nbThreads = 1;

View File

@ -1,8 +1,8 @@
echo "Building fastCover dictionary with in=../../lib/common k=200 f=20 out=dict1"
./main in=../../../lib/common k=200 f=20 out=dict1
echo "Building fastCover dictionary with in=../../lib/common f=20 out=dict1"
./main in=../../../lib/common f=20 out=dict1
zstd -be3 -D dict1 -r ../../../lib/common -q
echo "Building fastCover dictionary with in=../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000"
./main in=../../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000
echo "Building fastCover dictionary with in=../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000"
./main in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000
zstd -be3 -D dict2 -r ../../../lib/common -q
echo "Building fastCover dictionary with 2 sample sources"
./main in=../../../lib/common in=../../../lib/compress out=dict3