mirror of https://github.com/facebook/zstd.git synced 2025-07-30 22:23:13 +03:00

Merge pull request #1225 from jennifermliu/dev

Split samples when building dictionary for COVER
Nick Terrell, 2018-07-13 13:26:15 -07:00, committed by GitHub
8 changed files with 76 additions and 21 deletions
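
At its core, the patch splits the sample set inside COVER_ctx_init: when splitPoint is below 1.0, the first nbSamples * splitPoint samples form the training set and the remaining samples form the test set used to score candidate dictionaries (see the COVER_ctx_init hunk below). A minimal standalone sketch of that arithmetic follows; the sample count and the 80% split are illustrative values, not taken from the patch.

#include <stdio.h>

/* Mirrors the train/test split computed in COVER_ctx_init. */
int main(void) {
    const unsigned nbSamples = 10;  /* illustrative sample count */
    const double splitPoint = 0.8;  /* e.g. --train-cover=split=80 */
    const unsigned nbTrainSamples =
        splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
    const unsigned nbTestSamples =
        splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
    printf("Training on %u samples, testing on %u samples\n",
           nbTrainSamples, nbTestSamples);
    return 0;
}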

View File

@@ -39,6 +39,7 @@
 * Constants
 ***************************************/
 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define DEFAULT_SPLITPOINT 1.0
 /*-*************************************
 * Console display
@@ -203,6 +204,8 @@ typedef struct {
   size_t *offsets;
   const size_t *samplesSizes;
   size_t nbSamples;
+  size_t nbTrainSamples;
+  size_t nbTestSamples;
   U32 *suffix;
   size_t suffixSize;
   U32 *freqs;
@@ -222,7 +225,7 @@ static COVER_ctx_t *g_ctx = NULL;
  */
 static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
   size_t sum = 0;
-  size_t i;
+  unsigned i;
   for (i = 0; i < nbSamples; ++i) {
     sum += samplesSizes[i];
   }
@@ -494,6 +497,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters,
   if (parameters.d > parameters.k) {
     return 0;
   }
+  /* 0 < splitPoint <= 1 */
+  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
+    return 0;
+  }
   return 1;
 }
@@ -531,9 +538,14 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  */
 static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                           const size_t *samplesSizes, unsigned nbSamples,
-                          unsigned d) {
+                          unsigned d, double splitPoint) {
   const BYTE *const samples = (const BYTE *)samplesBuffer;
   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
+  /* Split samples into testing and training sets */
+  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
+  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
+  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
+  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
   /* Checks */
   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
@@ -541,15 +553,29 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                  (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
     return 0;
   }
+  /* Check if there are at least 5 training samples */
+  if (nbTrainSamples < 5) {
+    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
+    return 0;
+  }
+  /* Check if there's testing sample */
+  if (nbTestSamples < 1) {
+    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
+    return 0;
+  }
   /* Zero the context */
   memset(ctx, 0, sizeof(*ctx));
-  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples,
-               (U32)totalSamplesSize);
+  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
+               (U32)trainingSamplesSize);
+  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
+               (U32)testSamplesSize);
   ctx->samples = samples;
   ctx->samplesSizes = samplesSizes;
   ctx->nbSamples = nbSamples;
+  ctx->nbTrainSamples = nbTrainSamples;
+  ctx->nbTestSamples = nbTestSamples;
   /* Partial suffix array */
-  ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
+  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
   /* Maps index to the dmerID */
   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -563,7 +589,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   ctx->freqs = NULL;
   ctx->d = d;
-  /* Fill offsets from the samlesSizes */
+  /* Fill offsets from the samplesSizes */
   {
     U32 i;
     ctx->offsets[0] = 0;
@@ -665,7 +691,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   BYTE* const dict = (BYTE*)dictBuffer;
   COVER_ctx_t ctx;
   COVER_map_t activeDmers;
+  parameters.splitPoint = 1.0;
   /* Initialize global data */
   g_displayLevel = parameters.zParams.notificationLevel;
   /* Checks */
@@ -684,7 +710,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
   }
   /* Initialize context and activeDmers */
   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
-                      parameters.d)) {
+                      parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
@@ -839,7 +865,7 @@ typedef struct COVER_tryParameters_data_s {
 } COVER_tryParameters_data_t;
 /**
- * Tries a set of parameters and upates the COVER_best_t with the results.
+ * Tries a set of parameters and updates the COVER_best_t with the results.
  * This function is thread safe if zstd is compiled with multithreaded support.
  * It takes its parameters as an *OWNING* opaque pointer to support threading.
  */
@@ -870,7 +896,7 @@ static void COVER_tryParameters(void *opaque) {
                               dictBufferCapacity, parameters);
     dictBufferCapacity = ZDICT_finalizeDictionary(
         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
         parameters.zParams);
     if (ZDICT_isError(dictBufferCapacity)) {
       DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
@@ -889,7 +915,8 @@ static void COVER_tryParameters(void *opaque) {
   /* Allocate dst with enough space to compress the maximum sized sample */
   {
     size_t maxSampleSize = 0;
-    for (i = 0; i < ctx->nbSamples; ++i) {
+    i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+    for (; i < ctx->nbSamples; ++i) {
       maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize);
     }
     dstCapacity = ZSTD_compressBound(maxSampleSize);
@@ -904,7 +931,8 @@
   }
   /* Compress each sample and sum their sizes (or error) */
   totalCompressedSize = dictBufferCapacity;
-  for (i = 0; i < ctx->nbSamples; ++i) {
+  i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0;
+  for (; i < ctx->nbSamples; ++i) {
     const size_t size = ZSTD_compress_usingCDict(
         cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i],
         ctx->samplesSizes[i], cdict);
@@ -941,6 +969,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     ZDICT_cover_params_t *parameters) {
   /* constants */
   const unsigned nbThreads = parameters->nbThreads;
+  const double splitPoint =
+      parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint;
   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
@@ -958,6 +988,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   POOL_ctx *pool = NULL;
   /* Checks */
+  if (splitPoint <= 0 || splitPoint > 1) {
+    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+    return ERROR(GENERIC);
+  }
   if (kMinK < kMaxD || kMaxK < kMinK) {
     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
     return ERROR(GENERIC);
@@ -988,7 +1022,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
     /* Initialize the context for this value of d */
     COVER_ctx_t ctx;
     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
-    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) {
+    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
      LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
      COVER_best_destroy(&best);
      POOL_free(pool);
@@ -1013,6 +1047,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       data->parameters = *parameters;
       data->parameters.k = k;
       data->parameters.d = d;
+      data->parameters.splitPoint = splitPoint;
       data->parameters.steps = kSteps;
       data->parameters.zParams.notificationLevel = g_displayLevel;
       /* Check the parameters */

View File

@@ -86,6 +86,7 @@ typedef struct {
    unsigned d;          /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
    unsigned steps;      /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
    unsigned nbThreads;  /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
+   double splitPoint;   /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
    ZDICT_params_t zParams;
 } ZDICT_cover_params_t;
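
From the library side, splitPoint is simply set on ZDICT_cover_params_t before calling the cover trainer. A hedged usage sketch follows, assuming the experimental API exposed under ZDICT_STATIC_LINKING_ONLY; the helper name and the 80/20 split are illustrative, and sample-buffer setup is left to the caller.

#define ZDICT_STATIC_LINKING_ONLY  /* ZDICT_cover_params_t and the cover trainers are experimental API */
#include <string.h>                /* memset */
#include "zdict.h"

/* Illustrative helper: train a cover dictionary on the first 80% of the
 * samples and score candidate parameters on the remaining 20%. */
static size_t train_with_split(void* dictBuffer, size_t dictCapacity,
                               const void* samplesBuffer,
                               const size_t* samplesSizes, unsigned nbSamples)
{
    ZDICT_cover_params_t params;
    memset(&params, 0, sizeof(params));
    params.d = 8;             /* dmer size */
    params.steps = 4;         /* optimization steps */
    params.splitPoint = 0.8;  /* 0 falls back to the default of 1.0 */
    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictCapacity,
                                               samplesBuffer, samplesSizes,
                                               nbSamples, &params);
}

Passing splitPoint = 1.0 (or split=100 on the command line) keeps the previous behavior, where every sample is used for both training and testing.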

View File

@@ -150,7 +150,7 @@ Advanced arguments :
 Dictionary builder :
 --train ## : create a dictionary from a training set of files
---train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args
+--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args
 --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9)
 -o file : `file` is dictionary name (default: dictionary)
 --maxdict=# : limit dictionary to specified size (default: 112640)

View File

@@ -323,7 +323,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                                                        srcBuffer, sampleSizes, fs.nbSamples,
                                                        coverParams);
         if (!ZDICT_isError(dictSize)) {
-            DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
+            unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
+            DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage);
         }
     } else {
         dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,

View File

@@ -217,8 +217,9 @@ Split input files in blocks of size # (default: no split)
 A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\.
 .
 .TP
-\fB\-\-train\-cover[=k#,d=#,steps=#]\fR
-Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\.
+\fB\-\-train\-cover[=k#,d=#,steps=#,split=#]\fR
+Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or \fIsplit\fR <= 0, then the default value of 100 is used\. If \fIsplit\fR is 100, all input samples are used for both training and testing
+to find optimal _d_ and _k_ to build dictionary.Requires that \fId\fR <= \fIk\fR\.
 .
 .IP
 Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\.

View File

@@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
 This compares favorably to 4 bytes default.
 However, it's up to the dictionary manager to not assign twice the same ID to
 2 different dictionaries.
-* `--train-cover[=k#,d=#,steps=#]`:
+* `--train-cover[=k#,d=#,steps=#,split=#]`:
   Select parameters for the default dictionary builder algorithm named cover.
   If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
   If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
   If _steps_ is not specified, then the default value of 40 is used.
+  If _split_ is not specified or split <= 0, then the default value of 100 is used.
   Requires that _d_ <= _k_.
   Selects segments of size _k_ with highest score to put in the dictionary.
@@ -237,6 +238,8 @@ Compression of small files similar to the sample set will be greatly improved.
   algorithm will run faster with d <= _8_.
   Good values for _k_ vary widely based on the input data, but a safe range is
   [2 * _d_, 2000].
+  If _split_ is 100, all input samples are used for both training and testing
+  to find optimal _d_ and _k_ to build dictionary.
   Supports multithreading if `zstd` is compiled with threading support.
   Examples:
@@ -249,6 +252,8 @@ Compression of small files similar to the sample set will be greatly improved.
   `zstd --train-cover=k=50 FILEs`
+  `zstd --train-cover=k=50,split=60 FILEs`
 * `--train-legacy[=selectivity=#]`:
   Use legacy dictionary builder algorithm with the given dictionary
   _selectivity_ (default: 9).

View File

@@ -84,6 +84,7 @@ static U32 g_ldmMinMatch = 0;
 static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT;
 static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT;
+#define DEFAULT_SPLITPOINT 1.0
 /*-************************************
 * Display Macros
@@ -170,7 +171,7 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Dictionary builder : \n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
-    DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
     DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
     DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@@ -282,10 +283,15 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
         if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+          unsigned splitPercentage = readU32FromChar(&stringPtr);
+          params->splitPoint = (double)splitPercentage / 100.0;
+          if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
         return 0;
     }
     if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100));
     return 1;
 }
@@ -310,6 +316,7 @@ static ZDICT_cover_params_t defaultCoverParams(void)
     memset(&params, 0, sizeof(params));
     params.d = 8;
     params.steps = 4;
+    params.splitPoint = DEFAULT_SPLITPOINT;
     return params;
 }
 #endif

View File

@@ -412,7 +412,7 @@ $ECHO "\n===> cover dictionary builder : advanced options "
 TESTFILE=../programs/zstdcli.c
 ./datagen > tmpDict
 $ECHO "- Create first dictionary"
-$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c -o tmpDict
+$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c -o tmpDict
 cp $TESTFILE tmp
 $ZSTD -f tmp -D tmpDict
 $ZSTD -d tmp.zst -D tmpDict -fo result
@@ -426,6 +426,11 @@ cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
 $ECHO "- Create dictionary with size limit"
 $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K
 rm tmp*
+$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples"
+$ZSTD --train-cover=split=90 -r *.c ../programs/*.c
+$ZSTD --train-cover=split=80 -r *.c ../programs/*.c
+$ECHO "- Create dictionary using all samples for both training and testing"
+$ZSTD --train-cover=split=100 -r *.c ../programs/*.c
 $ECHO "\n===> legacy dictionary builder "