1
0
mirror of https://github.com/facebook/zstd.git synced 2025-07-30 22:23:13 +03:00

Simplify COVER parameters

This commit is contained in:
Nick Terrell
2017-01-02 12:40:43 -08:00
parent cbb3ce376b
commit 3a1fefcf00
5 changed files with 172 additions and 199 deletions

View File

@ -361,137 +361,103 @@ typedef struct {
* Segments are scored according to the function: * Segments are scored according to the function:
* *
* Let F(d) be the frequency of dmer d. * Let F(d) be the frequency of dmer d.
* Let L(S) be the length of segment S. * Let S_i be the dmer at position i of segment S which has length k.
* Let S_i be the dmer at position i of segment S.
* *
* F(S_1) + F(S_2) + ... + F(S_{L(S)-d+1}) * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
* Score(S) = --------------------------------------
* smoothing + L(S)
* *
* We try kStep segment lengths in the range [kMin, kMax]. * Once the dmer d is in the dictionary we set F(d) = 0.
* For each segment length we find the best segment according to Score.
* We then take the best segment overall according to Score and return it.
*
* The difference from the paper is that we try multiple segment lengths.
* We want to fit the segment length closer to the length of the useful part.
* Longer segments allow longer matches, so they are worth more than shorter
* ones. However, if the extra length isn't high frequency it hurts us.
* We add the smoothing in to give an advantage to longer segments.
* The larger smoothing is, the more longer matches are favored.
*/ */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs, static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
COVER_map_t *activeDmers, U32 begin, COVER_map_t *activeDmers, U32 begin,
U32 end, COVER_params_t parameters) { U32 end, COVER_params_t parameters) {
/* Saves the best segment of any length tried */ /* Constants */
COVER_segment_t globalBestSegment = {0, 0, 0}; const U32 k = parameters.k;
/* For each segment length */ const U32 d = parameters.d;
U32 k; const U32 dmersInK = k - d + 1;
U32 step = MAX((parameters.kMax - parameters.kMin) / parameters.kStep, 1); /* Try each segment (activeSegment) and save the best (bestSegment) */
for (k = parameters.kMin; k <= parameters.kMax; k += step) { COVER_segment_t bestSegment = {0, 0, 0};
/* Save the best segment of this length */ COVER_segment_t activeSegment;
COVER_segment_t bestSegment = {0, 0, 0}; /* Reset the activeDmers in the segment */
COVER_segment_t activeSegment; COVER_map_clear(activeDmers);
const size_t dmersInK = k - ctx->d + 1; /* The activeSegment starts at the beginning of the epoch. */
/* Reset the activeDmers in the segment */ activeSegment.begin = begin;
COVER_map_clear(activeDmers); activeSegment.end = begin;
activeSegment.begin = begin; activeSegment.score = 0;
activeSegment.end = begin; /* Slide the activeSegment through the whole epoch.
activeSegment.score = 0; * Save the best segment in bestSegment.
/* Slide the active segment through the whole epoch. */
* Save the best segment in bestSegment. while (activeSegment.end < end) {
*/ /* The dmerId for the dmer at the next position */
while (activeSegment.end < end) { U32 newDmer = ctx->dmerAt[activeSegment.end];
/* The dmerId for the dmer at the next position */ /* The entry in activeDmers for this dmerId */
U32 newDmer = ctx->dmerAt[activeSegment.end]; U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
/* The entry in activeDmers for this dmerId */ /* If the dmer isn't already present in the segment add its score. */
U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer); if (*newDmerOcc == 0) {
/* If the dmer isn't already present in the segment add its score. */ /* The paper suggests using the L-0.5 norm, but experiments show that it
if (*newDmerOcc == 0) { * doesn't help.
/* The paper suggests using the L-0.5 norm, but experiments show that it */
* doesn't help. activeSegment.score += freqs[newDmer];
*/ }
activeSegment.score += freqs[newDmer]; /* Add the dmer to the segment */
} activeSegment.end += 1;
/* Add the dmer to the segment */ *newDmerOcc += 1;
activeSegment.end += 1;
*newDmerOcc += 1;
/* If the window is now too large, drop the first position */ /* If the window is now too large, drop the first position */
if (activeSegment.end - activeSegment.begin == dmersInK + 1) { if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
U32 delDmer = ctx->dmerAt[activeSegment.begin]; U32 delDmer = ctx->dmerAt[activeSegment.begin];
U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer); U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
activeSegment.begin += 1; activeSegment.begin += 1;
*delDmerOcc -= 1; *delDmerOcc -= 1;
/* If this is the last occurrence of the dmer, subtract its score */ /* If this is the last occurrence of the dmer, subtract its score */
if (*delDmerOcc == 0) { if (*delDmerOcc == 0) {
COVER_map_remove(activeDmers, delDmer); COVER_map_remove(activeDmers, delDmer);
activeSegment.score -= freqs[delDmer]; activeSegment.score -= freqs[delDmer];
}
}
/* If this segment is the best so far save it */
if (activeSegment.score > bestSegment.score) {
bestSegment = activeSegment;
} }
} }
{
/* Trim off the zero frequency head and tail from the segment. */ /* If this segment is the best so far save it */
U32 newBegin = bestSegment.end; if (activeSegment.score > bestSegment.score) {
U32 newEnd = bestSegment.begin; bestSegment = activeSegment;
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
U32 freq = freqs[ctx->dmerAt[pos]];
if (freq != 0) {
newBegin = MIN(newBegin, pos);
newEnd = pos + 1;
}
}
bestSegment.begin = newBegin;
bestSegment.end = newEnd;
/* Calculate the final score normalizing for segment length */
bestSegment.score /=
(parameters.smoothing + (bestSegment.end - bestSegment.begin));
}
/* If this segment is the best so far for any length save it */
if (bestSegment.score > globalBestSegment.score) {
globalBestSegment = bestSegment;
} }
} }
{
/* Trim off the zero frequency head and tail from the segment. */
U32 newBegin = bestSegment.end;
U32 newEnd = bestSegment.begin;
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
U32 freq = freqs[ctx->dmerAt[pos]];
if (freq != 0) {
newBegin = MIN(newBegin, pos);
newEnd = pos + 1;
}
}
bestSegment.begin = newBegin;
bestSegment.end = newEnd;
}
{ {
/* Zero out the frequency of each dmer covered by the chosen segment. */ /* Zero out the frequency of each dmer covered by the chosen segment. */
size_t pos; U32 pos;
for (pos = globalBestSegment.begin; pos != globalBestSegment.end; ++pos) { for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
freqs[ctx->dmerAt[pos]] = 0; freqs[ctx->dmerAt[pos]] = 0;
} }
} }
return globalBestSegment; return bestSegment;
} }
/** /**
* Check the validity of the parameters. * Check the validity of the parameters.
* If the parameters are valid and any are default, set them to the correct * Returns non-zero if the parameters are valid and 0 otherwise.
* values.
* Returns 1 on success, 0 on failure.
*/ */
static int COVER_defaultParameters(COVER_params_t *parameters) { static int COVER_checkParameters(COVER_params_t parameters) {
/* kMin and d are required parameters */ /* k and d are required parameters */
if (parameters->d == 0 || parameters->kMin == 0) { if (parameters.d == 0 || parameters.k == 0) {
return 0; return 0;
} }
/* d <= kMin */ /* d <= k */
if (parameters->d > parameters->kMin) { if (parameters.d > parameters.k) {
return 0; return 0;
} }
/* If kMax is set (non-zero) then kMin <= kMax */
if (parameters->kMax != 0 && parameters->kMax < parameters->kMin) {
return 0;
}
/* If kMax is set, then kStep must be as well */
if (parameters->kMax != 0 && parameters->kStep == 0) {
return 0;
}
parameters->kMax = MAX(parameters->kMin, parameters->kMax);
parameters->kStep = MAX(1, parameters->kStep);
return 1; return 1;
} }
@ -607,17 +573,22 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
/* Divide the data up into epochs of equal size. /* Divide the data up into epochs of equal size.
* We will select at least one segment from each epoch. * We will select at least one segment from each epoch.
*/ */
const U32 epochs = (U32)(dictBufferCapacity / parameters.kMax); const U32 epochs = (U32)(dictBufferCapacity / parameters.k);
const U32 epochSize = (U32)(ctx->suffixSize / epochs); const U32 epochSize = (U32)(ctx->suffixSize / epochs);
size_t epoch; size_t epoch;
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs,
epochSize); epochSize);
/* Loop through the epochs until there are no more segments or the dictionary
* is full.
*/
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
const U32 epochBegin = (U32)(epoch * epochSize); const U32 epochBegin = (U32)(epoch * epochSize);
const U32 epochEnd = epochBegin + epochSize; const U32 epochEnd = epochBegin + epochSize;
size_t segmentSize; size_t segmentSize;
/* Select a segment */
COVER_segment_t segment = COVER_selectSegment( COVER_segment_t segment = COVER_selectSegment(
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters); ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
/* Trim the segment if necessary and if it is empty then we are done */
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
if (segmentSize == 0) { if (segmentSize == 0) {
break; break;
@ -661,9 +632,8 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
BYTE *const dict = (BYTE *)dictBuffer; BYTE *const dict = (BYTE *)dictBuffer;
COVER_ctx_t ctx; COVER_ctx_t ctx;
COVER_map_t activeDmers; COVER_map_t activeDmers;
size_t rc;
/* Checks */ /* Checks */
if (!COVER_defaultParameters(&parameters)) { if (!COVER_checkParameters(parameters)) {
DISPLAYLEVEL(1, "Cover parameters incorrect\n"); DISPLAYLEVEL(1, "Cover parameters incorrect\n");
return ERROR(GENERIC); return ERROR(GENERIC);
} }
@ -672,6 +642,8 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
return ERROR(GENERIC); return ERROR(GENERIC);
} }
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall); return ERROR(dstSize_tooSmall);
} }
/* Initialize global data */ /* Initialize global data */
@ -682,7 +654,7 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
DISPLAYLEVEL(1, "Failed to initialize context\n"); DISPLAYLEVEL(1, "Failed to initialize context\n");
return ERROR(GENERIC); return ERROR(GENERIC);
} }
if (!COVER_map_init(&activeDmers, parameters.kMax - parameters.d + 1)) { if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
COVER_ctx_destroy(&ctx); COVER_ctx_destroy(&ctx);
return ERROR(GENERIC); return ERROR(GENERIC);
@ -694,18 +666,17 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer, COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
dictBufferCapacity, parameters); dictBufferCapacity, parameters);
ZDICT_params_t zdictParams = COVER_translateParams(parameters); ZDICT_params_t zdictParams = COVER_translateParams(parameters);
DISPLAYLEVEL(2, "Dictionary content size: %u", const size_t dictionarySize = ZDICT_finalizeDictionary(
(U32)(dictBufferCapacity - tail)); dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
rc = ZDICT_finalizeDictionary(dict, dictBufferCapacity, dict + tail, samplesBuffer, samplesSizes, nbSamples, zdictParams);
dictBufferCapacity - tail, samplesBuffer, if (!ZSTD_isError(dictionarySize)) {
samplesSizes, nbSamples, zdictParams); DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
(U32)dictionarySize);
}
COVER_ctx_destroy(&ctx);
COVER_map_destroy(&activeDmers);
return dictionarySize;
} }
if (!ZSTD_isError(rc)) {
DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)rc);
}
COVER_ctx_destroy(&ctx);
COVER_map_destroy(&activeDmers);
return rc;
} }
/** /**
@ -713,7 +684,8 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
* 1. Synchronizing threads. * 1. Synchronizing threads.
* 2. Saving the best parameters and dictionary. * 2. Saving the best parameters and dictionary.
* *
* All of the methods are thread safe if `ZSTD_PTHREAD` is defined. * All of the methods except COVER_best_init() are thread safe if zstd is
* compiled with multithreaded support.
*/ */
typedef struct COVER_best_s { typedef struct COVER_best_s {
#ifdef ZSTD_PTHREAD #ifdef ZSTD_PTHREAD
@ -852,26 +824,26 @@ typedef struct COVER_tryParameters_data_s {
/** /**
* Tries a set of parameters and updates the COVER_best_t with the results. * Tries a set of parameters and updates the COVER_best_t with the results.
* This function is thread safe if ZSTD_PTHREAD is defined. * This function is thread safe if zstd is compiled with multithreaded support.
* It takes its parameters as an *OWNING* opaque pointer to support threading. * It takes its parameters as an *OWNING* opaque pointer to support threading.
*/ */
static void COVER_tryParameters(void *opaque) { static void COVER_tryParameters(void *opaque) {
/* Save parameters as local variables */ /* Save parameters as local variables */
COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)opaque; COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
const COVER_ctx_t *ctx = data->ctx; const COVER_ctx_t *const ctx = data->ctx;
COVER_params_t parameters = data->parameters; const COVER_params_t parameters = data->parameters;
size_t dictBufferCapacity = data->dictBufferCapacity; size_t dictBufferCapacity = data->dictBufferCapacity;
size_t totalCompressedSize = ERROR(GENERIC); size_t totalCompressedSize = ERROR(GENERIC);
/* Allocate space for hash table, dict, and freqs */ /* Allocate space for hash table, dict, and freqs */
COVER_map_t activeDmers; COVER_map_t activeDmers;
BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
if (!COVER_map_init(&activeDmers, parameters.kMax - parameters.d + 1)) { if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
goto _cleanup; goto _cleanup;
} }
if (!dict || !freqs) { if (!dict || !freqs) {
DISPLAYLEVEL(1, "Failed to allocate dictionary buffer\n"); DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
goto _cleanup; goto _cleanup;
} }
/* Copy the frequencies because we need to modify them */ /* Copy the frequencies because we need to modify them */
@ -880,7 +852,7 @@ static void COVER_tryParameters(void *opaque) {
{ {
const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
dictBufferCapacity, parameters); dictBufferCapacity, parameters);
ZDICT_params_t zdictParams = COVER_translateParams(parameters); const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
dictBufferCapacity = ZDICT_finalizeDictionary( dictBufferCapacity = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams); ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
@ -954,27 +926,42 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
unsigned nbSamples, unsigned nbSamples,
COVER_params_t *parameters) { COVER_params_t *parameters) {
/* constants */ /* constants */
const unsigned dMin = parameters->d == 0 ? 6 : parameters->d; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
const unsigned dMax = parameters->d == 0 ? 16 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
const unsigned min = parameters->kMin == 0 ? 32 : parameters->kMin; const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
const unsigned max = parameters->kMax == 0 ? 1024 : parameters->kMax; const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
const unsigned kStep = parameters->kStep == 0 ? 8 : parameters->kStep; const unsigned kSteps = parameters->steps == 0 ? 256 : parameters->steps;
const unsigned step = MAX((max - min) / kStep, 1); const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
/* Local variables */ /* Local variables */
unsigned iteration = 1;
const unsigned iterations =
(1 + (dMax - dMin) / 2) * (((1 + kStep) * (2 + kStep)) / 2) * 4;
const int displayLevel = parameters->notificationLevel; const int displayLevel = parameters->notificationLevel;
unsigned iteration = 1;
unsigned d; unsigned d;
unsigned k;
COVER_best_t best; COVER_best_t best;
/* Checks */
if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
return ERROR(GENERIC);
}
if (nbSamples == 0) {
DISPLAYLEVEL(1, "Cover must have at least one input file\n");
return ERROR(GENERIC);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall);
}
/* Initialization */
COVER_best_init(&best); COVER_best_init(&best);
/* Turn down display level to clean up display at level 2 and below */ /* Turn down global display level to clean up display at level 2 and below */
g_displayLevel = parameters->notificationLevel - 1; g_displayLevel = parameters->notificationLevel - 1;
/* Loop through d first because each new value needs a new context */ /* Loop through d first because each new value needs a new context */
LOCALDISPLAYLEVEL(displayLevel, 3, "Trying %u different sets of parameters\n", LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
iterations); kIterations);
for (d = dMin; d <= dMax; d += 2) { for (d = kMinD; d <= kMaxD; d += 2) {
unsigned kMin;
/* Initialize the context for this value of d */ /* Initialize the context for this value of d */
COVER_ctx_t ctx; COVER_ctx_t ctx;
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
@ -983,44 +970,37 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
COVER_best_destroy(&best); COVER_best_destroy(&best);
return ERROR(GENERIC); return ERROR(GENERIC);
} }
/* Loop through the rest of the parameters reusing the same context */ /* Loop through k reusing the same context */
for (kMin = min; kMin <= max; kMin += step) { for (k = kMinK; k <= kMaxK; k += kStepSize) {
unsigned kMax; /* Prepare the arguments */
LOCALDISPLAYLEVEL(displayLevel, 3, "kMin=%u\n", kMin); COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
for (kMax = kMin; kMax <= max; kMax += step) { sizeof(COVER_tryParameters_data_t));
unsigned smoothing; LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
LOCALDISPLAYLEVEL(displayLevel, 3, "kMax=%u\n", kMax); if (!data) {
for (smoothing = kMin / 4; smoothing <= kMin * 2; smoothing *= 2) { LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
/* Prepare the arguments */ COVER_best_destroy(&best);
COVER_tryParameters_data_t *data = COVER_ctx_destroy(&ctx);
(COVER_tryParameters_data_t *)malloc( return ERROR(GENERIC);
sizeof(COVER_tryParameters_data_t));
LOCALDISPLAYLEVEL(displayLevel, 3, "smoothing=%u\n", smoothing);
if (!data) {
LOCALDISPLAYLEVEL(displayLevel, 1,
"Failed to allocate parameters\n");
COVER_best_destroy(&best);
COVER_ctx_destroy(&ctx);
return ERROR(GENERIC);
}
data->ctx = &ctx;
data->best = &best;
data->dictBufferCapacity = dictBufferCapacity;
data->parameters = *parameters;
data->parameters.d = d;
data->parameters.kMin = kMin;
data->parameters.kStep = kStep;
data->parameters.kMax = kMax;
data->parameters.smoothing = smoothing;
/* Call the function and pass ownership of data to it */
COVER_best_start(&best);
COVER_tryParameters(data);
/* Print status */
LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
(U32)((iteration * 100) / iterations));
++iteration;
}
} }
data->ctx = &ctx;
data->best = &best;
data->dictBufferCapacity = dictBufferCapacity;
data->parameters = *parameters;
data->parameters.k = k;
data->parameters.d = d;
data->parameters.steps = kSteps;
/* Check the parameters */
if (!COVER_checkParameters(data->parameters)) {
DISPLAYLEVEL(1, "Cover parameters incorrect\n");
continue;
}
/* Call the function and pass ownership of data to it */
COVER_best_start(&best);
COVER_tryParameters(data);
/* Print status */
LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
(U32)((iteration * 100) / kIterations));
++iteration;
} }
COVER_best_wait(&best); COVER_best_wait(&best);
COVER_ctx_destroy(&ctx); COVER_ctx_destroy(&ctx);

View File

@ -91,11 +91,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict
kMin and d are the only required parameters. k and d are the only required parameters.
*/ */
typedef struct { typedef struct {
unsigned d; /* dmer size : constraint: <= kMin : Should probably be in the range [6, 16]. */ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned kMin; /* Minimum segment size : constraint: > 0 */ unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned kStep; /* Try kStep segment lengths uniformly distributed in the range [kMin, kMax] : 0 (default) only if kMax == 0 */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (256) : Higher means more parameters checked */
unsigned kMax; /* Maximum segment size : 0 = kMin (default) : constraint : 0 or >= kMin */
unsigned smoothing; /* Higher smoothing => larger segments are selected. Only useful if kMax > kMin. */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */
@ -125,11 +123,10 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
`*parameters` is filled with the best parameters found, and the dictionary `*parameters` is filled with the best parameters found, and the dictionary
constructed with those parameters is stored in `dictBuffer`. constructed with those parameters is stored in `dictBuffer`.
All of the {d, kMin, kStep, kMax} are optional, and smoothing is ignored. All of the parameters d, k, steps are optional.
If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
If kStep is non-zero then it is used, otherwise we pick 8. If steps is zero then the default value (256) is used.
If kMin and kMax are non-zero, then they limit the search space for kMin and kMax, If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
otherwise we check kMin and kMax values in the range [32, 1024].
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
or an error code, which can be tested with ZDICT_isError(). or an error code, which can be tested with ZDICT_isError().

View File

@ -279,9 +279,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles, dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
coverParams); coverParams);
if (!ZDICT_isError(dictSize)) { if (!ZDICT_isError(dictSize)) {
DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
coverParams->smoothing, coverParams->kMin,
coverParams->kStep, coverParams->kMax, coverParams->d);
} }
} else { } else {
dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize, dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,

View File

@ -205,15 +205,13 @@ static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *para
{ {
memset(params, 0, sizeof(*params)); memset(params, 0, sizeof(*params));
for (; ;) { for (; ;) {
if (longCommandWArg(&stringPtr, "smoothing=")) { params->smoothing = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "k=") || longCommandWArg(&stringPtr, "kMin=") || longCommandWArg(&stringPtr, "kmin=")) { params->kMin = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "kStep=") || longCommandWArg(&stringPtr, "kstep=")) { params->kStep = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "kMax=") || longCommandWArg(&stringPtr, "kmax=")) { params->kMax = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
return 0; return 0;
} }
if (stringPtr[0] != 0) return 0; if (stringPtr[0] != 0) return 0;
DISPLAYLEVEL(4, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", params->smoothing, params->kMin, params->kStep, params->kMax, params->d); DISPLAYLEVEL(4, "k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
return 1; return 1;
} }
#endif #endif

View File

@ -266,13 +266,13 @@ $ZSTD -f tmp -D tmpDict
$ZSTD -d tmp.zst -D tmpDict -fo result $ZSTD -d tmp.zst -D tmpDict -fo result
$DIFF $TESTFILE result $DIFF $TESTFILE result
$ECHO "- Create second (different) dictionary" $ECHO "- Create second (different) dictionary"
$ZSTD --train --cover=kmin=46,kstep=2,kmax=64,d=6,smoothing=23 *.c ../programs/*.c ../programs/*.h -o tmpDictC $ZSTD --train --cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC
$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!" $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
$ECHO "- Create dictionary with short dictID" $ECHO "- Create dictionary with short dictID"
$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c --dictID 1 -o tmpDict1 $ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c --dictID 1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
$ECHO "- Create dictionary with size limit" $ECHO "- Create dictionary with size limit"
$ZSTD --train --optimize-cover=kstep=2,d=8 *.c ../programs/*.c -o tmpDict2 --maxdict 4K $ZSTD --train --optimize-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict 4K
rm tmp* rm tmp*