From 0ef06f2e8aa814ae0de80543cae3398aabf1d358 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 12:33:34 -0700 Subject: [PATCH 01/19] Split samples into train and test sets --- lib/dictBuilder/cover.c | 69 ++++++++++++++++++++++++++++++++--------- lib/dictBuilder/zdict.h | 1 + 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 448f71372..c03ae9a11 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -39,6 +39,7 @@ * Constants ***************************************/ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define DEFAULT_SPLITPOINT 0.8 /*-************************************* * Console display @@ -203,6 +204,8 @@ typedef struct { size_t *offsets; const size_t *samplesSizes; size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; U32 *suffix; size_t suffixSize; U32 *freqs; @@ -220,10 +223,10 @@ static COVER_ctx_t *g_ctx = NULL; /** * Returns the sum of the sample sizes. 
*/ -static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { +static size_t COVER_sum(const size_t *samplesSizes, unsigned firstSample, unsigned lastSample) { size_t sum = 0; size_t i; - for (i = 0; i < nbSamples; ++i) { + for (i = firstSample; i < lastSample; ++i) { sum += samplesSizes[i]; } return sum; @@ -494,6 +497,10 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters, if (parameters.d > parameters.k) { return 0; } + /* 0 < splitPoint < 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){ + return 0; + } return 1; } @@ -531,9 +538,10 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) { */ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, - unsigned d) { + unsigned d, double splitPoint) { const BYTE *const samples = (const BYTE *)samplesBuffer; - const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); + const unsigned first = 0; + const size_t totalSamplesSize = COVER_sum(samplesSizes, first, nbSamples); /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { @@ -541,15 +549,38 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = nbSamples * splitPoint; + const unsigned nbTestSamples = nbSamples - nbTrainSamples; + /* Check if there's training sample */ + if (nbTrainSamples < 1) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); + DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); + DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); + return 0; + } + /* Check if there's testing sample when splitPoint is nonzero */ + if (nbTestSamples < 1 && splitPoint != 1.0) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); 
+ DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); + DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); + return 0; + } + const size_t trainingSamplesSize = COVER_sum(samplesSizes, first, nbTrainSamples); + const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); /* Zero the context */ memset(ctx, 0, sizeof(*ctx)); - DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbSamples, - (U32)totalSamplesSize); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (U32)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (U32)testSamplesSize); ctx->samples = samples; ctx->samplesSizes = samplesSizes; ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; /* Partial suffix array */ - ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1; + ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1; ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); /* Maps index to the dmerID */ ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); @@ -563,7 +594,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, ctx->freqs = NULL; ctx->d = d; - /* Fill offsets from the samlesSizes */ + /* Fill offsets from the samplesSizes */ { U32 i; ctx->offsets[0] = 0; @@ -665,7 +696,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( BYTE* const dict = (BYTE*)dictBuffer; COVER_ctx_t ctx; COVER_map_t activeDmers; - + parameters.splitPoint = 1.0; /* Initialize global data */ g_displayLevel = parameters.zParams.notificationLevel; /* Checks */ @@ -683,8 +714,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( return ERROR(dstSize_tooSmall); } /* Initialize context and activeDmers */ + const double all = 1.0; if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, - parameters.d)) { + parameters.d, all)) { return ERROR(GENERIC); } if (!COVER_map_init(&activeDmers, parameters.k 
- parameters.d + 1)) { @@ -839,7 +871,7 @@ typedef struct COVER_tryParameters_data_s { } COVER_tryParameters_data_t; /** - * Tries a set of parameters and upates the COVER_best_t with the results. + * Tries a set of parameters and updates the COVER_best_t with the results. * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ @@ -870,7 +902,7 @@ static void COVER_tryParameters(void *opaque) { dictBufferCapacity, parameters); dictBufferCapacity = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, - ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, parameters.zParams); if (ZDICT_isError(dictBufferCapacity)) { DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); @@ -889,7 +921,7 @@ static void COVER_tryParameters(void *opaque) { /* Allocate dst with enough space to compress the maximum sized sample */ { size_t maxSampleSize = 0; - for (i = 0; i < ctx->nbSamples; ++i) { + for (i = ctx->nbTrainSamples; i < ctx->nbSamples; ++i) { maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); } dstCapacity = ZSTD_compressBound(maxSampleSize); @@ -904,7 +936,7 @@ static void COVER_tryParameters(void *opaque) { } /* Compress each sample and sum their sizes (or error) */ totalCompressedSize = dictBufferCapacity; - for (i = 0; i < ctx->nbSamples; ++i) { + for (i = ctx->nbTrainSamples; i < ctx->nbSamples; ++i) { const size_t size = ZSTD_compress_usingCDict( cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], ctx->samplesSizes[i], cdict); @@ -941,6 +973,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( ZDICT_cover_params_t *parameters) { /* constants */ const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint == 0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 
6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; @@ -958,6 +992,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( POOL_ctx *pool = NULL; /* Checks */ + if (splitPoint <= 0 || splitPoint >= 1) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); + return ERROR(GENERIC); + } if (kMinK < kMaxD || kMaxK < kMinK) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); return ERROR(GENERIC); @@ -988,7 +1026,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* Initialize the context for this value of d */ COVER_ctx_t ctx; LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); - if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d)) { + if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) { LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); COVER_best_destroy(&best); POOL_free(pool); @@ -1013,6 +1051,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( data->parameters = *parameters; data->parameters.k = k; data->parameters.d = d; + data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; data->parameters.zParams.notificationLevel = g_displayLevel; /* Check the parameters */ diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index ad459c2d7..45d78b05f 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -86,6 +86,7 @@ typedef struct { unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be 
used to training */ ZDICT_params_t zParams; } ZDICT_cover_params_t; From 59797d3328aab09caf5b56e2a02640c7fae2b87b Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 12:47:03 -0700 Subject: [PATCH 02/19] Fix splitPoint floating point comparison problem --- lib/dictBuilder/cover.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index c03ae9a11..dbb90c1f0 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -714,9 +714,8 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( return ERROR(dstSize_tooSmall); } /* Initialize context and activeDmers */ - const double all = 1.0; if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, - parameters.d, all)) { + parameters.d, parameters.splitPoint)) { return ERROR(GENERIC); } if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { @@ -974,7 +973,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint == 0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint == 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 
50 : parameters->k; From e061d840167f6f21f9ad4ea9c9fe3185eeb32a36 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 15:38:08 -0700 Subject: [PATCH 03/19] Another fix to comparator --- lib/dictBuilder/cover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index dbb90c1f0..4dead220a 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -560,7 +560,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, return 0; } /* Check if there's testing sample when splitPoint is nonzero */ - if (nbTestSamples < 1 && splitPoint != 1.0) { + if (nbTestSamples < 1 && splitPoint < 1.0) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); @@ -973,7 +973,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint == 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 
50 : parameters->k; From f9d19b83fb92f0d99433604253cb4a198f2b90d3 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 15:46:56 -0700 Subject: [PATCH 04/19] Fix variable declaration problem --- lib/dictBuilder/cover.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 4dead220a..997036ecf 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -542,6 +542,11 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const BYTE *const samples = (const BYTE *)samplesBuffer; const unsigned first = 0; const size_t totalSamplesSize = COVER_sum(samplesSizes, first, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = nbSamples * splitPoint; + const unsigned nbTestSamples = nbSamples - nbTrainSamples; + const size_t trainingSamplesSize = COVER_sum(samplesSizes, first, nbTrainSamples); + const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { @@ -549,9 +554,6 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } - /* Split samples into testing and training sets */ - const unsigned nbTrainSamples = nbSamples * splitPoint; - const unsigned nbTestSamples = nbSamples - nbTrainSamples; /* Check if there's training sample */ if (nbTrainSamples < 1) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); @@ -566,8 +568,6 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); return 0; } - const size_t trainingSamplesSize = COVER_sum(samplesSizes, first, nbTrainSamples); - const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); /* Zero the context */ 
memset(ctx, 0, sizeof(*ctx)); DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, From 52fbbbcb6b906a6acd00ece3dfeed0a480d28751 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 16:17:20 -0700 Subject: [PATCH 05/19] Explicitly cast double to unsigned --- lib/dictBuilder/cover.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 997036ecf..53f3d79a8 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -543,7 +543,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const unsigned first = 0; const size_t totalSamplesSize = COVER_sum(samplesSizes, first, nbSamples); /* Split samples into testing and training sets */ - const unsigned nbTrainSamples = nbSamples * splitPoint; + double tmp = (double)nbSamples * splitPoint; + const unsigned nbTrainSamples = (unsigned)tmp; const unsigned nbTestSamples = nbSamples - nbTrainSamples; const size_t trainingSamplesSize = COVER_sum(samplesSizes, first, nbTrainSamples); const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); From 348e5f77a95922f4bf2232df1bd220ce665cc369 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 17:54:41 -0700 Subject: [PATCH 06/19] Add split=# to cli --- lib/dictBuilder/cover.c | 8 ++++---- programs/zstd.1.md | 5 ++++- programs/zstdcli.c | 8 +++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 53f3d79a8..a3195aa77 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, /* Check if there's training sample */ if (nbTrainSamples < 1) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); - DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); - DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); 
return 0; } /* Check if there's testing sample when splitPoint is nonzero */ if (nbTestSamples < 1 && splitPoint < 1.0) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); - DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100)); - DISPLAYLEVEL(1, "nbSamples is %u", nbSamples); + return 0; + } + if (nbTrainSamples + nbTestSamples != nbSamples) { + DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples"); return 0; } /* Zero the context */ diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 4b3818141..c45bdb386 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved. This compares favorably to 4 bytes default. However, it's up to the dictionary manager to not assign twice the same ID to 2 different dictionaries. -* `--train-cover[=k#,d=#,steps=#]`: +* `--train-cover[=k#,d=#,steps=#,split=#]`: Select parameters for the default dictionary builder algorithm named cover. If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. If _k_ is not specified, then it tries _steps_ values in the range [50, 2000]. If _steps_ is not specified, then the default value of 40 is used. + If _split_ is not specified, then the default value of 80 is used. Requires that _d_ <= _k_. Selects segments of size _k_ with highest score to put in the dictionary. @@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved. `zstd --train-cover=k=50 FILEs` + `zstd --train-cover=k=50,split=60 FILEs` + * `--train-legacy[=selectivity=#]`: Use legacy dictionary builder algorithm with the given dictionary _selectivity_ (default: 9). 
diff --git a/programs/zstdcli.c b/programs/zstdcli.c index ae8c9cba9..68404d660 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params) { memset(params, 0, sizeof(*params)); + unsigned splitPercentage = 100; for (; ;) { if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } + if (longCommandWArg(&stringPtr, "split=")) { + splitPercentage = readU32FromChar(&stringPtr); + params->splitPoint = (double)splitPercentage / 100.0; + if (stringPtr[0]==',') { stringPtr++; continue; } else break; + } return 0; } if (stringPtr[0] != 0) return 0; - DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps); + DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%d\n", params->k, params->d, params->steps, splitPercentage); return 1; } From 84e8b2a3059eb6360cc2dc7aeda3e0961119ce15 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 29 Jun 2018 18:02:02 -0700 Subject: [PATCH 07/19] Fix another declaration issue --- programs/zstdcli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 68404d660..74dc607a3 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -277,8 +277,8 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) */ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params) { - memset(params, 0, sizeof(*params)); unsigned splitPercentage 
= 100; + memset(params, 0, sizeof(*params)); for (; ;) { if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } From 8afcb8eea76fb534950cbd2eab88847d6afb1442 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Sun, 1 Jul 2018 19:59:37 -0700 Subject: [PATCH 08/19] Update documentation --- programs/README.md | 2 +- programs/dibio.c | 3 ++- programs/zstd.1 | 4 ++-- programs/zstdcli.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/programs/README.md b/programs/README.md index a308fccf9..2833875e5 100644 --- a/programs/README.md +++ b/programs/README.md @@ -150,7 +150,7 @@ Advanced arguments : Dictionary builder : --train ## : create a dictionary from a training set of files ---train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args +--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args --train-legacy[=s=#] : use the legacy algorithm with selectivity (default: 9) -o file : `file` is dictionary name (default: dictionary) --maxdict=# : limit dictionary to specified size (default: 112640) diff --git a/programs/dibio.c b/programs/dibio.c index 112259ddc..5d1f6d6c4 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -323,7 +323,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, srcBuffer, sampleSizes, fs.nbSamples, coverParams); if (!ZDICT_isError(dictSize)) { - DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps); + unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100); + DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d, coverParams->steps, splitPercentage); } } else { dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer, diff --git 
a/programs/zstd.1 b/programs/zstd.1 index 507933c97..e1ebd297e 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -217,8 +217,8 @@ Split input files in blocks of size # (default: no split) A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\. . .TP -\fB\-\-train\-cover[=k#,d=#,steps=#]\fR -Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\. +\fB\-\-train\-cover[=k#,d=#,steps=#,split=#]\fR +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified, then the default value of 80 is used\. Requires that \fId\fR <= \fIk\fR\. . .IP Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. 
Supports multithreading if \fBzstd\fR is compiled with threading support\. diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 74dc607a3..28bed2309 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -170,7 +170,7 @@ static int usage_advanced(const char* programName) DISPLAY( "\n"); DISPLAY( "Dictionary builder : \n"); DISPLAY( "--train ## : create a dictionary from a training set of files \n"); - DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n"); + DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n"); DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel); DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName); DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize); From 1a14f8639c60146e5e9a4bcced3849c057ed4244 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 2 Jul 2018 11:37:04 -0700 Subject: [PATCH 09/19] Update COVER dictionary builder tests --- tests/playTests.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/playTests.sh b/tests/playTests.sh index 09a7377f2..985b12d2d 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -408,7 +408,7 @@ $ECHO "\n===> cover dictionary builder : advanced options " TESTFILE=../programs/zstdcli.c ./datagen > tmpDict $ECHO "- Create first dictionary" -$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c -o tmpDict +$ZSTD --train-cover=k=46,d=8,split=80 *.c ../programs/*.c -o tmpDict cp $TESTFILE tmp $ZSTD -f tmp -D tmpDict $ZSTD -d tmp.zst -D tmpDict -fo result @@ -422,6 +422,9 @@ cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" 
$ECHO "- Create dictionary with size limit" $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K rm tmp* +$ECHO "- Compare size of dictionary from 90% training samples with 80% training samples" +$ZSTD --train-cover=split=90 -r *.c ../programs/*.c +$ZSTD --train-cover=split=80 -r *.c ../programs/*.c $ECHO "\n===> legacy dictionary builder " From 16e75e88048874dfe1e21964ad7067b0eebf0ef4 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 3 Jul 2018 12:07:06 -0700 Subject: [PATCH 10/19] Update minimal training sample size --- lib/dictBuilder/cover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index a3195aa77..1fe8d89ee 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -555,8 +555,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); return 0; } - /* Check if there's training sample */ - if (nbTrainSamples < 1) { + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); return 0; } From 0881184c89917f1c21ffb53b972cb42b3fb2780a Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 3 Jul 2018 17:53:27 -0700 Subject: [PATCH 11/19] Some edits based on pull request comments --- lib/dictBuilder/cover.c | 13 ++++++------- programs/zstdcli.c | 7 ++++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 1fe8d89ee..7ac7eb1c3 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -225,7 +225,7 @@ static COVER_ctx_t *g_ctx = NULL; */ static size_t COVER_sum(const size_t *samplesSizes, unsigned firstSample, unsigned lastSample) { size_t sum = 0; - size_t i; + unsigned i; for (i = firstSample; i < lastSample; ++i) { sum += samplesSizes[i]; } @@ -540,13 +540,12 @@ static int 
COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, unsigned d, double splitPoint) { const BYTE *const samples = (const BYTE *)samplesBuffer; - const unsigned first = 0; - const size_t totalSamplesSize = COVER_sum(samplesSizes, first, nbSamples); + const unsigned kFirst = 0; + const size_t totalSamplesSize = COVER_sum(samplesSizes, kFirst, nbSamples); /* Split samples into testing and training sets */ - double tmp = (double)nbSamples * splitPoint; - const unsigned nbTrainSamples = (unsigned)tmp; + const unsigned nbTrainSamples = (unsigned)((double)nbSamples * splitPoint); const unsigned nbTestSamples = nbSamples - nbTrainSamples; - const size_t trainingSamplesSize = COVER_sum(samplesSizes, first, nbTrainSamples); + const size_t trainingSamplesSize = COVER_sum(samplesSizes, kFirst, nbTrainSamples); const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || @@ -560,7 +559,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); return 0; } - /* Check if there's testing sample when splitPoint is nonzero */ + /* Check if there's testing sample when splitPoint is not 1.0 */ if (nbTestSamples < 1 && splitPoint < 1.0) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); return 0; diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 28bed2309..5408d2a51 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -84,6 +84,7 @@ static U32 g_ldmMinMatch = 0; static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT; static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT; +#define DEFAULT_SPLITPOINT 0.8 /*-************************************ * Display Macros @@ -277,21 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) */ static unsigned 
parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params) { - unsigned splitPercentage = 100; memset(params, 0, sizeof(*params)); for (; ;) { if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; } if (longCommandWArg(&stringPtr, "split=")) { - splitPercentage = readU32FromChar(&stringPtr); + unsigned splitPercentage = readU32FromChar(&stringPtr); params->splitPoint = (double)splitPercentage / 100.0; if (stringPtr[0]==',') { stringPtr++; continue; } else break; } return 0; } if (stringPtr[0] != 0) return 0; - DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%d\n", params->k, params->d, params->steps, splitPercentage); + DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100)); return 1; } @@ -316,6 +316,7 @@ static ZDICT_cover_params_t defaultCoverParams(void) memset(&params, 0, sizeof(params)); params.d = 8; params.steps = 4; + params.splitPoint = DEFAULT_SPLITPOINT; return params; } #endif From a085d1aae1dbd841baa8ed927465e4d686ccc213 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 5 Jul 2018 10:38:45 -0700 Subject: [PATCH 12/19] Allow splitPoint==1.0 (using all samples for both training and testing) --- lib/dictBuilder/cover.c | 22 ++++++++++++---------- tests/playTests.sh | 2 ++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 7ac7eb1c3..2c19c0052 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -543,10 +543,10 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
const unsigned kFirst = 0; const size_t totalSamplesSize = COVER_sum(samplesSizes, kFirst, nbSamples); /* Split samples into testing and training sets */ - const unsigned nbTrainSamples = (unsigned)((double)nbSamples * splitPoint); - const unsigned nbTestSamples = nbSamples - nbTrainSamples; - const size_t trainingSamplesSize = COVER_sum(samplesSizes, kFirst, nbTrainSamples); - const size_t testSamplesSize = COVER_sum(samplesSizes, nbTrainSamples, nbSamples); + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, kFirst, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples, nbSamples) : totalSamplesSize; /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { @@ -559,12 +559,13 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); return 0; } - /* Check if there's testing sample when splitPoint is not 1.0 */ - if (nbTestSamples < 1 && splitPoint < 1.0) { + /* Check if there's testing sample */ + if (nbTestSamples < 1) { DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); return 0; } - if (nbTrainSamples + nbTestSamples != nbSamples) { + /* Check if nbTrainSamples plus nbTestSamples add up to nbSamples when splitPoint is less than 1*/ + if (nbTrainSamples + nbTestSamples != nbSamples && splitPoint < 1.0) { DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples"); return 0; } @@ -920,7 +921,8 @@ static void COVER_tryParameters(void *opaque) { /* Allocate dst with enough space to compress the maximum sized sample */ { size_t maxSampleSize = 0; - for 
(i = ctx->nbTrainSamples; i < ctx->nbSamples; ++i) { + i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); } dstCapacity = ZSTD_compressBound(maxSampleSize); @@ -973,7 +975,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + (parameters->splitPoint <= 0.0 || parameters->splitPoint > 1.0) ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; @@ -991,7 +993,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( POOL_ctx *pool = NULL; /* Checks */ - if (splitPoint <= 0 || splitPoint >= 1) { + if (splitPoint <= 0 || splitPoint > 1) { LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); return ERROR(GENERIC); } diff --git a/tests/playTests.sh b/tests/playTests.sh index 985b12d2d..3e1531375 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -425,6 +425,8 @@ rm tmp* $ECHO "- Compare size of dictionary from 90% training samples with 80% training samples" $ZSTD --train-cover=split=90 -r *.c ../programs/*.c $ZSTD --train-cover=split=80 -r *.c ../programs/*.c +$ECHO "- Create dictionary using all samples for both training and testing" +$ZSTD --train-cover=split=100 -r *.c ../programs/*.c $ECHO "\n===> legacy dictionary builder " From bfad1af0317a6994e3b46cc3111bf3796b8e82ac Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 5 Jul 2018 11:05:31 -0700 Subject: [PATCH 13/19] Update doc for split==100 --- programs/zstd.1 | 2 +- programs/zstd.1.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/zstd.1 b/programs/zstd.1 index e1ebd297e..b63ef4f2a 100644 --- 
a/programs/zstd.1 +++ b/programs/zstd.1 @@ -218,7 +218,7 @@ A dictionary ID is a locally unique ID that a decoder can use to verify it is us . .TP \fB\-\-train\-cover[=k#,d=#,steps=#,split=#]\fR -Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified, then the default value of 80 is used\. Requires that \fId\fR <= \fIk\fR\. +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or \fIsplit\fR <= 0 or \fIsplit\fR > 100, then the default value of 80 is used\. Requires that \fId\fR <= \fIk\fR\. . .IP Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\. diff --git a/programs/zstd.1.md b/programs/zstd.1.md index c45bdb386..47035f1c0 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -228,7 +228,7 @@ Compression of small files similar to the sample set will be greatly improved. If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. If _k_ is not specified, then it tries _steps_ values in the range [50, 2000]. 
If _steps_ is not specified, then the default value of 40 is used. - If _split_ is not specified, then the default value of 80 is used. + If _split_ is not specified or split <= 0 or split > 100, then the default value of 80 is used. Requires that _d_ <= _k_. Selects segments of size _k_ with highest score to put in the dictionary. From 0bbff012119464fe4facf9b94f7502286df5f56d Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 5 Jul 2018 22:40:32 -0700 Subject: [PATCH 14/19] Fix testing parameter --- lib/dictBuilder/cover.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 2c19c0052..5fd2c9c74 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -937,7 +937,8 @@ static void COVER_tryParameters(void *opaque) { } /* Compress each sample and sum their sizes (or error) */ totalCompressedSize = dictBufferCapacity; - for (i = ctx->nbTrainSamples; i < ctx->nbSamples; ++i) { + i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { const size_t size = ZSTD_compress_usingCDict( cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], ctx->samplesSizes[i], cdict); From 015a00af0f5f61c5093d990fe9cf7ec3d78c8839 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 6 Jul 2018 14:24:18 -0700 Subject: [PATCH 15/19] Change cover_sum back to 2 parameters and fix splitPoint issues --- lib/dictBuilder/cover.c | 18 ++++++------------ lib/dictBuilder/zdict.h | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 5fd2c9c74..176c386c4 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -223,10 +223,10 @@ static COVER_ctx_t *g_ctx = NULL; /** * Returns the sum of the sample sizes. 
*/ -static size_t COVER_sum(const size_t *samplesSizes, unsigned firstSample, unsigned lastSample) { +static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { size_t sum = 0; unsigned i; - for (i = firstSample; i < lastSample; ++i) { + for (i = 0; i < nbSamples; ++i) { sum += samplesSizes[i]; } return sum; @@ -540,13 +540,12 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, unsigned d, double splitPoint) { const BYTE *const samples = (const BYTE *)samplesBuffer; - const unsigned kFirst = 0; - const size_t totalSamplesSize = COVER_sum(samplesSizes, kFirst, nbSamples); + const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); /* Split samples into testing and training sets */ const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; - const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, kFirst, nbTrainSamples) : totalSamplesSize; - const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples, nbSamples) : totalSamplesSize; + const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? 
COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; /* Checks */ if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { @@ -564,11 +563,6 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); return 0; } - /* Check if nbTrainSamples plus nbTestSamples add up to nbSamples when splitPoint is less than 1*/ - if (nbTrainSamples + nbTestSamples != nbSamples && splitPoint < 1.0) { - DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples"); - return 0; - } /* Zero the context */ memset(ctx, 0, sizeof(*ctx)); DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, @@ -976,7 +970,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - (parameters->splitPoint <= 0.0 || parameters->splitPoint > 1.0) ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 
50 : parameters->k; diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 45d78b05f..8244c3bac 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -86,7 +86,7 @@ typedef struct { unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ - double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, 0 means default (0.8) */ ZDICT_params_t zParams; } ZDICT_cover_params_t; From 7efabb2cf68fed6ebbd92dbecd653496e4e26a42 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 9 Jul 2018 12:26:53 -0700 Subject: [PATCH 16/19] Only make 0.0 default splitPoint --- lib/dictBuilder/cover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 176c386c4..3a97965cf 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -970,7 +970,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint == 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 
50 : parameters->k; From 456f290e31cc81d436f15f17f15f711e0fcba47b Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 9 Jul 2018 13:53:25 -0700 Subject: [PATCH 17/19] Change back to splitPoint<=0 --- lib/dictBuilder/cover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 3a97965cf..176c386c4 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -970,7 +970,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint == 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; From 5021441d86ab366a70584a326eba44d9c9e5aaea Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 10 Jul 2018 11:19:33 -0700 Subject: [PATCH 18/19] Change default splitPoint to 100 --- lib/dictBuilder/cover.c | 4 ++-- lib/dictBuilder/zdict.h | 2 +- programs/zstd.1 | 2 +- programs/zstd.1.md | 2 +- programs/zstdcli.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 176c386c4..e32991652 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -39,7 +39,7 @@ * Constants ***************************************/ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? 
((U32)-1) : ((U32)1 GB)) -#define DEFAULT_SPLITPOINT 0.8 +#define DEFAULT_SPLITPOINT 1.0 /*-************************************* * Console display @@ -497,7 +497,7 @@ static int COVER_checkParameters(ZDICT_cover_params_t parameters, if (parameters.d > parameters.k) { return 0; } - /* 0 < splitPoint < 1 */ + /* 0 < splitPoint <= 1 */ if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){ return 0; } diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 8244c3bac..9357e40a6 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -86,7 +86,7 @@ typedef struct { unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ - double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, 0 means default (0.8) */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, 0 means default (1.0) */ ZDICT_params_t zParams; } ZDICT_cover_params_t; diff --git a/programs/zstd.1 b/programs/zstd.1 index b63ef4f2a..3e9e29423 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -218,7 +218,7 @@ A dictionary ID is a locally unique ID that a decoder can use to verify it is us . .TP \fB\-\-train\-cover[=k#,d=#,steps=#,split=#]\fR -Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. 
If \fIsplit\fR is not specified or \fIsplit\fR <= 0 or \fIsplit\fR > 100, then the default value of 80 is used\. Requires that \fId\fR <= \fIk\fR\. +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or \fIsplit\fR <= 0 or \fIsplit\fR > 100, then the default value of 100 is used\. Requires that \fId\fR <= \fIk\fR\. . .IP Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\. diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 47035f1c0..df6f777df 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -228,7 +228,7 @@ Compression of small files similar to the sample set will be greatly improved. If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. If _k_ is not specified, then it tries _steps_ values in the range [50, 2000]. If _steps_ is not specified, then the default value of 40 is used. - If _split_ is not specified or split <= 0 or split > 100, then the default value of 80 is used. + If _split_ is not specified or split <= 0 or split > 100, then the default value of 100 is used. Requires that _d_ <= _k_. Selects segments of size _k_ with highest score to put in the dictionary. 
diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 5408d2a51..a466a7ff3 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -84,7 +84,7 @@ static U32 g_ldmMinMatch = 0; static U32 g_ldmHashEveryLog = LDM_PARAM_DEFAULT; static U32 g_ldmBucketSizeLog = LDM_PARAM_DEFAULT; -#define DEFAULT_SPLITPOINT 0.8 +#define DEFAULT_SPLITPOINT 1.0 /*-************************************ * Display Macros From 612b346ed51c8497e40c521c77808833361945ab Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 11 Jul 2018 15:50:28 -0700 Subject: [PATCH 19/19] Add explanation for split=100 --- lib/dictBuilder/zdict.h | 2 +- programs/zstd.1 | 3 ++- programs/zstd.1.md | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 9357e40a6..4094669d1 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -86,7 +86,7 @@ typedef struct { unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ - double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, 0 means default (1.0) */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ ZDICT_params_t zParams; } ZDICT_cover_params_t; diff --git a/programs/zstd.1 b/programs/zstd.1 index 3e9e29423..9d1e45cfe 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -218,7 +218,8 @@ A dictionary ID is a locally unique ID that a decoder can use
to verify it is us . .TP \fB\-\-train\-cover[=k#,d=#,steps=#,split=#]\fR -Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or \fIsplit\fR <= 0 or \fIsplit\fR > 100, then the default value of 100 is used\. Requires that \fId\fR <= \fIk\fR\. +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. If \fIsplit\fR is not specified or \fIsplit\fR <= 0, then the default value of 100 is used\. If \fIsplit\fR is 100, all input samples are used for both training and testing +to find optimal \fId\fR and \fIk\fR to build dictionary\. Requires that \fId\fR <= \fIk\fR\. . .IP Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\. diff --git a/programs/zstd.1.md b/programs/zstd.1.md index df6f777df..7e21073d7 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -228,7 +228,7 @@ Compression of small files similar to the sample set will be greatly improved. If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
If _steps_ is not specified, then the default value of 40 is used. - If _split_ is not specified or split <= 0 or split > 100, then the default value of 100 is used. + If _split_ is not specified or split <= 0, then the default value of 100 is used. Requires that _d_ <= _k_. Selects segments of size _k_ with highest score to put in the dictionary. @@ -238,6 +238,8 @@ Compression of small files similar to the sample set will be greatly improved. algorithm will run faster with d <= _8_. Good values for _k_ vary widely based on the input data, but a safe range is [2 * _d_, 2000]. + If _split_ is 100, all input samples are used for both training and testing + to find optimal _d_ and _k_ to build dictionary. Supports multithreading if `zstd` is compiled with threading support. Examples: