From 348e5f77a95922f4bf2232df1bd220ce665cc369 Mon Sep 17 00:00:00 2001
From: Jennifer Liu
Date: Fri, 29 Jun 2018 17:54:41 -0700
Subject: [PATCH] Add split=# to cli

---
Review notes (ignored by git am):
 - parseCoverParameters: splitPercentage is now declared before the first
   statement, so the function no longer mixes declarations and code.
 - parseCoverParameters: the debug print uses %u for the unsigned percentage
   and reports 0 (not 100) when split= was not given.
 - zstd.1.md: the synopsis now reads k=#, matching the parser and the examples.

 lib/dictBuilder/cover.c | 8 ++++----
 programs/zstd.1.md      | 5 ++++-
 programs/zstdcli.c      | 8 +++++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index 53f3d79a8..a3195aa77 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -558,15 +558,15 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   /* Check if there's training sample */
   if (nbTrainSamples < 1) {
     DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
     return 0;
   }
   /* Check if there's testing sample when splitPoint is nonzero */
   if (nbTestSamples < 1 && splitPoint < 1.0) {
     DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
-    DISPLAYLEVEL(1, "splitPoint is %i", (int)(splitPoint*100));
-    DISPLAYLEVEL(1, "nbSamples is %u", nbSamples);
+    return 0;
+  }
+  if (nbTrainSamples + nbTestSamples != nbSamples) {
+    DISPLAYLEVEL(1, "nbTrainSamples plus nbTestSamples don't add up to nbSamples");
     return 0;
   }
   /* Zero the context */
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index 4b3818141..c45bdb386 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -223,11 +223,12 @@ Compression of small files similar to the sample set will be greatly improved.
     This compares favorably to 4 bytes default.
     However, it's up to the dictionary manager to not assign twice the same ID to
     2 different dictionaries.
-* `--train-cover[=k#,d=#,steps=#]`:
+* `--train-cover[=k=#,d=#,steps=#,split=#]`:
     Select parameters for the default dictionary builder algorithm named cover.
     If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
     If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
     If _steps_ is not specified, then the default value of 40 is used.
+    If _split_ is not specified, then the default value of 80 is used.
     Requires that _d_ <= _k_.
 
     Selects segments of size _k_ with highest score to put in the dictionary.
@@ -249,6 +250,8 @@ Compression of small files similar to the sample set will be greatly improved.
 
     `zstd --train-cover=k=50 FILEs`
 
+    `zstd --train-cover=k=50,split=60 FILEs`
+
 * `--train-legacy[=selectivity=#]`:
     Use legacy dictionary builder algorithm with the given dictionary
     _selectivity_ (default: 9).
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index ae8c9cba9..68404d660 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -278,14 +278,20 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
 {
+    unsigned splitPercentage = 0;  /* stays 0, like the zeroed params->splitPoint, when split= is absent */
     memset(params, 0, sizeof(*params));
     for (; ;) {
         if (longCommandWArg(&stringPtr, "k=")) { params->k = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
         if (longCommandWArg(&stringPtr, "steps=")) { params->steps = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "split=")) {
+            splitPercentage = readU32FromChar(&stringPtr);
+            params->splitPoint = (double)splitPercentage / 100.0;
+            if (stringPtr[0]==',') { stringPtr++; continue; } else break;
+        }
         return 0;
     }
     if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplitPoint=%u\n", params->k, params->d, params->steps, splitPercentage);
     return 1;
 }
 