From feba969a6932d35b97701e4581ae042dcc32ab00 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 1 May 2017 10:13:56 -0700 Subject: [PATCH 1/5] Fix LZ4_MSG in xzstd --- programs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/Makefile b/programs/Makefile index 2619614c5..2ab29e1ef 100644 --- a/programs/Makefile +++ b/programs/Makefile @@ -145,7 +145,7 @@ zstd-nogz : ZLIB_MSG := $(NO_ZLIB_MSG) zstd-nogz : LZMA_MSG := $(NO_LZMA_MSG) xzstd : CPPFLAGS += $(ZLIBCPP) $(LZMACPP) xzstd : LDFLAGS += $(ZLIBLD) $(LZMALD) -xzstd : LZ4_MSG := $(NO_LZMA_MSG) +xzstd : LZ4_MSG := $(NO_LZ4_MSG) zstd4 : CPPFLAGS += $(ZLIBCPP) $(LZ4CPP) zstd4 : LDFLAGS += $(ZLIBLD) $(LZ4LD) zstd4 : LZMA_MSG := $(NO_LZMA_MSG) From 865918dd041ec7bdfb730ecc2c346659e469216e Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 1 May 2017 10:14:15 -0700 Subject: [PATCH 2/5] Fix typo in zdict.h --- lib/dictBuilder/zdict.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 669b78d08..9b53de346 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -88,7 +88,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict /*! COVER_params_t : For all values 0 means default. - kMin and d are the only required parameters. + k and d are the only required parameters. 
*/ typedef struct { unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ From f2d9ef1dc0d491b75877a9200876a7c16dd9694d Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 1 May 2017 10:25:49 -0700 Subject: [PATCH 3/5] [cover] Optimize case where d <= 8 --- lib/dictBuilder/cover.c | 37 ++++++++++++++++++++++++++++++------- programs/zstd.1 | 4 ++-- programs/zstd.1.md | 3 ++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 1db42f95b..4235f112b 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -234,10 +234,22 @@ static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) { * Returns 1 if the dmer at lp is greater than the dmer at rp. */ static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) { - const U32 lhs = *(const U32 *)lp; - const U32 rhs = *(const U32 *)rp; + U32 const lhs = *(U32 const *)lp; + U32 const rhs = *(U32 const *)rp; return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d); } +/** + * Faster version for d <= 8. + */ +static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) { + U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1); + U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask; + U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask; + if (lhs < rhs) { + return -1; + } + return (lhs > rhs); +} /** * Same as COVER_cmp() except ties are broken by pointer value @@ -251,6 +263,16 @@ static int COVER_strict_cmp(const void *lp, const void *rp) { } return result; } +/** + * Faster version for d <= 8. + */ +static int COVER_strict_cmp8(const void *lp, const void *rp) { + int result = COVER_cmp8(g_ctx, lp, rp); + if (result == 0) { + result = lp < rp ? 
-1 : 1; + } + return result; +} /** * Returns the first pointer in [first, last) whose element does not compare @@ -506,7 +528,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const BYTE *const samples = (const BYTE *)samplesBuffer; const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); /* Checks */ - if (totalSamplesSize < d || + if (totalSamplesSize < MAX(d, sizeof(U64)) || totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", (COVER_MAX_SAMPLES_SIZE >> 20)); @@ -520,7 +542,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, ctx->samplesSizes = samplesSizes; ctx->nbSamples = nbSamples; /* Partial suffix array */ - ctx->suffixSize = totalSamplesSize - d + 1; + ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1; ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); /* Maps index to the dmerID */ ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); @@ -554,7 +576,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, } /* qsort doesn't take an opaque pointer, so pass as a global */ g_ctx = ctx; - qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp); + qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); } DISPLAYLEVEL(2, "Computing frequencies\n"); /* For each dmer group (group of positions with the same first d bytes): @@ -564,8 +587,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, * 2. We calculate how many samples the dmer occurs in and save it in * freqs[dmerId]. */ - COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp, - &COVER_group); + COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, + (ctx->d <= 8 ? 
&COVER_cmp8 : &COVER_cmp), &COVER_group); ctx->freqs = ctx->suffix; ctx->suffix = NULL; return 1; diff --git a/programs/zstd.1 b/programs/zstd.1 index 999dc8169..5bd966a84 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -1,5 +1,5 @@ . -.TH "ZSTD" "1" "April 2017" "zstd 1.1.5" "User Commands" +.TH "ZSTD" "1" "May 2017" "zstd 1.2.0" "User Commands" . .SH "NAME" \fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files @@ -188,7 +188,7 @@ dictionary selectivity level (default: 9) the smaller the value, the denser the . .TP \fB\-\-cover=k#,d=#\fR -Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR\. Selects segments of size \fIk\fR with the highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR\. Generally \fId\fR should be in the range [6, 24]\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048]\. +Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR\. Selects segments of size \fIk\fR with the highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], but no more than 24\. When \fId\fR <= 8, the dictionary builder will run significantly faster\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048]\. . .br Example: \fB\-\-train \-\-cover=k=64,d=8 FILEs\fR\. diff --git a/programs/zstd.1.md b/programs/zstd.1.md index f2d04d16f..0919da702 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -186,7 +186,8 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB). Selects segments of size _k_ with the highest score to put in the dictionary. 
The score of a segment is computed by the sum of the frequencies of all the subsegments of of size _d_. - Generally _d_ should be in the range [6, 24]. + Generally _d_ should be in the range [6, 8], but no more than 24. + When _d_ <= 8, the dictionary builder will run significantly faster. Good values for _k_ vary widely based on the input data, but a safe range is [32, 2048].
Example: `--train --cover=k=64,d=8 FILEs`. From 020b960e13499cb2cd524cc6b7ddb0ccbd76c361 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 1 May 2017 21:26:33 -0700 Subject: [PATCH 4/5] [cover] Make optimization faster --- lib/dictBuilder/cover.c | 4 ++-- programs/zstd.1 | 6 +++--- programs/zstd.1.md | 9 +++------ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 4235f112b..0e156bd58 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -939,8 +939,8 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer, /* constants */ const unsigned nbThreads = parameters->nbThreads; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; - const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d; - const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 40 + kMaxD : parameters->k; const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k; const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps; const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); diff --git a/programs/zstd.1 b/programs/zstd.1 index 5bd966a84..648c3365c 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -195,13 +195,13 @@ Example: \fB\-\-train \-\-cover=k=64,d=8 FILEs\fR\. . .TP \fB\-\-optimize\-cover[=steps=#,k=#,d=#]\fR -If \fIsteps\fR is not specified, the default value of 32 is used\. If \fIk\fR is not specified, the \fIk\fR values in [16, 2048] are checked for each value of \fId\fR\. If \fId\fR is not specified, the values checked are [6, 8, \.\.\., 16]\. +If \fIsteps\fR is not specified, the default value of 32 is used\. If \fIk\fR is not specified, the \fIk\fR values in [48, 2048] are checked for each value of \fId\fR\. If \fId\fR is not specified, the values checked are [6, 8]\. . 
.IP Runs the cover dictionary builder for each parameter set and saves the optimal parameters and dictionary\. Prints optimal parameters and writes optimal dictionary into output file\. Supports multithreading if \fBzstd\fR is compiled with threading support\. . .IP -The parameter \fIk\fR is more sensitive than \fId\fR, and is faster to optimize over\. Suggested use is to run with a \fIsteps\fR <= 32 with neither \fIk\fR nor \fId\fR set\. Once it completes, use the value of \fId\fR it selects with a higher \fIsteps\fR (in the range [256, 1024])\. +The parameter \fIk\fR is more sensitive than \fId\fR, and is faster to optimize over\. . .IP Examples : @@ -210,7 +210,7 @@ Examples : \fBzstd \-\-train \-\-optimize\-cover FILEs\fR . .IP -\fBzstd \-\-train \-\-optimize\-cover=d=d,steps=512 FILEs\fR +\fBzstd \-\-train \-\-optimize\-cover=d=8,steps=512 FILEs\fR . .SH "BENCHMARK" . diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 0919da702..ea346cfb7 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -194,9 +194,9 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB). * `--optimize-cover[=steps=#,k=#,d=#]`: If _steps_ is not specified, the default value of 32 is used. - If _k_ is not specified, the _k_ values in [16, 2048] are checked for each + If _k_ is not specified, the _k_ values in [48, 2048] are checked for each value of _d_. - If _d_ is not specified, the values checked are [6, 8, ..., 16]. + If _d_ is not specified, the values checked are [6, 8]. Runs the cover dictionary builder for each parameter set and saves the optimal parameters and dictionary. @@ -204,15 +204,12 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB). Supports multithreading if `zstd` is compiled with threading support. The parameter _k_ is more sensitive than _d_, and is faster to optimize over. - Suggested use is to run with a _steps_ <= 32 with neither _k_ nor _d_ set. 
- Once it completes, use the value of _d_ it selects with a higher _steps_ - (in the range [256, 1024]). Examples : `zstd --train --optimize-cover FILEs` - `zstd --train --optimize-cover=d=d,steps=512 FILEs` + `zstd --train --optimize-cover=d=8,steps=512 FILEs` BENCHMARK From f376d47c11e083c08a35ec3de6e4a66b7da60b14 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 1 May 2017 23:40:20 -0700 Subject: [PATCH 5/5] [CLI] Switch dictionary builder on CLI to cover --- lib/dictBuilder/cover.c | 6 ++-- programs/zstd.1 | 54 ++++++++++++++++++------------- programs/zstd.1.md | 71 +++++++++++++++++++++++------------------ programs/zstdcli.c | 62 ++++++++++++++++++++++++++--------- tests/playTests.sh | 32 +++++++++++++++---- 5 files changed, 147 insertions(+), 78 deletions(-) diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c index 0e156bd58..1863c8f34 100644 --- a/lib/dictBuilder/cover.c +++ b/lib/dictBuilder/cover.c @@ -940,9 +940,9 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer, const unsigned nbThreads = parameters->nbThreads; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; - const unsigned kMinK = parameters->k == 0 ? 40 + kMaxD : parameters->k; - const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k; - const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); const unsigned kIterations = (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); diff --git a/programs/zstd.1 b/programs/zstd.1 index 648c3365c..6cc5f7e9d 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -168,49 +168,57 @@ All arguments after \fB\-\-\fR are treated as files . 
.TP \fB\-\-train FILEs\fR -use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\. +Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\. +. +.IP +Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\. . .TP \fB\-o file\fR -dictionary saved into \fBfile\fR (default name: dictionary) +Dictionary saved into \fBfile\fR (default name: dictionary)\. . .TP \fB\-\-maxdict=#\fR -limit dictionary to specified size (default : (112640) +Limit dictionary to specified size (default: 112640)\. . .TP \fB\-\-dictID=#\fR A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\. . .TP -\fB\-s#\fR -dictionary selectivity level (default: 9) the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. +\fB\-\-train\-cover[=k=#,d=#,steps=#]\fR +Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\.
If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\. +. +.IP +Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with \fId\fR <= 8\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\. +. +.IP +Examples: +. +.IP +\fBzstd \-\-train\-cover FILEs\fR +. +.IP +\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR +. +.IP +\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR +. +.IP +\fBzstd \-\-train\-cover=k=50 FILEs\fR . .TP -\fB\-\-cover=k#,d=#\fR -Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR\. Selects segments of size \fIk\fR with the highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], but no more than 24\. When \fId\fR <= 8, the dictionary builder will run significantly faster\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048]\. -. -.br -Example: \fB\-\-train \-\-cover=k=64,d=8 FILEs\fR\. -. -.TP -\fB\-\-optimize\-cover[=steps=#,k=#,d=#]\fR -If \fIsteps\fR is not specified, the default value of 32 is used\. If \fIk\fR is not specified, the \fIk\fR values in [48, 2048] are checked for each value of \fId\fR\. If \fId\fR is not specified, the values checked are [6, 8]\. +\fB\-\-train\-legacy[=selectivity=#]\fR +Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\.
The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\. . .IP -Runs the cover dictionary builder for each parameter set and saves the optimal parameters and dictionary\. Prints optimal parameters and writes optimal dictionary into output file\. Supports multithreading if \fBzstd\fR is compiled with threading support\. +Examples: . .IP -The parameter \fIk\fR is more sensitive than \fId\fR, and is faster to optimize over\. +\fBzstd \-\-train\-legacy FILEs\fR . .IP -Examples : -. -.IP -\fBzstd \-\-train \-\-optimize\-cover FILEs\fR -. -.IP -\fBzstd \-\-train \-\-optimize\-cover=d=8,steps=512 FILEs\fR +\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR . .SH "BENCHMARK" . diff --git a/programs/zstd.1.md b/programs/zstd.1.md index ea346cfb7..118c9f2f8 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -158,14 +158,19 @@ It will improve compression ratio of small files. Typical gains range from 10% (at 64KB) to x5 better (at <1KB). * `--train FILEs`: - use FILEs as training set to create a dictionary. + Use FILEs as training set to create a dictionary. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary). + + Supports multithreading if `zstd` is compiled with threading support. + Additional parameters can be specified with `--train-cover`. + The legacy dictionary builder can be accessed with `--train-legacy`. + Equivalent to `--train-cover=d=8,steps=4`. * `-o file`: - dictionary saved into `file` (default name: dictionary) + Dictionary saved into `file` (default name: dictionary). * `--maxdict=#`: - limit dictionary to specified size (default : (112640) + Limit dictionary to specified size (default: 112640). * `--dictID=#`: A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary. 
@@ -176,40 +181,44 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB). This compares favorably to 4 bytes default. However, it's up to the dictionary manager to not assign twice the same ID to 2 different dictionaries. -* `-s#`: - dictionary selectivity level (default: 9) - the smaller the value, the denser the dictionary, - improving its efficiency but reducing its possible maximum size. -* `--cover=k#,d=#`: - Use alternate dictionary builder algorithm named cover with parameters - _k_ and _d_ with _d_ <= _k_. - Selects segments of size _k_ with the highest score to put in the dictionary. +* `--train-cover[=k=#,d=#,steps=#]`: + Select parameters for the default dictionary builder algorithm named cover. + If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8. + If _k_ is not specified, then it tries _steps_ values in the range [50, 2000]. + If _steps_ is not specified, then the default value of 40 is used. + Requires that _d_ <= _k_. + + Selects segments of size _k_ with highest score to put in the dictionary. The score of a segment is computed by the sum of the frequencies of all the - subsegments of of size _d_. - Generally _d_ should be in the range [6, 8], but no more than 24. - When _d_ <= 8, the dictionary builder will run significantly faster. - Good values for _k_ vary widely based on the input data, - but a safe range is [32, 2048].
- Example: `--train --cover=k=64,d=8 FILEs`. - -* `--optimize-cover[=steps=#,k=#,d=#]`: - If _steps_ is not specified, the default value of 32 is used. - If _k_ is not specified, the _k_ values in [48, 2048] are checked for each - value of _d_. - If _d_ is not specified, the values checked are [6, 8]. - - Runs the cover dictionary builder for each parameter set - and saves the optimal parameters and dictionary. - Prints optimal parameters and writes optimal dictionary into output file. + subsegments of size _d_. + Generally _d_ should be in the range [6, 8], occasionally up to 16, but the + algorithm will run faster with _d_ <= 8. + Good values for _k_ vary widely based on the input data, but a safe range is + [2 * _d_, 2000]. Supports multithreading if `zstd` is compiled with threading support. - The parameter _k_ is more sensitive than _d_, and is faster to optimize over. + Examples: - Examples : + `zstd --train-cover FILEs` - `zstd --train --optimize-cover FILEs` + `zstd --train-cover=k=50,d=8 FILEs` - `zstd --train --optimize-cover=d=8,steps=512 FILEs` + `zstd --train-cover=d=8,steps=500 FILEs` + + `zstd --train-cover=k=50 FILEs` + +* `--train-legacy[=selectivity=#]`: + Use legacy dictionary builder algorithm with the given dictionary + _selectivity_ (default: 9). + The smaller the _selectivity_ value, the denser the dictionary, + improving its efficiency but reducing its possible maximum size. + `--train-legacy=s=#` is also accepted.
+ + Examples: + + `zstd --train-legacy FILEs` + + `zstd --train-legacy=selectivity=8 FILEs` BENCHMARK diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 79bc84877..f6ce2e1a6 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -153,11 +153,10 @@ static int usage_advanced(const char* programName) DISPLAY( "\n"); DISPLAY( "Dictionary builder :\n"); DISPLAY( "--train ## : create a dictionary from a training set of files \n"); - DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n"); - DISPLAY( "--optimize-cover[=steps=#,k=#,d=#] : optimize cover parameters with optional parameters\n"); + DISPLAY( "--train-cover[=k=#,d=#,steps=#] : use the cover algorithm with optional args\n"); + DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel); DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName); DISPLAY( "--maxdict=# : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize); - DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel); DISPLAY( "--dictID=# : force dictionary ID to specified value (default: random)\n"); #endif #ifndef ZSTD_NOBENCH @@ -241,11 +240,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) #ifndef ZSTD_NODICT /** * parseCoverParameters() : - * reads cover parameters from *stringPtr (e.g. "--cover=smoothing=100,kmin=48,kstep=4,kmax=64,d=8") into *params + * reads cover parameters from *stringPtr (e.g. 
"--train-cover=k=48,d=8,steps=32") into *params * @return 1 means that cover parameters were correct * @return 0 in case of malformed parameters */ -static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params) +static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params) { memset(params, 0, sizeof(*params)); for (; ;) { @@ -255,9 +254,33 @@ static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *para return 0; } if (stringPtr[0] != 0) return 0; - DISPLAYLEVEL(4, "k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps); + DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\n", params->k, params->d, params->steps); return 1; } + +/** + * parseLegacyParameters() : + * reads legacy dictionary builder parameters from *stringPtr (e.g. "--train-legacy=selectivity=8") into *selectivity + * @return 1 means that legacy dictionary builder parameters were correct + * @return 0 in case of malformed parameters + */ +static unsigned parseLegacyParameters(const char* stringPtr, unsigned* selectivity) +{ + if (!longCommandWArg(&stringPtr, "s=") && !longCommandWArg(&stringPtr, "selectivity=")) { return 0; } + *selectivity = readU32FromChar(&stringPtr); + if (stringPtr[0] != 0) return 0; + DISPLAYLEVEL(4, "legacy: selectivity=%u\n", *selectivity); + return 1; +} + +static COVER_params_t defaultCoverParams(void) +{ + COVER_params_t params; + memset(&params, 0, sizeof(params)); + params.d = 8; + params.steps = 4; + return params; +} #endif @@ -331,8 +354,8 @@ int main(int argCount, const char* argv[]) unsigned fileNamesNb; #endif #ifndef ZSTD_NODICT - COVER_params_t coverParams; - int cover = 0; + COVER_params_t coverParams = defaultCoverParams(); + int cover = 1; #endif /* init */ @@ -413,18 +436,26 @@ int main(int argCount, const char* argv[]) /* long commands with arguments */ #ifndef ZSTD_NODICT - if (longCommandWArg(&argument, "--cover=")) { - cover=1; if (!parseCoverParameters(argument, &coverParams)) 
CLEAN_RETURN(badusage(programName)); - continue; - } - if (longCommandWArg(&argument, "--optimize-cover")) { - cover=2; + if (longCommandWArg(&argument, "--train-cover")) { + operation = zom_train; + outFileName = g_defaultDictName; + cover = 1; /* Allow optional arguments following an = */ if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); } else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); } else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); } continue; } + if (longCommandWArg(&argument, "--train-legacy")) { + operation = zom_train; + outFileName = g_defaultDictName; + cover = 0; + /* Allow optional arguments following an = */ + if (*argument == 0) { continue; } + else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); } + else if (!parseLegacyParameters(argument, &dictSelect)) { CLEAN_RETURN(badusage(programName)); } + continue; + } #endif if (longCommandWArg(&argument, "--threads=")) { nbThreads = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; } @@ -659,11 +690,12 @@ int main(int argCount, const char* argv[]) if (operation==zom_train) { #ifndef ZSTD_NODICT if (cover) { + int const optimize = !coverParams.k || !coverParams.d; coverParams.nbThreads = nbThreads; coverParams.compressionLevel = dictCLevel; coverParams.notificationLevel = g_displayLevel; coverParams.dictID = dictID; - operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1); + operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize); } else { ZDICT_params_t dictParams; memset(&dictParams, 0, sizeof(dictParams)); diff --git a/tests/playTests.sh b/tests/playTests.sh index 369506c2e..021fd59fe 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -314,9 +314,9 @@ esac rm -rf dirTestDict $ECHO 
"- dictionary builder on bogus input" $ECHO "Hello World" > tmp -$ZSTD --train -q tmp && die "Dictionary training should fail : not enough input source" +$ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source" ./datagen -P0 -g10M > tmp -$ZSTD --train -q tmp && die "Dictionary training should fail : source is pure noise" +$ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise" rm tmp* @@ -325,19 +325,39 @@ $ECHO "\n**** cover dictionary tests **** " TESTFILE=../programs/zstdcli.c ./datagen > tmpDict $ECHO "- Create first dictionary" -$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c -o tmpDict +$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c -o tmpDict cp $TESTFILE tmp $ZSTD -f tmp -D tmpDict $ZSTD -d tmp.zst -D tmpDict -fo result $DIFF $TESTFILE result $ECHO "- Create second (different) dictionary" -$ZSTD --train --cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC +$ZSTD --train-cover=k=56,d=8 *.c ../programs/*.c ../programs/*.h -o tmpDictC $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!" $ECHO "- Create dictionary with short dictID" -$ZSTD --train --cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1 +$ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" 
$ECHO "- Create dictionary with size limit" -$ZSTD --train --optimize-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K +$ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K +rm tmp* + +$ECHO "\n**** legacy dictionary tests **** " + +TESTFILE=../programs/zstdcli.c +./datagen > tmpDict +$ECHO "- Create first dictionary" +$ZSTD --train-legacy=selectivity=8 *.c ../programs/*.c -o tmpDict +cp $TESTFILE tmp +$ZSTD -f tmp -D tmpDict +$ZSTD -d tmp.zst -D tmpDict -fo result +$DIFF $TESTFILE result +$ECHO "- Create second (different) dictionary" +$ZSTD --train-legacy=s=5 *.c ../programs/*.c ../programs/*.h -o tmpDictC +$ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!" +$ECHO "- Create dictionary with short dictID" +$ZSTD --train-legacy -s5 *.c ../programs/*.c --dictID=1 -o tmpDict1 +cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" +$ECHO "- Create dictionary with size limit" +$ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K rm tmp*