From 782bfb858afb88f014271f8efbb7bef909cee6db Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 15 Aug 2019 16:41:34 +0200 Subject: [PATCH 01/16] fixed very minor inefficiency (nbSeq==127) The nbSeq "short" format (1-byte) is compatible with any value < 128. However, the code would cautiously only accept values < 127. This is not an error, because the general 2-bytes format is compatible with small values < 128. Hence the inefficiency never triggered any warning. Spotted by Intel's Smita Kumar. --- lib/compress/zstd_compress.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index cd73db13b..b4ae4e877 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1981,12 +1981,17 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, /* Sequences Header */ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, dstSize_tooSmall); - if (nbSeq < 0x7F) + if (nbSeq < 128) { *op++ = (BYTE)nbSeq; - else if (nbSeq < LONGNBSEQ) - op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; - else - op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } assert(op <= oend); if (nbSeq==0) { /* Copy the old tables over as if we repeated them */ From b81d7cc6a070ec67d9d511ad96c0fe0ab98f81e9 Mon Sep 17 00:00:00 2001 From: Ed Maste Date: Thu, 15 Aug 2019 21:17:06 -0400 Subject: [PATCH 02/16] remove extraneous doubled ;s --- lib/dictBuilder/zdict.c | 2 +- programs/dibio.c | 2 +- tests/fuzzer.c | 6 +++--- tests/zbufftest.c | 4 ++-- tests/zstreamtest.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index ee21ee1a9..4a263d822 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -571,7 +571,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length) unsigned const prime1 = 2654435761U; unsigned const prime2 = 2246822519U; unsigned acc = prime1; - size_t p=0;; + size_t p=0; for (p=0; p> 21); diff --git a/programs/dibio.c b/programs/dibio.c index 12eb32680..ea4bb4bf1 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -201,7 +201,7 @@ static void DiB_fillNoise(void* buffer, size_t length) unsigned const prime1 = 2654435761U; unsigned const prime2 = 2246822519U; unsigned acc = prime1; - size_t p=0;; + size_t p=0; for (p=0; p Date: Fri, 16 Aug 2019 15:13:42 +0200 Subject: [PATCH 03/16] clarifications on the meaning of field `Block_Size` following comments from Intel's Smita Kumar. --- doc/zstd_compression_format.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md index 7d02426a5..39a8f1776 100644 --- a/doc/zstd_compression_format.md +++ b/doc/zstd_compression_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.3.2 (17/07/19) +0.3.3 (16/08/19) Introduction @@ -358,6 +358,7 @@ It may be followed by an optional `Content_Checksum` __`Block_Type`__ The next 2 bits represent the `Block_Type`. +`Block_Type` influences the meaning of `Block_Size`. There are 4 block types : | Value | 0 | 1 | 2 | 3 | @@ -384,9 +385,12 @@ There are 4 block types : __`Block_Size`__ The upper 21 bits of `Block_Header` represent the `Block_Size`. -`Block_Size` is the size of the block excluding the header. -A block can contain any number of bytes (even zero), up to -`Block_Maximum_Decompressed_Size`, which is the smallest of: +When `Block_Type` is `Compressed_Block` or `Raw_Block`, +`Block_Size` is the size of `Block_Content`, hence excluding `Block_Header`. +When `Block_Type` is `RLE_Block`, `Block_Content`’s size is always 1, +and `Block_Size` represents the nb of times this byte must be repeated. +A block can contain and decompress into any number of bytes (even zero), +up to `Block_Maximum_Decompressed_Size`, which is the smallest of: - Window_Size - 128 KB @@ -1653,6 +1657,7 @@ or at least provide a meaningful error code explaining for which reason it canno Version changes --------------- +- 0.3.3 : clarifications for field Block_Size - 0.3.2 : remove additional block size restriction on compressed blocks - 0.3.1 : minor clarification regarding offset history update rules - 0.3.0 : minor edits to match RFC8478 From af0c9501d120ff83a7e0871a0b18ad7237813a8c Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Thu, 15 Aug 2019 23:57:55 -0700 Subject: [PATCH 04/16] Add --stream-size=# command --- programs/fileio.c | 21 +++++++++++++++++++-- programs/fileio.h | 1 + programs/zstdcli.c | 6 +++++- tests/playTests.sh | 22 +++++++++++++++++++++- 4 files changed, 46 insertions(+), 4 deletions(-) diff --git a/programs/fileio.c b/programs/fileio.c index 569a410c1..82d70075b 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -304,6 +304,7 @@ struct FIO_prefs_s { int ldmMinMatch; int ldmBucketSizeLog; int ldmHashRateLog; + size_t streamSrcSize; size_t targetCBlockSize; ZSTD_literalCompressionMode_e literalCompressionMode; @@ -349,6 +350,7 @@ FIO_prefs_t* FIO_createPreferences(void) ret->ldmMinMatch = 0; ret->ldmBucketSizeLog = FIO_LDM_PARAM_NOTSET; ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET; + ret->streamSrcSize = 0; ret->targetCBlockSize = 0; ret->literalCompressionMode = ZSTD_lcm_auto; return ret; @@ -418,6 +420,10 @@ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable) { prefs->rsyncable = rsyncable; } +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize) { + prefs->streamSrcSize = streamSrcSize; +} + void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize) { prefs->targetCBlockSize = targetCBlockSize; } @@ -698,9 +704,20 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) ); #endif /* dictionary */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) ); /* set the value temporarily for dictionary loading, to adapt compression parameters */ + /* set the pledged size for dictionary loading, to adapt compression parameters */ + if (srcSize == ZSTD_CONTENTSIZE_UNKNOWN && prefs->streamSrcSize > 0) { + /* unknown source size; use the declared stream size and disable writing this size to frame during compression */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); + CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 0) ); + } else { + /* use the known source size for adaption */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) ); + } CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) ); - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) ); /* reset */ + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN || prefs->streamSrcSize == 0) { + /* reset pledge when src size is known or stream size is declared */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) ); + } free(dictBuffer); } diff --git a/programs/fileio.h b/programs/fileio.h index 311f8c0e1..13f6f1d05 100644 --- a/programs/fileio.h +++ b/programs/fileio.h @@ -71,6 +71,7 @@ void FIO_setOverlapLog(FIO_prefs_t* const prefs, int overlapLog); void FIO_setRemoveSrcFile(FIO_prefs_t* const prefs, unsigned flag); void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no sparse; 1: disable on stdout; 2: always enabled */ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable); +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize); void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize); void FIO_setLiteralCompressionMode( FIO_prefs_t* const prefs, diff --git a/programs/zstdcli.c b/programs/zstdcli.c index de286cdf2..401e1ee2c 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -141,6 +141,7 @@ static int usage_advanced(const char* programName) DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog); DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1); DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n"); + DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n"); DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n"); #ifdef ZSTD_MULTITHREAD DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n"); @@ -588,6 +589,7 @@ int main(int argCount, const char* argv[]) const char* suffix = ZSTD_EXTENSION; unsigned maxDictSize = g_defaultMaxDictSize; unsigned dictID = 0; + size_t streamSrcSize = 0; size_t targetCBlockSize = 0; int dictCLevel = g_defaultDictCLevel; unsigned dictSelect = g_defaultSelectivityLevel; @@ -745,6 +747,7 @@ int main(int argCount, const char* argv[]) if (longCommandWArg(&argument, "--maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; } + if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--long")) { unsigned ldmWindowLog = 0; @@ -1150,6 +1153,7 @@ int main(int argCount, const char* argv[]) FIO_setAdaptMin(prefs, adaptMin); FIO_setAdaptMax(prefs, adaptMax); FIO_setRsyncable(prefs, rsyncable); + FIO_setStreamSrcSize(prefs, streamSrcSize); FIO_setTargetCBlockSize(prefs, targetCBlockSize); FIO_setLiteralCompressionMode(prefs, literalCompressionMode); if (adaptMin > cLevel) cLevel = adaptMin; @@ -1160,7 +1164,7 @@ int main(int argCount, const char* argv[]) else operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams); #else - (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */ + (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)streamSrcSize; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */ DISPLAY("Compression not supported \n"); #endif } else { /* decompression or test */ diff --git a/tests/playTests.sh b/tests/playTests.sh index 69387321f..431a53a18 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -108,7 +108,6 @@ else fi - println "\n===> simple tests " ./datagen > tmp @@ -1020,4 +1019,25 @@ test -f dictionary rm -f tmp* dictionary +println "\n===> stream-size mode" + +./datagen -g11000 > tmp +println "test : basic file compression vs sized streaming compression" +$ZSTD -14 -f tmp -o tmp.zst |& tee file.out +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 |& tee stream_sized.out + +file_ratio=$(cat file.out | awk '{print $4}' | sed 's/%//g') +stream_sized_ratio=$(cat stream_sized.out | awk '{print $4}' | sed 's/%//g') +rm file.out stream_sized.out + +ratio_diff=$(echo $file_ratio - $stream_sized_ratio | bc) +if [ $(echo "(100 * $ratio_diff) > 5" | bc -l) == 1 ] +then + die "greater than 0.05% difference between file and sized-streaming compression" +fi + +println "test : incorrect stream size" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" + + rm -f tmp* From 85d07c6c474abc8f16b069332fbaba3054819d14 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Fri, 16 Aug 2019 12:49:21 -0700 Subject: [PATCH 05/16] Tweak stdout, stderr redirection in new playTests --- tests/playTests.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/playTests.sh b/tests/playTests.sh index 431a53a18..c516aa712 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -1023,8 +1023,8 @@ println "\n===> stream-size mode" ./datagen -g11000 > tmp println "test : basic file compression vs sized streaming compression" -$ZSTD -14 -f tmp -o tmp.zst |& tee file.out -cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 |& tee stream_sized.out +$ZSTD -14 -f tmp -o tmp.zst 2>&1 | tee file.out +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 2>&1 | tee stream_sized.out file_ratio=$(cat file.out | awk '{print $4}' | sed 's/%//g') stream_sized_ratio=$(cat stream_sized.out | awk '{print $4}' | sed 's/%//g') @@ -1035,7 +1035,10 @@ if [ $(echo "(100 * $ratio_diff) > 5" | bc -l) == 1 ] then die "greater than 0.05% difference between file and sized-streaming compression" fi - +println "test : sized streaming compression and decompression" +cat tmp | $ZSTD -14 -f tmp -o --stream-size=11000 tmp.zst +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" println "test : incorrect stream size" cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" From 97bb38635c0616ab024af0a71b3d115fcbc4382a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Sat, 17 Aug 2019 08:04:42 +0200 Subject: [PATCH 06/16] `number` instead of `nb` suggested by @terrelln --- doc/zstd_compression_format.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md index 39a8f1776..111dd98ae 100644 --- a/doc/zstd_compression_format.md +++ b/doc/zstd_compression_format.md @@ -388,7 +388,7 @@ The upper 21 bits of `Block_Header` represent the `Block_Size`. When `Block_Type` is `Compressed_Block` or `Raw_Block`, `Block_Size` is the size of `Block_Content`, hence excluding `Block_Header`. When `Block_Type` is `RLE_Block`, `Block_Content`’s size is always 1, -and `Block_Size` represents the nb of times this byte must be repeated. +and `Block_Size` represents the number of times this byte must be repeated. A block can contain and decompress into any number of bytes (even zero), up to `Block_Maximum_Decompressed_Size`, which is the smallest of: - Window_Size From c403b12f9dbab9ba3ab30fca6e1ce3c31d2f8923 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Mon, 19 Aug 2019 09:01:31 -0700 Subject: [PATCH 07/16] Set pledged size just before compression --- programs/fileio.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/programs/fileio.c b/programs/fileio.c index 82d70075b..75b271a8f 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -704,21 +704,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) ); #endif /* dictionary */ - /* set the pledged size for dictionary loading, to adapt compression parameters */ - if (srcSize == ZSTD_CONTENTSIZE_UNKNOWN && prefs->streamSrcSize > 0) { - /* unknown source size; use the declared stream size and disable writing this size to frame during compression */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); - CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 0) ); - } else { - /* use the known source size for adaption */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) ); - } CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) ); - if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN || prefs->streamSrcSize == 0) { - /* reset pledge when src size is known or stream size is declared */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) ); - } - free(dictBuffer); } @@ -1020,6 +1006,10 @@ FIO_compressZstdFrame(FIO_prefs_t* const prefs, /* init */ if (fileSize != UTIL_FILESIZE_UNKNOWN) { CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize)); + } else if (prefs->streamSrcSize > 0) { + /* unknown source size; use the declared stream size and disable writing this size to frame during compression */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); + CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 0) ); } (void)srcFileName; From bbd83c2ab34dc498a63ff10e869e26fbe1e221a4 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Mon, 19 Aug 2019 09:11:22 -0700 Subject: [PATCH 08/16] Update man page --- programs/zstd.1.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 3ab2667a0..e3f729282 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -152,6 +152,12 @@ the last one takes effect. This feature does not work with `--single-thread`. You probably don't want to use it with long range mode, since it will decrease the effectiveness of the synchronization points, but your milage may vary. +* `--stream-size` : + When handling input from a stream, `zstd` must guess how large the source size + will be when optimizing compression parameters. If the stream size is relatively + small, this guess may be a poor one, resulting in a higher compression ratio than + expected. This feature will set the source size of a stream. Note that it must + be exact; incorrect stream sizes will cause an error. * `-D file`: use `file` as Dictionary to compress or decompress FILE(s) * `--no-dictID`: From f781cf672bfd23dd467a6dcf08ba094180032c76 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Mon, 19 Aug 2019 11:07:43 -0700 Subject: [PATCH 09/16] Remove extraneous parameter --- programs/fileio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/programs/fileio.c b/programs/fileio.c index 75b271a8f..f5ecf7296 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -639,7 +639,6 @@ typedef struct { static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, const char* dictFileName, int cLevel, - U64 srcSize, ZSTD_compressionParameters comprParams) { cRess_t ress; memset(&ress, 0, sizeof(ress)); @@ -1371,7 +1370,7 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, U64 const fileSize = UTIL_getFileSize(srcFileName); U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : fileSize; - cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel); @@ -1424,8 +1423,7 @@ int FIO_compressMultipleFilenames(FIO_prefs_t* const prefs, int error = 0; U64 const firstFileSize = UTIL_getFileSize(inFileNamesTable[0]); U64 const firstSrcSize = (firstFileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : firstFileSize; - U64 const srcSize = (nbFiles != 1) ? ZSTD_CONTENTSIZE_UNKNOWN : firstSrcSize ; - cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); /* init */ assert(outFileName != NULL || suffix != NULL); From a24dc3a935a43ef8088711afadece9b94ae59a27 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Mon, 19 Aug 2019 11:14:56 -0700 Subject: [PATCH 10/16] Remove extraneous variables --- programs/fileio.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/programs/fileio.c b/programs/fileio.c index f5ecf7296..5492f9448 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -1367,9 +1367,6 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, const char* dictFileName, int compressionLevel, ZSTD_compressionParameters comprParams) { - U64 const fileSize = UTIL_getFileSize(srcFileName); - U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : fileSize; - cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel); @@ -1421,8 +1418,6 @@ int FIO_compressMultipleFilenames(FIO_prefs_t* const prefs, ZSTD_compressionParameters comprParams) { int error = 0; - U64 const firstFileSize = UTIL_getFileSize(inFileNamesTable[0]); - U64 const firstSrcSize = (firstFileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : firstFileSize; cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); /* init */ From 30bfa228e84500855e80e792f7de796257d99ab3 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Mon, 19 Aug 2019 11:20:28 -0700 Subject: [PATCH 11/16] Keep content size flag set in stream size mode --- programs/fileio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/programs/fileio.c b/programs/fileio.c index 5492f9448..873013a51 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -1006,9 +1006,8 @@ FIO_compressZstdFrame(FIO_prefs_t* const prefs, if (fileSize != UTIL_FILESIZE_UNKNOWN) { CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize)); } else if (prefs->streamSrcSize > 0) { - /* unknown source size; use the declared stream size and disable writing this size to frame during compression */ + /* unknown source size; use the declared stream size */ CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); - CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 0) ); } (void)srcFileName; From 3982935aefbd6e8c0a8ceb7d54afc5b97f188fc7 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 20 Aug 2019 11:33:33 -0700 Subject: [PATCH 12/16] [fuzz] Improve fuzzer build script and docs * Remove the `make libFuzzer` target since it is broken and obsoleted by `CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer`. The new `-fsanitize=fuzzer` is much better because it works with MSAN by default. * Improve the `./fuzz.py gen` command by making the input type explicit when creating a new target. * Update the `README` for `--enable-fuzzer`. Fixes #1727. --- tests/fuzz/Makefile | 9 -------- tests/fuzz/README.md | 19 ++++++++++------ tests/fuzz/fuzz.py | 53 ++++++++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 8bf16b1fb..08dedd66f 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -113,15 +113,6 @@ zstd_frame_info: $(FUZZ_HEADERS) $(FUZZ_OBJ) zstd_frame_info.o libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c regression_driver.o $(AR) $(FUZZ_ARFLAGS) $@ regression_driver.o -# Install libfuzzer (not usable for MSAN testing) -# Provided for convenience. To use this library run make libFuzzer and -# set LDFLAGS=-L. -.PHONY: libFuzzer -libFuzzer: - @$(RM) -rf Fuzzer - @git clone https://chromium.googlesource.com/chromium/llvm-project/compiler-rt/lib/fuzzer Fuzzer - @cd Fuzzer && ./build.sh - corpora/%_seed_corpus.zip: @mkdir -p corpora $(DOWNLOAD) $@ $(CORPORA_URL_PREFIX)$*_seed_corpus.zip diff --git a/tests/fuzz/README.md b/tests/fuzz/README.md index 9e0bb259a..856a57f82 100644 --- a/tests/fuzz/README.md +++ b/tests/fuzz/README.md @@ -35,6 +35,8 @@ The environment variables can be overridden with the corresponding flags `--cc`, `--cflags`, etc. The specific fuzzing engine is selected with `LIB_FUZZING_ENGINE` or `--lib-fuzzing-engine`, the default is `libregression.a`. +Alternatively, you can use Clang's built in fuzzing engine with +`--enable-fuzzer`. It has flags that can easily set up sanitizers `--enable-{a,ub,m}san`, and coverage instrumentation `--enable-coverage`. It sets sane defaults which can be overridden with flags `--debug`, @@ -51,22 +53,25 @@ The command used to run the fuzzer is printed for debugging. ## LibFuzzer ``` -# Build libfuzzer if necessary -make libFuzzer # Build the fuzz targets -./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan --lib-fuzzing-engine Fuzzer/libFuzzer.a --cc clang --cxx clang++ +./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan --cc clang --cxx clang++ # OR equivalently -CC=clang CXX=clang++ LIB_FUZZING_ENGINE=Fuzzer/libFuzzer.a ./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan # Run the fuzzer -./fuzz.py libfuzzer TARGET -max_len=8192 -jobs=4 +./fuzz.py libfuzzer TARGET ``` where `TARGET` could be `simple_decompress`, `stream_round_trip`, etc. ### MSAN -Fuzzing with `libFuzzer` and `MSAN` will require building a C++ standard library -and libFuzzer with MSAN. +Fuzzing with `libFuzzer` and `MSAN` is as easy as: + +``` +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-msan +./fuzz.py libfuzzer TARGET +``` + `fuzz.py` respects the environment variables / flags `MSAN_EXTRA_CPPFLAGS`, `MSAN_EXTRA_CFLAGS`, `MSAN_EXTRA_CXXFLAGS`, `MSAN_EXTRA_LDFLAGS` to easily pass the extra parameters only for MSAN. diff --git a/tests/fuzz/fuzz.py b/tests/fuzz/fuzz.py index d993209a0..faf8ce8ae 100755 --- a/tests/fuzz/fuzz.py +++ b/tests/fuzz/fuzz.py @@ -24,21 +24,38 @@ def abs_join(a, *p): return os.path.abspath(os.path.join(a, *p)) +class InputType(object): + RAW_DATA = 1 + COMPRESSED_DATA = 2 + + +class FrameType(object): + ZSTD = 1 + BLOCK = 2 + + +class TargetInfo(object): + def __init__(self, input_type, frame_type=FrameType.ZSTD): + self.input_type = input_type + self.frame_type = frame_type + + # Constants FUZZ_DIR = os.path.abspath(os.path.dirname(__file__)) CORPORA_DIR = abs_join(FUZZ_DIR, 'corpora') -TARGETS = [ - 'simple_round_trip', - 'stream_round_trip', - 'block_round_trip', - 'simple_decompress', - 'stream_decompress', - 'block_decompress', - 'dictionary_round_trip', - 'dictionary_decompress', - 'zstd_frame_info', - 'simple_compress', -] +TARGET_INFO = { + 'simple_round_trip': TargetInfo(InputType.RAW_DATA), + 'stream_round_trip': TargetInfo(InputType.RAW_DATA), + 'block_round_trip': TargetInfo(InputType.RAW_DATA, FrameType.BLOCK), + 'simple_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'stream_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'block_decompress': TargetInfo(InputType.COMPRESSED_DATA, FrameType.BLOCK), + 'dictionary_round_trip': TargetInfo(InputType.RAW_DATA), + 'dictionary_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'zstd_frame_info': TargetInfo(InputType.COMPRESSED_DATA), + 'simple_compress': TargetInfo(InputType.RAW_DATA), +} +TARGETS = list(TARGET_INFO.keys()) ALL_TARGETS = TARGETS + ['all'] FUZZ_RNG_SEED_SIZE = 4 @@ -67,7 +84,7 @@ MSAN_EXTRA_LDFLAGS = os.environ.get('MSAN_EXTRA_LDFLAGS', '') def create(r): d = os.path.abspath(r) if not os.path.isdir(d): - os.mkdir(d) + os.makedirs(d) return d @@ -158,7 +175,7 @@ def compiler_version(cc, cxx): assert(b'clang' in cxx_version_bytes) compiler = 'clang' elif b'gcc' in cc_version_bytes: - assert(b'gcc' in cxx_version_bytes) + assert(b'gcc' in cxx_version_bytes or b'g++' in cxx_version_bytes) compiler = 'gcc' if compiler is not None: version_regex = b'([0-9])+\.([0-9])+\.([0-9])+' @@ -699,7 +716,8 @@ def gen(args): '-o{}'.format(decompressed), ] - if 'block_' in args.TARGET: + info = TARGET_INFO[args.TARGET] + if info.frame_type == FrameType.BLOCK: cmd += [ '--gen-blocks', '--max-block-size-log={}'.format(args.max_size_log) @@ -710,10 +728,11 @@ def gen(args): print(' '.join(cmd)) subprocess.check_call(cmd) - if '_round_trip' in args.TARGET: + if info.input_type == InputType.RAW_DATA: print('using decompressed data in {}'.format(decompressed)) samples = decompressed - elif '_decompress' in args.TARGET: + else: + assert info.input_type == InputType.COMPRESSED_DATA print('using compressed data in {}'.format(compressed)) samples = compressed From 07f22d465d0f85aa00f20fc2f0b59a50ddfe494f Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 20 Aug 2019 17:13:04 -0700 Subject: [PATCH 13/16] [legacy] Fix buffer overflow in v0.2 and v0.4 raw literals decompression Extends the fix in PR#1722 to v0.2 and v0.4. These aren't built into zstd by default, and v0.5 onward are not affected. I only add the `srcSize > BLOCKSIZE` check to v0.4 because the comments say that it must hold, but the equivalent comment isn't present in v0.2. Credit to OSS-Fuzz. --- lib/legacy/zstd_v02.c | 1 + lib/legacy/zstd_v04.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c index 793df6024..de0a4bd6b 100644 --- a/lib/legacy/zstd_v02.c +++ b/lib/legacy/zstd_v02.c @@ -2889,6 +2889,7 @@ static size_t ZSTD_decodeLiteralsBlock(void* ctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c index 645a6e313..201ce2b69 100644 --- a/lib/legacy/zstd_v04.c +++ b/lib/legacy/zstd_v04.c @@ -2655,6 +2655,7 @@ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; @@ -3034,9 +3035,12 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; + size_t litCSize; + + if (srcSize > BLOCKSIZE) return ERROR(corruption_detected); /* Decode literals sub-block */ - size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; From b3540507f5239f9d0810c3b9e2b920601687738d Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Wed, 21 Aug 2019 10:27:54 -0700 Subject: [PATCH 14/16] Remove bc from play tests --- tests/playTests.sh | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/tests/playTests.sh b/tests/playTests.sh index c516aa712..b74076763 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -408,6 +408,23 @@ println "compress multiple files including a missing one (notHere) : " $ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!" +println "\n===> stream-size mode" + +./datagen -g11000 > tmp +println "test : basic file compression vs sized streaming compression" +file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst) +stream_size=$(cat tmp | $ZSTD -14 --stream-size=11000 | wc -c) +if [ "$stream_size" -gt "$file_size" ]; then + die "hinted compression larger than expected" +fi +println "test : sized streaming compression and decompression" +cat tmp | $ZSTD -14 -f tmp -o --stream-size=11000 tmp.zst +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" +println "test : incorrect stream size" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" + + println "\n===> dictionary tests " println "- test with raw dict (content only) " @@ -1019,28 +1036,4 @@ test -f dictionary rm -f tmp* dictionary -println "\n===> stream-size mode" - -./datagen -g11000 > tmp -println "test : basic file compression vs sized streaming compression" -$ZSTD -14 -f tmp -o tmp.zst 2>&1 | tee file.out -cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 2>&1 | tee stream_sized.out - -file_ratio=$(cat file.out | awk '{print $4}' | sed 's/%//g') -stream_sized_ratio=$(cat stream_sized.out | awk '{print $4}' | sed 's/%//g') -rm file.out stream_sized.out - -ratio_diff=$(echo $file_ratio - $stream_sized_ratio | bc) -if [ $(echo "(100 * $ratio_diff) > 5" | bc -l) == 1 ] -then - die "greater than 0.05% difference between file and sized-streaming compression" -fi -println "test : sized streaming compression and decompression" -cat tmp | $ZSTD -14 -f tmp -o --stream-size=11000 tmp.zst -$ZSTD -df tmp.zst -o tmp_decompress -cmp tmp tmp_decompress || die "difference between original and decompressed file" -println "test : incorrect stream size" -cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" - - rm -f tmp* From 2cdda8b3c43c23981941cdd97c1a517981d2339e Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Thu, 22 Aug 2019 09:13:28 -0700 Subject: [PATCH 15/16] Minor documentation update --- programs/zstd.1.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/programs/zstd.1.md b/programs/zstd.1.md index e3f729282..ee5db9d63 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -144,6 +144,11 @@ the last one takes effect. Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible. _note_ : at the time of this writing, `--adapt` can remain stuck at low speed when combined with multiple worker threads (>=2). +* `--stream-size=#` : + When handling input from a stream, `zstd` must guess how large the source size + will be when optimizing compression parameters. This option sets the pledged source + size of a stream to eliminate that guesswork. Note that the pledged size must be exact; + incorrect stream sizes will cause an error. * `--rsyncable` : `zstd` will periodically synchronize the compression state to make the compressed file more rsync-friendly. There is a negligible impact to @@ -152,12 +157,6 @@ the last one takes effect. This feature does not work with `--single-thread`. You probably don't want to use it with long range mode, since it will decrease the effectiveness of the synchronization points, but your milage may vary. -* `--stream-size` : - When handling input from a stream, `zstd` must guess how large the source size - will be when optimizing compression parameters. If the stream size is relatively - small, this guess may be a poor one, resulting in a higher compression ratio than - expected. This feature will set the source size of a stream. Note that it must - be exact; incorrect stream sizes will cause an error. * `-D file`: use `file` as Dictionary to compress or decompress FILE(s) * `--no-dictID`: From fd486a846abcbf6c92496dd4915d6a46bd4790c2 Mon Sep 17 00:00:00 2001 From: Nick Magerko Date: Thu, 22 Aug 2019 09:37:47 -0700 Subject: [PATCH 16/16] Differentiate --stream-size from --size-hint --- programs/zstd.1.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/programs/zstd.1.md b/programs/zstd.1.md index ee5db9d63..1bdc42654 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -145,10 +145,10 @@ the last one takes effect. _note_ : at the time of this writing, `--adapt` can remain stuck at low speed when combined with multiple worker threads (>=2). * `--stream-size=#` : - When handling input from a stream, `zstd` must guess how large the source size - will be when optimizing compression parameters. This option sets the pledged source - size of a stream to eliminate that guesswork. Note that the pledged size must be exact; - incorrect stream sizes will cause an error. + Sets the pledged source size of input coming from a stream. This value must be exact, as it + will be included in the produced frame header. Incorrect stream sizes will cause an error. + This information will be used to better optimize compression parameters, resulting in + better and potentially faster compression, especially for smaller source sizes. * `--rsyncable` : `zstd` will periodically synchronize the compression state to make the compressed file more rsync-friendly. There is a negligible impact to