diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md index 7d02426a5..111dd98ae 100644 --- a/doc/zstd_compression_format.md +++ b/doc/zstd_compression_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.3.2 (17/07/19) +0.3.3 (16/08/19) Introduction @@ -358,6 +358,7 @@ It may be followed by an optional `Content_Checksum` __`Block_Type`__ The next 2 bits represent the `Block_Type`. +`Block_Type` influences the meaning of `Block_Size`. There are 4 block types : | Value | 0 | 1 | 2 | 3 | @@ -384,9 +385,12 @@ There are 4 block types : __`Block_Size`__ The upper 21 bits of `Block_Header` represent the `Block_Size`. -`Block_Size` is the size of the block excluding the header. -A block can contain any number of bytes (even zero), up to -`Block_Maximum_Decompressed_Size`, which is the smallest of: +When `Block_Type` is `Compressed_Block` or `Raw_Block`, +`Block_Size` is the size of `Block_Content`, hence excluding `Block_Header`. +When `Block_Type` is `RLE_Block`, `Block_Content`’s size is always 1, +and `Block_Size` represents the number of times this byte must be repeated. 
+A block can contain and decompress into any number of bytes (even zero), +up to `Block_Maximum_Decompressed_Size`, which is the smallest of: - Window_Size - 128 KB @@ -1653,6 +1657,7 @@ or at least provide a meaningful error code explaining for which reason it canno Version changes --------------- +- 0.3.3 : clarifications for field Block_Size - 0.3.2 : remove additional block size restriction on compressed blocks - 0.3.1 : minor clarification regarding offset history update rules - 0.3.0 : minor edits to match RFC8478 diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 3660e9d1c..8308bf5d1 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2001,12 +2001,17 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, /* Sequences Header */ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, dstSize_tooSmall); - if (nbSeq < 0x7F) + if (nbSeq < 128) { *op++ = (BYTE)nbSeq; - else if (nbSeq < LONGNBSEQ) - op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; - else - op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } assert(op <= oend); if (nbSeq==0) { /* Copy the old tables over as if we repeated them */ diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index ee21ee1a9..4a263d822 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -571,7 +571,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length) unsigned const prime1 = 2654435761U; unsigned const prime2 = 2246822519U; unsigned acc = prime1; - size_t p=0;; + size_t p=0; for (p=0; p> 21); diff --git a/lib/legacy/zstd_v02.c b/lib/legacy/zstd_v02.c index 793df6024..de0a4bd6b 100644 --- a/lib/legacy/zstd_v02.c +++ b/lib/legacy/zstd_v02.c @@ -2889,6 +2889,7 @@ static size_t ZSTD_decodeLiteralsBlock(void* 
ctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; diff --git a/lib/legacy/zstd_v04.c b/lib/legacy/zstd_v04.c index 645a6e313..201ce2b69 100644 --- a/lib/legacy/zstd_v04.c +++ b/lib/legacy/zstd_v04.c @@ -2655,6 +2655,7 @@ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const size_t litSize = (MEM_readLE32(istart) & 0xFFFFFF) >> 2; /* no buffer issue : srcSize >= MIN_CBLOCK_SIZE */ if (litSize > srcSize-11) /* risk of reading too far with wildcopy */ { + if (litSize > BLOCKSIZE) return ERROR(corruption_detected); if (litSize > srcSize-3) return ERROR(corruption_detected); memcpy(dctx->litBuffer, istart, litSize); dctx->litPtr = dctx->litBuffer; @@ -3034,9 +3035,12 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; + size_t litCSize; + + if (srcSize > BLOCKSIZE) return ERROR(corruption_detected); /* Decode literals sub-block */ - size_t litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; diff --git a/programs/dibio.c b/programs/dibio.c index 12eb32680..ea4bb4bf1 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -201,7 +201,7 @@ static void DiB_fillNoise(void* buffer, size_t length) unsigned const prime1 = 2654435761U; unsigned const prime2 = 2246822519U; unsigned acc = prime1; - size_t p=0;; + size_t p=0; for (p=0; pldmMinMatch = 0; ret->ldmBucketSizeLog = FIO_LDM_PARAM_NOTSET; ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET; + ret->streamSrcSize = 0; ret->targetCBlockSize = 0; ret->srcSizeHint = 0; 
ret->literalCompressionMode = ZSTD_lcm_auto; @@ -421,6 +423,10 @@ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable) { prefs->rsyncable = rsyncable; } +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize) { + prefs->streamSrcSize = streamSrcSize; +} + void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize) { prefs->targetCBlockSize = targetCBlockSize; } @@ -640,7 +646,6 @@ typedef struct { static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, const char* dictFileName, int cLevel, - U64 srcSize, ZSTD_compressionParameters comprParams) { cRess_t ress; memset(&ress, 0, sizeof(ress)); @@ -707,10 +712,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) ); #endif /* dictionary */ - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, srcSize) ); /* set the value temporarily for dictionary loading, to adapt compression parameters */ CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) ); - CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, ZSTD_CONTENTSIZE_UNKNOWN) ); /* reset */ - free(dictBuffer); } @@ -1012,6 +1014,9 @@ FIO_compressZstdFrame(FIO_prefs_t* const prefs, /* init */ if (fileSize != UTIL_FILESIZE_UNKNOWN) { CHECK(ZSTD_CCtx_setPledgedSrcSize(ress.cctx, fileSize)); + } else if (prefs->streamSrcSize > 0) { + /* unknown source size; use the declared stream size */ + CHECK( ZSTD_CCtx_setPledgedSrcSize(ress.cctx, prefs->streamSrcSize) ); } (void)srcFileName; @@ -1370,10 +1375,7 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, const char* dictFileName, int compressionLevel, ZSTD_compressionParameters comprParams) { - U64 const fileSize = UTIL_getFileSize(srcFileName); - U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 
ZSTD_CONTENTSIZE_UNKNOWN : fileSize; - - cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t const ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel); @@ -1424,10 +1426,7 @@ int FIO_compressMultipleFilenames(FIO_prefs_t* const prefs, ZSTD_compressionParameters comprParams) { int error = 0; - U64 const firstFileSize = UTIL_getFileSize(inFileNamesTable[0]); - U64 const firstSrcSize = (firstFileSize == UTIL_FILESIZE_UNKNOWN) ? ZSTD_CONTENTSIZE_UNKNOWN : firstFileSize; - U64 const srcSize = (nbFiles != 1) ? ZSTD_CONTENTSIZE_UNKNOWN : firstSrcSize ; - cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, srcSize, comprParams); + cRess_t ress = FIO_createCResources(prefs, dictFileName, compressionLevel, comprParams); /* init */ assert(outFileName != NULL || suffix != NULL); diff --git a/programs/fileio.h b/programs/fileio.h index fd49a749d..096d90b5c 100644 --- a/programs/fileio.h +++ b/programs/fileio.h @@ -71,6 +71,7 @@ void FIO_setOverlapLog(FIO_prefs_t* const prefs, int overlapLog); void FIO_setRemoveSrcFile(FIO_prefs_t* const prefs, unsigned flag); void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no sparse; 1: disable on stdout; 2: always enabled */ void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable); +void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize); void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize); void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint); void FIO_setLiteralCompressionMode( diff --git a/programs/zstd.1.md b/programs/zstd.1.md index f8349fa80..dff4d9eac 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -144,6 +144,11 @@ the last one takes effect. 
Due to the chaotic nature of dynamic adaptation, compressed result is not reproducible. _note_ : at the time of this writing, `--adapt` can remain stuck at low speed when combined with multiple worker threads (>=2). +* `--stream-size=#` : + Sets the pledged source size of input coming from a stream. This value must be exact, as it + will be included in the produced frame header. Incorrect stream sizes will cause an error. + This information will be used to better optimize compression parameters, resulting in + better and potentially faster compression, especially for smaller source sizes. * `--size-hint=#`: When handling input from a stream, `zstd` must guess how large the source size will be when optimizing compression parameters. If the stream size is relatively diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 98b9ffb90..98df728a9 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -141,6 +141,7 @@ static int usage_advanced(const char* programName) DISPLAY( "--long[=#]: enable long distance matching with given window log (default: %u)\n", g_defaultMaxWindowLog); DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1); DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n"); + DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n"); DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n"); DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n"); #ifdef ZSTD_MULTITHREAD @@ -589,6 +590,7 @@ int main(int argCount, const char* argv[]) const char* suffix = ZSTD_EXTENSION; unsigned maxDictSize = g_defaultMaxDictSize; unsigned dictID = 0; + size_t streamSrcSize = 0; size_t targetCBlockSize = 0; size_t srcSizeHint = 0; int dictCLevel = g_defaultDictCLevel; @@ -747,6 +749,7 @@ int main(int argCount, const char* argv[]) if (longCommandWArg(&argument, "--maxdict=")) { 
maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; } + if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--long")) { @@ -1153,6 +1156,7 @@ int main(int argCount, const char* argv[]) FIO_setAdaptMin(prefs, adaptMin); FIO_setAdaptMax(prefs, adaptMax); FIO_setRsyncable(prefs, rsyncable); + FIO_setStreamSrcSize(prefs, streamSrcSize); FIO_setTargetCBlockSize(prefs, targetCBlockSize); FIO_setSrcSizeHint(prefs, srcSizeHint); FIO_setLiteralCompressionMode(prefs, literalCompressionMode); @@ -1164,7 +1168,7 @@ int main(int argCount, const char* argv[]) else operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams); #else - (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */ + (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */ DISPLAY("Compression not supported \n"); #endif } else { /* decompression or test */ diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 8bf16b1fb..08dedd66f 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -113,15 +113,6 @@ zstd_frame_info: $(FUZZ_HEADERS) $(FUZZ_OBJ) 
zstd_frame_info.o libregression.a: $(FUZZ_HEADERS) $(PRGDIR)/util.h $(PRGDIR)/util.c regression_driver.o $(AR) $(FUZZ_ARFLAGS) $@ regression_driver.o -# Install libfuzzer (not usable for MSAN testing) -# Provided for convenience. To use this library run make libFuzzer and -# set LDFLAGS=-L. -.PHONY: libFuzzer -libFuzzer: - @$(RM) -rf Fuzzer - @git clone https://chromium.googlesource.com/chromium/llvm-project/compiler-rt/lib/fuzzer Fuzzer - @cd Fuzzer && ./build.sh - corpora/%_seed_corpus.zip: @mkdir -p corpora $(DOWNLOAD) $@ $(CORPORA_URL_PREFIX)$*_seed_corpus.zip diff --git a/tests/fuzz/README.md b/tests/fuzz/README.md index 9e0bb259a..856a57f82 100644 --- a/tests/fuzz/README.md +++ b/tests/fuzz/README.md @@ -35,6 +35,8 @@ The environment variables can be overridden with the corresponding flags `--cc`, `--cflags`, etc. The specific fuzzing engine is selected with `LIB_FUZZING_ENGINE` or `--lib-fuzzing-engine`, the default is `libregression.a`. +Alternatively, you can use Clang's built-in fuzzing engine with +`--enable-fuzzer`. It has flags that can easily set up sanitizers `--enable-{a,ub,m}san`, and coverage instrumentation `--enable-coverage`. It sets sane defaults which can be overridden with flags `--debug`, @@ -51,22 +53,25 @@ The command used to run the fuzzer is printed for debugging. 
## LibFuzzer ``` -# Build libfuzzer if necessary -make libFuzzer # Build the fuzz targets -./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan --lib-fuzzing-engine Fuzzer/libFuzzer.a --cc clang --cxx clang++ +./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan --cc clang --cxx clang++ # OR equivalently -CC=clang CXX=clang++ LIB_FUZZING_ENGINE=Fuzzer/libFuzzer.a ./fuzz.py build all --enable-coverage --enable-asan --enable-ubsan +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-asan --enable-ubsan # Run the fuzzer -./fuzz.py libfuzzer TARGET -max_len=8192 -jobs=4 +./fuzz.py libfuzzer TARGET ``` where `TARGET` could be `simple_decompress`, `stream_round_trip`, etc. ### MSAN -Fuzzing with `libFuzzer` and `MSAN` will require building a C++ standard library -and libFuzzer with MSAN. +Fuzzing with `libFuzzer` and `MSAN` is as easy as: + +``` +CC=clang CXX=clang++ ./fuzz.py build all --enable-fuzzer --enable-msan +./fuzz.py libfuzzer TARGET +``` + `fuzz.py` respects the environment variables / flags `MSAN_EXTRA_CPPFLAGS`, `MSAN_EXTRA_CFLAGS`, `MSAN_EXTRA_CXXFLAGS`, `MSAN_EXTRA_LDFLAGS` to easily pass the extra parameters only for MSAN. 
diff --git a/tests/fuzz/fuzz.py b/tests/fuzz/fuzz.py index d993209a0..faf8ce8ae 100755 --- a/tests/fuzz/fuzz.py +++ b/tests/fuzz/fuzz.py @@ -24,21 +24,38 @@ def abs_join(a, *p): return os.path.abspath(os.path.join(a, *p)) +class InputType(object): + RAW_DATA = 1 + COMPRESSED_DATA = 2 + + +class FrameType(object): + ZSTD = 1 + BLOCK = 2 + + +class TargetInfo(object): + def __init__(self, input_type, frame_type=FrameType.ZSTD): + self.input_type = input_type + self.frame_type = frame_type + + # Constants FUZZ_DIR = os.path.abspath(os.path.dirname(__file__)) CORPORA_DIR = abs_join(FUZZ_DIR, 'corpora') -TARGETS = [ - 'simple_round_trip', - 'stream_round_trip', - 'block_round_trip', - 'simple_decompress', - 'stream_decompress', - 'block_decompress', - 'dictionary_round_trip', - 'dictionary_decompress', - 'zstd_frame_info', - 'simple_compress', -] +TARGET_INFO = { + 'simple_round_trip': TargetInfo(InputType.RAW_DATA), + 'stream_round_trip': TargetInfo(InputType.RAW_DATA), + 'block_round_trip': TargetInfo(InputType.RAW_DATA, FrameType.BLOCK), + 'simple_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'stream_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'block_decompress': TargetInfo(InputType.COMPRESSED_DATA, FrameType.BLOCK), + 'dictionary_round_trip': TargetInfo(InputType.RAW_DATA), + 'dictionary_decompress': TargetInfo(InputType.COMPRESSED_DATA), + 'zstd_frame_info': TargetInfo(InputType.COMPRESSED_DATA), + 'simple_compress': TargetInfo(InputType.RAW_DATA), +} +TARGETS = list(TARGET_INFO.keys()) ALL_TARGETS = TARGETS + ['all'] FUZZ_RNG_SEED_SIZE = 4 @@ -67,7 +84,7 @@ MSAN_EXTRA_LDFLAGS = os.environ.get('MSAN_EXTRA_LDFLAGS', '') def create(r): d = os.path.abspath(r) if not os.path.isdir(d): - os.mkdir(d) + os.makedirs(d) return d @@ -158,7 +175,7 @@ def compiler_version(cc, cxx): assert(b'clang' in cxx_version_bytes) compiler = 'clang' elif b'gcc' in cc_version_bytes: - assert(b'gcc' in cxx_version_bytes) + assert(b'gcc' in cxx_version_bytes or b'g++' in 
cxx_version_bytes) compiler = 'gcc' if compiler is not None: version_regex = b'([0-9])+\.([0-9])+\.([0-9])+' @@ -699,7 +716,8 @@ def gen(args): '-o{}'.format(decompressed), ] - if 'block_' in args.TARGET: + info = TARGET_INFO[args.TARGET] + if info.frame_type == FrameType.BLOCK: cmd += [ '--gen-blocks', '--max-block-size-log={}'.format(args.max_size_log) @@ -710,10 +728,11 @@ def gen(args): print(' '.join(cmd)) subprocess.check_call(cmd) - if '_round_trip' in args.TARGET: + if info.input_type == InputType.RAW_DATA: print('using decompressed data in {}'.format(decompressed)) samples = decompressed - elif '_decompress' in args.TARGET: + else: + assert info.input_type == InputType.COMPRESSED_DATA print('using compressed data in {}'.format(compressed)) samples = compressed diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 2de7c0096..4788c356a 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -401,7 +401,7 @@ static int basicUnitTests(U32 const seed, double compressibility) DISPLAYLEVEL(3, "test%3i : check decompressed result : ", testNb++); { size_t u; for (u=0; u simple tests " ./datagen > tmp @@ -409,6 +408,23 @@ println "compress multiple files including a missing one (notHere) : " $ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!" 
+println "\n===> stream-size mode" + +./datagen -g11000 > tmp +println "test : basic file compression vs sized streaming compression" +file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst) +stream_size=$(cat tmp | $ZSTD -14 --stream-size=11000 | wc -c) +if [ "$stream_size" -gt "$file_size" ]; then + die "hinted compression larger than expected" +fi +println "test : sized streaming compression and decompression" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11000 +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" +println "test : incorrect stream size" +cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" + + println "\n===> size-hint mode" ./datagen -g11000 > tmp diff --git a/tests/zbufftest.c b/tests/zbufftest.c index 8cbde3f4f..944148262 100644 --- a/tests/zbufftest.c +++ b/tests/zbufftest.c @@ -184,7 +184,7 @@ static int basicUnitTests(U32 seed, double compressibility, ZSTD_customMem custo DISPLAYLEVEL(4, "test%3i : check decompressed result : ", testNb++); { size_t i; for (i=0; i