diff --git a/doc/decompressor_errata.md b/doc/decompressor_errata.md index e170a62c1..83d4071cb 100644 --- a/doc/decompressor_errata.md +++ b/doc/decompressor_errata.md @@ -12,6 +12,26 @@ Each entry will contain: The document is in reverse chronological order, with the bugs that affect the most recent zstd decompressor versions listed first. +No sequence using the 2-bytes format +------------------------------------------------ + +**Last affected version**: v1.5.5 + +**Affected decompressor component(s)**: Library & CLI + +**Produced by the reference compressor**: No + +**Example Frame**: see zstd/tests/golden-decompression/zeroSeq_2B.zst + +The zstd decoder incorrectly expects FSE tables when there are 0 sequences present in the block +if the value 0 is encoded using the 2-bytes format. +Instead, it should immediately end the sequence section, and move on to next block. + +This situation was never generated by the reference compressor, +because representing 0 sequences with the 2-bytes format is inefficient +(the 1-byte format is always used in this case). + + Compressed block with a size of exactly 128 KB ------------------------------------------------ @@ -32,6 +52,7 @@ These blocks used to be disallowed by the spec up until spec version 0.3.2 when > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block). + Compressed block with 0 literals and 0 sequences ------------------------------------------------ @@ -51,6 +72,7 @@ Additionally, these blocks were disallowed by the spec up until spec version 0.3 > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block). + First block is RLE block ------------------------ @@ -72,6 +94,7 @@ block. https://github.com/facebook/zstd/blob/8814aa5bfa74f05a86e55e9d508da177a893ceeb/lib/compress/zstd_compress.c#L3527-L3535 + Tiny FSE Table & Block ---------------------- diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c index 1da7c528d..921c8f54c 100644 --- a/doc/educational_decoder/zstd_decompress.c +++ b/doc/educational_decoder/zstd_decompress.c @@ -1018,12 +1018,7 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in, // This is a variable size field using between 1 and 3 bytes. Let's call its // first byte byte0." u8 header = IO_read_bits(in, 8); - if (header == 0) { - // "There are no sequences. The sequence section stops there. - // Regenerated content is defined entirely by literals section." - *sequences = NULL; - return 0; - } else if (header < 128) { + if (header < 128) { // "Number_of_Sequences = byte0 . Uses 1 byte." num_sequences = header; } else if (header < 255) { @@ -1034,6 +1029,12 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in, num_sequences = IO_read_bits(in, 16) + 0x7F00; } + if (num_sequences == 0) { + // "There are no sequences. The sequence section stops there." + *sequences = NULL; + return 0; + } + *sequences = malloc(num_sequences * sizeof(sequence_command_t)); if (!*sequences) { BAD_ALLOC(); diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md index 2a69c4c30..cd7308de1 100644 --- a/doc/zstd_compression_format.md +++ b/doc/zstd_compression_format.md @@ -16,7 +16,7 @@ Distribution of this document is unlimited. ### Version -0.3.9 (2023-03-08) +0.4.0 (2023-06-05) Introduction @@ -650,15 +650,16 @@ __`Number_of_Sequences`__ This is a variable size field using between 1 and 3 bytes. Let's call its first byte `byte0`. -- `if (byte0 == 0)` : there are no sequences. - The sequence section stops there. - Decompressed content is defined entirely as Literals Section content. - The FSE tables used in `Repeat_Mode` aren't updated. - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte. - `if (byte0 < 255)` : `Number_of_Sequences = ((byte0 - 0x80) << 8) + byte1`. Uses 2 bytes. Note that the 2 bytes format fully overlaps the 1 byte format. - `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00`. Uses 3 bytes. +`if (Number_of_Sequences == 0)` : there are no sequences. + The sequence section stops immediately, + FSE tables used in `Repeat_Mode` aren't updated. + Block's decompressed content is defined solely by the Literals Section content. + __Symbol compression modes__ This is a single byte, defining the compression mode of each symbol type. @@ -1698,6 +1699,7 @@ or at least provide a meaningful error code explaining for which reason it canno Version changes --------------- +- 0.4.0 : fixed imprecise behavior for nbSeq==0, detected by Igor Pavlov - 0.3.9 : clarifications for Huffman-compressed literal sizes. - 0.3.8 : clarifications for Huffman Blocks and Huffman Tree descriptions. - 0.3.7 : clarifications for Repeat_Offsets, matching RFC8878 diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index 1f942f27b..f7c57a028 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -366,13 +366,13 @@ typedef struct { /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: decompress, fullbench */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr); /*! ZSTD_decodeSeqHeaders() : * decode sequence header from src */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: zstd_decompress_block, fullbench */ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 5028a52f1..c63d06d6a 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -706,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, /* SeqHead */ nbSeq = *ip++; - if (!nbSeq) { - *nbSeqPtr=0; - RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, ""); - return 1; - } if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); @@ -723,6 +718,11 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, } *nbSeqPtr = nbSeq; + if (nbSeq == 0) { + /* No sequence : section ends immediately */ + return (size_t)(ip - istart); + } + /* FSE table descriptors */ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); diff --git a/tests/golden-decompression/zeroSeq_2B.zst b/tests/golden-decompression/zeroSeq_2B.zst new file mode 100644 index 000000000..f9f3520a6 Binary files /dev/null and b/tests/golden-decompression/zeroSeq_2B.zst differ