From 3732a08f5b82ed87a744e65daa2f11f77dabe954 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 5 Jun 2023 16:03:00 -0700
Subject: [PATCH] fixed decoder behavior when nbSeqs==0 is encoded using 2
 bytes

The sequence section starts with a number, which tells how sequences are present in the section.
If this number if 0, the section automatically ends.

The number 0 can be represented using the 1 byte or the 2 bytes formats.
That's because the 2-bytes formats fully overlaps the 1 byte format.

However, when 0 is represented using the 2-bytes format,
the decoder was expecting the sequence section to continue,
and was looking for FSE tables, which is incorrect.

Fixed this behavior, in both the reference decoder and the educational behavior.

In practice, this behavior never happens,
because the encoder will always select the 1-byte format to represent 0,
since this is more efficient.

Completed the fix with a new golden sample for tests,
a clarification of the specification,
and a decoder errata paragraph.
---
 doc/decompressor_errata.md                |  23 ++++++++++++++++++++++
 doc/educational_decoder/zstd_decompress.c |  13 ++++++------
 doc/zstd_compression_format.md            |  12 ++++++-----
 lib/common/zstd_internal.h                |   4 ++--
 lib/decompress/zstd_decompress_block.c    |  10 +++++-----
 tests/golden-decompression/zeroSeq_2B.zst | Bin 0 -> 25 bytes
 6 files changed, 44 insertions(+), 18 deletions(-)
 create mode 100644 tests/golden-decompression/zeroSeq_2B.zst

diff --git a/doc/decompressor_errata.md b/doc/decompressor_errata.md
index e170a62c1..83d4071cb 100644
--- a/doc/decompressor_errata.md
+++ b/doc/decompressor_errata.md
@@ -12,6 +12,26 @@ Each entry will contain:
 The document is in reverse chronological order, with the bugs that affect the most recent zstd decompressor versions listed first.
 
 
+No sequence using the 2-bytes format
+------------------------------------------------
+
+**Last affected version**: v1.5.5
+
+**Affected decompressor component(s)**: Library & CLI
+
+**Produced by the reference compressor**: No
+
+**Example Frame**: see zstd/tests/golden-decompression/zeroSeq_2B.zst
+
+The zstd decoder incorrectly expects FSE tables when there are 0 sequences present in the block
+if the value 0 is encoded using the 2-bytes format.
+Instead, it should immediately end the sequence section, and move on to next block.
+
+This situation was never generated by the reference compressor,
+because representing 0 sequences with the 2-bytes format is inefficient
+(the 1-byte format is always used in this case).
+
+
 Compressed block with a size of exactly 128 KB
 ------------------------------------------------
 
@@ -32,6 +52,7 @@ These blocks used to be disallowed by the spec up until spec version 0.3.2 when
 
 > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block).
 
+
 Compressed block with 0 literals and 0 sequences
 ------------------------------------------------
 
@@ -51,6 +72,7 @@ Additionally, these blocks were disallowed by the spec up until spec version 0.3
 
 > A Compressed_Block has the extra restriction that Block_Size is always strictly less than the decompressed size. If this condition cannot be respected, the block must be sent uncompressed instead (Raw_Block).
 
+
 First block is RLE block
 ------------------------
 
@@ -72,6 +94,7 @@ block.
 
 https://github.com/facebook/zstd/blob/8814aa5bfa74f05a86e55e9d508da177a893ceeb/lib/compress/zstd_compress.c#L3527-L3535
 
+
 Tiny FSE Table & Block
 ----------------------
 
diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c
index 1da7c528d..921c8f54c 100644
--- a/doc/educational_decoder/zstd_decompress.c
+++ b/doc/educational_decoder/zstd_decompress.c
@@ -1018,12 +1018,7 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
     // This is a variable size field using between 1 and 3 bytes. Let's call its
     // first byte byte0."
     u8 header = IO_read_bits(in, 8);
-    if (header == 0) {
-        // "There are no sequences. The sequence section stops there.
-        // Regenerated content is defined entirely by literals section."
-        *sequences = NULL;
-        return 0;
-    } else if (header < 128) {
+    if (header < 128) {
         // "Number_of_Sequences = byte0 . Uses 1 byte."
         num_sequences = header;
     } else if (header < 255) {
@@ -1034,6 +1029,12 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
         num_sequences = IO_read_bits(in, 16) + 0x7F00;
     }
 
+    if (num_sequences == 0) {
+        // "There are no sequences. The sequence section stops there."
+        *sequences = NULL;
+        return 0;
+    }
+
     *sequences = malloc(num_sequences * sizeof(sequence_command_t));
     if (!*sequences) {
         BAD_ALLOC();
diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md
index 2a69c4c30..cd7308de1 100644
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -16,7 +16,7 @@ Distribution of this document is unlimited.
 
 ### Version
 
-0.3.9 (2023-03-08)
+0.4.0 (2023-06-05)
 
 
 Introduction
@@ -650,15 +650,16 @@ __`Number_of_Sequences`__
 
 This is a variable size field using between 1 and 3 bytes.
 Let's call its first byte `byte0`.
-- `if (byte0 == 0)` : there are no sequences.
-            The sequence section stops there.
-            Decompressed content is defined entirely as Literals Section content.
-            The FSE tables used in `Repeat_Mode` aren't updated.
 - `if (byte0 < 128)` : `Number_of_Sequences = byte0` . Uses 1 byte.
 - `if (byte0 < 255)` : `Number_of_Sequences = ((byte0 - 0x80) << 8) + byte1`. Uses 2 bytes.
             Note that the 2 bytes format fully overlaps the 1 byte format.
 - `if (byte0 == 255)`: `Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00`. Uses 3 bytes.
 
+`if (Number_of_Sequences == 0)` : there are no sequences.
+            The sequence section stops immediately,
+            FSE tables used in `Repeat_Mode` aren't updated.
+            Block's decompressed content is defined solely by the Literals Section content.
+
 __Symbol compression modes__
 
 This is a single byte, defining the compression mode of each symbol type.
@@ -1698,6 +1699,7 @@ or at least provide a meaningful error code explaining for which reason it canno
 
 Version changes
 ---------------
+- 0.4.0 : fixed imprecise behavior for nbSeq==0, detected by Igor Pavlov
 - 0.3.9 : clarifications for Huffman-compressed literal sizes.
 - 0.3.8 : clarifications for Huffman Blocks and Huffman Tree descriptions.
 - 0.3.7 : clarifications for Repeat_Offsets, matching RFC8878
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 1f942f27b..f7c57a028 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -366,13 +366,13 @@ typedef struct {
 
 /*! ZSTD_getcBlockSize() :
  *  Provides the size of compressed block from block header `src` */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: decompress, fullbench */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr);
 
 /*! ZSTD_decodeSeqHeaders() :
  *  decode sequence header from src */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: zstd_decompress_block, fullbench */
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                        const void* src, size_t srcSize);
 
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 5028a52f1..c63d06d6a 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -706,11 +706,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
 
     /* SeqHead */
     nbSeq = *ip++;
-    if (!nbSeq) {
-        *nbSeqPtr=0;
-        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
-        return 1;
-    }
     if (nbSeq > 0x7F) {
         if (nbSeq == 0xFF) {
             RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
@@ -723,6 +718,11 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     }
     *nbSeqPtr = nbSeq;
 
+    if (nbSeq == 0) {
+        /* No sequence : section ends immediately */
+        return (size_t)(ip - istart);
+    }
+
     /* FSE table descriptors */
     RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
diff --git a/tests/golden-decompression/zeroSeq_2B.zst b/tests/golden-decompression/zeroSeq_2B.zst
new file mode 100644
index 0000000000000000000000000000000000000000..f9f3520a6eb823709594cbe57df3c1b497984f48
GIT binary patch
literal 25
gcmdPcs{faPp_PFl!y`2(Cto2vzbGd~k*k3L0BK<dz5oCK

literal 0
HcmV?d00001