From 7083f790084be3f8b9b58c365e55c3a1df7de236 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 5 Oct 2020 15:17:44 -0700 Subject: [PATCH 1/6] [bug] Fix dictContentType when reprocessing cdict Conditions to trigger: * CDict is loaded as raw content. * CDict starts with the zstd dictionary magic number. * The CDict is reprocessed (not attached or copied). * The new API is used (streaming or `ZSTD_compress2()`). Bug: The dictionary is loaded as a zstd dictionary, not a raw content dictionary, because the dict content type is set to `ZSTD_dct_auto`. Fix: Pass in the dictionary content type from cdict creation to the call to `ZSTD_compress_insertDictionary()`. Test: Added a test case that exposes the bug, and fixed the raw content tests to not modify the `dictBuffer`, which makes all future tests with the `dictBuffer` raw content, which doesn't seem intentional. --- lib/compress/zstd_compress.c | 4 ++- tests/fuzzer.c | 65 +++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index ea4b04aab..9469fcf39 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -64,6 +64,7 @@ size_t ZSTD_compressBound(size_t srcSize) { struct ZSTD_CDict_s { const void* dictContent; size_t dictContentSize; + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ ZSTD_cwksp workspace; ZSTD_matchState_t matchState; @@ -3188,7 +3189,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, - cdict->dictContentSize, dictContentType, dtlm, + cdict->dictContentSize, cdict->dictContentType, dtlm, cctx->entropyWorkspace) : ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, @@ -3460,6 +3461,7 @@ static size_t ZSTD_initCDict_internal( ZSTD_memcpy(internalBuffer, dictBuffer, dictSize); } cdict->dictContentSize = dictSize; + cdict->dictContentType = dictContentType; cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 8b10078ac..27a7bde12 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -1971,34 +1971,53 @@ static int basicUnitTests(U32 const seed, double compressibility) } DISPLAYLEVEL(3, "OK \n"); - DISPLAYLEVEL(3, "test%3i : Loading rawContent starting with dict header w/ ZSTD_dct_auto should fail : ", testNb++); - { - size_t ret; - MEM_writeLE32((char*)dictBuffer+2, ZSTD_MAGIC_DICTIONARY); - /* Either operation is allowed to fail, but one must fail. */ - ret = ZSTD_CCtx_loadDictionary_advanced( - cctx, (const char*)dictBuffer+2, dictSize-2, ZSTD_dlm_byRef, ZSTD_dct_auto); - if (!ZSTD_isError(ret)) { - ret = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, MIN(CNBuffSize, 100)); - if (!ZSTD_isError(ret)) goto _output_error; - } - } - DISPLAYLEVEL(3, "OK \n"); + { char* rawDictBuffer = (char*)malloc(dictSize); + assert(rawDictBuffer); + memcpy(rawDictBuffer, (char*)dictBuffer + 2, dictSize - 2); + memset(rawDictBuffer + dictSize - 2, 0, 2); + MEM_writeLE32((char*)rawDictBuffer, ZSTD_MAGIC_DICTIONARY); - DISPLAYLEVEL(3, "test%3i : Loading rawContent starting with dict header w/ ZSTD_dct_rawContent should pass : ", testNb++); - { - size_t ret; - MEM_writeLE32((char*)dictBuffer+2, ZSTD_MAGIC_DICTIONARY); - ret = ZSTD_CCtx_loadDictionary_advanced( - cctx, (const char*)dictBuffer+2, dictSize-2, ZSTD_dlm_byRef, ZSTD_dct_rawContent); - if (ZSTD_isError(ret)) goto _output_error; - ret = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, MIN(CNBuffSize, 100)); - if (ZSTD_isError(ret)) goto _output_error; + DISPLAYLEVEL(3, "test%3i : Loading rawContent starting with dict header w/ ZSTD_dct_auto should fail : ", testNb++); + { + size_t ret; + /* Either operation is allowed to fail, but one must fail. */ + ret = ZSTD_CCtx_loadDictionary_advanced( + cctx, (const char*)rawDictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); + if (!ZSTD_isError(ret)) { + ret = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, MIN(CNBuffSize, 100)); + if (!ZSTD_isError(ret)) goto _output_error; + } + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3i : Loading rawContent starting with dict header w/ ZSTD_dct_rawContent should pass : ", testNb++); + { + size_t ret; + ret = ZSTD_CCtx_loadDictionary_advanced( + cctx, (const char*)rawDictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent); + if (ZSTD_isError(ret)) goto _output_error; + ret = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, MIN(CNBuffSize, 100)); + if (ZSTD_isError(ret)) goto _output_error; + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3i : Testing non-attached CDict with ZSTD_dct_rawContent : ", testNb++); + { size_t const srcSize = MIN(CNBuffSize, 100); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters); + /* Force the dictionary to be reloaded in raw content mode */ + CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, ZSTD_dictForceLoad)); + CHECK_Z(ZSTD_CCtx_loadDictionary_advanced(cctx, rawDictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent)); + cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, srcSize); + CHECK_Z(cSize); + } + DISPLAYLEVEL(3, "OK \n"); + + free(rawDictBuffer); } - DISPLAYLEVEL(3, "OK \n"); DISPLAYLEVEL(3, "test%3i : ZSTD_CCtx_refCDict() then set parameters : ", testNb++); { ZSTD_CDict* const cdict = ZSTD_createCDict(CNBuffer, dictSize, 1); + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters); CHECK_Z( ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 1) ); CHECK_Z( ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, 12 )); CHECK_Z( ZSTD_CCtx_refCDict(cctx, cdict) ); From 012818df998993c0c20264c9c92c5bcdfd9fbe52 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 5 Oct 2020 15:46:50 -0700 Subject: [PATCH 2/6] [refactor] Remove ZSTD_resetCStream_internal() This function is only called in one place. It isn't a logical separation of duties, and it was only obsfucating the code now, so inline it. --- lib/compress/zstd_compress.c | 47 +++++++++++------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 9469fcf39..3e0339683 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -3793,34 +3793,6 @@ size_t ZSTD_CStreamOutSize(void) return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; } -static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, - const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, - const ZSTD_CDict* const cdict, - ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) -{ - DEBUGLOG(4, "ZSTD_resetCStream_internal"); - /* Finalize the compression parameters */ - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); - /* params are supposed to be fully validated at this point */ - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); - assert(!((dict) && (cdict))); /* either dict or cdict, not both */ - - FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, - dict, dictSize, dictContentType, ZSTD_dtlm_fast, - cdict, - ¶ms, pledgedSrcSize, - ZSTDb_buffered) , ""); - - cctx->inToCompress = 0; - cctx->inBuffPos = 0; - cctx->inBuffTarget = cctx->blockSize - + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */ - cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; - cctx->streamStage = zcss_load; - cctx->frameEnded = 0; - return 0; /* ready to go */ -} - /* ZSTD_resetCStream(): * pledgedSrcSize == 0 means "unknown" */ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) @@ -4162,12 +4134,21 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, cctx->appliedParams.nbWorkers = params.nbWorkers; } else #endif - { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx, - prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, - cctx->cdict, - params, cctx->pledgedSrcSizePlusOne-1) , ""); - assert(cctx->streamStage == zcss_load); + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast, + cctx->cdict, + ¶ms, pledgedSrcSize, + ZSTDb_buffered) , ""); assert(cctx->appliedParams.nbWorkers == 0); + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */ + cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; } } /* end of transparent initialization stage */ From 48ef15fb47395dcb57900cd7c247f2dd5af2d5cd Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 5 Oct 2020 15:49:55 -0700 Subject: [PATCH 3/6] [minor improvement] Pass dictSize when selecting parameters When selecting parameters in streaming compression with a dictionary use the dictionary size to select the parameters. --- lib/compress/zstd_compress.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 3e0339683..b0f0e9eaa 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -4109,7 +4109,9 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + cctx->prefixDict.dict ? cctx->prefixDict.dictSize : (cctx->cdict ? cctx->cdict->dictContentSize : 0)); #ifdef ZSTD_MULTITHREAD From fadaab8c7c00f6b1589c207e1623ffee7a21e9e7 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 5 Oct 2020 15:38:10 -0700 Subject: [PATCH 4/6] [minor improvement] Pass 0 as the content size in the DDS The DDS structure can't be copied into the working tables like the DMS. So it doesn't need to account for the source size when sizing its parameters, just the dictionary size. --- lib/compress/zstd_compress.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b0f0e9eaa..09ec152f3 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -878,7 +878,6 @@ ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long lo static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams( int const compressionLevel, - unsigned long long srcSizeHint, size_t const dictSize); static int ZSTD_dedicatedDictSearch_isSupported( const ZSTD_compressionParameters* cParams); @@ -3559,7 +3558,7 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( if (cctxParams.enableDedicatedDictSearch) { cParams = ZSTD_dedicatedDictSearch_getCParams( - cctxParams.compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + cctxParams.compressionLevel, dictSize); ZSTD_overrideCParams(&cParams, &cctxParams.cParams); } else { cParams = ZSTD_getCParamsFromCCtxParams( @@ -4362,9 +4361,9 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV }, }; -static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, unsigned long long srcSizeHint, size_t const dictSize) +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) { - ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize); + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize); switch (cParams.strategy) { case ZSTD_fast: case ZSTD_dfast: From d5c688e8ae8959e1740fe3833251c88fca3e5e10 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 1 Oct 2020 13:12:23 -0700 Subject: [PATCH 5/6] Fix ZSTD_adjustCParams_internal() to handle dictionary logic Pass in the `ZSTD_cParamMode_e` to select how we define our cparams. Based on the mode we either take the `dictSize` into account or we set it to `0`. See the documentation for `ZSTD_cParamMode_e`. Some of the modes currently share the same behavior. But they have distinct modes because they are drastically different cases. E.g. compression + reprocessing the dictionary and creating a cdict. Additionally, when downsizing the hashLog and chainLog take the (adjusted) dictionary size into account, since the size of the dictionary gets added onto the window size. Adds a simple test to ensure that we aren't downsizing too far. --- lib/compress/zstd_compress.c | 160 +++++++++++++++++++------- lib/compress/zstd_compress_internal.h | 21 +++- lib/compress/zstdmt_compress.c | 2 +- tests/fuzzer.c | 13 +++ 4 files changed, 155 insertions(+), 41 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 09ec152f3..69ff6512d 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1063,24 +1063,71 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) return hashLog - btScale; } +/** ZSTD_dictAndWindowLog() : + * Returns an adjusted window log that is large enough to fit the source and the dictionary. + * The zstd format says that the entire dictionary is valid if one byte of the dictionary + * is within the window. So the hashLog and chainLog should be large enough to reference both + * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing + * the hashLog and windowLog. + */ +static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) +{ + const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX; + /* No dictionary ==> No change */ + if (dictSize == 0) { + return windowLog; + } + assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */ + { + U64 const windowSize = 1ULL << windowLog; + U64 const dictAndWindowSize = dictSize + windowSize; + /* If the window size is already large enough to fit both the source and the dictionary + * then just use the window size. Otherwise adjust so that it fits the dictionary and + * the window. + */ + if (windowSize >= dictSize + srcSize) { + return windowLog; /* Window size large enough already */ + } else if (dictAndWindowSize >= maxWindowSize) { + return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */ + } else { + return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1; + } + } +} + /** ZSTD_adjustCParams_internal() : * optimize `cPar` for a specified input (`srcSize` and `dictSize`). * mostly downsize to reduce memory consumption and initialization latency. * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. + * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. * note : `srcSize==0` means 0! * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, - size_t dictSize) + size_t dictSize, + ZSTD_cParamMode_e mode) { - static const U64 minSrcSize = 513; /* (1<<9) + 1 */ - static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) srcSize = minSrcSize; + switch (mode) { + case ZSTD_cpm_noAttachDict: + case ZSTD_cpm_unknown: + case ZSTD_cpm_createCDict: + break; + case ZSTD_cpm_attachDict: + dictSize = 0; + break; + default: + assert(0); + break; + } + /* resize windowLog if input is small enough, to use less memory */ if ( (srcSize < maxWindowResize) && (dictSize < maxWindowResize) ) { @@ -1090,10 +1137,11 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } - if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1; - { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); - if (cycleLog > cPar.windowLog) - cPar.chainLog -= (cycleLog - cPar.windowLog); + { U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; + if (cycleLog > dictAndWindowLog) + cPar.chainLog -= (cycleLog - dictAndWindowLog); } if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) @@ -1109,11 +1157,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); } -static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); -static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); static void ZSTD_overrideCParams( ZSTD_compressionParameters* cParams, @@ -1129,18 +1177,18 @@ static void ZSTD_overrideCParams( } ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ZSTD_compressionParameters cParams; if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { srcSizeHint = CCtxParams->srcSizeHint; } - cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize); + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ - return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize); + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); } static size_t @@ -1218,7 +1266,7 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams = - ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should @@ -1236,7 +1284,7 @@ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); return ZSTD_estimateCCtxSize_usingCParams(cParams); } @@ -1255,7 +1303,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) { RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = - ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize; size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; @@ -1274,7 +1322,7 @@ size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); return ZSTD_estimateCStreamSize_usingCParams(cParams); } @@ -1699,7 +1747,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams); } - params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, 0); + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, + cdict->dictContentSize, ZSTD_cpm_attachDict); params.cParams.windowLog = windowLog; FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, ZSTDcrp_makeClean, zbuff), ""); @@ -3235,7 +3284,7 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); @@ -3369,7 +3418,7 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0); + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); assert(params.fParams.contentSizeFlag == 1); @@ -3424,7 +3473,7 @@ size_t ZSTD_estimateCDictSize_advanced( size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); } @@ -3562,14 +3611,14 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( ZSTD_overrideCParams(&cParams, &cctxParams.cParams); } else { cParams = ZSTD_getCParamsFromCCtxParams( - &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) { /* Fall back to non-DDSS params */ cctxParams.enableDedicatedDictSearch = 0; cParams = ZSTD_getCParamsFromCCtxParams( - &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } cctxParams.cParams = cParams; @@ -3591,7 +3640,7 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); @@ -3602,7 +3651,7 @@ ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionL ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); @@ -3792,6 +3841,14 @@ size_t ZSTD_CStreamOutSize(void) return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; } +static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) +{ + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; + else + return ZSTD_cpm_noAttachDict; +} + /* ZSTD_resetCStream(): * pledgedSrcSize == 0 means "unknown" */ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) @@ -4108,9 +4165,15 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ - params.cParams = ZSTD_getCParamsFromCCtxParams( - ¶ms, cctx->pledgedSrcSizePlusOne-1, - cctx->prefixDict.dict ? cctx->prefixDict.dictSize : (cctx->cdict ? cctx->cdict->dictContentSize : 0)); + { + size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } #ifdef ZSTD_MULTITHREAD @@ -4363,7 +4426,7 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) { - ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize); + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict); switch (cParams.strategy) { case ZSTD_fast: case ZSTD_dfast: @@ -4412,15 +4475,34 @@ static void ZSTD_dedicatedDictSearch_revertCParams( } } +static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) +{ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + case ZSTD_cpm_createCDict: + break; + case ZSTD_cpm_attachDict: + dictSize = 0; + break; + default: + assert(0); + break; + } + { int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; + size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; + return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + } +} + /*! ZSTD_getCParams_internal() : * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. - * Use dictSize == 0 for unknown or unused. */ -static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) + * Use dictSize == 0 for unknown or unused. + * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { - int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; - size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; - U64 const rSize = unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); int row = compressionLevel; DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel); @@ -4430,7 +4512,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */ /* refine parameters based on srcSize & dictSize */ - return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); } } @@ -4440,16 +4522,16 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize); + return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } /*! ZSTD_getParams() : * same idea as ZSTD_getCParams() * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). * Fields of `ZSTD_frameParameters` are set to default values */ -static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ZSTD_parameters params; - ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); ZSTD_memset(¶ms, 0, sizeof(params)); params.cParams = cParams; @@ -4463,5 +4545,5 @@ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned lo * Fields of `ZSTD_frameParameters` are set to default values */ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize); + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 49ad87b1a..1fc9180aa 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -302,6 +302,25 @@ typedef enum { ZSTD_dedicatedDictSearch = 3 } ZSTD_dictMode_e; +typedef enum { + ZSTD_cpm_noAttachDict = 0, /* Compression with ZSTD_noDict or ZSTD_extDict. + * In this mode we use both the srcSize and the dictSize + * when selecting and adjusting parameters. + */ + ZSTD_cpm_attachDict = 1, /* Compression with ZSTD_dictMatchState or ZSTD_dedicatedDictSearch. + * In this mode we only take the srcSize into account when selecting + * and adjusting parameters. + */ + ZSTD_cpm_createCDict = 2, /* Creating a CDict. + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ + ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. + */ +} ZSTD_cParamMode_e; typedef size_t (*ZSTD_blockCompressor) ( ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -1090,7 +1109,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); * Note: srcSizeHint == 0 means 0! */ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize); + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); /*! ZSTD_initCStream_internal() : * Private use only. Init streaming operation. diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c index b809d602e..7bd8db484 100644 --- a/lib/compress/zstdmt_compress.c +++ b/lib/compress/zstdmt_compress.c @@ -1107,7 +1107,7 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_p DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)", compressionLevel); mtctx->params.compressionLevel = compressionLevel; - { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0); + { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); cParams.windowLog = saved_wlog; mtctx->params.cParams = cParams; } diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 27a7bde12..838772c09 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -372,6 +372,19 @@ static int basicUnitTests(U32 const seed, double compressibility) DISPLAYLEVEL(3, "%u (OK) \n", vn); } + DISPLAYLEVEL(3, "test%3u : ZSTD_adjustCParams : ", testNb++); + { + ZSTD_compressionParameters params; + memset(¶ms, 0, sizeof(params)); + params.windowLog = 10; + params.hashLog = 19; + params.chainLog = 19; + params = ZSTD_adjustCParams(params, 1000, 100000); + if (params.hashLog != 18) goto _output_error; + if (params.chainLog != 17) goto _output_error; + } + DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3u : compress %u bytes : ", testNb++, (unsigned)CNBuffSize); { ZSTD_CCtx* const cctx = ZSTD_createCCtx(); if (cctx==NULL) goto _output_error; From 7e6f91ed84e62513c2d12e7e40254704f593000c Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 12 Oct 2020 12:52:42 -0700 Subject: [PATCH 6/6] [minor] Improve docs and add an assert in response to review --- lib/compress/zstd_compress.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 69ff6512d..92010fd8a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1064,11 +1064,12 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) } /** ZSTD_dictAndWindowLog() : - * Returns an adjusted window log that is large enough to fit the source and the dictionary. - * The zstd format says that the entire dictionary is valid if one byte of the dictionary - * is within the window. So the hashLog and chainLog should be large enough to reference both - * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing - * the hashLog and windowLog. + * Returns an adjusted window log that is large enough to fit the source and the dictionary. + * The zstd format says that the entire dictionary is valid if one byte of the dictionary + * is within the window. So the hashLog and chainLog should be large enough to reference both + * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing + * the hashLog and windowLog. + * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN. */ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) { @@ -1077,6 +1078,7 @@ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) if (dictSize == 0) { return windowLog; } + assert(windowLog <= ZSTD_WINDOWLOG_MAX); assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */ { U64 const windowSize = 1ULL << windowLog;