1
0
mirror of https://github.com/facebook/zstd.git synced 2025-08-10 04:43:07 +03:00

wire up bmi2 support

This commit is contained in:
Nick Terrell
2020-08-17 13:44:49 -07:00
parent ba1fd17a9f
commit 612e947c5e
10 changed files with 226 additions and 34 deletions

View File

@@ -38,7 +38,8 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
/*-************************************************************** /*-**************************************************************
* FSE NCount encoding-decoding * FSE NCount encoding-decoding
****************************************************************/ ****************************************************************/
size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, FORCE_INLINE_TEMPLATE
size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize) const void* headerBuffer, size_t hbSize)
{ {
const BYTE* const istart = (const BYTE*) headerBuffer; const BYTE* const istart = (const BYTE*) headerBuffer;
@@ -175,6 +176,43 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
return ip-istart; return ip-istart;
} }
static size_t FSE_readNCount_body_default(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#if DYNAMIC_BMI2
TARGET_ATTRIBUTE("bmi2") static size_t FSE_readNCount_body_bmi2(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#endif
size_t FSE_readNCount_bmi2(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize, int bmi2)
{
#if DYNAMIC_BMI2
if (bmi2) {
return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
#endif
(void)bmi2;
return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize);
}
size_t FSE_readNCount(
short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
const void* headerBuffer, size_t hbSize)
{
return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0);
}
/*! HUF_readStats() : /*! HUF_readStats() :
Read compact Huffman tree, saved by HUF_writeCTable(). Read compact Huffman tree, saved by HUF_writeCTable().
`huffWeight` is destination buffer. `huffWeight` is destination buffer.
@@ -187,13 +225,14 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
const void* src, size_t srcSize) const void* src, size_t srcSize)
{ {
U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp)); return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
} }
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, FORCE_INLINE_TEMPLATE size_t HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr, U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize, const void* src, size_t srcSize,
void* workSpace, size_t wkspSize) void* workSpace, size_t wkspSize,
int bmi2)
{ {
U32 weightTotal; U32 weightTotal;
const BYTE* ip = (const BYTE*) src; const BYTE* ip = (const BYTE*) src;
@@ -217,7 +256,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
} } } } } }
else { /* header compressed with FSE (normal case) */ else { /* header compressed with FSE (normal case) */
if (iSize+1 > srcSize) return ERROR(srcSize_wrong); if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize); /* max (hwSize-1) values decoded, as last one is implied */ oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2); /* max (hwSize-1) values decoded, as last one is implied */
if (FSE_isError(oSize)) return oSize; if (FSE_isError(oSize)) return oSize;
} }
@@ -252,3 +291,36 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
*nbSymbolsPtr = (U32)(oSize+1); *nbSymbolsPtr = (U32)(oSize+1);
return iSize+1; return iSize+1;
} }
static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize)
{
return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0);
}
#if DYNAMIC_BMI2
static TARGET_ATTRIBUTE("bmi2") size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize)
{
return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1);
}
#endif
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize,
int bmi2)
{
#if DYNAMIC_BMI2
if (bmi2) {
return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
}
#endif
(void)bmi2;
return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
}

View File

@@ -228,6 +228,13 @@ FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
const void* rBuffer, size_t rBuffSize); const void* rBuffer, size_t rBuffSize);
/*! FSE_readNCount_bmi2():
* Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise.
*/
FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
const void* rBuffer, size_t rBuffSize, int bmi2);
/*! Constructor and Destructor of FSE_DTable. /*! Constructor and Destructor of FSE_DTable.
Note that its size depends on 'tableLog' */ Note that its size depends on 'tableLog' */
typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
@@ -342,6 +349,9 @@ size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ /**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
/**< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
typedef enum { typedef enum {
FSE_repeat_none, /**< Cannot use the previous table */ FSE_repeat_none, /**< Cannot use the previous table */
FSE_repeat_check, /**< Can use the previous table but it must be checked */ FSE_repeat_check, /**< Can use the previous table but it must be checked */

View File

@@ -73,7 +73,7 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp)); return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp));
} }
size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{ {
void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
@@ -178,6 +178,11 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
return 0; return 0;
} }
size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{
return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize);
}
#ifndef FSE_COMMONDEFS_ONLY #ifndef FSE_COMMONDEFS_ONLY
@@ -306,6 +311,15 @@ size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
}
FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
void* dst, size_t dstCapacity,
const void* cSrc, size_t cSrcSize,
unsigned maxLog, void* workSpace, size_t wkspSize,
int bmi2)
{ {
const BYTE* const istart = (const BYTE*)cSrc; const BYTE* const istart = (const BYTE*)cSrc;
const BYTE* ip = istart; const BYTE* ip = istart;
@@ -315,7 +329,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
FSE_DTable* const dtable = (FSE_DTable*)workSpace; FSE_DTable* const dtable = (FSE_DTable*)workSpace;
/* normal FSE decoding mode */ /* normal FSE decoding mode */
size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); size_t const NCountLength = FSE_readNCount_bmi2(counting, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
if (FSE_isError(NCountLength)) return NCountLength; if (FSE_isError(NCountLength)) return NCountLength;
if (tableLog > maxLog) return ERROR(tableLog_tooLarge); if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
assert(NCountLength <= cSrcSize); assert(NCountLength <= cSrcSize);
@@ -326,9 +340,40 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog); workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog);
wkspSize -= FSE_DTABLE_SIZE(tableLog); wkspSize -= FSE_DTABLE_SIZE(tableLog);
CHECK_F( FSE_buildDTable_wksp(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) ); CHECK_F( FSE_buildDTable_internal(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) );
return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, dtable); /* always return, even if it is an error code */ {
const void* ptr = dtable;
const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
const U32 fastMode = DTableH->fastMode;
/* select fast mode (static) */
if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
}
}
static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0);
}
#if DYNAMIC_BMI2
TARGET_ATTRIBUTE("bmi2") static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1);
}
#endif
size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
if (bmi2) {
return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
}
#endif
(void)bmi2;
return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
} }

View File

@@ -231,13 +231,15 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
/*! HUF_readStats_wksp() : /*! HUF_readStats_wksp() :
* Same as HUF_readStats() but takes an external workspace which must be * Same as HUF_readStats() but takes an external workspace which must be
* 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE. * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
* If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
*/ */
#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1) #define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned)) #define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize, const void* src, size_t srcSize,
void* workspace, size_t wkspSize); void* workspace, size_t wkspSize,
int bmi2);
/** HUF_readCTable() : /** HUF_readCTable() :
* Loading a CTable saved with HUF_writeCTable() */ * Loading a CTable saved with HUF_writeCTable() */
@@ -345,6 +347,9 @@ size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstS
#endif #endif
size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
#ifndef HUF_FORCE_DECOMPRESS_X2
size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
#endif
#endif /* HUF_STATIC_LINKING_ONLY */ #endif /* HUF_STATIC_LINKING_ONLY */

View File

@@ -180,6 +180,11 @@ typedef struct {
// TODO: Template based on BMI2 (5% boost) // TODO: Template based on BMI2 (5% boost)
size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
{
return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
}
size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
{ {
U32 tableLog = 0; U32 tableLog = 0;
U32 nbSymbols = 0; U32 nbSymbols = 0;
@@ -194,7 +199,7 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp)); iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
if (HUF_isError(iSize)) return iSize; if (HUF_isError(iSize)) return iSize;
/* Table header */ /* Table header */
@@ -220,13 +225,21 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
{ {
int n; int n;
int nextRankStart = 0; int nextRankStart = 0;
int const unroll = 4;
int const nLimit = (int)nbSymbols - unroll + 1;
for (n=0; n<(int)tableLog+1; n++) { for (n=0; n<(int)tableLog+1; n++) {
U32 const current = nextRankStart; U32 const current = nextRankStart;
nextRankStart += wksp->rankVal[n]; nextRankStart += wksp->rankVal[n];
wksp->rankStart[n] = current; wksp->rankStart[n] = current;
} }
// TODO: This loop is now the bottleneck: Can this be made faster? for (n=0; n < nLimit; n += unroll) {
for (n=0; n < (int)nbSymbols; ++n) { int u;
for (u=0; u < unroll; ++u) {
size_t const w = wksp->huffWeight[n+u];
wksp->symbols[wksp->rankStart[w]++] = n+u;
}
}
for (; n < (int)nbSymbols; ++n) {
size_t const w = wksp->huffWeight[n]; size_t const w = wksp->huffWeight[n];
wksp->symbols[wksp->rankStart[w]++] = n; wksp->symbols[wksp->rankStart[w]++] = n;
} }
@@ -540,8 +553,7 @@ static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size
{ {
const BYTE* ip = (const BYTE*) cSrc; const BYTE* ip = (const BYTE*) cSrc;
size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize, size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
workSpace, wkspSize);
if (HUF_isError(hSize)) return hSize; if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong); if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize; ip += hSize; cSrcSize -= hSize;
@@ -1320,7 +1332,7 @@ size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstS
{ {
const BYTE* ip = (const BYTE*) cSrc; const BYTE* ip = (const BYTE*) cSrc;
size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
if (HUF_isError(hSize)) return hSize; if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong); if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize; ip += hSize; cSrcSize -= hSize;

View File

@@ -1092,7 +1092,8 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
offcodeNCount, offcodeMaxValue, offcodeNCount, offcodeMaxValue,
OF_base, OF_bits, OF_base, OF_bits,
offcodeLog, offcodeLog,
entropy->workspace, sizeof(entropy->workspace)); entropy->workspace, sizeof(entropy->workspace),
/* bmi2 */0);
dictPtr += offcodeHeaderSize; dictPtr += offcodeHeaderSize;
} }
@@ -1106,7 +1107,8 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
matchlengthNCount, matchlengthMaxValue, matchlengthNCount, matchlengthMaxValue,
ML_base, ML_bits, ML_base, ML_bits,
matchlengthLog, matchlengthLog,
entropy->workspace, sizeof(entropy->workspace)); entropy->workspace, sizeof(entropy->workspace),
/* bmi2 */ 0);
dictPtr += matchlengthHeaderSize; dictPtr += matchlengthHeaderSize;
} }
@@ -1120,7 +1122,8 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
litlengthNCount, litlengthMaxValue, litlengthNCount, litlengthMaxValue,
LL_base, LL_bits, LL_base, LL_bits,
litlengthLog, litlengthLog,
entropy->workspace, sizeof(entropy->workspace)); entropy->workspace, sizeof(entropy->workspace),
/* bmi2 */ 0);
dictPtr += litlengthHeaderSize; dictPtr += litlengthHeaderSize;
} }

View File

@@ -364,8 +364,8 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB
* generate FSE decoding table for one symbol (ll, ml or off) * generate FSE decoding table for one symbol (ll, ml or off)
* cannot fail if input is valid => * cannot fail if input is valid =>
* all inputs are presumed validated at this stage */ * all inputs are presumed validated at this stage */
void FORCE_INLINE_TEMPLATE
ZSTD_buildFSETable(ZSTD_seqSymbol* dt, void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue, const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits, const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, void* wksp, size_t wkspSize) unsigned tableLog, void* wksp, size_t wkspSize)
@@ -378,6 +378,7 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
(void)wkspSize;
/* Sanity Checks */ /* Sanity Checks */
assert(maxSymbolValue <= MaxSeq); assert(maxSymbolValue <= MaxSeq);
@@ -483,6 +484,42 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
} }
} }
static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, void* wksp, size_t wkspSize)
{
return ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#if DYNAMIC_BMI2
TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, void* wksp, size_t wkspSize)
{
return ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue,
baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#endif
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)
{
#if DYNAMIC_BMI2
if (bmi2) {
return ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue,
baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
#endif
(void)bmi2;
return ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue,
baseValue, nbAdditionalBits, tableLog, wksp, wkspSize);
}
/*! ZSTD_buildSeqTable() : /*! ZSTD_buildSeqTable() :
* @return : nb bytes read from src, * @return : nb bytes read from src,
@@ -492,7 +529,8 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
const void* src, size_t srcSize, const void* src, size_t srcSize,
const U32* baseValue, const U32* nbAdditionalBits, const U32* baseValue, const U32* nbAdditionalBits,
const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize) int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,
int bmi2)
{ {
switch(type) switch(type)
{ {
@@ -524,7 +562,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb
size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2);
*DTablePtr = DTableSpace; *DTablePtr = DTableSpace;
return headerSize; return headerSize;
} }
@@ -578,7 +616,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
LL_base, LL_bits, LL_base, LL_bits,
LL_defaultDTable, dctx->fseEntropy, LL_defaultDTable, dctx->fseEntropy,
dctx->ddictIsCold, nbSeq, dctx->ddictIsCold, nbSeq,
dctx->workspace, sizeof(dctx->workspace)); dctx->workspace, sizeof(dctx->workspace),
dctx->bmi2);
RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += llhSize; ip += llhSize;
} }
@@ -589,7 +628,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
OF_base, OF_bits, OF_base, OF_bits,
OF_defaultDTable, dctx->fseEntropy, OF_defaultDTable, dctx->fseEntropy,
dctx->ddictIsCold, nbSeq, dctx->ddictIsCold, nbSeq,
dctx->workspace, sizeof(dctx->workspace)); dctx->workspace, sizeof(dctx->workspace),
dctx->bmi2);
RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += ofhSize; ip += ofhSize;
} }
@@ -600,7 +640,8 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
ML_base, ML_bits, ML_base, ML_bits,
ML_defaultDTable, dctx->fseEntropy, ML_defaultDTable, dctx->fseEntropy,
dctx->ddictIsCold, nbSeq, dctx->ddictIsCold, nbSeq,
dctx->workspace, sizeof(dctx->workspace)); dctx->workspace, sizeof(dctx->workspace),
dctx->bmi2);
RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
ip += mlhSize; ip += mlhSize;
} }

View File

@@ -48,14 +48,15 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
* this function must be called with valid parameters only * this function must be called with valid parameters only
* (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
* in which case it cannot fail. * in which case it cannot fail.
* The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes. * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes, which is
* defined in zstd_decompress_internal.h.
* Internal use only. * Internal use only.
*/ */
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue, const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits, const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, void* wksp, size_t wkspSize); unsigned tableLog, void* wksp, size_t wkspSize,
int bmi2);
#endif /* ZSTD_DEC_BLOCK_H */ #endif /* ZSTD_DEC_BLOCK_H */

View File

@@ -72,7 +72,9 @@ static const U32 ML_base[MaxML+1] = {
} ZSTD_seqSymbol; } ZSTD_seqSymbol;
#define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log)))
#define ZSTD_FSE_WKSP_SIZE_U32 130
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
typedef struct { typedef struct {
ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
@@ -80,7 +82,7 @@ typedef struct {
ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
U32 rep[ZSTD_REP_NUM]; U32 rep[ZSTD_REP_NUM];
U32 workspace[ZSTD_FSE_WKSP_SIZE_U32]; U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE];
} ZSTD_entropyDTables_t; } ZSTD_entropyDTables_t;
typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader, typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,

View File

@@ -232,10 +232,11 @@ FORCE_NOINLINE size_t ZSTD_decodeLiteralsHeader(ZSTD_DCtx* dctx, void const* src
} }
RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
return HUF_readDTableX1_wksp( return HUF_readDTableX1_wksp_bmi2(
dctx->entropy.hufTable, dctx->entropy.hufTable,
istart+lhSize, litCSize, istart+lhSize, litCSize,
dctx->workspace, sizeof(dctx->workspace)); dctx->workspace, sizeof(dctx->workspace),
dctx->bmi2);
} }
} }
return 0; return 0;