mirror of
https://github.com/facebook/zstd.git
synced 2025-09-08 13:32:28 +03:00
AVX2 version of ZSTD_get1BlockSummary()
This commit is contained in:
@@ -7395,56 +7395,65 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx,
|
|||||||
return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0);
|
return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 && defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */
|
/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */
|
||||||
#if defined(__GNUC__)
|
#if defined(__GNUC__)
|
||||||
# define ALIGNED32 __attribute__((aligned(32)))
|
# define ALIGNED32 __attribute__((aligned(32)))
|
||||||
|
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
|
||||||
|
# define ALIGNED32 alignas(32)
|
||||||
#else
|
#else
|
||||||
|
/* this compiler will require its own alignment instruction */
|
||||||
# define ALIGNED32
|
# define ALIGNED32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
|
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
__m256i sumVec; /* accumulates match+lit in 32-bit lanes */
|
__m256i const zeroVec = _mm256_setzero_si256();
|
||||||
__m256i mask; /* shuffling control */
|
__m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */
|
||||||
ALIGNED32 int tmp[8]; /* temporary buffer for reduction */
|
__m256i shuffle32; /* shuffling control */
|
||||||
uint64_t sum;
|
ALIGNED32 U32 tmp[8]; /* temporary buffer for reduction */
|
||||||
int k;
|
size_t mSum = 0, lSum = 0;
|
||||||
|
|
||||||
sumVec = _mm256_setzero_si256();
|
|
||||||
mask = _mm256_setr_epi32(
|
|
||||||
1,5, /* match(0), match(1) */
|
|
||||||
2,6, /* lit(0), lit(1) */
|
|
||||||
1,5, /* match(0), match(1) */
|
|
||||||
2,6 /* lit(0), lit(1) */
|
|
||||||
);
|
|
||||||
|
|
||||||
/* Process 2 structs (32 bytes) at a time */
|
/* Process 2 structs (32 bytes) at a time */
|
||||||
for (i = 0; i + 2 <= count; i += 2) {
|
for (i = 0; i + 2 <= nbSeqs; i += 2) {
|
||||||
/* Load two consecutive MyStructs (8×4 = 32 bytes) */
|
/* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */
|
||||||
__m256i data = _mm256_loadu_si256((const __m256i*)&arr[i]);
|
__m256i data = _mm256_loadu_si256((const __m256i*)&seqs[i]);
|
||||||
/* Shuffle out lanes 1,2,5,6 => match(0), match(1), lit(0), lit(1), repeated */
|
/* check end of block signal */
|
||||||
__m256i selected = _mm256_permutevar8x32_epi32(data, mask);
|
__m256i cmp = _mm256_cmpeq_epi32(data, zeroVec);
|
||||||
|
int cmp_res = _mm256_movemask_epi8(cmp);
|
||||||
|
/* indices for match lengths correspond to bits [8..11], [24..27]
|
||||||
|
* => combined mask = 0x0F000F00 */
|
||||||
|
if (cmp_res & 0x0F000F00) break;
|
||||||
/* Accumulate in sumVec */
|
/* Accumulate in sumVec */
|
||||||
sumVec = _mm256_add_epi32(sumVec, selected);
|
sumVec = _mm256_add_epi32(sumVec, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Horizontal reduction of sumVec */
|
/* Horizontal reduction */
|
||||||
_mm256_store_si256((__m256i*)tmp, sumVec);
|
_mm256_store_si256((__m256i*)tmp, sumVec);
|
||||||
sum = 0;
|
lSum = tmp[1] + tmp[5];
|
||||||
for (k = 0; k < 8; k++) {
|
mSum = tmp[2] + tmp[6];
|
||||||
sum += (uint64_t)tmp[k]; /* each lane is match+lit from pairs, repeated twice */
|
|
||||||
|
/* Handle the leftover */
|
||||||
|
for (; i < nbSeqs; i++) {
|
||||||
|
lSum += seqs[i].litLength;
|
||||||
|
mSum += seqs[i].matchLength;
|
||||||
|
if (seqs[i].matchLength == 0) break; /* end of block */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle the leftover (if count is odd) */
|
if (i==nbSeqs) {
|
||||||
for (; i < count; i++) {
|
/* reaching end of sequences: end of block signal was not present */
|
||||||
sum += arr[i].matchLength;
|
BlockSummary bs;
|
||||||
sum += arr[i].litLength;
|
bs.nbSequences = ERROR(externalSequences_invalid);
|
||||||
|
return bs;
|
||||||
|
}
|
||||||
|
{ BlockSummary bs;
|
||||||
|
bs.nbSequences = i+1;
|
||||||
|
bs.blockSize = lSum + mSum;
|
||||||
|
bs.litSize = lSum;
|
||||||
|
return bs;
|
||||||
}
|
}
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
Reference in New Issue
Block a user