mirror of
https://github.com/postgres/postgres.git
synced 2025-08-28 18:48:04 +03:00
Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.
Eliminate the separate 'len' variable from the loops, and also use the 4-byte instruction. This shaves off a few more cycles. Even though this routine, which uses the special SSE 4.2 instructions, is much faster than a generic routine, it is still a hot spot, so let's make it as fast as possible. Change the configure test so that it no longer tests _mm_crc32_u64. That variant is only available on the 64-bit x86-64 architecture, not on 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE-accelerated CRC-32C implementation can also be used on 32-bit x86 systems. This also fixes the 32-bit MSVC build.
This commit is contained in:
@@ -22,30 +22,45 @@ pg_crc32c
|
||||
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
|
||||
{
|
||||
const unsigned char *p = data;
|
||||
const uint64 *p8;
|
||||
const unsigned char *pend = p + len;
|
||||
|
||||
/*
|
||||
* Process eight bytes of data at a time.
|
||||
*
|
||||
* NB: We do unaligned 8-byte accesses here. The Intel architecture
|
||||
* allows that, and performance testing didn't show any performance
|
||||
* gain from aligning the beginning address.
|
||||
* NB: We do unaligned accesses here. The Intel architecture allows that,
|
||||
* and performance testing didn't show any performance gain from aligning
|
||||
* the beginning address.
|
||||
*/
|
||||
p8 = (const uint64 *) p;
|
||||
while (len >= 8)
|
||||
#ifdef __x86_64__
|
||||
while (p + 8 <= pend)
|
||||
{
|
||||
crc = (uint32) _mm_crc32_u64(crc, *p8++);
|
||||
len -= 8;
|
||||
crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
|
||||
p += 8;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle any remaining bytes one at a time.
|
||||
*/
|
||||
p = (const unsigned char *) p8;
|
||||
while (len > 0)
|
||||
/* Process remaining full four bytes if any */
|
||||
if (p + 4 <= pend)
|
||||
{
|
||||
crc = _mm_crc32_u8(crc, *p++);
|
||||
len--;
|
||||
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
|
||||
p += 4;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Process four bytes at a time. (The eight byte instruction is not
|
||||
* available on the 32-bit x86 architecture).
|
||||
*/
|
||||
while (p + 4 <= pend)
|
||||
{
|
||||
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
|
||||
p += 4;
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
/* Process any remaining bytes one at a time. */
|
||||
while (p < pend)
|
||||
{
|
||||
crc = _mm_crc32_u8(crc, *p);
|
||||
p++;
|
||||
}
|
||||
|
||||
return crc;
|
||||
|
Reference in New Issue
Block a user