mirror of
https://github.com/postgres/postgres.git
synced 2025-07-26 01:22:12 +03:00
Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.
Eliminate the separate 'len' variable from the loops, and also use the 4 byte instruction. This shaves off a few more cycles. Even though this routine that uses the special SSE 4.2 instructions is much faster than a generic routine, it's still a hot spot, so let's make it as fast as possible. Change the configure test to not test _mm_crc32_u64. That variant is only available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE accelerated CRC-32C implementation can also be used on 32-bit x86 systems. This also fixes the 32-bit MSVC build.
This commit is contained in:
@ -476,12 +476,16 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
|
|||||||
|
|
||||||
# PGAC_SSE42_CRC32_INTRINSICS
|
# PGAC_SSE42_CRC32_INTRINSICS
|
||||||
# -----------------------
|
# -----------------------
|
||||||
# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics.
|
# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
|
||||||
|
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
|
||||||
|
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
|
||||||
|
# the other ones are, on x86-64 platforms.)
|
||||||
|
#
|
||||||
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
|
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
|
||||||
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
|
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
|
||||||
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
|
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
|
||||||
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
|
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
|
||||||
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=$1], [Ac_cachevar],
|
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
|
||||||
[pgac_save_CFLAGS=$CFLAGS
|
[pgac_save_CFLAGS=$CFLAGS
|
||||||
CFLAGS="$pgac_save_CFLAGS $1"
|
CFLAGS="$pgac_save_CFLAGS $1"
|
||||||
ac_save_c_werror_flag=$ac_c_werror_flag
|
ac_save_c_werror_flag=$ac_c_werror_flag
|
||||||
@ -489,7 +493,7 @@ ac_c_werror_flag=yes
|
|||||||
AC_TRY_LINK([#include <nmmintrin.h>],
|
AC_TRY_LINK([#include <nmmintrin.h>],
|
||||||
[unsigned int crc = 0;
|
[unsigned int crc = 0;
|
||||||
crc = _mm_crc32_u8(crc, 0);
|
crc = _mm_crc32_u8(crc, 0);
|
||||||
crc = (unsigned int) _mm_crc32_u64(crc, 0);],
|
crc = _mm_crc32_u32(crc, 0);],
|
||||||
[Ac_cachevar=yes],
|
[Ac_cachevar=yes],
|
||||||
[Ac_cachevar=no])
|
[Ac_cachevar=no])
|
||||||
ac_c_werror_flag=$ac_save_c_werror_flag
|
ac_c_werror_flag=$ac_save_c_werror_flag
|
||||||
|
12
configure
vendored
12
configure
vendored
@ -14172,8 +14172,8 @@ fi
|
|||||||
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
|
# First check if the _mm_crc32_u8 and _mm_crc32_u32 intrinsics can be used
|
||||||
# with the default compiler flags. If not, check if adding the -msse4.2
|
# with the default compiler flags. If not, check if adding the -msse4.2
|
||||||
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
|
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
|
||||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=" >&5
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
|
||||||
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=... " >&6; }
|
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
|
||||||
if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
|
if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
|
||||||
$as_echo_n "(cached) " >&6
|
$as_echo_n "(cached) " >&6
|
||||||
else
|
else
|
||||||
@ -14189,7 +14189,7 @@ main ()
|
|||||||
{
|
{
|
||||||
unsigned int crc = 0;
|
unsigned int crc = 0;
|
||||||
crc = _mm_crc32_u8(crc, 0);
|
crc = _mm_crc32_u8(crc, 0);
|
||||||
crc = (unsigned int) _mm_crc32_u64(crc, 0);
|
crc = _mm_crc32_u32(crc, 0);
|
||||||
;
|
;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
|
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
|
||||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2" >&5
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
|
||||||
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2... " >&6; }
|
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
|
||||||
if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
|
if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
|
||||||
$as_echo_n "(cached) " >&6
|
$as_echo_n "(cached) " >&6
|
||||||
else
|
else
|
||||||
@ -14229,7 +14229,7 @@ main ()
|
|||||||
{
|
{
|
||||||
unsigned int crc = 0;
|
unsigned int crc = 0;
|
||||||
crc = _mm_crc32_u8(crc, 0);
|
crc = _mm_crc32_u8(crc, 0);
|
||||||
crc = (unsigned int) _mm_crc32_u64(crc, 0);
|
crc = _mm_crc32_u32(crc, 0);
|
||||||
;
|
;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -22,30 +22,45 @@ pg_crc32c
|
|||||||
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
|
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
const unsigned char *p = data;
|
const unsigned char *p = data;
|
||||||
const uint64 *p8;
|
const unsigned char *pend = p + len;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Process eight bytes of data at a time.
|
* Process eight bytes of data at a time.
|
||||||
*
|
*
|
||||||
* NB: We do unaligned 8-byte accesses here. The Intel architecture
|
* NB: We do unaligned accesses here. The Intel architecture allows that,
|
||||||
* allows that, and performance testing didn't show any performance
|
* and performance testing didn't show any performance gain from aligning
|
||||||
* gain from aligning the beginning address.
|
* the beginning address.
|
||||||
*/
|
*/
|
||||||
p8 = (const uint64 *) p;
|
#ifdef __x86_64__
|
||||||
while (len >= 8)
|
while (p + 8 <= pend)
|
||||||
{
|
{
|
||||||
crc = (uint32) _mm_crc32_u64(crc, *p8++);
|
crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
|
||||||
len -= 8;
|
p += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* Process the remaining full four-byte chunk, if any */
|
||||||
* Handle any remaining bytes one at a time.
|
if (p + 4 <= pend)
|
||||||
*/
|
|
||||||
p = (const unsigned char *) p8;
|
|
||||||
while (len > 0)
|
|
||||||
{
|
{
|
||||||
crc = _mm_crc32_u8(crc, *p++);
|
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
|
||||||
len--;
|
p += 4;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/*
|
||||||
|
* Process four bytes at a time. (The eight byte instruction is not
|
||||||
|
* available on the 32-bit x86 architecture).
|
||||||
|
*/
|
||||||
|
while (p + 4 <= pend)
|
||||||
|
{
|
||||||
|
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
|
||||||
|
p += 4;
|
||||||
|
}
|
||||||
|
#endif /* __x86_64__ */
|
||||||
|
|
||||||
|
/* Process any remaining bytes one at a time. */
|
||||||
|
while (p < pend)
|
||||||
|
{
|
||||||
|
crc = _mm_crc32_u8(crc, *p);
|
||||||
|
p++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return crc;
|
return crc;
|
||||||
|
Reference in New Issue
Block a user