mirror of
https://github.com/postgres/postgres.git
synced 2025-04-18 13:44:19 +03:00
Compute CRC32C using AVX-512 instructions where available
The previous implementation of CRC32C on x86 relied on the native CRC32 instruction from the SSE 4.2 extension, which operates on up to 8 bytes at a time. We can get a substantial speedup by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. Shorter inputs fall back to ordinary CRC instructions. On Intel Tiger Lake hardware (2020), CRC is now 50% faster for inputs between 64 and 112 bytes, and 3x faster for 256 bytes. The VPCLMULQDQ instruction on 512-bit registers has been available on Intel hardware since 2019 and AMD since 2022. There is an older variant for 128-bit registers, but at least on Zen 2 it performs worse than normal CRC instructions for short inputs. We must now do a runtime check, even for builds that target SSE 4.2. This doesn't matter in practice for WAL (arguably the most critical case), because since commit e2809e3a1 the final computation with the 20-byte WAL header is inlined and unrolled when targeting that extension. Compared with two direct function calls, testing showed equal or slightly faster performance in performing an indirect function call on several dozen bytes followed by inlined instructions on constant input of 20 bytes. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Co-authored-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> Co-authored-by: Paul Amonson <paul.d.amonson@intel.com> Reviewed-by: Nathan Bossart <nathandbossart@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version) Reviewed-by: Matthew Sterrett <matthewsterrett2@gmail.com> (earlier version) Tested-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> Tested-by: David Rowley <<dgrowleyml@gmail.com>> (earlier version) Discussion: https://postgr.es/m/BL1PR11MB530401FA7E9B1CA432CF9DC3DC192@BL1PR11MB5304.namprd11.prod.outlook.com Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
This commit is contained in:
parent
683df3f4de
commit
3c6e8c1238
@ -581,6 +581,43 @@ fi
|
|||||||
undefine([Ac_cachevar])dnl
|
undefine([Ac_cachevar])dnl
|
||||||
])# PGAC_SSE42_CRC32_INTRINSICS
|
])# PGAC_SSE42_CRC32_INTRINSICS
|
||||||
|
|
||||||
|
# PGAC_AVX512_PCLMUL_INTRINSICS
|
||||||
|
# ---------------------------
|
||||||
|
# Check if the compiler supports AVX-512 carryless multiplication
|
||||||
|
# and three-way exclusive-or instructions used for computing CRC.
|
||||||
|
# AVX-512F is assumed to be supported if the above are.
|
||||||
|
#
|
||||||
|
# If the intrinsics are supported, sets pgac_avx512_pclmul_intrinsics.
|
||||||
|
AC_DEFUN([PGAC_AVX512_PCLMUL_INTRINSICS],
|
||||||
|
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_pclmul_intrinsics])])dnl
|
||||||
|
AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar],
|
||||||
|
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
|
||||||
|
__m512i x;
|
||||||
|
__m512i y;
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("vpclmulqdq,avx512vl")))
|
||||||
|
#endif
|
||||||
|
static int avx512_pclmul_test(void)
|
||||||
|
{
|
||||||
|
__m128i z;
|
||||||
|
|
||||||
|
y = _mm512_clmulepi64_epi128(x, y, 0);
|
||||||
|
z = _mm_ternarylogic_epi64(
|
||||||
|
_mm512_castsi512_si128(y),
|
||||||
|
_mm512_extracti32x4_epi32(y, 1),
|
||||||
|
_mm512_extracti32x4_epi32(y, 2),
|
||||||
|
0x96);
|
||||||
|
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
|
||||||
|
}],
|
||||||
|
[return avx512_pclmul_test();])],
|
||||||
|
[Ac_cachevar=yes],
|
||||||
|
[Ac_cachevar=no])])
|
||||||
|
if test x"$Ac_cachevar" = x"yes"; then
|
||||||
|
pgac_avx512_pclmul_intrinsics=yes
|
||||||
|
fi
|
||||||
|
undefine([Ac_cachevar])dnl
|
||||||
|
])# PGAC_AVX512_PCLMUL_INTRINSICS
|
||||||
|
|
||||||
# PGAC_ARMV8_CRC32C_INTRINSICS
|
# PGAC_ARMV8_CRC32C_INTRINSICS
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
|
91
configure
vendored
91
configure
vendored
@ -17864,17 +17864,21 @@ fi
|
|||||||
|
|
||||||
# Select CRC-32C implementation.
|
# Select CRC-32C implementation.
|
||||||
#
|
#
|
||||||
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
|
# There are three methods of calculating CRC, in order of increasing
|
||||||
# use the special CRC instructions for calculating CRC-32C. If we're not
|
# performance:
|
||||||
# targeting such a processor, but we can nevertheless produce code that uses
|
|
||||||
# the SSE intrinsics, compile both implementations and select which one to use
|
|
||||||
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
|
|
||||||
# running on.
|
|
||||||
#
|
#
|
||||||
# Similarly, if we are targeting an ARM processor that has the CRC
|
# 1. The fallback using a lookup table, called slicing-by-8
|
||||||
# instructions that are part of the ARMv8 CRC Extension, use them. And if
|
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
|
||||||
# we're not targeting such a processor, but can nevertheless produce code that
|
# 3. Algorithms using carryless multiplication instructions
|
||||||
# uses the CRC instructions, compile both, and select at runtime.
|
# (e.g. Intel PCLMUL and Arm PMULL)
|
||||||
|
#
|
||||||
|
# If we can produce code (via function attributes or additional compiler
|
||||||
|
# flags) that uses #2 (and possibly #3), we compile all implementations
|
||||||
|
# and select which one to use at runtime, depending on what is supported
|
||||||
|
# by the processor we're running on.
|
||||||
|
#
|
||||||
|
# If we are targeting a processor that has #2, we can use that without
|
||||||
|
# runtime selection.
|
||||||
#
|
#
|
||||||
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
||||||
# instructions because until clang 16, using the ARM intrinsics still requires
|
# instructions because until clang 16, using the ARM intrinsics still requires
|
||||||
@ -17925,7 +17929,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
|
|||||||
|
|
||||||
$as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
|
$as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
|
||||||
|
|
||||||
PG_CRC32C_OBJS="pg_crc32c_sse42.o"
|
PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
|
||||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
|
||||||
$as_echo "SSE 4.2" >&6; }
|
$as_echo "SSE 4.2" >&6; }
|
||||||
else
|
else
|
||||||
@ -17974,6 +17978,71 @@ $as_echo "slicing-by-8" >&6; }
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
|
||||||
|
#
|
||||||
|
if test x"$host_cpu" = x"x86_64"; then
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128" >&5
|
||||||
|
$as_echo_n "checking for _mm512_clmulepi64_epi128... " >&6; }
|
||||||
|
if ${pgac_cv_avx512_pclmul_intrinsics+:} false; then :
|
||||||
|
$as_echo_n "(cached) " >&6
|
||||||
|
else
|
||||||
|
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||||
|
/* end confdefs.h. */
|
||||||
|
#include <immintrin.h>
|
||||||
|
__m512i x;
|
||||||
|
__m512i y;
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("vpclmulqdq,avx512vl")))
|
||||||
|
#endif
|
||||||
|
static int avx512_pclmul_test(void)
|
||||||
|
{
|
||||||
|
__m128i z;
|
||||||
|
|
||||||
|
y = _mm512_clmulepi64_epi128(x, y, 0);
|
||||||
|
z = _mm_ternarylogic_epi64(
|
||||||
|
_mm512_castsi512_si128(y),
|
||||||
|
_mm512_extracti32x4_epi32(y, 1),
|
||||||
|
_mm512_extracti32x4_epi32(y, 2),
|
||||||
|
0x96);
|
||||||
|
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
|
||||||
|
}
|
||||||
|
int
|
||||||
|
main ()
|
||||||
|
{
|
||||||
|
return avx512_pclmul_test();
|
||||||
|
;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
_ACEOF
|
||||||
|
if ac_fn_c_try_link "$LINENO"; then :
|
||||||
|
pgac_cv_avx512_pclmul_intrinsics=yes
|
||||||
|
else
|
||||||
|
pgac_cv_avx512_pclmul_intrinsics=no
|
||||||
|
fi
|
||||||
|
rm -f core conftest.err conftest.$ac_objext \
|
||||||
|
conftest$ac_exeext conftest.$ac_ext
|
||||||
|
fi
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_pclmul_intrinsics" >&5
|
||||||
|
$as_echo "$pgac_cv_avx512_pclmul_intrinsics" >&6; }
|
||||||
|
if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
|
||||||
|
pgac_avx512_pclmul_intrinsics=yes
|
||||||
|
fi
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
|
||||||
|
$as_echo_n "checking for vectorized CRC-32C... " >&6; }
|
||||||
|
if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
|
||||||
|
|
||||||
|
$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
||||||
|
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
|
||||||
|
$as_echo "AVX-512 with runtime check" >&6; }
|
||||||
|
else
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
|
||||||
|
$as_echo "none" >&6; }
|
||||||
|
fi
|
||||||
|
|
||||||
# Select semaphore implementation type.
|
# Select semaphore implementation type.
|
||||||
if test "$PORTNAME" != "win32"; then
|
if test "$PORTNAME" != "win32"; then
|
||||||
|
39
configure.ac
39
configure.ac
@ -2116,17 +2116,21 @@ AC_SUBST(CFLAGS_CRC)
|
|||||||
|
|
||||||
# Select CRC-32C implementation.
|
# Select CRC-32C implementation.
|
||||||
#
|
#
|
||||||
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
|
# There are three methods of calculating CRC, in order of increasing
|
||||||
# use the special CRC instructions for calculating CRC-32C. If we're not
|
# performance:
|
||||||
# targeting such a processor, but we can nevertheless produce code that uses
|
|
||||||
# the SSE intrinsics, compile both implementations and select which one to use
|
|
||||||
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
|
|
||||||
# running on.
|
|
||||||
#
|
#
|
||||||
# Similarly, if we are targeting an ARM processor that has the CRC
|
# 1. The fallback using a lookup table, called slicing-by-8
|
||||||
# instructions that are part of the ARMv8 CRC Extension, use them. And if
|
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
|
||||||
# we're not targeting such a processor, but can nevertheless produce code that
|
# 3. Algorithms using carryless multiplication instructions
|
||||||
# uses the CRC instructions, compile both, and select at runtime.
|
# (e.g. Intel PCLMUL and Arm PMULL)
|
||||||
|
#
|
||||||
|
# If we can produce code (via function attributes or additional compiler
|
||||||
|
# flags) that uses #2 (and possibly #3), we compile all implementations
|
||||||
|
# and select which one to use at runtime, depending on what is supported
|
||||||
|
# by the processor we're running on.
|
||||||
|
#
|
||||||
|
# If we are targeting a processor that has #2, we can use that without
|
||||||
|
# runtime selection.
|
||||||
#
|
#
|
||||||
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
||||||
# instructions because until clang 16, using the ARM intrinsics still requires
|
# instructions because until clang 16, using the ARM intrinsics still requires
|
||||||
@ -2174,7 +2178,7 @@ fi
|
|||||||
AC_MSG_CHECKING([which CRC-32C implementation to use])
|
AC_MSG_CHECKING([which CRC-32C implementation to use])
|
||||||
if test x"$USE_SSE42_CRC32C" = x"1"; then
|
if test x"$USE_SSE42_CRC32C" = x"1"; then
|
||||||
AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
|
AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
|
||||||
PG_CRC32C_OBJS="pg_crc32c_sse42.o"
|
PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
|
||||||
AC_MSG_RESULT(SSE 4.2)
|
AC_MSG_RESULT(SSE 4.2)
|
||||||
else
|
else
|
||||||
if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
|
if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
|
||||||
@ -2207,6 +2211,19 @@ else
|
|||||||
fi
|
fi
|
||||||
AC_SUBST(PG_CRC32C_OBJS)
|
AC_SUBST(PG_CRC32C_OBJS)
|
||||||
|
|
||||||
|
# Check for carryless multiplication intrinsics to do vectorized CRC calculations.
|
||||||
|
#
|
||||||
|
if test x"$host_cpu" = x"x86_64"; then
|
||||||
|
PGAC_AVX512_PCLMUL_INTRINSICS()
|
||||||
|
fi
|
||||||
|
|
||||||
|
AC_MSG_CHECKING([for vectorized CRC-32C])
|
||||||
|
if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
|
||||||
|
AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
|
||||||
|
AC_MSG_RESULT(AVX-512 with runtime check)
|
||||||
|
else
|
||||||
|
AC_MSG_RESULT(none)
|
||||||
|
fi
|
||||||
|
|
||||||
# Select semaphore implementation type.
|
# Select semaphore implementation type.
|
||||||
if test "$PORTNAME" != "win32"; then
|
if test "$PORTNAME" != "win32"; then
|
||||||
|
58
meson.build
58
meson.build
@ -2349,17 +2349,21 @@ endif
|
|||||||
###############################################################
|
###############################################################
|
||||||
# Select CRC-32C implementation.
|
# Select CRC-32C implementation.
|
||||||
#
|
#
|
||||||
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
|
# There are three methods of calculating CRC, in order of increasing
|
||||||
# use the special CRC instructions for calculating CRC-32C. If we're not
|
# performance:
|
||||||
# targeting such a processor, but we can nevertheless produce code that uses
|
|
||||||
# the SSE intrinsics, compile both implementations and select which one to use
|
|
||||||
# at runtime, depending on whether SSE 4.2 is supported by the processor we're
|
|
||||||
# running on.
|
|
||||||
#
|
#
|
||||||
# Similarly, if we are targeting an ARM processor that has the CRC
|
# 1. The fallback using a lookup table, called slicing-by-8
|
||||||
# instructions that are part of the ARMv8 CRC Extension, use them. And if
|
# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension)
|
||||||
# we're not targeting such a processor, but can nevertheless produce code that
|
# 3. Algorithms using carryless multiplication instructions
|
||||||
# uses the CRC instructions, compile both, and select at runtime.
|
# (e.g. Intel PCLMUL and Arm PMULL)
|
||||||
|
#
|
||||||
|
# If we can produce code (via function attributes or additional compiler
|
||||||
|
# flags) that uses #2 (and possibly #3), we compile all implementations
|
||||||
|
# and select which one to use at runtime, depending on what is supported
|
||||||
|
# by the processor we're running on.
|
||||||
|
#
|
||||||
|
# If we are targeting a processor that has #2, we can use that without
|
||||||
|
# runtime selection.
|
||||||
#
|
#
|
||||||
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
# Note that we do not use __attribute__((target("..."))) for the ARM CRC
|
||||||
# instructions because until clang 16, using the ARM intrinsics still requires
|
# instructions because until clang 16, using the ARM intrinsics still requires
|
||||||
@ -2393,7 +2397,7 @@ int main(void)
|
|||||||
}
|
}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32',
|
if not cc.links(prog, name: 'SSE 4.2 CRC32C',
|
||||||
args: test_c_args)
|
args: test_c_args)
|
||||||
# Do not use Intel SSE 4.2
|
# Do not use Intel SSE 4.2
|
||||||
elif (cc.get_define('__SSE4_2__') != '')
|
elif (cc.get_define('__SSE4_2__') != '')
|
||||||
@ -2408,6 +2412,38 @@ int main(void)
|
|||||||
have_optimized_crc = true
|
have_optimized_crc = true
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Check if the compiler supports AVX-512 carryless multiplication
|
||||||
|
# and three-way exclusive-or instructions used for computing CRC.
|
||||||
|
# AVX-512F is assumed to be supported if the above are.
|
||||||
|
prog = '''
|
||||||
|
#include <immintrin.h>
|
||||||
|
__m512i x;
|
||||||
|
__m512i y;
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("vpclmulqdq,avx512vl")))
|
||||||
|
#endif
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
__m128i z;
|
||||||
|
|
||||||
|
y = _mm512_clmulepi64_epi128(x, y, 0);
|
||||||
|
z = _mm_ternarylogic_epi64(
|
||||||
|
_mm512_castsi512_si128(y),
|
||||||
|
_mm512_extracti32x4_epi32(y, 1),
|
||||||
|
_mm512_extracti32x4_epi32(y, 2),
|
||||||
|
0x96);
|
||||||
|
/* return computed value, to prevent the above being optimized away */
|
||||||
|
return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
|
||||||
|
if cc.links(prog,
|
||||||
|
name: 'AVX-512 CRC32C',
|
||||||
|
args: test_c_args)
|
||||||
|
cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
|
||||||
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
elif host_cpu == 'arm' or host_cpu == 'aarch64'
|
elif host_cpu == 'arm' or host_cpu == 'aarch64'
|
||||||
|
@ -665,6 +665,9 @@
|
|||||||
/* Define to 1 to build with assertion checks. (--enable-cassert) */
|
/* Define to 1 to build with assertion checks. (--enable-cassert) */
|
||||||
#undef USE_ASSERT_CHECKING
|
#undef USE_ASSERT_CHECKING
|
||||||
|
|
||||||
|
/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
|
||||||
|
#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
|
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
|
||||||
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
|
@ -42,7 +42,10 @@ typedef uint32 pg_crc32c;
|
|||||||
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
|
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
|
||||||
|
|
||||||
#if defined(USE_SSE42_CRC32C)
|
#if defined(USE_SSE42_CRC32C)
|
||||||
/* Use Intel SSE4.2 instructions. */
|
/*
|
||||||
|
* Use either Intel SSE 4.2 or AVX-512 instructions. We don't need a runtime check
|
||||||
|
* for SSE 4.2, so we can inline those in some cases.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <nmmintrin.h>
|
#include <nmmintrin.h>
|
||||||
|
|
||||||
@ -50,7 +53,11 @@ typedef uint32 pg_crc32c;
|
|||||||
((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
|
((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
|
||||||
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
|
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
|
||||||
|
|
||||||
|
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
|
||||||
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
|
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can only get here if the host compiler targets SSE 4.2, but on some
|
* We can only get here if the host compiler targets SSE 4.2, but on some
|
||||||
@ -82,9 +89,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
|
|||||||
return crc;
|
return crc;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return pg_comp_crc32c_sse42(crc, data, len);
|
/* Otherwise, use a runtime check for AVX-512 instructions. */
|
||||||
|
return pg_comp_crc32c(crc, data, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use Intel SSE 4.2 or AVX-512 instructions, but perform a runtime check first
|
||||||
|
* to check that they are available.
|
||||||
|
*/
|
||||||
|
#define COMP_CRC32C(crc, data, len) \
|
||||||
|
((crc) = pg_comp_crc32c((crc), (data), (len)))
|
||||||
|
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
|
||||||
|
|
||||||
|
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
|
||||||
|
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
#endif
|
||||||
|
|
||||||
#elif defined(USE_ARMV8_CRC32C)
|
#elif defined(USE_ARMV8_CRC32C)
|
||||||
/* Use ARMv8 CRC Extension instructions. */
|
/* Use ARMv8 CRC Extension instructions. */
|
||||||
|
|
||||||
@ -103,10 +128,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
|
|||||||
|
|
||||||
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
|
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
|
||||||
|
|
||||||
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
|
#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
|
* Use ARMv8 instructions, but perform a runtime check first
|
||||||
* to check that they are available.
|
* to check that they are available.
|
||||||
*/
|
*/
|
||||||
#define COMP_CRC32C(crc, data, len) \
|
#define COMP_CRC32C(crc, data, len) \
|
||||||
@ -115,13 +140,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
|
|||||||
|
|
||||||
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
|
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
|
||||||
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
|
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
|
||||||
|
|
||||||
#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
|
|
||||||
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
|
|
||||||
#endif
|
|
||||||
#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
|
|
||||||
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
|
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
|
||||||
#endif
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/*
|
/*
|
||||||
|
@ -86,6 +86,7 @@ replace_funcs_pos = [
|
|||||||
# x86/x64
|
# x86/x64
|
||||||
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
|
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
|
||||||
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||||
|
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
|
||||||
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||||
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*-------------------------------------------------------------------------
|
/*-------------------------------------------------------------------------
|
||||||
*
|
*
|
||||||
* pg_crc32c_sse42.c
|
* pg_crc32c_sse42.c
|
||||||
* Compute CRC-32C checksum using Intel SSE 4.2 instructions.
|
* Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
|
||||||
*
|
*
|
||||||
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
@ -15,6 +15,9 @@
|
|||||||
#include "c.h"
|
#include "c.h"
|
||||||
|
|
||||||
#include <nmmintrin.h>
|
#include <nmmintrin.h>
|
||||||
|
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "port/pg_crc32c.h"
|
#include "port/pg_crc32c.h"
|
||||||
|
|
||||||
@ -68,3 +71,92 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
|
|||||||
|
|
||||||
return crc;
|
return crc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: There is no copyright notice in the following generated code.
|
||||||
|
*
|
||||||
|
* We have modified the output to
|
||||||
|
* - match our function declaration
|
||||||
|
* - match whitespace to our project style
|
||||||
|
* - add a threshold for the alignment stanza
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Generated by https://github.com/corsix/fast-crc32/ using: */
|
||||||
|
/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
|
||||||
|
/* MIT licensed */
|
||||||
|
|
||||||
|
#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
|
||||||
|
#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
|
||||||
|
|
||||||
|
pg_attribute_target("vpclmulqdq,avx512vl")
|
||||||
|
pg_crc32c
|
||||||
|
pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
|
||||||
|
{
|
||||||
|
/* adjust names to match generated code */
|
||||||
|
pg_crc32c crc0 = crc;
|
||||||
|
size_t len = length;
|
||||||
|
const char *buf = data;
|
||||||
|
|
||||||
|
/* Align on cacheline boundary. The threshold is somewhat arbitrary. */
|
||||||
|
if (unlikely(len > 256))
|
||||||
|
{
|
||||||
|
for (; len && ((uintptr_t) buf & 7); --len)
|
||||||
|
crc0 = _mm_crc32_u8(crc0, *buf++);
|
||||||
|
while (((uintptr_t) buf & 56) && len >= 8)
|
||||||
|
{
|
||||||
|
crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
|
||||||
|
buf += 8;
|
||||||
|
len -= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 64)
|
||||||
|
{
|
||||||
|
const char *end = buf + len;
|
||||||
|
const char *limit = buf + len - 64;
|
||||||
|
__m128i z0;
|
||||||
|
|
||||||
|
/* First vector chunk. */
|
||||||
|
__m512i x0 = _mm512_loadu_si512((const void *) buf),
|
||||||
|
y0;
|
||||||
|
__m512i k;
|
||||||
|
|
||||||
|
k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
|
||||||
|
x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
|
||||||
|
buf += 64;
|
||||||
|
|
||||||
|
/* Main loop. */
|
||||||
|
while (buf <= limit)
|
||||||
|
{
|
||||||
|
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
|
||||||
|
x0 = _mm512_ternarylogic_epi64(x0, y0,
|
||||||
|
_mm512_loadu_si512((const void *) buf),
|
||||||
|
0x96);
|
||||||
|
buf += 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reduce 512 bits to 128 bits. */
|
||||||
|
k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
|
||||||
|
0x3da6d0cb, 0, 0xba4fc28e, 0,
|
||||||
|
0xf20c0dfe, 0, 0x493c7d27, 0,
|
||||||
|
0, 0, 0, 0);
|
||||||
|
y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
|
||||||
|
y0 = _mm512_xor_si512(y0, k);
|
||||||
|
z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
|
||||||
|
_mm512_extracti32x4_epi32(y0, 1),
|
||||||
|
_mm512_extracti32x4_epi32(y0, 2),
|
||||||
|
0x96);
|
||||||
|
z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
|
||||||
|
|
||||||
|
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
|
||||||
|
crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
|
||||||
|
crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
|
||||||
|
len = end - buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
return pg_comp_crc32c_sse42(crc0, buf, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@ -20,30 +20,37 @@
|
|||||||
|
|
||||||
#include "c.h"
|
#include "c.h"
|
||||||
|
|
||||||
#ifdef HAVE__GET_CPUID
|
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
|
||||||
#include <cpuid.h>
|
#include <cpuid.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE__CPUID
|
#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_XSAVE_INTRINSICS
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "port/pg_crc32c.h"
|
#include "port/pg_crc32c.h"
|
||||||
|
|
||||||
static bool
|
/*
|
||||||
pg_crc32c_sse42_available(void)
|
* Does XGETBV say the ZMM registers are enabled?
|
||||||
{
|
*
|
||||||
unsigned int exx[4] = {0, 0, 0, 0};
|
* NB: Caller is responsible for verifying that osxsave is available
|
||||||
|
* before calling this.
|
||||||
#if defined(HAVE__GET_CPUID)
|
*/
|
||||||
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
|
#ifdef HAVE_XSAVE_INTRINSICS
|
||||||
#elif defined(HAVE__CPUID)
|
pg_attribute_target("xsave")
|
||||||
__cpuid(exx, 1);
|
#endif
|
||||||
#else
|
static bool
|
||||||
#error cpuid instruction not available
|
zmm_regs_available(void)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_XSAVE_INTRINSICS
|
||||||
|
return (_xgetbv(0) & 0xe6) == 0xe6;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -53,10 +60,48 @@ pg_crc32c_sse42_available(void)
|
|||||||
static pg_crc32c
|
static pg_crc32c
|
||||||
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
|
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
if (pg_crc32c_sse42_available())
|
unsigned int exx[4] = {0, 0, 0, 0};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Set fallback. We must guard since slicing-by-8 is not visible
|
||||||
|
* everywhere.
|
||||||
|
*/
|
||||||
|
#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
pg_comp_crc32c = pg_comp_crc32c_sb8;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE__GET_CPUID)
|
||||||
|
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||||
|
#elif defined(HAVE__CPUID)
|
||||||
|
__cpuid(exx, 1);
|
||||||
|
#else
|
||||||
|
#error cpuid instruction not available
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */
|
||||||
|
{
|
||||||
pg_comp_crc32c = pg_comp_crc32c_sse42;
|
pg_comp_crc32c = pg_comp_crc32c_sse42;
|
||||||
else
|
|
||||||
pg_comp_crc32c = pg_comp_crc32c_sb8;
|
if (exx[2] & (1 << 27) && /* OSXSAVE */
|
||||||
|
zmm_regs_available())
|
||||||
|
{
|
||||||
|
/* second cpuid call on leaf 7 to check extended AVX-512 support */
|
||||||
|
|
||||||
|
memset(exx, 0, 4 * sizeof(exx[0]));
|
||||||
|
|
||||||
|
#if defined(HAVE__GET_CPUID_COUNT)
|
||||||
|
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||||
|
#elif defined(HAVE__CPUIDEX)
|
||||||
|
__cpuidex(exx, 7, 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
if (exx[2] & (1 << 10) && /* VPCLMULQDQ */
|
||||||
|
exx[1] & (1 << 31)) /* AVX512-VL */
|
||||||
|
pg_comp_crc32c = pg_comp_crc32c_avx512;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return pg_comp_crc32c(crc, data, len);
|
return pg_comp_crc32c(crc, data, len);
|
||||||
}
|
}
|
||||||
|
@ -2330,6 +2330,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
|
|||||||
419469235
|
419469235
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT crc32c(repeat('A', 127)::bytea);
|
||||||
|
crc32c
|
||||||
|
-----------
|
||||||
|
291820082
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT crc32c(repeat('A', 128)::bytea);
|
||||||
|
crc32c
|
||||||
|
-----------
|
||||||
|
816091258
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT crc32c(repeat('A', 129)::bytea);
|
||||||
|
crc32c
|
||||||
|
------------
|
||||||
|
4213642571
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT crc32c(repeat('A', 800)::bytea);
|
||||||
|
crc32c
|
||||||
|
------------
|
||||||
|
3134039419
|
||||||
|
(1 row)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- encode/decode
|
-- encode/decode
|
||||||
--
|
--
|
||||||
|
@ -738,6 +738,11 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
|
|||||||
SELECT crc32c('');
|
SELECT crc32c('');
|
||||||
SELECT crc32c('The quick brown fox jumps over the lazy dog.');
|
SELECT crc32c('The quick brown fox jumps over the lazy dog.');
|
||||||
|
|
||||||
|
SELECT crc32c(repeat('A', 127)::bytea);
|
||||||
|
SELECT crc32c(repeat('A', 128)::bytea);
|
||||||
|
SELECT crc32c(repeat('A', 129)::bytea);
|
||||||
|
SELECT crc32c(repeat('A', 800)::bytea);
|
||||||
|
|
||||||
--
|
--
|
||||||
-- encode/decode
|
-- encode/decode
|
||||||
--
|
--
|
||||||
|
Loading…
x
Reference in New Issue
Block a user