From 4d14ccd6af6e788a7b79ff3ed77bda5bc71d2edc Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 10 Aug 2023 11:36:15 +0700 Subject: [PATCH] Use native CRC instructions on 64-bit LoongArch As with the Intel and Arm CRC instructions, compiler intrinsics for them must be supported by the compiler. In contrast, no runtime check is needed. Aligned memory access is faster, so use the Arm coding as a model. YANG Xudong Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn --- config/c-compiler.m4 | 33 +++++++++++++++ configure | 74 ++++++++++++++++++++++++++++++---- configure.ac | 38 ++++++++++++----- meson.build | 24 +++++++++++ src/include/pg_config.h.in | 3 ++ src/include/port/pg_crc32c.h | 9 +++++ src/port/meson.build | 3 ++ src/port/pg_crc32c_loongarch.c | 73 +++++++++++++++++++++++++++++++++ 8 files changed, 240 insertions(+), 17 deletions(-) create mode 100644 src/port/pg_crc32c_loongarch.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5be8f0f08dc..5db02b2ab75 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS + +# PGAC_LOONGARCH_CRC32C_INTRINSICS +# --------------------------- +# Check if the compiler supports the LoongArch CRCC instructions, using +# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, +# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w +# intrinsic functions. +# +# We test for the 8-byte variant since platforms capable of running +# Postgres are 64-bit only (as of PG17), and we know CRC instructions +# are available there without a runtime check. +# +# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics. +AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl +AC_CACHE_CHECK( + [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w], + [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_loongarch_crc32c_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_LOONGARCH_CRC32C_INTRINSICS diff --git a/configure b/configure index 963fbbcf1e7..86ffccb1ee1 100755 --- a/configure +++ b/configure @@ -18047,6 +18047,47 @@ fi fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5 +$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; } +if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_loongarch_crc32c_intrinsics=yes +else + pgac_cv_loongarch_crc32c_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5 +$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; } +if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then + pgac_loongarch_crc32c_intrinsics=yes +fi + + # Select CRC-32C implementation. @@ -18063,9 +18104,12 @@ fi # we're not targeting such a processor, but can nevertheless produce code that # uses the CRC instructions, compile both, and select at runtime. # -# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 +# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +# +# If we are targeting a LoongArch processor, CRC instructions are +# always available (at least on 64 bit), so no runtime check is needed. +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + +$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 +$as_echo "LoongArch CRCC instructions" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi fi fi fi diff --git a/configure.ac b/configure.ac index 5153b8b3fdc..116c5a1f68b 100644 --- a/configure.ac +++ b/configure.ac @@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc]) fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +PGAC_LOONGARCH_CRC32C_INTRINSICS() + AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. @@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC) # we're not targeting such a processor, but can nevertheless produce code that # uses the CRC instructions, compile both, and select at runtime. # -# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 +# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +# +# If we are targeting a LoongArch processor, CRC instructions are +# always available (at least on 64 bit), so no runtime check is needed. +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + # LoongArch CRCC instructions. + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2166,9 +2180,15 @@ else PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi fi fi fi diff --git a/meson.build b/meson.build index 0a11efc97a1..2acb2040037 100644 --- a/meson.build +++ b/meson.build @@ -2065,6 +2065,30 @@ int main(void) cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true endif + +elif host_cpu == 'loongarch64' + + prog = ''' +int main(void) +{ + unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + + /* return computed value, to prevent the above being optimized away */ + return crc == 0; +} +''' + + if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w', + args: test_c_args) + # Use LoongArch CRC instruction unconditionally + cdata.set('USE_LOONGARCH_CRC32C', 1) + have_optimized_crc = true + endif + endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index ee209d6d702..d8a2985567f 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -714,6 +714,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to use LoongArch CRCC instructions. */ +#undef USE_LOONGARCH_CRC32C + /* Define to 1 to build with LZ4 support. (--with-lz4) */ #undef USE_LZ4 diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 7f8779261c3..d085f1dc00b 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_LOONGARCH_CRC32C) +/* Use LoongArch CRCC instructions. */ + +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); + #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* diff --git a/src/port/meson.build b/src/port/meson.build index 9d0cd93c438..deb354418db 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -92,6 +92,9 @@ replace_funcs_pos = [ ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], + # loongarch + ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], + # generic fallback ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c new file mode 100644 index 00000000000..db9da80e1bf --- /dev/null +++ b/src/port/pg_crc32c_loongarch.c @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_loongarch.c + * Compute CRC-32C checksum using LoongArch CRCC instructions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc32c_loongarch.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_crc32c.h" + +pg_crc32c +pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + const unsigned char *pend = p + len; + + /* + * LoongArch doesn't require alignment, but aligned memory access is + * significantly faster. Process leading bytes so that the loop below + * starts with a pointer aligned to eight bytes. + */ + if (!PointerIsAligned(p, uint16) && + p + 1 <= pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + p += 1; + } + if (!PointerIsAligned(p, uint32) && + p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (!PointerIsAligned(p, uint64) && + p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + + /* Process eight bytes at a time, as far as we can. */ + while (p + 8 <= pend) + { + crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc); + p += 8; + } + + /* Process remaining 0-7 bytes. */ + if (p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + if (p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (p < pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + } + + return crc; +}