mirror of
https://github.com/postgres/postgres.git
synced 2025-04-18 13:44:19 +03:00
Use native CRC instructions on 64-bit LoongArch
As with the Intel and Arm CRC instructions, the compiler must support the corresponding intrinsics. Unlike on those platforms, no runtime check is needed. Aligned memory access is faster, so the Arm implementation is used as a model. YANG Xudong. Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn
This commit is contained in:
parent
fa2e874946
commit
4d14ccd6af
@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then
|
||||
fi
|
||||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_ARMV8_CRC32C_INTRINSICS
|
||||
|
||||
# PGAC_LOONGARCH_CRC32C_INTRINSICS
|
||||
# ---------------------------
|
||||
# Check if the compiler supports the LoongArch CRCC instructions, using
|
||||
# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
|
||||
# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
|
||||
# intrinsic functions.
|
||||
#
|
||||
# We test for the 8-byte variant since platforms capable of running
|
||||
# Postgres are 64-bit only (as of PG17), and we know CRC instructions
|
||||
# are available there without a runtime check.
|
||||
#
|
||||
# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
|
||||
AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
[dnl
dnl This probe takes no compiler-flag argument, so the cache variable has a
dnl fixed name and is written out directly.  The link test exercises all four
dnl CRCC intrinsics with the default compiler flags; on success the shell
dnl variable pgac_loongarch_crc32c_intrinsics is set to yes.
dnl
AC_CACHE_CHECK(
  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
  [pgac_cv_loongarch_crc32c_intrinsics],
  [AC_LINK_IFELSE(
    [AC_LANG_PROGRAM([],
  [unsigned int crc = 0;
   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
   /* return computed value, to prevent the above being optimized away */
   return crc == 0;])],
    [pgac_cv_loongarch_crc32c_intrinsics=yes],
    [pgac_cv_loongarch_crc32c_intrinsics=no])])
if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
  pgac_loongarch_crc32c_intrinsics=yes
fi
])# PGAC_LOONGARCH_CRC32C_INTRINSICS
|
||||
|
74
configure
vendored
74
configure
vendored
@ -18047,6 +18047,47 @@ fi
|
||||
|
||||
fi
|
||||
|
||||
# Check for LoongArch CRC intrinsics to do CRC calculations.
|
||||
#
|
||||
# Check if __builtin_loongarch_crcc_* intrinsics can be used
|
||||
# with the default compiler flags.
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
|
||||
$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
|
||||
if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned int crc = 0;
|
||||
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return crc == 0;
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv_loongarch_crc32c_intrinsics=yes
|
||||
else
|
||||
pgac_cv_loongarch_crc32c_intrinsics=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
|
||||
$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
|
||||
if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
|
||||
pgac_loongarch_crc32c_intrinsics=yes
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
# Select CRC-32C implementation.
|
||||
@ -18063,9 +18104,12 @@ fi
|
||||
# we're not targeting such a processor, but can nevertheless produce code that
|
||||
# uses the CRC instructions, compile both, and select at runtime.
|
||||
#
|
||||
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
|
||||
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
|
||||
# in the template or configure command line.
|
||||
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
|
||||
#
|
||||
# If we are targeting a LoongArch processor, CRC instructions are
|
||||
# always available (at least on 64 bit), so no runtime check is needed.
|
||||
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
|
||||
# Use Intel SSE 4.2 if available.
|
||||
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
|
||||
USE_SSE42_CRC32C=1
|
||||
@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
|
||||
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
|
||||
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
|
||||
else
|
||||
# fall back to slicing-by-8 algorithm, which doesn't require any
|
||||
# special CPU support.
|
||||
USE_SLICING_BY_8_CRC32C=1
|
||||
fi
|
||||
# LoongArch CRCC instructions.
|
||||
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
|
||||
USE_LOONGARCH_CRC32C=1
|
||||
else
|
||||
# fall back to slicing-by-8 algorithm, which doesn't require any
|
||||
# special CPU support.
|
||||
USE_SLICING_BY_8_CRC32C=1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
|
||||
$as_echo "ARMv8 CRC instructions with runtime check" >&6; }
|
||||
else
|
||||
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
|
||||
|
||||
$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
|
||||
|
||||
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
|
||||
$as_echo "LoongArch CRCC instructions" >&6; }
|
||||
else
|
||||
|
||||
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
|
||||
|
||||
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
|
||||
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
|
||||
$as_echo "slicing-by-8" >&6; }
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
38
configure.ac
38
configure.ac
@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
|
||||
PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
|
||||
fi
|
||||
|
||||
# Check for LoongArch CRC intrinsics to do CRC calculations.
|
||||
#
|
||||
# Check if __builtin_loongarch_crcc_* intrinsics can be used
|
||||
# with the default compiler flags.
|
||||
PGAC_LOONGARCH_CRC32C_INTRINSICS()
|
||||
|
||||
AC_SUBST(CFLAGS_CRC)
|
||||
|
||||
# Select CRC-32C implementation.
|
||||
@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC)
|
||||
# we're not targeting such a processor, but can nevertheless produce code that
|
||||
# uses the CRC instructions, compile both, and select at runtime.
|
||||
#
|
||||
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
|
||||
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
|
||||
# in the template or configure command line.
|
||||
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
|
||||
#
|
||||
# If we are targeting a LoongArch processor, CRC instructions are
|
||||
# always available (at least on 64 bit), so no runtime check is needed.
|
||||
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
|
||||
# Use Intel SSE 4.2 if available.
|
||||
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
|
||||
USE_SSE42_CRC32C=1
|
||||
@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
|
||||
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
|
||||
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
|
||||
else
|
||||
# fall back to slicing-by-8 algorithm, which doesn't require any
|
||||
# special CPU support.
|
||||
USE_SLICING_BY_8_CRC32C=1
|
||||
fi
|
||||
# LoongArch CRCC instructions.
|
||||
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
|
||||
USE_LOONGARCH_CRC32C=1
|
||||
else
|
||||
# fall back to slicing-by-8 algorithm, which doesn't require any
|
||||
# special CPU support.
|
||||
USE_SLICING_BY_8_CRC32C=1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
@ -2166,9 +2180,15 @@ else
|
||||
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
|
||||
AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
|
||||
else
|
||||
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
|
||||
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
|
||||
AC_MSG_RESULT(slicing-by-8)
|
||||
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
|
||||
AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
|
||||
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
|
||||
AC_MSG_RESULT(LoongArch CRCC instructions)
|
||||
else
|
||||
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
|
||||
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
|
||||
AC_MSG_RESULT(slicing-by-8)
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
24
meson.build
24
meson.build
@ -2065,6 +2065,30 @@ int main(void)
|
||||
cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
|
||||
have_optimized_crc = true
|
||||
endif
|
||||
|
||||
elif host_cpu == 'loongarch64'
|
||||
|
||||
prog = '''
|
||||
int main(void)
|
||||
{
|
||||
unsigned int crc = 0;
|
||||
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
|
||||
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
|
||||
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return crc == 0;
|
||||
}
|
||||
'''
|
||||
|
||||
if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
|
||||
args: test_c_args)
|
||||
# Use LoongArch CRC instruction unconditionally
|
||||
cdata.set('USE_LOONGARCH_CRC32C', 1)
|
||||
have_optimized_crc = true
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
if not have_optimized_crc
|
||||
|
@ -714,6 +714,9 @@
|
||||
/* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
|
||||
#undef USE_LLVM
|
||||
|
||||
/* Define to 1 to use LoongArch CRCC instructions. */
|
||||
#undef USE_LOONGARCH_CRC32C
|
||||
|
||||
/* Define to 1 to build with LZ4 support. (--with-lz4) */
|
||||
#undef USE_LZ4
|
||||
|
||||
|
@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
|
||||
|
||||
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
|
||||
|
||||
#elif defined(USE_LOONGARCH_CRC32C)
|
||||
/* Use LoongArch CRCC instructions. */
|
||||
|
||||
#define COMP_CRC32C(crc, data, len) \
|
||||
((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
|
||||
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
|
||||
|
||||
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
|
||||
|
||||
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
|
||||
|
||||
/*
|
||||
|
@ -92,6 +92,9 @@ replace_funcs_pos = [
|
||||
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
|
||||
# loongarch
|
||||
['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
|
||||
|
||||
# generic fallback
|
||||
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
|
||||
]
|
||||
|
73
src/port/pg_crc32c_loongarch.c
Normal file
73
src/port/pg_crc32c_loongarch.c
Normal file
@ -0,0 +1,73 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pg_crc32c_loongarch.c
|
||||
* Compute CRC-32C checksum using LoongArch CRCC instructions
|
||||
*
|
||||
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/port/pg_crc32c_loongarch.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "c.h"
|
||||
|
||||
#include "port/pg_crc32c.h"
|
||||
|
||||
pg_crc32c
|
||||
pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
|
||||
{
|
||||
const unsigned char *p = data;
|
||||
const unsigned char *pend = p + len;
|
||||
|
||||
/*
|
||||
* LoongArch doesn't require alignment, but aligned memory access is
|
||||
* significantly faster. Process leading bytes so that the loop below
|
||||
* starts with a pointer aligned to eight bytes.
|
||||
*/
|
||||
if (!PointerIsAligned(p, uint16) &&
|
||||
p + 1 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
|
||||
p += 1;
|
||||
}
|
||||
if (!PointerIsAligned(p, uint32) &&
|
||||
p + 2 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
|
||||
p += 2;
|
||||
}
|
||||
if (!PointerIsAligned(p, uint64) &&
|
||||
p + 4 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
|
||||
p += 4;
|
||||
}
|
||||
|
||||
/* Process eight bytes at a time, as far as we can. */
|
||||
while (p + 8 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
|
||||
p += 8;
|
||||
}
|
||||
|
||||
/* Process remaining 0-7 bytes. */
|
||||
if (p + 4 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
|
||||
p += 4;
|
||||
}
|
||||
if (p + 2 <= pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
|
||||
p += 2;
|
||||
}
|
||||
if (p < pend)
|
||||
{
|
||||
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user