1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-30 11:03:19 +03:00

Optimize pg_popcount() with AVX-512 instructions.

Presently, pg_popcount() processes data in 32-bit or 64-bit chunks
when possible.  Newer hardware that supports AVX-512 instructions
can use 512-bit chunks, which provides a nice speedup, especially
for larger buffers.  This commit introduces the infrastructure
required to detect compiler and CPU support for the required
AVX-512 intrinsic functions, and it adds a new pg_popcount()
implementation that uses these functions.  If CPU support for this
optimized implementation is detected at runtime, a function pointer
is updated so that it is used by subsequent calls to pg_popcount().

Most of the existing in-tree calls to pg_popcount() should benefit
from these instructions, and calls with smaller buffers should at
least not regress compared to v16.  The new infrastructure
introduced by this commit can also be used to optimize
visibilitymap_count(), but that is left for a follow-up commit.

Co-authored-by: Paul Amonson, Ants Aasma
Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
This commit is contained in:
Nathan Bossart
2024-04-06 21:56:23 -05:00
parent 158f581923
commit 792752af4e
15 changed files with 696 additions and 3 deletions

View File

@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_LOONGARCH_CRC32C_INTRINSICS
# PGAC_XSAVE_INTRINSICS
# ---------------------
# Check if the compiler supports the XSAVE instructions using the _xgetbv
# intrinsic function.
#
# The probe links a small program that includes <immintrin.h>, calls
# _xgetbv(0) and masks the result with 0xe0 (presumably the XCR0 bits that
# describe AVX-512 register state -- TODO confirm against the CPU vendor
# documentation).  The outcome is cached in a variable named
# pgac_cv_xsave_intrinsics_ followed by the flag argument, sanitized by
# AS_TR_SH, so each distinct flag argument is probed and cached separately.
#
# An optional compiler flag can be passed as argument (e.g., -mxsave).  The
# flag is appended to CFLAGS only while the probe runs; the previous CFLAGS
# value is restored afterwards.  If the intrinsic is supported, sets
# pgac_xsave_intrinsics to yes and CFLAGS_XSAVE to the flag that was passed.
# If the probe fails, neither variable is modified.
AC_DEFUN([PGAC_XSAVE_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS
CFLAGS="$pgac_save_CFLAGS $1"
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
[return _xgetbv(0) & 0xe0;])],
[Ac_cachevar=yes],
[Ac_cachevar=no])
CFLAGS="$pgac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CFLAGS_XSAVE="$1"
pgac_xsave_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_XSAVE_INTRINSICS
# PGAC_AVX512_POPCNT_INTRINSICS
# -----------------------------
# Check if the compiler supports the AVX-512 popcount instructions using the
# _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
#
# The probe links a small program that performs a masked 512-bit load from a
# buffer, takes the per-lane popcount, accumulates it and reduces it to a
# scalar.  The buffer is declared at file scope rather than as an
# uninitialized local, so that the probe never reads indeterminate values
# and does not provoke uninitialized-variable diagnostics.  The outcome is
# cached in a variable named pgac_cv_avx512_popcnt_intrinsics_ followed by
# the flag argument, sanitized by AS_TR_SH.
#
# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
# -mavx512bw).  They are appended to CFLAGS only while the probe runs; the
# previous CFLAGS value is restored afterwards.  If the intrinsics are
# supported, sets pgac_avx512_popcnt_intrinsics to yes and CFLAGS_POPCNT to
# the flags that were passed.  If the probe fails, neither variable is
# modified.
AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS
CFLAGS="$pgac_save_CFLAGS $1"
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
/* file-scope buffer has static storage, so its contents are well defined */
char buf@<:@sizeof(__m512i)@:>@;],
[PG_INT64_TYPE popcnt = 0;
__m512i accum = _mm512_setzero_si512();
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
const __m512i cnt = _mm512_popcnt_epi64(val);
accum = _mm512_add_epi64(accum, cnt);
popcnt = _mm512_reduce_add_epi64(accum);
/* return computed value, to prevent the above being optimized away */
return popcnt == 0;])],
[Ac_cachevar=yes],
[Ac_cachevar=no])
CFLAGS="$pgac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CFLAGS_POPCNT="$1"
pgac_avx512_popcnt_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_AVX512_POPCNT_INTRINSICS