mirror of
https://github.com/postgres/postgres.git
synced 2025-10-24 01:29:19 +03:00
Optimize popcount functions with ARM SVE intrinsics.
This commit introduces SVE implementations of pg_popcount{32,64}. Unlike the Neon versions, we need an additional configure-time check to determine if the compiler supports SVE intrinsics, and we need a runtime check to determine if the current CPU supports SVE instructions. Our testing showed that the SVE implementations are much faster for larger inputs and are comparable to the status quo for smaller inputs. Author: "Devanga.Susmitha@fujitsu.com" <Devanga.Susmitha@fujitsu.com> Co-authored-by: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com> Co-authored-by: "Malladi, Rama" <ramamalladi@hotmail.com> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com Discussion: https://postgr.es/m/OSZPR01MB84990A9A02A3515C6E85A65B8B2A2%40OSZPR01MB8499.jpnprd01.prod.outlook.com
This commit is contained in:
@@ -708,3 +708,55 @@ if test x"$Ac_cachevar" = x"yes"; then
|
|||||||
fi
|
fi
|
||||||
undefine([Ac_cachevar])dnl
|
undefine([Ac_cachevar])dnl
|
||||||
])# PGAC_AVX512_POPCNT_INTRINSICS
|
])# PGAC_AVX512_POPCNT_INTRINSICS
|
||||||
|
|
||||||
|
# PGAC_SVE_POPCNT_INTRINSICS
|
||||||
|
# --------------------------
|
||||||
|
# Check if the compiler supports the SVE popcount instructions using the
|
||||||
|
# svptrue_b64, svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x,
|
||||||
|
# svcnt_u64_x, svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32,
|
||||||
|
# svand_n_u64_x, and svand_n_u8_x intrinsic functions.
|
||||||
|
#
|
||||||
|
# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
|
||||||
|
AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
|
||||||
|
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
|
||||||
|
AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
|
||||||
|
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
|
||||||
|
|
||||||
|
char buf[128];
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("arch=armv8-a+sve")))
|
||||||
|
#endif
|
||||||
|
static int popcount_test(void)
|
||||||
|
{
|
||||||
|
svbool_t pred = svptrue_b64();
|
||||||
|
svuint8_t vec8;
|
||||||
|
svuint64_t accum1 = svdup_u64(0),
|
||||||
|
accum2 = svdup_u64(0),
|
||||||
|
vec64;
|
||||||
|
char *p = buf;
|
||||||
|
uint64_t popcnt,
|
||||||
|
mask = 0x5555555555555555;
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
|
||||||
|
|
||||||
|
pred = svwhilelt_b8_s32(0, sizeof(buf));
|
||||||
|
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
|
||||||
|
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
|
||||||
|
}]],
|
||||||
|
[return popcount_test();])],
|
||||||
|
[Ac_cachevar=yes],
|
||||||
|
[Ac_cachevar=no])])
|
||||||
|
if test x"$Ac_cachevar" = x"yes"; then
|
||||||
|
pgac_sve_popcnt_intrinsics=yes
|
||||||
|
fi
|
||||||
|
undefine([Ac_cachevar])dnl
|
||||||
|
])# PGAC_SVE_POPCNT_INTRINSICS
|
||||||
|
71
configure
vendored
71
configure
vendored
@@ -17517,6 +17517,77 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Check for SVE popcount intrinsics
|
||||||
|
#
|
||||||
|
if test x"$host_cpu" = x"aarch64"; then
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
|
||||||
|
$as_echo_n "checking for svcnt_x... " >&6; }
|
||||||
|
if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
|
||||||
|
$as_echo_n "(cached) " >&6
|
||||||
|
else
|
||||||
|
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||||
|
/* end confdefs.h. */
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
char buf[128];
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("arch=armv8-a+sve")))
|
||||||
|
#endif
|
||||||
|
static int popcount_test(void)
|
||||||
|
{
|
||||||
|
svbool_t pred = svptrue_b64();
|
||||||
|
svuint8_t vec8;
|
||||||
|
svuint64_t accum1 = svdup_u64(0),
|
||||||
|
accum2 = svdup_u64(0),
|
||||||
|
vec64;
|
||||||
|
char *p = buf;
|
||||||
|
uint64_t popcnt,
|
||||||
|
mask = 0x5555555555555555;
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
|
||||||
|
|
||||||
|
pred = svwhilelt_b8_s32(0, sizeof(buf));
|
||||||
|
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
|
||||||
|
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
|
||||||
|
}
|
||||||
|
int
|
||||||
|
main ()
|
||||||
|
{
|
||||||
|
return popcount_test();
|
||||||
|
;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
_ACEOF
|
||||||
|
if ac_fn_c_try_link "$LINENO"; then :
|
||||||
|
pgac_cv_sve_popcnt_intrinsics=yes
|
||||||
|
else
|
||||||
|
pgac_cv_sve_popcnt_intrinsics=no
|
||||||
|
fi
|
||||||
|
rm -f core conftest.err conftest.$ac_objext \
|
||||||
|
conftest$ac_exeext conftest.$ac_ext
|
||||||
|
fi
|
||||||
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
|
||||||
|
$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
|
||||||
|
if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
|
||||||
|
pgac_sve_popcnt_intrinsics=yes
|
||||||
|
fi
|
||||||
|
|
||||||
|
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
|
||||||
|
|
||||||
|
$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
||||||
|
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
||||||
#
|
#
|
||||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
|
||||||
|
@@ -2070,6 +2070,15 @@ if test x"$host_cpu" = x"x86_64"; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Check for SVE popcount intrinsics
|
||||||
|
#
|
||||||
|
if test x"$host_cpu" = x"aarch64"; then
|
||||||
|
PGAC_SVE_POPCNT_INTRINSICS()
|
||||||
|
if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
|
||||||
|
AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.])
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
||||||
#
|
#
|
||||||
PGAC_SSE42_CRC32_INTRINSICS()
|
PGAC_SSE42_CRC32_INTRINSICS()
|
||||||
|
48
meson.build
48
meson.build
@@ -2297,6 +2297,54 @@ int main(void)
|
|||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################
|
||||||
|
# Check for the availability of SVE popcount intrinsics.
|
||||||
|
###############################################################
|
||||||
|
|
||||||
|
if host_cpu == 'aarch64'
|
||||||
|
|
||||||
|
prog = '''
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
char buf[128];
|
||||||
|
|
||||||
|
#if defined(__has_attribute) && __has_attribute (target)
|
||||||
|
__attribute__((target("arch=armv8-a+sve")))
|
||||||
|
#endif
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
svbool_t pred = svptrue_b64();
|
||||||
|
svuint8_t vec8;
|
||||||
|
svuint64_t accum1 = svdup_u64(0),
|
||||||
|
accum2 = svdup_u64(0),
|
||||||
|
vec64;
|
||||||
|
char *p = buf;
|
||||||
|
uint64_t popcnt,
|
||||||
|
mask = 0x5555555555555555;
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
|
||||||
|
p += svcntb();
|
||||||
|
|
||||||
|
popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
|
||||||
|
|
||||||
|
pred = svwhilelt_b8_s32(0, sizeof(buf));
|
||||||
|
vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
|
||||||
|
return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
|
||||||
|
if cc.links(prog, name: 'SVE popcount', args: test_c_args)
|
||||||
|
cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
|
||||||
|
endif
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
###############################################################
|
###############################################################
|
||||||
# Select CRC-32C implementation.
|
# Select CRC-32C implementation.
|
||||||
#
|
#
|
||||||
|
@@ -712,6 +712,9 @@
|
|||||||
/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
|
/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
|
||||||
#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
|
#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
|
/* Define to 1 to use SVE popcount instructions with a runtime check. */
|
||||||
|
#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
/* Define to build with systemd support. (--with-systemd) */
|
/* Define to build with systemd support. (--with-systemd) */
|
||||||
#undef USE_SYSTEMD
|
#undef USE_SYSTEMD
|
||||||
|
|
||||||
|
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
|
|||||||
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
|
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#elif POPCNT_AARCH64
|
||||||
|
/* Use the Neon version of pg_popcount{32,64} without function pointer. */
|
||||||
|
extern int pg_popcount32(uint32 word);
|
||||||
|
extern int pg_popcount64(uint64 word);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We can try to use an SVE-optimized pg_popcount() on some systems For that,
|
||||||
|
* we do use a function pointer.
|
||||||
|
*/
|
||||||
|
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
|
||||||
|
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
|
||||||
|
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
|
||||||
|
#else
|
||||||
|
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
|
||||||
|
extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/* Use a portable implementation -- no need for a function pointer. */
|
/* Use a portable implementation -- no need for a function pointer. */
|
||||||
extern int pg_popcount32(uint32 word);
|
extern int pg_popcount32(uint32 word);
|
||||||
|
@@ -18,6 +18,275 @@
|
|||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The Neon versions are built regardless of whether we are building the SVE
|
||||||
|
* versions.
|
||||||
|
*/
|
||||||
|
static uint64 pg_popcount_neon(const char *buf, int bytes);
|
||||||
|
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
|
||||||
|
|
||||||
|
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
|
||||||
|
|
||||||
|
/*
|
||||||
|
* These are the SVE implementations of the popcount functions.
|
||||||
|
*/
|
||||||
|
static uint64 pg_popcount_sve(const char *buf, int bytes);
|
||||||
|
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The function pointers are initially set to "choose" functions. These
|
||||||
|
* functions will first set the pointers to the right implementations (based on
|
||||||
|
* what the current CPU supports) and then will call the pointer to fulfill the
|
||||||
|
* caller's request.
|
||||||
|
*/
|
||||||
|
static uint64 pg_popcount_choose(const char *buf, int bytes);
|
||||||
|
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
|
||||||
|
uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
|
||||||
|
uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
|
||||||
|
|
||||||
|
static inline bool
|
||||||
|
pg_popcount_sve_available(void)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_ELF_AUX_INFO
|
||||||
|
unsigned long value;
|
||||||
|
|
||||||
|
return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
|
||||||
|
(value & HWCAP_SVE) != 0;
|
||||||
|
#elif defined(HAVE_GETAUXVAL)
|
||||||
|
return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
choose_popcount_functions(void)
|
||||||
|
{
|
||||||
|
if (pg_popcount_sve_available())
|
||||||
|
{
|
||||||
|
pg_popcount_optimized = pg_popcount_sve;
|
||||||
|
pg_popcount_masked_optimized = pg_popcount_masked_sve;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
pg_popcount_optimized = pg_popcount_neon;
|
||||||
|
pg_popcount_masked_optimized = pg_popcount_masked_neon;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint64
|
||||||
|
pg_popcount_choose(const char *buf, int bytes)
|
||||||
|
{
|
||||||
|
choose_popcount_functions();
|
||||||
|
return pg_popcount_optimized(buf, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint64
|
||||||
|
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
|
||||||
|
{
|
||||||
|
choose_popcount_functions();
|
||||||
|
return pg_popcount_masked_optimized(buf, bytes, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount_sve
|
||||||
|
* Returns number of 1 bits in buf
|
||||||
|
*/
|
||||||
|
pg_attribute_target("arch=armv8-a+sve")
|
||||||
|
static uint64
|
||||||
|
pg_popcount_sve(const char *buf, int bytes)
|
||||||
|
{
|
||||||
|
svbool_t pred = svptrue_b64();
|
||||||
|
svuint64_t accum1 = svdup_u64(0),
|
||||||
|
accum2 = svdup_u64(0),
|
||||||
|
accum3 = svdup_u64(0),
|
||||||
|
accum4 = svdup_u64(0);
|
||||||
|
uint32 vec_len = svcntb(),
|
||||||
|
bytes_per_iteration = 4 * vec_len;
|
||||||
|
uint64 popcnt = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For better instruction-level parallelism, each loop iteration operates
|
||||||
|
* on a block of four registers.
|
||||||
|
*/
|
||||||
|
for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
svuint64_t vec;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If enough data remains, do another iteration on a block of two
|
||||||
|
* registers.
|
||||||
|
*/
|
||||||
|
bytes_per_iteration = 2 * vec_len;
|
||||||
|
if (bytes >= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
svuint64_t vec;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svld1_u64(pred, (const uint64 *) buf);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
bytes -= bytes_per_iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the accumulators.
|
||||||
|
*/
|
||||||
|
popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
|
||||||
|
popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process any remaining data.
|
||||||
|
*/
|
||||||
|
for (; bytes > 0; bytes -= vec_len)
|
||||||
|
{
|
||||||
|
svuint8_t vec;
|
||||||
|
|
||||||
|
pred = svwhilelt_b8_s32(0, bytes);
|
||||||
|
vec = svld1_u8(pred, (const uint8 *) buf);
|
||||||
|
popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return popcnt;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount_masked_sve
|
||||||
|
* Returns number of 1 bits in buf after applying the mask to each byte
|
||||||
|
*/
|
||||||
|
pg_attribute_target("arch=armv8-a+sve")
|
||||||
|
static uint64
|
||||||
|
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
|
||||||
|
{
|
||||||
|
svbool_t pred = svptrue_b64();
|
||||||
|
svuint64_t accum1 = svdup_u64(0),
|
||||||
|
accum2 = svdup_u64(0),
|
||||||
|
accum3 = svdup_u64(0),
|
||||||
|
accum4 = svdup_u64(0);
|
||||||
|
uint32 vec_len = svcntb(),
|
||||||
|
bytes_per_iteration = 4 * vec_len;
|
||||||
|
uint64 popcnt = 0,
|
||||||
|
mask64 = ~UINT64CONST(0) / 0xFF * mask;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For better instruction-level parallelism, each loop iteration operates
|
||||||
|
* on a block of four registers.
|
||||||
|
*/
|
||||||
|
for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
svuint64_t vec;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If enough data remains, do another iteration on a block of two
|
||||||
|
* registers.
|
||||||
|
*/
|
||||||
|
bytes_per_iteration = 2 * vec_len;
|
||||||
|
if (bytes >= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
svuint64_t vec;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
|
||||||
|
accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
|
||||||
|
bytes -= bytes_per_iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the accumulators.
|
||||||
|
*/
|
||||||
|
popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
|
||||||
|
popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process any remaining data.
|
||||||
|
*/
|
||||||
|
for (; bytes > 0; bytes -= vec_len)
|
||||||
|
{
|
||||||
|
svuint8_t vec;
|
||||||
|
|
||||||
|
pred = svwhilelt_b8_s32(0, bytes);
|
||||||
|
vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
|
||||||
|
popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
|
||||||
|
buf += vec_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return popcnt;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When the SVE version isn't available, there's no point in using function
|
||||||
|
* pointers to vary the implementation. We instead just make these actual
|
||||||
|
* external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
|
||||||
|
* The compiler should be able to inline the Neon versions here.
|
||||||
|
*/
|
||||||
|
uint64
|
||||||
|
pg_popcount_optimized(const char *buf, int bytes)
|
||||||
|
{
|
||||||
|
return pg_popcount_neon(buf, bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64
|
||||||
|
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
|
||||||
|
{
|
||||||
|
return pg_popcount_masked_neon(buf, bytes, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* pg_popcount32
|
* pg_popcount32
|
||||||
* Return number of 1 bits in word
|
* Return number of 1 bits in word
|
||||||
@@ -44,11 +313,11 @@ pg_popcount64(uint64 word)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* pg_popcount_optimized
|
* pg_popcount_neon
|
||||||
* Returns number of 1 bits in buf
|
* Returns number of 1 bits in buf
|
||||||
*/
|
*/
|
||||||
uint64
|
static uint64
|
||||||
pg_popcount_optimized(const char *buf, int bytes)
|
pg_popcount_neon(const char *buf, int bytes)
|
||||||
{
|
{
|
||||||
uint8x16_t vec;
|
uint8x16_t vec;
|
||||||
uint64x2_t accum1 = vdupq_n_u64(0),
|
uint64x2_t accum1 = vdupq_n_u64(0),
|
||||||
@@ -124,11 +393,11 @@ pg_popcount_optimized(const char *buf, int bytes)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* pg_popcount_masked_optimized
|
* pg_popcount_masked_neon
|
||||||
* Returns number of 1 bits in buf after applying the mask to each byte
|
* Returns number of 1 bits in buf after applying the mask to each byte
|
||||||
*/
|
*/
|
||||||
uint64
|
static uint64
|
||||||
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
|
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
|
||||||
{
|
{
|
||||||
uint8x16_t vec,
|
uint8x16_t vec,
|
||||||
maskv = vdupq_n_u8(mask);
|
maskv = vdupq_n_u8(mask);
|
||||||
|
Reference in New Issue
Block a user