mirror of
https://github.com/postgres/postgres.git
synced 2025-07-26 01:22:12 +03:00
Optimize popcount functions with ARM Neon intrinsics.
This commit introduces Neon implementations of pg_popcount{32,64}, pg_popcount(), and pg_popcount_masked(). As in simd.h, we assume that all available AArch64 hardware supports Neon, so we don't need any new configure-time or runtime checks. Some compilers already emit Neon instructions for these functions, but our hand-rolled implementations for pg_popcount() and pg_popcount_masked() performed better in testing, likely due to better instruction-level parallelism. Author: "Chiranmoy.Bhattacharya@fujitsu.com" <Chiranmoy.Bhattacharya@fujitsu.com> Reviewed-by: John Naylor <johncnaylorls@gmail.com> Discussion: https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
This commit is contained in:
@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On AArch64, we can use Neon instructions if the compiler provides access to
|
||||||
|
* them (as indicated by __ARM_NEON). As in simd.h, we assume that all
|
||||||
|
* available 64-bit hardware has Neon support.
|
||||||
|
*/
|
||||||
|
#if defined(__aarch64__) && defined(__ARM_NEON)
|
||||||
|
#define POPCNT_AARCH64 1
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef TRY_POPCNT_X86_64
|
#ifdef TRY_POPCNT_X86_64
|
||||||
/* Attempt to use the POPCNT instruction, but perform a runtime check first */
|
/* Attempt to use the POPCNT instruction, but perform a runtime check first */
|
||||||
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
|
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
|
||||||
|
@ -46,6 +46,7 @@ OBJS = \
|
|||||||
path.o \
|
path.o \
|
||||||
pg_bitutils.o \
|
pg_bitutils.o \
|
||||||
pg_localeconv_r.o \
|
pg_localeconv_r.o \
|
||||||
|
pg_popcount_aarch64.o \
|
||||||
pg_popcount_avx512.o \
|
pg_popcount_avx512.o \
|
||||||
pg_strong_random.o \
|
pg_strong_random.o \
|
||||||
pgcheckdir.o \
|
pgcheckdir.o \
|
||||||
|
@ -9,6 +9,7 @@ pgport_sources = [
|
|||||||
'path.c',
|
'path.c',
|
||||||
'pg_bitutils.c',
|
'pg_bitutils.c',
|
||||||
'pg_localeconv_r.c',
|
'pg_localeconv_r.c',
|
||||||
|
'pg_popcount_aarch64.c',
|
||||||
'pg_popcount_avx512.c',
|
'pg_popcount_avx512.c',
|
||||||
'pg_strong_random.c',
|
'pg_strong_random.c',
|
||||||
'pgcheckdir.c',
|
'pgcheckdir.c',
|
||||||
|
@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
|
|||||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are building the Neon versions, we don't need the "slow" fallbacks.
|
||||||
|
*/
|
||||||
|
#ifndef POPCNT_AARCH64
|
||||||
static inline int pg_popcount32_slow(uint32 word);
|
static inline int pg_popcount32_slow(uint32 word);
|
||||||
static inline int pg_popcount64_slow(uint64 word);
|
static inline int pg_popcount64_slow(uint64 word);
|
||||||
static uint64 pg_popcount_slow(const char *buf, int bytes);
|
static uint64 pg_popcount_slow(const char *buf, int bytes);
|
||||||
static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
|
static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef TRY_POPCNT_X86_64
|
#ifdef TRY_POPCNT_X86_64
|
||||||
static bool pg_popcount_available(void);
|
static bool pg_popcount_available(void);
|
||||||
@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
|
|||||||
|
|
||||||
#endif /* TRY_POPCNT_X86_64 */
|
#endif /* TRY_POPCNT_X86_64 */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are building the Neon versions, we don't need the "slow" fallbacks.
|
||||||
|
*/
|
||||||
|
#ifndef POPCNT_AARCH64
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* pg_popcount32_slow
|
* pg_popcount32_slow
|
||||||
@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
|
|||||||
return popcnt;
|
return popcnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef TRY_POPCNT_X86_64
|
#endif /* ! POPCNT_AARCH64 */
|
||||||
|
|
||||||
|
#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When the POPCNT instruction is not available, there's no point in using
|
* When special CPU instructions are not available, there's no point in using
|
||||||
* function pointers to vary the implementation between the fast and slow
|
* function pointers to vary the implementation between the fast and slow
|
||||||
* method. We instead just make these actual external functions when
|
* method. We instead just make these actual external functions. The compiler
|
||||||
* TRY_POPCNT_X86_64 is not defined. The compiler should be able to inline
|
* should be able to inline the slow versions here.
|
||||||
* the slow versions here.
|
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
pg_popcount32(uint32 word)
|
pg_popcount32(uint32 word)
|
||||||
@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
|
|||||||
return pg_popcount_masked_slow(buf, bytes, mask);
|
return pg_popcount_masked_slow(buf, bytes, mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* !TRY_POPCNT_X86_64 */
|
#endif /* ! TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */
|
||||||
|
208
src/port/pg_popcount_aarch64.c
Normal file
208
src/port/pg_popcount_aarch64.c
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* pg_popcount_aarch64.c
|
||||||
|
* Holds the AArch64 popcount implementations.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2025, PostgreSQL Global Development Group
|
||||||
|
*
|
||||||
|
* IDENTIFICATION
|
||||||
|
* src/port/pg_popcount_aarch64.c
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#include "c.h"
|
||||||
|
|
||||||
|
#include "port/pg_bitutils.h"
|
||||||
|
|
||||||
|
#ifdef POPCNT_AARCH64
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount32
|
||||||
|
* Return number of 1 bits in word
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pg_popcount32(uint32 word)
|
||||||
|
{
|
||||||
|
return pg_popcount64((uint64) word);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount64
|
||||||
|
* Return number of 1 bits in word
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pg_popcount64(uint64 word)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* For some compilers, __builtin_popcountl() already emits Neon
|
||||||
|
* instructions. The line below should compile to the same code on those
|
||||||
|
* systems.
|
||||||
|
*/
|
||||||
|
return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount_optimized
|
||||||
|
* Returns number of 1 bits in buf
|
||||||
|
*/
|
||||||
|
uint64
|
||||||
|
pg_popcount_optimized(const char *buf, int bytes)
|
||||||
|
{
|
||||||
|
uint8x16_t vec;
|
||||||
|
uint64x2_t accum1 = vdupq_n_u64(0),
|
||||||
|
accum2 = vdupq_n_u64(0),
|
||||||
|
accum3 = vdupq_n_u64(0),
|
||||||
|
accum4 = vdupq_n_u64(0);
|
||||||
|
uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t);
|
||||||
|
uint64 popcnt = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For better instruction-level parallelism, each loop iteration operates
|
||||||
|
* on a block of four registers.
|
||||||
|
*/
|
||||||
|
for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If enough data remains, do another iteration on a block of two
|
||||||
|
* registers.
|
||||||
|
*/
|
||||||
|
bytes_per_iteration = 2 * sizeof(uint8x16_t);
|
||||||
|
if (bytes >= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vld1q_u8((const uint8 *) buf);
|
||||||
|
accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
bytes -= bytes_per_iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the accumulators.
|
||||||
|
*/
|
||||||
|
popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
|
||||||
|
popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process remaining 8-byte blocks.
|
||||||
|
*/
|
||||||
|
for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
|
||||||
|
{
|
||||||
|
popcnt += pg_popcount64(*((uint64 *) buf));
|
||||||
|
buf += sizeof(uint64);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process any remaining data byte-by-byte.
|
||||||
|
*/
|
||||||
|
while (bytes--)
|
||||||
|
popcnt += pg_number_of_ones[(unsigned char) *buf++];
|
||||||
|
|
||||||
|
return popcnt;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* pg_popcount_masked_optimized
|
||||||
|
* Returns number of 1 bits in buf after applying the mask to each byte
|
||||||
|
*/
|
||||||
|
uint64
|
||||||
|
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
|
||||||
|
{
|
||||||
|
uint8x16_t vec,
|
||||||
|
maskv = vdupq_n_u8(mask);
|
||||||
|
uint64x2_t accum1 = vdupq_n_u64(0),
|
||||||
|
accum2 = vdupq_n_u64(0),
|
||||||
|
accum3 = vdupq_n_u64(0),
|
||||||
|
accum4 = vdupq_n_u64(0);
|
||||||
|
uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t);
|
||||||
|
uint64 popcnt = 0,
|
||||||
|
mask64 = ~UINT64CONST(0) / 0xFF * mask;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For better instruction-level parallelism, each loop iteration operates
|
||||||
|
* on a block of four registers.
|
||||||
|
*/
|
||||||
|
for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If enough data remains, do another iteration on a block of two
|
||||||
|
* registers.
|
||||||
|
*/
|
||||||
|
bytes_per_iteration = 2 * sizeof(uint8x16_t);
|
||||||
|
if (bytes >= bytes_per_iteration)
|
||||||
|
{
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
|
||||||
|
accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
|
||||||
|
buf += sizeof(uint8x16_t);
|
||||||
|
|
||||||
|
bytes -= bytes_per_iteration;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the accumulators.
|
||||||
|
*/
|
||||||
|
popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
|
||||||
|
popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process remining 8-byte blocks.
|
||||||
|
*/
|
||||||
|
for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
|
||||||
|
{
|
||||||
|
popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
|
||||||
|
buf += sizeof(uint64);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process any remaining data byte-by-byte.
|
||||||
|
*/
|
||||||
|
while (bytes--)
|
||||||
|
popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
|
||||||
|
|
||||||
|
return popcnt;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* POPCNT_AARCH64 */
|
Reference in New Issue
Block a user