mirror of
https://github.com/postgres/postgres.git
synced 2025-12-19 17:02:53 +03:00
Use __attribute__((target(...))) for AVX-512 support.
Presently, we check for compiler support for the required intrinsics both with and without extra compiler flags (e.g., -mxsave), and then depending on the results of those checks, we pick which files to compile with which flags. This is tedious and complicated, and it results in unsustainable coding patterns such as separate files for each portion of code may need to be built with different compiler flags. This commit introduces support for __attribute__((target(...))) and uses it for the AVX-512 code. This simplifies both the configure-time checks and the build scripts, and it allows us to place the functions that use the intrinsics in files that we otherwise do not want to build with special CPU instructions. We are careful to avoid using __attribute__((target(...))) on compilers that do not understand it, but we still perform the configure-time checks in case the compiler allows using the intrinsics without it (e.g., MSVC). A similar change could likely be made for some of the CRC-32C code, but that is left as a future exercise. Suggested-by: Andres Freund Reviewed-by: Raghuveer Devulapalli, Andres Freund Discussion: https://postgr.es/m/20240731205254.vfpap7uxwmebqeaf%40awork3.anarazel.de
This commit is contained in:
@@ -262,9 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
|
||||
CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
|
||||
CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
|
||||
CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
|
||||
CFLAGS_POPCNT = @CFLAGS_POPCNT@
|
||||
CFLAGS_CRC = @CFLAGS_CRC@
|
||||
CFLAGS_XSAVE = @CFLAGS_XSAVE@
|
||||
PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
|
||||
PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@
|
||||
CXXFLAGS = @CXXFLAGS@
|
||||
@@ -772,9 +770,6 @@ LIBOBJS = @LIBOBJS@
|
||||
# files needed for the chosen CRC-32C implementation
|
||||
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
|
||||
|
||||
# files needed for the chosen popcount implementation
|
||||
PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
|
||||
|
||||
LIBS := -lpgcommon -lpgport $(LIBS)
|
||||
|
||||
# to make ws2_32.lib the last library
|
||||
|
||||
@@ -174,6 +174,18 @@
|
||||
#define pg_attribute_nonnull(...)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* pg_attribute_target allows specifying different target options that the
|
||||
* function should be compiled with (e.g., for using special CPU instructions).
|
||||
* Note that there still needs to be a configure-time check to verify that a
|
||||
* specific target is understood by the compiler.
|
||||
*/
|
||||
#if __has_attribute (target)
|
||||
#define pg_attribute_target(...) __attribute__((target(__VA_ARGS__)))
|
||||
#else
|
||||
#define pg_attribute_target(...)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Append PG_USED_FOR_ASSERTS_ONLY to definitions of variables that are only
|
||||
* used in assert-enabled builds, to avoid compiler warnings about unused
|
||||
|
||||
@@ -102,10 +102,8 @@ pgxs_kv = {
|
||||
' '.join(cflags_no_missing_var_decls),
|
||||
|
||||
'CFLAGS_CRC': ' '.join(cflags_crc),
|
||||
'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
|
||||
'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
|
||||
'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
|
||||
'CFLAGS_XSAVE': ' '.join(cflags_xsave),
|
||||
|
||||
'LDFLAGS': var_ldflags,
|
||||
'LDFLAGS_EX': var_ldflags_ex,
|
||||
@@ -181,7 +179,7 @@ pgxs_empty = [
|
||||
'WANTED_LANGUAGES',
|
||||
|
||||
# Not needed because we don't build the server / PLs with the generated makefile
|
||||
'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
|
||||
'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
|
||||
'PG_TEST_EXTRA',
|
||||
'DTRACEFLAGS', # only server has dtrace probes
|
||||
|
||||
|
||||
@@ -38,13 +38,13 @@ LIBS += $(PTHREAD_LIBS)
|
||||
OBJS = \
|
||||
$(LIBOBJS) \
|
||||
$(PG_CRC32C_OBJS) \
|
||||
$(PG_POPCNT_OBJS) \
|
||||
bsearch_arg.o \
|
||||
chklocale.o \
|
||||
inet_net_ntop.o \
|
||||
noblock.o \
|
||||
path.o \
|
||||
pg_bitutils.o \
|
||||
pg_popcount_avx512.o \
|
||||
pg_strong_random.o \
|
||||
pgcheckdir.o \
|
||||
pgmkdirp.o \
|
||||
@@ -92,16 +92,6 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
|
||||
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
|
||||
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
|
||||
|
||||
# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
|
||||
pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
|
||||
# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
|
||||
pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
|
||||
#
|
||||
# Shared library versions of object files
|
||||
#
|
||||
|
||||
@@ -7,6 +7,7 @@ pgport_sources = [
|
||||
'noblock.c',
|
||||
'path.c',
|
||||
'pg_bitutils.c',
|
||||
'pg_popcount_avx512.c',
|
||||
'pg_strong_random.c',
|
||||
'pgcheckdir.c',
|
||||
'pgmkdirp.c',
|
||||
@@ -84,8 +85,6 @@ replace_funcs_pos = [
|
||||
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
|
||||
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
|
||||
['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
|
||||
|
||||
# arm / aarch64
|
||||
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
|
||||
@@ -100,8 +99,8 @@ replace_funcs_pos = [
|
||||
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
|
||||
]
|
||||
|
||||
pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
|
||||
pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
|
||||
pgport_cflags = {'crc': cflags_crc}
|
||||
pgport_sources_cflags = {'crc': []}
|
||||
|
||||
foreach f : replace_funcs_neg
|
||||
func = f.get(0)
|
||||
|
||||
@@ -12,7 +12,17 @@
|
||||
*/
|
||||
#include "c.h"
|
||||
|
||||
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "port/pg_bitutils.h"
|
||||
|
||||
@@ -21,12 +31,82 @@
|
||||
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
|
||||
* the function pointers that are only used when TRY_POPCNT_FAST is set.
|
||||
*/
|
||||
#ifdef TRY_POPCNT_FAST
|
||||
#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK)
|
||||
|
||||
/*
|
||||
* Does CPUID say there's support for XSAVE instructions?
|
||||
*/
|
||||
static inline bool
|
||||
xsave_available(void)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
|
||||
#if defined(HAVE__GET_CPUID)
|
||||
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
#elif defined(HAVE__CPUID)
|
||||
__cpuid(exx, 1);
|
||||
#else
|
||||
#error cpuid instruction not available
|
||||
#endif
|
||||
return (exx[2] & (1 << 27)) != 0; /* osxsave */
|
||||
}
|
||||
|
||||
/*
|
||||
* Does XGETBV say the ZMM registers are enabled?
|
||||
*
|
||||
* NB: Caller is responsible for verifying that xsave_available() returns true
|
||||
* before calling this.
|
||||
*/
|
||||
#ifdef HAVE_XSAVE_INTRINSICS
|
||||
pg_attribute_target("xsave")
|
||||
#endif
|
||||
static inline bool
|
||||
zmm_regs_available(void)
|
||||
{
|
||||
#ifdef HAVE_XSAVE_INTRINSICS
|
||||
return (_xgetbv(0) & 0xe6) == 0xe6;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Does CPUID say there's support for AVX-512 popcount and byte-and-word
|
||||
* instructions?
|
||||
*/
|
||||
static inline bool
|
||||
avx512_popcnt_available(void)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
|
||||
#if defined(HAVE__GET_CPUID_COUNT)
|
||||
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
#elif defined(HAVE__CPUIDEX)
|
||||
__cpuidex(exx, 7, 0);
|
||||
#else
|
||||
#error cpuid instruction not available
|
||||
#endif
|
||||
return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
|
||||
(exx[1] & (1 << 30)) != 0; /* avx512-bw */
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the CPU supports the instructions required for the AVX-512
|
||||
* pg_popcount() implementation.
|
||||
*/
|
||||
bool
|
||||
pg_popcount_avx512_available(void)
|
||||
{
|
||||
return xsave_available() &&
|
||||
zmm_regs_available() &&
|
||||
avx512_popcnt_available();
|
||||
}
|
||||
|
||||
/*
|
||||
* pg_popcount_avx512
|
||||
* Returns the number of 1-bits in buf
|
||||
*/
|
||||
pg_attribute_target("avx512vpopcntdq", "avx512bw")
|
||||
uint64
|
||||
pg_popcount_avx512(const char *buf, int bytes)
|
||||
{
|
||||
@@ -82,6 +162,7 @@ pg_popcount_avx512(const char *buf, int bytes)
|
||||
* pg_popcount_masked_avx512
|
||||
* Returns the number of 1-bits in buf after applying the mask to each byte
|
||||
*/
|
||||
pg_attribute_target("avx512vpopcntdq", "avx512bw")
|
||||
uint64
|
||||
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
|
||||
{
|
||||
@@ -138,4 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
|
||||
return _mm512_reduce_add_epi64(accum);
|
||||
}
|
||||
|
||||
#endif /* TRY_POPCNT_FAST */
|
||||
#endif /* TRY_POPCNT_FAST &&
|
||||
* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pg_popcount_avx512_choose.c
|
||||
* Test whether we can use the AVX-512 pg_popcount() implementation.
|
||||
*
|
||||
* Copyright (c) 2024, PostgreSQL Global Development Group
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/port/pg_popcount_avx512_choose.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "c.h"
|
||||
|
||||
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_XSAVE_INTRINSICS
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "port/pg_bitutils.h"
|
||||
|
||||
/*
|
||||
* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
|
||||
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
|
||||
* the function pointers that are only used when TRY_POPCNT_FAST is set.
|
||||
*/
|
||||
#ifdef TRY_POPCNT_FAST
|
||||
|
||||
/*
|
||||
* Does CPUID say there's support for XSAVE instructions?
|
||||
*/
|
||||
static inline bool
|
||||
xsave_available(void)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
|
||||
#if defined(HAVE__GET_CPUID)
|
||||
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
#elif defined(HAVE__CPUID)
|
||||
__cpuid(exx, 1);
|
||||
#else
|
||||
#error cpuid instruction not available
|
||||
#endif
|
||||
return (exx[2] & (1 << 27)) != 0; /* osxsave */
|
||||
}
|
||||
|
||||
/*
|
||||
* Does XGETBV say the ZMM registers are enabled?
|
||||
*
|
||||
* NB: Caller is responsible for verifying that xsave_available() returns true
|
||||
* before calling this.
|
||||
*/
|
||||
static inline bool
|
||||
zmm_regs_available(void)
|
||||
{
|
||||
#ifdef HAVE_XSAVE_INTRINSICS
|
||||
return (_xgetbv(0) & 0xe6) == 0xe6;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Does CPUID say there's support for AVX-512 popcount and byte-and-word
|
||||
* instructions?
|
||||
*/
|
||||
static inline bool
|
||||
avx512_popcnt_available(void)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
|
||||
#if defined(HAVE__GET_CPUID_COUNT)
|
||||
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
#elif defined(HAVE__CPUIDEX)
|
||||
__cpuidex(exx, 7, 0);
|
||||
#else
|
||||
#error cpuid instruction not available
|
||||
#endif
|
||||
return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
|
||||
(exx[1] & (1 << 30)) != 0; /* avx512-bw */
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the CPU supports the instructions required for the AVX-512
|
||||
* pg_popcount() implementation.
|
||||
*/
|
||||
bool
|
||||
pg_popcount_avx512_available(void)
|
||||
{
|
||||
return xsave_available() &&
|
||||
zmm_regs_available() &&
|
||||
avx512_popcnt_available();
|
||||
}
|
||||
|
||||
#endif /* TRY_POPCNT_FAST */
|
||||
Reference in New Issue
Block a user