1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-28 18:48:04 +03:00

Use Intel SSE 4.2 CRC instructions where available.

Modern x86 and x86-64 processors with SSE 4.2 support have special
instructions, crc32b and crc32q, for calculating CRC-32C. They greatly
speed up CRC calculation.

Whether the instructions can be used or not depends on the compiler and the
target architecture. If generation of SSE 4.2 instructions is allowed for
the target (-msse4.2 flag on gcc and clang), use them. If they are not
allowed by default, but the compiler supports the -msse4.2 flag to enable
them, compile just the CRC-32C function with -msse4.2 flag, and check at
runtime whether the processor we're running on supports it. If it doesn't,
fall back to the slicing-by-8 algorithm. (With the common defaults on
current operating systems, the runtime-check variant is what you get in
practice.)

Abhijit Menon-Sen, heavily modified by me, reviewed by Andres Freund.
This commit is contained in:
Heikki Linnakangas
2015-04-14 17:05:03 +03:00
parent 4f700bcd20
commit 3dc2d62d04
11 changed files with 534 additions and 5 deletions

View File

@@ -30,10 +30,10 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I$(top_builddir)/src/port -DFRONTEND $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
OBJS = $(LIBOBJS) chklocale.o erand48.o inet_net_ntop.o \
OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \
noblock.o path.o pgcheckdir.o pgmkdirp.o pgsleep.o \
pgstrcasecmp.o pqsignal.o \
qsort.o qsort_arg.o quotes.o sprompt.o tar.o thread.o pg_crc32c_sb8.o
qsort.o qsort_arg.o quotes.o sprompt.o tar.o thread.o
# foo_srv.o and foo.o are both built from foo.c, but only foo.o has -DFRONTEND
OBJS_SRV = $(OBJS:%.o=%_srv.o)
@@ -57,6 +57,10 @@ libpgport.a: $(OBJS)
# thread.o needs PTHREAD_CFLAGS (but thread_srv.o does not)
thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
# pg_crc32c_sse42.o and its _srv.o version need CFLAGS_SSE42
pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
#
# Server versions of object files
#

View File

@@ -0,0 +1,63 @@
/*-------------------------------------------------------------------------
*
* pg_crc32c_choose.c
* Choose which CRC-32C implementation to use, at runtime.
*
* Try to use the special CRC instructions introduced in Intel SSE 4.2,
* if available on the platform we're running on, but fall back to the
* slicing-by-8 implementation otherwise.
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/port/pg_crc32c_choose.c
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#ifdef HAVE__GET_CPUID
#include <cpuid.h>
#endif
#ifdef HAVE__CPUID
#include <intrin.h>
#endif
#include "port/pg_crc32c.h"
/*
 * Ask the processor, via CPUID leaf 1, whether it supports SSE 4.2.
 *
 * Returns true if bit 20 of the third CPUID output word (ECX) is set.
 */
static bool
pg_crc32c_sse42_available(void)
{
	/* CPUID output words, in the order EAX, EBX, ECX, EDX */
	unsigned int regs[4] = {0, 0, 0, 0};
	const unsigned int sse42_bit = (1 << 20);

#if defined(HAVE__GET_CPUID)
	__get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
#elif defined(HAVE__CPUID)
	__cpuid(regs, 1);
#else
#error cpuid instruction not available
#endif

	/* The array was zeroed above, so an unfilled result reads as "no". */
	if ((regs[2] & sse42_bit) == 0)
		return false;

	return true;				/* SSE 4.2 */
}
/*
 * First-call trampoline for pg_comp_crc32c.
 *
 * Picks the SSE 4.2 implementation if the processor supports it, and the
 * slicing-by-8 implementation otherwise.  The choice is stored in the
 * pg_comp_crc32c function pointer, so every later call goes straight to the
 * selected routine.  The current request is then forwarded through the
 * updated pointer and its result is returned.
 */
static pg_crc32c
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
{
	pg_comp_crc32c = pg_crc32c_sse42_available() ?
		pg_comp_crc32c_sse42 :
		pg_comp_crc32c_sb8;

	return pg_comp_crc32c(crc, data, len);
}
/*
 * Function pointer through which CRC-32C is computed.  It starts out
 * pointing at pg_comp_crc32c_choose, which overwrites it with the selected
 * implementation the first time it is called.
 */
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;

View File

@@ -0,0 +1,52 @@
/*-------------------------------------------------------------------------
*
* pg_crc32c_sse42.c
* Compute CRC-32C checksum using Intel SSE 4.2 instructions.
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/port/pg_crc32c_sse42.c
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "port/pg_crc32c.h"
#include <nmmintrin.h>
/*
 * Fold 'len' bytes starting at 'data' into the running CRC-32C value 'crc'
 * using the SSE 4.2 crc32 instructions, and return the updated value.
 *
 * This function performs no CPU feature check of its own.
 */
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
	const unsigned char *p = data;

	/*
	 * Process the bulk of the data in the widest unit the target offers.
	 *
	 * NB: We do unaligned multi-byte accesses here. The Intel architecture
	 * allows that, and performance testing didn't show any performance
	 * gain from aligning the beginning address.
	 */
#if defined(__x86_64__) || defined(_M_X64)
	/* Eight bytes at a time. */
	while (len >= 8)
	{
		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
		p += 8;
		len -= 8;
	}
#else
	/*
	 * The 64-bit form of the crc32 instruction is only available in 64-bit
	 * mode, so on 32-bit targets go four bytes at a time instead. The
	 * resulting CRC is the same regardless of the chunk size used.
	 */
	while (len >= 4)
	{
		crc = _mm_crc32_u32(crc, *((const uint32 *) p));
		p += 4;
		len -= 4;
	}
#endif

	/*
	 * Handle any remaining bytes one at a time.
	 */
	while (len > 0)
	{
		crc = _mm_crc32_u8(crc, *p++);
		len--;
	}

	return crc;
}