
MDEV-19935 Create unified CRC-32 interface

Add CRC32C code to mysys. The x86-64 implementation uses PCLMULQDQ in addition to the CRC32 instruction,
following an Intel whitepaper, and is ported from the RocksDB code.

Optimized ARM and POWER CRC32 implementations were already present in mysys.
Author: Vladislav Vaintroub
Date:   2020-09-17 16:07:37 +02:00
parent ab56cbcd81
commit ccbe6bb6fc
24 changed files with 2096 additions and 1156 deletions
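
For orientation: the unified interface this commit introduces (see the include/my_sys.h hunk below) is my_checksum() for the zlib-compatible CRC-32, my_crc32c() for CRC-32C, and my_crc32c_implementation(), which reports the variant selected at startup. The following minimal sketch is not part of the commit; it assumes a standalone C program linked against mysys and checks both functions against the standard check values of the ASCII string "123456789".

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <my_global.h>
#include <my_sys.h>

int main(int argc, char **argv)
{
  const char *s= "123456789";
  (void) argc;
  MY_INIT(argv[0]);                     /* generic mysys initialization */

  /* zlib-compatible CRC-32 (IEEE 802.3 polynomial) */
  assert(my_checksum(0, s, strlen(s)) == 0xCBF43926);

  /* CRC-32C (Castagnoli polynomial), as used for InnoDB pages */
  assert(my_crc32c(0, s, strlen(s)) == 0xE3069283);

  printf("crc32c implementation: %s\n", my_crc32c_implementation());
  my_end(0);
  return 0;
}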

View File

@@ -103,15 +103,6 @@
#cmakedefine HAVE_LIBWRAP 1
#cmakedefine HAVE_SYSTEMD 1
#cmakedefine HAVE_CPUID_INSTRUCTION 1
#cmakedefine HAVE_CLMUL_INSTRUCTION 1
#cmakedefine HAVE_CRC32_VPMSUM 1
/* Support ARMv8 crc + crypto */
#cmakedefine HAVE_ARMV8_CRC 1
#cmakedefine HAVE_ARMV8_CRYPTO 1
#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS 1
/* Does "struct timespec" have a "sec" and "nsec" field? */
#cmakedefine HAVE_TIMESPEC_TS_SEC 1

View File

@@ -73,20 +73,8 @@ IF(WITH_INNOBASE_STORAGE_ENGINE)
# We use the InnoDB code directly in case the code changes.
ADD_DEFINITIONS("-DUNIV_INNOCHECKSUM")
# Avoid generating Hardware Capabilities due to crc32 instructions
IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i386")
MY_CHECK_CXX_COMPILER_FLAG("-Wa,-nH")
IF(have_CXX__Wa__nH)
ADD_COMPILE_FLAGS(
../storage/innobase/ut/ut0crc32.cc
COMPILE_FLAGS "-Wa,-nH"
)
ENDIF()
ENDIF()
SET(INNOBASE_SOURCES
../storage/innobase/buf/buf0checksum.cc
../storage/innobase/ut/ut0crc32.cc
../storage/innobase/ut/ut0ut.cc
../storage/innobase/buf/buf0buf.cc
../storage/innobase/page/page0zip.cc

View File

@@ -1583,7 +1583,6 @@ int main(
/* enable when space_id of given file is zero. */
bool is_system_tablespace = false;
ut_crc32_init();
MY_INIT(argv[0]);
DBUG_ENTER("main");
DBUG_PROCESS(argv[0]);

View File

@@ -1833,7 +1833,6 @@ copy_back()
srv_max_n_threads = 1000;
sync_check_init();
ut_crc32_init();
/* copy undo tablespaces */

View File

@@ -97,8 +97,6 @@ main(int argc, char **argv)
{
MY_INIT(argv[0]);
my_checksum_init();
if (get_options(&argc, &argv)) {
goto err;
}

View File

@@ -4011,9 +4011,6 @@ fail:
ut_d(sync_check_enable());
/* Reset the system variables in the recovery module. */
trx_pool_init();
ut_crc32_init();
my_checksum_init();
recv_sys.create();
#ifdef WITH_INNODB_DISALLOW_WRITES
@@ -5386,7 +5383,6 @@ static bool xtrabackup_prepare_func(char** argv)
sync_check_init();
ut_d(sync_check_enable());
ut_crc32_init();
recv_sys.create();
log_sys.create();
recv_sys.recovery_on = true;

View File

@@ -901,18 +901,10 @@ extern int my_compress_buffer(uchar *dest, size_t *destLen,
extern int packfrm(const uchar *, size_t, uchar **, size_t *);
extern int unpackfrm(uchar **, size_t *, const uchar *);
void my_checksum_init(void);
#ifdef HAVE_CRC32_VPMSUM
extern ha_checksum my_checksum(ha_checksum, const void *, size_t);
#else
typedef ha_checksum (*my_crc32_t)(ha_checksum, const void *, size_t);
extern MYSQL_PLUGIN_IMPORT my_crc32_t my_checksum;
#endif
extern uint32 my_checksum(uint32, const void *, size_t);
extern uint32 my_crc32c(uint32, const void *, size_t);
#if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
int crc32_aarch64_available(void);
const char *crc32c_aarch64_available(void);
#endif
extern const char *my_crc32c_implementation();
#ifdef DBUG_ASSERT_EXISTS
extern void my_debug_put_break_here(void);

View File

@@ -16,7 +16,7 @@
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR} ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/mysys)
SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c
SET(MYSYS_SOURCES array.c charset-def.c charset.c crc32ieee.cc my_default.c
get_password.c
errors.c hash.c list.c
mf_cache.c mf_dirname.c mf_fn_ext.c
@@ -45,7 +45,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c
my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c
my_rdtsc.c my_context.c psi_noop.c
my_atomic_writes.c my_cpu.c my_likely.c my_largepage.c
file_logger.c my_dlerror.c)
file_logger.c my_dlerror.c crc32/crc32c.cc)
IF (WIN32)
SET (MYSYS_SOURCES ${MYSYS_SOURCES}
@@ -59,25 +59,23 @@ IF (WIN32)
ENDIF()
IF(MSVC)
SET(HAVE_CPUID_INSTRUCTION 1 CACHE BOOL "")
SET(HAVE_CLMUL_INSTRUCTION 1 CACHE BOOL "")
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL)
IF(CLANG_CL)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.cc crc32/crc32c.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")
SET(HAVE_CPUID_INSTRUCTION 1 CACHE BOOL "")
MY_CHECK_C_COMPILER_FLAG(-msse4.2)
MY_CHECK_C_COMPILER_FLAG(-mpclmul)
CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
CHECK_INCLUDE_FILE(x86intrin.h HAVE_X86INTRIN_H)
IF(have_C__msse4.2 AND have_C__mpclmul AND HAVE_CPUID_H AND HAVE_X86INTRIN_H)
SET(HAVE_CLMUL_INSTRUCTION 1 CACHE BOOL "")
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_x86.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_x86.c crc32/crc32c.cc PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
ADD_DEFINITIONS(-DHAVE_SSE42 -DHAVE_PCLMUL)
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
IF(CMAKE_COMPILER_IS_GNUCC AND NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
IF(CMAKE_COMPILER_IS_GNUCC)
include(CheckCXXSourceCompiles)
CHECK_CXX_SOURCE_COMPILES("
@@ -99,23 +97,29 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
#include <sys/auxv.h>
int main() { foo(0); getauxval(AT_HWCAP); }" HAVE_ARMV8_CRYPTO)
CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_MARCH)
IF(HAVE_ARMV8_CRC_CRYPTO_MARCH)
CHECK_INCLUDE_FILE(arm_acle.h HAVE_ARM_ACLE_H -march=armv8-a+crc+crypto)
IF(HAVE_ARM_ACLE_H)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
ENDIF()
IF(HAVE_ARMV8_CRC)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC)
ENDIF()
IF(HAVE_ARMV8_CRYPTO)
ADD_DEFINITIONS(-DHAVE_ARMV8_CRYPTO)
ENDIF()
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_arm64.c PROPERTIES
COMPILE_FLAGS "-march=armv8-a+crc+crypto")
ENDIF()
ENDIF()
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
SET(HAVE_CRC32_VPMSUM 1 PARENT_SCOPE)
SET(MYSYS_SOURCES ${MYSYS_SOURCES} $<TARGET_OBJECTS:crc32c> $<TARGET_OBJECTS:crc32ieee>)
ADD_LIBRARY(crc32c OBJECT crc32/crc32_ppc64.c)
ADD_LIBRARY(crc32ieee OBJECT crc32/crc32_ppc64.c)
SET_TARGET_PROPERTIES(crc32c crc32ieee PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector")
SET_TARGET_PROPERTIES(crc32ieee PROPERTIES COMPILE_DEFINITIONS "CRC32_FUNCTION=my_checksum;CRC32_CONSTANTS_HEADER=\"pcc_crc32_constants.h\"")
SET_TARGET_PROPERTIES(crc32c PROPERTIES COMPILE_DEFINITIONS "CRC32_FUNCTION=crc32c_vpmsum;CRC32_CONSTANTS_HEADER=\"pcc_crc32c_constants.h\"")
SET(MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_ppc64.c crc32/crc32c_ppc.c)
SET_SOURCE_FILES_PROPERTIES(crc32/crc32_ppc64.c crc32/crc32c_ppc.c PROPERTIES
COMPILE_FLAGS "${COMPILE_FLAGS} -maltivec -mvsx -mpower8-vector -mcrypto -mpower8-vector")
ADD_DEFINITIONS(-DHAVE_POWER8 -DHAS_ALTIVEC)
ENDIF()
IF(UNIX)

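A note on the x86 branch above: the -msse4.2 / -mpclmul / cpuid.h / x86intrin.h probes only establish that the toolchain can emit the instructions, so the code selected at build time still verifies the CPU at runtime through crc32_pclmul_enabled(). A hedged sketch of such a check follows; the function name below is illustrative, not the actual mysys implementation.

/* CPUID leaf 1: ECX bit 20 = SSE4.2, ECX bit 1 = PCLMULQDQ.
   GCC/clang ship the bit_* masks in <cpuid.h>. */
#include <cpuid.h>

static int have_sse42_and_pclmul(void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx & bit_SSE4_2) != 0 && (ecx & bit_PCLMUL) != 0;
}
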
View File

@@ -57,6 +57,12 @@ asm(".arch_extension crypto");
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32C3X8(buffer, ITR) \
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
@@ -73,6 +79,11 @@ asm(".arch_extension crypto");
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
#define CRC32X(crc, value) (crc) = __crc32d((crc), (value))
#define CRC32W(crc, value) (crc) = __crc32w((crc), (value))
#define CRC32H(crc, value) (crc) = __crc32h((crc), (value))
#define CRC32B(crc, value) (crc) = __crc32b((crc), (value))
#define CRC32C3X8(buffer, ITR) \
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
@@ -119,7 +130,7 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
uint32_t crc0, crc1, crc2;
int64_t length= (int64_t)len;
crc= 0xFFFFFFFFU;
crc^= 0xffffffff;
/* Pmull runtime check here.
* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
@@ -282,16 +293,16 @@ unsigned int crc32_aarch64(unsigned int crc, const void *buf, size_t len)
/* if start pointer is not 8 bytes aligned */
while ((buf1 != (const uint8_t *) buf8) && len)
{
crc= __crc32b(crc, *buf1++);
CRC32B(crc, *buf1++);
len--;
}
for (; len >= 8; len-= 8)
crc= __crc32d(crc, *buf8++);
CRC32X(crc, *buf8++);
buf1= (const uint8_t *) buf8;
while (len--)
crc= __crc32b(crc, *buf1++);
CRC32B(crc, *buf1++);
return ~crc;
}

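The crc32c_aarch64()/crc32_aarch64() routines above may only be called when the CPU implements the CRC32 extension (and PMULL for the faster CRC-32C path), which is what crc32_aarch64_available() and crc32c_aarch64_available() in my_sys.h are for. Below is a hedged sketch of such a Linux probe via the auxiliary vector; the function name and the fallback HWCAP definitions are illustrative, not the exact mysys code.

#if defined(__aarch64__) && defined(__linux__)
#include <sys/auxv.h>

#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)   /* crc32b/h/w/x and crc32cb/h/w/x available */
#endif
#ifndef HWCAP_PMULL
#define HWCAP_PMULL (1 << 4)   /* 64x64->128 polynomial multiply available */
#endif

static int have_arm_crc32(int *have_pmull)
{
  unsigned long hwcap= getauxval(AT_HWCAP);
  *have_pmull= (hwcap & HWCAP_PMULL) != 0;
  return (hwcap & HWCAP_CRC32) != 0;
}
#endif

This mirrors the distinction behind the Raspberry Pi 4 comment above (MDEV-23030): that CPU reports CRC32 but not PMULL.
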
View File

@@ -1,675 +1,5 @@
/*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead using assembly directly.
*
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#include <altivec.h>
#define POWER8_INTRINSICS
#define CRC32_FUNCTION my_checksum
#define CRC_TABLE
#ifdef CRC32_CONSTANTS_HEADER
#include CRC32_CONSTANTS_HEADER
#else
#include "crc32_constants.h"
#endif
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
#ifdef REFLECT
static unsigned int crc32_align(unsigned int crc, const unsigned char *p,
unsigned long len)
{
while (len--)
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
return crc;
}
#else
static unsigned int crc32_align(unsigned int crc, const unsigned char *p,
unsigned long len)
{
while (len--)
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8);
return crc;
}
#endif
static unsigned int __attribute__ ((aligned (32)))
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
#ifndef CRC32_FUNCTION
#define CRC32_FUNCTION crc32_vpmsum
#endif
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p,
unsigned long len)
{
unsigned int prealign;
unsigned int tail;
#ifdef CRC_XOR
crc ^= 0xffffffff;
#endif
if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
crc = crc32_align(crc, p, len);
goto out;
}
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc32_align(crc, p, prealign);
len -= prealign;
p += prealign;
}
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc32_align(crc, p, tail);
}
out:
#ifdef CRC_XOR
crc ^= 0xffffffff;
#endif
return crc;
}
#if defined (__clang__)
#include "clang_workaround.h"
#else
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b))
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0)
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1)
#endif
/* When we have a load-store in a single-dispatch group and address overlap
* such that foward is not allowed (load-hit-store) the group must be flushed.
* A group ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory")
#if defined(__BIG_ENDIAN__) && defined (REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#endif
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\
(__vector unsigned char) vc)
#if defined(__LITTLE_ENDIAN__)
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL,
0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL,
0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
static unsigned int __attribute__ ((aligned (32)))
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL,
0xffffffffffffffffUL};
#ifdef REFLECT
__vector unsigned char vsht_splat;
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero,
(__vector unsigned char)vones, 4);
#endif
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero,
(__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4,
vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int result = 0;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Align by 128 bits. The last 128 bit block will be processed at end. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
#ifdef REFLECT
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc);
#else
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL);
/* Shift into top 32 bits */
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc,
(__vector unsigned char)vzero, 4);
#endif
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value*/
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. We modulo schedule it such that it takes three
* iterations to complete - first iteration load, second
* iteration vpmsum, third iteration xor.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata0, (__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata1, (__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata3, (__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata4, (__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata5, (__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata6, (__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata7, (__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down*/
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata0, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata1, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata2, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata3, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata4, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata5, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata6, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata7, (__vector unsigned long long)vconst1);
}/* else */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
#ifdef REFLECT
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
#endif
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata0,(__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata1,(__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata2,(__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata3,(__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata4,(__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata5,(__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata6,(__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata7,(__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata0,(__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
#ifdef REFLECT
/* shift left one bit */
vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0,
vsht_splat);
#endif
v0 = vec_and(v0, vmask_64bit);
#ifndef REFLECT
/*
* Now for the actual algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
/* ma */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0,
(__vector unsigned long long)vconst1);
/* q = floor(ma/(2^64)) */
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero,
(__vector unsigned char)v1, 8);
/* qn */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
result = __builtin_unpack_vector_1 (v0);
#else
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* shift result into top 64 bits of */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
result = __builtin_unpack_vector_0 (v0);
#endif
return result;
}
#define POWER8_INTRINSICS
#include "pcc_crc32_constants.h"
#include "crc_ppc64.h"

mysys/crc32/crc32c.cc (new file, 1254 lines; diff suppressed because it is too large)
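
Since the diff of the new file is suppressed, here is an orientation-only sketch of the plain SSE4.2 loop such a CRC-32C implementation starts from. It is NOT the contents of mysys/crc32/crc32c.cc; per the commit message, the real file additionally interleaves multiple streams and folds them with PCLMULQDQ, and all names below are illustrative.

/* Straight SSE4.2 CRC-32C: 8 bytes per _mm_crc32_u64 step, with the
   conventional initial/final bit inversion. Build with -msse4.2. */
#include <nmmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_sse42_sketch(uint32_t crc, const void *buf, size_t len)
{
  const unsigned char *p= buf;
  crc= ~crc;
  for (; len >= 8; len-= 8, p+= 8)
  {
    uint64_t word;
    memcpy(&word, p, 8);               /* unaligned-safe load */
    crc= (uint32_t) _mm_crc32_u64(crc, word);
  }
  while (len--)
    crc= _mm_crc32_u8(crc, *p++);
  return ~crc;
}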

mysys/crc32/crc32c_ppc.c (new file, 5 lines)

@@ -0,0 +1,5 @@
#define CRC32_FUNCTION crc32c_ppc
#define CRC_TABLE
#define POWER8_INTRINSICS
#include "pcc_crc32c_constants.h"
#include "crc_ppc64.h"

mysys/crc32/crc32c_ppc.h (new file, 19 lines)

@@ -0,0 +1,19 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2017 International Business Machines Corp.
// All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
unsigned len);
#ifdef __cplusplus
}
#endif

mysys/crc32/crc_ppc64.h (new file, 664 lines)

@@ -0,0 +1,664 @@
/*
* Calculate the checksum of data that is 16 byte aligned and a multiple of
* 16 bytes.
*
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
* chunks in order to mask the latency of the vpmsum instructions. If we
* have more than 32 kB of data to checksum we repeat this step multiple
* times, passing in the previous 1024 bits.
*
* The next step is to reduce the 1024 bits to 64 bits. This step adds
* 32 bits of 0s to the end - this matches what a CRC does. We just
* calculate constants that land the data in this 32 bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = CRC using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* This code uses gcc vector builtins instead using assembly directly.
*
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#include <altivec.h>
#define VMX_ALIGN 16
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
#ifdef REFLECT
static unsigned int crc32_align(unsigned int crc, const unsigned char *p,
unsigned long len)
{
while (len--)
crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
return crc;
}
#else
static unsigned int crc32_align(unsigned int crc, const unsigned char *p,
unsigned long len)
{
while (len--)
crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8);
return crc;
}
#endif
static unsigned int __attribute__ ((aligned (32)))
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p,
unsigned long len)
{
unsigned int prealign;
unsigned int tail;
#ifdef CRC_XOR
crc ^= 0xffffffff;
#endif
if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
crc = crc32_align(crc, p, len);
goto out;
}
if ((unsigned long)p & VMX_ALIGN_MASK) {
prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
crc = crc32_align(crc, p, prealign);
len -= prealign;
p += prealign;
}
crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
tail = len & VMX_ALIGN_MASK;
if (tail) {
p += len & ~VMX_ALIGN_MASK;
crc = crc32_align(crc, p, tail);
}
out:
#ifdef CRC_XOR
crc ^= 0xffffffff;
#endif
return crc;
}
#if defined (__clang__)
#include "clang_workaround.h"
#else
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b))
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0)
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1)
#endif
/* When we have a load-store in a single-dispatch group and address overlap
* such that foward is not allowed (load-hit-store) the group must be flushed.
* A group ending NOP prevents the flush.
*/
#define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory")
#if defined(__BIG_ENDIAN__) && defined (REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#endif
#ifdef BYTESWAP_DATA
#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\
(__vector unsigned char) vc)
#if defined(__LITTLE_ENDIAN__)
/* Byte reverse permute constant LE. */
static const __vector unsigned long long vperm_const
__attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL,
0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const
__attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL,
0X0706050403020100UL };
#endif
#else
#define VEC_PERM(vr, va, vb, vc)
#endif
static unsigned int __attribute__ ((aligned (32)))
__crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL,
0xffffffffffffffffUL};
#ifdef REFLECT
__vector unsigned char vsht_splat;
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero,
(__vector unsigned char)vones, 4);
#endif
const __vector unsigned long long vmask_64bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero,
(__vector unsigned char)vones, 8);
__vector unsigned long long vcrc;
__vector unsigned long long vconst1, vconst2;
/* vdata0-vdata7 will contain our data (p). */
__vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4,
vdata5, vdata6, vdata7;
/* v0-v7 will contain our checksums */
__vector unsigned long long v0 = {0,0};
__vector unsigned long long v1 = {0,0};
__vector unsigned long long v2 = {0,0};
__vector unsigned long long v3 = {0,0};
__vector unsigned long long v4 = {0,0};
__vector unsigned long long v5 = {0,0};
__vector unsigned long long v6 = {0,0};
__vector unsigned long long v7 = {0,0};
/* Vector auxiliary variables. */
__vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;
unsigned int result = 0;
unsigned int offset; /* Constant table offset. */
unsigned long i; /* Counter. */
unsigned long chunks;
unsigned long block_size;
int next_block = 0;
/* Align by 128 bits. The last 128 bit block will be processed at end. */
unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;
#ifdef REFLECT
vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc);
#else
vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL);
/* Shift into top 32 bits */
vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc,
(__vector unsigned char)vzero, 4);
#endif
/* Short version. */
if (len < 256) {
/* Calculate where in the constant table we need to start. */
offset = 256 - len;
vconst1 = vec_ld(offset, vcrc_short_const);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
/* xor initial value*/
vdata0 = vec_xor(vdata0, vcrc);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
for (i = 16; i < len; i += 16) {
vconst1 = vec_ld(offset + i, vcrc_short_const);
vdata0 = vec_ld(i, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw
((__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
v0 = vec_xor(v0, vdata0);
}
} else {
/* Load initial values. */
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
/* xor in initial value */
vdata0 = vec_xor(vdata0, vcrc);
p = (char *)p + 128;
do {
/* Checksum in blocks of MAX_SIZE. */
block_size = length;
if (block_size > MAX_SIZE) {
block_size = MAX_SIZE;
}
length = length - block_size;
/*
* Work out the offset into the constants table to start at. Each
* constant is 16 bytes, and it is used against 128 bytes of input
* data - 128 / 16 = 8
*/
offset = (MAX_SIZE/8) - (block_size/8);
/* We reduce our final 128 bytes in a separate step */
chunks = (block_size/128)-1;
vconst1 = vec_ld(offset, vcrc_const);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0,
(__vector unsigned long long)vconst1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1,
(__vector unsigned long long)vconst1);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2,
(__vector unsigned long long)vconst1);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3,
(__vector unsigned long long)vconst1);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4,
(__vector unsigned long long)vconst1);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5,
(__vector unsigned long long)vconst1);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6,
(__vector unsigned long long)vconst1);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7,
(__vector unsigned long long)vconst1);
if (chunks > 1) {
offset += 16;
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
/*
* main loop. We modulo schedule it such that it takes three
* iterations to complete - first iteration load, second
* iteration vpmsum, third iteration xor.
*/
for (i = 0; i < chunks-2; i++) {
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
GROUP_ENDING_NOP;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata0, (__vector unsigned long long)vconst2);
vdata0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata1, (__vector unsigned long long)vconst2);
vdata1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(vdata1, vdata1, vdata1, vperm_const);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata2, (__vector unsigned long long)vconst2);
vdata2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata3, (__vector unsigned long long)vconst2);
vdata3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(vdata3, vdata3, vdata3, vperm_const);
vconst2 = vec_ld(offset, vcrc_const);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata4, (__vector unsigned long long)vconst1);
vdata4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata5, (__vector unsigned long long)vconst1);
vdata5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(vdata5, vdata5, vdata5, vperm_const);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata6, (__vector unsigned long long)vconst1);
vdata6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata7, (__vector unsigned long long)vconst1);
vdata7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
p = (char *)p + 128;
}
/* First cool down*/
vconst1 = vec_ld(offset, vcrc_const);
offset += 16;
v0 = vec_xor(v0, va0);
va0 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata0, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v1 = vec_xor(v1, va1);
va1 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata1, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v2 = vec_xor(v2, va2);
va2 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata2, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v3 = vec_xor(v3, va3);
va3 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata3, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v4 = vec_xor(v4, va4);
va4 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata4, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v5 = vec_xor(v5, va5);
va5 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata5, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v6 = vec_xor(v6, va6);
va6 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata6, (__vector unsigned long long)vconst1);
GROUP_ENDING_NOP;
v7 = vec_xor(v7, va7);
va7 = __builtin_crypto_vpmsumd ((__vector unsigned long
long)vdata7, (__vector unsigned long long)vconst1);
}/* else */
/* Second cool down. */
v0 = vec_xor(v0, va0);
v1 = vec_xor(v1, va1);
v2 = vec_xor(v2, va2);
v3 = vec_xor(v3, va3);
v4 = vec_xor(v4, va4);
v5 = vec_xor(v5, va5);
v6 = vec_xor(v6, va6);
v7 = vec_xor(v7, va7);
#ifdef REFLECT
/*
* vpmsumd produces a 96 bit result in the least significant bits
* of the register. Since we are bit reflected we have to shift it
* left 32 bits so it occupies the least significant bits in the
* bit reflected domain.
*/
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
(__vector unsigned char)vzero, 4);
v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
(__vector unsigned char)vzero, 4);
v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
(__vector unsigned char)vzero, 4);
v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
(__vector unsigned char)vzero, 4);
v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
(__vector unsigned char)vzero, 4);
v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
(__vector unsigned char)vzero, 4);
v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
(__vector unsigned char)vzero, 4);
#endif
/* xor with the last 1024 bits. */
va0 = vec_ld(0, (__vector unsigned long long*) p);
VEC_PERM(va0, va0, va0, vperm_const);
va1 = vec_ld(16, (__vector unsigned long long*) p);
VEC_PERM(va1, va1, va1, vperm_const);
va2 = vec_ld(32, (__vector unsigned long long*) p);
VEC_PERM(va2, va2, va2, vperm_const);
va3 = vec_ld(48, (__vector unsigned long long*) p);
VEC_PERM(va3, va3, va3, vperm_const);
va4 = vec_ld(64, (__vector unsigned long long*) p);
VEC_PERM(va4, va4, va4, vperm_const);
va5 = vec_ld(80, (__vector unsigned long long*) p);
VEC_PERM(va5, va5, va5, vperm_const);
va6 = vec_ld(96, (__vector unsigned long long*) p);
VEC_PERM(va6, va6, va6, vperm_const);
va7 = vec_ld(112, (__vector unsigned long long*) p);
VEC_PERM(va7, va7, va7, vperm_const);
p = (char *)p + 128;
vdata0 = vec_xor(v0, va0);
vdata1 = vec_xor(v1, va1);
vdata2 = vec_xor(v2, va2);
vdata3 = vec_xor(v3, va3);
vdata4 = vec_xor(v4, va4);
vdata5 = vec_xor(v5, va5);
vdata6 = vec_xor(v6, va6);
vdata7 = vec_xor(v7, va7);
/* Check if we have more blocks to process */
next_block = 0;
if (length != 0) {
next_block = 1;
/* zero v0-v7 */
v0 = vec_xor(v0, v0);
v1 = vec_xor(v1, v1);
v2 = vec_xor(v2, v2);
v3 = vec_xor(v3, v3);
v4 = vec_xor(v4, v4);
v5 = vec_xor(v5, v5);
v6 = vec_xor(v6, v6);
v7 = vec_xor(v7, v7);
}
length = length + 128;
} while (next_block);
/* Calculate how many bytes we have left. */
length = (len & 127);
/* Calculate where in (short) constant table we need to start. */
offset = 128 - length;
v0 = vec_ld(offset, vcrc_short_const);
v1 = vec_ld(offset + 16, vcrc_short_const);
v2 = vec_ld(offset + 32, vcrc_short_const);
v3 = vec_ld(offset + 48, vcrc_short_const);
v4 = vec_ld(offset + 64, vcrc_short_const);
v5 = vec_ld(offset + 80, vcrc_short_const);
v6 = vec_ld(offset + 96, vcrc_short_const);
v7 = vec_ld(offset + 112, vcrc_short_const);
offset += 128;
v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata0,(__vector unsigned int)v0);
v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata1,(__vector unsigned int)v1);
v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata2,(__vector unsigned int)v2);
v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata3,(__vector unsigned int)v3);
v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata4,(__vector unsigned int)v4);
v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata5,(__vector unsigned int)v5);
v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata6,(__vector unsigned int)v6);
v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata7,(__vector unsigned int)v7);
/* Now reduce the tail (0-112 bytes). */
for (i = 0; i < length; i+=16) {
vdata0 = vec_ld(i,(__vector unsigned long long*)p);
VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
va0 = vec_ld(offset + i,vcrc_short_const);
va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw (
(__vector unsigned int)vdata0,(__vector unsigned int)va0);
v0 = vec_xor(v0, va0);
}
/* xor all parallel chunks together. */
v0 = vec_xor(v0, v1);
v2 = vec_xor(v2, v3);
v4 = vec_xor(v4, v5);
v6 = vec_xor(v6, v7);
v0 = vec_xor(v0, v2);
v4 = vec_xor(v4, v6);
v0 = vec_xor(v0, v4);
}
/* Barrett Reduction */
vconst1 = vec_ld(0, v_Barrett_const);
vconst2 = vec_ld(16, v_Barrett_const);
v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)v0, 8);
v0 = vec_xor(v1,v0);
#ifdef REFLECT
/* shift left one bit */
vsht_splat = vec_splat_u8 (1);
v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0,
vsht_splat);
#endif
v0 = vec_and(v0, vmask_64bit);
#ifndef REFLECT
/*
* Now for the actual algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
/* ma */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0,
(__vector unsigned long long)vconst1);
/* q = floor(ma/(2^64)) */
v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero,
(__vector unsigned char)v1, 8);
/* qn */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
result = __builtin_unpack_vector_1 (v0);
#else
/*
* The reflected version of Barrett reduction. Instead of bit
* reflecting our data (which is expensive to do), we bit reflect our
* constants and our algorithm, which means the intermediate data in
* our vector registers goes from 0-63 instead of 63-0. We can reflect
* the algorithm because we don't carry in mod 2 arithmetic.
*/
/* bottom 32 bits of a */
v1 = vec_and(v0, vmask_32bit);
/* ma */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst1);
/* bottom 32bits of ma */
v1 = vec_and(v1, vmask_32bit);
/* qn */
v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (v0, v1);
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in
* the high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
/* shift result into top 64 bits of */
v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
result = __builtin_unpack_vector_0 (v0);
#endif
return result;
}

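As a reader aid for the Barrett step above (the "ma", "q = floor(ma/(2^64))" and "qn" comments), the non-reflected path evaluates, entirely in carry-less GF(2) arithmetic with a precomputed constant \mu = \lfloor x^{64}/n \rfloor for the CRC polynomial n (roughly, vconst1 holds \mu and vconst2 holds n from v_Barrett_const):

  q = \left\lfloor \frac{a\,\mu}{x^{64}} \right\rfloor, \qquad a \bmod n = a \oplus (q \cdot n)

That is, q n is the multiple of n to subtract, and subtraction in GF(2) is XOR. The reflected (REFLECT) variant applies the same identity with bit-reversed constants, which is why it masks to the low 32 bits instead of shifting by 64.
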
View File

@@ -18,40 +18,46 @@
#include <my_sys.h>
#include <zlib.h>
#if !defined(HAVE_CRC32_VPMSUM)
/* TODO: remove this once zlib adds inherent support for hardware accelerated
crc32 for all architectures. */
static unsigned int my_crc32_zlib(unsigned int crc, const void *data,
size_t len)
{
return (unsigned int) crc32(crc, data, (unsigned int) len);
return (unsigned int) crc32(crc, (const Bytef *)data, (unsigned int) len);
}
my_crc32_t my_checksum= my_crc32_zlib;
#endif
#ifdef HAVE_CLMUL_INSTRUCTION
extern int crc32_pclmul_enabled();
extern unsigned int crc32_pclmul(unsigned int, const void *, size_t);
/*----------------------------- x86_64 ---------------------------------*/
void my_checksum_init(void)
{
if (crc32_pclmul_enabled())
my_checksum= crc32_pclmul;
}
#ifdef HAVE_PCLMUL
extern "C" int crc32_pclmul_enabled();
extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t);
#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
/*----------------------------- aarch64 --------------------------------*/
extern unsigned int crc32_aarch64(unsigned int, const void *, size_t);
/* Ideally all ARM 64 bit processor should support crc32 but if some model
doesn't support better to find it out through auxillary vector. */
void my_checksum_init(void)
{
if (crc32_aarch64_available())
my_checksum= crc32_aarch64;
}
#else
void my_checksum_init(void) {}
extern "C" int crc32_aarch64_available();
extern "C" unsigned int crc32_aarch64(unsigned int, const void *, size_t);
#endif
typedef unsigned int (*my_crc32_t)(unsigned int, const void *, size_t);
static my_crc32_t init_crc32()
{
my_crc32_t func= my_crc32_zlib;
#ifdef HAVE_PCLMUL
if (crc32_pclmul_enabled())
func = crc32_pclmul;
#elif defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
if (crc32_aarch64_available())
func= crc32_aarch64;
#endif
return func;
}
static const my_crc32_t my_checksum_func= init_crc32();
#ifndef __powerpc64__
/* For powerpc, my_checksum is defined elsewhere.*/
extern "C" unsigned int my_checksum(unsigned int crc, const void *data, size_t len)
{
return my_checksum_func(crc, data, len);
}
#endif

View File

@@ -100,9 +100,6 @@ my_bool my_init(void)
/* Initialize our mutex handling */
my_mutex_init();
/* Initialize CPU architecture specific hardware based crc32 optimization */
my_checksum_init();
if (my_thread_global_init())
return 1;

View File

@@ -264,7 +264,6 @@ SET(INNOBASE_SOURCES
include/ut0byte.h
include/ut0byte.ic
include/ut0counter.h
include/ut0crc32.h
include/ut0dbg.h
include/ut0list.h
include/ut0list.ic
@@ -340,7 +339,6 @@ SET(INNOBASE_SOURCES
trx/trx0sys.cc
trx/trx0trx.cc
trx/trx0undo.cc
ut/ut0crc32.cc
ut/ut0dbg.cc
ut/ut0list.cc
ut/ut0mem.cc

View File

@@ -28,33 +28,10 @@ Created Aug 10, 2011 Vasil Dimov
#define ut0crc32_h
#include "univ.i"
/********************************************************************//**
Initializes the data structures used by ut_crc32*(). Does not do any
allocations, would not hurt if called twice, but would be pointless. */
void ut_crc32_init();
/** Append data to a CRC-32C checksum.
@param crc current checksum
@param s data to append to the checksum
@param size data length in bytes
@return CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41,
or 0x1EDC6F41 without the highest degree term */
typedef uint32_t (*ut_crc32_func_t)(uint32_t crc, const byte *s, size_t size);
/** Pointer to CRC32 calculation function. */
extern ut_crc32_func_t ut_crc32_low;
/** Text description of CRC32 implementation */
extern const char* ut_crc32_implementation;
/** Compute CRC-32C over a string of bytes.
@param s data
@param len data length in bytes
@return the CRC-32C of the data */
#include <my_sys.h>
static inline uint32_t ut_crc32(const byte *s, size_t size)
{
return ut_crc32_low(0, s, size);
return my_crc32c(0, s, size);
}
#endif /* ut0crc32_h */

View File

@@ -209,16 +209,6 @@ IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
PROPERTIES COMPILE_FLAGS -xO3)
ENDIF()
# Avoid generating Hardware Capabilities due to crc32 instructions
IF(CMAKE_SYSTEM_NAME MATCHES "SunOS" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i386")
MY_CHECK_CXX_COMPILER_FLAG("-Wa,-nH")
IF(have_CXX__Wa__nH)
ADD_COMPILE_FLAGS(
ut/ut0crc32.cc
COMPILE_FLAGS "-Wa,-nH"
)
ENDIF()
ENDIF()
IF(MSVC)
# Avoid "unreferenced label" warning in generated file

View File

@@ -762,7 +762,6 @@ static void srv_init()
/* Initialize some INFORMATION SCHEMA internal structures */
trx_i_s_cache_init(trx_i_s_cache);
ut_crc32_init();
}
/*********************************************************************//**

View File

@@ -1223,7 +1223,7 @@ dberr_t srv_start(bool create_new_db)
srv_boot();
ib::info() << ut_crc32_implementation;
ib::info() << my_crc32c_implementation();
if (!srv_read_only_mode) {

View File

@@ -1,346 +0,0 @@
/*****************************************************************************
Copyright (c) 2009, 2010 Facebook, Inc. All Rights Reserved.
Copyright (c) 2011, 2015, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2016, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*****************************************************************************/
/***************************************************************//**
@file ut/ut0crc32.cc
CRC32 implementation from Facebook, based on the zlib implementation.
Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and
mysys/my_perf.c, contributed by Facebook under the following license.
********************************************************************/
/* Copyright (C) 2009-2010 Facebook, Inc. All Rights Reserved.
Dual licensed under BSD license and GPLv2.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
/* The below CRC32 implementation is based on the implementation included with
* zlib with modifications to process 8 bytes at a time and using SSE 4.2
* extensions when available. The polynomial constant has been changed to
* match the one used by SSE 4.2 and does not return the same value as the
* version used by zlib. The original zlib copyright notice follows. */
/* crc32.c -- compute the CRC-32 of a buf stream
* Copyright (C) 1995-2005 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
* CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing
* tables for updating the shift register in one step with three exclusive-ors
* instead of four steps with four exclusive-ors. This results in about a
* factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
*/
// First include (the generated) my_config.h, to get correct platform defines.
#include "my_config.h"
#include <string.h>
#include "ut0crc32.h"
#include "my_valgrind.h"
#ifdef HAVE_CPUID_INSTRUCTION
# ifdef _MSC_VER
# include <intrin.h>
# else
# include <cpuid.h>
# if defined __GNUC__ && !defined __clang__ && __GNUC__ < 5
/* <nmmintrin.h> does not really work in GCC before version 5 */
# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data)
# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data)
# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data)
# else
# include <nmmintrin.h>
# endif
# endif
#endif
/* CRC32 hardware implementation. */
#ifdef HAVE_CRC32_VPMSUM
extern "C"
unsigned int crc32c_vpmsum(unsigned int crc, const unsigned char *p, unsigned long len);
ut_crc32_func_t ut_crc32_low= crc32c_vpmsum;
const char* ut_crc32_implementation = "Using POWER8 crc32 instructions";
#else
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
extern "C" {
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
};
# elif defined HAVE_CPUID_INSTRUCTION
/** return whether SSE4.2 instructions are available */
static inline bool has_sse4_2()
{
/* We assume that the CPUID instruction and its parameter 1 are available.
We do not support any precursors of the Intel 80486. */
# ifdef _MSC_VER
int data[4];
__cpuid(data, 1);
return !!(data[2] & 1 << 20);
# else
uint32_t reax = 0, rebx = 0, recx = 0, redx = 0;
__cpuid(1, reax, rebx, recx, redx);
return !!(recx & 1 << 20);
# endif
}
/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc CRC-32C checksum so far
@param data data to be checksummed
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_8(ulint crc, byte data)
{
return _mm_crc32_u8(static_cast<uint32_t>(crc), data);
}
/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc CRC-32C checksum so far
@param[in] data 8 bytes of aligned data
@return the updated CRC-32C */
__attribute__((target("sse4.2")))
static inline ulint ut_crc32c_64(ulint crc, uint64_t data)
{
# if SIZEOF_SIZE_T > 4
return _mm_crc32_u64(crc, data);
# else
crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data));
crc= _mm_crc32_u32(crc, static_cast<uint32_t>(data >> 32));
return crc;
# endif
}
/** Calculate CRC-32C using dedicated IA-32 or AMD64 instructions
@param crc current checksum
@param buf data to append to the checksum
@param len data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_hw(uint32_t crc, const byte *buf, size_t len)
{
ulint c= static_cast<uint32_t>(~crc);
/* Calculate byte-by-byte up to an 8-byte aligned address. After
this consume the input 8-bytes at a time. */
while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
{
c= ut_crc32c_8(c, *buf++);
len--;
}
const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);
for (; len >= 128; len-= 128)
{
/* This call is repeated 16 times. 16 * 8 = 128. */
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
c= ut_crc32c_64(c, *b64++);
}
for (; len >= 8; len-= 8)
c= ut_crc32c_64(c, *b64++);
buf= reinterpret_cast<const byte*>(b64);
while (len--)
c= ut_crc32c_8(c, *buf++);
return ~static_cast<uint32_t>(c);
}
# endif /* (defined(__GNUC__) && defined(__i386__)) || _MSC_VER */
/* CRC32 software implementation. */
/* Precalculated table used to generate the CRC32 if the CPU does not
have support for it */
static uint32_t ut_crc32_slice8_table[8][256];
/********************************************************************//**
Initializes the table that is used to generate the CRC32 if the CPU does
not have support for it. */
static
void
ut_crc32_slice8_table_init()
/*========================*/
{
/* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
static const uint32_t poly = 0x82f63b78;
uint32_t n;
uint32_t k;
uint32_t c;
for (n = 0; n < 256; n++) {
c = n;
for (k = 0; k < 8; k++) {
c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
}
ut_crc32_slice8_table[0][n] = c;
}
for (n = 0; n < 256; n++) {
c = ut_crc32_slice8_table[0][n];
for (k = 1; k < 8; k++) {
c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
ut_crc32_slice8_table[k][n] = c;
}
}
}
/** Append 8 bits (1 byte) to a CRC-32C checksum.
@param crc CRC-32C checksum so far
@param data data to be checksummed
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_8_sw(uint32_t crc, byte data)
{
const uint8_t i= (crc ^ data) & 0xFF;
return (crc >> 8) ^ ut_crc32_slice8_table[0][i];
}
/** Append 64 bits (8 aligned bytes) to a CRC-32C checksum
@param[in] crc CRC-32C checksum so far
@param[in] data 8 bytes of aligned data
@return the updated CRC-32C */
static inline uint32_t ut_crc32c_64_sw(uint32_t crc, uint64_t data)
{
# ifdef WORDS_BIGENDIAN
data= data << 56 |
(data & 0x000000000000FF00ULL) << 40 |
(data & 0x0000000000FF0000ULL) << 24 |
(data & 0x00000000FF000000ULL) << 8 |
(data & 0x000000FF00000000ULL) >> 8 |
(data & 0x0000FF0000000000ULL) >> 24 |
(data & 0x00FF000000000000ULL) >> 40 |
data >> 56;
# endif /* WORDS_BIGENDIAN */
data^= crc;
return
ut_crc32_slice8_table[7][(data ) & 0xFF] ^
ut_crc32_slice8_table[6][(data >> 8) & 0xFF] ^
ut_crc32_slice8_table[5][(data >> 16) & 0xFF] ^
ut_crc32_slice8_table[4][(data >> 24) & 0xFF] ^
ut_crc32_slice8_table[3][(data >> 32) & 0xFF] ^
ut_crc32_slice8_table[2][(data >> 40) & 0xFF] ^
ut_crc32_slice8_table[1][(data >> 48) & 0xFF] ^
ut_crc32_slice8_table[0][(data >> 56)];
}
/** Calculate CRC-32C using a look-up table.
@param crc current checksum
@param buf data to append to the checksum
@param len data length in bytes
@return CRC-32C (polynomial 0x11EDC6F41) */
uint32_t ut_crc32_sw(uint32_t crc, const byte *buf, size_t len)
{
crc= ~crc;
/* Calculate byte-by-byte up to an 8-byte aligned address. After
this consume the input 8-bytes at a time. */
while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0)
{
crc= ut_crc32c_8_sw(crc, *buf++);
len--;
}
const uint64_t* b64= reinterpret_cast<const uint64_t*>(buf);
for (; len >= 8; len-= 8)
crc= ut_crc32c_64_sw(crc, *b64++);
buf= reinterpret_cast<const byte*>(b64);
while (len--)
crc= ut_crc32c_8_sw(crc, *buf++);
return ~crc;
}
ut_crc32_func_t ut_crc32_low= ut_crc32_sw;
const char *ut_crc32_implementation= "Using generic crc32 instructions";
#endif
/********************************************************************//**
Initializes the data structures used by ut_crc32*(). Does not do any
allocations, would not hurt if called twice, but would be pointless. */
void ut_crc32_init()
{
#ifndef HAVE_CRC32_VPMSUM
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
if (const char *crc32c_implementation= crc32c_aarch64_available())
{
ut_crc32_low= crc32c_aarch64;
ut_crc32_implementation= crc32c_implementation;
return;
}
# elif defined HAVE_CPUID_INSTRUCTION
if (has_sse4_2())
{
ut_crc32_low= ut_crc32_hw;
ut_crc32_implementation= "Using SSE4.2 crc32 instructions";
return;
}
# endif
ut_crc32_slice8_table_init();
#endif /* !HAVE_CRC32_VPMSUM */
}
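The file removed above follows a dispatch pattern worth noting: a function pointer starts out at the portable slice-by-8 routine and is redirected once at startup if a CPU capability probe (on x86, CPUID leaf 1, ECX bit 20 for SSE4.2) succeeds. The following is a minimal self-contained sketch of just that mechanism, with hypothetical names and placeholder bodies standing in for the real checksum routines.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef uint32_t (*crc32c_func_t)(uint32_t crc, const void *buf, size_t len);

/* Placeholder bodies; the real ones are the slice-by-8 table routine and an
   SSE4.2/ARMv8/POWER8 routine. Here they only demonstrate the dispatch. */
static uint32_t crc32c_generic(uint32_t crc, const void *buf, size_t len)
{ (void) buf; (void) len; return crc; }
static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
{ (void) buf; (void) len; return crc; }

/* Placeholder capability probe; the real one inspects CPUID (or the
   platform equivalent) and returns nonzero when the fast path is usable. */
static int have_hw_crc32c(void) { return 0; }

static crc32c_func_t crc32c_impl= crc32c_generic;
static const char *crc32c_impl_name= "Using generic crc32 instructions";

static void crc32c_init(void)
{
  if (have_hw_crc32c())
  {
    crc32c_impl= crc32c_hw;
    crc32c_impl_name= "Using SSE4.2 crc32 instructions";
  }
}

int main(void)
{
  crc32c_init();
  printf("%s\n", crc32c_impl_name);
  return (int) crc32c_impl(0, "", 0);
}

The unified mysys code keeps the same idea behind my_crc32c(), and my_crc32c_implementation() reports which path was selected.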

View File

@@ -15,11 +15,11 @@
MY_ADD_TESTS(bitmap base64 my_atomic my_rdtsc lf my_malloc my_getopt dynstring
byte_order
queues stacktrace LINK_LIBRARIES mysys)
queues stacktrace crc32 LINK_LIBRARIES mysys)
MY_ADD_TESTS(my_vsnprintf LINK_LIBRARIES strings mysys)
MY_ADD_TESTS(aes LINK_LIBRARIES mysys mysys_ssl)
ADD_DEFINITIONS(${SSL_DEFINES})
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
MY_ADD_TESTS(ma_dyncol LINK_LIBRARIES mysys)
IF(WIN32)

69
unittest/mysys/crc32-t.c Normal file
View File

@@ -0,0 +1,69 @@
/* Copyright (c) MariaDB 2020
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
#include <my_global.h>
#include <my_sys.h>
#include <my_crypt.h>
#include <tap.h>
#include <string.h>
#include <ctype.h>
#include <zlib.h>
/*
Check that the optimized crc32 (IEEE, or Ethernet polynomial) returns the same
result as zlib (not as well optimized yet, but trustworthy)
*/
#define DO_TEST_CRC32(crc,str) \
ok(crc32(crc,(const Bytef *)str,(uint)(sizeof(str)-1)) == my_checksum(crc, str, sizeof(str)-1), "crc32 '%s'",str)
/* Check that CRC-32C calculation returns the correct result */
#define DO_TEST_CRC32C(crc,str,expected) \
do { \
unsigned int v = my_crc32c(crc, str, sizeof(str)-1); \
printf("crc32(%u,'%s',%zu)=%u\n",crc,str,sizeof(str)-1,v); \
ok(expected == my_crc32c(crc, str, sizeof(str)-1),"crc32c '%s'",str); \
}while(0)
#define LONG_STR "1234567890234568900212345678901231213123321212123123123123123"\
"............................................................................." \
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy" \
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
int main(int argc __attribute__((unused)),char *argv[])
{
MY_INIT(argv[0]);
plan(14);
printf("%s\n",my_crc32c_implementation());
DO_TEST_CRC32(0,"");
DO_TEST_CRC32(1,"");
DO_TEST_CRC32(0,"12345");
DO_TEST_CRC32(1,"12345");
DO_TEST_CRC32(0,"1234567890123456789");
DO_TEST_CRC32(0, LONG_STR);
ok(0 == my_checksum(0, NULL, 0) , "crc32 data = NULL, length = 0");
DO_TEST_CRC32C(0,"", 0);
DO_TEST_CRC32C(1,"", 1);
DO_TEST_CRC32C(0, "12345", 416359221);
DO_TEST_CRC32C(1, "12345", 549473433);
DO_TEST_CRC32C(0, "1234567890123456789", 2366987449);
DO_TEST_CRC32C(0, LONG_STR, 3009234172);
ok(0 == my_crc32c(0, NULL, 0), "crc32c data = NULL, length = 0");
my_end(0);
return exit_status();
}
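For cross-checking the CRC-32C constants above independently of mysys, a bit-by-bit reference using the same reflected polynomial 0x82F63B78 (the bit-reversed form of 0x1EDC6F41) reproduces them. This is an illustrative sketch, not part of the commit; it mirrors the initialization, byte loop and final inversion of the removed ut_crc32_sw().

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Bit-by-bit CRC-32C: invert the running value, process one bit at a time
   with the reflected Castagnoli polynomial, invert again at the end. */
static uint32_t crc32c_ref(uint32_t crc, const void *buf, size_t len)
{
  const unsigned char *p= (const unsigned char *) buf;
  crc= ~crc;
  while (len--)
  {
    int k;
    crc^= *p++;
    for (k= 0; k < 8; k++)
      crc= (crc & 1) ? (crc >> 1) ^ 0x82F63B78 : crc >> 1;
  }
  return ~crc;
}

int main(void)
{
  const char *s= "12345";
  const char *t= "1234567890123456789";
  printf("%u\n", crc32c_ref(0, s, strlen(s))); /* test expects 416359221 */
  printf("%u\n", crc32c_ref(1, s, strlen(s))); /* test expects 549473433 */
  printf("%u\n", crc32c_ref(0, t, strlen(t))); /* test expects 2366987449 */
  return 0;
}

Because the slice-by-8 tables are generated from exactly this bit loop, the table-driven and hardware implementations must agree with it byte for byte.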