mirror of
https://github.com/Mbed-TLS/mbedtls.git
synced 2025-07-30 22:43:08 +03:00
Merge pull request #7624 from daverodgman/aes-perf
AES perf improvements
This commit is contained in:
4
ChangeLog.d/aes-perf.txt
Normal file
4
ChangeLog.d/aes-perf.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
Features
|
||||||
|
* AES performance improvements on 64-bit architectures. Uplift
|
||||||
|
varies by platform, toolchain, optimisation flags and mode,
|
||||||
|
in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
|
@ -1039,6 +1039,24 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if defined(MBEDTLS_CIPHER_MODE_CBC)
|
#if defined(MBEDTLS_CIPHER_MODE_CBC)
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
||||||
|
/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
|
||||||
|
* the result for the next block in CBC, and the cost of transferring that data from
|
||||||
|
* NEON registers, it is faster to use the following on aarch64.
|
||||||
|
* For 32-bit arm, NEON should be faster. */
|
||||||
|
#define CBC_XOR_16(r, a, b) do { \
|
||||||
|
mbedtls_put_unaligned_uint64(r, \
|
||||||
|
mbedtls_get_unaligned_uint64(a) ^ \
|
||||||
|
mbedtls_get_unaligned_uint64(b)); \
|
||||||
|
mbedtls_put_unaligned_uint64(r + 8, \
|
||||||
|
mbedtls_get_unaligned_uint64(a + 8) ^ \
|
||||||
|
mbedtls_get_unaligned_uint64(b + 8)); \
|
||||||
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* AES-CBC buffer encryption/decryption
|
* AES-CBC buffer encryption/decryption
|
||||||
*/
|
*/
|
||||||
@ -1072,6 +1090,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
const unsigned char *ivp = iv;
|
||||||
|
|
||||||
if (mode == MBEDTLS_AES_DECRYPT) {
|
if (mode == MBEDTLS_AES_DECRYPT) {
|
||||||
while (length > 0) {
|
while (length > 0) {
|
||||||
memcpy(temp, input, 16);
|
memcpy(temp, input, 16);
|
||||||
@ -1079,8 +1099,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
|
|||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
CBC_XOR_16(output, output, iv);
|
||||||
mbedtls_xor(output, output, iv, 16);
|
|
||||||
|
|
||||||
memcpy(iv, temp, 16);
|
memcpy(iv, temp, 16);
|
||||||
|
|
||||||
@ -1090,18 +1109,19 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
while (length > 0) {
|
while (length > 0) {
|
||||||
mbedtls_xor(output, input, iv, 16);
|
CBC_XOR_16(output, input, ivp);
|
||||||
|
|
||||||
ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
|
ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
memcpy(iv, output, 16);
|
ivp = output;
|
||||||
|
|
||||||
input += 16;
|
input += 16;
|
||||||
output += 16;
|
output += 16;
|
||||||
length -= 16;
|
length -= 16;
|
||||||
}
|
}
|
||||||
|
memcpy(iv, ivp, 16);
|
||||||
}
|
}
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
||||||
@ -1176,7 +1196,7 @@ int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (blocks--) {
|
while (blocks--) {
|
||||||
if (leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0) {
|
if (MBEDTLS_UNLIKELY(leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0)) {
|
||||||
/* We are on the last block in a decrypt operation that has
|
/* We are on the last block in a decrypt operation that has
|
||||||
* leftover bytes, so we need to use the next tweak for this block,
|
* leftover bytes, so we need to use the next tweak for this block,
|
||||||
* and this tweak for the leftover bytes. Save the current tweak for
|
* and this tweak for the leftover bytes. Save the current tweak for
|
||||||
|
@ -31,6 +31,10 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON)
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif /* __ARM_NEON */
|
||||||
|
|
||||||
/** Helper to define a function as static except when building invasive tests.
|
/** Helper to define a function as static except when building invasive tests.
|
||||||
*
|
*
|
||||||
* If a function is only used inside its own source file and should be
|
* If a function is only used inside its own source file and should be
|
||||||
@ -125,10 +129,25 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
|
|||||||
{
|
{
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
|
#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
|
||||||
|
#if defined(__ARM_NEON)
|
||||||
|
for (; (i + 16) <= n; i += 16) {
|
||||||
|
uint8x16_t v1 = vld1q_u8(a + i);
|
||||||
|
uint8x16_t v2 = vld1q_u8(b + i);
|
||||||
|
uint8x16_t x = veorq_u8(v1, v2);
|
||||||
|
vst1q_u8(r + i, x);
|
||||||
|
}
|
||||||
|
#elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
|
||||||
|
/* This codepath probably only makes sense on architectures with 64-bit registers */
|
||||||
|
for (; (i + 8) <= n; i += 8) {
|
||||||
|
uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
|
||||||
|
mbedtls_put_unaligned_uint64(r + i, x);
|
||||||
|
}
|
||||||
|
#else
|
||||||
for (; (i + 4) <= n; i += 4) {
|
for (; (i + 4) <= n; i += 4) {
|
||||||
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
|
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
|
||||||
mbedtls_put_unaligned_uint32(r + i, x);
|
mbedtls_put_unaligned_uint32(r + i, x);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
for (; i < n; i++) {
|
for (; i < n; i++) {
|
||||||
r[i] = a[i] ^ b[i];
|
r[i] = a[i] ^ b[i];
|
||||||
@ -164,4 +183,16 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
|
|||||||
#define MBEDTLS_STATIC_ASSERT(expr, msg)
|
#define MBEDTLS_STATIC_ASSERT(expr, msg)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Define compiler branch hints */
|
||||||
|
#if defined(__has_builtin)
|
||||||
|
#if __has_builtin(__builtin_expect)
|
||||||
|
#define MBEDTLS_LIKELY(x) __builtin_expect((x), 1)
|
||||||
|
#define MBEDTLS_UNLIKELY(x) __builtin_expect((x), 0)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#if !defined(MBEDTLS_LIKELY)
|
||||||
|
#define MBEDTLS_LIKELY(x) x
|
||||||
|
#define MBEDTLS_UNLIKELY(x) x
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* MBEDTLS_LIBRARY_COMMON_H */
|
#endif /* MBEDTLS_LIBRARY_COMMON_H */
|
||||||
|
@ -18,3 +18,45 @@ mbedtls_xor:8
|
|||||||
|
|
||||||
Block xor, length 16
|
Block xor, length 16
|
||||||
mbedtls_xor:16
|
mbedtls_xor:16
|
||||||
|
|
||||||
|
Block xor, length 64
|
||||||
|
mbedtls_xor:64
|
||||||
|
|
||||||
|
Block xor, length 256
|
||||||
|
mbedtls_xor:256
|
||||||
|
|
||||||
|
Block xor, length 257
|
||||||
|
mbedtls_xor:257
|
||||||
|
|
||||||
|
Block xor, length 16+8
|
||||||
|
mbedtls_xor:24
|
||||||
|
|
||||||
|
Block xor, length 16+8+4
|
||||||
|
mbedtls_xor:28
|
||||||
|
|
||||||
|
Block xor, length 16+8+4+1
|
||||||
|
mbedtls_xor:29
|
||||||
|
|
||||||
|
Block xor, length 16+8+1
|
||||||
|
mbedtls_xor:25
|
||||||
|
|
||||||
|
Block xor, length 16+4
|
||||||
|
mbedtls_xor:20
|
||||||
|
|
||||||
|
Block xor, length 16+4+1
|
||||||
|
mbedtls_xor:21
|
||||||
|
|
||||||
|
Block xor, length 16+1
|
||||||
|
mbedtls_xor:17
|
||||||
|
|
||||||
|
Block xor, length 8+4
|
||||||
|
mbedtls_xor:12
|
||||||
|
|
||||||
|
Block xor, length 8+4+1
|
||||||
|
mbedtls_xor:13
|
||||||
|
|
||||||
|
Block xor, length 8+1
|
||||||
|
mbedtls_xor:9
|
||||||
|
|
||||||
|
Block xor, length 4+1
|
||||||
|
mbedtls_xor:5
|
||||||
|
Reference in New Issue
Block a user