diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt
new file mode 100644
index 0000000000..ca2ced92ed
--- /dev/null
+++ b/ChangeLog.d/aes-perf.txt
@@ -0,0 +1,4 @@
+Features
+   * AES performance improvements on 64-bit architectures. Uplift
+     varies by platform, toolchain, optimisation flags and mode,
+     in the 0-84% range. Aarch64, gcc and GCM/XTS benefit the most.
diff --git a/library/aes.c b/library/aes.c
index 3efe9305c2..0a61d1b070 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -1039,6 +1039,24 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 }
 
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+/* Avoid the NEON implementation of mbedtls_xor here: each CBC block depends on
+ * the result of the previous block, and moving that result out of NEON
+ * registers is costly, so the scalar 64-bit XOR below is faster on aarch64.
+ * For 32-bit Arm, NEON should still be faster. */
+#define CBC_XOR_16(r, a, b) do {                                           \
+        mbedtls_put_unaligned_uint64(r,                                    \
+                                     mbedtls_get_unaligned_uint64(a) ^     \
+                                     mbedtls_get_unaligned_uint64(b));     \
+        mbedtls_put_unaligned_uint64(r + 8,                                \
+                                     mbedtls_get_unaligned_uint64(a + 8) ^ \
+                                     mbedtls_get_unaligned_uint64(b + 8)); \
+} while (0)
+#else
+#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
+#endif
+
 /*
  * AES-CBC buffer encryption/decryption
  */
@@ -1072,6 +1090,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
     }
 #endif
 
+    const unsigned char *ivp = iv;
+
     if (mode == MBEDTLS_AES_DECRYPT) {
         while (length > 0) {
             memcpy(temp, input, 16);
@@ -1079,8 +1099,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             if (ret != 0) {
                 goto exit;
             }
-
-            mbedtls_xor(output, output, iv, 16);
+            CBC_XOR_16(output, output, iv);
 
             memcpy(iv, temp, 16);
 
@@ -1090,18 +1109,19 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
         }
     } else {
         while (length > 0) {
-            mbedtls_xor(output, input, iv, 16);
+            CBC_XOR_16(output, input, ivp);
 
             ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
             if (ret != 0) {
                 goto exit;
             }
-            memcpy(iv, output, 16);
+            ivp = output;
 
             input += 16;
             output += 16;
             length -= 16;
         }
+        memcpy(iv, ivp, 16);
     }
     ret = 0;
 
@@ -1176,7 +1196,7 @@ int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
     }
 
     while (blocks--) {
-        if (leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0) {
+        if (MBEDTLS_UNLIKELY(leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0)) {
             /* We are on the last block in a decrypt operation that has
              * leftover bytes, so we need to use the next tweak for this block,
              * and this tweak for the leftover bytes. Save the current tweak for
diff --git a/library/common.h b/library/common.h
index eb159a7c48..b48a1fc667 100644
--- a/library/common.h
+++ b/library/common.h
@@ -31,6 +31,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif /* __ARM_NEON */
+
 /** Helper to define a function as static except when building invasive tests.
  *
  * If a function is only used inside its own source file and should be
@@ -125,10 +129,25 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 {
     size_t i = 0;
 #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__ARM_NEON)
+    for (; (i + 16) <= n; i += 16) {
+        uint8x16_t v1 = vld1q_u8(a + i);
+        uint8x16_t v2 = vld1q_u8(b + i);
+        uint8x16_t x = veorq_u8(v1, v2);
+        vst1q_u8(r + i, x);
+    }
+#elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+    /* This code path probably only makes sense on architectures with 64-bit registers */
+    for (; (i + 8) <= n; i += 8) {
+        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+        mbedtls_put_unaligned_uint64(r + i, x);
+    }
+#else
     for (; (i + 4) <= n; i += 4) {
         uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
         mbedtls_put_unaligned_uint32(r + i, x);
     }
+#endif
 #endif
     for (; i < n; i++) {
         r[i] = a[i] ^ b[i];
@@ -164,4 +183,16 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_STATIC_ASSERT(expr, msg)
 #endif
 
+/* Define compiler branch hints */
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_expect)
+#define MBEDTLS_LIKELY(x)       __builtin_expect((x), 1)
+#define MBEDTLS_UNLIKELY(x)     __builtin_expect((x), 0)
+#endif
+#endif
+#if !defined(MBEDTLS_LIKELY)
+#define MBEDTLS_LIKELY(x)       x
+#define MBEDTLS_UNLIKELY(x)     x
+#endif
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */
diff --git a/tests/suites/test_suite_common.data b/tests/suites/test_suite_common.data
index 500852d540..bd2c413b30 100644
--- a/tests/suites/test_suite_common.data
+++ b/tests/suites/test_suite_common.data
@@ -18,3 +18,45 @@ mbedtls_xor:8
 
 Block xor, length 16
 mbedtls_xor:16
+
+Block xor, length 64
+mbedtls_xor:64
+
+Block xor, length 256
+mbedtls_xor:256
+
+Block xor, length 257
+mbedtls_xor:257
+
+Block xor, length 16+8
+mbedtls_xor:24
+
+Block xor, length 16+8+4
+mbedtls_xor:28
+
+Block xor, length 16+8+4+1
+mbedtls_xor:29
+
+Block xor, length 16+8+1
+mbedtls_xor:25
+
+Block xor, length 16+4
+mbedtls_xor:20
+
+Block xor, length 16+4+1
+mbedtls_xor:21
+
+Block xor, length 16+1
+mbedtls_xor:17
+
+Block xor, length 8+4
+mbedtls_xor:12
+
+Block xor, length 8+4+1
+mbedtls_xor:13
+
+Block xor, length 8+1
+mbedtls_xor:9
+
+Block xor, length 4+1
+mbedtls_xor:5
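
The widened fast paths in mbedtls_xor() and the added test lengths above all revolve around crossing the 16-, 8- and 4-byte boundaries with a leftover byte. For context, a minimal standalone sketch like the one below exercises mbedtls_xor() across those boundaries and compares it with a plain byte-wise XOR. It is illustrative only and not part of the patch: it assumes the file is compiled inside the library tree (library/ and include/ on the include path) and linked against libmbedcrypto so the out-of-line definition of mbedtls_xor is available; the buffer names and the length 29 = 16+8+4+1 are arbitrary choices.

    #include <stdio.h>
    #include <string.h>

    #include "common.h"  /* for mbedtls_xor(); assumes library/ is on the include path */

    int main(void)
    {
        /* 29 = 16 + 8 + 4 + 1: long enough to hit the NEON / uint64 / uint32
         * loop (whichever is compiled in) plus the byte-wise tail. */
        unsigned char a[29], b[29], r[29], expected[29];

        for (size_t i = 0; i < sizeof(a); i++) {
            a[i] = (unsigned char) (7 * i + 1);
            b[i] = (unsigned char) (0xA5 ^ i);
            expected[i] = a[i] ^ b[i];   /* reference: plain byte-wise XOR */
        }

        mbedtls_xor(r, a, b, sizeof(a));

        printf("%s\n", memcmp(r, expected, sizeof(r)) == 0 ? "match" : "MISMATCH");
        return 0;
    }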