Merge pull request #7624 from daverodgman/aes-perf

AES perf improvements
2025-07-30 22:43:08 +03:00 · 2023-06-15 12:10:06 +01:00
parent 6edf8b8c7b 28a97acb3c
commit 2e7d57270e
4 changed files with 102 additions and 5 deletions
--- a/ChangeLog.d/aes-perf.txt
+++ b/ChangeLog.d/aes-perf.txt
@ -0,0 +1,4 @@
 Features
   * AES performance improvements on 64-bit architectures. Uplift
     varies by platform, toolchain, optimisation flags and mode,
     in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
--- a/library/aes.c
+++ b/library/aes.c
@ -1039,6 +1039,24 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 }
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
 #if defined(__ARM_NEON) && defined(__aarch64__)
 /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
 * the result for the next block in CBC, and the cost of transferring that data from
 * NEON registers, it is faster to use the following on aarch64.
 * For 32-bit arm, NEON should be faster. */
 #define CBC_XOR_16(r, a, b) do {                                           \
        mbedtls_put_unaligned_uint64(r,                                    \
                                     mbedtls_get_unaligned_uint64(a) ^     \
                                     mbedtls_get_unaligned_uint64(b));     \
        mbedtls_put_unaligned_uint64(r + 8,                                \
                                     mbedtls_get_unaligned_uint64(a + 8) ^ \
                                     mbedtls_get_unaligned_uint64(b + 8)); \
 } while (0)
 #else
 #define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
 #endif
 /*
 * AES-CBC buffer encryption/decryption
 */
@ -1072,6 +1090,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
    }
 #endif
    const unsigned char *ivp = iv;
    if (mode == MBEDTLS_AES_DECRYPT) {
        while (length > 0) {
            memcpy(temp, input, 16);
@ -1079,8 +1099,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
            if (ret != 0) {
                goto exit;
            }
-
+            CBC_XOR_16(output, output, iv);
            mbedtls_xor(output, output, iv, 16);
            memcpy(iv, temp, 16);
@ -1090,18 +1109,19 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
        }
    } else {
        while (length > 0) {
-            mbedtls_xor(output, input, iv, 16);
+            CBC_XOR_16(output, input, ivp);
            ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
            if (ret != 0) {
                goto exit;
            }
-            memcpy(iv, output, 16);
+            ivp = output;
            input  += 16;
            output += 16;
            length -= 16;
        }
        memcpy(iv, ivp, 16);
    }
    ret = 0;
@ -1176,7 +1196,7 @@ int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
    }
    while (blocks--) {
-        if (leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0) {
+        if (MBEDTLS_UNLIKELY(leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0)) {
            /* We are on the last block in a decrypt operation that has
             * leftover bytes, so we need to use the next tweak for this block,
             * and this tweak for the leftover bytes. Save the current tweak for
--- a/library/common.h
+++ b/library/common.h
@ -31,6 +31,10 @@
 #include <stdint.h>
 #include <stddef.h>
 #if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif /* __ARM_NEON */
 /** Helper to define a function as static except when building invasive tests.
 *
 * If a function is only used inside its own source file and should be
@ -125,10 +129,25 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 {
    size_t i = 0;
 #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
 #if defined(__ARM_NEON)
    for (; (i + 16) <= n; i += 16) {
        uint8x16_t v1 = vld1q_u8(a + i);
        uint8x16_t v2 = vld1q_u8(b + i);
        uint8x16_t x = veorq_u8(v1, v2);
        vst1q_u8(r + i, x);
    }
 #elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
    /* This codepath probably only makes sense on architectures with 64-bit registers */
    for (; (i + 8) <= n; i += 8) {
        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
        mbedtls_put_unaligned_uint64(r + i, x);
    }
 #else
    for (; (i + 4) <= n; i += 4) {
        uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
        mbedtls_put_unaligned_uint32(r + i, x);
    }
 #endif
 #endif
    for (; i < n; i++) {
        r[i] = a[i] ^ b[i];
@ -164,4 +183,16 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_STATIC_ASSERT(expr, msg)
 #endif
 /* Define compiler branch hints */
 #if defined(__has_builtin)
 #if __has_builtin(__builtin_expect)
 #define MBEDTLS_LIKELY(x)       __builtin_expect((x), 1)
 #define MBEDTLS_UNLIKELY(x)     __builtin_expect((x), 0)
 #endif
 #endif
 #if !defined(MBEDTLS_LIKELY)
 #define MBEDTLS_LIKELY(x)       x
 #define MBEDTLS_UNLIKELY(x)     x
 #endif
 #endif /* MBEDTLS_LIBRARY_COMMON_H */
--- a/tests/suites/test_suite_common.data
+++ b/tests/suites/test_suite_common.data
@ -18,3 +18,45 @@ mbedtls_xor:8
 Block xor, length 16
 mbedtls_xor:16
 Block xor, length 64
 mbedtls_xor:64
 Block xor, length 256
 mbedtls_xor:256
 Block xor, length 257
 mbedtls_xor:257
 Block xor, length 16+8
 mbedtls_xor:24
 Block xor, length 16+8+4
 mbedtls_xor:28
 Block xor, length 16+8+4+1
 mbedtls_xor:29
 Block xor, length 16+8+1
 mbedtls_xor:25
 Block xor, length 16+4
 mbedtls_xor:20
 Block xor, length 16+4+1
 mbedtls_xor:21
 Block xor, length 16+1
 mbedtls_xor:17
 Block xor, length 8+4
 mbedtls_xor:12
 Block xor, length 8+4+1
 mbedtls_xor:13
 Block xor, length 8+1
 mbedtls_xor:9
 Block xor, length 4+1
 mbedtls_xor:5