From 0805ad10b28cdfb4c8074383df27db35044cf993 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 May 2023 11:48:10 +0100 Subject: [PATCH 01/19] XOR perf improvements Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 3 +++ library/common.h | 7 +++++++ 2 files changed, 10 insertions(+) create mode 100644 ChangeLog.d/aes-perf.txt diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt new file mode 100644 index 0000000000..26819b2846 --- /dev/null +++ b/ChangeLog.d/aes-perf.txt @@ -0,0 +1,3 @@ +Features + * AES performance improvements (XTS, GCM, CCM and CMAC) on 64-bit + architectures, of around 5-10%. diff --git a/library/common.h b/library/common.h index eb159a7c48..82001a9f9d 100644 --- a/library/common.h +++ b/library/common.h @@ -125,6 +125,13 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) +#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) + /* This codepath probably only makes sense on architectures with 64-bit registers */ + for (; (i + 8) <= n; i += 8) { + uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); + mbedtls_put_unaligned_uint64(r + i, x); + } +#endif for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); From 6f40f8bf0104e279258afcfc97936807c5b91ee3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 22 May 2023 18:21:20 +0100 Subject: [PATCH 02/19] Add NEON to mbedtls_xor Signed-off-by: Dave Rodgman --- library/common.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/library/common.h b/library/common.h index 82001a9f9d..9c096024eb 100644 --- a/library/common.h +++ b/library/common.h @@ -31,6 +31,10 @@ #include #include +#ifdef __ARM_NEON +#include +#endif /* __ARM_NEON */ + /** Helper to define a function as static except when building invasive tests. 
* * If a function is only used inside its own source file and should be @@ -125,7 +129,14 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) -#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) +#if defined(__aarch64__) && defined(__ARM_NEON) + for (; (i + 16) <= n; i += 16) { + uint64x2_t v1 = vld1q_u64((uint64_t *) a); + uint64x2_t v2 = vld1q_u64((uint64_t *) b); + uint64x2_t x = veorq_u64(v1, v2); + vst1q_u64((uint64_t *) r, x); + } +#elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) /* This codepath probably only makes sense on architectures with 64-bit registers */ for (; (i + 8) <= n; i += 8) { uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); From 797c4ff36569921d5bcbd29deed77d9bc199f858 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 22 May 2023 19:41:58 +0100 Subject: [PATCH 03/19] Make AES-CBC more efficient Signed-off-by: Dave Rodgman --- library/aes.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/library/aes.c b/library/aes.c index 69da5828ac..eb3f873e72 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1068,36 +1068,45 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } #endif + const unsigned char *ivp = iv; + if (mode == MBEDTLS_AES_DECRYPT) { - while (length > 0) { - memcpy(temp, input, 16); - ret = mbedtls_aes_crypt_ecb(ctx, mode, input, output); - if (ret != 0) { - goto exit; + if (length >= 16) { + unsigned char temp2[16]; + memcpy(temp, input + length - 16, 16); + + while (length > 0) { + ret = mbedtls_aes_crypt_ecb(ctx, mode, input, temp2); + if (ret != 0) { + goto exit; + } + + mbedtls_xor(output, temp2, ivp, 16); + + ivp = input; + + input += 16; + output += 16; + length -= 16; } - mbedtls_xor(output, output, iv, 16); - memcpy(iv, temp, 16); - - input += 16; - output += 16; - length -= 16; } } else { while (length > 0) { - mbedtls_xor(output, input, iv, 16); + mbedtls_xor(output, input, ivp, 16); ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) { goto exit; } - memcpy(iv, output, 16); + ivp = output; input += 16; output += 16; length -= 16; } + memcpy(iv, ivp, 16); } ret = 0; From b19b63a6397874b28d30ce26a8e295730dca84a8 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 22 May 2023 19:49:24 +0100 Subject: [PATCH 04/19] Changelog update Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index 26819b2846..7adb7ce020 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,3 +1,4 @@ Features - * AES performance improvements (XTS, GCM, CCM and CMAC) on 64-bit - architectures, of around 5-10%. + * AES performance improvements on 64-bit architectures. Uplift + varies by platform, toolchain and mode, in the 0 - 54% range. + Aarch64, gcc -Os and GCM/XTS benefit the most. 
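For reference, the fast path introduced in patches 01-02 can be read as the following standalone sketch. This is a minimal illustration, not the library code: get_u64/put_u64 are hypothetical stand-ins for mbedtls_get_unaligned_uint64/mbedtls_put_unaligned_uint64, and it assumes unaligned access via the memcpy idiom is cheap on the target, which is the property MBEDTLS_EFFICIENT_UNALIGNED_ACCESS gates in the real code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for the mbedtls unaligned-access helpers: a
 * fixed-size memcpy lets the compiler emit a single unaligned load/store
 * on targets where that is supported. */
static inline uint64_t get_u64(const unsigned char *p)
{
    uint64_t x;
    memcpy(&x, p, sizeof(x));
    return x;
}

static inline void put_u64(unsigned char *p, uint64_t x)
{
    memcpy(p, &x, sizeof(x));
}

/* r = a ^ b over n bytes: XOR 8 bytes per iteration on the fast path,
 * then fall back to a byte-wise loop for the tail, mirroring the loop
 * structure of mbedtls_xor above. */
static void xor_wide(unsigned char *r, const unsigned char *a,
                     const unsigned char *b, size_t n)
{
    size_t i = 0;
    for (; (i + 8) <= n; i += 8) {
        put_u64(r + i, get_u64(a + i) ^ get_u64(b + i));
    }
    for (; i < n; i++) {
        r[i] = a[i] ^ b[i];
    }
}

The same pattern steps down through narrower widths (4-byte, then 1-byte) where 64-bit registers are not a win, as the final form of mbedtls_xor in the later patches shows.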
From 262d8ced795d72114c3bfdb0786ae8561ec3e266 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 22 May 2023 23:13:45 +0100 Subject: [PATCH 05/19] Fix AES-CBC for in-place operation Signed-off-by: Dave Rodgman --- library/aes.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/library/aes.c b/library/aes.c index eb3f873e72..bfcaf352b5 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1071,26 +1071,21 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, const unsigned char *ivp = iv; if (mode == MBEDTLS_AES_DECRYPT) { - if (length >= 16) { - unsigned char temp2[16]; - memcpy(temp, input + length - 16, 16); - - while (length > 0) { - ret = mbedtls_aes_crypt_ecb(ctx, mode, input, temp2); - if (ret != 0) { - goto exit; - } - - mbedtls_xor(output, temp2, ivp, 16); - - ivp = input; - - input += 16; - output += 16; - length -= 16; + unsigned char temp2[16]; + while (length > 0) { + memcpy(temp, input, 16); + ret = mbedtls_aes_crypt_ecb(ctx, mode, input, temp2); + if (ret != 0) { + goto exit; } + mbedtls_xor(output, temp2, iv, 16); + memcpy(iv, temp, 16); + + input += 16; + output += 16; + length -= 16; } } else { while (length > 0) { From 7613b3d6b871a1dbd2b84eb51b14c6d45d9970ea Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 May 2023 00:48:38 +0100 Subject: [PATCH 06/19] Fix xor fail for large block size Signed-off-by: Dave Rodgman --- library/common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/library/common.h b/library/common.h index 9c096024eb..94b8c5db34 100644 --- a/library/common.h +++ b/library/common.h @@ -131,10 +131,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) #if defined(__aarch64__) && defined(__ARM_NEON) for (; (i + 16) <= n; i += 16) { - uint64x2_t v1 = vld1q_u64((uint64_t *) a); - uint64x2_t v2 = vld1q_u64((uint64_t *) b); + uint64x2_t v1 = vld1q_u64((uint64_t *) (a + i)); + uint64x2_t v2 = vld1q_u64((uint64_t *) (b + i)); uint64x2_t x = veorq_u64(v1, v2); - vst1q_u64((uint64_t *) r, x); + vst1q_u64((uint64_t *) (r + i), x); } #elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) /* This codepath probably only makes sense on architectures with 64-bit registers */ From 9c1128edaa140349a1ff1c89f1153a489530f1b3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 May 2023 00:49:46 +0100 Subject: [PATCH 07/19] Add tests for xor over large blocks Signed-off-by: Dave Rodgman --- tests/suites/test_suite_common.data | 42 +++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/suites/test_suite_common.data b/tests/suites/test_suite_common.data index 500852d540..bd2c413b30 100644 --- a/tests/suites/test_suite_common.data +++ b/tests/suites/test_suite_common.data @@ -18,3 +18,45 @@ mbedtls_xor:8 Block xor, length 16 mbedtls_xor:16 + +Block xor, length 64 +mbedtls_xor:64 + +Block xor, length 256 +mbedtls_xor:256 + +Block xor, length 257 +mbedtls_xor:257 + +Block xor, length 16+8 +mbedtls_xor:24 + +Block xor, length 16+8+4 +mbedtls_xor:28 + +Block xor, length 16+8+4+1 +mbedtls_xor:29 + +Block xor, length 16+8+1 +mbedtls_xor:25 + +Block xor, length 16+4 +mbedtls_xor:20 + +Block xor, length 16+4+1 +mbedtls_xor:21 + +Block xor, length 16+1 +mbedtls_xor:17 + +Block xor, length 8+4 +mbedtls_xor:12 + +Block xor, length 8+4+1 +mbedtls_xor:13 + +Block xor, length 8+1 +mbedtls_xor:9 + +Block xor, length 4+1 +mbedtls_xor:5 From 3f47b3f7a31d0e7be11b936740626ed68d866d0a Mon 
Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 May 2023 16:11:22 +0100 Subject: [PATCH 08/19] Extend NEON use to 32-bit Arm Signed-off-by: Dave Rodgman --- library/common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/common.h b/library/common.h index 94b8c5db34..ac6883ce19 100644 --- a/library/common.h +++ b/library/common.h @@ -31,7 +31,7 @@ #include #include -#ifdef __ARM_NEON +#if defined(__ARM_NEON) #include #endif /* __ARM_NEON */ @@ -129,7 +129,7 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) -#if defined(__aarch64__) && defined(__ARM_NEON) +#if defined(__ARM_NEON) for (; (i + 16) <= n; i += 16) { uint64x2_t v1 = vld1q_u64((uint64_t *) (a + i)); uint64x2_t v2 = vld1q_u64((uint64_t *) (b + i)); From f1e396c42724896b9d31ac727043da45a35d5e26 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 4 Jun 2023 12:00:11 -0400 Subject: [PATCH 09/19] improve cbc encrypt perf Signed-off-by: Dave Rodgman --- library/aes.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/library/aes.c b/library/aes.c index bfcaf352b5..e9395d4eca 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1068,8 +1068,6 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } #endif - const unsigned char *ivp = iv; - if (mode == MBEDTLS_AES_DECRYPT) { unsigned char temp2[16]; while (length > 0) { @@ -1089,19 +1087,18 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - mbedtls_xor(output, input, ivp, 16); + mbedtls_xor(temp, input, iv, 16); - ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); + ret = mbedtls_aes_crypt_ecb(ctx, mode, temp, iv); + memcpy(output, iv, 16); if (ret != 0) { goto exit; } - ivp = output; input += 16; output += 16; length -= 16; } - memcpy(iv, ivp, 16); } ret = 0; From 9d1635e7429ab105b9c65fc564a2979d9ac7d46a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 4 Jun 2023 12:55:15 -0400 Subject: [PATCH 10/19] Revert not-useful changes to AES-CBC decrypt Signed-off-by: Dave Rodgman --- library/aes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/library/aes.c b/library/aes.c index e9395d4eca..a137fb14e0 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1069,15 +1069,14 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, #endif if (mode == MBEDTLS_AES_DECRYPT) { - unsigned char temp2[16]; while (length > 0) { memcpy(temp, input, 16); - ret = mbedtls_aes_crypt_ecb(ctx, mode, input, temp2); + ret = mbedtls_aes_crypt_ecb(ctx, mode, input, output); if (ret != 0) { goto exit; } - mbedtls_xor(output, temp2, iv, 16); + mbedtls_xor(output, output, iv, 16); memcpy(iv, temp, 16); From 2070c2074eb33271e86efc31c9861e967d4a59ac Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 7 Jun 2023 16:25:58 +0100 Subject: [PATCH 11/19] Avoid possible NEON alignment issue Signed-off-by: Dave Rodgman --- library/common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/common.h b/library/common.h index ac6883ce19..7e21a0c34b 100644 --- a/library/common.h +++ b/library/common.h @@ -131,10 +131,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) #if defined(__ARM_NEON) for (; (i + 16) <= n; i += 16) { - uint64x2_t v1 = vld1q_u64((uint64_t *) (a + i)); - uint64x2_t v2 = vld1q_u64((uint64_t *) (b + i)); - uint64x2_t x = veorq_u64(v1, v2); - vst1q_u64((uint64_t *) (r + i), x); 
+ uint8x16_t v1 = vld1q_u8((uint64_t *) (a + i)); + uint8x16_t v2 = vld1q_u8((uint64_t *) (b + i)); + uint8x16_t x = veorq_u8(v1, v2); + vst1q_u8((uint64_t *) (r + i), x); } #elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) /* This codepath probably only makes sense on architectures with 64-bit registers */ From f32176c0e38df6cf2d5b83c89501673303f32d3c Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 9 Jun 2023 16:25:49 +0100 Subject: [PATCH 12/19] Remove unnecessary cast Signed-off-by: Dave Rodgman --- library/common.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/library/common.h b/library/common.h index 7e21a0c34b..9e1c4f6f41 100644 --- a/library/common.h +++ b/library/common.h @@ -131,10 +131,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) #if defined(__ARM_NEON) for (; (i + 16) <= n; i += 16) { - uint8x16_t v1 = vld1q_u8((uint64_t *) (a + i)); - uint8x16_t v2 = vld1q_u8((uint64_t *) (b + i)); + uint8x16_t v1 = vld1q_u8(a + i); + uint8x16_t v2 = vld1q_u8(b + i); uint8x16_t x = veorq_u8(v1, v2); - vst1q_u8((uint64_t *) (r + i), x); + vst1q_u8(r + i, x); } #elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) /* This codepath probably only makes sense on architectures with 64-bit registers */ From 360e04f3791e62ab217f27cb145ebf309b2d75dc Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 9 Jun 2023 17:18:32 +0100 Subject: [PATCH 13/19] Fix AES-XTS perf regression Signed-off-by: Dave Rodgman --- library/aes.c | 2 +- library/common.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/library/aes.c b/library/aes.c index a137fb14e0..aa230fd15a 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1172,7 +1172,7 @@ int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, } while (blocks--) { - if (leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0) { + if (MBEDTLS_UNLIKELY(leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0)) { /* We are on the last block in a decrypt operation that has * leftover bytes, so we need to use the next tweak for this block, * and this tweak for the leftover bytes. Save the current tweak for diff --git a/library/common.h b/library/common.h index 9e1c4f6f41..724c44ed8c 100644 --- a/library/common.h +++ b/library/common.h @@ -182,4 +182,16 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #define MBEDTLS_STATIC_ASSERT(expr, msg) #endif +/* Define compiler branch hints */ +#if defined(__has_builtin) +#if __has_builtin(__builtin_expect) +#define MBEDTLS_LIKELY(x) __builtin_expect((x),1) +#define MBEDTLS_UNLIKELY(x) __builtin_expect((x),0) +#endif +#endif +#if !defined(MBEDTLS_LIKELY) +#define MBEDTLS_LIKELY(x) x +#define MBEDTLS_UNLIKELY(x) x +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ From 0e225978712be3919ee92e1ccceeaf33865a5ef8 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 9 Jun 2023 17:18:53 +0100 Subject: [PATCH 14/19] Update Changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index 7adb7ce020..ca2ced92ed 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,4 +1,4 @@ Features * AES performance improvements on 64-bit architectures. Uplift - varies by platform, toolchain and mode, in the 0 - 54% range. - Aarch64, gcc -Os and GCM/XTS benefit the most. 
+ varies by platform, toolchain, optimisation flags and mode, + in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most. From 159dc099fda4922b47ffbf49a074fa69bb29729a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 9 Jun 2023 19:46:07 +0100 Subject: [PATCH 15/19] Code style Signed-off-by: Dave Rodgman --- library/common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/common.h b/library/common.h index 724c44ed8c..89f3b1ffb0 100644 --- a/library/common.h +++ b/library/common.h @@ -185,8 +185,8 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned /* Define compiler branch hints */ #if defined(__has_builtin) #if __has_builtin(__builtin_expect) -#define MBEDTLS_LIKELY(x) __builtin_expect((x),1) -#define MBEDTLS_UNLIKELY(x) __builtin_expect((x),0) +#define MBEDTLS_LIKELY(x) __builtin_expect((x), 1) +#define MBEDTLS_UNLIKELY(x) __builtin_expect((x), 0) #endif #endif #if !defined(MBEDTLS_LIKELY) From 5c394ff2033f5f812896da41e38a9ab8b565e6e3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 9 Jun 2023 20:10:36 +0100 Subject: [PATCH 16/19] Use a single fast-path in mbedtls_xor, gains around 1% in benchmarks Signed-off-by: Dave Rodgman --- library/common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/common.h b/library/common.h index 89f3b1ffb0..b48a1fc667 100644 --- a/library/common.h +++ b/library/common.h @@ -142,11 +142,12 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } -#endif +#else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } +#endif #endif for (; i < n; i++) { r[i] = a[i] ^ b[i]; From 906c63cf3571b64db8eed423f1240195e4cb14e3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Jun 2023 17:53:51 +0100 Subject: [PATCH 17/19] Revert "improve cbc encrypt perf" This reverts commit f1e396c42724896b9d31ac727043da45a35d5e26. Performance is slightly better with this reverted, especially for AES-CBC 192. 
Signed-off-by: Dave Rodgman --- library/aes.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/library/aes.c b/library/aes.c index aa230fd15a..6d8cf2e3c1 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1068,6 +1068,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } #endif + const unsigned char *ivp = iv; + if (mode == MBEDTLS_AES_DECRYPT) { while (length > 0) { memcpy(temp, input, 16); @@ -1086,18 +1088,19 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - mbedtls_xor(temp, input, iv, 16); + mbedtls_xor(output, input, ivp, 16); - ret = mbedtls_aes_crypt_ecb(ctx, mode, temp, iv); - memcpy(output, iv, 16); + ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) { goto exit; } + ivp = output; input += 16; output += 16; length -= 16; } + memcpy(iv, ivp, 16); } ret = 0; From d05e7f1ab3e76de673a409424ba29f7cc187ef8f Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Jun 2023 18:58:48 +0100 Subject: [PATCH 18/19] Do not use NEON for AES-CBC on aarch64 Signed-off-by: Dave Rodgman --- library/aes.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/library/aes.c b/library/aes.c index 6d8cf2e3c1..6a7e6102fd 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1035,6 +1035,24 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, } #if defined(MBEDTLS_CIPHER_MODE_CBC) + +#if defined(__ARM_NEON) && defined(__aarch64__) + /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, it is faster to use the following on aarch64. + * For 32-bit arm, NEON should be faster. */ +#define CBC_XOR_16(r, a, b) do { \ + mbedtls_put_unaligned_uint64(r, \ + mbedtls_get_unaligned_uint64(a) ^ \ + mbedtls_get_unaligned_uint64(b)); \ + mbedtls_put_unaligned_uint64(r + 8, \ + mbedtls_get_unaligned_uint64(a + 8) ^ \ + mbedtls_get_unaligned_uint64(b + 8)); \ +} while (0) +#else +#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16) +#endif + /* * AES-CBC buffer encryption/decryption */ @@ -1077,8 +1095,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, if (ret != 0) { goto exit; } - - mbedtls_xor(output, output, iv, 16); + CBC_XOR_16(output, output, iv); memcpy(iv, temp, 16); @@ -1088,7 +1105,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - mbedtls_xor(output, input, ivp, 16); + CBC_XOR_16(output, input, ivp); ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) { From 28a97acb3ca0104ec6816a148ecf94c8001afe27 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Jun 2023 20:15:15 +0100 Subject: [PATCH 19/19] code style Signed-off-by: Dave Rodgman --- library/aes.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/library/aes.c b/library/aes.c index 6a7e6102fd..2ed195a5da 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1037,17 +1037,17 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, #if defined(MBEDTLS_CIPHER_MODE_CBC) #if defined(__ARM_NEON) && defined(__aarch64__) - /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on - * the result for the next block in CBC, and the cost of transferring that data from - * NEON registers, it is faster to use the following on aarch64. - * For 32-bit arm, NEON should be faster. */ +/* Avoid using the NEON implementation of mbedtls_xor. 
Because of the dependency on + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, it is faster to use the following on aarch64. + * For 32-bit arm, NEON should be faster. */ #define CBC_XOR_16(r, a, b) do { \ - mbedtls_put_unaligned_uint64(r, \ - mbedtls_get_unaligned_uint64(a) ^ \ - mbedtls_get_unaligned_uint64(b)); \ - mbedtls_put_unaligned_uint64(r + 8, \ - mbedtls_get_unaligned_uint64(a + 8) ^ \ - mbedtls_get_unaligned_uint64(b + 8)); \ + mbedtls_put_unaligned_uint64(r, \ + mbedtls_get_unaligned_uint64(a) ^ \ + mbedtls_get_unaligned_uint64(b)); \ + mbedtls_put_unaligned_uint64(r + 8, \ + mbedtls_get_unaligned_uint64(a + 8) ^ \ + mbedtls_get_unaligned_uint64(b + 8)); \ } while (0) #else #define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
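The CBC code above deliberately stays scalar because each block's XOR depends on the previous block's result, and moving that dependency through NEON registers costs more than it saves. For contrast, the bulk NEON path that mbedtls_xor settles on in patches 11, 12 and 16 can be sketched standalone as below; xor_neon16 is a hypothetical name, and the sketch assumes a toolchain that defines __ARM_NEON and provides <arm_neon.h>.

#include <stddef.h>
#include <stdint.h>

#if defined(__ARM_NEON)
#include <arm_neon.h>

/* XOR 16 bytes per iteration: vld1q_u8/vst1q_u8 make no alignment
 * assumptions, and veorq_u8 XORs all 16 lanes at once. Returns the number
 * of bytes processed so the caller can finish the tail with scalar code. */
static size_t xor_neon16(unsigned char *r, const unsigned char *a,
                         const unsigned char *b, size_t n)
{
    size_t i = 0;
    for (; (i + 16) <= n; i += 16) {
        uint8x16_t v1 = vld1q_u8(a + i);
        uint8x16_t v2 = vld1q_u8(b + i);
        vst1q_u8(r + i, veorq_u8(v1, v2));
    }
    return i;
}
#endif /* __ARM_NEON */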