Merge pull request #8326 from daverodgman/aesce-thumb2

Support hw-accelerated AES on Thumb and Arm
2025-07-29 11:41:15 +03:00 · 2023-11-27 09:58:58 +00:00
parent c94f8f1163 5e41937eba
commit 9fbac381e6
6 changed files with 219 additions and 32 deletions
--- a/library/aes.c
+++ b/library/aes.c
@ -23,9 +23,9 @@
 #include "mbedtls/error.h"

 #if defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
-#if !((defined(MBEDTLS_ARCH_IS_ARM64) && defined(MBEDTLS_AESCE_C)) || \
-    (defined(MBEDTLS_ARCH_IS_X64)   && defined(MBEDTLS_AESNI_C)) || \
-    (defined(MBEDTLS_ARCH_IS_X86)   && defined(MBEDTLS_AESNI_C)))
+#if !((defined(MBEDTLS_ARCH_IS_ARMV8_A) && defined(MBEDTLS_AESCE_C)) || \
+    (defined(MBEDTLS_ARCH_IS_X64)       && defined(MBEDTLS_AESNI_C)) || \
+    (defined(MBEDTLS_ARCH_IS_X86)       && defined(MBEDTLS_AESNI_C)))
 #error "MBEDTLS_AES_USE_HARDWARE_ONLY defined, but not all prerequisites"
 #endif
 #endif
--- a/library/aesce.c
+++ b/library/aesce.c
@ -5,8 +5,17 @@
 *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

-#if defined(__aarch64__) && !defined(__ARM_FEATURE_CRYPTO) && \
-    defined(__clang__) && __clang_major__ >= 4
+#if defined(__clang__) &&  (__clang_major__ >= 4)
+
+/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
+ * but that is defined by build_info.h, and we need this block to happen first. */
+#if defined(__ARM_ARCH)
+#if __ARM_ARCH >= 8
+#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
+#endif
+#endif
+
+#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
 /* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declaration are guarded by predefined ACLE macros in clang:
@ -27,6 +36,8 @@
 #define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
 #endif

+#endif /* defined(__clang__) &&  (__clang_major__ >= 4) */
+
 #include <string.h>
 #include "common.h"

@ -34,12 +45,14 @@

 #include "aesce.h"

-#if defined(MBEDTLS_ARCH_IS_ARM64)
+#if defined(MBEDTLS_ARCH_IS_ARMV8_A) && defined(__ARM_NEON)

 /* Compiler version checks. */
 #if defined(__clang__)
-#   if __clang_major__ < 4
-#       error "Minimum version of Clang for MBEDTLS_AESCE_C is 4.0."
+#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
+#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
+#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
+#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
 #   endif
 #elif defined(__GNUC__)
 #   if __GNUC__ < 6
@ -52,6 +65,15 @@
 #   if _MSC_VER < 1929
 #       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
 #   endif
+#elif defined(__ARMCC_VERSION)
+#    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
+/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
+ * If someone verified that, please update this and document of
+ * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
+#         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
+#    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
+#         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
+#    endif
 #endif

 #ifdef __ARM_NEON
@ -84,8 +106,19 @@

 #if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

-#include <asm/hwcap.h>
 #include <sys/auxv.h>
+#if !defined(HWCAP_NEON)
+#define HWCAP_NEON  (1 << 12)
+#endif
+#if !defined(HWCAP2_AES)
+#define HWCAP2_AES  (1 << 0)
+#endif
+#if !defined(HWCAP_AES)
+#define HWCAP_AES   (1 << 3)
+#endif
+#if !defined(HWCAP_ASIMD)
+#define HWCAP_ASIMD (1 << 1)
+#endif

 signed char mbedtls_aesce_has_support_result = -1;

@ -102,6 +135,16 @@ int mbedtls_aesce_has_support_impl(void)
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
+#if defined(MBEDTLS_ARCH_IS_ARM32)
+        unsigned long auxval  = getauxval(AT_HWCAP);
+        unsigned long auxval2 = getauxval(AT_HWCAP2);
+        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
+            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
+            mbedtls_aesce_has_support_result = 1;
+        } else {
+            mbedtls_aesce_has_support_result = 0;
+        }
+#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
@ -109,6 +152,7 @@ int mbedtls_aesce_has_support_impl(void)
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
+#endif
    }
    return mbedtls_aesce_has_support_result;
 }
@ -362,24 +406,91 @@ int mbedtls_aesce_setkey_enc(unsigned char *rk,

 #if defined(MBEDTLS_GCM_C)

-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
-/* Some intrinsics are not available for GCC 5.X. */
-#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
-#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
-static inline poly64_t vget_low_p64(poly64x2_t __a)
+#if defined(MBEDTLS_ARCH_IS_ARM32)
+
+#if defined(__clang__)
+/* On clang for A32/T32, work around some missing intrinsics and types which are listed in
+ * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
+ * These are only required for GCM.
+ */
+#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)
+
+typedef uint8x16_t poly128_t;
+
+static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
 {
-    uint64x2_t tmp = (uint64x2_t) (__a);
-    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
-    return (poly64_t) (lo);
+    poly128_t r;
+    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
+    return r;
 }
-#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5*/
+
+/* This is set to cause some more missing intrinsics to be defined below */
+#define COMMON_MISSING_INTRINSICS
+
+static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
+{
+    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
+                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
+}
+
+#endif /* defined(__clang__) */
+
+static inline uint8x16_t vrbitq_u8(uint8x16_t x)
+{
+    /* There is no vrbitq_u8 instruction in A32/T32, so provide
+     * an equivalent non-Neon implementation. Reverse bit order in each
+     * byte with 4x rbit, rev. */
+    asm ("ldm  %[p], { r2-r5 } \n\t"
+         "rbit r2, r2          \n\t"
+         "rev  r2, r2          \n\t"
+         "rbit r3, r3          \n\t"
+         "rev  r3, r3          \n\t"
+         "rbit r4, r4          \n\t"
+         "rev  r4, r4          \n\t"
+         "rbit r5, r5          \n\t"
+         "rev  r5, r5          \n\t"
+         "stm  %[p], { r2-r5 } \n\t"
+         :
+         /* Output: 16 bytes of memory pointed to by &x */
+         "+m" (*(uint8_t(*)[16]) &x)
+         :
+         [p] "r" (&x)
+         :
+         "r2", "r3", "r4", "r5"
+         );
+    return x;
+}
+
+#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
+
+#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
+/* Some intrinsics are not available for GCC 5.X. */
+#define COMMON_MISSING_INTRINSICS
+#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */
+
+
+#if defined(COMMON_MISSING_INTRINSICS)
+
+/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */
+
+#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
+#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
+
+static inline poly64x1_t vget_low_p64(poly64x2_t a)
+{
+    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
+    return (poly64x1_t) r;
+
+}
+
+#endif /* COMMON_MISSING_INTRINSICS */

 /* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(MBEDTLS_COMPILER_IS_GCC)
 /* GCC reports incompatible type error without cast. GCC think poly64_t and
 * poly64x1_t are different, that is different with MSVC and Clang. */
 #define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
@ -388,7 +499,8 @@ static inline poly64_t vget_low_p64(poly64x2_t __a)
 * error with/without cast. And I think poly64_t and poly64x1_t are same, no
 * cast for clang also. */
 #define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
-#endif
+#endif /* MBEDTLS_COMPILER_IS_GCC */
+
 static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
 {

@ -464,7 +576,7 @@ static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. It is for GNUC compatible compilers.
     */
-    asm ("" : "+w" (r));
+    asm volatile ("" : "+w" (r));
 #endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
@ -507,6 +619,6 @@ void mbedtls_aesce_gcm_mult(unsigned char c[16],
 #undef MBEDTLS_POP_TARGET_PRAGMA
 #endif

-#endif /* MBEDTLS_ARCH_IS_ARM64 */
+#endif /* MBEDTLS_ARCH_IS_ARMV8_A */

 #endif /* MBEDTLS_AESCE_C */
--- a/library/aesce.h
+++ b/library/aesce.h
@ -2,7 +2,7 @@
 * \file aesce.h
 *
 * \brief Support hardware AES acceleration on Armv8-A processors with
- *        the Armv8-A Cryptographic Extension in AArch64 execution state.
+ *        the Armv8-A Cryptographic Extension.
 *
 * \warning These functions are only for internal use by other library
 *          functions; you must not call them directly.
@ -19,7 +19,7 @@
 #include "mbedtls/aes.h"


-#if defined(MBEDTLS_AESCE_C) && defined(MBEDTLS_ARCH_IS_ARM64)
+#if defined(MBEDTLS_AESCE_C) && defined(MBEDTLS_ARCH_IS_ARMV8_A) && defined(__ARM_NEON)

 #define MBEDTLS_AESCE_HAVE_CODE

@ -118,6 +118,12 @@ int mbedtls_aesce_setkey_enc(unsigned char *rk,
 }
 #endif

-#endif /* MBEDTLS_AESCE_C && MBEDTLS_ARCH_IS_ARM64 */
+#else
+
+#if defined(MBEDTLS_AES_USE_HARDWARE_ONLY) && defined(MBEDTLS_ARCH_IS_ARMV8_A)
+#error "AES hardware acceleration not supported on this platform"
+#endif
+
+#endif /* MBEDTLS_AESCE_C && MBEDTLS_ARCH_IS_ARMV8_A && __ARM_NEON */

 #endif /* MBEDTLS_AESCE_H */