From 255a0f591633531169242d58b0a1c7f95df4470b Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 13 Feb 2024 17:55:18 +0000 Subject: [PATCH 01/15] Rotate right instead of left Signed-off-by: Dave Rodgman --- library/sha3.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 5df08f91c8..93c5b7e681 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -36,9 +36,7 @@ static const uint64_t rc[24] = { }; static const uint8_t rho[24] = { - 1, 62, 28, 27, 36, 44, 6, 55, 20, - 3, 10, 43, 25, 39, 41, 45, 15, - 21, 8, 18, 2, 61, 56, 14 + 63, 2, 36, 37, 28, 20, 58, 9, 44, 61, 54, 21, 39, 25, 23, 19, 49, 43, 56, 46, 62, 3, 8, 50 }; static const uint8_t pi[24] = { @@ -46,7 +44,7 @@ static const uint8_t pi[24] = { 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1, }; -#define ROT64(x, y) (((x) << (y)) | ((x) >> (64U - (y)))) +#define ROTR64(x, y) (((x) << (64U - (y))) | ((x) >> (y))) // 64-bit rotate right #define ABSORB(ctx, idx, v) do { ctx->state[(idx) >> 3] ^= ((uint64_t) (v)) << (((idx) & 0x7) << 3); \ } while (0) #define SQUEEZE(ctx, idx) ((uint8_t) (ctx->state[(idx) >> 3] >> (((idx) & 0x7) << 3))) @@ -69,24 +67,24 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) lane[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; lane[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - t = lane[4] ^ ROT64(lane[1], 1); + t = lane[4] ^ ROTR64(lane[1], 63); s[0] ^= t; s[5] ^= t; s[10] ^= t; s[15] ^= t; s[20] ^= t; - t = lane[0] ^ ROT64(lane[2], 1); + t = lane[0] ^ ROTR64(lane[2], 63); s[1] ^= t; s[6] ^= t; s[11] ^= t; s[16] ^= t; s[21] ^= t; - t = lane[1] ^ ROT64(lane[3], 1); + t = lane[1] ^ ROTR64(lane[3], 63); s[2] ^= t; s[7] ^= t; s[12] ^= t; s[17] ^= t; s[22] ^= t; - t = lane[2] ^ ROT64(lane[4], 1); + t = lane[2] ^ ROTR64(lane[4], 63); s[3] ^= t; s[8] ^= t; s[13] ^= t; s[18] ^= t; s[23] ^= t; - t = lane[3] ^ ROT64(lane[0], 1); + t = lane[3] ^ ROTR64(lane[0], 63); s[4] ^= t; s[9] ^= t; s[14] ^= t; s[19] ^= t; s[24] ^= t; /* Rho */ for (i = 1; i < 25; i++) { - s[i] = ROT64(s[i], rho[i-1]); + s[i] = ROTR64(s[i], rho[i-1]); } /* Pi */ From d407e0df1bb74c1628834fc4ad43e56836b48d21 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 13 Feb 2024 18:27:55 +0000 Subject: [PATCH 02/15] Read rho table in 4-byte chunks Signed-off-by: Dave Rodgman --- library/sha3.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 93c5b7e681..29908fb342 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -35,8 +35,8 @@ static const uint64_t rc[24] = { 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008, }; -static const uint8_t rho[24] = { - 63, 2, 36, 37, 28, 20, 58, 9, 44, 61, 54, 21, 39, 25, 23, 19, 49, 43, 56, 46, 62, 3, 8, 50 +static const uint32_t rho[6] = { + 0x3f022425, 0x1c143a09, 0x2c3d3615, 0x27191713, 0x312b382e, 0x3e030832 }; static const uint8_t pi[24] = { @@ -83,8 +83,13 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) s[4] ^= t; s[9] ^= t; s[14] ^= t; s[19] ^= t; s[24] ^= t; /* Rho */ - for (i = 1; i < 25; i++) { - s[i] = ROTR64(s[i], rho[i-1]); + for (i = 1; i < 25; i += 4) { + uint32_t r = rho[(i - 1) >> 2]; + for (int j = i; j < i + 4; j++) { + uint8_t r8 = r >> 24; + r <<= 8; + s[j] = ROTR64(s[j], r8); + } } /* Pi */ From cfb126f1baa05c204e7cc5f378f9fdf2c295711b Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 13 Feb 2024 18:35:41 +0000 Subject: [PATCH 03/15] Read pi table in 4-byte chunks Signed-off-by: Dave Rodgman --- library/sha3.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 29908fb342..9b22cfa04c 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -39,9 +39,8 @@ static const uint32_t rho[6] = { 0x3f022425, 0x1c143a09, 0x2c3d3615, 0x27191713, 0x312b382e, 0x3e030832 }; -static const uint8_t pi[24] = { - 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, - 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1, +static const uint32_t pi[6] = { + 0x0a070b11, 0x12030510, 0x08151804, 0x0f17130d, 0x0c02140e, 0x16090601 }; #define ROTR64(x, y) (((x) << (64U - (y))) | ((x) >> (y))) // 64-bit rotate right @@ -94,8 +93,13 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) /* Pi */ t = s[1]; - for (i = 0; i < 24; i++) { - SWAP(s[pi[i]], t); + for (i = 0; i < 24; i += 4) { + uint32_t p = pi[i >> 2]; + for (unsigned j = 0; j < 4; j++) { + uint8_t p8 = (uint8_t) (p >> 24); + p <<= 8; + SWAP(s[p8], t); + } } /* Chi */ From 418f85957930e0d4e48b93abe73f0f9097dce50f Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 13 Feb 2024 19:22:28 +0000 Subject: [PATCH 04/15] fix cast warning Signed-off-by: Dave Rodgman --- library/sha3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sha3.c b/library/sha3.c index 9b22cfa04c..6b14a84369 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -85,7 +85,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) for (i = 1; i < 25; i += 4) { uint32_t r = rho[(i - 1) >> 2]; for (int j = i; j < i + 4; j++) { - uint8_t r8 = r >> 24; + uint8_t r8 = (uint8_t) (r >> 24); r <<= 8; s[j] = ROTR64(s[j], r8); } From 6fd6542e9cadd0f66042ea5c281c6189c2816dea Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 01:20:33 +0000 Subject: [PATCH 05/15] Roll/unroll various bits Signed-off-by: Dave Rodgman --- library/sha3.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/library/sha3.c b/library/sha3.c index 6b14a84369..99a8acbaaf 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -10,6 +10,10 @@ * https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf */ +#undef MBEDTLS_SHA3_THETA_UNROLL +#define MBEDTLS_SHA3_RHO_UNROLL +#define MBEDTLS_SHA3_PI_UNROLL + #include "common.h" #if defined(MBEDTLS_SHA3_C) @@ -60,6 +64,15 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) uint64_t t; /* Theta */ +#if !defined(MBEDTLS_SHA3_THETA_UNROLL) + for (i = 0; i < 5; i++) { + lane[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20]; + } + for (i = 0; i < 5; i++) { + t = lane[(i + 4) % 5] ^ ROTR64(lane[(i + 1) % 5], 63); + s[i] ^= t; s[i + 5] ^= t; s[i + 10] ^= t; s[i + 15] ^= t; s[i + 20] ^= t; + } +#else lane[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; lane[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; lane[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; @@ -80,19 +93,28 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) t = lane[3] ^ ROTR64(lane[0], 63); s[4] ^= t; s[9] ^= t; s[14] ^= t; s[19] ^= t; s[24] ^= t; +#endif /* Rho */ for (i = 1; i < 25; i += 4) { uint32_t r = rho[(i - 1) >> 2]; +#if !defined(MBEDTLS_SHA3_RHO_UNROLL) for (int j = i; j < i + 4; j++) { uint8_t r8 = (uint8_t) (r >> 24); r <<= 8; s[j] = ROTR64(s[j], r8); } +#else + s[i + 0] = ROTR64(s[i + 0], MBEDTLS_BYTE_3(r)); + s[i + 1] = ROTR64(s[i + 1], MBEDTLS_BYTE_2(r)); + s[i + 2] = ROTR64(s[i + 2], MBEDTLS_BYTE_1(r)); + s[i + 3] = ROTR64(s[i + 3], MBEDTLS_BYTE_0(r)); +#endif } /* Pi */ t = s[1]; +#if !defined(MBEDTLS_SHA3_PI_UNROLL) for (i = 0; i < 24; i += 4) { uint32_t p = pi[i >> 2]; for (unsigned j = 0; j < 4; j++) { @@ -101,6 +123,26 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) SWAP(s[p8], t); } } +#else + uint32_t p = pi[0]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + p = pi[1]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + p = pi[2]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + p = pi[3]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + p = pi[4]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + p = pi[5]; + SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); + SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); +#endif /* Chi */ lane[0] = s[0]; lane[1] = s[1]; lane[2] = s[2]; lane[3] = s[3]; lane[4] = s[4]; From a111c0c894238cc134451f267016cc821f903cfd Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 09:31:41 +0000 Subject: [PATCH 06/15] Improve docs; pacify check-names Signed-off-by: Dave Rodgman --- library/sha3.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 99a8acbaaf..64a87a8a82 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -10,9 +10,20 @@ * https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf */ -#undef MBEDTLS_SHA3_THETA_UNROLL -#define MBEDTLS_SHA3_RHO_UNROLL -#define MBEDTLS_SHA3_PI_UNROLL +/* + * These macros select manually unrolled implementations of parts of the main permutation function. + * + * Unrolling has a major impact on both performance and code size. gcc performance benefits a lot + * from manually unrolling at higher optimisation levels. + * + * Rolling up the theta loop saves a lot of code-size at small performance cost. The code-size + * saving then enables us to unroll the other loops for a net code-size saving with a net + * performance win. + */ +#undef MBEDTLS_SHA3_THETA_UNROLL //no-check-names +#define MBEDTLS_SHA3_RHO_UNROLL //no-check-names +#define MBEDTLS_SHA3_PI_UNROLL //no-check-names + #include "common.h" From aaba623fb4cf9dc4d3c1080255aaad90142965a9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 10:52:54 +0000 Subject: [PATCH 07/15] pacify check-names Signed-off-by: Dave Rodgman --- library/sha3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sha3.c b/library/sha3.c index 64a87a8a82..9585d017a2 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -75,7 +75,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) uint64_t t; /* Theta */ -#if !defined(MBEDTLS_SHA3_THETA_UNROLL) +#if !defined(MBEDTLS_SHA3_THETA_UNROLL) //no-check-names for (i = 0; i < 5; i++) { lane[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20]; } From 865480279c6799e3eeca0ef6d2fd1e6a8ada1b3f Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 10:02:58 +0000 Subject: [PATCH 08/15] roll-up chi loop on clang Signed-off-by: Dave Rodgman --- library/sha3.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/library/sha3.c b/library/sha3.c index 9585d017a2..97d45d3c11 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -23,7 +23,7 @@ #undef MBEDTLS_SHA3_THETA_UNROLL //no-check-names #define MBEDTLS_SHA3_RHO_UNROLL //no-check-names #define MBEDTLS_SHA3_PI_UNROLL //no-check-names - +#undef MBEDTLS_SHA3_CHI_UNROLL //no-check-names #include "common.h" @@ -156,6 +156,17 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) #endif /* Chi */ +#if !defined(MBEDTLS_SHA3_CHI_UNROLL) && !defined(MBEDTLS_COMPILER_IS_GCC) + /* GCC doesn't perform well with the rolled-up version, especially at -O2. */ + for (i = 0; i <= 20; i += 5) { + lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; lane[3] = s[i + 3]; lane[4] = s[i + 4]; + s[i + 0] ^= (~lane[1]) & lane[2]; + s[i + 1] ^= (~lane[2]) & lane[3]; + s[i + 2] ^= (~lane[3]) & lane[4]; + s[i + 3] ^= (~lane[4]) & lane[0]; + s[i + 4] ^= (~lane[0]) & lane[1]; + } +#else lane[0] = s[0]; lane[1] = s[1]; lane[2] = s[2]; lane[3] = s[3]; lane[4] = s[4]; s[0] ^= (~lane[1]) & lane[2]; s[1] ^= (~lane[2]) & lane[3]; @@ -190,6 +201,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) s[22] ^= (~lane[3]) & lane[4]; s[23] ^= (~lane[4]) & lane[0]; s[24] ^= (~lane[0]) & lane[1]; +#endif /* Iota */ s[0] ^= rc[round]; From 1cf3585ee4fe188664de6b75d62b280b3d720072 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 12:11:47 +0000 Subject: [PATCH 09/15] pacify check-names Signed-off-by: Dave Rodgman --- library/sha3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sha3.c b/library/sha3.c index 97d45d3c11..5a854e4db6 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -156,7 +156,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) #endif /* Chi */ -#if !defined(MBEDTLS_SHA3_CHI_UNROLL) && !defined(MBEDTLS_COMPILER_IS_GCC) +#if !defined(MBEDTLS_SHA3_CHI_UNROLL) && !defined(MBEDTLS_COMPILER_IS_GCC) //no-check-names /* GCC doesn't perform well with the rolled-up version, especially at -O2. */ for (i = 0; i <= 20; i += 5) { lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; lane[3] = s[i + 3]; lane[4] = s[i + 4]; From 693fb4f0b2ccdde00ffdd7c1b5d2531b10ed79b9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 13:46:30 +0000 Subject: [PATCH 10/15] roll up chi loop for gcc -Os Signed-off-by: Dave Rodgman --- library/sha3.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 5a854e4db6..935a666c09 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -10,6 +10,8 @@ * https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf */ +#include "common.h" + /* * These macros select manually unrolled implementations of parts of the main permutation function. * @@ -23,9 +25,13 @@ #undef MBEDTLS_SHA3_THETA_UNROLL //no-check-names #define MBEDTLS_SHA3_RHO_UNROLL //no-check-names #define MBEDTLS_SHA3_PI_UNROLL //no-check-names -#undef MBEDTLS_SHA3_CHI_UNROLL //no-check-names - -#include "common.h" +#if !defined(MBEDTLS_COMPILER_IS_GCC) || defined(__OPTIMIZE_SIZE__) +/* GCC doesn't perform well with the rolled-up version, especially at -O2, so only enable on gcc + * if optimising for size. Always enable for other compilers. */ +#undef MBEDTLS_SHA3_CHI_UNROLL //no-check-names +#else +#define MBEDTLS_SHA3_CHI_UNROLL //no-check-names +#endif #if defined(MBEDTLS_SHA3_C) @@ -156,8 +162,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) #endif /* Chi */ -#if !defined(MBEDTLS_SHA3_CHI_UNROLL) && !defined(MBEDTLS_COMPILER_IS_GCC) //no-check-names - /* GCC doesn't perform well with the rolled-up version, especially at -O2. */ +#if !defined(MBEDTLS_SHA3_CHI_UNROLL) //no-check-names for (i = 0; i <= 20; i += 5) { lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; lane[3] = s[i + 3]; lane[4] = s[i + 4]; s[i + 0] ^= (~lane[1]) & lane[2]; From 427a5a1915fba74849b071e7f41b8e660a0066ad Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 14 Feb 2024 13:57:53 +0000 Subject: [PATCH 11/15] Docs; minor tidy-up Signed-off-by: Dave Rodgman --- library/sha3.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 935a666c09..5ebe9fc57c 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -12,6 +12,8 @@ #include "common.h" +#if defined(MBEDTLS_SHA3_C) + /* * These macros select manually unrolled implementations of parts of the main permutation function. * @@ -21,6 +23,9 @@ * Rolling up the theta loop saves a lot of code-size at small performance cost. The code-size * saving then enables us to unroll the other loops for a net code-size saving with a net * performance win. + * + * Depending on your compiler and target, it may be beneficial to adjust these; the defaults here + * should give sensible trade-offs for gcc and clang. */ #undef MBEDTLS_SHA3_THETA_UNROLL //no-check-names #define MBEDTLS_SHA3_RHO_UNROLL //no-check-names @@ -33,8 +38,6 @@ #define MBEDTLS_SHA3_CHI_UNROLL //no-check-names #endif -#if defined(MBEDTLS_SHA3_C) - #include "mbedtls/sha3.h" #include "mbedtls/platform_util.h" #include "mbedtls/error.h" @@ -164,7 +167,8 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) /* Chi */ #if !defined(MBEDTLS_SHA3_CHI_UNROLL) //no-check-names for (i = 0; i <= 20; i += 5) { - lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; lane[3] = s[i + 3]; lane[4] = s[i + 4]; + lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; + lane[3] = s[i + 3]; lane[4] = s[i + 4]; s[i + 0] ^= (~lane[1]) & lane[2]; s[i + 1] ^= (~lane[2]) & lane[3]; s[i + 2] ^= (~lane[3]) & lane[4]; From 08b81bf1e8c2e412263102dcda56fd6f2c8e1ed3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 26 Feb 2024 18:03:29 +0000 Subject: [PATCH 12/15] Test all unroll variations Signed-off-by: Dave Rodgman --- library/sha3.c | 34 +++++++++++++++++++++------------- tests/scripts/all.sh | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 5ebe9fc57c..299e278d40 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -24,18 +24,26 @@ * saving then enables us to unroll the other loops for a net code-size saving with a net * performance win. * - * Depending on your compiler and target, it may be beneficial to adjust these; the defaults here - * should give sensible trade-offs for gcc and clang. + * Depending on your size/perf priorities, compiler and target, it may be beneficial to adjust + * these; the defaults here should give sensible trade-offs for gcc and clang. */ -#undef MBEDTLS_SHA3_THETA_UNROLL //no-check-names -#define MBEDTLS_SHA3_RHO_UNROLL //no-check-names -#define MBEDTLS_SHA3_PI_UNROLL //no-check-names -#if !defined(MBEDTLS_COMPILER_IS_GCC) || defined(__OPTIMIZE_SIZE__) +#if !defined(MBEDTLS_SHA3_THETA_UNROLL) + #define MBEDTLS_SHA3_THETA_UNROLL 0 //no-check-names +#endif +#if !defined(MBEDTLS_SHA3_PI_UNROLL) + #define MBEDTLS_SHA3_PI_UNROLL 1 //no-check-names +#endif +#if !defined(MBEDTLS_SHA3_CHI_UNROLL) + #if !defined(MBEDTLS_COMPILER_IS_GCC) || defined(__OPTIMIZE_SIZE__) /* GCC doesn't perform well with the rolled-up version, especially at -O2, so only enable on gcc * if optimising for size. Always enable for other compilers. */ -#undef MBEDTLS_SHA3_CHI_UNROLL //no-check-names -#else -#define MBEDTLS_SHA3_CHI_UNROLL //no-check-names + #define MBEDTLS_SHA3_CHI_UNROLL 0 //no-check-names + #else + #define MBEDTLS_SHA3_CHI_UNROLL 1 //no-check-names + #endif +#endif +#if !defined(MBEDTLS_SHA3_RHO_UNROLL) + #define MBEDTLS_SHA3_RHO_UNROLL 1 //no-check-names #endif #include "mbedtls/sha3.h" @@ -84,7 +92,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) uint64_t t; /* Theta */ -#if !defined(MBEDTLS_SHA3_THETA_UNROLL) //no-check-names +#if MBEDTLS_SHA3_THETA_UNROLL == 0 //no-check-names for (i = 0; i < 5; i++) { lane[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20]; } @@ -118,7 +126,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) /* Rho */ for (i = 1; i < 25; i += 4) { uint32_t r = rho[(i - 1) >> 2]; -#if !defined(MBEDTLS_SHA3_RHO_UNROLL) +#if MBEDTLS_SHA3_RHO_UNROLL == 0 for (int j = i; j < i + 4; j++) { uint8_t r8 = (uint8_t) (r >> 24); r <<= 8; @@ -134,7 +142,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) /* Pi */ t = s[1]; -#if !defined(MBEDTLS_SHA3_PI_UNROLL) +#if MBEDTLS_SHA3_PI_UNROLL == 0 for (i = 0; i < 24; i += 4) { uint32_t p = pi[i >> 2]; for (unsigned j = 0; j < 4; j++) { @@ -165,7 +173,7 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) #endif /* Chi */ -#if !defined(MBEDTLS_SHA3_CHI_UNROLL) //no-check-names +#if MBEDTLS_SHA3_CHI_UNROLL == 0 //no-check-names for (i = 0; i <= 20; i += 5) { lane[0] = s[i]; lane[1] = s[i + 1]; lane[2] = s[i + 2]; lane[3] = s[i + 3]; lane[4] = s[i + 4]; diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh index af32c065dc..2b51f9e531 100755 --- a/tests/scripts/all.sh +++ b/tests/scripts/all.sh @@ -4687,6 +4687,26 @@ component_test_aesni () { # ~ 60s not grep -q "AES note: built-in implementation." ./programs/test/selftest } +component_test_sha3_variations() { + msg "sha3 loop unroll variations" + + # define minimal config sufficient to test SHA3 + cat > include/mbedtls/mbedtls_config.h << END + #define MBEDTLS_SELF_TEST + #define MBEDTLS_SHA3_C +END + + msg "all loops unrolled" + make clean + make -C tests test_suite_shax CFLAGS="-DMBEDTLS_SHA3_THETA_UNROLL=1 -DMBEDTLS_SHA3_PI_UNROLL=1 -DMBEDTLS_SHA3_CHI_UNROLL=1 -DMBEDTLS_SHA3_RHO_UNROLL=1" + ./tests/test_suite_shax + + msg "all loops rolled up" + make clean + make -C tests test_suite_shax CFLAGS="-DMBEDTLS_SHA3_THETA_UNROLL=0 -DMBEDTLS_SHA3_PI_UNROLL=0 -DMBEDTLS_SHA3_CHI_UNROLL=0 -DMBEDTLS_SHA3_RHO_UNROLL=0" + ./tests/test_suite_shax +} + support_test_aesni_m32() { support_test_m32_no_asm && (lscpu | grep -qw aes) } From 40c837dc40592b58a4570b0670ff39fbde6facc1 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 26 Feb 2024 18:33:23 +0000 Subject: [PATCH 13/15] Simplify pi rolled-up variant Signed-off-by: Dave Rodgman --- library/sha3.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 299e278d40..b41879cf0b 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -72,7 +72,7 @@ static const uint32_t rho[6] = { }; static const uint32_t pi[6] = { - 0x0a070b11, 0x12030510, 0x08151804, 0x0f17130d, 0x0c02140e, 0x16090601 + 0x110b070a, 0x10050312, 0x04181508, 0x0d13170f, 0x0e14020c, 0x01060916 }; #define ROTR64(x, y) (((x) << (64U - (y))) | ((x) >> (y))) // 64-bit rotate right @@ -146,30 +146,29 @@ static void keccak_f1600(mbedtls_sha3_context *ctx) for (i = 0; i < 24; i += 4) { uint32_t p = pi[i >> 2]; for (unsigned j = 0; j < 4; j++) { - uint8_t p8 = (uint8_t) (p >> 24); - p <<= 8; - SWAP(s[p8], t); + SWAP(s[p & 0xff], t); + p >>= 8; } } #else uint32_t p = pi[0]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); p = pi[1]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); p = pi[2]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); p = pi[3]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); p = pi[4]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); p = pi[5]; - SWAP(s[MBEDTLS_BYTE_3(p)], t); SWAP(s[MBEDTLS_BYTE_2(p)], t); - SWAP(s[MBEDTLS_BYTE_1(p)], t); SWAP(s[MBEDTLS_BYTE_0(p)], t); + SWAP(s[MBEDTLS_BYTE_0(p)], t); SWAP(s[MBEDTLS_BYTE_1(p)], t); + SWAP(s[MBEDTLS_BYTE_2(p)], t); SWAP(s[MBEDTLS_BYTE_3(p)], t); #endif /* Chi */ From 8a4df2293a5247b7a5b659d5588a3850f4512ae5 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 1 Mar 2024 15:12:59 +0000 Subject: [PATCH 14/15] Adjust default unroll settings Signed-off-by: Dave Rodgman --- library/sha3.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index b41879cf0b..2bc3b4cdf1 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -20,26 +20,29 @@ * Unrolling has a major impact on both performance and code size. gcc performance benefits a lot * from manually unrolling at higher optimisation levels. * - * Rolling up the theta loop saves a lot of code-size at small performance cost. The code-size - * saving then enables us to unroll the other loops for a net code-size saving with a net - * performance win. - * * Depending on your size/perf priorities, compiler and target, it may be beneficial to adjust - * these; the defaults here should give sensible trade-offs for gcc and clang. + * these; the defaults here should give sensible trade-offs for gcc and clang on aarch64 and + * x86-64. */ #if !defined(MBEDTLS_SHA3_THETA_UNROLL) - #define MBEDTLS_SHA3_THETA_UNROLL 0 //no-check-names + #if defined(__OPTIMIZE_SIZE__) + #define MBEDTLS_SHA3_THETA_UNROLL 0 //no-check-names + #else + #define MBEDTLS_SHA3_THETA_UNROLL 1 //no-check-names + #endif #endif #if !defined(MBEDTLS_SHA3_PI_UNROLL) - #define MBEDTLS_SHA3_PI_UNROLL 1 //no-check-names + #if defined(__OPTIMIZE_SIZE__) + #define MBEDTLS_SHA3_PI_UNROLL 0 //no-check-names + #else + #define MBEDTLS_SHA3_PI_UNROLL 1 //no-check-names + #endif #endif #if !defined(MBEDTLS_SHA3_CHI_UNROLL) - #if !defined(MBEDTLS_COMPILER_IS_GCC) || defined(__OPTIMIZE_SIZE__) -/* GCC doesn't perform well with the rolled-up version, especially at -O2, so only enable on gcc - * if optimising for size. Always enable for other compilers. */ - #define MBEDTLS_SHA3_CHI_UNROLL 0 //no-check-names - #else + #if defined(__OPTIMIZE_SIZE__) #define MBEDTLS_SHA3_CHI_UNROLL 1 //no-check-names + #else + #define MBEDTLS_SHA3_CHI_UNROLL 0 //no-check-names #endif #endif #if !defined(MBEDTLS_SHA3_RHO_UNROLL) From a38fad9dad256be6e7f1595b4112e4a945448bb3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 4 Mar 2024 18:27:32 +0000 Subject: [PATCH 15/15] Adjust defaults Signed-off-by: Dave Rodgman --- library/sha3.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/library/sha3.c b/library/sha3.c index 2bc3b4cdf1..81ea6a8a4d 100644 --- a/library/sha3.c +++ b/library/sha3.c @@ -25,26 +25,18 @@ * x86-64. */ #if !defined(MBEDTLS_SHA3_THETA_UNROLL) - #if defined(__OPTIMIZE_SIZE__) - #define MBEDTLS_SHA3_THETA_UNROLL 0 //no-check-names - #else - #define MBEDTLS_SHA3_THETA_UNROLL 1 //no-check-names - #endif -#endif -#if !defined(MBEDTLS_SHA3_PI_UNROLL) - #if defined(__OPTIMIZE_SIZE__) - #define MBEDTLS_SHA3_PI_UNROLL 0 //no-check-names - #else - #define MBEDTLS_SHA3_PI_UNROLL 1 //no-check-names - #endif + #define MBEDTLS_SHA3_THETA_UNROLL 0 //no-check-names #endif #if !defined(MBEDTLS_SHA3_CHI_UNROLL) #if defined(__OPTIMIZE_SIZE__) - #define MBEDTLS_SHA3_CHI_UNROLL 1 //no-check-names - #else #define MBEDTLS_SHA3_CHI_UNROLL 0 //no-check-names + #else + #define MBEDTLS_SHA3_CHI_UNROLL 1 //no-check-names #endif #endif +#if !defined(MBEDTLS_SHA3_PI_UNROLL) + #define MBEDTLS_SHA3_PI_UNROLL 1 //no-check-names +#endif #if !defined(MBEDTLS_SHA3_RHO_UNROLL) #define MBEDTLS_SHA3_RHO_UNROLL 1 //no-check-names #endif