1
0
mirror of https://github.com/esp8266/Arduino.git synced 2025-07-04 01:23:50 +03:00

Update mmu_get... and mmu_set... (#8290)

These changes are needed to address bugs that can emerge with the improved optimization from the GCC 10.3 compiler.

Updated performance inline functions `mmu_get_uint8()`, ... and `mmu_set_uint8()`, ...  to comply with strict-aliasing rules. 
Without this change, stale data may be referenced. This issue was revealed in discussions on https://github.com/esp8266/Arduino/issues/8261#issue-963529268 

Changes to avoid over-optimization of 32-bit wide transfers from IRAM, turning into 8-bit or 16-bit transfers by the new GCC 10.3 compiler. This has been a recurring/tricky problem for me with the new compiler. 

So far referencing the 32-bit value loaded by way of an Extended ASM R/W output register has stopped the compiler from optimizing down to an 8-bit or 16-bit transfer.

Example:
```cpp
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(val)); // inject 32-bit dependency
  ...
```

Updated example `irammem.ino`
* do a simple test of compliance to strict-aliasing rules
* For `mmu_get_uint8()`, added tests to evaluate if 32-bit wide transfers were converted to an 8-bit transfer.
This commit is contained in:
M Hightower
2021-10-12 17:47:12 -07:00
committed by GitHub
parent 9d024d17fd
commit b7a2f44b50
4 changed files with 306 additions and 42 deletions

View File

@ -26,13 +26,24 @@
extern "C" {
#endif
//C This turns on range checking. Is this the value you want to trigger it?
// This turns on range checking.
#ifdef DEBUG_ESP_CORE
#define DEBUG_ESP_MMU
#endif
#if defined(CORE_MOCK)
#define ets_uart_printf(...) do {} while(false)
#define XCHAL_INSTRAM0_VADDR 0x40000000
#define XCHAL_INSTRAM1_VADDR 0x40100000
#define XCHAL_INSTROM0_VADDR 0x40200000
#else
#include <sys/config.h> // For config/core-isa.h
/*
Cautiously use XCHAL_..._VADDR values where possible.
While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor
CONFIG options, they are not always an indication of DRAM, IRAM, or ROM
size or position in the address space.
*/
#endif
/*
@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0)
static inline __attribute__((always_inline))
bool mmu_is_iram(const void *addr) {
  // Test whether addr falls inside the IRAM address range.
  // Use the processor-config value for the IRAM base instead of a magic
  // number (see the XCHAL_..._VADDR caveats earlier in this header).
  const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR;
#ifndef MMU_IRAM_SIZE
#if defined(__GNUC__) && !defined(CORE_MOCK)
#warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!"
#endif
#define MMU_IRAM_SIZE 0x8000ul
#endif
  const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE;
  // Half-open range check: [iram_start, iram_end)
  return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr);
}
static inline __attribute__((always_inline))
bool mmu_is_dram(const void *addr) {
  // Test whether addr falls inside the DRAM address range.
  // Start of DRAM per this change (was 0x3FF80000ul before).
  const uintptr_t dram_start = 0x3FFE8000ul;
  // The start of the Boot ROM sits at the end of DRAM. 0x40000000ul;
  const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR;
  // Half-open range check: [dram_start, dram_end)
  return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr);
}
static inline __attribute__((always_inline))
bool mmu_is_icache(const void *addr) {
  // Test whether addr falls inside the flash-execute (ICACHE) address range.
  // _irom0_text_end is presumably a linker-provided end-of-segment symbol —
  // TODO confirm against the linker script.
  extern void _irom0_text_end(void);
  const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR;
  const uintptr_t icache_end = (uintptr_t)_irom0_text_end;
  // Half-open range check: [icache_start, icache_end)
  return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr);
}
#ifdef DEBUG_ESP_MMU
@ -127,8 +140,26 @@ bool mmu_is_icache(const void *addr) {
static inline __attribute__((always_inline))
uint8_t mmu_get_uint8(const void *p8) {
  // Read one byte from IRAM by loading the enclosing aligned 32-bit word
  // and shifting the wanted byte down.
  ASSERT_RANGE_TEST_READ(p8);
  // https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly
  // Comply with strict-aliasing rules. Using memcpy is a Standards suggested
  // method for type punning. The compiler optimizer will replace the memcpy
  // with an `l32i` instruction. Using __builtin_memcpy to ensure we get the
  // effects of the compiler optimization and not some #define version of
  // memcpy.
  void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Use an empty ASM to reference the 32-bit value. This will block the
  // compiler from immediately optimizing to an 8-bit or 16-bit load instruction
  // against IRAM memory. (This approach was inspired by
  // https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374)
  // This issue was seen when using a constant address with the GCC 10.3
  // compiler.
  // As a general practice, referencing by way of an Extended ASM R/W
  // output register will stop the compiler from reloading the value later
  // as an 8-bit load from IRAM.
  asm volatile ("" :"+r"(val)); // inject 32-bit dependency
  uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
  val >>= pos;
  return (uint8_t)val;
}
@ -136,8 +167,11 @@ uint8_t mmu_get_uint8(const void *p8) {
static inline __attribute__((always_inline))
uint16_t mmu_get_uint16(const uint16_t *p16) {
  // Read an aligned 16-bit value from IRAM via its enclosing 32-bit word.
  ASSERT_RANGE_TEST_READ(p16);
  // Round down to the enclosing 32-bit word and fetch with __builtin_memcpy
  // to comply with strict-aliasing rules; see mmu_get_uint8() for details.
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Inject a 32-bit dependency so the compiler cannot narrow the load.
  asm volatile ("" :"+r"(val));
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  val >>= pos;
  return (uint16_t)val;
}
@ -145,8 +179,11 @@ uint16_t mmu_get_uint16(const uint16_t *p16) {
static inline __attribute__((always_inline))
int16_t mmu_get_int16(const int16_t *p16) {
  // Read an aligned signed 16-bit value from IRAM via its enclosing
  // 32-bit word.
  ASSERT_RANGE_TEST_READ(p16);
  // Round down to the enclosing 32-bit word and fetch with __builtin_memcpy
  // to comply with strict-aliasing rules; see mmu_get_uint8() for details.
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t val;
  __builtin_memcpy(&val, v32, sizeof(uint32_t));
  // Inject a 32-bit dependency so the compiler cannot narrow the load.
  asm volatile ("" :"+r"(val));
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  val >>= pos;
  return (int16_t)val;
}
@ -154,30 +191,43 @@ int16_t mmu_get_int16(const int16_t *p16) {
static inline __attribute__((always_inline))
uint8_t mmu_set_uint8(void *p8, const uint8_t val) {
  // Write one byte to IRAM with a read-modify-write of the enclosing
  // aligned 32-bit word. Returns val for caller convenience.
  ASSERT_RANGE_TEST_WRITE(p8);
  uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
  uint32_t sval = val << pos;
  uint32_t valmask = 0x0FFu << pos;
  // Strict-aliasing-safe 32-bit load; see mmu_get_uint8() for details.
  void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));
  ival &= (~valmask);
  ival |= sval;
  /*
    This 32-bit dependency injection does not appear to be needed with the
    current GCC 10.3; however, that could change in future versions. Or, the
    right test for it to fail may not exist yet.
  */
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}
static inline __attribute__((always_inline))
uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) {
  // Write an aligned 16-bit value to IRAM with a read-modify-write of the
  // enclosing 32-bit word. Returns val for caller convenience.
  ASSERT_RANGE_TEST_WRITE(p16);
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  uint32_t sval = val << pos;
  uint32_t valmask = 0x0FFFFu << pos;
  // Strict-aliasing-safe 32-bit load; see mmu_get_uint8() for details.
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));
  ival &= (~valmask);
  ival |= sval;
  // Dependency injection before the store; see mmu_set_uint8() comments.
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}
@ -185,32 +235,36 @@ static inline __attribute__((always_inline))
int16_t mmu_set_int16(int16_t *p16, const int16_t val) {
  // Write an aligned signed 16-bit value to IRAM with a read-modify-write
  // of the enclosing 32-bit word. Returns val for caller convenience.
  ASSERT_RANGE_TEST_WRITE(p16);
  uint32_t sval = (uint16_t)val;  // zero-extend before shifting
  uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
  sval <<= pos;
  uint32_t valmask = 0x0FFFFu << pos;
  // Strict-aliasing-safe 32-bit load; see mmu_get_uint8() for details.
  void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
  uint32_t ival;
  __builtin_memcpy(&ival, v32, sizeof(uint32_t));
  asm volatile ("" :"+r"(ival));
  ival &= (~valmask);
  ival |= sval;
  // Dependency injection before the store; see mmu_set_uint8() comments.
  asm volatile ("" :"+r"(ival));
  __builtin_memcpy(v32, &ival, sizeof(uint32_t));
  return val;
}
#if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP)
#define MMU_SEC_HEAP mmu_sec_heap()
#define MMU_SEC_HEAP_SIZE mmu_sec_heap_size()

static inline __attribute__((always_inline))
void *mmu_sec_heap(void) {
  // Start of the 2nd (IRAM) heap: just past the end of IRAM code, padded
  // by 32 bytes and rounded down to an 8-byte boundary.
  // _text_end is presumably a linker-provided symbol — TODO confirm.
  extern void _text_end(void);
  uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u;
  return (void *)(sec_heap &= ~(uintptr_t)7u);
}

static inline __attribute__((always_inline))
size_t mmu_sec_heap_size(void) {
  // Size of the 2nd heap: 48KB (0xC000) of IRAM minus what code occupies.
  return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR);
}
#endif

View File

@ -17,6 +17,12 @@
#include <umm_malloc/umm_malloc.h>
#if defined(UMM_HEAP_IRAM)
#if defined(CORE_MOCK)
#define XCHAL_INSTRAM1_VADDR 0x40100000
#else
#include <sys/config.h> // For config/core-isa.h
#endif
// durable - as in long life, persisting across reboots.
struct durable {
uint32_t bootCounter;
@ -30,7 +36,7 @@ struct durable {
#define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL)
// Position its address just above the reduced 2nd Heap.
#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ)
#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ)
// Define a reference with the right properties to make access easier.
#define DURABLE ((struct durable *)IRAM_RESERVE)
@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) {
adjustments and checksums. These can affect the persistence of data across
reboots.
*/
uint32_t sec_heap = (uint32_t)_text_end + 32;
uintptr_t sec_heap = (uintptr_t)_text_end + 32;
sec_heap &= ~7;
size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL);
size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR);
sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap
if (0xC000UL > sec_heap_sz) {

View File

@ -3,6 +3,12 @@
#include <umm_malloc/umm_malloc.h>
#include <umm_malloc/umm_heap_select.h>
#if defined(CORE_MOCK)
#define XCHAL_INSTRAM1_VADDR 0x40100000
#else
#include <sys/config.h> // For config/core-isa.h
#endif
uint32_t timed_byte_read(char *pc, uint32_t * o);
uint32_t timed_byte_read2(char *pc, uint32_t * o);
int divideA_B(int a, int b);
@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) {
#ifdef MMU_IRAM_SIZE
oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE);
oStream.println();
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL);
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR);
oStream.printf_P(PSTR(" IRAM free: %u"), iram_free);
oStream.println();
#endif

View File

@ -14,6 +14,204 @@
#define ETS_PRINTF ets_uart_printf
#endif
/*
Verify mmu_get_uint16()'s compliance with strict-aliasing rules under
different optimizations.
*/
#pragma GCC push_options
// reference
#pragma GCC optimize("O0") // We expect -O0 to generate the correct results
// Reference implementation built at -O0; its result is later compared
// against the -Os/-O2/-O3 builds of the exact same loop in testPunning().
__attribute__((noinline))
void aliasTestReference(uint16_t *x) {
// Without adherence to strict-aliasing rules, this sequence of code would fail
// when optimized by GCC Version 10.3
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
// Tests
#pragma GCC optimize("Os")
// Same loop as aliasTestReference(), built at -Os (see the #pragma above).
// Must produce the same result as the -O0 reference.
__attribute__((noinline))
void aliasTestOs(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
#pragma GCC optimize("O2")
// Same loop as aliasTestReference(), built at -O2 (see the #pragma above).
// Must produce the same result as the -O0 reference.
__attribute__((noinline))
void aliasTestO2(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
#pragma GCC optimize("O3")
// Same loop as aliasTestReference(), built at -O3 (see the #pragma above).
// Must produce the same result as the -O0 reference.
__attribute__((noinline))
void aliasTestO3(uint16_t *x) {
size_t len = 3;
for (size_t u = 0; u < len; u++) {
uint16_t x1 = mmu_get_uint16(&x[0]);
for (size_t v = 0; v < len; v++) {
x[v] = mmu_get_uint16(&x[v]) + x1;
}
}
}
// Evaluate if the optimizer may have changed a 32-bit access to 8-bit.
// An 8-bit access will take longer as it will be processed through
// the exception handler. For this case the -O0 version will appear faster.
#pragma GCC optimize("O0")
// Time (in CPU cycles) one mmu_get_uint8() from a fixed IRAM address,
// built at -O0. This is the baseline the optimized builds compare against.
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_Reference(uint8_t *res) {
// This test case was verified with GCC 10.3
// There is a code case that can result in a 32-bit wide IRAM load from memory
// being optimized down to an 8-bit memory access. In this test case we need
// to supply a constant IRAM address that is not 0 when ANDed with 3u.
// This section verifies that the workaround implemented by the inline
// function mmu_get_uint8() is preventing this. See comments for function
// mmu_get_uint8() in mmu_iram.h for more details.
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("Os")
// Same timed read as timedRead_Reference(), built at -Os (see #pragma above).
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_Os(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("O2")
// Same timed read as timedRead_Reference(), built at -O2 (see #pragma above).
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_O2(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC optimize("O3")
// Same timed read as timedRead_Reference(), built at -O3 (see #pragma above).
__attribute__((noinline)) IRAM_ATTR
uint32_t timedRead_O3(uint8_t *res) {
const uint8_t *x = (const uint8_t *)0x40100003ul;
uint32_t b = ESP.getCycleCount();
*res = mmu_get_uint8(x);
return ESP.getCycleCount() - b;
}
#pragma GCC pop_options
// Print one optimization level's cycle count and compare it to the -O0
// reference. Returns true when the optimized build is faster than the
// reference (i.e. the 32-bit wide IRAM load was preserved and the byte
// read did not go through the exception handler).
static bool reportTimedRead(const char *label, uint32_t cycle_count, uint32_t cycle_count_ref) {
  Serial.printf(" Option %s, cycle count %5u ", label, cycle_count);
  if (cycle_count_ref > cycle_count) {
    Serial.printf("- passed\r\n");
    return true;
  }
  Serial.printf("- failed\r\n");
  return false;
}

// Verify that mmu_get_uint8()'s 32-bit wide IRAM access survives each
// optimization level. Returns true when all levels pass.
bool test4_32bit_loads() {
  uint8_t res;
  Serial.printf("\r\nFor mmu_get_uint8, verify that 32-bit wide IRAM access is preserved across different optimizations:\r\n");
  uint32_t cycle_count_ref = timedRead_Reference(&res);
  /*
    If the optimizer (for options -Os, -O2, and -O3) replaces the 32-bit wide
    IRAM access with an 8-bit, the exception handler will get invoked on memory
    reads. The total execution time will show a significant increase when
    compared to the reference (option -O0).
  */
  Serial.printf(" Option -O0, cycle count %5u - reference\r\n", cycle_count_ref);
  // Evaluate the calls first so every level runs even after a failure.
  bool result = reportTimedRead("-Os", timedRead_Os(&res), cycle_count_ref);
  result = reportTimedRead("-O2", timedRead_O2(&res), cycle_count_ref) && result;
  result = reportTimedRead("-O3", timedRead_O3(&res), cycle_count_ref) && result;
  return result;
}
// On a strict-aliasing test failure, dump the expected (reference) values
// and the values actually produced, side by side.
void printPunFail(uint16_t *ref, uint16_t *x, size_t sz) {
  auto dumpRow = [sz](const uint16_t *vals) {
    for (size_t idx = 0; idx < sz; idx++) {
      Serial.printf(" %3u", vals[idx]);
    }
  };
  Serial.printf(" Expected:");
  dumpRow(ref);
  Serial.printf("\r\n Got: ");
  dumpRow(x);
  Serial.printf("\r\n");
}
// Run one optimized variant of the alias test on a fresh array and compare
// its output against the -O0 reference result. Prints pass/fail; on failure
// also dumps the expected and actual values. Returns true on a match.
static bool punCase(const char *label, void (*fn)(uint16_t *), uint16_t *x_ref) {
  alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0};
  fn(x);
  Serial.printf(" Option %s ", label);
  if (0 == memcmp(x_ref, x, sizeof(x))) {
    Serial.printf("- passed\r\n");
    return true;
  }
  Serial.printf("- failed\r\n");
  printPunFail(x_ref, x, sizeof(x) / sizeof(uint16_t));
  return false;
}

// Verify mmu_get_uint16() strict-aliasing compliance: the -Os/-O2/-O3 builds
// of the alias test must match the -O0 reference. Returns true when all pass.
bool testPunning() {
  // Get reference result for verifying the tests (computed at -O0).
  alignas(uint32_t) uint16_t x_ref[] = {1, 2, 3, 0};
  aliasTestReference(x_ref);
  Serial.printf("mmu_get_uint16() strict-aliasing tests with different optimizations:\r\n");
  // Evaluate the calls first so every level runs even after a failure.
  bool result = punCase("-Os", aliasTestOs, x_ref);
  result = punCase("-O2", aliasTestO2, x_ref) && result;
  result = punCase("-O3", aliasTestO3, x_ref) && result;
  return result;
}
uint32_t cyclesToRead_nKx32(int n, unsigned int *x, uint32_t *res) {
uint32_t b = ESP.getCycleCount();
uint32_t sum = 0;
@ -95,7 +293,6 @@ uint32_t cyclesToWrite_nKx8(int n, unsigned char*x) {
}
// Compare with Inline
uint32_t cyclesToRead_nKx16_viaInline(int n, unsigned short *x, uint32_t *res) {
uint32_t b = ESP.getCycleCount();
uint32_t sum = 0;
@ -159,6 +356,7 @@ uint32_t cyclesToWrite_nKx8_viaInline(int n, unsigned char*x) {
return ESP.getCycleCount() - b;
}
bool perfTest_nK(int nK, uint32_t *mem, uint32_t *imem) {
uint32_t res, verify_res;
uint32_t t;
@ -317,7 +515,7 @@ void setup() {
Serial.println();
if (perfTest_nK(1, mem, imem)) {
if (perfTest_nK(1, mem, imem) && testPunning() && test4_32bit_loads()) {
Serial.println();
} else {
Serial.println("\r\n*******************************");