1
0
mirror of https://github.com/esp8266/Arduino.git synced 2025-07-24 19:42:27 +03:00

Extended ASM got fussy when using different optimizations. ie. HWDT (#7816)

resets. It seems you should not use input registers for scratch registers.
Add an extra output register instead. No code size increase.

Light refactoring for readability

Added "C" reference code for Extended ASM

Save two cycles by loading a0 early in exc-c-wrapper-handler.S

Use optimization O2
Net change in size, 0 bytes with optimization. Save 4 bytes w/o Optimization.
With changes and "O2" save 3 cycles on write and 6 cycles on read.
This commit is contained in:
M Hightower
2021-01-07 21:05:24 -08:00
committed by GitHub
parent 2f5979f1f0
commit 67e1dfc5a4
2 changed files with 87 additions and 52 deletions

View File

@ -18,10 +18,10 @@
*/
/*
* This exception handler, allows for byte or short accesses to iRAM or PROGMEM
* to succeed without causing a crash. It is still preferred to use the xxx_P
* macros whenever possible, since they are probably 30x faster than this
* exception handler method.
* This exception handler handles EXCCAUSE_LOAD_STORE_ERROR. It allows for a
* byte or short access to iRAM or PROGMEM to succeed without causing a crash.
* When reading, it is still preferred to use the xxx_P macros when possible
* since they are probably 30x faster than this exception handler method.
*
* Code taken directly from @pvvx's public domain code in
* https://github.com/pvvx/esp8266web/blob/master/app/sdklib/system/app_main.c
@ -37,6 +37,16 @@
#include <Schedule.h>
#include <debug.h>
// All of these optimization were tried and now work
// These results were from irammem.ino using GCC 10.2
// DRAM reference uint16 9 AVG cycles/transfer
// #pragma GCC optimize("O0") // uint16, 289 AVG cycles/transfer, IRAM: +180
// #pragma GCC optimize("O1") // uint16, 241 AVG cycles/transfer, IRAM: +16
#pragma GCC optimize("O2") // uint16, 230 AVG cycles/transfer, IRAM: +4
// #pragma GCC optimize("O3") // uint16, 230 AVG cycles/transfer, IRAM: +4
// #pragma GCC optimize("Ofast") // uint16, 230 AVG cycles/transfer, IRAM: +4
// #pragma GCC optimize("Os") // uint16, 233 AVG cycles/transfer, IRAM: 27556 +0
extern "C" {
#define LOAD_MASK 0x00f00fu
@ -50,32 +60,55 @@ extern "C" {
static fn_c_exception_handler_t old_c_handler = NULL;
static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
static
IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
{
do {
/*
Had to split out some of the asm, compiler was reusing a register that it
needed later. A crash would come or go away with the slightest unrelated
changes elsewhere in the function.
In adapting the public domain version, a crash would come or go away with
the slightest unrelated changes elsewhere in the function. Observed that
register a15 was used for epc1, then clobbered by `rsr.` I now believe a
"&" on the output register would have resolved the problem.
Register a15 was used for epc1, then clobbered for rsr. Maybe an
__asm("":::"memory") before starting the asm would help for these cases.
For this instance moved setting epc1 closer to where it was used.
Edit. "&" on output register would have resolved the problem.
Refactored to reduce and consolidate register usage.
However, I have refactored the Extended ASM to reduce and consolidate
register usage and corrected the issue.
The positioning of the Extended ASM block (as early as possible in the
compiled function) is in part controlled by the immediate need for
output variable `insn`. This placement aids in getting excvaddr read as
early as possible.
*/
uint32_t insn;
__asm(
"movi %0, ~3\n\t" /* prepare a mask for the EPC */
"and %0, %0, %1\n\t" /* apply mask for 32bit aligned base */
"ssa8l %1\n\t" /* set up shift register for src op */
"l32i %1, %0, 0\n\t" /* load part 1 */
"l32i %0, %0, 4\n\t" /* load part 2 */
"src %0, %0, %1\n\t" /* right shift to get faulting instruction */
:"=&r"(insn)
:"r"(ef->epc)
:
);
uint32_t insn, excvaddr;
#if 1
{
uint32_t tmp;
__asm__ (
"rsr.excvaddr %[vaddr]\n\t" /* Read faulting address as early as possible */
"movi.n %[tmp], ~3\n\t" /* prepare a mask for the EPC */
"and %[tmp], %[tmp], %[epc]\n\t" /* apply mask for 32-bit aligned base */
"ssa8l %[epc]\n\t" /* set up shift register for src op */
"l32i %[insn], %[tmp], 0\n\t" /* load part 1 */
"l32i %[tmp], %[tmp], 4\n\t" /* load part 2 */
"src %[insn], %[tmp], %[insn]\n\t" /* right shift to get faulting instruction */
: [vaddr]"=&r"(excvaddr), [insn]"=&r"(insn), [tmp]"=&r"(tmp)
: [epc]"r"(ef->epc) :);
}
#else
{
__asm__ __volatile__ ("rsr.excvaddr %0;" : "=r"(excvaddr):: "memory");
/*
"C" reference code for the ASM to document intent.
May also prove useful when issolating possible issues with Extended ASM,
optimizations, new compilers, etc.
*/
uint32_t epc = ef->epc;
uint32_t *pWord = (uint32_t *)(epc & ~3);
uint64_t big_word = ((uint64_t)pWord[1] << 32) | pWord[0];
uint32_t pos = (epc & 3) * 8;
insn = (uint32_t)(big_word >>= pos);
}
#endif
uint32_t what = insn & LOAD_MASK;
uint32_t valmask = 0;
@ -102,10 +135,6 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
--regno; /* account for skipped a1 in exception_frame */
}
uint32_t excvaddr;
/* read out the faulting address */
__asm("rsr %0, EXCVADDR;" :"=r"(excvaddr)::);
#ifdef DEBUG_ESP_MMU
/* debug option to validate address so we don't hide memory access bugs in APP */
if (mmu_is_iram((void *)excvaddr) || (is_read && mmu_is_icache((void *)excvaddr))) {
@ -114,31 +143,34 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
continue; /* fail */
}
#endif
{
uint32_t *pWord = (uint32_t *)(excvaddr & ~0x3);
uint32_t pos = (excvaddr & 0x3) * 8;
uint32_t mem_val = *pWord;
if (is_read) {
/* Load, shift and mask down to correct size */
uint32_t val = (*(uint32_t *)(excvaddr & ~0x3));
val >>= (excvaddr & 0x3) * 8;
val &= valmask;
if (is_read) {
/* shift and mask down to correct size */
mem_val >>= pos;
mem_val &= valmask;
/* Sign-extend for L16SI, if applicable */
if (what == L16SI_MATCH && (val & 0x8000)) {
val |= 0xffff0000;
/* Sign-extend for L16SI, if applicable */
if (what == L16SI_MATCH && (mem_val & 0x8000)) {
mem_val |= 0xffff0000;
}
ef->a_reg[regno] = mem_val; /* carry out the load */
} else { /* is write */
uint32_t val = ef->a_reg[regno]; /* get value to store from register */
val <<= pos;
valmask <<= pos;
val &= valmask;
/* mask out field, and merge */
mem_val &= (~valmask);
mem_val |= val;
*pWord = mem_val; /* carry out the store */
}
ef->a_reg[regno] = val; /* carry out the load */
} else { /* is write */
uint32_t val = ef->a_reg[regno]; /* get value to store from register */
val <<= (excvaddr & 0x3) * 8;
valmask <<= (excvaddr & 0x3) * 8;
val &= valmask;
/* Load, mask out field, and merge */
uint32_t dst_val = (*(uint32_t *)(excvaddr & ~0x3));
dst_val &= (~valmask);
dst_val |= val;
(*(uint32_t *)(excvaddr & ~0x3)) = dst_val; /* carry out the store */
}
ef->epc += 3; /* resume at following instruction */
@ -201,6 +233,7 @@ static void _set_exception_handler_wrapper(int cause) {
}
}
void install_non32xfer_exception_handler(void) __attribute__((weak));
void install_non32xfer_exception_handler(void) {
if (NULL == old_c_handler) {
// Set the "C" exception handler the wrapper will call

View File

@ -189,6 +189,9 @@ _xtos_c_wrapper_handler:
// Restore special registers
l32i a14, a1, UEXC_sar
// load early - saves two cycles - @mhightower83
movi a0, _xtos_return_from_exc
// For compatibility with Arduino interrupt architecture, we keep interrupts 100%
// disabled.
// /*
@ -201,7 +204,6 @@ _xtos_c_wrapper_handler:
// rsil a12, 1 // XCHAL_EXCM_LEVEL
rsil a12, 15 // All levels blocked.
wsr a14, SAR
movi a0, _xtos_return_from_exc
jx a0
/* FIXME: what about _GeneralException ? */