Correct millis() drift, issue #3078 (#4264)

* Correct millis() drift, issue 3078 * Add 'test_millis_mm.ino' runtime benchmark * Eliminate 'punning' warning Add union 'acc' in millis() to eliminate 'punning' compiler warning * Correct minor typo * Eliminate 'punning' warning Add union 'acc' to eliminate 'punning' compiler warning in 'millis_test'_DEBUG() and 'millis_test()'
2025-07-01 03:26:58 +03:00 · 2018-03-12 12:28:04 -06:00
parent 5c0a57ff6d
commit 3934d10f21
2 changed files with 618 additions and 5 deletions
--- a/cores/esp8266/core_esp8266_wiring.c
+++ b/cores/esp8266/core_esp8266_wiring.c
@ -61,11 +61,133 @@ void micros_overflow_tick(void* arg) {
    micros_at_last_overflow_tick = m;
 }
-unsigned long ICACHE_RAM_ATTR millis() {
+//---------------------------------------------------------------------------
 // millis() 'magic multiplier' approximation
 //
 // This function corrects the cumlative (296us / usec overflow) drift
 // seen in the orignal 'millis()' function.
 //
 // Input:
 //    'm' - 32-bit usec counter,           0 <= m <= 0xFFFFFFFF
 //    'c' - 32-bit usec overflow counter   0 <= c <  0x00400000
 // Output:
 //    Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
 //
 // Notes:
 //
 // 1) This routine approximates the 64-bit integer division,
 //
 //    quotient =  ( 2^32 c + m ) / 1000,
 //
 //    through the use of 'magic' multipliers. A slow division is replaced by
 //    a faster multiply using a scaled multiplicative inverse of the divisor:
 //
 //    quotient =~ ( 2^32 c + m ) * k,  where k = Ceiling[ 2^n / 1000 ]
 //
 //    The precision difference between multiplier and divisor sets the
 //    upper-bound of the dividend which can be successfully divided.
 //
 //    For this application, n = 64, and the divisor (1000) has 10-bits of 
 //    precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
 //    and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
 //    for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
 //
 // 2) A distributed multiply with offset-summing is used find k( 2^32 c + m ):
 //
 //      prd = (2^32 kh + kl) * ( 2^32 c + m )
 //          = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
 //               (d)         (c)         (b)       (a)
 //
 //    Graphically, the offset-sums align in little endian like this:
 //                  LS -> MS
 //            32       64       96      128
 //    | a[-1]  |  a[0]  |  a[1]  |  a[2]  |
 //    |       m kl      |    0   |    0   |  a[-1] not needed
 //    |        |       m kh      |        |
 //    |        |       c kl      |        |  a[1] holds the result
 //    |        |        |       c kh      |  a[2] can be discarded
 //
 //    As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
 //    overall result, only (2) 32-bit words are needed for the accumulator.
 //
 // 3) As C++ does not intrinsically test for addition overflows, one must
 //    code specifically to detect them. This approximation skips these
 //    overflow checks for speed, hence the sum,
 //
 //    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
 //
 //    To meet this criteria, not only do we have to pick 'k' to achieve our
 //    desired precision, we also have to split 'k' appropriately to avoid
 //    any addition overflows.
 //
 //    'k' should be also chosen to align the various products on byte
 //    boundaries to avoid any 64-bit shifts before additions, as they incur
 //    major time penalties. The 'k' chosen for this specific division by 1000
 //    was picked primarily to avoid shifts as well as for precision.
 //
 //    For the reasons list above, this routine is NOT a general one.
 //    Changing divisors could break the overflow requirement and force
 //    picking a 'k' split which requires shifts before additions.
 //
 //              ** Test THOROUGHLY after making changes **
 //
 // 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
 //
 //         usec   x Orig   Comment
 // Orig:   3.18   1.00     Original code
 // Corr:  13.21   4.15     64-bit reference code
 // Test:   4.60   1.45     64-bit magic multiply, 4x32
 //
 // The magic multiplier routine runs ~3x faster than the reference. Execution
 // times can vary considerably with the numbers being multiplied, so one
 // should derate this factor to around 2x, worst case.
 //
 //   Reference function: corrected millis(), 64-bit arithmetic,
 //                       truncated to 32-bits by return
 //   unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
 //   {
 //     // Get usec system time, usec overflow conter
 //     ......
 //     return ( (c * 4294967296 + m) / 1000 );  // 64-bit division is SLOW
 //   } //millis_corr
 //
 // 5) See this link for a good discussion on magic multipliers:
 //    http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
 //
 #define  MAGIC_1E3_wLO  0x4bc6a7f0    // LS part
 #define  MAGIC_1E3_wHI  0x00418937    // MS part, magic multiplier
 unsigned long ICACHE_RAM_ATTR millis()
 {
  union {
     uint64_t  q;     // Accumulator, 64-bit, little endian
     uint32_t  a[2];  // ..........., 32-bit  segments
  } acc;
  acc.a[1] = 0;       // Zero high-acc
  // Get usec system time, usec overflow counter
  uint32_t  m = system_get_time();
-    uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick) ? 1 : 0);
+  uint32_t  c = micros_overflow_count +
-    return c * 4294967 + m / 1000;
+                   ((m < micros_at_last_overflow_tick) ? 1 : 0);
-}
+
  // (a) Init. low-acc with high-word of 1st product. The right-shift
  //     falls on a byte boundary, hence is relatively quick.
  acc.q  = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
  // (b) Offset sum, low-acc
  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );
  // (c) Offset sum, low-acc
  acc.q += ( c * (uint64_t)MAGIC_1E3_wLO );
  // (d) Truncated sum, high-acc
  acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
  return ( acc.a[1] );  // Extract result, high-acc
 } //millis
 unsigned long ICACHE_RAM_ATTR micros() {
    return system_get_time();
--- a/tests/device/test_millis_mm/test_millis_mm.ino
+++ b/tests/device/test_millis_mm/test_millis_mm.ino
@ -0,0 +1,491 @@
 //
 // Millis() Runtime Benchmarke
 //
 //  Code to determine the runtime in 'usec' of various millis()
 //  functions.
 //
 #include <Arduino.h>
 #include <ESP8266WiFi.h>
 #include <stdio.h>
 #include <BSTest.h>
 // Include API-Headers
 extern "C" {                  // SDK functions for Arduino IDE access
 #include "osapi.h"
 #include "user_interface.h"
 #include "espconn.h"
 }  //end, 'extern C'
 //---------------------------------------------------------------------------
 // Globals
 //---------------------------------------------------------------------------
 //                 Here                    // In 'core_esp8266_wiring.c'
 static os_timer_t  us_ovf_timer;           // 'micros_overflow_timer'
 static uint32_t    us_cnt = 0;             // 'm' in 'micros_overflow_tick()'
 static uint32_t    us_ovflow = 0;          // 'micros_overflow_count'
 static uint32_t    us_at_last_ovf = 0;     // 'micros_at_last_overflow_tick'
 static bool        fixed_systime = false;  // Testing vars
 static bool        debugf = false;
 static uint32_t    us_systime = 0;
 static float       nsf = 0;                // Normalization factor
 static uint32_t    cntref = 0;             // Ref. comparision count
 //---------------------------------------------------------------------------
 // Interrupt code lifted directly from "cores/core_esp8266_wiring.c",
 // with some variable renaming
 //---------------------------------------------------------------------------
 // Callback for usec counter overflow timer interrupt
 void  us_overflow_tick ( void* arg )
 {
  us_cnt = system_get_time();
  // Check for usec counter overflow
  if ( us_cnt < us_at_last_ovf ) {
    ++us_ovflow;
  } //end-if
  us_at_last_ovf = us_cnt;
 } //us_overflow_tick
 //---------------------------------------------------------------------------
 #define  REPEAT 1
 #define  ONCE   0
 void us_count_init ( void )
 {
  os_timer_setfn( &us_ovf_timer, (os_timer_func_t*)&us_overflow_tick, 0 );
  os_timer_arm( &us_ovf_timer, 60000, REPEAT );
 } //us_count_init
 //---------------------------------------------------------------------------
 // Wrapper(s) for our benchmark
 //---------------------------------------------------------------------------
 // Set a fixed value of usec system time
 void  set_systime ( uint32_t usec )
 {
  us_systime = usec;
 } //set_systime
 //---------------------------------------------------------------------------
 // Wrapper to return a fixed system time
 uint32_t system_get_timeA ( void )
 {
  return ( fixed_systime ? us_systime : system_get_time() );
 } //system_get_timeA
 //---------------------------------------------------------------------------
 // Functions to be tested
 //---------------------------------------------------------------------------
 // Print integer list as hex
 void viewhex( uint16_t *p, uint8_t n )
 {
  Serial.print( "0x" );
  for ( uint8_t i = 0; i < n; i++ )
  {
    Serial.printf( "%04X ", p[ (n - 1) - i ] );
  }
 } //viewhex
 //---------------------------------------------------------------------------
 // Support routine for 'millis_test_DEBUG()'
 // Print accumulator value along interm summed into it
 void view_accsum ( const char *desc, uint16_t *acc, uint16_t *itrm )
 {
   Serial.print( "acc:  " );
   viewhex( acc, 4 );
   Serial.printf( "  %s = ", desc );
   viewhex( itrm, 4 );
   Serial.println();
 } //view_accsum
 //---------------------------------------------------------------------------
 // FOR BENCHTEST
 // Original millis() function
 unsigned long ICACHE_RAM_ATTR millis_orig ( void )
 {
  // Get usec system time, usec overflow conter
  uint32_t  m = system_get_time();
  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
  return ( (c * 4294967) + m / 1000 );
 } //millis_orig
 //---------------------------------------------------------------------------
 // FOR DEBUG
 // Corrected millis(), 64-bit arithmetic gold standard
 // truncated to 32-bits by return
 unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
 {
  // Get usec system time, usec overflow conter
  uint32_t  m = system_get_timeA();   // DEBUG
  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
  return ( (c * 4294967296 + m) / 1000 );
 } //millis_corr_DEBUG
 //---------------------------------------------------------------------------
 // FOR BENCHMARK
 unsigned long ICACHE_RAM_ATTR millis_corr ( void )
 {
  // Get usec system time, usec overflow conter
  uint32_t  m = system_get_time();
  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
  return ( (c * 4294967296 + m) / 1000 );
 } //millis_corr
 //---------------------------------------------------------------------------
 // FOR DEBUG
 // millis() 'magic multiplier' approximation
 //
 // This function corrects the cumlative (296us / usec overflow) drift
 // seen in the orignal 'millis()' function.
 //
 // Input:
 //    'm' - 32-bit usec counter,           0 <= m <= 0xFFFFFFFF
 //    'c' - 32-bit usec overflow counter   0 <= c <  0x00400000
 // Output:
 //    Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
 //
 // Notes:
 //
 // 1) This routine approximates the 64-bit integer division,
 //
 //    quotient =  ( 2^32 c + m ) / 1000,
 //
 //    through the use of 'magic' multipliers. A slow division is replaced by
 //    a faster multiply using a scaled multiplicative inverse of the divisor:
 //
 //    quotient =~ ( 2^32 c + m ) * k,  where k = Ceiling[ 2^n / 1000 ]
 //
 //    The precision difference between multiplier and divisor sets the
 //    upper-bound of the dividend which can be successfully divided.
 //
 //    For this application, n = 64, and the divisor (1000) has 10-bits of 
 //    precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
 //    and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
 //    for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
 //
 // 2) A distributed multiply with offset-summing is used find k( 2^32 c + m ):
 //
 //      prd = (2^32 kh + kl) * ( 2^32 c + m )
 //          = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
 //               (d)         (c)         (b)       (a)
 //
 //    Graphically, the offset-sums align in little endian like this:
 //                  LS -> MS
 //            32       64       96      128
 //    | a[-1]  |  a[0]  |  a[1]  |  a[2]  |
 //    |       m kl      |    0   |    0   |  a[-1] not needed
 //    |        |       m kh      |        |
 //    |        |       c kl      |        |  a[1] holds the result
 //    |        |        |       c kh      |  a[2] can be discarded
 //
 //    As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
 //    overall result, only (2) 32-bit words are needed for the accumulator.
 //
 // 3) As C++ does not intrinsically test for addition overflows, one must
 //    code specifically to detect them. This approximation skips these
 //    overflow checks for speed, hence the sum,
 //
 //    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
 //
 //    To meet this criteria, not only do we have to pick 'k' to achieve our
 //    desired precision, we also have to split 'k' appropriately to avoid
 //    any addition overflows.
 //
 //    'k' should be also chosen to align the various products on byte
 //    boundaries to avoid any 64-bit shifts before additions, as they incur
 //    major time penalties. The 'k' chosen for this specific division by 1000
 //    was picked primarily to avoid shifts as well as for precision.
 //
 //    For the reasons list above, this routine is NOT a general one.
 //    Changing divisors could break the overflow requirement and force
 //    picking a 'k' split which requires shifts before additions.
 //
 //              ** Test THOROUGHLY after making changes **
 //
 // 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
 //
 //         usec   x Orig   Comment
 // Orig:   3.18   1.00     Original code
 // Corr:  13.21   4.15     64-bit reference code
 // Test:   4.60   1.45     64-bit magic multiply, 4x32
 //
 // The magic multiplier routine runs ~3x faster than the reference. Execution
 // times can vary considerably with the numbers being multiplied, so one
 // should derate this factor to around 2x, worst case.
 //
 //   Reference function: corrected millis(), 64-bit arithmetic,
 //                       truncated to 32-bits by return
 //   unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
 //   {
 //     // Get usec system time, usec overflow conter
 //     ......
 //     return ( (c * 4294967296 + m) / 1000 );  // 64-bit division is SLOW
 //   } //millis_corr
 //
 // 5) See this link for a good discussion on magic multipliers:
 //    http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
 //
 #define  M_USEC_MAX     0xFFFFFFFF
 #define  C_COUNT_MAX    0x00400000
 #define  MAGIC_1E3_wLO  0x4bc6a7f0    // LS part
 #define  MAGIC_1E3_wHI  0x00418937    // MS part, magic multiplier
 unsigned long ICACHE_RAM_ATTR millis_test_DEBUG ( void )
 {
  union {
    uint64_t  q;     // Accumulator, 64-bit, little endian
    uint32_t  a[2];  // ..........., 32-bit  segments
  } acc;
  acc.a[1] = 0;      // Zero high-acc
  uint64_t  prd;     // Interm product
  // Get usec system time, usec overflow counter
  uint32_t  m = system_get_timeA();
  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
  // DEBUG: Show input vars
  if ( debugf )
     Serial.printf( "Test  m: 0x%08X  c: 0x%08X\n", m, c );
  // (a) Init. low-acc with high-word of 1st product. The right-shift
  //     falls on a byte boundary, hence is relatively quick.
  acc.q  = ( (prd = (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO )) >> 32 );
  // DEBUG: Show both accumulator and interm product
  if( debugf )
     view_accsum( "m kl", (uint16_t *)&acc.q, (uint16_t *)&prd );
  // (b) Offset sum, low-acc
  acc.q += ( prd = ( m * (uint64_t)MAGIC_1E3_wHI ) );
  // DEBUG: Show both accumulator and interm product
  if( debugf )
     view_accsum( "m kh", (uint16_t *)&acc.q, (uint16_t *)&prd );
  // (c) Offset sum, low-acc
  acc.q += ( prd = ( c * (uint64_t)MAGIC_1E3_wLO ) );
  // DEBUG: Show both accumulator and interm product
  if( debugf )
     view_accsum( "c kl", (uint16_t *)&acc.q, (uint16_t *)&prd );
  // (d) Truncated sum, high-acc
  acc.a[1] += (uint32_t)( prd = ( c * (uint64_t)MAGIC_1E3_wHI ) );
  // DEBUG: Show both accumulator and interm product
  if( debugf )
     view_accsum( "c kh", (uint16_t *)&acc.q, (uint16_t *)&prd );
  return ( acc.a[1] );  // Extract result, high-acc
 } //millis_test_DEBUG
 //---------------------------------------------------------------------------
 // FOR BENCHTEST
 unsigned long ICACHE_RAM_ATTR millis_test ( void )
 {
  union {
    uint64_t  q;     // Accumulator, 64-bit, little endian
    uint32_t  a[2];  // ..........., 32-bit  segments
  } acc;
  acc.a[1] = 0;      // Zero high-acc
  // Get usec system time, usec overflow counter
  uint32_t  m = system_get_time();
  uint32_t  c = us_ovflow + ((m < us_at_last_ovf) ? 1 : 0);
  // (a) Init. low-acc with high-word of 1st product. The right-shift
  //     falls on a byte boundary, hence is relatively quick.
  acc.q  = ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
  // (b) Offset sum, low-acc
  acc.q += ( m * (uint64_t)MAGIC_1E3_wHI );
  // (c) Offset sum, low-acc
  acc.q += ( c * (uint64_t)MAGIC_1E3_wLO );
  // (d) Truncated sum, high-acc
  acc.a[1] += (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
  return ( acc.a[1] );  // Extract result, high-acc
 } //millis_test
 //---------------------------------------------------------------------------
 // Execution time benchmark
 //---------------------------------------------------------------------------
 // Print benchmark result
 void  millis_rtms_print ( const char *pream, uint32_t cntx, const char *desc )
 {
  Serial.print( pream );
  Serial.print( nsf * (float)cntx );
  Serial.print( "   " );
  Serial.print( (float)cntx / (float)cntref );
  Serial.print( "     " );  
  Serial.println( desc );
 } //millis_rtms_print
 //---------------------------------------------------------------------------
 void  Millis_RunTimes ( void )
 {
  Serial.println();
  Serial.println( "Millis() RunTime Benchmarks" );
  uint32_t lc = 100000;          // Samples
  nsf = 1 / float(lc);           // Normalization (global)
  uint32_t bgn;
  uint32_t cnto  = 0, cntv = 0;
  uint32_t cntcx = 0, cntc = 0;
  uint32_t cntfx = 0, cntf = 0;
  // Setup timer values
  fixed_systime = true; 
  us_ovflow      =  C_COUNT_MAX;
  us_at_last_ovf = (M_USEC_MAX - (20 * 1000000));   // Max. less 20 sec
 // No printing, systime active
  debugf = false; fixed_systime = false;
  for (uint32_t i = 0; i < lc; i++ )
  {
    bgn = system_get_time();
    millis_orig();      
    cnto += system_get_time() - bgn;
    bgn = system_get_time();
    millis();      
    cntv += system_get_time() - bgn;
    bgn = system_get_time();
    millis_corr_DEBUG();
    cntcx += system_get_time() - bgn;
    bgn = system_get_time();
    millis_corr();
    cntc += system_get_time() - bgn;
    bgn = system_get_time();
    millis_test_DEBUG();
    cntfx += system_get_time() - bgn;
    bgn = system_get_time();
    millis_test();
    cntf += system_get_time() - bgn;
    yield();
  } //end-for
  cntref = cnto;  // Set global ref. count
  Serial.println();
  Serial.println( "         usec   x Orig   Comment" );
  millis_rtms_print( " Orig:   ",  cntref, "Original code" ); 
  millis_rtms_print( " Core:   ",  cntv,   "Current  core" ); 
  Serial.println();
  millis_rtms_print( "  Ref:  ",   cntcx,  "64-bit reference code,      DEBUG" );
  millis_rtms_print( " Test:   ",  cntfx,  "64-bit magic multiply, 4x32 DEBUG" );  
  Serial.println();
  millis_rtms_print( "  Ref:  ",   cntc,   "64-bit reference code" );  
  millis_rtms_print( " Test:   ",  cntf,   "64-bit magic multiply, 4x32" );
  Serial.println();
  Serial.println( F("*** End, Bench Test ***") );
  Serial.println();
 } //Millis_RunTimes
 //---------------------------------------------------------------------------
 // Debug millis_test()
 //---------------------------------------------------------------------------
 bool  Debug_Millis_Test ( void )
 {
  uint32_t  m, msc, mstx;
  int32_t   diff;
 // Switch over to fixed system time, enable printing
  fixed_systime = true;  debugf = true;
  us_ovflow    = C_COUNT_MAX;
  m            = (M_USEC_MAX - (0 * 1000000));
  set_systime( m );
  us_at_last_ovf = m - 1;  // Disables 'c' bump
 // Millis() comparison, test vs. reference
  Serial.println();
  mstx = millis_test_DEBUG();
  msc  = millis_corr_DEBUG();
  diff = (int32_t)(mstx - msc);
  Serial.println();
  Serial.println( "         m         Test    Reference   Difference" );
  Serial.printf( "0x%08x   0x%08x   0x%08X    %9d\n", m, mstx, msc, diff );
  Serial.println();
 // No printing, variable systime
  debugf = false; fixed_systime = false;
  return( (bool)( diff == 0 ) );   // Good test, matches reference
 } //Debug_Millis_Test
 //---------------------------------------------------------------------------
 //---------------------------------------------------------------------------
 BS_ENV_DECLARE();
 void setup ()
 {
  Serial.begin(115200);
  WiFi.mode( WIFI_OFF );
  us_count_init();        // Start up timer overflow sampling  
  BS_RUN(Serial);
 } //setup
 //---------------------------------------------------------------------------
 void loop(void)
 {
   yield();
 } //loop
 //---------------------------------------------------------------------------
 // Test cases
 //---------------------------------------------------------------------------
 TEST_CASE( "Millis RunTime Benchmarks", "[bs]" )
 {
    Millis_RunTimes();
 } //testcase1
 TEST_CASE( "Debug 'millis_test()' Code", "[bs]" )
 {
    bool ok = Debug_Millis_Test();
    CHECK( ok );
 } //testcase2
 //---------------------------------------------------------------------------
 //---------------------------------------------------------------------------