1
0
mirror of https://github.com/redis/go-redis.git synced 2025-10-18 22:08:50 +03:00
Files
go-redis/maintnotifications/circuit_breaker.go
Nedyalko Dyakov 75ddeb3d5a feat(e2e-testing): maintnotifications e2e and refactor (#3526)
* e2e wip

* cleanup

* remove unused fault injector mock

* errChan in test

* remove log messages tests

* cleanup log messages

* s/hitless/maintnotifications/

* fix moving when none

* better logs

* test with second client after action has started

* Fixes

Signed-off-by: Elena Kolevska <elena@kolevska.com>

* Test fix

Signed-off-by: Elena Kolevska <elena@kolevska.com>

* feat(e2e-test): Extended e2e tests

* imroved e2e test resiliency

---------

Signed-off-by: Elena Kolevska <elena@kolevska.com>
Co-authored-by: Elena Kolevska <elena@kolevska.com>
Co-authored-by: Elena Kolevska <elena-kolevska@users.noreply.github.com>
Co-authored-by: Hristo Temelski <hristo.temelski@redis.com>
2025-09-26 19:17:09 +03:00

354 lines
10 KiB
Go

package maintnotifications
import (
"context"
"sync"
"sync/atomic"
"time"
"github.com/redis/go-redis/v9/internal"
"github.com/redis/go-redis/v9/internal/maintnotifications/logs"
)
// CircuitBreakerState represents the state of a circuit breaker
type CircuitBreakerState int32
const (
// CircuitBreakerClosed - normal operation, requests allowed
CircuitBreakerClosed CircuitBreakerState = iota
// CircuitBreakerOpen - failing fast, requests rejected
CircuitBreakerOpen
// CircuitBreakerHalfOpen - testing if service recovered
CircuitBreakerHalfOpen
)
func (s CircuitBreakerState) String() string {
switch s {
case CircuitBreakerClosed:
return "closed"
case CircuitBreakerOpen:
return "open"
case CircuitBreakerHalfOpen:
return "half-open"
default:
return "unknown"
}
}
// CircuitBreaker implements the circuit breaker pattern for endpoint-specific failure handling
type CircuitBreaker struct {
// Configuration
failureThreshold int // Number of failures before opening
resetTimeout time.Duration // How long to stay open before testing
maxRequests int // Max requests allowed in half-open state
// State tracking (atomic for lock-free access)
state atomic.Int32 // CircuitBreakerState
failures atomic.Int64 // Current failure count
successes atomic.Int64 // Success count in half-open state
requests atomic.Int64 // Request count in half-open state
lastFailureTime atomic.Int64 // Unix timestamp of last failure
lastSuccessTime atomic.Int64 // Unix timestamp of last success
// Endpoint identification
endpoint string
config *Config
}
// newCircuitBreaker creates a new circuit breaker for an endpoint
func newCircuitBreaker(endpoint string, config *Config) *CircuitBreaker {
// Use configuration values with sensible defaults
failureThreshold := 5
resetTimeout := 60 * time.Second
maxRequests := 3
if config != nil {
failureThreshold = config.CircuitBreakerFailureThreshold
resetTimeout = config.CircuitBreakerResetTimeout
maxRequests = config.CircuitBreakerMaxRequests
}
return &CircuitBreaker{
failureThreshold: failureThreshold,
resetTimeout: resetTimeout,
maxRequests: maxRequests,
endpoint: endpoint,
config: config,
state: atomic.Int32{}, // Defaults to CircuitBreakerClosed (0)
}
}
// IsOpen returns true if the circuit breaker is open (rejecting requests)
func (cb *CircuitBreaker) IsOpen() bool {
state := CircuitBreakerState(cb.state.Load())
return state == CircuitBreakerOpen
}
// shouldAttemptReset checks if enough time has passed to attempt reset
func (cb *CircuitBreaker) shouldAttemptReset() bool {
lastFailure := time.Unix(cb.lastFailureTime.Load(), 0)
return time.Since(lastFailure) >= cb.resetTimeout
}
// Execute runs the given function with circuit breaker protection
func (cb *CircuitBreaker) Execute(fn func() error) error {
// Single atomic state load for consistency
state := CircuitBreakerState(cb.state.Load())
switch state {
case CircuitBreakerOpen:
if cb.shouldAttemptReset() {
// Attempt transition to half-open
if cb.state.CompareAndSwap(int32(CircuitBreakerOpen), int32(CircuitBreakerHalfOpen)) {
cb.requests.Store(0)
cb.successes.Store(0)
if internal.LogLevel.InfoOrAbove() {
internal.Logger.Printf(context.Background(), logs.CircuitBreakerTransitioningToHalfOpen(cb.endpoint))
}
// Fall through to half-open logic
} else {
return ErrCircuitBreakerOpen
}
} else {
return ErrCircuitBreakerOpen
}
fallthrough
case CircuitBreakerHalfOpen:
requests := cb.requests.Add(1)
if requests > int64(cb.maxRequests) {
cb.requests.Add(-1) // Revert the increment
return ErrCircuitBreakerOpen
}
}
// Execute the function with consistent state
err := fn()
if err != nil {
cb.recordFailure()
return err
}
cb.recordSuccess()
return nil
}
// recordFailure records a failure and potentially opens the circuit
func (cb *CircuitBreaker) recordFailure() {
cb.lastFailureTime.Store(time.Now().Unix())
failures := cb.failures.Add(1)
state := CircuitBreakerState(cb.state.Load())
switch state {
case CircuitBreakerClosed:
if failures >= int64(cb.failureThreshold) {
if cb.state.CompareAndSwap(int32(CircuitBreakerClosed), int32(CircuitBreakerOpen)) {
if internal.LogLevel.WarnOrAbove() {
internal.Logger.Printf(context.Background(), logs.CircuitBreakerOpened(cb.endpoint, failures))
}
}
}
case CircuitBreakerHalfOpen:
// Any failure in half-open state immediately opens the circuit
if cb.state.CompareAndSwap(int32(CircuitBreakerHalfOpen), int32(CircuitBreakerOpen)) {
if internal.LogLevel.WarnOrAbove() {
internal.Logger.Printf(context.Background(), logs.CircuitBreakerReopened(cb.endpoint))
}
}
}
}
// recordSuccess records a success and potentially closes the circuit
func (cb *CircuitBreaker) recordSuccess() {
cb.lastSuccessTime.Store(time.Now().Unix())
state := CircuitBreakerState(cb.state.Load())
switch state {
case CircuitBreakerClosed:
// Reset failure count on success in closed state
cb.failures.Store(0)
case CircuitBreakerHalfOpen:
successes := cb.successes.Add(1)
// If we've had enough successful requests, close the circuit
if successes >= int64(cb.maxRequests) {
if cb.state.CompareAndSwap(int32(CircuitBreakerHalfOpen), int32(CircuitBreakerClosed)) {
cb.failures.Store(0)
if internal.LogLevel.InfoOrAbove() {
internal.Logger.Printf(context.Background(), logs.CircuitBreakerClosed(cb.endpoint, successes))
}
}
}
}
}
// GetState returns the current state of the circuit breaker
func (cb *CircuitBreaker) GetState() CircuitBreakerState {
return CircuitBreakerState(cb.state.Load())
}
// GetStats returns current statistics for monitoring
func (cb *CircuitBreaker) GetStats() CircuitBreakerStats {
return CircuitBreakerStats{
Endpoint: cb.endpoint,
State: cb.GetState(),
Failures: cb.failures.Load(),
Successes: cb.successes.Load(),
Requests: cb.requests.Load(),
LastFailureTime: time.Unix(cb.lastFailureTime.Load(), 0),
LastSuccessTime: time.Unix(cb.lastSuccessTime.Load(), 0),
}
}
// CircuitBreakerStats provides statistics about a circuit breaker
type CircuitBreakerStats struct {
Endpoint string
State CircuitBreakerState
Failures int64
Successes int64
Requests int64
LastFailureTime time.Time
LastSuccessTime time.Time
}
// CircuitBreakerEntry wraps a circuit breaker with access tracking
type CircuitBreakerEntry struct {
breaker *CircuitBreaker
lastAccess atomic.Int64 // Unix timestamp
created time.Time
}
// CircuitBreakerManager manages circuit breakers for multiple endpoints
type CircuitBreakerManager struct {
breakers sync.Map // map[string]*CircuitBreakerEntry
config *Config
cleanupStop chan struct{}
cleanupMu sync.Mutex
lastCleanup atomic.Int64 // Unix timestamp
}
// newCircuitBreakerManager creates a new circuit breaker manager
func newCircuitBreakerManager(config *Config) *CircuitBreakerManager {
cbm := &CircuitBreakerManager{
config: config,
cleanupStop: make(chan struct{}),
}
cbm.lastCleanup.Store(time.Now().Unix())
// Start background cleanup goroutine
go cbm.cleanupLoop()
return cbm
}
// GetCircuitBreaker returns the circuit breaker for an endpoint, creating it if necessary
func (cbm *CircuitBreakerManager) GetCircuitBreaker(endpoint string) *CircuitBreaker {
now := time.Now().Unix()
if entry, ok := cbm.breakers.Load(endpoint); ok {
cbEntry := entry.(*CircuitBreakerEntry)
cbEntry.lastAccess.Store(now)
return cbEntry.breaker
}
// Create new circuit breaker with metadata
newBreaker := newCircuitBreaker(endpoint, cbm.config)
newEntry := &CircuitBreakerEntry{
breaker: newBreaker,
created: time.Now(),
}
newEntry.lastAccess.Store(now)
actual, _ := cbm.breakers.LoadOrStore(endpoint, newEntry)
return actual.(*CircuitBreakerEntry).breaker
}
// GetAllStats returns statistics for all circuit breakers
func (cbm *CircuitBreakerManager) GetAllStats() []CircuitBreakerStats {
var stats []CircuitBreakerStats
cbm.breakers.Range(func(key, value interface{}) bool {
entry := value.(*CircuitBreakerEntry)
stats = append(stats, entry.breaker.GetStats())
return true
})
return stats
}
// cleanupLoop runs background cleanup of unused circuit breakers
func (cbm *CircuitBreakerManager) cleanupLoop() {
ticker := time.NewTicker(5 * time.Minute) // Cleanup every 5 minutes
defer ticker.Stop()
for {
select {
case <-ticker.C:
cbm.cleanup()
case <-cbm.cleanupStop:
return
}
}
}
// cleanup removes circuit breakers that haven't been accessed recently
func (cbm *CircuitBreakerManager) cleanup() {
// Prevent concurrent cleanups
if !cbm.cleanupMu.TryLock() {
return
}
defer cbm.cleanupMu.Unlock()
now := time.Now()
cutoff := now.Add(-30 * time.Minute).Unix() // 30 minute TTL
var toDelete []string
count := 0
cbm.breakers.Range(func(key, value interface{}) bool {
endpoint := key.(string)
entry := value.(*CircuitBreakerEntry)
count++
// Remove if not accessed recently
if entry.lastAccess.Load() < cutoff {
toDelete = append(toDelete, endpoint)
}
return true
})
// Delete expired entries
for _, endpoint := range toDelete {
cbm.breakers.Delete(endpoint)
}
// Log cleanup results
if len(toDelete) > 0 && internal.LogLevel.InfoOrAbove() {
internal.Logger.Printf(context.Background(), logs.CircuitBreakerCleanup(len(toDelete), count))
}
cbm.lastCleanup.Store(now.Unix())
}
// Shutdown stops the cleanup goroutine
func (cbm *CircuitBreakerManager) Shutdown() {
close(cbm.cleanupStop)
}
// Reset resets all circuit breakers (useful for testing)
func (cbm *CircuitBreakerManager) Reset() {
cbm.breakers.Range(func(key, value interface{}) bool {
entry := value.(*CircuitBreakerEntry)
breaker := entry.breaker
breaker.state.Store(int32(CircuitBreakerClosed))
breaker.failures.Store(0)
breaker.successes.Store(0)
breaker.requests.Store(0)
breaker.lastFailureTime.Store(0)
breaker.lastSuccessTime.Store(0)
return true
})
}