mirror of https://github.com/redis/go-redis.git synced 2025-09-08 19:52:07 +03:00

bugfixes and default config optimization

This commit is contained in:
Nedyalko Dyakov
2025-08-29 20:00:39 +03:00
parent f9af4304aa
commit b65bc6e591
12 changed files with 257 additions and 177 deletions

View File

@@ -175,7 +175,7 @@ func TestEventDrivenHandoffIntegration(t *testing.T) {
 			// Get connection
 			conn, err := testPool.Get(ctx)
 			if err != nil {
-				t.Errorf("Failed to get connection %d: %v", id, err)
+				t.Errorf("Failed to get conn[%d]: %v", id, err)
 				return
 			}

View File

@@ -39,27 +39,31 @@ Config: &hitless.Config{
     PostHandoffRelaxedDuration: 20 * time.Second,        // Keep relaxed timeout after handoff
     LogLevel:                   logging.LogLevelWarn,    // LogLevelError, LogLevelWarn, LogLevelInfo, LogLevelDebug
     MaxWorkers:                 15,                      // Concurrent handoff workers
-    HandoffQueueSize:           50,                      // Handoff request queue size
+    HandoffQueueSize:           300,                     // Handoff request queue size
 }
 ```
 
 ### Worker Scaling
-- **Auto-calculated**: `min(10, PoolSize/3)` - scales with pool size, capped at 10
-- **Explicit values**: `max(10, set_value)` - enforces minimum 10 workers
+- **Auto-calculated**: `min(PoolSize/2, max(10, PoolSize/3))` - balanced scaling approach
+- **Explicit values**: `max(PoolSize/2, set_value)` - enforces minimum PoolSize/2 workers
 - **On-demand**: Workers created when needed, cleaned up when idle
 
 ### Queue Sizing
-- **Auto-calculated**: `max(8 × MaxWorkers, max(50, PoolSize/2))` - hybrid scaling
-  - Worker-based: 8 handoffs per worker for burst processing
-  - Pool-based: Scales with pool size (minimum 50, up to PoolSize/2)
+- **Auto-calculated**: `max(20 × MaxWorkers, PoolSize)` - hybrid scaling
+  - Worker-based: 20 handoffs per worker for burst processing
+  - Pool-based: Scales directly with pool size
   - Takes the larger of the two for optimal performance
-- **Explicit values**: `max(50, set_value)` - enforces minimum 50 when set
-- **Always capped**: Queue size never exceeds `2 × PoolSize` for memory efficiency
+- **Explicit values**: `max(200, set_value)` - enforces minimum 200 when set
+- **Capping**: Queue size capped by `MaxActiveConns+1` (if set) or `5 × PoolSize` for memory efficiency
 
-**Examples:**
-- Pool 10: Queue 50 (max(8×3, max(50, 5)) = max(24, 50) = 50)
-- Pool 100: Queue 80 (max(8×10, max(50, 50)) = max(80, 50) = 80)
-- Pool 200: Queue 100 (max(8×10, max(50, 100)) = max(80, 100) = 100)
+**Examples (without MaxActiveConns):**
+- Pool 10: Workers 5, Queue 50 (max(20×5, 10) = 100, capped at 5×10 = 50)
+- Pool 100: Workers 33, Queue 500 (max(20×33, 100) = 660, capped at 5×100 = 500)
+- Pool 200: Workers 66, Queue 1000 (max(20×66, 200) = 1320, capped at 5×200 = 1000)
+
+**Examples (with MaxActiveConns=150):**
+- Pool 100: Workers 33, Queue 151 (max(20×33, 100) = 660, capped at MaxActiveConns+1 = 151)
+- Pool 200: Workers 66, Queue 151 (max(20×66, 200) = 1320, capped at MaxActiveConns+1 = 151)
 
 ## Notification Hooks
@@ -94,7 +98,7 @@ type CustomHook struct{}
 func (h *CustomHook) PreHook(ctx context.Context, notificationCtx push.NotificationHandlerContext, notificationType string, notification []interface{}) ([]interface{}, bool) {
     // Log notification with connection details
     if conn, ok := notificationCtx.Conn.(*pool.Conn); ok {
-        log.Printf("Processing %s on connection %d", notificationType, conn.GetID())
+        log.Printf("Processing %s on conn[%d]", notificationType, conn.GetID())
     }
     return notification, true // Continue processing
 }
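
To make the new sizing rules concrete, here is a small illustrative sketch that mirrors the documented formulas above. The function and variable names are ours, not part of the library; the real logic lives in the hitless config code.

```go
package main

import "fmt"

// sizing mirrors the documented auto-defaults for MaxWorkers and HandoffQueueSize.
// Illustrative only; it ignores the small-pool fallbacks (minimum 1 worker, queue of 2).
func sizing(poolSize, maxActiveConns int) (workers, queue int) {
	// Workers: min(PoolSize/2, max(10, PoolSize/3))
	workers = poolSize / 3
	if workers < 10 {
		workers = 10
	}
	if workers > poolSize/2 {
		workers = poolSize / 2
	}

	// Queue: max(20*MaxWorkers, PoolSize), capped by MaxActiveConns+1 or 5*PoolSize
	queue = 20 * workers
	if queue < poolSize {
		queue = poolSize
	}
	limit := 5 * poolSize
	if maxActiveConns > 0 {
		limit = maxActiveConns + 1
	}
	if queue > limit {
		queue = limit
	}
	return workers, queue
}

func main() {
	fmt.Println(sizing(10, 0))    // 5 50
	fmt.Println(sizing(100, 0))   // 33 500
	fmt.Println(sizing(100, 150)) // 33 151
}
```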

View File

@@ -103,18 +103,18 @@ type Config struct {
 	// MaxWorkers is the maximum number of worker goroutines for processing handoff requests.
 	// Workers are created on-demand and automatically cleaned up when idle.
-	// If zero, defaults to min(10, PoolSize/3) to handle bursts effectively.
-	// If explicitly set, enforces minimum of 10 workers.
+	// If zero, defaults to min(PoolSize/2, max(10, PoolSize/3)) to handle bursts effectively.
+	// If explicitly set, enforces minimum of PoolSize/2.
 	//
-	// Default: min(10, PoolSize/3), Minimum when set: 10
+	// Default: min(PoolSize/2, max(10, PoolSize/3)), Minimum when set: PoolSize/2
 	MaxWorkers int
 
 	// HandoffQueueSize is the size of the buffered channel used to queue handoff requests.
 	// If the queue is full, new handoff requests will be rejected.
 	// Scales with both worker count and pool size for better burst handling.
 	//
-	// Default: max(8x workers, max(50, PoolSize/2)), capped by 2x pool size
-	// When set: min 50, capped by 2x pool size
+	// Default: max(20×MaxWorkers, PoolSize), capped by MaxActiveConns+1 (if set) or 5×PoolSize
+	// When set: minimum 200, capped by MaxActiveConns+1 (if set) or 5×PoolSize
 	HandoffQueueSize int
 
 	// PostHandoffRelaxedDuration is how long to keep relaxed timeouts on the new connection
@@ -122,11 +122,6 @@ type Config struct {
 	// Default: 2 * RelaxedTimeout
 	PostHandoffRelaxedDuration time.Duration
 
-	// ScaleDownDelay is the delay before checking if workers should be scaled down.
-	// This prevents expensive checks on every handoff completion and avoids rapid scaling cycles.
-	// Default: 2 seconds
-	ScaleDownDelay time.Duration
-
 	// LogLevel controls the verbosity of hitless upgrade logging.
 	// LogLevelError (0) = errors only, LogLevelWarn (1) = warnings, LogLevelInfo (2) = info, LogLevelDebug (3) = debug
 	// Default: LogLevelWarn (warnings)
@@ -152,7 +147,6 @@ func DefaultConfig() *Config {
 		MaxWorkers:                 0, // Auto-calculated based on pool size
 		HandoffQueueSize:           0, // Auto-calculated based on max workers
 		PostHandoffRelaxedDuration: 0, // Auto-calculated based on relaxed timeout
-		ScaleDownDelay:             2 * time.Second,
 		LogLevel:                   LogLevelWarn,
 
 		// Connection Handoff Configuration
@@ -212,6 +206,13 @@ func (c *Config) ApplyDefaults() *Config {
 // using the provided pool size to calculate worker defaults.
 // This ensures that partially configured structs get sensible defaults for missing fields.
 func (c *Config) ApplyDefaultsWithPoolSize(poolSize int) *Config {
+	return c.ApplyDefaultsWithPoolConfig(poolSize, 0)
+}
+
+// ApplyDefaultsWithPoolConfig applies default values to any zero-value fields in the configuration,
+// using the provided pool size and max active connections to calculate worker and queue defaults.
+// This ensures that partially configured structs get sensible defaults for missing fields.
+func (c *Config) ApplyDefaultsWithPoolConfig(poolSize int, maxActiveConns int) *Config {
 	if c == nil {
 		return DefaultConfig().ApplyDefaultsWithPoolSize(poolSize)
 	}
@@ -251,19 +252,25 @@ func (c *Config) ApplyDefaultsWithPoolSize(poolSize int) *Config {
 	// Apply worker defaults based on pool size
 	result.applyWorkerDefaults(poolSize)
 
-	// Apply queue size defaults with hybrid scaling approach
+	// Apply queue size defaults with new scaling approach
 	if c.HandoffQueueSize <= 0 {
-		// Default: max(8x workers, max(50, PoolSize/2)), capped by 2x pool size
-		workerBasedSize := result.MaxWorkers * 8
-		poolBasedSize := util.Max(50, poolSize/2)
+		// Default: max(20x workers, PoolSize), capped by maxActiveConns or 5x pool size
+		workerBasedSize := result.MaxWorkers * 20
+		poolBasedSize := poolSize
 		result.HandoffQueueSize = util.Max(workerBasedSize, poolBasedSize)
 	} else {
-		// When explicitly set: enforce minimum of 50
-		result.HandoffQueueSize = util.Max(50, c.HandoffQueueSize)
+		// When explicitly set: enforce minimum of 200
+		result.HandoffQueueSize = util.Max(200, c.HandoffQueueSize)
 	}
 
-	// Always cap queue size by 2x pool size - balances burst capacity with memory efficiency
-	result.HandoffQueueSize = util.Min(result.HandoffQueueSize, poolSize*2)
+	// Cap queue size: use maxActiveConns+1 if set, otherwise 5x pool size
+	var queueCap int
+	if maxActiveConns > 0 {
+		queueCap = maxActiveConns + 1
+	} else {
+		queueCap = poolSize * 5
+	}
+	result.HandoffQueueSize = util.Min(result.HandoffQueueSize, queueCap)
 
 	// Ensure minimum queue size of 2 (fallback for very small pools)
 	if result.HandoffQueueSize < 2 {
@@ -276,12 +283,6 @@ func (c *Config) ApplyDefaultsWithPoolSize(poolSize int) *Config {
 		result.PostHandoffRelaxedDuration = c.PostHandoffRelaxedDuration
 	}
 
-	if c.ScaleDownDelay <= 0 {
-		result.ScaleDownDelay = defaults.ScaleDownDelay
-	} else {
-		result.ScaleDownDelay = c.ScaleDownDelay
-	}
-
 	// LogLevel: 0 is a valid value (errors only), so we need to check if it was explicitly set
 	// We'll use the provided value as-is, since 0 is valid
 	result.LogLevel = c.LogLevel
@@ -314,7 +315,6 @@ func (c *Config) Clone() *Config {
 		MaxWorkers:                 c.MaxWorkers,
 		HandoffQueueSize:           c.HandoffQueueSize,
 		PostHandoffRelaxedDuration: c.PostHandoffRelaxedDuration,
-		ScaleDownDelay:             c.ScaleDownDelay,
 		LogLevel:                   c.LogLevel,
 
 		// Configuration fields
@@ -330,11 +330,11 @@ func (c *Config) applyWorkerDefaults(poolSize int) {
 	}
 	if c.MaxWorkers == 0 {
-		// When not set: min(10, poolSize/3) - don't exceed 10 workers for small pools
-		c.MaxWorkers = util.Min(10, poolSize/3)
+		// When not set: min(poolSize/2, max(10, poolSize/3)) - balanced scaling approach
+		c.MaxWorkers = util.Min(poolSize/2, util.Max(10, poolSize/3))
 	} else {
-		// When explicitly set: max(10, set_value) - ensure at least 10 workers
-		c.MaxWorkers = util.Max(10, c.MaxWorkers)
+		// When explicitly set: max(poolSize/2, set_value) - ensure at least poolSize/2 workers
+		c.MaxWorkers = util.Max(poolSize/2, c.MaxWorkers)
 	}
 
 	// Ensure minimum of 1 worker (fallback for very small pools)
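
A minimal usage sketch of the new entry point added above. The import path is an assumption (adjust to wherever the hitless package lives in your tree); the expected values follow the sizing rules documented in the README hunk.

```go
package main

import (
	"fmt"

	// Assumed import path for the hitless package.
	"github.com/redis/go-redis/v9/hitless"
)

func main() {
	// Zero-value fields are filled in from the pool size and MaxActiveConns.
	cfg := (&hitless.Config{}).ApplyDefaultsWithPoolConfig(100, 150)
	fmt.Println(cfg.MaxWorkers, cfg.HandoffQueueSize) // expected: 33 151

	// The old entry point still works; it simply passes maxActiveConns = 0.
	cfg2 := (&hitless.Config{HandoffQueueSize: 100}).ApplyDefaultsWithPoolSize(100)
	fmt.Println(cfg2.HandoffQueueSize) // expected: 200 (raised to the 200 minimum, capped at 5*PoolSize)
}
```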

View File

@@ -43,3 +43,8 @@ var (
 	// ErrConnectionInvalidHandoffState is returned when a connection is in an invalid state for handoff
 	ErrConnectionInvalidHandoffState = errors.New("hitless: connection is in invalid state for handoff")
 )
+
+// general errors
+var (
+	ErrShutdown = errors.New("hitless: shutdown")
+)

View File

@@ -38,7 +38,7 @@ func (mh *MetricsHook) PreHook(ctx context.Context, notificationCtx push.Notific
 	// Log connection information if available
 	if conn, ok := notificationCtx.Conn.(*pool.Conn); ok {
-		internal.Logger.Printf(ctx, "hitless: metrics hook processing %s notification on connection %d", notificationType, conn.GetID())
+		internal.Logger.Printf(ctx, "hitless: metrics hook processing %s notification on conn[%d]", notificationType, conn.GetID())
 	}
 
 	// Store start time in context for duration calculation
@@ -62,7 +62,7 @@ func (mh *MetricsHook) PostHook(ctx context.Context, notificationCtx push.Notifi
 		// Log error details with connection information
 		if conn, ok := notificationCtx.Conn.(*pool.Conn); ok {
-			internal.Logger.Printf(ctx, "hitless: metrics hook recorded error for %s notification on connection %d: %v", notificationType, conn.GetID(), result)
+			internal.Logger.Printf(ctx, "hitless: metrics hook recorded error for %s notification on conn[%d]: %v", notificationType, conn.GetID(), result)
 		}
 	}
 }

View File

@@ -2,7 +2,6 @@ package hitless
 import (
 	"context"
-	"errors"
 	"net"
 	"sync"
 	"sync/atomic"
@@ -45,8 +44,9 @@ type PoolHook struct {
 	// On-demand worker management
 	maxWorkers     int
-	activeWorkers  int32 // Atomic counter for active workers
+	activeWorkers  atomic.Int32
 	workerTimeout  time.Duration // How long workers wait for work before exiting
+	workersScaling atomic.Bool
 
 	// Simple state tracking
 	pending sync.Map // map[uint64]int64 (connID -> seqID)
@@ -82,8 +82,9 @@ func NewPoolHookWithPoolSize(baseDialer func(context.Context, string, string) (n
 		// shutdown is a channel for signaling shutdown
 		shutdown:      make(chan struct{}),
 		maxWorkers:    config.MaxWorkers,
-		activeWorkers: 0,                // Start with no workers - create on demand
-		workerTimeout: 30 * time.Second, // Workers exit after 30s of inactivity
+		activeWorkers: atomic.Int32{}, // Start with no workers - create on demand
+		// NOTE: maybe we would like to make this configurable?
+		workerTimeout: 15 * time.Second, // Workers exit after 15s of inactivity
 		config:        config,
 
 		// Hitless manager for operation completion tracking
 		hitlessManager: hitlessManager,
@@ -98,15 +99,7 @@ func (ph *PoolHook) SetPool(pooler pool.Pooler) {
 // GetCurrentWorkers returns the current number of active workers (for testing)
 func (ph *PoolHook) GetCurrentWorkers() int {
-	return int(atomic.LoadInt32(&ph.activeWorkers))
-}
-
-// GetScaleLevel returns 1 if workers are active, 0 if none (for testing compatibility)
-func (ph *PoolHook) GetScaleLevel() int {
-	if atomic.LoadInt32(&ph.activeWorkers) > 0 {
-		return 1
-	}
-	return 0
+	return int(ph.activeWorkers.Load())
 }
 
 // IsHandoffPending returns true if the given connection has a pending handoff
@@ -181,14 +174,21 @@ func (ph *PoolHook) ensureWorkerAvailable() {
 	case <-ph.shutdown:
 		return
 	default:
-		// Check if we need a new worker
-		currentWorkers := atomic.LoadInt32(&ph.activeWorkers)
-		if currentWorkers < int32(ph.maxWorkers) {
-			// Try to create a new worker (atomic increment to prevent race)
-			if atomic.CompareAndSwapInt32(&ph.activeWorkers, currentWorkers, currentWorkers+1) {
+		if ph.workersScaling.CompareAndSwap(false, true) {
+			defer ph.workersScaling.Store(false)
+			// Check if we need a new worker
+			currentWorkers := ph.activeWorkers.Load()
+			workersWas := currentWorkers
+			for currentWorkers <= int32(ph.maxWorkers) {
 				ph.workerWg.Add(1)
 				go ph.onDemandWorker()
+				currentWorkers++
 			}
+			// workersWas is always <= currentWorkers
+			// currentWorkers will be maxWorkers, but if we have a worker that was closed
+			// while we were creating new workers, just add the difference between
+			// the currentWorkers and the number of workers we observed initially (i.e. the number of workers we created)
+			ph.activeWorkers.Add(currentWorkers - workersWas)
 		}
 	}
 }
@@ -197,12 +197,21 @@ func (ph *PoolHook) ensureWorkerAvailable() {
 func (ph *PoolHook) onDemandWorker() {
 	defer func() {
 		// Decrement active worker count when exiting
-		atomic.AddInt32(&ph.activeWorkers, -1)
+		ph.activeWorkers.Add(-1)
 		ph.workerWg.Done()
 	}()
 
 	for {
 		select {
+		case <-ph.shutdown:
+			return
+		case <-time.After(ph.workerTimeout):
+			// Worker has been idle for too long, exit to save resources
+			if ph.config != nil && ph.config.LogLevel >= LogLevelDebug { // Debug level
+				internal.Logger.Printf(context.Background(),
+					"hitless: worker exiting due to inactivity timeout (%v)", ph.workerTimeout)
+			}
+			return
 		case request := <-ph.handoffQueue:
 			// Check for shutdown before processing
 			select {
@@ -214,17 +223,6 @@ func (ph *PoolHook) onDemandWorker() {
 				// Process the request
 				ph.processHandoffRequest(request)
 			}
-		case <-time.After(ph.workerTimeout):
-			// Worker has been idle for too long, exit to save resources
-			if ph.config != nil && ph.config.LogLevel >= LogLevelDebug { // Debug level
-				internal.Logger.Printf(context.Background(),
-					"hitless: worker exiting due to inactivity timeout (%v)", ph.workerTimeout)
-			}
-			return
-		case <-ph.shutdown:
-			return
 		}
 	}
 }
@@ -257,20 +255,21 @@ func (ph *PoolHook) processHandoffRequest(request HandoffRequest) {
 	}()
 
 	// Perform the handoff with cancellable context
-	shouldRetry, err := ph.performConnectionHandoffWithPool(shutdownCtx, request.Conn, request.Pool)
+	shouldRetry, err := ph.performConnectionHandoff(shutdownCtx, request.Conn)
 	minRetryBackoff := 500 * time.Millisecond
 	if err != nil {
 		if shouldRetry {
 			now := time.Now()
 			deadline, ok := shutdownCtx.Deadline()
+			thirdOfTimeout := handoffTimeout / 3
 			if !ok || deadline.Before(now) {
 				// wait half the timeout before retrying if no deadline or deadline has passed
-				deadline = now.Add(handoffTimeout / 2)
+				deadline = now.Add(thirdOfTimeout)
 			}
 			afterTime := deadline.Sub(now)
-			if afterTime > handoffTimeout/2 {
-				afterTime = handoffTimeout / 2
+			if afterTime > thirdOfTimeout {
+				afterTime = thirdOfTimeout
 			}
 			if afterTime < minRetryBackoff {
 				afterTime = minRetryBackoff
@@ -280,12 +279,12 @@ func (ph *PoolHook) processHandoffRequest(request HandoffRequest) {
 			time.AfterFunc(afterTime, func() {
 				if err := ph.queueHandoff(request.Conn); err != nil {
 					internal.Logger.Printf(context.Background(), "can't queue handoff for retry: %v", err)
-					ph.removeConn(context.Background(), request, err)
+					ph.closeConnFromRequest(context.Background(), request, err)
 				}
 			})
 			return
 		} else {
-			go ph.removeConn(ctx, request, err)
+			go ph.closeConnFromRequest(ctx, request, err)
 		}
 
 		// Clear handoff state if not returned for retry
@@ -297,21 +296,22 @@ func (ph *PoolHook) processHandoffRequest(request HandoffRequest) {
 	}
 }
 
-func (ph *PoolHook) removeConn(ctx context.Context, request HandoffRequest, err error) {
+// closeConn closes the connection and logs the reason
+func (ph *PoolHook) closeConnFromRequest(ctx context.Context, request HandoffRequest, err error) {
 	pooler := request.Pool
 	conn := request.Conn
 	if pooler != nil {
-		pooler.Remove(ctx, conn, err)
+		pooler.CloseConn(conn)
 		if ph.config != nil && ph.config.LogLevel >= LogLevelWarn { // Warning level
 			internal.Logger.Printf(ctx,
-				"hitless: removed connection %d from pool due to max handoff retries reached",
-				conn.GetID())
+				"hitless: removed conn[%d] from pool due to max handoff retries reached: %v",
+				conn.GetID(), err)
 		}
 	} else {
 		conn.Close()
 		if ph.config != nil && ph.config.LogLevel >= LogLevelWarn { // Warning level
 			internal.Logger.Printf(ctx,
-				"hitless: no pool provided for connection %d, cannot remove due to handoff initialization failure: %v",
-				conn.GetID(), err)
+				"hitless: no pool provided for conn[%d], cannot remove due to handoff initialization failure: %v",
+				conn.GetID(), err)
 		}
 	}
@@ -332,11 +332,11 @@ func (ph *PoolHook) queueHandoff(conn *pool.Conn) error {
 	select {
 	// priority to shutdown
 	case <-ph.shutdown:
-		return errors.New("shutdown")
+		return ErrShutdown
 	default:
 		select {
 		case <-ph.shutdown:
-			return errors.New("shutdown")
+			return ErrShutdown
 		case ph.handoffQueue <- request:
 			// Store in pending map
 			ph.pending.Store(request.ConnID, request.SeqID)
@@ -344,13 +344,30 @@ func (ph *PoolHook) queueHandoff(conn *pool.Conn) error {
 			// Ensure we have a worker to process this request
 			ph.ensureWorkerAvailable()
 			return nil
 		default:
+			select {
+			case <-ph.shutdown:
+				return ErrShutdown
+			case ph.handoffQueue <- request:
+				// Store in pending map
+				ph.pending.Store(request.ConnID, request.SeqID)
+				// Ensure we have a worker to process this request
+				ph.ensureWorkerAvailable()
+				return nil
+			case <-time.After(100 * time.Millisecond): // give workers a chance to process
 				// Queue is full - log and attempt scaling
 				queueLen := len(ph.handoffQueue)
 				queueCap := cap(ph.handoffQueue)
 				if ph.config != nil && ph.config.LogLevel >= LogLevelWarn { // Warning level
 					internal.Logger.Printf(context.Background(),
-						"hitless: handoff queue is full (%d/%d), attempting timeout queuing and scaling workers",
-						queueLen, queueCap)
+						"hitless: handoff queue is full (%d/%d), cant queue handoff request for conn[%d] seqID[%d]",
+						queueLen, queueCap, request.ConnID, request.SeqID)
+					if ph.config.LogLevel >= LogLevelDebug { // Debug level
+						ph.pending.Range(func(k, v interface{}) bool {
+							internal.Logger.Printf(context.Background(), "hitless: pending handoff for conn[%d] seqID[%d]", k, v)
+							return true
+						})
+					}
 				}
+			}
 		}
 	}
@@ -360,9 +377,9 @@ func (ph *PoolHook) queueHandoff(conn *pool.Conn) error {
 	return ErrHandoffQueueFull
 }
 
-// performConnectionHandoffWithPool performs the actual connection handoff with pool for connection removal on failure
+// performConnectionHandoff performs the actual connection handoff
 // When error is returned, the connection handoff should be retried if err is not ErrMaxHandoffRetriesReached
-func (ph *PoolHook) performConnectionHandoffWithPool(ctx context.Context, conn *pool.Conn, pooler pool.Pooler) (shouldRetry bool, err error) {
+func (ph *PoolHook) performConnectionHandoff(ctx context.Context, conn *pool.Conn) (shouldRetry bool, err error) {
 	// Clear handoff state after successful handoff
 	connID := conn.GetID()
@@ -381,7 +398,7 @@ func (ph *PoolHook) performConnectionHandoffWithPool(ctx context.Context, conn *
 	if retries > maxRetries {
 		if ph.config != nil && ph.config.LogLevel >= LogLevelWarn { // Warning level
 			internal.Logger.Printf(ctx,
-				"hitless: reached max retries (%d) for handoff of connection %d to %s",
+				"hitless: reached max retries (%d) for handoff of conn[%d] to %s",
 				maxRetries, conn.GetID(), conn.GetHandoffEndpoint())
 		}
 		// won't retry on ErrMaxHandoffRetriesReached
@@ -403,6 +420,23 @@ func (ph *PoolHook) performConnectionHandoffWithPool(ctx context.Context, conn *
 	// Get the old connection
 	oldConn := conn.GetNetConn()
 
+	// Apply relaxed timeout to the new connection for the configured post-handoff duration
+	// This gives the new connection more time to handle operations during cluster transition
+	// Setting this here (before initing the connection) ensures that the connection is going
+	// to use the relaxed timeout for the first operation (auth/ACL select)
+	if ph.config != nil && ph.config.PostHandoffRelaxedDuration > 0 {
+		relaxedTimeout := ph.config.RelaxedTimeout
+		// Set relaxed timeout with deadline - no background goroutine needed
+		deadline := time.Now().Add(ph.config.PostHandoffRelaxedDuration)
+		conn.SetRelaxedTimeoutWithDeadline(relaxedTimeout, relaxedTimeout, deadline)
+		if ph.config.LogLevel >= 2 { // Info level
+			internal.Logger.Printf(context.Background(),
+				"hitless: conn[%d] applied post-handoff relaxed timeout (%v) until %v",
+				connID, relaxedTimeout, deadline.Format("15:04:05.000"))
+		}
+	}
+
 	// Replace the connection and execute initialization
 	err = conn.SetNetConnAndInitConn(ctx, newNetConn)
 	if err != nil {
@@ -419,23 +453,6 @@ func (ph *PoolHook) performConnectionHandoffWithPool(ctx context.Context, conn *
 	conn.ClearHandoffState()
 	internal.Logger.Printf(ctx, "hitless: conn[%d] Handoff to %s successful", conn.GetID(), newEndpoint)
 
-	// Apply relaxed timeout to the new connection for the configured post-handoff duration
-	// This gives the new connection more time to handle operations during cluster transition
-	if ph.config != nil && ph.config.PostHandoffRelaxedDuration > 0 {
-		relaxedTimeout := ph.config.RelaxedTimeout
-		postHandoffDuration := ph.config.PostHandoffRelaxedDuration
-		// Set relaxed timeout with deadline - no background goroutine needed
-		deadline := time.Now().Add(postHandoffDuration)
-		conn.SetRelaxedTimeoutWithDeadline(relaxedTimeout, relaxedTimeout, deadline)
-		if ph.config.LogLevel >= 2 { // Info level
-			internal.Logger.Printf(context.Background(),
-				"hitless: conn[%d] applied post-handoff relaxed timeout (%v) until %v",
-				connID, relaxedTimeout, deadline.Format("15:04:05.000"))
-		}
-	}
-
 	return false, nil
 }
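
The retry scheduling in processHandoffRequest above can be read as a small pure function: the delay is derived from the remaining context deadline, clamped to at most a third of the handoff timeout and to at least the 500ms floor. A sketch of that clamping, under those assumptions (not code from the repository):

```go
package main

import (
	"fmt"
	"time"
)

// retryDelay sketches the clamping used when scheduling a handoff retry.
// hasDeadline reports whether the handoff context carries a deadline.
func retryDelay(now, deadline time.Time, hasDeadline bool, handoffTimeout time.Duration) time.Duration {
	const minRetryBackoff = 500 * time.Millisecond
	thirdOfTimeout := handoffTimeout / 3

	if !hasDeadline || deadline.Before(now) {
		// No usable deadline: fall back to a third of the timeout.
		deadline = now.Add(thirdOfTimeout)
	}
	after := deadline.Sub(now)
	if after > thirdOfTimeout {
		after = thirdOfTimeout
	}
	if after < minRetryBackoff {
		after = minRetryBackoff
	}
	return after
}

func main() {
	now := time.Now()
	// 15s handoff timeout, 10s left on the context: clamped down to 5s.
	fmt.Println(retryDelay(now, now.Add(10*time.Second), true, 15*time.Second))
	// Deadline already passed: falls back to a third of the timeout.
	fmt.Println(retryDelay(now, now.Add(-time.Second), true, 15*time.Second))
}
```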

View File

@@ -460,7 +460,7 @@ func TestConnectionHook(t *testing.T) {
 	for i := 0; i < 5; i++ {
 		connections[i] = createMockPoolConnection()
 		if err := connections[i].MarkForHandoff("new-endpoint:6379", int64(i)); err != nil {
-			t.Fatalf("Failed to mark connection %d for handoff: %v", i, err)
+			t.Fatalf("Failed to mark conn[%d] for handoff: %v", i, err)
 		}
 		// Set a mock initialization function
 		connections[i].SetInitConnFunc(func(ctx context.Context, cn *pool.Conn) error {
@@ -510,9 +510,6 @@ func TestConnectionHook(t *testing.T) {
 	if processor.GetCurrentWorkers() != 0 {
 		t.Errorf("Expected 0 initial workers with on-demand system, got %d", processor.GetCurrentWorkers())
 	}
-	if processor.GetScaleLevel() != 0 {
-		t.Errorf("Processor should be at scale level 0 initially, got %d", processor.GetScaleLevel())
-	}
 	if processor.maxWorkers != 15 {
 		t.Errorf("Expected maxWorkers=15, got %d", processor.maxWorkers)
 	}
@@ -718,7 +715,7 @@ func TestConnectionHook(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		conn := createMockPoolConnection()
 		if err := conn.MarkForHandoff("new-endpoint:6379", int64(i+1)); err != nil {
-			t.Fatalf("Failed to mark connection %d for handoff: %v", i, err)
+			t.Fatalf("Failed to mark conn[%d] for handoff: %v", i, err)
 		}
 		// Set a mock initialization function
 		conn.SetInitConnFunc(func(ctx context.Context, cn *pool.Conn) error {
@@ -731,7 +728,7 @@ func TestConnectionHook(t *testing.T) {
 		}
 
 		if !shouldPool || shouldRemove {
-			t.Errorf("Connection %d should be pooled after handoff (shouldPool=%v, shouldRemove=%v)",
+			t.Errorf("conn[%d] should be pooled after handoff (shouldPool=%v, shouldRemove=%v)",
 				i, shouldPool, shouldRemove)
 		}
 	}
@@ -795,7 +792,7 @@ func TestConnectionHook(t *testing.T) {
 	// Verify that the connection was removed from the pool
 	if !mockPool.WasRemoved(conn.GetID()) {
-		t.Errorf("Connection %d should have been removed from pool after handoff failure", conn.GetID())
+		t.Errorf("conn[%d] should have been removed from pool after handoff failure", conn.GetID())
 	}
 
 	t.Logf("Connection removal on handoff failure test completed successfully")

View File

@@ -103,6 +103,15 @@ func (snh *NotificationHandler) handleMoving(ctx context.Context, handlerCtx pus
 		return ErrInvalidNotification
 	}
 
+	// If the connection is closed or not pooled, we can ignore the notification
+	// this connection won't be remembered by the pool and will be garbage collected
+	// Keep pubsub connections around since they are not pooled but are long-lived
+	// and should be allowed to handoff (the pubsub instance will reconnect and change
+	// the underlying *pool.Conn)
+	if (poolConn.IsClosed() || !poolConn.IsPooled()) && !poolConn.IsPubSub() {
+		return nil
+	}
+
 	deadline := time.Now().Add(time.Duration(timeS) * time.Second)
 	// If newEndpoint is empty, we should schedule a handoff to the current endpoint in timeS/2 seconds
 	if newEndpoint == "" || newEndpoint == internal.RedisNull {
@@ -133,6 +142,7 @@ func (snh *NotificationHandler) handleMoving(ctx context.Context, handlerCtx pus
 func (snh *NotificationHandler) markConnForHandoff(conn *pool.Conn, newEndpoint string, seqID int64, deadline time.Time) error {
 	if err := conn.MarkForHandoff(newEndpoint, seqID); err != nil {
+		internal.Logger.Printf(context.Background(), "hitless: failed to mark connection for handoff: %v", err)
 		// Connection is already marked for handoff, which is acceptable
 		// This can happen if multiple MOVING notifications are received for the same connection
 		return nil
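
The new guard in handleMoving can be summarized as a predicate: a MOVING notification is processed only for connections the pool will hand out again, with PubSub connections treated as an exception because they are long-lived even though they are not pooled. A sketch under those assumptions (the helper name is ours, not part of the library):

```go
package sketch

// shouldProcessMoving mirrors the guard added to handleMoving above:
// skip closed or non-pooled connections, but keep pubsub connections,
// which reconnect and swap their underlying *pool.Conn after a handoff.
func shouldProcessMoving(closed, pooled, pubsub bool) bool {
	if pubsub {
		return true
	}
	return !closed && pooled
}
```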

View File

@@ -45,6 +45,7 @@ type Conn struct {
 	Inited atomic.Bool
 	pooled bool
+	pubsub bool
 	closed atomic.Bool
 
 	createdAt time.Time
 	expiresAt time.Time
@@ -201,6 +202,16 @@ func (cn *Conn) IsUsable() bool {
 	return cn.isUsable()
 }
 
+// IsPooled returns true if the connection is managed by a pool and will be pooled on Put.
+func (cn *Conn) IsPooled() bool {
+	return cn.pooled
+}
+
+// IsPubSub returns true if the connection is used for PubSub.
+func (cn *Conn) IsPubSub() bool {
+	return cn.pubsub
+}
+
 func (cn *Conn) IsInited() bool {
 	return cn.Inited.Load()
 }
@@ -223,9 +234,7 @@ func (cn *Conn) SetRelaxedTimeout(readTimeout, writeTimeout time.Duration) {
 // After the deadline, timeouts automatically revert to normal values.
 // Uses atomic operations for lock-free access.
 func (cn *Conn) SetRelaxedTimeoutWithDeadline(readTimeout, writeTimeout time.Duration, deadline time.Time) {
-	cn.relaxedCounter.Add(1)
-	cn.relaxedReadTimeoutNs.Store(int64(readTimeout))
-	cn.relaxedWriteTimeoutNs.Store(int64(writeTimeout))
+	cn.SetRelaxedTimeout(readTimeout, writeTimeout)
 	cn.relaxedDeadlineNs.Store(deadline.UnixNano())
 }
@@ -354,15 +363,12 @@ func (cn *Conn) ExecuteInitConn(ctx context.Context) error {
 	if cn.initConnFunc != nil {
 		return cn.initConnFunc(ctx, cn)
 	}
-	return fmt.Errorf("redis: no initConnFunc set for connection %d", cn.GetID())
+	return fmt.Errorf("redis: no initConnFunc set for conn[%d]", cn.GetID())
 }
 
 func (cn *Conn) SetNetConn(netConn net.Conn) {
 	// Store the new connection atomically first (lock-free)
 	cn.setNetConn(netConn)
-	// Clear relaxed timeouts when connection is replaced
-	cn.clearRelaxedTimeout()
 
 	// Protect reader reset operations to avoid data races
 	// Use write lock since we're modifying the reader state
 	cn.readerMu.Lock()
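
For callers, the deadline variant is now just the plain relaxed-timeout setter plus a stored expiry; once the deadline passes, the connection reverts to its normal timeouts on its own. A hedged usage sketch, assuming a package inside the module that can import internal/pool (the helper name and import path are ours):

```go
package sketch

import (
	"time"

	"github.com/redis/go-redis/v9/internal/pool" // assumed path; importable only from inside the module
)

// relaxFor applies relaxed read/write timeouts to cn for a bounded window.
// After the deadline passes, reads and writes revert to the normal timeouts
// without any background goroutine.
func relaxFor(cn *pool.Conn, timeout, window time.Duration) {
	deadline := time.Now().Add(window)
	cn.SetRelaxedTimeoutWithDeadline(timeout, timeout, deadline)
}
```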

View File

@@ -28,13 +28,14 @@ var (
 	// when popping from the idle connection pool. This handles cases where connections
 	// are temporarily marked as unusable (e.g., during hitless upgrades or network issues).
 	// Value of 50 provides sufficient resilience without excessive overhead.
+	// This is capped by the idle connection count, so we won't loop excessively.
 	popAttempts = 50
 
 	// getAttempts is the maximum number of attempts to get a connection that passes
 	// hook validation (e.g., hitless upgrade hooks). This protects against race conditions
 	// where hooks might temporarily reject connections during cluster transitions.
-	// Value of 2 balances resilience with performance - most hook rejections resolve quickly.
-	getAttempts = 2
+	// Value of 3 balances resilience with performance - most hook rejections resolve quickly.
+	getAttempts = 3
 
 	minTime = time.Unix(-2208988800, 0) // Jan 1, 1900
 	maxTime = minTime.Add(1<<63 - 1)
@@ -262,7 +263,9 @@ func (p *ConnPool) newConn(ctx context.Context, pooled bool) (*Conn, error) {
 		return nil, ErrPoolExhausted
 	}
 
-	cn, err := p.dialConn(ctx, pooled)
+	dialCtx, cancel := context.WithTimeout(ctx, p.cfg.DialTimeout)
+	defer cancel()
+	cn, err := p.dialConn(dialCtx, pooled)
 	if err != nil {
 		return nil, err
 	}
@@ -309,15 +312,37 @@ func (p *ConnPool) dialConn(ctx context.Context, pooled bool) (*Conn, error) {
 		return nil, p.getLastDialError()
 	}
 
-	netConn, err := p.cfg.Dialer(ctx)
-	if err != nil {
-		p.setLastDialError(err)
-		if atomic.AddUint32(&p.dialErrorsNum, 1) == uint32(p.cfg.PoolSize) {
-			go p.tryDial()
-		}
-		return nil, err
-	}
+	// Retry dialing with backoff
+	// the context timeout is already handled by the context passed in
+	// so we may never reach the max retries, higher values don't hurt
+	const maxRetries = 10
+	const backoffDuration = 100 * time.Millisecond
+
+	var lastErr error
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		// Add backoff delay for retry attempts
+		// (not for the first attempt, do at least one)
+		if attempt > 0 {
+			select {
+			case <-ctx.Done():
+				// we should have lastErr set, but just in case
+				if lastErr == nil {
+					lastErr = ctx.Err()
+				}
+				break
+			case <-time.After(backoffDuration):
+				// Continue with retry
+			}
+		}
+
+		netConn, err := p.cfg.Dialer(ctx)
+		if err != nil {
+			lastErr = err
+			// Continue to next retry attempt
+			continue
+		}
+
+		// Success - create connection
 	cn := NewConnWithBufferSize(netConn, p.cfg.ReadBufferSize, p.cfg.WriteBufferSize)
 	cn.pooled = pooled
 	if p.cfg.ConnMaxLifetime > 0 {
@@ -329,6 +354,15 @@ func (p *ConnPool) dialConn(ctx context.Context, pooled bool) (*Conn, error) {
 		return cn, nil
 	}
 
+	internal.Logger.Printf(ctx, "redis: connection pool: failed to dial after %d attempts: %v", maxRetries, lastErr)
+	// All retries failed - handle error tracking
+	p.setLastDialError(lastErr)
+	if atomic.AddUint32(&p.dialErrorsNum, 1) == uint32(p.cfg.PoolSize) {
+		go p.tryDial()
+	}
+	return nil, lastErr
+}
 
 func (p *ConnPool) tryDial() {
 	for {
 		if p.closed() {
@@ -386,7 +420,7 @@ func (p *ConnPool) getConn(ctx context.Context) (*Conn, error) {
 	attempts := 0
 	for {
 		if attempts >= getAttempts {
-			internal.Logger.Printf(ctx, "redis: connection pool: was not able to get a connection accepted by hook after %d attempts", attempts)
+			internal.Logger.Printf(ctx, "redis: connection pool: was not able to get a healthy connection after %d attempts", attempts)
 			break
 		}
 		attempts++
@@ -448,7 +482,6 @@ func (p *ConnPool) getConn(ctx context.Context) (*Conn, error) {
 			return nil, err
 		}
 	}
-
 	return newcn, nil
 }
@@ -497,6 +530,7 @@ func (p *ConnPool) popIdle() (*Conn, error) {
 	if p.closed() {
 		return nil, ErrClosed
 	}
+	defer p.checkMinIdleConns()
 
 	n := len(p.idleConns)
 	if n == 0 {
@@ -540,12 +574,11 @@ func (p *ConnPool) popIdle() (*Conn, error) {
 	}
 
 	// If we exhausted all attempts without finding a usable connection, return nil
-	if int32(attempts) >= p.poolSize.Load() {
+	if attempts > 1 && attempts >= maxAttempts && int32(attempts) >= p.poolSize.Load() {
 		internal.Logger.Printf(context.Background(), "redis: connection pool: failed to get a usable connection after %d attempts", attempts)
 		return nil, nil
 	}
 
-	p.checkMinIdleConns()
-
 	return cn, nil
 }
@@ -631,11 +664,7 @@ func (p *ConnPool) Put(ctx context.Context, cn *Conn) {
 func (p *ConnPool) Remove(_ context.Context, cn *Conn, reason error) {
 	p.removeConnWithLock(cn)
 
-	// Only free a turn if the connection was actually pooled
-	// This prevents queue imbalance when removing connections that weren't obtained through Get()
-	if cn.pooled {
-		p.freeTurn()
-	}
+	p.freeTurn()
 
 	_ = p.closeConn(cn)
@@ -655,12 +684,22 @@ func (p *ConnPool) removeConnWithLock(cn *Conn) {
 }
 
 func (p *ConnPool) removeConn(cn *Conn) {
-	delete(p.conns, cn.GetID())
+	cid := cn.GetID()
+	delete(p.conns, cid)
 
 	atomic.AddUint32(&p.stats.StaleConns, 1)
 
 	// Decrement pool size counter when removing a connection
 	if cn.pooled {
 		p.poolSize.Add(-1)
+		// this can be idle conn
+		for idx, ic := range p.idleConns {
+			if ic.GetID() == cid {
+				internal.Logger.Printf(context.Background(), "redis: connection pool: removing idle conn[%d]", cid)
+				p.idleConns = append(p.idleConns[:idx], p.idleConns[idx+1:]...)
+				p.idleConnsLen.Add(-1)
+				break
+			}
+		}
 	}
 }
@@ -745,6 +784,7 @@ func (p *ConnPool) isHealthyConn(cn *Conn, now time.Time) bool {
 		return false
 	}
 
+	// Check if connection has exceeded idle timeout
 	if p.cfg.ConnMaxIdleTime > 0 && now.Sub(cn.UsedAt()) >= p.cfg.ConnMaxIdleTime {
 		return false
 	}
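
The comments in the dial hunk above note that the context (now bounded by DialTimeout in newConn) is what really limits the retries; maxRetries is only a safety net. A condensed sketch of that pattern, returning as soon as the context is done, which is the net effect of the loop above; the function name and signature are ours, not the pool's API:

```go
package sketch

import (
	"context"
	"net"
	"time"
)

// dialWithRetry retries dial with a fixed backoff until it succeeds, the
// retry budget is exhausted, or ctx (e.g. one derived with DialTimeout) expires.
func dialWithRetry(ctx context.Context, dial func(context.Context) (net.Conn, error)) (net.Conn, error) {
	const maxRetries = 10
	const backoff = 100 * time.Millisecond

	var lastErr error
	for attempt := 0; attempt < maxRetries; attempt++ {
		if attempt > 0 {
			select {
			case <-ctx.Done():
				// Context expired (e.g. DialTimeout elapsed) - stop retrying.
				if lastErr == nil {
					lastErr = ctx.Err()
				}
				return nil, lastErr
			case <-time.After(backoff):
				// Continue with the next attempt.
			}
		}
		conn, err := dial(ctx)
		if err != nil {
			lastErr = err
			continue
		}
		return conn, nil
	}
	return nil, lastErr
}
```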

View File

@@ -43,6 +43,7 @@ func (p *PubSubPool) NewConn(ctx context.Context, network string, addr string, c
 		return nil, err
 	}
 	cn := NewConnWithBufferSize(netConn, p.opt.ReadBufferSize, p.opt.WriteBufferSize)
+	cn.pubsub = true
 	atomic.AddUint32(&p.stats.Created, 1)
 
 	return cn, nil
@@ -55,7 +56,7 @@ func (p *PubSubPool) TrackConn(cn *Conn) {
 func (p *PubSubPool) UntrackConn(cn *Conn) {
 	if !cn.IsUsable() || cn.ShouldHandoff() {
-		internal.Logger.Printf(context.Background(), "pubsub: untracking connection %d [usable, handoff] = [%v, %v]", cn.GetID(), cn.IsUsable(), cn.ShouldHandoff())
+		internal.Logger.Printf(context.Background(), "pubsub: untracking conn[%d] [usable, handoff] = [%v, %v]", cn.GetID(), cn.IsUsable(), cn.ShouldHandoff())
 	}
 	atomic.AddUint32(&p.stats.Active, ^uint32(0))
 	atomic.AddUint32(&p.stats.Untracked, 1)

View File

@@ -109,7 +109,7 @@ type Options struct {
 	// DialTimeout for establishing new connections.
 	//
-	// default: 5 seconds
+	// default: 10 seconds
 	DialTimeout time.Duration
 
 	// ReadTimeout for socket reads. If reached, commands will fail
@@ -275,7 +275,7 @@ func (opt *Options) init() {
 		opt.Protocol = 3
 	}
 	if opt.DialTimeout == 0 {
-		opt.DialTimeout = 5 * time.Second
+		opt.DialTimeout = 10 * time.Second
 	}
 	if opt.Dialer == nil {
 		opt.Dialer = NewDialer(opt)
@@ -335,7 +335,7 @@ func (opt *Options) init() {
 		opt.MaxRetryBackoff = 512 * time.Millisecond
 	}
 
-	opt.HitlessUpgradeConfig = opt.HitlessUpgradeConfig.ApplyDefaultsWithPoolSize(opt.PoolSize)
+	opt.HitlessUpgradeConfig = opt.HitlessUpgradeConfig.ApplyDefaultsWithPoolConfig(opt.PoolSize, opt.MaxActiveConns)
 
 	// auto-detect endpoint type if not specified
 	endpointType := opt.HitlessUpgradeConfig.EndpointType