1
0
mirror of https://github.com/redis/go-redis.git synced 2025-10-20 09:52:25 +03:00
Files
go-redis/maintnotifications/e2e/fault_injector_test.go
Nedyalko Dyakov 75ddeb3d5a feat(e2e-testing): maintnotifications e2e and refactor (#3526)
* e2e wip

* cleanup

* remove unused fault injector mock

* errChan in test

* remove log messages tests

* cleanup log messages

* s/hitless/maintnotifications/

* fix moving when none

* better logs

* test with second client after action has started

* Fixes

Signed-off-by: Elena Kolevska <elena@kolevska.com>

* Test fix

Signed-off-by: Elena Kolevska <elena@kolevska.com>

* feat(e2e-test): Extended e2e tests

* improved e2e test resiliency

---------

Signed-off-by: Elena Kolevska <elena@kolevska.com>
Co-authored-by: Elena Kolevska <elena@kolevska.com>
Co-authored-by: Elena Kolevska <elena-kolevska@users.noreply.github.com>
Co-authored-by: Hristo Temelski <hristo.temelski@redis.com>
2025-09-26 19:17:09 +03:00

566 lines
16 KiB
Go

package e2e
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"time"
)
// ActionType identifies a fault injection action. Its string value is what
// ActionRequest and SequenceAction serialize in their "type" JSON field.
type ActionType string
// Action identifiers defined by this client, grouped by the area they affect.
const (
// Redis cluster actions
ActionClusterFailover ActionType = "cluster_failover"
ActionClusterReshard ActionType = "cluster_reshard"
ActionClusterAddNode ActionType = "cluster_add_node"
ActionClusterRemoveNode ActionType = "cluster_remove_node"
ActionClusterMigrate ActionType = "cluster_migrate"
// Node-level actions
ActionNodeRestart ActionType = "node_restart"
ActionNodeStop ActionType = "node_stop"
ActionNodeStart ActionType = "node_start"
ActionNodeKill ActionType = "node_kill"
// Network simulation actions
ActionNetworkPartition ActionType = "network_partition"
ActionNetworkLatency ActionType = "network_latency"
ActionNetworkPacketLoss ActionType = "network_packet_loss"
ActionNetworkBandwidth ActionType = "network_bandwidth"
ActionNetworkRestore ActionType = "network_restore"
// Redis configuration actions
ActionConfigChange ActionType = "config_change"
ActionMaintenanceMode ActionType = "maintenance_mode"
ActionSlotMigration ActionType = "slot_migration"
// Sequence and complex actions
// ActionSequence wraps a list of SequenceAction values (see TriggerSequence
// and ExecuteSequence); ActionExecuteCommand carries a free-form command.
ActionSequence ActionType = "sequence_of_actions"
ActionExecuteCommand ActionType = "execute_command"
)
// ActionStatus is the state reported for an action in ActionStatusResponse.
type ActionStatus string
// Known status values. WaitForAction treats StatusFinished, StatusSuccess,
// StatusFailed and StatusCancelled as terminal and keeps polling for any
// other value.
const (
StatusPending ActionStatus = "pending"
StatusRunning ActionStatus = "running"
StatusFinished ActionStatus = "finished"
StatusFailed ActionStatus = "failed"
StatusSuccess ActionStatus = "success"
StatusCancelled ActionStatus = "cancelled"
)
// ActionRequest is the JSON body that TriggerAction POSTs to the "/action"
// endpoint in order to start an action.
type ActionRequest struct {
// Type names the action to run; serialized as "type".
Type ActionType `json:"type"`
// Parameters holds action-specific arguments; serialized as "parameters"
// and omitted from the JSON when the map is empty or nil.
Parameters map[string]interface{} `json:"parameters,omitempty"`
}
// ActionResponse is the JSON body decoded from the reply to TriggerAction.
type ActionResponse struct {
// ActionID is read from "action_id"; GetActionStatus and WaitForAction
// take an action ID as input.
ActionID string `json:"action_id"`
// Status is read from "status". Note it is a plain string here, not an
// ActionStatus as in ActionStatusResponse.
Status string `json:"status"`
// Message is read from the optional "message" field.
Message string `json:"message,omitempty"`
}
// ActionStatusResponse is the JSON body decoded from the reply to
// GetActionStatus ("/action/<id>").
//
// When the body does not decode directly into this struct, the request helper
// in this file falls back to a generic map decode and fills the fields by
// hand; see the per-field notes for what that fallback does.
type ActionStatusResponse struct {
// ActionID is read from "action_id".
ActionID string `json:"action_id"`
// Status is read from "status".
Status ActionStatus `json:"status"`
// Error is read from "error" and kept untyped.
Error interface{} `json:"error,omitempty"`
// Output is read from "output". In the fallback path the raw "output"
// value is stored under the single key "result" instead.
Output map[string]interface{} `json:"output,omitempty"`
// Progress is read from "progress".
// NOTE(review): the scale (0-1 vs 0-100) is not visible here — confirm.
Progress float64 `json:"progress,omitempty"`
// StartTime is read from "start_time".
// NOTE(review): encoding/json's omitempty does not omit struct values such
// as time.Time when marshaling; this only matters if the type is ever
// encoded — confirm whether that is intended.
StartTime time.Time `json:"start_time,omitempty"`
// EndTime is read from "end_time". In the fallback path it is set to the
// local time.Now() when the status is success, failed or cancelled.
EndTime time.Time `json:"end_time,omitempty"`
}
// SequenceAction is one step of an ActionSequence request, as passed to
// TriggerSequence and ExecuteSequence.
type SequenceAction struct {
// Type names the action to run for this step; serialized as "type".
Type ActionType `json:"type"`
// Parameters holds step-specific arguments. Note the JSON key is "params"
// here, whereas ActionRequest uses "parameters".
Parameters map[string]interface{} `json:"params,omitempty"`
// Delay is serialized as "delay" and omitted when zero.
// NOTE(review): encoding/json encodes a time.Duration as an integer number
// of nanoseconds — confirm the server expects that unit.
Delay time.Duration `json:"delay,omitempty"`
}
// FaultInjectorClient talks to a fault injector server over HTTP so tests can
// trigger and observe infrastructure actions programmatically.
type FaultInjectorClient struct {
	baseURL    string       // server root URL with one trailing "/" removed
	httpClient *http.Client // client used for every request
}

// NewFaultInjectorClient builds a client for the server at baseURL.
//
// A single trailing "/" is stripped from baseURL so request paths (which start
// with "/") can be appended directly. The underlying HTTP client uses a
// 30 second timeout.
func NewFaultInjectorClient(baseURL string) *FaultInjectorClient {
	client := new(FaultInjectorClient)
	client.baseURL = strings.TrimSuffix(baseURL, "/")
	client.httpClient = &http.Client{Timeout: 30 * time.Second}
	return client
}

// GetBaseURL reports the normalized base URL this client sends requests to.
func (c *FaultInjectorClient) GetBaseURL() string {
	return c.baseURL
}
// ListActions fetches the action types the server advertises via
// GET /actions. The (possibly nil) slice is returned together with any error.
func (c *FaultInjectorClient) ListActions(ctx context.Context) ([]ActionType, error) {
	available := []ActionType(nil)
	if err := c.request(ctx, "GET", "/actions", nil, &available); err != nil {
		return available, err
	}
	return available, nil
}
// TriggerAction POSTs action to /action and decodes the reply.
//
// The returned pointer is never nil, even when err is non-nil; in that case
// its contents should not be relied upon.
func (c *FaultInjectorClient) TriggerAction(ctx context.Context, action ActionRequest) (*ActionResponse, error) {
	reply := new(ActionResponse)
	err := c.request(ctx, "POST", "/action", action, reply)
	return reply, err
}
// TriggerSequence triggers a "sequence_of_actions" action whose parameters are
// the given bdbID (sent as "bdb_id") and the list of steps (sent as "actions").
func (c *FaultInjectorClient) TriggerSequence(ctx context.Context, bdbID int, actions []SequenceAction) (*ActionResponse, error) {
	params := map[string]interface{}{
		"bdb_id":  bdbID,
		"actions": actions,
	}
	req := ActionRequest{Type: ActionSequence, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// GetActionStatus fetches the current status of actionID via
// GET /action/<actionID>.
//
// The returned pointer is never nil, even when err is non-nil.
func (c *FaultInjectorClient) GetActionStatus(ctx context.Context, actionID string) (*ActionStatusResponse, error) {
	endpoint := fmt.Sprintf("/action/%s", actionID)
	current := new(ActionStatusResponse)
	err := c.request(ctx, "GET", endpoint, nil, current)
	return current, err
}
// WaitForAction polls the status of actionID until it reaches a terminal
// state, ctx is done, or the maximum wait time elapses.
//
// By default the status is polled every second for at most 60 seconds; use
// WithPollInterval and WithMaxWaitTime to override. The first poll happens one
// poll interval after the call. A non-positive poll interval falls back to the
// default, because time.NewTicker panics on such values.
//
// Terminal states are finished, success, failed and cancelled. A failed or
// cancelled action is returned with a nil error, so callers must inspect the
// returned Status. On context cancellation, timeout, or a status request
// error the returned response is nil.
func (c *FaultInjectorClient) WaitForAction(ctx context.Context, actionID string, options ...WaitOption) (*ActionStatusResponse, error) {
	const (
		defaultPollInterval = 1 * time.Second
		defaultMaxWaitTime  = 60 * time.Second
	)

	config := &waitConfig{
		pollInterval: defaultPollInterval,
		maxWaitTime:  defaultMaxWaitTime,
	}
	for _, opt := range options {
		if opt != nil {
			opt(config)
		}
	}
	if config.pollInterval <= 0 {
		config.pollInterval = defaultPollInterval
	}

	// A single timer covers the whole wait; creating one per loop iteration
	// would allocate a fresh timer on every poll for the same deadline.
	timeout := time.NewTimer(config.maxWaitTime)
	defer timeout.Stop()

	ticker := time.NewTicker(config.pollInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-timeout.C:
			return nil, fmt.Errorf("timeout waiting for action %s after %v", actionID, config.maxWaitTime)
		case <-ticker.C:
			status, err := c.GetActionStatus(ctx, actionID)
			if err != nil {
				return nil, fmt.Errorf("failed to get action status: %w", err)
			}
			switch status.Status {
			case StatusFinished, StatusSuccess, StatusFailed, StatusCancelled:
				return status, nil
			}
		}
	}
}
// Cluster Management Actions
// TriggerClusterFailover triggers a "cluster_failover" action, passing nodeID
// as "node_id" and force as "force".
func (c *FaultInjectorClient) TriggerClusterFailover(ctx context.Context, nodeID string, force bool) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id": nodeID,
		"force":   force,
	}
	req := ActionRequest{Type: ActionClusterFailover, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// TriggerClusterReshard triggers a "cluster_reshard" action for the given
// slots, sent together with "source_node" and "target_node".
func (c *FaultInjectorClient) TriggerClusterReshard(ctx context.Context, slots []int, sourceNode, targetNode string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"slots":       slots,
		"source_node": sourceNode,
		"target_node": targetNode,
	}
	req := ActionRequest{Type: ActionClusterReshard, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// TriggerSlotMigration triggers a "slot_migration" action. startSlot and
// endSlot are sent as "start_slot" and "end_slot", alongside "source_node"
// and "target_node".
func (c *FaultInjectorClient) TriggerSlotMigration(ctx context.Context, startSlot, endSlot int, sourceNode, targetNode string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"start_slot":  startSlot,
		"end_slot":    endSlot,
		"source_node": sourceNode,
		"target_node": targetNode,
	}
	req := ActionRequest{Type: ActionSlotMigration, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// Node Management Actions
// RestartNode triggers a "node_restart" action for nodeID; graceful is
// forwarded as the "graceful" parameter.
func (c *FaultInjectorClient) RestartNode(ctx context.Context, nodeID string, graceful bool) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id":  nodeID,
		"graceful": graceful,
	}
	req := ActionRequest{Type: ActionNodeRestart, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// StopNode triggers a "node_stop" action for nodeID; graceful is forwarded as
// the "graceful" parameter.
func (c *FaultInjectorClient) StopNode(ctx context.Context, nodeID string, graceful bool) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id":  nodeID,
		"graceful": graceful,
	}
	req := ActionRequest{Type: ActionNodeStop, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// StartNode triggers a "node_start" action for nodeID.
func (c *FaultInjectorClient) StartNode(ctx context.Context, nodeID string) (*ActionResponse, error) {
	params := map[string]interface{}{"node_id": nodeID}
	req := ActionRequest{Type: ActionNodeStart, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// KillNode triggers a "node_kill" action for nodeID.
func (c *FaultInjectorClient) KillNode(ctx context.Context, nodeID string) (*ActionResponse, error) {
	params := map[string]interface{}{"node_id": nodeID}
	req := ActionRequest{Type: ActionNodeKill, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// Network Simulation Actions
// SimulateNetworkPartition triggers a "network_partition" action for nodes.
// duration is sent in its Go string form (duration.String(), e.g. "1m30s").
func (c *FaultInjectorClient) SimulateNetworkPartition(ctx context.Context, nodes []string, duration time.Duration) (*ActionResponse, error) {
	params := map[string]interface{}{
		"nodes":    nodes,
		"duration": duration.String(),
	}
	req := ActionRequest{Type: ActionNetworkPartition, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// SimulateNetworkLatency triggers a "network_latency" action for nodes.
// latency and jitter are both sent in their Go string form (e.g. "100ms").
func (c *FaultInjectorClient) SimulateNetworkLatency(ctx context.Context, nodes []string, latency time.Duration, jitter time.Duration) (*ActionResponse, error) {
	params := map[string]interface{}{
		"nodes":   nodes,
		"latency": latency.String(),
		"jitter":  jitter.String(),
	}
	req := ActionRequest{Type: ActionNetworkLatency, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// SimulatePacketLoss triggers a "network_packet_loss" action for nodes,
// forwarding lossPercent as "loss_percent".
func (c *FaultInjectorClient) SimulatePacketLoss(ctx context.Context, nodes []string, lossPercent float64) (*ActionResponse, error) {
	params := map[string]interface{}{
		"nodes":        nodes,
		"loss_percent": lossPercent,
	}
	req := ActionRequest{Type: ActionNetworkPacketLoss, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// LimitBandwidth triggers a "network_bandwidth" action for nodes. The
// bandwidth string is forwarded verbatim as the "bandwidth" parameter.
func (c *FaultInjectorClient) LimitBandwidth(ctx context.Context, nodes []string, bandwidth string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"nodes":     nodes,
		"bandwidth": bandwidth,
	}
	req := ActionRequest{Type: ActionNetworkBandwidth, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// RestoreNetwork triggers a "network_restore" action for nodes.
func (c *FaultInjectorClient) RestoreNetwork(ctx context.Context, nodes []string) (*ActionResponse, error) {
	params := map[string]interface{}{"nodes": nodes}
	req := ActionRequest{Type: ActionNetworkRestore, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// Configuration Actions
// ChangeConfig triggers a "config_change" action for nodeID, sending the
// config map as the "config" parameter.
func (c *FaultInjectorClient) ChangeConfig(ctx context.Context, nodeID string, config map[string]string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id": nodeID,
		"config":  config,
	}
	req := ActionRequest{Type: ActionConfigChange, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// EnableMaintenanceMode triggers a "maintenance_mode" action for nodeID with
// "enabled" set to true.
func (c *FaultInjectorClient) EnableMaintenanceMode(ctx context.Context, nodeID string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id": nodeID,
		"enabled": true,
	}
	req := ActionRequest{Type: ActionMaintenanceMode, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// DisableMaintenanceMode triggers a "maintenance_mode" action for nodeID with
// "enabled" set to false.
func (c *FaultInjectorClient) DisableMaintenanceMode(ctx context.Context, nodeID string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id": nodeID,
		"enabled": false,
	}
	req := ActionRequest{Type: ActionMaintenanceMode, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// Complex Actions
// ExecuteSequence triggers a "sequence_of_actions" action whose only
// parameter is the list of steps, sent as "actions". Unlike TriggerSequence it
// does not send a "bdb_id".
func (c *FaultInjectorClient) ExecuteSequence(ctx context.Context, actions []SequenceAction) (*ActionResponse, error) {
	params := map[string]interface{}{"actions": actions}
	req := ActionRequest{Type: ActionSequence, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// ExecuteCommand triggers an "execute_command" action for nodeID, forwarding
// command verbatim as the "command" parameter.
func (c *FaultInjectorClient) ExecuteCommand(ctx context.Context, nodeID, command string) (*ActionResponse, error) {
	params := map[string]interface{}{
		"node_id": nodeID,
		"command": command,
	}
	req := ActionRequest{Type: ActionExecuteCommand, Parameters: params}
	return c.TriggerAction(ctx, req)
}
// Convenience Methods
// SimulateClusterUpgrade submits one sequence containing a graceful
// "node_restart" step per entry in nodes, in the given order.
//
// Step i carries a Delay of i*10 seconds (0s for the first node, 10s for the
// second, and so on) to stagger the restarts.
func (c *FaultInjectorClient) SimulateClusterUpgrade(ctx context.Context, nodes []string) (*ActionResponse, error) {
	const stagger = 10 * time.Second

	restarts := make([]SequenceAction, len(nodes))
	for idx := range nodes {
		restarts[idx] = SequenceAction{
			Type: ActionNodeRestart,
			Parameters: map[string]interface{}{
				"node_id":  nodes[idx],
				"graceful": true,
			},
			Delay: time.Duration(idx) * stagger,
		}
	}
	return c.ExecuteSequence(ctx, restarts)
}
// SimulateNetworkIssues submits a fixed three-step sequence for nodes:
// add 100ms latency with 20ms jitter (no delay), add 2% packet loss (Delay
// 30s), then restore the network (Delay 60s).
func (c *FaultInjectorClient) SimulateNetworkIssues(ctx context.Context, nodes []string) (*ActionResponse, error) {
	addLatency := SequenceAction{
		Type: ActionNetworkLatency,
		Parameters: map[string]interface{}{
			"nodes":   nodes,
			"latency": "100ms",
			"jitter":  "20ms",
		},
	}
	dropPackets := SequenceAction{
		Type: ActionNetworkPacketLoss,
		Parameters: map[string]interface{}{
			"nodes":        nodes,
			"loss_percent": 2.0,
		},
		Delay: 30 * time.Second,
	}
	restore := SequenceAction{
		Type: ActionNetworkRestore,
		Parameters: map[string]interface{}{
			"nodes": nodes,
		},
		Delay: 60 * time.Second,
	}

	steps := []SequenceAction{addLatency, dropPackets, restore}
	return c.ExecuteSequence(ctx, steps)
}
// Helper types and functions
type (
	// waitConfig carries the tunables consulted by WaitForAction.
	waitConfig struct {
		pollInterval time.Duration // time between two status polls
		maxWaitTime  time.Duration // total time to wait before giving up
	}

	// WaitOption mutates a waitConfig; pass values of this type to
	// WaitForAction to override its defaults.
	WaitOption func(*waitConfig)
)

// WithPollInterval returns an option that sets how often the action status is
// polled.
func WithPollInterval(interval time.Duration) WaitOption {
	apply := func(cfg *waitConfig) {
		cfg.pollInterval = interval
	}
	return apply
}

// WithMaxWaitTime returns an option that sets the total time to wait for an
// action before timing out.
func WithMaxWaitTime(maxWait time.Duration) WaitOption {
	apply := func(cfg *waitConfig) {
		cfg.maxWaitTime = maxWait
	}
	return apply
}
// request performs one HTTP call against the fault injector and optionally
// decodes the JSON reply.
//
// path is appended to the client's base URL. When body is non-nil it is
// JSON-encoded and sent with a "Content-Type: application/json" header. When
// result is non-nil the response body is JSON-decoded into it.
//
// Errors are returned for: a body that cannot be marshaled, request
// construction or transport failures, an unreadable response body, any
// response with status code >= 400 (the error carries the code and raw body),
// and a response that cannot be decoded. Decoding failures are reported as
// errors that include the raw body instead of panicking, so one malformed
// reply fails the calling test rather than aborting the whole test binary.
//
// Special case: when result is a *ActionStatusResponse and strict decoding
// fails (the shape of "output" is not stable across server versions), the
// body is decoded into a generic map and the known fields are copied over by
// hand; the raw "output" value is then stored under Output["result"].
func (c *FaultInjectorClient) request(ctx context.Context, method, path string, body interface{}, result interface{}) error {
	var reqBody io.Reader
	if body != nil {
		jsonData, err := json.Marshal(body)
		if err != nil {
			return fmt.Errorf("failed to marshal request body: %w", err)
		}
		reqBody = bytes.NewReader(jsonData)
	}

	req, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, reqBody)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("failed to execute request: %w", err)
	}
	defer resp.Body.Close()

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("failed to read response body: %w", err)
	}
	if resp.StatusCode >= 400 {
		return fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(respBody))
	}
	if result == nil {
		return nil
	}

	decodeErr := json.Unmarshal(respBody, result)
	if decodeErr == nil {
		return nil
	}

	// Only action status replies get the lenient fallback; every other
	// result type reports the strict decoding error.
	statusResult, ok := result.(*ActionStatusResponse)
	if !ok {
		return fmt.Errorf("failed to unmarshal response %q: %w", respBody, decodeErr)
	}

	mapResult := map[string]interface{}{}
	if err := json.Unmarshal(respBody, &mapResult); err != nil {
		return fmt.Errorf("failed to unmarshal response %q: %w", respBody, err)
	}

	statusResult.Error = mapResult["error"]
	// Wrap the raw output, whatever its JSON type, so Output keeps its map type.
	statusResult.Output = map[string]interface{}{"result": mapResult["output"]}
	if status, ok := mapResult["status"].(string); ok {
		statusResult.Status = ActionStatus(status)
	}
	switch statusResult.Status {
	case StatusSuccess, StatusFailed, StatusCancelled:
		// The end time is not taken from the reply in this path; use the
		// local clock as an approximation.
		statusResult.EndTime = time.Now()
	}
	if progress, ok := mapResult["progress"].(float64); ok {
		statusResult.Progress = progress
	}
	if actionID, ok := mapResult["action_id"].(string); ok {
		statusResult.ActionID = actionID
	}
	return nil
}
// Utility functions for common scenarios
// GetClusterNodes returns the IDs of all cluster nodes.
//
// TODO Implement: this is a placeholder that returns a hard-coded list of six
// node IDs ("node-1" through "node-6"); a real implementation would typically
// obtain them from the environment or via discovery.
func GetClusterNodes() []string {
	allNodes := []string{
		"node-1",
		"node-2",
		"node-3",
		"node-4",
		"node-5",
		"node-6",
	}
	return allNodes
}
// GetMasterNodes returns the IDs of the master nodes.
//
// TODO Implement: placeholder returning the hard-coded IDs "node-1",
// "node-2" and "node-3".
func GetMasterNodes() []string {
	masters := []string{
		"node-1",
		"node-2",
		"node-3",
	}
	return masters
}
// GetSlaveNodes returns the IDs of the slave (replica) nodes.
//
// TODO Implement: placeholder returning the hard-coded IDs "node-4",
// "node-5" and "node-6".
func GetSlaveNodes() []string {
	replicas := []string{
		"node-4",
		"node-5",
		"node-6",
	}
	return replicas
}
// ParseNodeID derives a node ID from a node address.
//
// Everything from the first ":" onward is discarded, then a leading "redis-"
// is stripped from what remains, e.g. "redis-node-1:7001" -> "node-1". Input
// with neither a colon nor the prefix is returned unchanged; a "redis-" that
// is not at the start is left in place.
func ParseNodeID(nodeAddr string) string {
	host := nodeAddr
	if i := strings.IndexByte(nodeAddr, ':'); i >= 0 {
		host = nodeAddr[:i]
	}
	// TrimPrefix is a no-op when the prefix is absent, so no separate
	// containment check is needed.
	return strings.TrimPrefix(host, "redis-")
}
// FormatSlotRange renders a slot range as text: "start-end" when the two
// bounds differ, or just the single slot number when they are equal.
func FormatSlotRange(start, end int) string {
	if start != end {
		return fmt.Sprintf("%d-%d", start, end)
	}
	return strconv.Itoa(start)
}