// bifrost/core/mcp/healthmonitor.go

package mcp

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/mark3labs/mcp-go/client"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/maximhq/bifrost/core/schemas"
)

// Defaults applied by NewClientHealthMonitor.
const (
	// DefaultHealthCheckInterval is the pause between two consecutive health
	// checks when the caller does not supply an interval.
	DefaultHealthCheckInterval = 10 * time.Second

	// DefaultHealthCheckTimeout bounds how long a single health check request
	// may take.
	DefaultHealthCheckTimeout = 5 * time.Second

	// MaxConsecutiveFailures is the number of back-to-back failed checks after
	// which the client is marked disconnected and a reconnect is attempted.
	MaxConsecutiveFailures = 5
)

// ClientHealthMonitor periodically probes one MCP client, counts consecutive
// probe failures, and triggers a reconnect once the failure threshold is hit.
type ClientHealthMonitor struct {
	// Configuration populated by NewClientHealthMonitor; no method in this
	// file reassigns these fields afterwards.
	manager                *MCPManager
	clientID               string
	interval               time.Duration
	timeout                time.Duration
	maxConsecutiveFailures int
	logger                 schemas.Logger

	// mu is held by Start, Stop and the counter/flag helpers whenever they
	// write the mutable fields below.
	mu sync.Mutex

	// Run-loop plumbing: created in Start, released in Stop.
	ticker       *time.Ticker
	ctx          context.Context
	cancel       context.CancelFunc
	isMonitoring bool

	// Health bookkeeping.
	consecutiveFailures int
	isPingAvailable     bool // true: probe with Ping; false: probe with ListTools
	isReconnecting      bool // true while a background reconnect attempt is running
}

// NewClientHealthMonitor creates a health monitor for the MCP client registered
// under clientID in manager. The monitor is returned idle; call Start to begin
// checking.
//
// interval is the pause between checks; any non-positive value falls back to
// DefaultHealthCheckInterval. isPingAvailable selects the probe used by each
// check (Ping when true, ListTools otherwise). A nil logger is replaced with
// the package default logger.
func NewClientHealthMonitor(
	manager *MCPManager,
	clientID string,
	interval time.Duration,
	isPingAvailable bool,
	logger schemas.Logger,
) *ClientHealthMonitor {
	// time.NewTicker panics on a non-positive duration, so negative values
	// must be normalised here as well as zero.
	if interval <= 0 {
		interval = DefaultHealthCheckInterval
	}

	if logger == nil {
		logger = defaultLogger
	}

	return &ClientHealthMonitor{
		manager:                manager,
		clientID:               clientID,
		interval:               interval,
		timeout:                DefaultHealthCheckTimeout,
		maxConsecutiveFailures: MaxConsecutiveFailures,
		logger:                 logger,
		isMonitoring:           false,
		consecutiveFailures:    0,
		isPingAvailable:        isPingAvailable,
	}
}

// Start begins monitoring the client's health in a background goroutine.
// It is a no-op when the monitor is already running, and it logs an error and
// returns without starting when the client is not registered in the manager.
func (chm *ClientHealthMonitor) Start() {
	// Look the client up BEFORE taking chm.mu. Stop can be invoked by a caller
	// that already holds the manager's write lock (removeClientUnsafe) and then
	// acquires chm.mu; acquiring the two locks in the opposite order here could
	// deadlock against it.
	displayName := chm.clientID
	chm.manager.mu.RLock()
	clientState, exists := chm.manager.clientMap[chm.clientID]
	if exists && clientState != nil {
		// Read the name while the manager lock is held so the log line below
		// does not touch client state unsynchronised.
		displayName = clientState.ExecutionConfig.Name
	}
	chm.manager.mu.RUnlock()

	chm.mu.Lock()
	defer chm.mu.Unlock()

	if chm.isMonitoring {
		return // Already monitoring
	}

	if !exists {
		// Use clientID for logging when client is missing
		chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
		return
	}

	// Allocate resources only after validation succeeded.
	chm.isMonitoring = true
	chm.ctx, chm.cancel = context.WithCancel(context.Background())
	chm.ticker = time.NewTicker(chm.interval)

	go chm.monitorLoop()
	chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, displayName)
}

// Stop halts health monitoring: it stops the ticker and cancels the monitor
// context. Calling Stop on a monitor that is not running does nothing.
func (chm *ClientHealthMonitor) Stop() {
	chm.mu.Lock()
	defer chm.mu.Unlock()

	if chm.isMonitoring {
		// Deliberately no access to manager.clientMap in here: Stop() may be
		// called from removeClientUnsafe(), which already holds the manager's
		// write lock, and touching the manager would deadlock.
		chm.isMonitoring = false

		if t := chm.ticker; t != nil {
			t.Stop()
		}
		if abort := chm.cancel; abort != nil {
			abort()
		}

		chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
	}
}

// monitorLoop runs one health check per ticker tick until the monitor context
// is cancelled.
//
// The context and ticker are snapshotted once, under chm.mu, instead of being
// re-read from the struct on every iteration. Start replaces both fields when
// a stopped monitor is started again; a loop that re-read them could return
// from a slow health check, pick up the NEW context and ticker, and keep
// running next to the freshly started loop. With the snapshot, each loop is
// bound to the context that was current when it was launched.
func (chm *ClientHealthMonitor) monitorLoop() {
	chm.mu.Lock()
	ctx, ticker := chm.ctx, chm.ticker
	chm.mu.Unlock()

	// Nothing to wait on if the loop is entered without Start having set up
	// the context and ticker.
	if ctx == nil || ticker == nil {
		return
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// select picks randomly when both channels are ready; do not run
			// another check once the monitor has been stopped.
			if ctx.Err() != nil {
				return
			}
			chm.performHealthCheck()
		}
	}
}

// performHealthCheck runs a single health probe against the client.
//
// A successful probe resets the failure counter and marks the client
// connected. A failed probe increments the counter; once the counter reaches
// maxConsecutiveFailures the client is marked disconnected and, unless one is
// already running, a background reconnect attempt is spawned. If the client is
// no longer registered in the manager the monitor stops itself.
func (chm *ClientHealthMonitor) performHealthCheck() {
	// A reconnect already in flight owns recovery; skip this tick.
	chm.mu.Lock()
	reconnecting := chm.isReconnecting
	chm.mu.Unlock()
	if reconnecting {
		return
	}

	// Snapshot Conn while holding the manager's read lock so we don't race
	// with removeClientUnsafe zeroing it under the write lock.
	var conn *client.Client
	chm.manager.mu.RLock()
	entry, registered := chm.manager.clientMap[chm.clientID]
	if registered && entry != nil {
		conn = entry.Conn
	}
	chm.manager.mu.RUnlock()

	if !registered {
		chm.Stop()
		return
	}

	var probeErr error
	if conn != nil {
		// Bound the probe with the configured timeout.
		probeCtx, release := context.WithTimeout(context.Background(), chm.timeout)
		defer release()

		if chm.isPingAvailable {
			probeErr = conn.Ping(probeCtx)
		} else {
			request := mcp.ListToolsRequest{
				PaginatedRequest: mcp.PaginatedRequest{
					Request: mcp.Request{
						Method: string(mcp.MethodToolsList),
					},
				},
			}
			_, probeErr = conn.ListTools(probeCtx, request)
		}
	} else {
		// No active connection counts as a failed probe.
		probeErr = fmt.Errorf("no active connection")
	}

	if probeErr == nil {
		chm.resetFailures()
		chm.updateClientState(schemas.MCPConnectionStateConnected)
		return
	}

	chm.incrementFailures()
	if chm.getConsecutiveFailures() < chm.maxConsecutiveFailures {
		return
	}

	chm.updateClientState(schemas.MCPConnectionStateDisconnected)

	// Re-check the flag under the lock so at most one reconnect goroutine runs.
	chm.mu.Lock()
	if !chm.isReconnecting {
		chm.isReconnecting = true
		go chm.attemptReconnect()
	}
	chm.mu.Unlock()
}

// attemptReconnect is run as a background goroutine by performHealthCheck and
// asks the manager to reconnect this client via ReconnectClient.
// NOTE(review): the original comment states ReconnectClient applies
// exponential backoff retries internally; that is not visible from here.
//
// On success the failure counter is reset. In every case the isReconnecting
// flag is cleared on return so later health checks run again.
func (chm *ClientHealthMonitor) attemptReconnect() {
	defer func() {
		chm.mu.Lock()
		defer chm.mu.Unlock()
		chm.isReconnecting = false
	}()

	chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)

	reconnectErr := chm.manager.ReconnectClient(chm.clientID)
	if reconnectErr == nil {
		chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
		chm.resetFailures()
		return
	}

	chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, reconnectErr)
}

// updateClientState records state as the client's connection state in the
// manager and logs the transition. It does nothing when the client is not
// registered, when its map entry is nil, or when the state is unchanged.
func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
	chm.manager.mu.Lock()
	clientState, exists := chm.manager.clientMap[chm.clientID]
	// A nil entry is tolerated here just as performHealthCheck tolerates it.
	if !exists || clientState == nil || clientState.State == state {
		chm.manager.mu.Unlock()
		return
	}

	clientState.State = state
	// Capture the name while the lock is still held; the log call below runs
	// after the lock is released.
	name := clientState.ExecutionConfig.Name
	chm.manager.mu.Unlock()

	// Pass format and arguments separately, like every other log call in this
	// file, so a '%' in the client name is not re-interpreted as a verb.
	chm.logger.Info("%s Client %s connection state changed to: %s", MCPLogPrefix, name, state)
}

// incrementFailures adds one to the consecutive failure counter while holding
// the monitor's mutex.
func (chm *ClientHealthMonitor) incrementFailures() {
	chm.mu.Lock()
	chm.consecutiveFailures += 1
	chm.mu.Unlock()
}

// resetFailures sets the consecutive failure counter back to zero while
// holding the monitor's mutex.
func (chm *ClientHealthMonitor) resetFailures() {
	chm.mu.Lock()
	chm.consecutiveFailures = 0
	chm.mu.Unlock()
}

// getConsecutiveFailures reports how many health checks have failed in a row,
// reading the counter under the monitor's mutex.
func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
	chm.mu.Lock()
	failures := chm.consecutiveFailures
	chm.mu.Unlock()
	return failures
}

// HealthMonitorManager is a registry of client health monitors keyed by
// client ID.
type HealthMonitorManager struct {
	// monitors holds the monitor currently registered for each client ID.
	monitors map[string]*ClientHealthMonitor

	// mu guards monitors; the methods in this file only take it exclusively.
	mu sync.RWMutex
}

// NewHealthMonitorManager returns an empty registry ready to accept monitors.
func NewHealthMonitorManager() *HealthMonitorManager {
	registry := new(HealthMonitorManager)
	registry.monitors = make(map[string]*ClientHealthMonitor)
	return registry
}

// StartMonitoring registers monitor under its client ID and starts it. A
// monitor already registered for the same client ID is stopped first.
//
// NOTE(review): hmm.mu stays held across monitor.Start(), which acquires the
// MCP manager's read lock. If any caller reaches StopMonitoring while holding
// the manager's write lock, the two paths could block each other — confirm
// against the callers.
func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	id := monitor.clientID

	// Retire whatever monitor currently owns this client ID.
	previous, registered := hmm.monitors[id]
	if registered {
		previous.Stop()
	}

	hmm.monitors[id] = monitor
	monitor.Start()
}

// StopMonitoring stops the monitor registered for clientID and removes it from
// the registry. Unknown client IDs are ignored.
func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	target, registered := hmm.monitors[clientID]
	if !registered {
		return
	}

	target.Stop()
	delete(hmm.monitors, clientID)
}

// StopAll stops every registered monitor and replaces the registry with a
// fresh, empty map.
func (hmm *HealthMonitorManager) StopAll() {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	for id := range hmm.monitors {
		hmm.monitors[id].Stop()
	}

	hmm.monitors = make(map[string]*ClientHealthMonitor)
}