package mcp

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/mark3labs/mcp-go/client"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/maximhq/bifrost/core/schemas"
)

// Health check configuration.
const (
	// DefaultHealthCheckInterval is the interval between health checks used
	// when the caller does not supply a positive interval.
	DefaultHealthCheckInterval = 10 * time.Second
	// DefaultHealthCheckTimeout is the timeout applied to each health check.
	DefaultHealthCheckTimeout = 5 * time.Second
	// MaxConsecutiveFailures is the number of consecutive failed checks after
	// which the client is marked as disconnected and a reconnect is attempted.
	MaxConsecutiveFailures = 5
)

// ClientHealthMonitor tracks the health status of an MCP client.
type ClientHealthMonitor struct {
	manager                *MCPManager
	clientID               string
	interval               time.Duration
	timeout                time.Duration
	maxConsecutiveFailures int
	logger                 schemas.Logger

	// mu is held whenever the fields below are read or written.
	mu                  sync.Mutex
	ticker              *time.Ticker
	ctx                 context.Context
	cancel              context.CancelFunc
	isMonitoring        bool
	consecutiveFailures int
	isPingAvailable     bool // Whether the MCP server supports ping for health checks
	isReconnecting      bool // Whether a reconnection attempt is currently in progress
}

// NewClientHealthMonitor creates a new health monitor for an MCP client.
//
// A non-positive interval falls back to DefaultHealthCheckInterval (a
// non-positive duration would make time.NewTicker panic in Start), and a nil
// logger falls back to the package default logger. The monitor is returned
// idle; call Start to begin monitoring.
func NewClientHealthMonitor(
	manager *MCPManager,
	clientID string,
	interval time.Duration,
	isPingAvailable bool,
	logger schemas.Logger,
) *ClientHealthMonitor {
	if interval <= 0 {
		interval = DefaultHealthCheckInterval
	}
	if logger == nil {
		logger = defaultLogger
	}

	return &ClientHealthMonitor{
		manager:                manager,
		clientID:               clientID,
		interval:               interval,
		timeout:                DefaultHealthCheckTimeout,
		maxConsecutiveFailures: MaxConsecutiveFailures,
		logger:                 logger,
		isMonitoring:           false,
		consecutiveFailures:    0,
		isPingAvailable:        isPingAvailable,
	}
}

// Start begins monitoring the client's health in a background goroutine.
// It is a no-op when monitoring is already active, and logs an error without
// starting when the client has no entry in the manager.
func (chm *ClientHealthMonitor) Start() {
	chm.mu.Lock()
	defer chm.mu.Unlock()

	if chm.isMonitoring {
		return // Already monitoring
	}

	// Check the client exists FIRST before allocating resources. The display
	// name is captured while the manager lock is held so it is not read after
	// the lock has been released.
	chm.manager.mu.RLock()
	clientState, exists := chm.manager.clientMap[chm.clientID]
	var clientName string
	if exists && clientState != nil {
		clientName = clientState.ExecutionConfig.Name
	}
	chm.manager.mu.RUnlock()

	if !exists || clientState == nil {
		// Use clientID for logging when client is missing
		chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
		return
	}

	// Now allocate resources (after validation)
	ctx, cancel := context.WithCancel(context.Background())
	ticker := time.NewTicker(chm.interval)
	chm.isMonitoring = true
	chm.ctx, chm.cancel = ctx, cancel
	chm.ticker = ticker

	// Hand this run's context and tick channel to the loop explicitly so a
	// later Stop/Start cycle cannot redirect this goroutine to the new ones.
	go chm.runMonitorLoop(ctx, ticker.C)

	chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, clientName)
}

// Stop stops monitoring the client's health. It is a no-op when the monitor
// is not running.
func (chm *ClientHealthMonitor) Stop() {
	chm.mu.Lock()
	defer chm.mu.Unlock()

	if !chm.isMonitoring {
		return // Not monitoring
	}

	// Always perform cleanup - do not access manager.clientMap here to avoid
	// deadlock when Stop() is called from removeClientUnsafe() which already
	// holds the manager's write lock
	chm.isMonitoring = false
	if chm.ticker != nil {
		chm.ticker.Stop()
	}
	if chm.cancel != nil {
		chm.cancel()
	}

	chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
}

// monitorLoop runs the health check loop against the monitor's current
// context and ticker. It snapshots both under the lock once, then delegates
// to runMonitorLoop; it returns immediately if the monitor was never started.
func (chm *ClientHealthMonitor) monitorLoop() {
	chm.mu.Lock()
	ctx, ticker := chm.ctx, chm.ticker
	chm.mu.Unlock()

	if ctx == nil || ticker == nil {
		return
	}
	chm.runMonitorLoop(ctx, ticker.C)
}

// runMonitorLoop performs a health check on every tick until ctx is
// cancelled. It only uses the context and channel it was given, never the
// struct fields, so each Start owns exactly one loop that ends with its Stop.
func (chm *ClientHealthMonitor) runMonitorLoop(ctx context.Context, ticks <-chan time.Time) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticks:
			chm.performHealthCheck()
		}
	}
}

// performHealthCheck performs a health check on the client.
// On max consecutive failures it marks the client as disconnected and spawns
// a background reconnection attempt (with full retry backoff via ReconnectClient).
// If the client is no longer present in the manager, the monitor stops itself.
func (chm *ClientHealthMonitor) performHealthCheck() {
	// Skip while a reconnection attempt is already in flight
	chm.mu.Lock()
	reconnecting := chm.isReconnecting
	chm.mu.Unlock()
	if reconnecting {
		return
	}

	// Get the client connection — capture Conn while holding the lock so we
	// don't race with removeClientUnsafe zeroing it under the write lock.
	chm.manager.mu.RLock()
	clientState, exists := chm.manager.clientMap[chm.clientID]
	var conn *client.Client
	if exists && clientState != nil {
		conn = clientState.Conn
	}
	chm.manager.mu.RUnlock()

	if !exists {
		chm.Stop()
		return
	}

	if err := chm.checkConnection(conn); err != nil {
		chm.incrementFailures()
		if chm.getConsecutiveFailures() >= chm.maxConsecutiveFailures {
			chm.updateClientState(schemas.MCPConnectionStateDisconnected)

			// Re-check the flag under the lock so only one reconnect
			// goroutine is ever in flight.
			chm.mu.Lock()
			if !chm.isReconnecting {
				chm.isReconnecting = true
				go chm.attemptReconnect()
			}
			chm.mu.Unlock()
		}
		return
	}

	chm.resetFailures()
	chm.updateClientState(schemas.MCPConnectionStateConnected)
}

// checkConnection probes conn once, bounded by the monitor's timeout.
// A nil connection is reported as a failure. When the server supports ping a
// Ping request is sent; otherwise a ListTools request is used as the probe.
func (chm *ClientHealthMonitor) checkConnection(conn *client.Client) error {
	if conn == nil {
		// No active connection — treat as a health check failure
		return fmt.Errorf("no active connection")
	}

	ctx, cancel := context.WithTimeout(context.Background(), chm.timeout)
	defer cancel()

	if chm.isPingAvailable {
		return conn.Ping(ctx)
	}

	listRequest := mcp.ListToolsRequest{
		PaginatedRequest: mcp.PaginatedRequest{
			Request: mcp.Request{
				Method: string(mcp.MethodToolsList),
			},
		},
	}
	_, err := conn.ListTools(ctx, listRequest)
	return err
}

// attemptReconnect runs in a background goroutine and calls ReconnectClient,
// which internally applies full exponential backoff retry logic.
// On success the failure counter is reset; in every case the isReconnecting
// flag is cleared on return so the next health check cycle can run again.
func (chm *ClientHealthMonitor) attemptReconnect() {
	defer func() {
		chm.mu.Lock()
		chm.isReconnecting = false
		chm.mu.Unlock()
	}()

	chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)
	if err := chm.manager.ReconnectClient(chm.clientID); err != nil {
		chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, err)
		return
	}

	chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
	chm.resetFailures()
}

// updateClientState updates the client's connection state in the manager.
// It does nothing when the client has no (non-nil) entry, and only writes and
// logs when the state actually changes.
func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
	chm.manager.mu.Lock()
	clientState, exists := chm.manager.clientMap[chm.clientID]
	if !exists || clientState == nil {
		chm.manager.mu.Unlock()
		return
	}

	// Only update if state changed. The name is captured under the lock so
	// the log call below does not touch the client state after unlocking.
	stateChanged := clientState.State != state
	var clientName string
	if stateChanged {
		clientState.State = state
		clientName = clientState.ExecutionConfig.Name
	}
	chm.manager.mu.Unlock()

	// Log after releasing the lock
	if stateChanged {
		chm.logger.Info("%s Client %s connection state changed to: %s", MCPLogPrefix, clientName, state)
	}
}

// incrementFailures increments the consecutive failure counter.
func (chm *ClientHealthMonitor) incrementFailures() {
	chm.mu.Lock()
	defer chm.mu.Unlock()
	chm.consecutiveFailures++
}

// resetFailures resets the consecutive failure counter.
func (chm *ClientHealthMonitor) resetFailures() {
	chm.mu.Lock()
	defer chm.mu.Unlock()
	chm.consecutiveFailures = 0
}

// getConsecutiveFailures returns the current consecutive failure count.
func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
	chm.mu.Lock()
	defer chm.mu.Unlock()
	return chm.consecutiveFailures
}

// HealthMonitorManager manages all client health monitors, keyed by client ID.
type HealthMonitorManager struct {
	monitors map[string]*ClientHealthMonitor
	mu       sync.RWMutex
}

// NewHealthMonitorManager creates a new, empty health monitor manager.
func NewHealthMonitorManager() *HealthMonitorManager {
	return &HealthMonitorManager{
		monitors: make(map[string]*ClientHealthMonitor),
	}
}

// StartMonitoring registers monitor under its client ID and starts it.
// Any monitor previously registered for the same client is stopped first.
func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	// Stop any existing monitor for this client
	if existing, ok := hmm.monitors[monitor.clientID]; ok {
		existing.Stop()
	}

	hmm.monitors[monitor.clientID] = monitor
	monitor.Start()
}

// StopMonitoring stops and unregisters the monitor for a specific client.
// It is a no-op when no monitor is registered for clientID.
func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	if monitor, ok := hmm.monitors[clientID]; ok {
		monitor.Stop()
		delete(hmm.monitors, clientID)
	}
}

// StopAll stops every registered monitor and clears the registry.
func (hmm *HealthMonitorManager) StopAll() {
	hmm.mu.Lock()
	defer hmm.mu.Unlock()

	for _, monitor := range hmm.monitors {
		monitor.Stop()
	}
	hmm.monitors = make(map[string]*ClientHealthMonitor)
}