first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/core/mcp/healthmonitor.go
+++ b/core/mcp/healthmonitor.go
@@ -0,0 +1,312 @@
+package mcp
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/mark3labs/mcp-go/client"
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+const (
+	// Health check configuration
+
+	// DefaultHealthCheckInterval is the interval between health checks. It is
+	// used by NewClientHealthMonitor when the caller passes a zero interval.
+	DefaultHealthCheckInterval = 10 * time.Second
+	// DefaultHealthCheckTimeout is the timeout applied to each individual
+	// health check request.
+	DefaultHealthCheckTimeout  = 5 * time.Second
+	// MaxConsecutiveFailures is the number of consecutive failed health checks
+	// after which the client is marked disconnected and a reconnect is attempted.
+	MaxConsecutiveFailures     = 5
+)
+
+// ClientHealthMonitor tracks the health status of a single MCP client.
+//
+// A monitor is created with NewClientHealthMonitor, started with Start (which
+// launches a background loop driven by a ticker) and stopped with Stop. Each
+// tick runs one health check against the client's connection; after
+// maxConsecutiveFailures failed checks in a row the client is marked
+// disconnected and a background reconnection attempt is launched.
+type ClientHealthMonitor struct {
+	// manager owns the client map that is consulted on every check and
+	// performs the actual reconnection.
+	manager                *MCPManager
+	// clientID is the key of the monitored client in the manager's client map.
+	clientID               string
+	// interval is the period of the health check ticker.
+	interval               time.Duration
+	// timeout bounds each individual health check request.
+	timeout                time.Duration
+	// maxConsecutiveFailures is the failure threshold that triggers the
+	// disconnected state and a reconnection attempt.
+	maxConsecutiveFailures int
+	// logger receives the monitor's diagnostic messages.
+	logger                 schemas.Logger
+	// mu guards the mutable fields below. Start and Stop hold it while they
+	// update isMonitoring, ticker, ctx and cancel; the failure counter and the
+	// isReconnecting flag are also only changed while it is held.
+	mu                     sync.Mutex
+	// ticker drives the monitoring loop; created in Start, stopped in Stop.
+	ticker                 *time.Ticker
+	// ctx is cancelled by Stop to terminate the monitoring loop.
+	ctx                    context.Context
+	// cancel cancels ctx; created together with ctx in Start.
+	cancel                 context.CancelFunc
+	// isMonitoring is true between a successful Start and the next Stop.
+	isMonitoring           bool
+	// consecutiveFailures counts failed health checks since the last success
+	// or successful reconnect.
+	consecutiveFailures    int
+	isPingAvailable        bool // Whether the MCP server supports ping for health checks; when false, a tools/list request is used instead
+	isReconnecting         bool // Whether a reconnection attempt is currently in progress; health checks are skipped while it is set
+}
+
+// NewClientHealthMonitor creates a new, not yet started, health monitor for
+// the MCP client identified by clientID.
+//
+// Parameters:
+//   - manager: the MCPManager that owns the client and performs reconnects.
+//   - clientID: the key of the client in the manager's client map.
+//   - interval: the period between health checks. A zero or negative value
+//     falls back to DefaultHealthCheckInterval; this matters because Start
+//     feeds the interval to time.NewTicker, which panics on a non-positive
+//     duration.
+//   - isPingAvailable: whether the server supports ping; when false the
+//     monitor issues a tools/list request as its health probe instead.
+//   - logger: destination for log messages; nil falls back to defaultLogger.
+//
+// The returned monitor uses DefaultHealthCheckTimeout per check and
+// MaxConsecutiveFailures as its failure threshold. Call Start to begin
+// monitoring.
+func NewClientHealthMonitor(
+	manager *MCPManager,
+	clientID string,
+	interval time.Duration,
+	isPingAvailable bool,
+	logger schemas.Logger,
+) *ClientHealthMonitor {
+	// Guard against non-positive intervals, not just zero: a negative value
+	// would otherwise crash the process when the ticker is created in Start.
+	if interval <= 0 {
+		interval = DefaultHealthCheckInterval
+	}
+
+	if logger == nil {
+		logger = defaultLogger
+	}
+
+	return &ClientHealthMonitor{
+		manager:                manager,
+		clientID:               clientID,
+		interval:               interval,
+		timeout:                DefaultHealthCheckTimeout,
+		maxConsecutiveFailures: MaxConsecutiveFailures,
+		logger:                 logger,
+		isMonitoring:           false,
+		consecutiveFailures:    0,
+		isPingAvailable:        isPingAvailable,
+	}
+}
+
+// Start begins monitoring the client's health in a background goroutine.
+//
+// It is a no-op when the monitor is already running. When the client is not
+// present in the manager's client map, an error is logged and nothing is
+// started.
+//
+// Lock ordering: the manager's lock is never acquired while chm.mu is held.
+// Stop may be invoked by a caller that already holds the manager's write lock
+// (and Stop takes chm.mu), so acquiring the two locks in the opposite order
+// here could deadlock.
+func (chm *ClientHealthMonitor) Start() {
+	// Cheap early exit; re-checked below once chm.mu is held for the update.
+	chm.mu.Lock()
+	alreadyMonitoring := chm.isMonitoring
+	chm.mu.Unlock()
+	if alreadyMonitoring {
+		return // Already monitoring
+	}
+
+	// Check the client exists before allocating resources. The display name
+	// is captured while the manager's read lock is held so the client state is
+	// not read after the lock has been released.
+	displayName := chm.clientID
+	exists := false
+	func() {
+		chm.manager.mu.RLock()
+		defer chm.manager.mu.RUnlock()
+
+		clientState, ok := chm.manager.clientMap[chm.clientID]
+		exists = ok
+		// A nil entry is tolerated (the health check treats it as "no
+		// connection"); fall back to the client ID for logging in that case.
+		if ok && clientState != nil {
+			displayName = clientState.ExecutionConfig.Name
+		}
+	}()
+
+	if !exists {
+		// Use clientID for logging when client is missing
+		chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
+		return
+	}
+
+	chm.mu.Lock()
+	defer chm.mu.Unlock()
+
+	// Another Start may have won the race while chm.mu was released.
+	if chm.isMonitoring {
+		return
+	}
+
+	// Now allocate resources (after validation)
+	chm.isMonitoring = true
+	chm.ctx, chm.cancel = context.WithCancel(context.Background())
+	chm.ticker = time.NewTicker(chm.interval)
+
+	go chm.monitorLoop()
+	chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, displayName)
+}
+
+// Stop halts health monitoring for this client. Calling it on a monitor that
+// is not running does nothing.
+//
+// Stop deliberately never touches manager.clientMap: it can be reached from
+// removeClientUnsafe(), which already holds the manager's write lock, and
+// taking that lock again here would deadlock.
+func (chm *ClientHealthMonitor) Stop() {
+	chm.mu.Lock()
+	defer chm.mu.Unlock()
+
+	if chm.isMonitoring {
+		chm.isMonitoring = false
+
+		// Release the ticker first, then cancel the context so the
+		// monitoring loop returns.
+		if t := chm.ticker; t != nil {
+			t.Stop()
+		}
+		if cancelLoop := chm.cancel; cancelLoop != nil {
+			cancelLoop()
+		}
+
+		chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
+	}
+}
+
+// monitorLoop runs the health check loop: one health check per ticker tick
+// until the monitor's context is cancelled.
+//
+// The context and ticker are snapshotted once, under chm.mu, instead of being
+// re-read from the struct on every iteration. Start assigns both fields while
+// holding chm.mu, so reading them unlocked would race with a later Start on
+// the same monitor, and a loop belonging to a stopped run could pick up the
+// new run's context and never exit. With the snapshot, each loop is bound to
+// the context and ticker that were current when it began.
+func (chm *ClientHealthMonitor) monitorLoop() {
+	chm.mu.Lock()
+	ctx, ticker := chm.ctx, chm.ticker
+	chm.mu.Unlock()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			chm.performHealthCheck()
+		}
+	}
+}
+
+// performHealthCheck runs a single health probe against the client.
+//
+// The probe is skipped while a reconnection is in flight, and the monitor
+// stops itself when the client is no longer in the manager's map. A missing
+// connection counts as a failed probe. A successful probe clears the failure
+// counter and marks the client connected. Once the counter reaches
+// maxConsecutiveFailures the client is marked disconnected and, unless one is
+// already running, a background reconnect is launched via attemptReconnect.
+func (chm *ClientHealthMonitor) performHealthCheck() {
+	// Nothing to do while a reconnection attempt is still running.
+	chm.mu.Lock()
+	reconnecting := chm.isReconnecting
+	chm.mu.Unlock()
+	if reconnecting {
+		return
+	}
+
+	// Read Conn while the manager's read lock is held so the read cannot race
+	// with removeClientUnsafe clearing it under the write lock.
+	var conn *client.Client
+	chm.manager.mu.RLock()
+	entry, found := chm.manager.clientMap[chm.clientID]
+	if found && entry != nil {
+		conn = entry.Conn
+	}
+	chm.manager.mu.RUnlock()
+
+	if !found {
+		chm.Stop()
+		return
+	}
+
+	var probeErr error
+	switch {
+	case conn == nil:
+		// No active connection — counts as a failed probe.
+		probeErr = fmt.Errorf("no active connection")
+	default:
+		// Bound the probe by the configured timeout.
+		probeCtx, cancel := context.WithTimeout(context.Background(), chm.timeout)
+		defer cancel()
+
+		if chm.isPingAvailable {
+			probeErr = conn.Ping(probeCtx)
+		} else {
+			// Servers without ping support are probed with a tools/list call.
+			req := mcp.ListToolsRequest{
+				PaginatedRequest: mcp.PaginatedRequest{
+					Request: mcp.Request{
+						Method: string(mcp.MethodToolsList),
+					},
+				},
+			}
+			_, probeErr = conn.ListTools(probeCtx, req)
+		}
+	}
+
+	if probeErr == nil {
+		chm.resetFailures()
+		chm.updateClientState(schemas.MCPConnectionStateConnected)
+		return
+	}
+
+	chm.incrementFailures()
+	if chm.getConsecutiveFailures() < chm.maxConsecutiveFailures {
+		return
+	}
+
+	// Threshold reached: flag the client and start at most one reconnect.
+	chm.updateClientState(schemas.MCPConnectionStateDisconnected)
+	chm.mu.Lock()
+	if !chm.isReconnecting {
+		chm.isReconnecting = true
+		go chm.attemptReconnect()
+	}
+	chm.mu.Unlock()
+}
+
+// attemptReconnect is the body of the background reconnection goroutine. It
+// delegates to the manager's ReconnectClient and, when that succeeds, zeroes
+// the consecutive-failure counter. Whatever the outcome, the isReconnecting
+// flag is cleared on return so that later health check cycles run again (and
+// may trigger another attempt after a failure).
+func (chm *ClientHealthMonitor) attemptReconnect() {
+	clearReconnecting := func() {
+		chm.mu.Lock()
+		chm.isReconnecting = false
+		chm.mu.Unlock()
+	}
+	defer clearReconnecting()
+
+	chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)
+
+	err := chm.manager.ReconnectClient(chm.clientID)
+	if err == nil {
+		chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
+		chm.resetFailures()
+		return
+	}
+
+	chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, err)
+}
+
+// updateClientState sets the connection state of the monitored client in the
+// manager's client map and logs the transition.
+//
+// It does nothing when the client is missing from the map, when its entry is
+// nil, or when the state is already equal to the requested one. The client's
+// name is captured while the manager's write lock is held, and the log call
+// is made only after the lock has been released.
+func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
+	chm.manager.mu.Lock()
+	clientState, exists := chm.manager.clientMap[chm.clientID]
+	// Only update if there is an entry and the state actually changed.
+	if !exists || clientState == nil || clientState.State == state {
+		chm.manager.mu.Unlock()
+		return
+	}
+
+	clientState.State = state
+	// Copy the name out under the lock rather than reading the shared client
+	// state after unlocking.
+	name := clientState.ExecutionConfig.Name
+	chm.manager.mu.Unlock()
+
+	// Log after releasing the lock. The arguments are passed to the logger's
+	// own formatter, as everywhere else in this file, so a '%' inside the
+	// client name cannot be misread as a formatting verb.
+	chm.logger.Info("%s Client %s connection state changed to: %s", MCPLogPrefix, name, state)
+}
+
+// incrementFailures adds one to the consecutive-failure counter while holding
+// the monitor's mutex.
+func (chm *ClientHealthMonitor) incrementFailures() {
+	chm.mu.Lock()
+	chm.consecutiveFailures += 1
+	chm.mu.Unlock()
+}
+
+// resetFailures sets the consecutive-failure counter back to zero while
+// holding the monitor's mutex.
+func (chm *ClientHealthMonitor) resetFailures() {
+	chm.mu.Lock()
+	chm.consecutiveFailures = 0
+	chm.mu.Unlock()
+}
+
+// getConsecutiveFailures reports how many health checks have failed in a row,
+// reading the counter while holding the monitor's mutex.
+func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
+	chm.mu.Lock()
+	failures := chm.consecutiveFailures
+	chm.mu.Unlock()
+	return failures
+}
+
+// HealthMonitorManager manages all client health monitors, keeping at most one
+// registered monitor per client ID.
+type HealthMonitorManager struct {
+	// monitors maps a client ID to the monitor registered for it.
+	monitors map[string]*ClientHealthMonitor
+	// mu guards monitors; StartMonitoring, StopMonitoring and StopAll all hold
+	// it for their full duration.
+	mu       sync.RWMutex
+}
+
+// NewHealthMonitorManager returns a HealthMonitorManager with an empty,
+// ready-to-use monitor registry.
+func NewHealthMonitorManager() *HealthMonitorManager {
+	registry := map[string]*ClientHealthMonitor{}
+	return &HealthMonitorManager{monitors: registry}
+}
+
+// StartMonitoring registers monitor under its client ID and starts it. A
+// monitor previously registered for the same client ID is stopped first and
+// then replaced.
+func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
+	hmm.mu.Lock()
+	defer hmm.mu.Unlock()
+
+	id := monitor.clientID
+
+	// Retire whatever monitor was registered for this client before.
+	if previous, found := hmm.monitors[id]; found {
+		previous.Stop()
+	}
+
+	hmm.monitors[id] = monitor
+	monitor.Start()
+}
+
+// StopMonitoring stops the monitor registered for clientID and removes it from
+// the registry. Unknown client IDs are ignored.
+func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
+	hmm.mu.Lock()
+	defer hmm.mu.Unlock()
+
+	registered, found := hmm.monitors[clientID]
+	if !found {
+		return
+	}
+
+	registered.Stop()
+	delete(hmm.monitors, clientID)
+}
+
+// StopAll stops every registered monitor and replaces the registry with a
+// fresh, empty map.
+func (hmm *HealthMonitorManager) StopAll() {
+	hmm.mu.Lock()
+	defer hmm.mu.Unlock()
+
+	for id := range hmm.monitors {
+		hmm.monitors[id].Stop()
+	}
+	hmm.monitors = map[string]*ClientHealthMonitor{}
+}