Files
bifrost/core/mcp/healthmonitor.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

313 lines
8.4 KiB
Go

package mcp
import (
"context"
"fmt"
"sync"
"time"
"github.com/mark3labs/mcp-go/client"
"github.com/mark3labs/mcp-go/mcp"
"github.com/maximhq/bifrost/core/schemas"
)
const (
// Health check configuration
DefaultHealthCheckInterval = 10 * time.Second // Interval between health checks
DefaultHealthCheckTimeout = 5 * time.Second // Timeout for each health check
MaxConsecutiveFailures = 5 // Number of failures before marking as unhealthy
)
// ClientHealthMonitor tracks the health status of an MCP client
type ClientHealthMonitor struct {
manager *MCPManager
clientID string
interval time.Duration
timeout time.Duration
maxConsecutiveFailures int
logger schemas.Logger
mu sync.Mutex
ticker *time.Ticker
ctx context.Context
cancel context.CancelFunc
isMonitoring bool
consecutiveFailures int
isPingAvailable bool // Whether the MCP server supports ping for health checks
isReconnecting bool // Whether a reconnection attempt is currently in progress
}
// NewClientHealthMonitor creates a new health monitor for an MCP client
func NewClientHealthMonitor(
manager *MCPManager,
clientID string,
interval time.Duration,
isPingAvailable bool,
logger schemas.Logger,
) *ClientHealthMonitor {
if interval == 0 {
interval = DefaultHealthCheckInterval
}
if logger == nil {
logger = defaultLogger
}
return &ClientHealthMonitor{
manager: manager,
clientID: clientID,
interval: interval,
timeout: DefaultHealthCheckTimeout,
maxConsecutiveFailures: MaxConsecutiveFailures,
logger: logger,
isMonitoring: false,
consecutiveFailures: 0,
isPingAvailable: isPingAvailable,
}
}
// Start begins monitoring the client's health in a background goroutine
func (chm *ClientHealthMonitor) Start() {
chm.mu.Lock()
defer chm.mu.Unlock()
if chm.isMonitoring {
return // Already monitoring
}
// Check client exists FIRST before allocating resources
chm.manager.mu.RLock()
clientState, exists := chm.manager.clientMap[chm.clientID]
chm.manager.mu.RUnlock()
if !exists {
// Use clientID for logging when client is missing
chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
return
}
// Now allocate resources (after validation)
chm.isMonitoring = true
chm.ctx, chm.cancel = context.WithCancel(context.Background())
chm.ticker = time.NewTicker(chm.interval)
go chm.monitorLoop()
chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, clientState.ExecutionConfig.Name)
}
// Stop stops monitoring the client's health
func (chm *ClientHealthMonitor) Stop() {
chm.mu.Lock()
defer chm.mu.Unlock()
if !chm.isMonitoring {
return // Not monitoring
}
// Always perform cleanup - do not access manager.clientMap here to avoid
// deadlock when Stop() is called from removeClientUnsafe() which already
// holds the manager's write lock
chm.isMonitoring = false
if chm.ticker != nil {
chm.ticker.Stop()
}
if chm.cancel != nil {
chm.cancel()
}
chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
}
// monitorLoop runs the health check loop
func (chm *ClientHealthMonitor) monitorLoop() {
for {
select {
case <-chm.ctx.Done():
return
case <-chm.ticker.C:
chm.performHealthCheck()
}
}
}
// performHealthCheck performs a health check on the client.
// On max consecutive failures it marks the client as disconnected and spawns
// a background reconnection attempt (with full retry backoff via ReconnectClient).
func (chm *ClientHealthMonitor) performHealthCheck() {
// Skip while a reconnection attempt is already in flight
chm.mu.Lock()
if chm.isReconnecting {
chm.mu.Unlock()
return
}
chm.mu.Unlock()
// Get the client connection — capture Conn while holding the lock so we
// don't race with removeClientUnsafe zeroing it under the write lock.
chm.manager.mu.RLock()
clientState, exists := chm.manager.clientMap[chm.clientID]
var conn *client.Client
if exists && clientState != nil {
conn = clientState.Conn
}
chm.manager.mu.RUnlock()
if !exists {
chm.Stop()
return
}
var err error
if conn == nil {
// No active connection — treat as a health check failure
err = fmt.Errorf("no active connection")
} else {
// Perform health check with timeout
ctx, cancel := context.WithTimeout(context.Background(), chm.timeout)
defer cancel()
if chm.isPingAvailable {
err = conn.Ping(ctx)
} else {
listRequest := mcp.ListToolsRequest{
PaginatedRequest: mcp.PaginatedRequest{
Request: mcp.Request{
Method: string(mcp.MethodToolsList),
},
},
}
_, err = conn.ListTools(ctx, listRequest)
}
}
if err != nil {
chm.incrementFailures()
if chm.getConsecutiveFailures() >= chm.maxConsecutiveFailures {
chm.updateClientState(schemas.MCPConnectionStateDisconnected)
chm.mu.Lock()
if !chm.isReconnecting {
chm.isReconnecting = true
go chm.attemptReconnect()
}
chm.mu.Unlock()
}
} else {
chm.resetFailures()
chm.updateClientState(schemas.MCPConnectionStateConnected)
}
}
// attemptReconnect runs in a background goroutine and calls ReconnectClient,
// which internally applies full exponential backoff retry logic.
// On success the failure counter is reset; on failure the isReconnecting flag
// is cleared so the next health check cycle can try again.
func (chm *ClientHealthMonitor) attemptReconnect() {
defer func() {
chm.mu.Lock()
chm.isReconnecting = false
chm.mu.Unlock()
}()
chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)
if err := chm.manager.ReconnectClient(chm.clientID); err != nil {
chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, err)
return
}
chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
chm.resetFailures()
}
// updateClientState updates the client's connection state
func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
chm.manager.mu.Lock()
clientState, exists := chm.manager.clientMap[chm.clientID]
if !exists {
chm.manager.mu.Unlock()
return
}
// Only update if state changed
stateChanged := clientState.State != state
if stateChanged {
clientState.State = state
}
chm.manager.mu.Unlock()
// Log after releasing the lock
if stateChanged {
chm.logger.Info(fmt.Sprintf("%s Client %s connection state changed to: %s", MCPLogPrefix, clientState.ExecutionConfig.Name, state))
}
}
// incrementFailures increments the consecutive failure counter
func (chm *ClientHealthMonitor) incrementFailures() {
chm.mu.Lock()
defer chm.mu.Unlock()
chm.consecutiveFailures++
}
// resetFailures resets the consecutive failure counter
func (chm *ClientHealthMonitor) resetFailures() {
chm.mu.Lock()
defer chm.mu.Unlock()
chm.consecutiveFailures = 0
}
// getConsecutiveFailures returns the current consecutive failure count
func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
chm.mu.Lock()
defer chm.mu.Unlock()
return chm.consecutiveFailures
}
// HealthMonitorManager manages all client health monitors
type HealthMonitorManager struct {
monitors map[string]*ClientHealthMonitor
mu sync.RWMutex
}
// NewHealthMonitorManager creates a new health monitor manager
func NewHealthMonitorManager() *HealthMonitorManager {
return &HealthMonitorManager{
monitors: make(map[string]*ClientHealthMonitor),
}
}
// StartMonitoring starts monitoring a specific client
func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
hmm.mu.Lock()
defer hmm.mu.Unlock()
// Stop any existing monitor for this client
if existing, ok := hmm.monitors[monitor.clientID]; ok {
existing.Stop()
}
hmm.monitors[monitor.clientID] = monitor
monitor.Start()
}
// StopMonitoring stops monitoring a specific client
func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
hmm.mu.Lock()
defer hmm.mu.Unlock()
if monitor, ok := hmm.monitors[clientID]; ok {
monitor.Stop()
delete(hmm.monitors, clientID)
}
}
// StopAll stops all monitoring
func (hmm *HealthMonitorManager) StopAll() {
hmm.mu.Lock()
defer hmm.mu.Unlock()
for _, monitor := range hmm.monitors {
monitor.Stop()
}
hmm.monitors = make(map[string]*ClientHealthMonitor)
}