313 lines
8.4 KiB
Go
313 lines
8.4 KiB
Go
package mcp
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/mark3labs/mcp-go/client"
|
|
"github.com/mark3labs/mcp-go/mcp"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
)
|
|
|
|
// Health check configuration defaults.
const (
	// DefaultHealthCheckInterval is the interval between health checks.
	DefaultHealthCheckInterval = 10 * time.Second

	// DefaultHealthCheckTimeout is the timeout for each health check.
	DefaultHealthCheckTimeout = 5 * time.Second

	// MaxConsecutiveFailures is the number of failures before marking as unhealthy.
	MaxConsecutiveFailures = 5
)
|
|
|
|
// ClientHealthMonitor tracks the health status of an MCP client
|
|
type ClientHealthMonitor struct {
|
|
manager *MCPManager
|
|
clientID string
|
|
interval time.Duration
|
|
timeout time.Duration
|
|
maxConsecutiveFailures int
|
|
logger schemas.Logger
|
|
mu sync.Mutex
|
|
ticker *time.Ticker
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
isMonitoring bool
|
|
consecutiveFailures int
|
|
isPingAvailable bool // Whether the MCP server supports ping for health checks
|
|
isReconnecting bool // Whether a reconnection attempt is currently in progress
|
|
}
|
|
|
|
// NewClientHealthMonitor creates a new health monitor for an MCP client
|
|
func NewClientHealthMonitor(
|
|
manager *MCPManager,
|
|
clientID string,
|
|
interval time.Duration,
|
|
isPingAvailable bool,
|
|
logger schemas.Logger,
|
|
) *ClientHealthMonitor {
|
|
if interval == 0 {
|
|
interval = DefaultHealthCheckInterval
|
|
}
|
|
|
|
if logger == nil {
|
|
logger = defaultLogger
|
|
}
|
|
|
|
return &ClientHealthMonitor{
|
|
manager: manager,
|
|
clientID: clientID,
|
|
interval: interval,
|
|
timeout: DefaultHealthCheckTimeout,
|
|
maxConsecutiveFailures: MaxConsecutiveFailures,
|
|
logger: logger,
|
|
isMonitoring: false,
|
|
consecutiveFailures: 0,
|
|
isPingAvailable: isPingAvailable,
|
|
}
|
|
}
|
|
|
|
// Start begins monitoring the client's health in a background goroutine
|
|
func (chm *ClientHealthMonitor) Start() {
|
|
chm.mu.Lock()
|
|
defer chm.mu.Unlock()
|
|
|
|
if chm.isMonitoring {
|
|
return // Already monitoring
|
|
}
|
|
|
|
// Check client exists FIRST before allocating resources
|
|
chm.manager.mu.RLock()
|
|
clientState, exists := chm.manager.clientMap[chm.clientID]
|
|
chm.manager.mu.RUnlock()
|
|
|
|
if !exists {
|
|
// Use clientID for logging when client is missing
|
|
chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
|
|
return
|
|
}
|
|
|
|
// Now allocate resources (after validation)
|
|
chm.isMonitoring = true
|
|
chm.ctx, chm.cancel = context.WithCancel(context.Background())
|
|
chm.ticker = time.NewTicker(chm.interval)
|
|
|
|
go chm.monitorLoop()
|
|
chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, clientState.ExecutionConfig.Name)
|
|
}
|
|
|
|
// Stop stops monitoring the client's health
|
|
func (chm *ClientHealthMonitor) Stop() {
|
|
chm.mu.Lock()
|
|
defer chm.mu.Unlock()
|
|
|
|
if !chm.isMonitoring {
|
|
return // Not monitoring
|
|
}
|
|
|
|
// Always perform cleanup - do not access manager.clientMap here to avoid
|
|
// deadlock when Stop() is called from removeClientUnsafe() which already
|
|
// holds the manager's write lock
|
|
chm.isMonitoring = false
|
|
if chm.ticker != nil {
|
|
chm.ticker.Stop()
|
|
}
|
|
if chm.cancel != nil {
|
|
chm.cancel()
|
|
}
|
|
|
|
chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
|
|
}
|
|
|
|
// monitorLoop runs the health check loop
|
|
func (chm *ClientHealthMonitor) monitorLoop() {
|
|
for {
|
|
select {
|
|
case <-chm.ctx.Done():
|
|
return
|
|
case <-chm.ticker.C:
|
|
chm.performHealthCheck()
|
|
}
|
|
}
|
|
}
|
|
|
|
// performHealthCheck performs a health check on the client.
|
|
// On max consecutive failures it marks the client as disconnected and spawns
|
|
// a background reconnection attempt (with full retry backoff via ReconnectClient).
|
|
func (chm *ClientHealthMonitor) performHealthCheck() {
|
|
// Skip while a reconnection attempt is already in flight
|
|
chm.mu.Lock()
|
|
if chm.isReconnecting {
|
|
chm.mu.Unlock()
|
|
return
|
|
}
|
|
chm.mu.Unlock()
|
|
|
|
// Get the client connection — capture Conn while holding the lock so we
|
|
// don't race with removeClientUnsafe zeroing it under the write lock.
|
|
chm.manager.mu.RLock()
|
|
clientState, exists := chm.manager.clientMap[chm.clientID]
|
|
var conn *client.Client
|
|
if exists && clientState != nil {
|
|
conn = clientState.Conn
|
|
}
|
|
chm.manager.mu.RUnlock()
|
|
|
|
if !exists {
|
|
chm.Stop()
|
|
return
|
|
}
|
|
|
|
var err error
|
|
if conn == nil {
|
|
// No active connection — treat as a health check failure
|
|
err = fmt.Errorf("no active connection")
|
|
} else {
|
|
// Perform health check with timeout
|
|
ctx, cancel := context.WithTimeout(context.Background(), chm.timeout)
|
|
defer cancel()
|
|
|
|
if chm.isPingAvailable {
|
|
err = conn.Ping(ctx)
|
|
} else {
|
|
listRequest := mcp.ListToolsRequest{
|
|
PaginatedRequest: mcp.PaginatedRequest{
|
|
Request: mcp.Request{
|
|
Method: string(mcp.MethodToolsList),
|
|
},
|
|
},
|
|
}
|
|
_, err = conn.ListTools(ctx, listRequest)
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
chm.incrementFailures()
|
|
|
|
if chm.getConsecutiveFailures() >= chm.maxConsecutiveFailures {
|
|
chm.updateClientState(schemas.MCPConnectionStateDisconnected)
|
|
chm.mu.Lock()
|
|
if !chm.isReconnecting {
|
|
chm.isReconnecting = true
|
|
go chm.attemptReconnect()
|
|
}
|
|
chm.mu.Unlock()
|
|
}
|
|
} else {
|
|
chm.resetFailures()
|
|
chm.updateClientState(schemas.MCPConnectionStateConnected)
|
|
}
|
|
}
|
|
|
|
// attemptReconnect runs in a background goroutine and calls ReconnectClient,
|
|
// which internally applies full exponential backoff retry logic.
|
|
// On success the failure counter is reset; on failure the isReconnecting flag
|
|
// is cleared so the next health check cycle can try again.
|
|
func (chm *ClientHealthMonitor) attemptReconnect() {
|
|
defer func() {
|
|
chm.mu.Lock()
|
|
chm.isReconnecting = false
|
|
chm.mu.Unlock()
|
|
}()
|
|
|
|
chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)
|
|
|
|
if err := chm.manager.ReconnectClient(chm.clientID); err != nil {
|
|
chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, err)
|
|
return
|
|
}
|
|
|
|
chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
|
|
chm.resetFailures()
|
|
}
|
|
|
|
// updateClientState updates the client's connection state
|
|
func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
|
|
chm.manager.mu.Lock()
|
|
clientState, exists := chm.manager.clientMap[chm.clientID]
|
|
if !exists {
|
|
chm.manager.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Only update if state changed
|
|
stateChanged := clientState.State != state
|
|
if stateChanged {
|
|
clientState.State = state
|
|
}
|
|
chm.manager.mu.Unlock()
|
|
|
|
// Log after releasing the lock
|
|
if stateChanged {
|
|
chm.logger.Info(fmt.Sprintf("%s Client %s connection state changed to: %s", MCPLogPrefix, clientState.ExecutionConfig.Name, state))
|
|
}
|
|
}
|
|
|
|
// incrementFailures increments the consecutive failure counter
|
|
func (chm *ClientHealthMonitor) incrementFailures() {
|
|
chm.mu.Lock()
|
|
defer chm.mu.Unlock()
|
|
chm.consecutiveFailures++
|
|
}
|
|
|
|
// resetFailures resets the consecutive failure counter
|
|
func (chm *ClientHealthMonitor) resetFailures() {
|
|
chm.mu.Lock()
|
|
defer chm.mu.Unlock()
|
|
chm.consecutiveFailures = 0
|
|
}
|
|
|
|
// getConsecutiveFailures returns the current consecutive failure count
|
|
func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
|
|
chm.mu.Lock()
|
|
defer chm.mu.Unlock()
|
|
return chm.consecutiveFailures
|
|
}
|
|
|
|
// HealthMonitorManager manages all client health monitors
|
|
type HealthMonitorManager struct {
|
|
monitors map[string]*ClientHealthMonitor
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
// NewHealthMonitorManager creates a new health monitor manager
|
|
func NewHealthMonitorManager() *HealthMonitorManager {
|
|
return &HealthMonitorManager{
|
|
monitors: make(map[string]*ClientHealthMonitor),
|
|
}
|
|
}
|
|
|
|
// StartMonitoring starts monitoring a specific client
|
|
func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
|
|
hmm.mu.Lock()
|
|
defer hmm.mu.Unlock()
|
|
|
|
// Stop any existing monitor for this client
|
|
if existing, ok := hmm.monitors[monitor.clientID]; ok {
|
|
existing.Stop()
|
|
}
|
|
|
|
hmm.monitors[monitor.clientID] = monitor
|
|
monitor.Start()
|
|
}
|
|
|
|
// StopMonitoring stops monitoring a specific client
|
|
func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
|
|
hmm.mu.Lock()
|
|
defer hmm.mu.Unlock()
|
|
|
|
if monitor, ok := hmm.monitors[clientID]; ok {
|
|
monitor.Stop()
|
|
delete(hmm.monitors, clientID)
|
|
}
|
|
}
|
|
|
|
// StopAll stops all monitoring
|
|
func (hmm *HealthMonitorManager) StopAll() {
|
|
hmm.mu.Lock()
|
|
defer hmm.mu.Unlock()
|
|
|
|
for _, monitor := range hmm.monitors {
|
|
monitor.Stop()
|
|
}
|
|
hmm.monitors = make(map[string]*ClientHealthMonitor)
|
|
}
|