first commit
This commit is contained in:
312
core/mcp/healthmonitor.go
Normal file
312
core/mcp/healthmonitor.go
Normal file
@@ -0,0 +1,312 @@
|
||||
package mcp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/mark3labs/mcp-go/client"
|
||||
"github.com/mark3labs/mcp-go/mcp"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
)
|
||||
|
||||
const (
|
||||
// Health check configuration
|
||||
DefaultHealthCheckInterval = 10 * time.Second // Interval between health checks
|
||||
DefaultHealthCheckTimeout = 5 * time.Second // Timeout for each health check
|
||||
MaxConsecutiveFailures = 5 // Number of failures before marking as unhealthy
|
||||
)
|
||||
|
||||
// ClientHealthMonitor tracks the health status of an MCP client
|
||||
type ClientHealthMonitor struct {
|
||||
manager *MCPManager
|
||||
clientID string
|
||||
interval time.Duration
|
||||
timeout time.Duration
|
||||
maxConsecutiveFailures int
|
||||
logger schemas.Logger
|
||||
mu sync.Mutex
|
||||
ticker *time.Ticker
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
isMonitoring bool
|
||||
consecutiveFailures int
|
||||
isPingAvailable bool // Whether the MCP server supports ping for health checks
|
||||
isReconnecting bool // Whether a reconnection attempt is currently in progress
|
||||
}
|
||||
|
||||
// NewClientHealthMonitor creates a new health monitor for an MCP client
|
||||
func NewClientHealthMonitor(
|
||||
manager *MCPManager,
|
||||
clientID string,
|
||||
interval time.Duration,
|
||||
isPingAvailable bool,
|
||||
logger schemas.Logger,
|
||||
) *ClientHealthMonitor {
|
||||
if interval == 0 {
|
||||
interval = DefaultHealthCheckInterval
|
||||
}
|
||||
|
||||
if logger == nil {
|
||||
logger = defaultLogger
|
||||
}
|
||||
|
||||
return &ClientHealthMonitor{
|
||||
manager: manager,
|
||||
clientID: clientID,
|
||||
interval: interval,
|
||||
timeout: DefaultHealthCheckTimeout,
|
||||
maxConsecutiveFailures: MaxConsecutiveFailures,
|
||||
logger: logger,
|
||||
isMonitoring: false,
|
||||
consecutiveFailures: 0,
|
||||
isPingAvailable: isPingAvailable,
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins monitoring the client's health in a background goroutine
|
||||
func (chm *ClientHealthMonitor) Start() {
|
||||
chm.mu.Lock()
|
||||
defer chm.mu.Unlock()
|
||||
|
||||
if chm.isMonitoring {
|
||||
return // Already monitoring
|
||||
}
|
||||
|
||||
// Check client exists FIRST before allocating resources
|
||||
chm.manager.mu.RLock()
|
||||
clientState, exists := chm.manager.clientMap[chm.clientID]
|
||||
chm.manager.mu.RUnlock()
|
||||
|
||||
if !exists {
|
||||
// Use clientID for logging when client is missing
|
||||
chm.logger.Error("%s Health monitor failed to start for client %s, client not found in manager", MCPLogPrefix, chm.clientID)
|
||||
return
|
||||
}
|
||||
|
||||
// Now allocate resources (after validation)
|
||||
chm.isMonitoring = true
|
||||
chm.ctx, chm.cancel = context.WithCancel(context.Background())
|
||||
chm.ticker = time.NewTicker(chm.interval)
|
||||
|
||||
go chm.monitorLoop()
|
||||
chm.logger.Debug("%s Health monitor started for client %s", MCPLogPrefix, clientState.ExecutionConfig.Name)
|
||||
}
|
||||
|
||||
// Stop stops monitoring the client's health
|
||||
func (chm *ClientHealthMonitor) Stop() {
|
||||
chm.mu.Lock()
|
||||
defer chm.mu.Unlock()
|
||||
|
||||
if !chm.isMonitoring {
|
||||
return // Not monitoring
|
||||
}
|
||||
|
||||
// Always perform cleanup - do not access manager.clientMap here to avoid
|
||||
// deadlock when Stop() is called from removeClientUnsafe() which already
|
||||
// holds the manager's write lock
|
||||
chm.isMonitoring = false
|
||||
if chm.ticker != nil {
|
||||
chm.ticker.Stop()
|
||||
}
|
||||
if chm.cancel != nil {
|
||||
chm.cancel()
|
||||
}
|
||||
|
||||
chm.logger.Debug("%s Health monitor stopped for client %s", MCPLogPrefix, chm.clientID)
|
||||
}
|
||||
|
||||
// monitorLoop runs the health check loop
|
||||
func (chm *ClientHealthMonitor) monitorLoop() {
|
||||
for {
|
||||
select {
|
||||
case <-chm.ctx.Done():
|
||||
return
|
||||
case <-chm.ticker.C:
|
||||
chm.performHealthCheck()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// performHealthCheck performs a health check on the client.
|
||||
// On max consecutive failures it marks the client as disconnected and spawns
|
||||
// a background reconnection attempt (with full retry backoff via ReconnectClient).
|
||||
func (chm *ClientHealthMonitor) performHealthCheck() {
|
||||
// Skip while a reconnection attempt is already in flight
|
||||
chm.mu.Lock()
|
||||
if chm.isReconnecting {
|
||||
chm.mu.Unlock()
|
||||
return
|
||||
}
|
||||
chm.mu.Unlock()
|
||||
|
||||
// Get the client connection — capture Conn while holding the lock so we
|
||||
// don't race with removeClientUnsafe zeroing it under the write lock.
|
||||
chm.manager.mu.RLock()
|
||||
clientState, exists := chm.manager.clientMap[chm.clientID]
|
||||
var conn *client.Client
|
||||
if exists && clientState != nil {
|
||||
conn = clientState.Conn
|
||||
}
|
||||
chm.manager.mu.RUnlock()
|
||||
|
||||
if !exists {
|
||||
chm.Stop()
|
||||
return
|
||||
}
|
||||
|
||||
var err error
|
||||
if conn == nil {
|
||||
// No active connection — treat as a health check failure
|
||||
err = fmt.Errorf("no active connection")
|
||||
} else {
|
||||
// Perform health check with timeout
|
||||
ctx, cancel := context.WithTimeout(context.Background(), chm.timeout)
|
||||
defer cancel()
|
||||
|
||||
if chm.isPingAvailable {
|
||||
err = conn.Ping(ctx)
|
||||
} else {
|
||||
listRequest := mcp.ListToolsRequest{
|
||||
PaginatedRequest: mcp.PaginatedRequest{
|
||||
Request: mcp.Request{
|
||||
Method: string(mcp.MethodToolsList),
|
||||
},
|
||||
},
|
||||
}
|
||||
_, err = conn.ListTools(ctx, listRequest)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
chm.incrementFailures()
|
||||
|
||||
if chm.getConsecutiveFailures() >= chm.maxConsecutiveFailures {
|
||||
chm.updateClientState(schemas.MCPConnectionStateDisconnected)
|
||||
chm.mu.Lock()
|
||||
if !chm.isReconnecting {
|
||||
chm.isReconnecting = true
|
||||
go chm.attemptReconnect()
|
||||
}
|
||||
chm.mu.Unlock()
|
||||
}
|
||||
} else {
|
||||
chm.resetFailures()
|
||||
chm.updateClientState(schemas.MCPConnectionStateConnected)
|
||||
}
|
||||
}
|
||||
|
||||
// attemptReconnect runs in a background goroutine and calls ReconnectClient,
|
||||
// which internally applies full exponential backoff retry logic.
|
||||
// On success the failure counter is reset; on failure the isReconnecting flag
|
||||
// is cleared so the next health check cycle can try again.
|
||||
func (chm *ClientHealthMonitor) attemptReconnect() {
|
||||
defer func() {
|
||||
chm.mu.Lock()
|
||||
chm.isReconnecting = false
|
||||
chm.mu.Unlock()
|
||||
}()
|
||||
|
||||
chm.logger.Debug("%s Attempting to reconnect MCP client %s...", MCPLogPrefix, chm.clientID)
|
||||
|
||||
if err := chm.manager.ReconnectClient(chm.clientID); err != nil {
|
||||
chm.logger.Warn("%s Failed to reconnect MCP client %s: %v", MCPLogPrefix, chm.clientID, err)
|
||||
return
|
||||
}
|
||||
|
||||
chm.logger.Info("%s Successfully reconnected MCP client %s", MCPLogPrefix, chm.clientID)
|
||||
chm.resetFailures()
|
||||
}
|
||||
|
||||
// updateClientState updates the client's connection state
|
||||
func (chm *ClientHealthMonitor) updateClientState(state schemas.MCPConnectionState) {
|
||||
chm.manager.mu.Lock()
|
||||
clientState, exists := chm.manager.clientMap[chm.clientID]
|
||||
if !exists {
|
||||
chm.manager.mu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
// Only update if state changed
|
||||
stateChanged := clientState.State != state
|
||||
if stateChanged {
|
||||
clientState.State = state
|
||||
}
|
||||
chm.manager.mu.Unlock()
|
||||
|
||||
// Log after releasing the lock
|
||||
if stateChanged {
|
||||
chm.logger.Info(fmt.Sprintf("%s Client %s connection state changed to: %s", MCPLogPrefix, clientState.ExecutionConfig.Name, state))
|
||||
}
|
||||
}
|
||||
|
||||
// incrementFailures increments the consecutive failure counter
|
||||
func (chm *ClientHealthMonitor) incrementFailures() {
|
||||
chm.mu.Lock()
|
||||
defer chm.mu.Unlock()
|
||||
chm.consecutiveFailures++
|
||||
}
|
||||
|
||||
// resetFailures resets the consecutive failure counter
|
||||
func (chm *ClientHealthMonitor) resetFailures() {
|
||||
chm.mu.Lock()
|
||||
defer chm.mu.Unlock()
|
||||
chm.consecutiveFailures = 0
|
||||
}
|
||||
|
||||
// getConsecutiveFailures returns the current consecutive failure count
|
||||
func (chm *ClientHealthMonitor) getConsecutiveFailures() int {
|
||||
chm.mu.Lock()
|
||||
defer chm.mu.Unlock()
|
||||
return chm.consecutiveFailures
|
||||
}
|
||||
|
||||
// HealthMonitorManager manages all client health monitors
|
||||
type HealthMonitorManager struct {
|
||||
monitors map[string]*ClientHealthMonitor
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewHealthMonitorManager creates a new health monitor manager
|
||||
func NewHealthMonitorManager() *HealthMonitorManager {
|
||||
return &HealthMonitorManager{
|
||||
monitors: make(map[string]*ClientHealthMonitor),
|
||||
}
|
||||
}
|
||||
|
||||
// StartMonitoring starts monitoring a specific client
|
||||
func (hmm *HealthMonitorManager) StartMonitoring(monitor *ClientHealthMonitor) {
|
||||
hmm.mu.Lock()
|
||||
defer hmm.mu.Unlock()
|
||||
|
||||
// Stop any existing monitor for this client
|
||||
if existing, ok := hmm.monitors[monitor.clientID]; ok {
|
||||
existing.Stop()
|
||||
}
|
||||
|
||||
hmm.monitors[monitor.clientID] = monitor
|
||||
monitor.Start()
|
||||
}
|
||||
|
||||
// StopMonitoring stops monitoring a specific client
|
||||
func (hmm *HealthMonitorManager) StopMonitoring(clientID string) {
|
||||
hmm.mu.Lock()
|
||||
defer hmm.mu.Unlock()
|
||||
|
||||
if monitor, ok := hmm.monitors[clientID]; ok {
|
||||
monitor.Stop()
|
||||
delete(hmm.monitors, clientID)
|
||||
}
|
||||
}
|
||||
|
||||
// StopAll stops all monitoring
|
||||
func (hmm *HealthMonitorManager) StopAll() {
|
||||
hmm.mu.Lock()
|
||||
defer hmm.mu.Unlock()
|
||||
|
||||
for _, monitor := range hmm.monitors {
|
||||
monitor.Stop()
|
||||
}
|
||||
hmm.monitors = make(map[string]*ClientHealthMonitor)
|
||||
}
|
||||
Reference in New Issue
Block a user