bifrost/core/mcp/clientmanager.go

package mcp

import (
	"context"
	"fmt"
	"maps"
	"os"
	"slices"
	"strings"

	"github.com/mark3labs/mcp-go/client"
	"github.com/mark3labs/mcp-go/client/transport"
	"github.com/mark3labs/mcp-go/mcp"
	"github.com/mark3labs/mcp-go/server"
	"github.com/maximhq/bifrost/core/schemas"
)

// GetClients returns a point-in-time snapshot of every MCP client tracked by
// the manager. Each element is a value copy of the client's state that carries
// its own ToolMap, so adding or removing entries in the returned ToolMap does
// not touch the manager's map. Other reference-typed fields of the state are
// copied as-is and still refer to the data held by the manager.
//
// Returns:
//   - []schemas.MCPClientState: snapshot of all registered MCP clients
func (m *MCPManager) GetClients() []schemas.MCPClientState {
	m.mu.RLock()
	defer m.mu.RUnlock()

	snapshots := make([]schemas.MCPClientState, 0, len(m.clientMap))
	for _, state := range m.clientMap {
		entry := *state
		// A nil ToolMap stays nil; anything else is duplicated so the snapshot
		// does not share the live map.
		if state.ToolMap != nil {
			entry.ToolMap = maps.Clone(state.ToolMap)
		}
		snapshots = append(snapshots, entry)
	}
	return snapshots
}

// ReconnectClient re-establishes the connection of a registered MCP client
// using the configuration currently stored for it. The connection work,
// including retries, is delegated to connectToMCPClient.
//
// Only one reconnect per client ID may run at a time; a call that arrives
// while another reconnect for the same ID is in flight fails immediately.
//
// Parameters:
//   - id: ID of the client to reconnect
//
// Returns:
//   - error: the client is unknown, a reconnect is already running, or the
//     connection attempt failed
func (m *MCPManager) ReconnectClient(id string) error {
	// Read the stored config under the lock, then release it before doing any
	// connection work.
	var config *schemas.MCPClientConfig
	m.mu.Lock()
	state, found := m.clientMap[id]
	if found {
		config = state.ExecutionConfig
	}
	m.mu.Unlock()
	if !found {
		return fmt.Errorf("client %s not found", id)
	}

	// LoadOrStore is atomic: the first caller claims the slot for this ID and
	// every later caller (health monitor, manual API call, ...) is turned away
	// until the deferred Delete releases it.
	_, inProgress := m.reconnectingClients.LoadOrStore(id, true)
	if inProgress {
		return fmt.Errorf("reconnect already in progress for this client")
	}
	defer m.reconnectingClients.Delete(id)

	err := m.connectToMCPClient(config)
	if err != nil {
		return fmt.Errorf("failed to reconnect MCP client %s: %w", id, err)
	}
	return nil
}

// AddClient registers a new MCP client with the manager and, unless the client
// uses per-user OAuth, connects to it immediately.
//
// The configuration is validated first and rejected if a client with the same
// name or ID is already registered. A placeholder entry is then stored under
// config.ID while the connection is being set up.
//
// Per-user OAuth clients never get a persistent connection here. If the config
// carries previously discovered tools they are restored and the client is
// marked Connected; otherwise it is marked PendingTools until SetClientTools
// supplies them.
//
// For every other auth type the connection is made through connectToMCPClient.
// If that fails the placeholder entry is removed again, so a failed add leaves
// no entry behind.
//
// Parameters:
//   - config: MCP client configuration; the pointer is retained as the
//     client's ExecutionConfig
//
// Returns:
//   - error: Any error that occurred during client addition or connection
func (m *MCPManager) AddClient(config *schemas.MCPClientConfig) error {
	if err := validateMCPClientConfig(config); err != nil {
		return fmt.Errorf("invalid MCP client configuration: %w", err)
	}

	// Check if a client with the same name already exists (GetClientByName has its own lock)
	if client := m.GetClientByName(config.Name); client != nil {
		return fmt.Errorf("MCP client with name '%s' already exists", config.Name)
	}

	m.mu.Lock()

	if _, ok := m.clientMap[config.ID]; ok {
		m.mu.Unlock()
		return fmt.Errorf("client %s already exists", config.Name)
	}

	// Create placeholder entry
	m.clientMap[config.ID] = &schemas.MCPClientState{
		Name:            config.Name,
		ExecutionConfig: config,
		ToolMap:         make(map[string]schemas.ChatTool),
		ToolNameMapping: make(map[string]string),
		ConnectionInfo: &schemas.MCPClientConnectionInfo{
			Type: config.ConnectionType,
		},
	}

	// Release the lock for the connection attempt: connectToMCPClient acquires
	// it itself, so holding it here would deadlock.
	m.mu.Unlock()

	// Per-user OAuth: skip persistent connection. Auth is per-request at runtime.
	// The admin verifies the configuration via a sample login before this is called,
	// and tools are populated separately via SetClientTools().
	if config.AuthType == schemas.MCPAuthTypePerUserOauth {
		m.mu.Lock()
		defer m.mu.Unlock()

		client, exists := m.clientMap[config.ID]
		if !exists {
			// The placeholder was removed while the lock was released; nothing to fill in.
			return nil
		}
		if config.ConnectionString != nil {
			url := config.ConnectionString.GetValue()
			client.ConnectionInfo.ConnectionURL = &url
		}
		if len(config.DiscoveredTools) == 0 {
			client.State = schemas.MCPConnectionStatePendingTools
			m.logger.Info("%s Per-user OAuth MCP client '%s' registered (connection deferred to runtime)", MCPLogPrefix, config.Name)
			return nil
		}
		// Restore discovered tools from config (persisted in DB across restarts).
		// The name mapping is cloned so the client state and the stored config
		// do not share one mutable map.
		maps.Copy(client.ToolMap, config.DiscoveredTools)
		client.ToolNameMapping = maps.Clone(config.DiscoveredToolNameMapping)
		client.State = schemas.MCPConnectionStateConnected
		m.logger.Info("%s Per-user OAuth MCP client '%s' restored with %d tools", MCPLogPrefix, config.Name, len(config.DiscoveredTools))
		return nil
	}

	if err := m.connectToMCPClient(config); err != nil {
		// Clean up the failed entry — this is a user-initiated action (UI/API),
		// so surface the error cleanly rather than retaining a ghost entry.
		m.mu.Lock()
		delete(m.clientMap, config.ID)
		m.mu.Unlock()
		return fmt.Errorf("failed to connect to MCP client %s: %w", config.Name, err)
	}

	return nil
}

// VerifyPerUserOAuthConnection opens a short-lived MCP connection to the server
// in config, authenticated with the supplied bearer token, runs the MCP
// initialize handshake and lists the server's tools. The connection is closed
// before the method returns. It is used while setting up a per-user OAuth
// client, when the admin performs a test login to validate the OAuth
// configuration before the MCP client is saved.
//
// Parameters:
//   - ctx: parent context; the whole verification is additionally bounded by
//     MCPClientConnectionEstablishTimeout
//   - config: MCP client configuration (connection URL, name, etc.)
//   - accessToken: temporary OAuth access token from the admin's test login
//
// Returns:
//   - map[string]schemas.ChatTool: discovered tools keyed by prefixed name
//   - map[string]string: tool name mapping (sanitized → original MCP name)
//   - error: any error during verification
func (m *MCPManager) VerifyPerUserOAuthConnection(ctx context.Context, config *schemas.MCPClientConfig, accessToken string) (map[string]schemas.ChatTool, map[string]string, error) {
	var endpoint string
	if config.ConnectionString != nil {
		endpoint = config.ConnectionString.GetValue()
	}
	if endpoint == "" {
		return nil, nil, fmt.Errorf("connection URL is required for per-user OAuth verification")
	}

	// The admin's temporary token is sent as a Bearer header on every request.
	authHeaders := map[string]string{"Authorization": "Bearer " + accessToken}
	httpTransport, err := transport.NewStreamableHTTP(endpoint, transport.WithHTTPHeaders(authHeaders))
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create HTTP transport for verification: %w", err)
	}

	verifyClient := client.NewClient(httpTransport)
	verifyCtx, cancel := context.WithTimeout(ctx, MCPClientConnectionEstablishTimeout)
	defer cancel()

	err = verifyClient.Start(verifyCtx)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to start MCP connection for verification: %w", err)
	}
	// From here on the temporary connection is torn down on every return path.
	defer verifyClient.Close()

	handshake := mcp.InitializeRequest{
		Params: mcp.InitializeParams{
			ProtocolVersion: mcp.LATEST_PROTOCOL_VERSION,
			Capabilities:    mcp.ClientCapabilities{},
			ClientInfo: mcp.Implementation{
				Name:    fmt.Sprintf("Bifrost-%s-verify", config.Name),
				Version: "1.0.0",
			},
		},
	}
	_, err = verifyClient.Initialize(verifyCtx, handshake)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to initialize MCP connection for verification: %w", err)
	}

	discovered, nameMapping, err := retrieveExternalTools(verifyCtx, verifyClient, config.Name, m.logger)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to discover tools during verification: %w", err)
	}

	m.logger.Info("%s Per-user OAuth verification succeeded for '%s': discovered %d tools", MCPLogPrefix, config.Name, len(discovered))
	return discovered, nameMapping, nil
}

// SetClientTools merges the given tools into an existing client's tool map,
// replaces its tool name mapping and marks the client as Connected. It is the
// counterpart to per-user OAuth verification, where tools are discovered
// separately from client creation. Unknown client IDs are ignored.
//
// Parameters:
//   - clientID: ID of the client to update
//   - tools: discovered tools keyed by prefixed name; added on top of any
//     tools the client already has
//   - toolNameMapping: mapping from sanitized tool names to original MCP
//     names; stored as-is, not copied
func (m *MCPManager) SetClientTools(clientID string, tools map[string]schemas.ChatTool, toolNameMapping map[string]string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	state, found := m.clientMap[clientID]
	if !found {
		return
	}

	maps.Copy(state.ToolMap, tools)
	state.ToolNameMapping = toolNameMapping
	state.State = schemas.MCPConnectionStateConnected
	m.logger.Debug("%s Set %d tools on client '%s'", MCPLogPrefix, len(tools), state.Name)
}

// RemoveClient takes the manager's write lock and removes the client with the
// given ID. The actual teardown is done by removeClientUnsafe.
//
// Parameters:
//   - id: ID of the client to remove
//
// Returns:
//   - error: whatever removeClientUnsafe reports, e.g. an unknown client ID
func (m *MCPManager) RemoveClient(id string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if err := m.removeClientUnsafe(id); err != nil {
		return err
	}
	return nil
}

// removeClientUnsafe tears down a client and drops it from the client map. It
// does not lock; the caller must already hold the manager's write lock.
//
// Teardown order: stop health monitoring, stop tool syncing, invoke the stored
// cancel function (if any), close the transport connection (if any), reset the
// tool map, and finally delete the map entry. A failure to close the
// connection is logged and does not abort the removal.
//
// Parameters:
//   - id: ID of the client to remove
//
// Returns:
//   - error: non-nil only when no client with the given ID exists
func (m *MCPManager) removeClientUnsafe(id string) error {
	state, found := m.clientMap[id]
	if !found {
		return fmt.Errorf("client %s not found", id)
	}
	serverName := state.ExecutionConfig.Name

	m.logger.Info("%s Disconnecting MCP server '%s'", MCPLogPrefix, serverName)

	// Background workers go first so they do not act on a half-removed client.
	m.healthMonitorManager.StopMonitoring(id)
	m.logger.Debug("%s Stopped health monitoring for MCP server '%s'", MCPLogPrefix, serverName)

	m.toolSyncManager.StopSyncing(id)
	m.logger.Debug("%s Stopped tool syncing for MCP server '%s'", MCPLogPrefix, serverName)

	// Cancel the connection context when one was stored for this client.
	if cancelConn := state.CancelFunc; cancelConn != nil {
		cancelConn()
		state.CancelFunc = nil
	}
	m.logger.Debug("%s Cancelled SSE context for MCP server '%s'", MCPLogPrefix, serverName)

	// Closing the client connection covers every transport type.
	if state.Conn != nil {
		closeErr := state.Conn.Close()
		if closeErr != nil {
			m.logger.Error("%s Failed to close MCP server '%s': %v", MCPLogPrefix, serverName, closeErr)
		}
		state.Conn = nil
	}
	m.logger.Debug("%s Closed client transport connection for MCP server '%s'", MCPLogPrefix, serverName)

	// Anyone still holding the state pointer sees an empty tool set.
	state.ToolMap = make(map[string]schemas.ChatTool)
	delete(m.clientMap, id)

	return nil
}

// UpdateClient replaces the editable settings of an existing MCP client.
//
// The connection identity of a client is fixed: an update that tries to change
// the connection type, connection string, stdio config or in-process server is
// rejected. All other listed settings are taken from updatedConfig and stored
// in a freshly allocated config that replaces the client's ExecutionConfig.
//
// If the name changes, every ToolMap key that starts with "<oldName>-" is
// re-keyed to "<newName>-", and the function name of each function tool is set
// to its new key.
//
// This method does not contact the MCP server and does not refresh the
// client's tool list. To refresh the tool list, use ReconnectClient.
//
// Parameters:
//   - id: ID of the client to edit
//   - updatedConfig: Updated client configuration with new settings
//
// Returns:
//   - error: the client is unknown, the new name is invalid, or the update
//     attempts to change an immutable connection setting
func (m *MCPManager) UpdateClient(id string, updatedConfig *schemas.MCPClientConfig) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	client, ok := m.clientMap[id]
	if !ok {
		return fmt.Errorf("client %s not found", id)
	}

	if err := ValidateMCPClientName(updatedConfig.Name); err != nil {
		return fmt.Errorf("invalid MCP client configuration: %w", err)
	}

	// Connection settings are immutable. An unset value in updatedConfig means
	// "leave unchanged" and is therefore accepted.
	current := client.ExecutionConfig
	if updatedConfig.ConnectionType != "" && updatedConfig.ConnectionType != current.ConnectionType {
		return fmt.Errorf("connection type cannot be updated for client %s", id)
	}
	if updatedConfig.ConnectionString != nil && !updatedConfig.ConnectionString.Equals(current.ConnectionString) {
		return fmt.Errorf("connection string cannot be updated for client %s", id)
	}
	if updatedConfig.StdioConfig != nil && !stdioConfigEqual(updatedConfig.StdioConfig, current.StdioConfig) {
		return fmt.Errorf("stdio config cannot be updated for client %s", id)
	}
	if updatedConfig.InProcessServer != nil && updatedConfig.InProcessServer != current.InProcessServer {
		return fmt.Errorf("in-process server cannot be updated for client %s", id)
	}

	oldName := current.Name

	// Create a new config struct (immutable pattern) to avoid race conditions
	// with concurrent reads. Any snapshot holding the old ExecutionConfig pointer
	// will continue to see consistent data.
	// NOTE(review): fields not listed here (for example DiscoveredTools and
	// DiscoveredToolNameMapping, which AddClient reads) are left at their zero
	// value in the new config — confirm that is intended.
	newConfig := &schemas.MCPClientConfig{
		// Immutable fields - copy from existing config
		ID:               current.ID,
		ConnectionType:   current.ConnectionType,
		ConnectionString: current.ConnectionString,
		StdioConfig:      current.StdioConfig,
		AuthType:         current.AuthType,
		OauthConfigID:    current.OauthConfigID,
		State:            current.State,
		InProcessServer:  current.InProcessServer,
		ConfigHash:       current.ConfigHash,
		ToolPricing:      maps.Clone(current.ToolPricing),
		// Updatable fields - copy from updated config with proper cloning
		Name:                  updatedConfig.Name,
		IsCodeModeClient:      updatedConfig.IsCodeModeClient,
		Headers:               maps.Clone(updatedConfig.Headers),
		ToolsToExecute:        slices.Clone(updatedConfig.ToolsToExecute),
		ToolsToAutoExecute:    slices.Clone(updatedConfig.ToolsToAutoExecute),
		AllowedExtraHeaders:   slices.Clone(updatedConfig.AllowedExtraHeaders),
		IsPingAvailable:       updatedConfig.IsPingAvailable,
		ToolSyncInterval:      updatedConfig.ToolSyncInterval,
		AllowOnAllVirtualKeys: updatedConfig.AllowOnAllVirtualKeys,
	}

	// Swap in the new config; the old one is left untouched.
	client.ExecutionConfig = newConfig

	if oldName == updatedConfig.Name {
		return nil
	}

	// The client was renamed: rebuild the ToolMap with the new name prefix.
	// NOTE(review): ToolNameMapping is not rewritten here — confirm its keys
	// do not carry the client-name prefix.
	oldPrefix := oldName + "-"
	newPrefix := updatedConfig.Name + "-"

	newToolMap := make(map[string]schemas.ChatTool, len(client.ToolMap))
	for oldToolName, tool := range client.ToolMap {
		newToolName := oldToolName
		if suffix, hadPrefix := strings.CutPrefix(oldToolName, oldPrefix); hadPrefix {
			newToolName = newPrefix + suffix
		}

		if tool.Function != nil {
			// tool is a value copy but Function is a pointer shared with the
			// old ToolMap and with snapshots returned by GetClients. Copy the
			// function struct before renaming so those readers keep seeing the
			// old, consistent name.
			renamedFunction := *tool.Function
			renamedFunction.Name = newToolName
			tool.Function = &renamedFunction
		}
		newToolMap[newToolName] = tool
	}

	client.ToolMap = newToolMap
	client.Name = updatedConfig.Name

	return nil
}

// stdioConfigEqual reports whether two stdio configs describe the same
// command: identical Command, and Args and Envs that match element by element
// in the same order. Two nil configs are equal; a nil and a non-nil config are
// not. A nil slice and an empty slice compare as equal.
func stdioConfigEqual(a, b *schemas.MCPStdioConfig) bool {
	switch {
	case a == nil || b == nil:
		return a == b
	case a.Command != b.Command:
		return false
	}
	return slices.Equal(a.Args, b.Args) && slices.Equal(a.Envs, b.Envs)
}

// RegisterTool registers a tool with the local (in-process) MCP server and
// records its schema on the internal Bifrost client.
//
// The local host is set up on demand. The tool is added to the local server
// under its plain name, while the schema is stored in the internal client's
// ToolMap under the prefixed key "<BifrostMCPClientKey>-<name>", matching the
// naming used for external tools. Registering a name twice is an error.
//
// The caller's toolSchema is not modified: the function definition is copied
// before its name is rewritten to the prefixed form.
//
// Parameters:
//   - name: Unique tool name; must be non-blank, must not contain hyphens or
//     spaces, and must not start with a digit
//   - description: Human-readable tool description
//   - toolFunction: Handler invoked with the arguments of each tool call; an
//     error it returns is reported to the caller as an MCP tool error result
//   - toolSchema: Bifrost tool schema for function calling; its Function must
//     be set
//
// Returns:
//   - error: Any registration error
func (m *MCPManager) RegisterTool(name, description string, toolFunction MCPToolFunction[any], toolSchema schemas.ChatTool) error {
	// Ensure local server is set up
	if err := m.setupLocalHost(); err != nil {
		return fmt.Errorf("failed to setup local host: %w", err)
	}

	// Validate tool name
	if strings.TrimSpace(name) == "" {
		return fmt.Errorf("tool name is required")
	}
	if strings.Contains(name, "-") {
		return fmt.Errorf("tool name cannot contain hyphens")
	}
	if strings.Contains(name, " ") {
		return fmt.Errorf("tool name cannot contain spaces")
	}
	// name is known to be non-empty here, so indexing the first byte is safe.
	if name[0] >= '0' && name[0] <= '9' {
		return fmt.Errorf("tool name cannot start with a number")
	}
	// Reject a schema without a function definition up front. Otherwise the
	// name rewrite below would dereference nil after the tool had already been
	// added to the server.
	if toolSchema.Function == nil {
		return fmt.Errorf("tool schema must include a function definition")
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	// Verify internal client exists
	internalClient, ok := m.clientMap[BifrostMCPClientKey]
	if !ok {
		return fmt.Errorf("bifrost client not found")
	}

	// Create prefixed tool name for consistency with external tools
	prefixedToolName := fmt.Sprintf("%s-%s", BifrostMCPClientKey, name)

	// Check if tool name already exists to prevent silent overwrites
	if _, exists := internalClient.ToolMap[prefixedToolName]; exists {
		return fmt.Errorf("tool '%s' is already registered", name)
	}

	m.logger.Debug("%s Registering typed tool: %s -> prefixed as %s (client: %s)", MCPLogPrefix, name, prefixedToolName, BifrostMCPClientKey)
	m.logger.Info("%s Registering typed tool: %s", MCPLogPrefix, name)

	// Adapter between the MCP server's handler signature and toolFunction.
	// Handler errors become tool-level error results rather than Go errors.
	mcpHandler := func(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
		args := request.GetArguments()
		result, err := toolFunction(args)
		if err != nil {
			return mcp.NewToolResultError(fmt.Sprintf("Error: %s", err.Error())), nil
		}
		return mcp.NewToolResultText(result), nil
	}

	// Register the tool with the local MCP server using AddTool (unprefixed)
	if m.server != nil {
		tool := mcp.NewTool(name, mcp.WithDescription(description))
		m.server.AddTool(tool, mcpHandler)
	}

	// Store the schema under the prefixed name. toolSchema is a value copy but
	// Function is a pointer, so the function definition is duplicated before
	// renaming to leave the caller's schema untouched.
	prefixedFunction := *toolSchema.Function
	prefixedFunction.Name = prefixedToolName
	toolSchema.Function = &prefixedFunction
	internalClient.ToolMap[prefixedToolName] = toolSchema

	return nil
}

// ============================================================================
// CONNECTION HELPER METHODS
// ============================================================================

// connectToMCPClient connects to the MCP server described by config, performs
// the MCP initialize handshake, loads the server's tools and records the result
// in the manager's client map under config.ID.
//
// The work is split so that no network or process I/O happens under the
// manager lock:
//
//  1. Under the lock: cancel and close any connection already stored for
//     config.ID, then install a fresh entry in Disconnected state.
//  2. Without the lock: create and start the transport, then initialize the
//     client. Both steps run through ExecuteWithRetry with DefaultRetryConfig,
//     and every transport attempt uses a newly created client.
//  3. Without the lock: retrieve the tool list. A failure here is logged and
//     the connection proceeds with an empty tool set.
//  4. Under the lock: store the connection, connection info, tools and state
//     on the entry. If the entry was removed in the meantime, the new
//     connection is torn down and an error is returned.
//  5. Without the lock: register the SSE connection-lost hook and start health
//     monitoring and tool syncing.
//
// SSE and STDIO connections are started with a cancellable context that has no
// deadline; its cancel function is stored on the client entry for later
// cleanup. Other connection types use a context bounded by
// MCPClientConnectionEstablishTimeout.
//
// Returns an error, wrapping the underlying cause, when the transport cannot
// be started, initialization fails, or the client entry disappears during
// setup.
func (m *MCPManager) connectToMCPClient(config *schemas.MCPClientConfig) error {
	// SSE and STDIO share the long-lived-context handling throughout.
	isLongLived := config.ConnectionType == schemas.MCPConnectionTypeSSE || config.ConnectionType == schemas.MCPConnectionTypeSTDIO

	// First lock: tear down any previous connection and install a fresh entry
	m.mu.Lock()

	if existingClient, exists := m.clientMap[config.ID]; exists {
		if existingClient.CancelFunc != nil {
			existingClient.CancelFunc()
			existingClient.CancelFunc = nil
		}
		if existingClient.Conn != nil {
			if closeErr := existingClient.Conn.Close(); closeErr != nil {
				m.logger.Warn("%s Failed to close previous connection for MCP server '%s': %v", MCPLogPrefix, config.Name, closeErr)
			}
		}
		// The old entry is replaced just below, so nothing else on it needs updating.
	}
	// Create new client entry with configuration.
	// Initialize State to Disconnected so the API never returns an empty state
	// during connection attempts; it transitions to Connected only on success.
	m.clientMap[config.ID] = &schemas.MCPClientState{
		Name:            config.Name,
		ExecutionConfig: config,
		State:           schemas.MCPConnectionStateDisconnected,
		ToolMap:         make(map[string]schemas.ChatTool),
		ToolNameMapping: make(map[string]string),
		ConnectionInfo: &schemas.MCPClientConnectionInfo{
			Type: config.ConnectionType,
		},
	}
	m.mu.Unlock()

	// Heavy operations performed outside lock
	var externalClient *client.Client
	var connectionInfo *schemas.MCPClientConnectionInfo
	var err error

	// For SSE and STDIO connections, we need a long-lived context for the connection
	// but use a timeout context for the initialization phase to prevent indefinite hangs
	var ctx context.Context
	var cancel context.CancelFunc
	var longLivedCtx context.Context
	var longLivedCancel context.CancelFunc

	if isLongLived {
		// Use context.Background() so the connection does not inherit a
		// deadline or cancellation from m.ctx.
		longLivedCtx, longLivedCancel = context.WithCancel(context.Background())
		ctx = longLivedCtx
		cancel = longLivedCancel
	} else {
		// Other connection types (HTTP) can use timeout context
		ctx, cancel = context.WithTimeout(m.ctx, MCPClientConnectionEstablishTimeout)
		defer cancel()
	}

	// closeExternalClient releases the most recently created client, if any,
	// logging with the given phase name instead of failing.
	closeExternalClient := func(phase string) {
		if externalClient == nil {
			return
		}
		if closeErr := externalClient.Close(); closeErr != nil {
			m.logger.Warn("%s Failed to close external client during %s: %v", MCPLogPrefix, phase, closeErr)
		}
	}

	// Start the transport first (required for STDIO and SSE clients) with retry logic
	// Each retry attempt uses a fresh client instance to avoid resource leaks
	m.logger.Debug("%s [%s] Starting transport...", MCPLogPrefix, config.Name)
	transportRetryConfig := DefaultRetryConfig
	err = ExecuteWithRetry(
		m.ctx,
		func() error {
			// Close previous client if this is a retry attempt
			closeExternalClient("retry")

			// Create a fresh client for this attempt
			var createErr error
			switch config.ConnectionType {
			case schemas.MCPConnectionTypeHTTP:
				externalClient, connectionInfo, createErr = m.createHTTPConnection(m.ctx, config)
			case schemas.MCPConnectionTypeSTDIO:
				externalClient, connectionInfo, createErr = m.createSTDIOConnection(m.ctx, config)
			case schemas.MCPConnectionTypeSSE:
				externalClient, connectionInfo, createErr = m.createSSEConnection(m.ctx, config)
			case schemas.MCPConnectionTypeInProcess:
				externalClient, connectionInfo, createErr = m.createInProcessConnection(m.ctx, config)
			default:
				return fmt.Errorf("unknown connection type: %s", config.ConnectionType)
			}
			if createErr != nil {
				return createErr
			}

			var perAttemptCtx context.Context
			if isLongLived {
				// For STDIO/SSE: use longLivedCtx directly without additional timeout.
				// It must stay valid for the entire connection lifetime, so it is
				// not cancelled here.
				perAttemptCtx = longLivedCtx
				m.logger.Debug("%s [%s] Starting transport...", MCPLogPrefix, config.Name)
			} else {
				// HTTP already has timeout
				perAttemptCtx = ctx
			}
			return externalClient.Start(perAttemptCtx)
		},
		transportRetryConfig,
		m.logger,
	)
	if err != nil {
		if isLongLived {
			cancel() // Cancel long-lived context on error
		}
		// Close external client connection to prevent transport/goroutine leaks
		closeExternalClient("cleanup")
		// %w keeps the cause inspectable through the wrapping done by callers.
		return fmt.Errorf("failed to start MCP client transport %s after %d retries: %w", config.Name, transportRetryConfig.MaxRetries, err)
	}
	m.logger.Debug("%s [%s] Transport started successfully", MCPLogPrefix, config.Name)

	// Create proper initialize request for external client
	extInitRequest := mcp.InitializeRequest{
		Params: mcp.InitializeParams{
			ProtocolVersion: mcp.LATEST_PROTOCOL_VERSION,
			Capabilities:    mcp.ClientCapabilities{},
			ClientInfo: mcp.Implementation{
				Name:    fmt.Sprintf("Bifrost-%s", config.Name),
				Version: "1.0.0",
			},
		},
	}

	// Initialize client with retry logic
	initRetryConfig := DefaultRetryConfig
	err = ExecuteWithRetry(
		m.ctx,
		func() error {
			initCtx := ctx
			if isLongLived {
				// STDIO/SSE: bound only the initialization phase. The connection
				// itself keeps running on the long-lived context.
				var initCancel context.CancelFunc
				initCtx, initCancel = context.WithTimeout(longLivedCtx, MCPClientConnectionEstablishTimeout)
				defer initCancel()
				m.logger.Debug("%s [%s] Initializing client with %v timeout...", MCPLogPrefix, config.Name, MCPClientConnectionEstablishTimeout)
			}
			_, initErr := externalClient.Initialize(initCtx, extInitRequest)
			return initErr
		},
		initRetryConfig,
		m.logger,
	)
	if err != nil {
		if isLongLived {
			cancel() // Cancel long-lived context on error
		}
		// Close external client connection to prevent transport/goroutine leaks
		closeExternalClient("cleanup")
		return fmt.Errorf("failed to initialize MCP client %s after %d retries: %w", config.Name, initRetryConfig.MaxRetries, err)
	}
	m.logger.Debug("%s [%s] Client initialized successfully", MCPLogPrefix, config.Name)

	// Retrieve tools from the external server (this also requires network I/O)
	// Use a bounded timeout context to prevent indefinite hangs during tool retrieval.
	// For STDIO/SSE, ctx is longLivedCtx (no timeout), so we create a separate one here.
	m.logger.Debug("%s [%s] Retrieving tools...", MCPLogPrefix, config.Name)
	toolRetrievalCtx, toolRetrievalCancel := context.WithTimeout(m.ctx, MCPClientConnectionEstablishTimeout)
	defer toolRetrievalCancel()
	tools, toolNameMapping, err := retrieveExternalTools(toolRetrievalCtx, externalClient, config.Name, m.logger)
	if err != nil {
		m.logger.Warn("%s Failed to retrieve tools from %s: %v", MCPLogPrefix, config.Name, err)
		// Continue with connection even if tool retrieval fails
		tools = make(map[string]schemas.ChatTool)
		toolNameMapping = make(map[string]string)
	}
	m.logger.Debug("%s [%s] Retrieved %d tools", MCPLogPrefix, config.Name, len(tools))

	// Second lock: Update client with final connection details and tools
	m.mu.Lock()

	// Verify client still exists (could have been cleaned up during heavy operations)
	client, exists := m.clientMap[config.ID]
	if !exists {
		// Release lock before cleanup and return
		m.mu.Unlock()
		// The client was removed during connection setup: release everything
		// that was created for it.
		if isLongLived && cancel != nil {
			cancel()
		}
		closeExternalClient("cleanup")
		return fmt.Errorf("client %s was removed during connection setup", config.Name)
	}

	// Store the external client connection and details
	client.Conn = externalClient
	client.ConnectionInfo = connectionInfo
	client.State = schemas.MCPConnectionStateConnected

	// Store cancel function for SSE and STDIO connections to enable proper cleanup
	if isLongLived {
		client.CancelFunc = cancel
	}

	// Store discovered tools
	for toolName, tool := range tools {
		client.ToolMap[toolName] = tool
	}

	// Store tool name mapping for execution (sanitized_name -> original_mcp_name)
	client.ToolNameMapping = toolNameMapping

	m.logger.Debug("%s [%s] Registering %d tools. Client config - ID: %s, Name: %s, IsCodeModeClient: %v", MCPLogPrefix, config.Name, len(tools), config.ID, config.Name, config.IsCodeModeClient)
	m.logger.Info("%s Connected to MCP server '%s'", MCPLogPrefix, config.Name)

	// Release lock BEFORE starting monitors to prevent deadlock
	// (StartMonitoring -> Start() tries to acquire RLock on the same mutex)
	m.mu.Unlock()

	// Register OnConnectionLost hook for SSE connections to detect idle timeouts
	if config.ConnectionType == schemas.MCPConnectionTypeSSE && externalClient != nil {
		externalClient.OnConnectionLost(func(err error) {
			m.logger.Warn("%s SSE connection lost for MCP server '%s': %v", MCPLogPrefix, config.Name, err)
			// Update state to disconnected
			m.mu.Lock()
			if client, exists := m.clientMap[config.ID]; exists {
				client.State = schemas.MCPConnectionStateDisconnected
			}
			m.mu.Unlock()
		})
	}

	// Start health monitoring for the client; ping is assumed available unless
	// the config says otherwise.
	isPingAvailable := true
	if config.IsPingAvailable != nil {
		isPingAvailable = *config.IsPingAvailable
	}
	monitor := NewClientHealthMonitor(m, config.ID, DefaultHealthCheckInterval, isPingAvailable, m.logger)
	m.healthMonitorManager.StartMonitoring(monitor)

	// Start tool syncing for the client (skip for internal bifrost client)
	if config.ID != BifrostMCPClientKey {
		syncInterval := ResolveToolSyncInterval(config, m.toolSyncManager.GetGlobalInterval())
		if syncInterval > 0 {
			syncer := NewClientToolSyncer(m, config.ID, config.Name, syncInterval, m.logger)
			m.toolSyncManager.StartSyncing(syncer)
		}
	}

	return nil
}

// createHTTPConnection builds an MCP client on top of a StreamableHTTP
// transport for the URL in config.ConnectionString, with request headers
// obtained from config.HttpHeaders. The client is returned unstarted. The
// manager lock is not taken here.
//
// Returns the client, the connection info to display for it, or an error when
// the connection string is missing, the headers cannot be resolved, or the
// transport cannot be created.
func (m *MCPManager) createHTTPConnection(ctx context.Context, config *schemas.MCPClientConfig) (*client.Client, *schemas.MCPClientConnectionInfo, error) {
	if config.ConnectionString == nil {
		return nil, nil, fmt.Errorf("HTTP connection string is required")
	}

	info := &schemas.MCPClientConnectionInfo{
		Type:          config.ConnectionType,
		ConnectionURL: config.ConnectionString.GetValuePtr(),
	}

	requestHeaders, headerErr := config.HttpHeaders(ctx, m.oauth2Provider)
	if headerErr != nil {
		return nil, nil, fmt.Errorf("failed to get HTTP headers: %w", headerErr)
	}

	endpoint := config.ConnectionString.GetValue()
	httpTransport, transportErr := transport.NewStreamableHTTP(endpoint, transport.WithHTTPHeaders(requestHeaders))
	if transportErr != nil {
		return nil, nil, fmt.Errorf("failed to create HTTP transport: %w", transportErr)
	}

	return client.NewClient(httpTransport), info, nil
}

// createSTDIOConnection builds an MCP client backed by a STDIO transport.
// It does not acquire the manager lock, and it only constructs the client: no
// start or initialize call is made on it here.
//
// Parameters:
//   - config: Client configuration; StdioConfig must be non-nil
//
// Returns:
//   - *client.Client: Client wrapping the new STDIO transport
//   - *schemas.MCPClientConnectionInfo: Connection type and the command string
//     ("<command> <args joined by spaces>") for display
//   - error: Set when StdioConfig is missing or any name listed in
//     StdioConfig.Envs resolves to an empty value in the process environment
func (m *MCPManager) createSTDIOConnection(_ context.Context, config *schemas.MCPClientConfig) (*client.Client, *schemas.MCPClientConnectionInfo, error) {
	stdio := config.StdioConfig
	if stdio == nil {
		return nil, nil, fmt.Errorf("stdio config is required")
	}

	// Human-readable form of the command line, stored in the connection info.
	display := stdio.Command + " " + strings.Join(stdio.Args, " ")

	// Every listed environment variable must have a non-empty value in this
	// process; os.Getenv returns "" for both unset and empty variables.
	for _, name := range stdio.Envs {
		if os.Getenv(name) == "" {
			return nil, nil, fmt.Errorf("environment variable %s is not set for MCP client %s", name, config.Name)
		}
	}

	stdioTransport := transport.NewStdio(stdio.Command, stdio.Envs, stdio.Args...)

	info := &schemas.MCPClientConnectionInfo{
		Type:               config.ConnectionType,
		StdioCommandString: &display,
	}

	return client.NewClient(stdioTransport), info, nil
}

// createSSEConnection builds an MCP client backed by an SSE transport.
// It does not acquire the manager lock, and it only constructs the client: no
// start or initialize call is made on it here.
//
// Parameters:
//   - ctx: Context forwarded to config.HttpHeaders when resolving request headers
//   - config: Client configuration; ConnectionString must be non-nil
//
// Returns:
//   - *client.Client: Client wrapping the new SSE transport
//   - *schemas.MCPClientConnectionInfo: Connection type and URL for the client
//   - error: Set when the connection string is missing, headers cannot be
//     resolved, or the transport cannot be created
func (m *MCPManager) createSSEConnection(ctx context.Context, config *schemas.MCPClientConfig) (*client.Client, *schemas.MCPClientConnectionInfo, error) {
	if config.ConnectionString == nil {
		return nil, nil, fmt.Errorf("SSE connection string is required")
	}

	// The SSE endpoint is reported through the same ConnectionURL field that
	// the HTTP connection type uses.
	info := &schemas.MCPClientConnectionInfo{
		Type:          config.ConnectionType,
		ConnectionURL: config.ConnectionString.GetValuePtr(),
	}

	// Headers are resolved through the config using the manager's OAuth2 provider.
	reqHeaders, err := config.HttpHeaders(ctx, m.oauth2Provider)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to get HTTP headers: %w", err)
	}

	sse, err := transport.NewSSE(
		config.ConnectionString.GetValue(),
		transport.WithHeaders(reqHeaders),
	)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create SSE transport: %w", err)
	}

	return client.NewClient(sse), info, nil
}

// createInProcessConnection builds an MCP client wired directly to an MCP server
// instance living in the same process. It does not acquire the manager lock.
//
// Parameters:
//   - config: Client configuration; InProcessServer must be non-nil
//
// Returns:
//   - *client.Client: In-process client bound to config.InProcessServer
//   - *schemas.MCPClientConnectionInfo: Connection info carrying only the type
//   - error: Set when no server instance is provided or the in-process client
//     cannot be created
func (m *MCPManager) createInProcessConnection(_ context.Context, config *schemas.MCPClientConfig) (*client.Client, *schemas.MCPClientConnectionInfo, error) {
	srv := config.InProcessServer
	if srv == nil {
		return nil, nil, fmt.Errorf("InProcess connection requires a server instance")
	}

	conn, err := client.NewInProcessClient(srv)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create in-process client: %w", err)
	}

	// There is no URL or command to report for an in-process connection.
	info := &schemas.MCPClientConnectionInfo{Type: config.ConnectionType}

	return conn, info, nil
}

// ============================================================================
// LOCAL MCP SERVER AND CLIENT MANAGEMENT
// ============================================================================

// setupLocalHost makes sure the local MCP server and its Bifrost client entry
// exist and are started. When the server is already set and marked running it
// returns immediately; otherwise it creates a server and a placeholder client,
// stores them on the manager, and delegates to startLocalMCPServer.
//
// Returns:
//   - error: Any error from creating the server or client, or from starting them
func (m *MCPManager) setupLocalHost() error {
	// isUp reports whether the local server is set and marked running.
	// It reads manager state, so it must only be called with m.mu held.
	isUp := func() bool { return m.server != nil && m.serverRunning }

	// Fast path: nothing to do when the server is already up.
	m.mu.Lock()
	up := isUp()
	m.mu.Unlock()
	if up {
		return nil
	}

	// Build the new objects without holding the lock.
	localServer, err := m.createLocalMCPServer()
	if err != nil {
		return fmt.Errorf("failed to create local MCP server: %w", err)
	}

	localClient, err := m.createLocalMCPClient()
	if err != nil {
		return fmt.Errorf("failed to create local MCP client: %w", err)
	}

	// Re-check under the lock: another goroutine may have finished setup while
	// the objects above were being created. If not, publish both together.
	m.mu.Lock()
	if isUp() {
		m.mu.Unlock()
		return nil
	}
	m.server = localServer
	m.clientMap[BifrostMCPClientKey] = localClient
	m.mu.Unlock()

	// startLocalMCPServer takes the lock itself, so it is called unlocked.
	return m.startLocalMCPServer()
}

// createLocalMCPServer constructs a new MCP server instance named
// "Bifrost-MCP-Server" (version "1.0.0") with tool capabilities enabled.
// No transport is attached and nothing is started here.
//
// Returns:
//   - *server.MCPServer: The newly constructed server
//   - error: Always nil in the current implementation
func (m *MCPManager) createLocalMCPServer() (*server.MCPServer, error) {
	return server.NewMCPServer(
		"Bifrost-MCP-Server",
		"1.0.0",
		server.WithToolCapabilities(true),
	), nil
}

// createLocalMCPClient builds the placeholder client state for the local MCP
// server. No connection is attached here: the Conn field is left unset and is
// filled in by startLocalMCPServer.
//
// Returns:
//   - *schemas.MCPClientState: Placeholder state with empty tool maps
//   - error: Always nil in the current implementation
func (m *MCPManager) createLocalMCPClient() (*schemas.MCPClientState, error) {
	// ID and Name share the same key so tool prefixing stays consistent, and
	// the "*" entry allows every tool for this internal client.
	execConfig := &schemas.MCPClientConfig{
		ID:             BifrostMCPClientKey,
		Name:           BifrostMCPClientKey,
		ToolsToExecute: []string{"*"},
	}

	// The local client is reported with the in-process connection type.
	connInfo := &schemas.MCPClientConnectionInfo{
		Type: schemas.MCPConnectionTypeInProcess,
	}

	return &schemas.MCPClientState{
		ExecutionConfig: execConfig,
		ToolMap:         make(map[string]schemas.ChatTool),
		ToolNameMapping: make(map[string]string),
		ConnectionInfo:  connInfo,
	}, nil
}

// startLocalMCPServer connects the Bifrost client entry to the local MCP server
// through an in-process client, initializes that client, and marks the server
// as running. The manager lock is held for the whole call.
//
// The client entry's Conn is only updated after initialization succeeds, so a
// failed start never leaves the entry pointing at an uninitialized client.
//
// Returns:
//   - error: nil when the server was already running or startup succeeded;
//     otherwise an error identifying the step that failed
func (m *MCPManager) startLocalMCPServer() error {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.server == nil {
		return fmt.Errorf("server not initialized")
	}

	// Nothing to do when the server is already running.
	if m.serverRunning {
		return nil
	}

	// Resolve the client entry before creating the in-process client so that a
	// missing entry does not leave an orphaned client behind.
	clientEntry, ok := m.clientMap[BifrostMCPClientKey]
	if !ok {
		return fmt.Errorf("bifrost client not found")
	}

	// Create in-process client directly connected to the server
	inProcessClient, err := client.NewInProcessClient(m.server)
	if err != nil {
		return fmt.Errorf("failed to create in-process MCP client: %w", err)
	}

	// Bound the initialize handshake with the connection-establish timeout.
	ctx, cancel := context.WithTimeout(m.ctx, MCPClientConnectionEstablishTimeout)
	defer cancel()

	initRequest := mcp.InitializeRequest{
		Params: mcp.InitializeParams{
			ProtocolVersion: mcp.LATEST_PROTOCOL_VERSION,
			Capabilities:    mcp.ClientCapabilities{},
			ClientInfo: mcp.Implementation{
				Name:    BifrostMCPClientName,
				Version: BifrostMCPVersion,
			},
		},
	}

	if _, err := inProcessClient.Initialize(ctx, initRequest); err != nil {
		// Best-effort cleanup: the initialize error is the one worth reporting,
		// so a secondary Close error is intentionally dropped.
		_ = inProcessClient.Close()
		return fmt.Errorf("failed to initialize MCP client: %w", err)
	}

	// Publish the connection and the running flag together, only on success.
	clientEntry.Conn = inProcessClient
	m.serverRunning = true

	return nil
}