bifrost/core/internal/llmtests/reasoning.go

package llmtests

import (
	"context"
	"os"
	"testing"

	bifrost "github.com/maximhq/bifrost/core"
	"github.com/maximhq/bifrost/core/schemas"
)

// RunResponsesReasoningTest executes the reasoning test scenario to test thinking capabilities via Responses API only.
//
// It logs and returns without running anything when the Reasoning scenario is disabled in
// testConfig or when testConfig.ReasoningModel is empty. Otherwise it runs the
// "ResponsesReasoning" subtest (in parallel unless SKIP_PARALLEL_TESTS is "true"), which sends a
// multi-step arithmetic word problem to the reasoning model through WithResponsesTestRetry and
// fails when that call returns an error. Structural reasoning indicators reported by
// validateResponsesAPIReasoning are only logged; their absence does not fail the subtest.
func RunResponsesReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ResponsesReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Upper bound, in runes, on how much of the answer is echoed into the test log.
		const previewRunes = 300

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		responsesMessages := []schemas.ResponsesMessage{
			CreateBasicResponsesMessage(problemPrompt),
		}

		// Execute Responses API test with retries
		responsesReq := &schemas.BifrostResponsesRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    responsesMessages,
			Params: &schemas.ResponsesParameters{
				// Reasoning models (o3, o4-mini) allocate tokens between reasoning and text output.
				// Note: Older o1 models may not return message output via Responses API - use o3/o4-mini.
				// OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs.
				// See: https://platform.openai.com/docs/guides/reasoning#allocating-space-for-reasoning
				MaxOutputTokens: bifrost.Ptr(25000),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ResponsesParametersReasoning{
					Effort: bifrost.Ptr("high"), // High effort for complex reasoning
					// Summary: bifrost.Ptr("detailed"), // Detailed summary of reasoning process
				},
				// Include reasoning content in response
				Include: []string{"reasoning.encrypted_content"},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]interface{}{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]interface{}{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		// Reuse the scenario's timing and callbacks; no Responses-specific retry conditions are added.
		responsesRetryConfig := ResponsesRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ResponsesRetryCondition{}, // Add specific responses retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
			// A fresh Bifrost context is created for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ResponsesRequest(bfCtx, responsesReq)
		})

		if responsesError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(responsesError))
		}

		// Log the response content
		responsesContent := GetResponsesContent(response)
		if responsesContent == "" {
			t.Logf("✅ Responses API reasoning result: <no content>")
		} else {
			// Truncate on rune boundaries: slicing by byte index could split a
			// multi-byte UTF-8 character and write invalid text to the log.
			preview := responsesContent
			if runes := []rune(preview); len(runes) > previewRunes {
				preview = string(runes[:previewRunes])
			}
			t.Logf("✅ Responses API reasoning result: %s", preview)
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateResponsesAPIReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Responses API passed Reasoning test!")
	})
}

// validateResponsesAPIReasoning performs additional validation specific to Responses API reasoning features.
// It never fails the test; everything it finds is reported through t.Logf.
//
// Returns true if reasoning indicators are found: an output message of type
// ResponsesMessageTypeReasoning, a content block of type ResponsesOutputMessageContentTypeReasoning,
// or a non-zero reasoning token count in the usage details. Reasoning summaries and encrypted
// reasoning content are logged but do not change the result on their own. A nil response or a nil
// Output yields false.
func validateResponsesAPIReasoning(t *testing.T, response *schemas.BifrostResponsesResponse) bool {
	if response == nil || response.Output == nil {
		return false
	}

	// Upper bound, in runes, on how much of the first summary is echoed into the log.
	const summaryPreviewRunes = 200

	reasoningFound := false
	summaryFound := false
	reasoningContentFound := false

	// Check if response contains reasoning messages or reasoning content
	for _, message := range response.Output {
		// Check for ResponsesMessageTypeReasoning
		if message.Type != nil && *message.Type == schemas.ResponsesMessageTypeReasoning {
			reasoningFound = true
			t.Logf("🧠 Found ResponsesMessageTypeReasoning message in response")

			// Check for reasoning summary content
			if message.ResponsesReasoning != nil && len(message.ResponsesReasoning.Summary) > 0 {
				summaryFound = true
				t.Logf("📝 Found reasoning summary with %d content blocks", len(message.ResponsesReasoning.Summary))

				// Log first summary block for debugging; the condition above guarantees it exists.
				firstSummary := message.ResponsesReasoning.Summary[0]
				if len(firstSummary.Text) > 0 {
					// Truncate on rune boundaries so a multi-byte UTF-8 character is
					// never cut in half, which a byte-index slice could do.
					runes := []rune(firstSummary.Text)
					if len(runes) > summaryPreviewRunes {
						runes = runes[:summaryPreviewRunes]
					}
					t.Logf("📋 First reasoning summary: %s", string(runes))
				} else {
					t.Logf("📋 First reasoning summary: (empty)")
				}
			}

			// Check for encrypted reasoning content
			if message.ResponsesReasoning != nil && message.ResponsesReasoning.EncryptedContent != nil {
				t.Logf("🔐 Found encrypted reasoning content")
			}
		}

		// Check for content blocks with ResponsesOutputMessageContentTypeReasoning
		if message.Content != nil && message.Content.ContentBlocks != nil {
			for _, block := range message.Content.ContentBlocks {
				if block.Type == schemas.ResponsesOutputMessageContentTypeReasoning {
					reasoningContentFound = true
					t.Logf("🔍 Found ResponsesOutputMessageContentTypeReasoning content block")
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.OutputTokensDetails != nil &&
		response.Usage.OutputTokensDetails.ReasoningTokens > 0 {
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.OutputTokensDetails.ReasoningTokens)
		reasoningFound = true // Reasoning tokens indicate reasoning was performed
	}

	// Log findings
	detected := reasoningFound || reasoningContentFound
	if detected {
		t.Logf("✅ Responses API reasoning indicators detected")
		if reasoningFound {
			t.Logf("  - ResponsesMessageTypeReasoning or reasoning tokens found")
		}
		if reasoningContentFound {
			t.Logf("  - ResponsesOutputMessageContentTypeReasoning content blocks found")
		}
		if summaryFound {
			t.Logf("  - Reasoning summary content found")
		}
	} else {
		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}

// RunChatCompletionReasoningTest executes the reasoning test scenario to test thinking capabilities via Chat Completions API.
//
// It logs and returns without running anything when the Reasoning scenario is disabled in
// testConfig or when testConfig.ReasoningModel is empty. Otherwise it runs the
// "ChatCompletionReasoning" subtest (in parallel unless SKIP_PARALLEL_TESTS is "true"). The
// subtest is skipped for the OpenAI provider; for every other provider it sends a multi-step
// arithmetic word problem through WithChatTestRetry and fails when that call returns an error.
// Structural reasoning indicators reported by validateChatCompletionReasoning are only logged;
// their absence does not fail the subtest.
func RunChatCompletionReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ChatCompletionReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// OpenAI because reasoning for them in chat completions is extremely flaky
			t.Skip("Skipping ChatCompletionReasoning test for OpenAI")
			return
		}

		// Upper bound, in runes, on how much of the answer is echoed into the test log.
		const previewRunes = 300

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(problemPrompt),
		}

		// Execute Chat Completions API test with retries
		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(1800),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ChatReasoning{
					Effort:    bifrost.Ptr("high"), // High effort for complex reasoning
					MaxTokens: bifrost.Ptr(1500),   // Maximum tokens for reasoning output
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]interface{}{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]interface{}{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		// Reuse the scenario's timing and callbacks; no chat-specific retry conditions are added.
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{}, // Add specific chat retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			// A fresh Bifrost context is created for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})

		if chatError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(chatError))
		}

		// Log the response content
		chatContent := GetChatContent(response)
		if chatContent == "" {
			t.Logf("✅ Chat Completions API reasoning result: <no content>")
		} else {
			// Truncate on rune boundaries: slicing by byte index could split a
			// multi-byte UTF-8 character and write invalid text to the log.
			preview := chatContent
			if runes := []rune(preview); len(runes) > previewRunes {
				preview = string(runes[:previewRunes])
			}
			t.Logf("✅ Chat Completions API reasoning result: %s", preview)
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateChatCompletionReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Chat Completions API passed Reasoning test!")
	})
}

// validateChatCompletionReasoning performs additional validation specific to Chat Completions API reasoning features.
// It never fails the test; everything it finds is reported through t.Logf.
//
// Returns true if reasoning indicators are found: a non-empty Reasoning string on an assistant
// message, a non-empty ReasoningDetails slice on an assistant message, or a non-zero reasoning
// token count in the completion token details. A nil response or one without choices yields false.
func validateChatCompletionReasoning(t *testing.T, response *schemas.BifrostChatResponse) bool {
	if response == nil || len(response.Choices) == 0 {
		return false
	}

	// Upper bounds, in runes, on how much reasoning text is echoed into the log.
	const (
		reasoningPreviewRunes = 200
		detailPreviewRunes    = 150
	)

	reasoningFound := false
	reasoningDetailsFound := false
	reasoningTokensFound := false

	// Check each choice for reasoning indicators
	for _, choice := range response.Choices {
		// Reasoning is only inspected on non-stream choices that carry a message.
		if choice.ChatNonStreamResponseChoice == nil || choice.ChatNonStreamResponseChoice.Message == nil {
			continue
		}

		// Both checks below read assistant-specific fields, so there is nothing to do without them.
		assistant := choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage
		if assistant == nil {
			continue
		}

		// Check for reasoning content in message (for backward compatibility)
		if assistant.Reasoning != nil && *assistant.Reasoning != "" {
			reasoningFound = true
			reasoningText := *assistant.Reasoning
			t.Logf("🧠 Found reasoning content in message (length: %d)", len(reasoningText))

			// Log a short preview for debugging. Truncate on rune boundaries so a
			// multi-byte UTF-8 character is never cut in half.
			runes := []rune(reasoningText)
			if len(runes) > reasoningPreviewRunes {
				runes = runes[:reasoningPreviewRunes]
			}
			t.Logf("📋 First reasoning content: %s", string(runes))
		}

		// Check for reasoning details array
		if len(assistant.ReasoningDetails) > 0 {
			reasoningDetailsFound = true
			t.Logf("📝 Found %d reasoning details entries", len(assistant.ReasoningDetails))

			// Log details about each reasoning entry
			for i, detail := range assistant.ReasoningDetails {
				t.Logf("  - Entry %d: Type=%s, Index=%d", i, detail.Type, detail.Index)

				switch detail.Type {
				case schemas.BifrostReasoningDetailsTypeSummary:
					if detail.Summary != nil {
						t.Logf("    Summary length: %d", len(*detail.Summary))
					}
				case schemas.BifrostReasoningDetailsTypeText:
					if detail.Text != nil {
						textLen := len(*detail.Text)
						t.Logf("    Text length: %d", textLen)
						if textLen > 0 {
							// Rune-based truncation, for the same reason as above.
							runes := []rune(*detail.Text)
							if len(runes) > detailPreviewRunes {
								runes = runes[:detailPreviewRunes]
							}
							t.Logf("    Text preview: %s", string(runes))
						}
					}
				case schemas.BifrostReasoningDetailsTypeEncrypted:
					if detail.Data != nil {
						t.Logf("    Encrypted data length: %d", len(*detail.Data))
					}
					if detail.Signature != nil {
						t.Logf("    Signature present: %d bytes", len(*detail.Signature))
					}
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.CompletionTokensDetails != nil &&
		response.Usage.CompletionTokensDetails.ReasoningTokens > 0 {
		reasoningTokensFound = true
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.CompletionTokensDetails.ReasoningTokens)
	}

	// Log findings
	detected := reasoningFound || reasoningDetailsFound || reasoningTokensFound
	if detected {
		t.Logf("✅ Chat Completions API reasoning indicators detected")
		if reasoningFound {
			t.Logf("  - Reasoning content found in message")
		}
		if reasoningDetailsFound {
			t.Logf("  - Reasoning details array found")
		}
		if reasoningTokensFound {
			t.Logf("  - Reasoning tokens usage reported")
		}
	} else {
		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}

// RunMultiTurnReasoningTest exercises reasoning passthrough across two Chat Completions turns.
// Turn one asks a small arithmetic question; the reasoning details attached to the assistant's
// reply (if any) are then replayed, together with the reply text, as conversation history for a
// follow-up question. The subtest fails when either request returns an error or when either
// turn yields empty content. It is a logged no-op when the Reasoning scenario is disabled or no
// reasoning model is configured, and the subtest is skipped for the OpenAI provider.
func RunMultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	switch {
	case !testConfig.Scenarios.Reasoning:
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	case testConfig.ReasoningModel == "":
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("MultiTurnReasoning", func(t *testing.T) {
		if runSerially := os.Getenv("SKIP_PARALLEL_TESTS") == "true"; !runSerially {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			t.Skip("Skipping MultiTurnReasoning test for OpenAI")
			return
		}

		// newRequest builds a low-effort reasoning request around the given history.
		// Every call allocates its own parameter block, so the two turns share nothing.
		newRequest := func(history []schemas.ChatMessage) *schemas.BifrostChatRequest {
			return &schemas.BifrostChatRequest{
				Provider: testConfig.Provider,
				Model:    testConfig.ReasoningModel,
				Input:    history,
				Params: &schemas.ChatParameters{
					MaxCompletionTokens: bifrost.Ptr(4000),
					Reasoning: &schemas.ChatReasoning{
						Effort: bifrost.Ptr("low"),
					},
				},
				Fallbacks: testConfig.Fallbacks,
			}
		}

		// dispatch adapts a request into the operation shape the retry helper expects.
		dispatch := func(req *schemas.BifrostChatRequest) func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			return func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
				bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.ChatCompletionRequest(bfCtx, req)
			}
		}

		// ---- Turn 1: the opening question ----
		initialPrompt := "What is 15 * 17? Think step by step."
		openingReq := newRequest([]schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
		})

		scenarioRetry := GetTestRetryConfigForScenario("Reasoning", testConfig)
		openingCtx := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step1",
			ExpectedBehavior: map[string]interface{}{
				"should_show_reasoning": true,
				"multi_turn":            true,
			},
			TestMetadata: map[string]interface{}{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "initial",
			},
		}
		// The same retry settings and expectations serve both turns.
		chatRetry := ChatRetryConfig{
			MaxAttempts: scenarioRetry.MaxAttempts,
			BaseDelay:   scenarioRetry.BaseDelay,
			MaxDelay:    scenarioRetry.MaxDelay,
			Conditions:  []ChatRetryCondition{},
			OnRetry:     scenarioRetry.OnRetry,
			OnFinalFail: scenarioRetry.OnFinalFail,
		}
		expectations := ModifyExpectationsForProvider(
			GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
				"requires_reasoning": true,
			}),
			testConfig.Provider,
		)

		openingResp, openingErr := WithChatTestRetry(t, chatRetry, openingCtx, expectations, "MultiTurnReasoning_Step1", dispatch(openingReq))
		if openingErr != nil {
			t.Fatalf("Step 1 failed: %v", GetErrorMessage(openingErr))
		}

		openingAnswer := GetChatContent(openingResp)
		if openingAnswer == "" {
			t.Fatal("Step 1: Expected non-empty response content")
		}
		t.Logf("Step 1 response: %s", truncateString(openingAnswer, 200))

		// Pull the reasoning details off the first choice, when it carries an assistant message.
		var carriedReasoning []schemas.ChatReasoningDetails
		if choices := openingResp.Choices; len(choices) > 0 {
			if nonStream := choices[0].ChatNonStreamResponseChoice; nonStream != nil {
				if msg := nonStream.Message; msg != nil && msg.ChatAssistantMessage != nil {
					carriedReasoning = msg.ChatAssistantMessage.ReasoningDetails
				}
			}
		}

		t.Logf("Step 1: Found %d reasoning detail entries", len(carriedReasoning))

		// ---- Turn 2: replay the first exchange, reasoning included, then ask the follow-up ----
		replayedAssistant := schemas.ChatMessage{
			Role: schemas.ChatMessageRoleAssistant,
			Content: &schemas.ChatMessageContent{
				ContentStr: &openingAnswer,
			},
			ChatAssistantMessage: &schemas.ChatAssistantMessage{
				ReasoningDetails: carriedReasoning,
			},
		}
		followUpReq := newRequest([]schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
			replayedAssistant,
			CreateBasicChatMessage("Now multiply that result by 2."),
		})

		followUpCtx := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step2",
			ExpectedBehavior: map[string]interface{}{
				"multi_turn":            true,
				"reasoning_passthrough": true,
			},
			TestMetadata: map[string]interface{}{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "follow_up",
			},
		}

		followUpResp, followUpErr := WithChatTestRetry(t, chatRetry, followUpCtx, expectations, "MultiTurnReasoning_Step2", dispatch(followUpReq))
		if followUpErr != nil {
			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(followUpErr))
		}

		// An empty follow-up is recorded as a failure, but the closing log line is still written.
		if followUpAnswer := GetChatContent(followUpResp); followUpAnswer != "" {
			t.Logf("Step 2 response: %s", truncateString(followUpAnswer, 200))
		} else {
			t.Error("Step 2: Expected non-empty response content")
		}

		t.Log("Multi-turn reasoning passthrough test passed!")
	})
}

// min reports the lesser of a and b; when they are equal, that shared value is returned.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}