first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/core/internal/llmtests/reasoning.go
+++ b/core/internal/llmtests/reasoning.go
@@ -0,0 +1,583 @@
+package llmtests
+
+import (
+	"context"
+	"os"
+	"testing"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// RunResponsesReasoningTest executes the reasoning test scenario to test thinking capabilities via Responses API only
+func RunResponsesReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.Reasoning {
+		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	// Skip if no reasoning model is configured
+	if testConfig.ReasoningModel == "" {
+		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("ResponsesReasoning", func(t *testing.T) {
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		// Create a complex problem that requires step-by-step reasoning
+		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."
+
+		responsesMessages := []schemas.ResponsesMessage{
+			CreateBasicResponsesMessage(problemPrompt),
+		}
+
+		// Execute Responses API test with retries
+		responsesReq := &schemas.BifrostResponsesRequest{
+			Provider: testConfig.Provider,
+			Model:    testConfig.ReasoningModel,
+			Input:    responsesMessages,
+			Params: &schemas.ResponsesParameters{
+				// Reasoning models (o3, o4-mini) allocate tokens between reasoning and text output.
+				// Note: Older o1 models may not return message output via Responses API - use o3/o4-mini.
+				// OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs.
+				// See: https://platform.openai.com/docs/guides/reasoning#allocating-space-for-reasoning
+				MaxOutputTokens: bifrost.Ptr(25000),
+				// Configure reasoning-specific parameters
+				Reasoning: &schemas.ResponsesParametersReasoning{
+					Effort: bifrost.Ptr("high"), // High effort for complex reasoning
+					// Summary: bifrost.Ptr("detailed"), // Detailed summary of reasoning process
+				},
+				// Include reasoning content in response
+				Include: []string{"reasoning.encrypted_content"},
+			},
+			Fallbacks: testConfig.Fallbacks,
+		}
+
+		// Use retry framework with enhanced validation for reasoning
+		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
+		retryContext := TestRetryContext{
+			ScenarioName: "Reasoning",
+			ExpectedBehavior: map[string]interface{}{
+				"should_show_reasoning": true,
+				"mathematical_problem":  true,
+				"step_by_step":          true,
+			},
+			TestMetadata: map[string]interface{}{
+				"provider":          testConfig.Provider,
+				"model":             testConfig.ReasoningModel,
+				"problem_type":      "mathematical",
+				"complexity":        "high",
+				"expects_reasoning": true,
+			},
+		}
+		responsesRetryConfig := ResponsesRetryConfig{
+			MaxAttempts: retryConfig.MaxAttempts,
+			BaseDelay:   retryConfig.BaseDelay,
+			MaxDelay:    retryConfig.MaxDelay,
+			Conditions:  []ResponsesRetryCondition{}, // Add specific responses retry conditions as needed
+			OnRetry:     retryConfig.OnRetry,
+			OnFinalFail: retryConfig.OnFinalFail,
+		}
+
+		// Enhanced validation for reasoning scenarios
+		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
+			"requires_reasoning": true,
+		})
+		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+		response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
+			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+			return client.ResponsesRequest(bfCtx, responsesReq)
+		})
+
+		if responsesError != nil {
+			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(responsesError))
+		}
+
+		// Log the response content
+		responsesContent := GetResponsesContent(response)
+		if responsesContent == "" {
+			t.Logf("✅ Responses API reasoning result: <no content>")
+		} else {
+			maxLen := 300
+			if len(responsesContent) < maxLen {
+				maxLen = len(responsesContent)
+			}
+			t.Logf("✅ Responses API reasoning result: %s", responsesContent[:maxLen])
+		}
+
+		// Additional reasoning-specific validation (complementary to the main validation)
+		reasoningDetected := validateResponsesAPIReasoning(t, response)
+		if !reasoningDetected {
+			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
+		} else {
+			t.Logf("🧠 Reasoning structure detected in response")
+		}
+
+		t.Logf("🎉 Responses API passed Reasoning test!")
+	})
+}
+
+// validateResponsesAPIReasoning performs additional validation specific to Responses API reasoning features
+// Returns true if reasoning indicators are found
+func validateResponsesAPIReasoning(t *testing.T, response *schemas.BifrostResponsesResponse) bool {
+	if response == nil || response.Output == nil {
+		return false
+	}
+
+	reasoningFound := false
+	summaryFound := false
+	reasoningContentFound := false
+
+	// Check if response contains reasoning messages or reasoning content
+	for _, message := range response.Output {
+		// Check for ResponsesMessageTypeReasoning
+		if message.Type != nil && *message.Type == schemas.ResponsesMessageTypeReasoning {
+			reasoningFound = true
+			t.Logf("🧠 Found ResponsesMessageTypeReasoning message in response")
+
+			// Check for reasoning summary content
+			if message.ResponsesReasoning != nil && len(message.ResponsesReasoning.Summary) > 0 {
+				summaryFound = true
+				t.Logf("📝 Found reasoning summary with %d content blocks", len(message.ResponsesReasoning.Summary))
+
+				// Log first summary block for debugging
+				if len(message.ResponsesReasoning.Summary) > 0 {
+					firstSummary := message.ResponsesReasoning.Summary[0]
+					if len(firstSummary.Text) > 0 {
+						maxLen := 200
+						if len(firstSummary.Text) < maxLen {
+							maxLen = len(firstSummary.Text)
+						}
+						t.Logf("📋 First reasoning summary: %s", firstSummary.Text[:maxLen])
+					} else {
+						t.Logf("📋 First reasoning summary: (empty)")
+					}
+				}
+			}
+
+			// Check for encrypted reasoning content
+			if message.ResponsesReasoning != nil && message.ResponsesReasoning.EncryptedContent != nil {
+				t.Logf("🔐 Found encrypted reasoning content")
+			}
+		}
+
+		// Check for content blocks with ResponsesOutputMessageContentTypeReasoning
+		if message.Content != nil && message.Content.ContentBlocks != nil {
+			for _, block := range message.Content.ContentBlocks {
+				if block.Type == schemas.ResponsesOutputMessageContentTypeReasoning {
+					reasoningContentFound = true
+					t.Logf("🔍 Found ResponsesOutputMessageContentTypeReasoning content block")
+				}
+			}
+		}
+	}
+
+	// Check if reasoning tokens were used
+	if response.Usage != nil && response.Usage.OutputTokensDetails != nil &&
+		response.Usage.OutputTokensDetails.ReasoningTokens > 0 {
+		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.OutputTokensDetails.ReasoningTokens)
+		reasoningFound = true // Reasoning tokens indicate reasoning was performed
+	}
+
+	// Log findings
+	detected := reasoningFound || reasoningContentFound
+	if detected {
+		t.Logf("✅ Responses API reasoning indicators detected")
+		if reasoningFound {
+			t.Logf("  - ResponsesMessageTypeReasoning or reasoning tokens found")
+		}
+		if reasoningContentFound {
+			t.Logf("  - ResponsesOutputMessageContentTypeReasoning content blocks found")
+		}
+		if summaryFound {
+			t.Logf("  - Reasoning summary content found")
+		}
+	} else {
+		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
+	}
+
+	return detected
+}
+
+// RunChatCompletionReasoningTest executes the reasoning test scenario to test thinking capabilities via Chat Completions API
+func RunChatCompletionReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.Reasoning {
+		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	// Skip if no reasoning model is configured
+	if testConfig.ReasoningModel == "" {
+		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("ChatCompletionReasoning", func(t *testing.T) {
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		if testConfig.Provider == schemas.OpenAI {
+			// OpenAI because reasoning for them in chat completions is extremely flaky
+			t.Skip("Skipping ChatCompletionReasoning test for OpenAI")
+			return
+		}
+
+		// Create a complex problem that requires step-by-step reasoning
+		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."
+
+		chatMessages := []schemas.ChatMessage{
+			CreateBasicChatMessage(problemPrompt),
+		}
+
+		// Execute Chat Completions API test with retries
+		chatReq := &schemas.BifrostChatRequest{
+			Provider: testConfig.Provider,
+			Model:    testConfig.ReasoningModel,
+			Input:    chatMessages,
+			Params: &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(1800),
+				// Configure reasoning-specific parameters
+				Reasoning: &schemas.ChatReasoning{
+					Effort:    bifrost.Ptr("high"), // High effort for complex reasoning
+					MaxTokens: bifrost.Ptr(1500),   // Maximum tokens for reasoning output
+				},
+			},
+			Fallbacks: testConfig.Fallbacks,
+		}
+
+		// Use retry framework with enhanced validation for reasoning
+		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
+		retryContext := TestRetryContext{
+			ScenarioName: "Reasoning",
+			ExpectedBehavior: map[string]interface{}{
+				"should_show_reasoning": true,
+				"mathematical_problem":  true,
+				"step_by_step":          true,
+			},
+			TestMetadata: map[string]interface{}{
+				"provider":          testConfig.Provider,
+				"model":             testConfig.ReasoningModel,
+				"problem_type":      "mathematical",
+				"complexity":        "high",
+				"expects_reasoning": true,
+			},
+		}
+		chatRetryConfig := ChatRetryConfig{
+			MaxAttempts: retryConfig.MaxAttempts,
+			BaseDelay:   retryConfig.BaseDelay,
+			MaxDelay:    retryConfig.MaxDelay,
+			Conditions:  []ChatRetryCondition{}, // Add specific chat retry conditions as needed
+			OnRetry:     retryConfig.OnRetry,
+			OnFinalFail: retryConfig.OnFinalFail,
+		}
+
+		// Enhanced validation for reasoning scenarios
+		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
+			"requires_reasoning": true,
+		})
+		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+		response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
+			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+			return client.ChatCompletionRequest(bfCtx, chatReq)
+		})
+
+		if chatError != nil {
+			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(chatError))
+		}
+
+		// Log the response content
+		chatContent := GetChatContent(response)
+		if chatContent == "" {
+			t.Logf("✅ Chat Completions API reasoning result: <no content>")
+		} else {
+			maxLen := 300
+			if len(chatContent) < maxLen {
+				maxLen = len(chatContent)
+			}
+			t.Logf("✅ Chat Completions API reasoning result: %s", chatContent[:maxLen])
+		}
+
+		// Additional reasoning-specific validation (complementary to the main validation)
+		reasoningDetected := validateChatCompletionReasoning(t, response)
+		if !reasoningDetected {
+			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
+		} else {
+			t.Logf("🧠 Reasoning structure detected in response")
+		}
+
+		t.Logf("🎉 Chat Completions API passed Reasoning test!")
+	})
+}
+
+// validateChatCompletionReasoning performs additional validation specific to Chat Completions API reasoning features
+// Returns true if reasoning indicators are found
+func validateChatCompletionReasoning(t *testing.T, response *schemas.BifrostChatResponse) bool {
+	if response == nil || len(response.Choices) == 0 {
+		return false
+	}
+
+	reasoningFound := false
+	reasoningDetailsFound := false
+	reasoningTokensFound := false
+
+	// Check each choice for reasoning indicators
+	for _, choice := range response.Choices {
+		// Check for reasoning details in ChatNonStreamResponseChoice
+		if choice.ChatNonStreamResponseChoice != nil && choice.ChatNonStreamResponseChoice.Message != nil {
+			message := choice.ChatNonStreamResponseChoice.Message
+
+			if message == nil {
+				continue
+			}
+
+			// Check for reasoning content in message (for backward compatibility)
+			if message.ChatAssistantMessage != nil && message.ChatAssistantMessage.Reasoning != nil && *message.ChatAssistantMessage.Reasoning != "" {
+				reasoningFound = true
+				t.Logf("🧠 Found reasoning content in message (length: %d)", len(*message.ChatAssistantMessage.Reasoning))
+
+				// Log first 200 chars for debugging
+				reasoningText := *message.ChatAssistantMessage.Reasoning
+				maxLen := 200
+				if len(reasoningText) < maxLen {
+					maxLen = len(reasoningText)
+				}
+				t.Logf("📋 First reasoning content: %s", reasoningText[:maxLen])
+			}
+
+			// Check for reasoning details array
+			if message.ChatAssistantMessage != nil && len(message.ChatAssistantMessage.ReasoningDetails) > 0 {
+				reasoningDetailsFound = true
+				t.Logf("📝 Found %d reasoning details entries", len(message.ChatAssistantMessage.ReasoningDetails))
+
+				// Log details about each reasoning entry
+				for i, detail := range message.ChatAssistantMessage.ReasoningDetails {
+					t.Logf("  - Entry %d: Type=%s, Index=%d", i, detail.Type, detail.Index)
+
+					switch detail.Type {
+					case schemas.BifrostReasoningDetailsTypeSummary:
+						if detail.Summary != nil {
+							t.Logf("    Summary length: %d", len(*detail.Summary))
+						}
+					case schemas.BifrostReasoningDetailsTypeText:
+						if detail.Text != nil {
+							textLen := len(*detail.Text)
+							t.Logf("    Text length: %d", textLen)
+							if textLen > 0 {
+								maxLen := 150
+								if textLen < maxLen {
+									maxLen = textLen
+								}
+								t.Logf("    Text preview: %s", (*detail.Text)[:maxLen])
+							}
+						}
+					case schemas.BifrostReasoningDetailsTypeEncrypted:
+						if detail.Data != nil {
+							t.Logf("    Encrypted data length: %d", len(*detail.Data))
+						}
+						if detail.Signature != nil {
+							t.Logf("    Signature present: %d bytes", len(*detail.Signature))
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// Check if reasoning tokens were used
+	if response.Usage != nil && response.Usage.CompletionTokensDetails != nil &&
+		response.Usage.CompletionTokensDetails.ReasoningTokens > 0 {
+		reasoningTokensFound = true
+		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.CompletionTokensDetails.ReasoningTokens)
+	}
+
+	// Log findings
+	detected := reasoningFound || reasoningDetailsFound || reasoningTokensFound
+	if detected {
+		t.Logf("✅ Chat Completions API reasoning indicators detected")
+		if reasoningFound {
+			t.Logf("  - Reasoning content found in message")
+		}
+		if reasoningDetailsFound {
+			t.Logf("  - Reasoning details array found")
+		}
+		if reasoningTokensFound {
+			t.Logf("  - Reasoning tokens usage reported")
+		}
+	} else {
+		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
+	}
+
+	return detected
+}
+
+// RunMultiTurnReasoningTest tests multi-turn conversations with reasoning content passthrough
+// via the Chat Completions API.
+//
+// Nothing is run (only a log line is written) when testConfig.Scenarios.Reasoning is false or
+// testConfig.ReasoningModel is empty. Otherwise a "MultiTurnReasoning" subtest is started; it is
+// skipped for schemas.OpenAI. For other providers it works in two steps:
+//
+//  1. Ask a small arithmetic question with low reasoning effort and collect the answer text
+//     plus the ReasoningDetails of the first choice's assistant message.
+//  2. Replay the conversation (user prompt, assistant answer carrying those reasoning details,
+//     follow-up user question) and require a non-empty answer.
+//
+// The subtest fails when either request returns an error from WithChatTestRetry or when either
+// answer is empty.
+func RunMultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.Reasoning {
+		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	if testConfig.ReasoningModel == "" {
+		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("MultiTurnReasoning", func(t *testing.T) {
+		// Subtests run in parallel unless SKIP_PARALLEL_TESTS=true is set in the environment.
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		if testConfig.Provider == schemas.OpenAI {
+			// t.Skip stops this subtest immediately, so no return is needed after it.
+			t.Skip("Skipping MultiTurnReasoning test for OpenAI")
+		}
+
+		// Step 1: Send initial reasoning request
+		initialPrompt := "What is 15 * 17? Think step by step."
+		chatReq := &schemas.BifrostChatRequest{
+			Provider: testConfig.Provider,
+			Model:    testConfig.ReasoningModel,
+			Input: []schemas.ChatMessage{
+				CreateBasicChatMessage(initialPrompt),
+			},
+			Params: &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(4000),
+				Reasoning: &schemas.ChatReasoning{
+					Effort: bifrost.Ptr("low"),
+				},
+			},
+			Fallbacks: testConfig.Fallbacks,
+		}
+
+		// Both steps share the retry settings and expectations of the "Reasoning" scenario.
+		scenarioRetry := GetTestRetryConfigForScenario("Reasoning", testConfig)
+		chatRetryConfig := ChatRetryConfig{
+			MaxAttempts: scenarioRetry.MaxAttempts,
+			BaseDelay:   scenarioRetry.BaseDelay,
+			MaxDelay:    scenarioRetry.MaxDelay,
+			Conditions:  []ChatRetryCondition{},
+			OnRetry:     scenarioRetry.OnRetry,
+			OnFinalFail: scenarioRetry.OnFinalFail,
+		}
+		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
+			"requires_reasoning": true,
+		})
+		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+		retryContext := TestRetryContext{
+			ScenarioName: "MultiTurnReasoning_Step1",
+			ExpectedBehavior: map[string]any{
+				"should_show_reasoning": true,
+				"multi_turn":            true,
+			},
+			TestMetadata: map[string]any{
+				"provider": testConfig.Provider,
+				"model":    testConfig.ReasoningModel,
+				"step":     "initial",
+			},
+		}
+
+		firstResponse, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "MultiTurnReasoning_Step1", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
+			// Each attempt gets a fresh Bifrost context derived from the caller's ctx.
+			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+			return client.ChatCompletionRequest(bfCtx, chatReq)
+		})
+		if chatError != nil {
+			t.Fatalf("Step 1 failed: %v", GetErrorMessage(chatError))
+		}
+
+		firstContent := GetChatContent(firstResponse)
+		if firstContent == "" {
+			t.Fatal("Step 1: Expected non-empty response content")
+		}
+		t.Logf("Step 1 response: %s", truncateString(firstContent, 200))
+
+		// Extract reasoning details from the first choice; they stay nil when the provider
+		// returned none, in which case the assistant turn is replayed without them.
+		var reasoningDetails []schemas.ChatReasoningDetails
+		if len(firstResponse.Choices) > 0 {
+			first := firstResponse.Choices[0]
+			if first.ChatNonStreamResponseChoice != nil &&
+				first.ChatNonStreamResponseChoice.Message != nil &&
+				first.ChatNonStreamResponseChoice.Message.ChatAssistantMessage != nil {
+				reasoningDetails = first.ChatNonStreamResponseChoice.Message.ChatAssistantMessage.ReasoningDetails
+			}
+		}
+		t.Logf("Step 1: Found %d reasoning detail entries", len(reasoningDetails))
+
+		// Step 2: Build multi-turn conversation with reasoning details passed back
+		multiTurnReq := &schemas.BifrostChatRequest{
+			Provider: testConfig.Provider,
+			Model:    testConfig.ReasoningModel,
+			Input: []schemas.ChatMessage{
+				CreateBasicChatMessage(initialPrompt),
+				{
+					Role: schemas.ChatMessageRoleAssistant,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: &firstContent,
+					},
+					ChatAssistantMessage: &schemas.ChatAssistantMessage{
+						ReasoningDetails: reasoningDetails,
+					},
+				},
+				CreateBasicChatMessage("Now multiply that result by 2."),
+			},
+			Params: &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(4000),
+				Reasoning: &schemas.ChatReasoning{
+					Effort: bifrost.Ptr("low"),
+				},
+			},
+			Fallbacks: testConfig.Fallbacks,
+		}
+
+		retryContext2 := TestRetryContext{
+			ScenarioName: "MultiTurnReasoning_Step2",
+			ExpectedBehavior: map[string]any{
+				"multi_turn":            true,
+				"reasoning_passthrough": true,
+			},
+			TestMetadata: map[string]any{
+				"provider": testConfig.Provider,
+				"model":    testConfig.ReasoningModel,
+				"step":     "follow_up",
+			},
+		}
+
+		secondResponse, chatError2 := WithChatTestRetry(t, chatRetryConfig, retryContext2, expectations, "MultiTurnReasoning_Step2", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
+			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+			return client.ChatCompletionRequest(bfCtx, multiTurnReq)
+		})
+		if chatError2 != nil {
+			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(chatError2))
+		}
+
+		// Stop here on an empty answer so the success message below is only logged for a
+		// run that really passed.
+		secondContent := GetChatContent(secondResponse)
+		if secondContent == "" {
+			t.Fatal("Step 2: Expected non-empty response content")
+		}
+		t.Logf("Step 2 response: %s", truncateString(secondContent, 200))
+
+		t.Log("Multi-turn reasoning passthrough test passed!")
+	})
+}
+
+// min reports the lesser of the two integers a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}