Files
bifrost/core/internal/llmtests/reasoning_opus.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

644 lines
22 KiB
Go

package llmtests
import (
"context"
"os"
"testing"
bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/schemas"
)
// OpusReasoningTestConfig describes, for a single provider, which Opus models
// the reasoning tests target and whether either model generation is skipped.
type OpusReasoningTestConfig struct {
	// Provider is the provider every test request is addressed to.
	Provider schemas.ModelProvider
	// Opus45Model is the Opus 4.5 model identifier; the 4.5 test skips when it is empty.
	Opus45Model string
	// Opus46Model is the Opus 4.6 model identifier; the 4.6 tests skip when it is empty.
	Opus46Model string
	// Fallbacks is copied verbatim into each request the tests build.
	Fallbacks []schemas.Fallback
	// SkipOpus45 skips the Opus 4.5 tests when true.
	SkipOpus45 bool
	// SkipOpus46 skips the Opus 4.6 tests (single-turn and multi-turn) when true.
	SkipOpus46 bool
	// SkipReason is the explanation included in the skip message.
	SkipReason string
}
// GetOpusReasoningTestConfigs returns the provider/model matrix used by the Opus
// reasoning tests, in the order Anthropic, Bedrock, Azure, Vertex.
// Every entry carries an empty, non-nil fallback list and leaves all skip
// fields at their zero values.
func GetOpusReasoningTestConfigs() []OpusReasoningTestConfig {
	// One row per provider: the Opus 4.5 identifier followed by the Opus 4.6 one.
	targets := []struct {
		provider schemas.ModelProvider
		opus45   string
		opus46   string
	}{
		{schemas.Anthropic, "claude-opus-4-5-20251101", "claude-opus-4-6-20260210"},
		{schemas.Bedrock, "global.anthropic.claude-opus-4-5-20251101-v1:0", "global.anthropic.claude-opus-4-6-v1"},
		// Azure is addressed by deployment name rather than by model ID.
		{schemas.Azure, "claude-opus-4-5", "claude-opus-4-6"},
		// NOTE(review): presumably deployment-style names as for Azure — confirm for Vertex.
		{schemas.Vertex, "claude-opus-4-5", "claude-opus-4-6"},
	}

	configs := make([]OpusReasoningTestConfig, 0, len(targets))
	for _, target := range targets {
		configs = append(configs, OpusReasoningTestConfig{
			Provider:    target.provider,
			Opus45Model: target.opus45,
			Opus46Model: target.opus46,
			Fallbacks:   []schemas.Fallback{},
		})
	}
	return configs
}
// RunOpus45ReasoningTest exercises extended thinking on Opus 4.5 for one provider.
//
// It runs an "Opus45_ExtendedThinking" subtest with two nested subtests that send
// the same word problem through the Responses API and the Chat Completions API.
// Both requests ask for "high" reasoning effort; the chat request additionally
// sets a 2000-token reasoning budget. Each call goes through the package retry
// helpers using the "Reasoning" scenario expectations.
//
// The test is skipped when config.SkipOpus45 is set or config.Opus45Model is
// empty. Subtests run in parallel unless SKIP_PARALLEL_TESTS is "true". A
// request error after retries is fatal, empty content is reported as an error,
// and missing reasoning indicators only produce a warning log.
func RunOpus45ReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
	// t.Skip and t.Skipf stop the calling test via runtime.Goexit, so nothing
	// after them executes and no explicit return is needed.
	if config.SkipOpus45 {
		t.Skipf("Skipping Opus 4.5 test: %s", config.SkipReason)
	}
	if config.Opus45Model == "" {
		t.Skip("No Opus 4.5 model configured")
	}

	// maybeParallel marks a subtest as parallel unless SKIP_PARALLEL_TESTS is "true".
	maybeParallel := func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}
	}

	// newRetryContext builds the retry metadata common to both API variants.
	// Fresh maps are created on every call so parallel subtests never share one.
	newRetryContext := func(scenarioName string) TestRetryContext {
		return TestRetryContext{
			ScenarioName: scenarioName,
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
				"model_version":         "opus-4.5",
				"thinking_mode":         "budget_tokens",
			},
			TestMetadata: map[string]any{
				"provider":          config.Provider,
				"model":             config.Opus45Model,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
	}

	t.Run("Opus45_ExtendedThinking", func(t *testing.T) {
		maybeParallel(t)

		// Multi-step word problem intended to elicit visible reasoning.
		problemPrompt := "Solve this step by step: A train leaves station A at 9:00 AM traveling at 60 mph. Another train leaves station B (300 miles away) at 10:00 AM traveling towards station A at 80 mph. At what time will they meet, and how far from station A?"

		// Minimal scenario config consumed by the retry/expectation helpers.
		testConfig := ComprehensiveTestConfig{
			Provider:       config.Provider,
			ReasoningModel: config.Opus45Model,
			Scenarios: TestScenarios{
				Reasoning: true,
			},
			Fallbacks: config.Fallbacks,
		}

		// Variant 1: Responses API.
		t.Run("ResponsesAPI", func(t *testing.T) {
			maybeParallel(t)

			responsesReq := &schemas.BifrostResponsesRequest{
				Provider: config.Provider,
				Model:    config.Opus45Model,
				Input: []schemas.ResponsesMessage{
					CreateBasicResponsesMessage(problemPrompt),
				},
				Params: &schemas.ResponsesParameters{
					MaxOutputTokens: bifrost.Ptr(4000),
					Reasoning: &schemas.ResponsesParametersReasoning{
						Effort: bifrost.Ptr("high"),
					},
					Include: []string{"reasoning.encrypted_content"},
				},
				Fallbacks: config.Fallbacks,
			}

			// Reuse the scenario's retry timing and hooks, but with no extra retry conditions.
			retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
			retryContext := newRetryContext("Opus45_Reasoning_Responses")
			responsesRetryConfig := ResponsesRetryConfig{
				MaxAttempts: retryConfig.MaxAttempts,
				BaseDelay:   retryConfig.BaseDelay,
				MaxDelay:    retryConfig.MaxDelay,
				Conditions:  []ResponsesRetryCondition{},
				OnRetry:     retryConfig.OnRetry,
				OnFinalFail: retryConfig.OnFinalFail,
			}

			// Scenario expectations, then provider-specific adjustments.
			expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
				"requires_reasoning": true,
			})
			expectations = ModifyExpectationsForProvider(expectations, config.Provider)

			response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Opus45_Reasoning_Responses", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
				// A new Bifrost context is created for every attempt.
				bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.ResponsesRequest(bfCtx, responsesReq)
			})
			if responsesError != nil {
				t.Fatalf("❌ Opus 4.5 Responses API reasoning test failed after retries: %v", GetErrorMessage(responsesError))
			}

			// The answer text must be present.
			content := GetResponsesContent(response)
			if content == "" {
				t.Error("Expected non-empty response content")
			} else {
				t.Logf("✅ Opus 4.5 reasoning response (first 200 chars): %s", truncateString(content, 200))
			}

			// Reasoning structure is informational only; its absence does not fail the test.
			if validateResponsesAPIReasoning(t, response) {
				t.Logf("🧠 Reasoning structure detected in response")
			} else {
				t.Logf("⚠️ No explicit reasoning indicators found in response structure")
			}

			// Announce success only if no earlier check recorded a failure.
			if !t.Failed() {
				t.Log("🎉 Opus 4.5 Responses API reasoning test passed!")
			}
		})

		// Variant 2: Chat Completions API.
		t.Run("ChatCompletionsAPI", func(t *testing.T) {
			maybeParallel(t)

			chatReq := &schemas.BifrostChatRequest{
				Provider: config.Provider,
				Model:    config.Opus45Model,
				Input: []schemas.ChatMessage{
					CreateBasicChatMessage(problemPrompt),
				},
				Params: &schemas.ChatParameters{
					MaxCompletionTokens: bifrost.Ptr(4000),
					Reasoning: &schemas.ChatReasoning{
						Effort:    bifrost.Ptr("high"),
						MaxTokens: bifrost.Ptr(2000), // Budget tokens for Opus 4.5
					},
				},
				Fallbacks: config.Fallbacks,
			}

			// Reuse the scenario's retry timing and hooks, but with no extra retry conditions.
			retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
			retryContext := newRetryContext("Opus45_Reasoning_Chat")
			chatRetryConfig := ChatRetryConfig{
				MaxAttempts: retryConfig.MaxAttempts,
				BaseDelay:   retryConfig.BaseDelay,
				MaxDelay:    retryConfig.MaxDelay,
				Conditions:  []ChatRetryCondition{},
				OnRetry:     retryConfig.OnRetry,
				OnFinalFail: retryConfig.OnFinalFail,
			}

			// Scenario expectations, then provider-specific adjustments.
			expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
				"requires_reasoning": true,
			})
			expectations = ModifyExpectationsForProvider(expectations, config.Provider)

			response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus45_Reasoning_Chat", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
				// A new Bifrost context is created for every attempt.
				bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.ChatCompletionRequest(bfCtx, chatReq)
			})
			if chatError != nil {
				t.Fatalf("❌ Opus 4.5 Chat Completions API reasoning test failed after retries: %v", GetErrorMessage(chatError))
			}

			// The answer text must be present.
			content := GetChatContent(response)
			if content == "" {
				t.Error("Expected non-empty response content")
			} else {
				t.Logf("✅ Opus 4.5 reasoning response (first 200 chars): %s", truncateString(content, 200))
			}

			// Reasoning structure is informational only; its absence does not fail the test.
			if validateChatCompletionReasoning(t, response) {
				t.Logf("🧠 Reasoning structure detected in response")
			} else {
				t.Logf("⚠️ No explicit reasoning indicators found in response structure")
			}

			// Announce success only if no earlier check recorded a failure.
			if !t.Failed() {
				t.Log("🎉 Opus 4.5 Chat Completions API reasoning test passed!")
			}
		})
	})
}
// RunOpus46ReasoningTest exercises adaptive thinking on Opus 4.6 for one provider.
//
// It runs an "Opus46_AdaptiveThinking" subtest containing:
//   - one Responses API subtest per effort level ("low", "medium", "high"), and
//   - one Chat Completions API subtest using "high" effort and no token budget.
//
// All requests send the same seating-arrangement logic puzzle and go through the
// package retry helpers using the "Reasoning" scenario expectations.
//
// The test is skipped when config.SkipOpus46 is set or config.Opus46Model is
// empty. Subtests run in parallel unless SKIP_PARALLEL_TESTS is "true". A
// request error after retries is fatal, empty content is reported as an error,
// and missing reasoning indicators only produce a warning log.
func RunOpus46ReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
	// t.Skip and t.Skipf stop the calling test via runtime.Goexit, so nothing
	// after them executes and no explicit return is needed.
	if config.SkipOpus46 {
		t.Skipf("Skipping Opus 4.6 test: %s", config.SkipReason)
	}
	if config.Opus46Model == "" {
		t.Skip("No Opus 4.6 model configured")
	}

	// maybeParallel marks a subtest as parallel unless SKIP_PARALLEL_TESTS is "true".
	maybeParallel := func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}
	}

	// newRetryContext builds the retry metadata for one subtest. The effort keys
	// are added only when effort is non-empty (the chat variant records none).
	// Fresh maps are created on every call so parallel subtests never share one.
	newRetryContext := func(scenarioName, effort string) TestRetryContext {
		expected := map[string]any{
			"should_show_reasoning": true,
			"logic_puzzle":          true,
			"step_by_step":          true,
			"model_version":         "opus-4.6",
			"thinking_mode":         "adaptive",
		}
		metadata := map[string]any{
			"provider":          config.Provider,
			"model":             config.Opus46Model,
			"problem_type":      "logic_puzzle",
			"complexity":        "high",
			"expects_reasoning": true,
		}
		if effort != "" {
			expected["effort_level"] = effort
			metadata["effort"] = effort
		}
		return TestRetryContext{
			ScenarioName:     scenarioName,
			ExpectedBehavior: expected,
			TestMetadata:     metadata,
		}
	}

	t.Run("Opus46_AdaptiveThinking", func(t *testing.T) {
		maybeParallel(t)

		// Logic puzzle intended to benefit from adaptive thinking.
		problemPrompt := "Analyze this logic puzzle: Five people (A, B, C, D, E) are sitting in a row. A is not at either end. B is somewhere to the left of C. D is not next to E. E is at one of the ends. In how many different valid arrangements can they sit? Show your reasoning."

		// Minimal scenario config consumed by the retry/expectation helpers.
		testConfig := ComprehensiveTestConfig{
			Provider:       config.Provider,
			ReasoningModel: config.Opus46Model,
			Scenarios: TestScenarios{
				Reasoning: true,
			},
			Fallbacks: config.Fallbacks,
		}

		// Variant 1: Responses API, once per effort level.
		for _, effort := range []string{"low", "medium", "high"} {
			effort := effort // per-iteration copy for the parallel closure (needed before Go 1.22)
			scenarioName := "Opus46_Reasoning_Responses_" + effort

			t.Run("ResponsesAPI_Effort_"+effort, func(t *testing.T) {
				maybeParallel(t)

				responsesReq := &schemas.BifrostResponsesRequest{
					Provider: config.Provider,
					Model:    config.Opus46Model,
					Input: []schemas.ResponsesMessage{
						CreateBasicResponsesMessage(problemPrompt),
					},
					Params: &schemas.ResponsesParameters{
						MaxOutputTokens: bifrost.Ptr(4000),
						Reasoning: &schemas.ResponsesParametersReasoning{
							Effort: bifrost.Ptr(effort), // Adaptive thinking uses effort parameter
						},
						Include: []string{"reasoning.encrypted_content"},
					},
					Fallbacks: config.Fallbacks,
				}

				// Reuse the scenario's retry timing and hooks, but with no extra retry conditions.
				retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
				retryContext := newRetryContext(scenarioName, effort)
				responsesRetryConfig := ResponsesRetryConfig{
					MaxAttempts: retryConfig.MaxAttempts,
					BaseDelay:   retryConfig.BaseDelay,
					MaxDelay:    retryConfig.MaxDelay,
					Conditions:  []ResponsesRetryCondition{},
					OnRetry:     retryConfig.OnRetry,
					OnFinalFail: retryConfig.OnFinalFail,
				}

				// Scenario expectations, then provider-specific adjustments.
				expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
					"requires_reasoning": true,
				})
				expectations = ModifyExpectationsForProvider(expectations, config.Provider)

				response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, scenarioName, func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
					// A new Bifrost context is created for every attempt.
					bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
					return client.ResponsesRequest(bfCtx, responsesReq)
				})
				if responsesError != nil {
					t.Fatalf("❌ Opus 4.6 Responses API (effort=%s) reasoning test failed after retries: %v", effort, GetErrorMessage(responsesError))
				}

				// The answer text must be present.
				content := GetResponsesContent(response)
				if content == "" {
					t.Errorf("Expected non-empty response content for effort=%s", effort)
				} else {
					t.Logf("✅ Opus 4.6 (effort=%s) response (first 200 chars): %s", effort, truncateString(content, 200))
				}

				// Reasoning structure is informational only; its absence does not fail the test.
				if validateResponsesAPIReasoning(t, response) {
					t.Logf("🧠 Reasoning structure detected in response")
				} else {
					t.Logf("⚠️ No explicit reasoning indicators found in response structure")
				}

				// Announce success only if no earlier check recorded a failure.
				if !t.Failed() {
					t.Logf("🎉 Opus 4.6 Responses API (effort=%s) reasoning test passed!", effort)
				}
			})
		}

		// Variant 2: Chat Completions API.
		t.Run("ChatCompletionsAPI", func(t *testing.T) {
			maybeParallel(t)

			chatReq := &schemas.BifrostChatRequest{
				Provider: config.Provider,
				Model:    config.Opus46Model,
				Input: []schemas.ChatMessage{
					CreateBasicChatMessage(problemPrompt),
				},
				Params: &schemas.ChatParameters{
					MaxCompletionTokens: bifrost.Ptr(4000),
					Reasoning: &schemas.ChatReasoning{
						Effort: bifrost.Ptr("high"), // Opus 4.6 uses adaptive thinking with effort
						// Note: MaxTokens (budget_tokens) is NOT used for Opus 4.6
					},
				},
				Fallbacks: config.Fallbacks,
			}

			// Reuse the scenario's retry timing and hooks, but with no extra retry conditions.
			retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
			retryContext := newRetryContext("Opus46_Reasoning_Chat", "")
			chatRetryConfig := ChatRetryConfig{
				MaxAttempts: retryConfig.MaxAttempts,
				BaseDelay:   retryConfig.BaseDelay,
				MaxDelay:    retryConfig.MaxDelay,
				Conditions:  []ChatRetryCondition{},
				OnRetry:     retryConfig.OnRetry,
				OnFinalFail: retryConfig.OnFinalFail,
			}

			// Scenario expectations, then provider-specific adjustments.
			expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
				"requires_reasoning": true,
			})
			expectations = ModifyExpectationsForProvider(expectations, config.Provider)

			response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus46_Reasoning_Chat", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
				// A new Bifrost context is created for every attempt.
				bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.ChatCompletionRequest(bfCtx, chatReq)
			})
			if chatError != nil {
				t.Fatalf("❌ Opus 4.6 Chat Completions API reasoning test failed after retries: %v", GetErrorMessage(chatError))
			}

			// The answer text must be present.
			content := GetChatContent(response)
			if content == "" {
				t.Error("Expected non-empty response content")
			} else {
				t.Logf("✅ Opus 4.6 reasoning response (first 200 chars): %s", truncateString(content, 200))
			}

			// Reasoning structure is informational only; its absence does not fail the test.
			if validateChatCompletionReasoning(t, response) {
				t.Logf("🧠 Reasoning structure detected in response")
			} else {
				t.Logf("⚠️ No explicit reasoning indicators found in response structure")
			}

			// Announce success only if no earlier check recorded a failure.
			if !t.Failed() {
				t.Log("🎉 Opus 4.6 Chat Completions API reasoning test passed!")
			}
		})
	})
}
// RunOpus46MultiTurnReasoningTest runs a two-turn Chat Completions conversation
// on Opus 4.6 in which the assistant's reasoning details from the first turn are
// sent back as part of the second request.
//
// Step 1 asks a small arithmetic question with "low" effort and captures the
// reply text plus any ReasoningDetails on the first choice. Step 2 replays the
// user prompt, the assistant reply (with those reasoning details attached) and a
// follow-up question. The test asserts that both requests succeed and return
// non-empty content; it does not inspect how the provider used the replayed
// reasoning. If step 1 yields no reasoning details a warning is logged, because
// step 2 then cannot exercise the passthrough path.
//
// The test is skipped when config.SkipOpus46 is set or config.Opus46Model is
// empty, and runs in parallel unless SKIP_PARALLEL_TESTS is "true".
func RunOpus46MultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
	// t.Skip and t.Skipf stop the calling test via runtime.Goexit, so nothing
	// after them executes and no explicit return is needed.
	if config.SkipOpus46 {
		t.Skipf("Skipping Opus 4.6 multi-turn test: %s", config.SkipReason)
	}
	if config.Opus46Model == "" {
		t.Skip("No Opus 4.6 model configured")
	}

	t.Run("Opus46_MultiTurnReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Minimal scenario config consumed by the retry/expectation helpers.
		testConfig := ComprehensiveTestConfig{
			Provider:       config.Provider,
			ReasoningModel: config.Opus46Model,
			Scenarios:      TestScenarios{Reasoning: true},
			Fallbacks:      config.Fallbacks,
		}

		// Step 1: send the initial reasoning request.
		initialPrompt := "What is 15 * 17? Think step by step."
		chatReq := &schemas.BifrostChatRequest{
			Provider: config.Provider,
			Model:    config.Opus46Model,
			Input: []schemas.ChatMessage{
				CreateBasicChatMessage(initialPrompt),
			},
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: config.Fallbacks,
		}

		// Retry settings and expectations are built once and reused for both steps.
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Opus46_MultiTurn_Step1",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"model_version":         "opus-4.6",
				"thinking_mode":         "adaptive",
			},
			TestMetadata: map[string]any{
				"provider": config.Provider,
				"model":    config.Opus46Model,
				"step":     "initial",
			},
		}
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{},
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, config.Provider)

		firstResponse, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus46_MultiTurn_Step1", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			// A new Bifrost context is created for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})
		if chatError != nil {
			t.Fatalf("Step 1 failed: %v", GetErrorMessage(chatError))
		}

		// Step 2 needs the first reply, so an empty one aborts the test.
		firstContent := GetChatContent(firstResponse)
		if firstContent == "" {
			t.Fatal("Step 1: Expected non-empty response content")
		}
		t.Logf("Step 1 response (first 200 chars): %s", truncateString(firstContent, 200))

		// Pull the reasoning details off the first choice, if the response carries any.
		var reasoningDetails []schemas.ChatReasoningDetails
		if len(firstResponse.Choices) > 0 {
			choice := firstResponse.Choices[0]
			if choice.ChatNonStreamResponseChoice != nil &&
				choice.ChatNonStreamResponseChoice.Message != nil &&
				choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage != nil {
				reasoningDetails = choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage.ReasoningDetails
			}
		}
		t.Logf("Step 1: Found %d reasoning detail entries", len(reasoningDetails))
		if len(reasoningDetails) == 0 {
			// Make the weakened coverage visible instead of passing silently.
			t.Logf("⚠️ Step 1 returned no reasoning details; step 2 will not exercise reasoning passthrough")
		}

		// Step 2: replay the conversation with the reasoning details attached to
		// the assistant turn, then ask a follow-up question.
		multiTurnReq := &schemas.BifrostChatRequest{
			Provider: config.Provider,
			Model:    config.Opus46Model,
			Input: []schemas.ChatMessage{
				CreateBasicChatMessage(initialPrompt),
				{
					Role: schemas.ChatMessageRoleAssistant,
					Content: &schemas.ChatMessageContent{
						ContentStr: &firstContent,
					},
					ChatAssistantMessage: &schemas.ChatAssistantMessage{
						ReasoningDetails: reasoningDetails,
					},
				},
				CreateBasicChatMessage("Now multiply that result by 2."),
			},
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: config.Fallbacks,
		}
		retryContext2 := TestRetryContext{
			ScenarioName: "Opus46_MultiTurn_Step2",
			ExpectedBehavior: map[string]any{
				"multi_turn":    true,
				"model_version": "opus-4.6",
				"thinking_mode": "adaptive",
			},
			TestMetadata: map[string]any{
				"provider": config.Provider,
				"model":    config.Opus46Model,
				"step":     "follow_up",
			},
		}

		secondResponse, chatError2 := WithChatTestRetry(t, chatRetryConfig, retryContext2, expectations, "Opus46_MultiTurn_Step2", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			// A new Bifrost context is created for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, multiTurnReq)
		})
		if chatError2 != nil {
			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(chatError2))
		}

		secondContent := GetChatContent(secondResponse)
		if secondContent == "" {
			t.Error("Step 2: Expected non-empty response content")
		} else {
			t.Logf("Step 2 response (first 200 chars): %s", truncateString(secondContent, 200))
		}

		// Announce success only if no earlier check recorded a failure.
		if !t.Failed() {
			t.Log("Multi-turn reasoning passthrough test passed!")
		}
	})
}
// RunAllOpusReasoningTests runs the full Opus reasoning suite for one provider.
// Under a "<provider>_OpusReasoning" parent subtest it invokes, in order, the
// Opus 4.5 test, the Opus 4.6 test and the Opus 4.6 multi-turn test, each in its
// own named subtest.
func RunAllOpusReasoningTests(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
	// Subtest names paired with the runner each one delegates to.
	suites := []struct {
		name string
		run  func(*testing.T, *bifrost.Bifrost, context.Context, OpusReasoningTestConfig)
	}{
		{"Opus45", RunOpus45ReasoningTest},
		{"Opus46", RunOpus46ReasoningTest},
		{"Opus46_MultiTurn", RunOpus46MultiTurnReasoningTest},
	}

	groupName := string(config.Provider) + "_OpusReasoning"
	t.Run(groupName, func(t *testing.T) {
		for _, suite := range suites {
			suite := suite // per-iteration copy for the closure (needed before Go 1.22)
			t.Run(suite.name, func(t *testing.T) {
				suite.run(t, client, ctx, config)
			})
		}
	})
}