first commit
This commit is contained in:
643
core/internal/llmtests/reasoning_opus.go
Normal file
643
core/internal/llmtests/reasoning_opus.go
Normal file
@@ -0,0 +1,643 @@
|
||||
package llmtests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
bifrost "github.com/maximhq/bifrost/core"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
)
|
||||
|
||||
// OpusReasoningTestConfig holds configuration for Opus-specific reasoning tests
|
||||
type OpusReasoningTestConfig struct {
|
||||
Provider schemas.ModelProvider
|
||||
Opus45Model string // Opus 4.5 model identifier
|
||||
Opus46Model string // Opus 4.6 model identifier
|
||||
Fallbacks []schemas.Fallback
|
||||
SkipOpus45 bool // Skip Opus 4.5 tests
|
||||
SkipOpus46 bool // Skip Opus 4.6 tests
|
||||
SkipReason string // Reason for skipping
|
||||
}
|
||||
|
||||
// GetOpusReasoningTestConfigs returns test configurations for Opus reasoning across providers
|
||||
func GetOpusReasoningTestConfigs() []OpusReasoningTestConfig {
|
||||
return []OpusReasoningTestConfig{
|
||||
{
|
||||
Provider: schemas.Anthropic,
|
||||
Opus45Model: "claude-opus-4-5-20251101",
|
||||
Opus46Model: "claude-opus-4-6-20260210",
|
||||
Fallbacks: []schemas.Fallback{},
|
||||
},
|
||||
{
|
||||
Provider: schemas.Bedrock,
|
||||
Opus45Model: "global.anthropic.claude-opus-4-5-20251101-v1:0",
|
||||
Opus46Model: "global.anthropic.claude-opus-4-6-v1",
|
||||
Fallbacks: []schemas.Fallback{},
|
||||
},
|
||||
{
|
||||
Provider: schemas.Azure,
|
||||
Opus45Model: "claude-opus-4-5", // Uses deployment name
|
||||
Opus46Model: "claude-opus-4-6", // Uses deployment name
|
||||
Fallbacks: []schemas.Fallback{},
|
||||
},
|
||||
{
|
||||
Provider: schemas.Vertex,
|
||||
Opus45Model: "claude-opus-4-5", // Uses deployment name
|
||||
Opus46Model: "claude-opus-4-6", // Uses deployment name
|
||||
Fallbacks: []schemas.Fallback{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// RunOpus45ReasoningTest tests extended thinking with Opus 4.5 (budget_tokens mode)
|
||||
func RunOpus45ReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
|
||||
if config.SkipOpus45 {
|
||||
t.Skipf("Skipping Opus 4.5 test: %s", config.SkipReason)
|
||||
return
|
||||
}
|
||||
|
||||
if config.Opus45Model == "" {
|
||||
t.Skip("No Opus 4.5 model configured")
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("Opus45_ExtendedThinking", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
// Complex reasoning problem
|
||||
problemPrompt := "Solve this step by step: A train leaves station A at 9:00 AM traveling at 60 mph. Another train leaves station B (300 miles away) at 10:00 AM traveling towards station A at 80 mph. At what time will they meet, and how far from station A?"
|
||||
|
||||
// Create a test config for retry framework
|
||||
testConfig := ComprehensiveTestConfig{
|
||||
Provider: config.Provider,
|
||||
ReasoningModel: config.Opus45Model,
|
||||
Scenarios: TestScenarios{
|
||||
Reasoning: true,
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Test via Responses API
|
||||
t.Run("ResponsesAPI", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
responsesMessages := []schemas.ResponsesMessage{
|
||||
CreateBasicResponsesMessage(problemPrompt),
|
||||
}
|
||||
|
||||
responsesReq := &schemas.BifrostResponsesRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus45Model,
|
||||
Input: responsesMessages,
|
||||
Params: &schemas.ResponsesParameters{
|
||||
MaxOutputTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ResponsesParametersReasoning{
|
||||
Effort: bifrost.Ptr("high"),
|
||||
},
|
||||
Include: []string{"reasoning.encrypted_content"},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework with enhanced validation for reasoning
|
||||
retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Opus45_Reasoning_Responses",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_show_reasoning": true,
|
||||
"mathematical_problem": true,
|
||||
"step_by_step": true,
|
||||
"model_version": "opus-4.5",
|
||||
"thinking_mode": "budget_tokens",
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus45Model,
|
||||
"problem_type": "mathematical",
|
||||
"complexity": "high",
|
||||
"expects_reasoning": true,
|
||||
},
|
||||
}
|
||||
responsesRetryConfig := ResponsesRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []ResponsesRetryCondition{},
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
// Enhanced validation for reasoning scenarios
|
||||
expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
|
||||
"requires_reasoning": true,
|
||||
})
|
||||
expectations = ModifyExpectationsForProvider(expectations, config.Provider)
|
||||
|
||||
response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Opus45_Reasoning_Responses", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ResponsesRequest(bfCtx, responsesReq)
|
||||
})
|
||||
|
||||
if responsesError != nil {
|
||||
t.Fatalf("❌ Opus 4.5 Responses API reasoning test failed after retries: %v", GetErrorMessage(responsesError))
|
||||
}
|
||||
|
||||
// Validate response has content
|
||||
content := GetResponsesContent(response)
|
||||
if content == "" {
|
||||
t.Error("Expected non-empty response content")
|
||||
} else {
|
||||
t.Logf("✅ Opus 4.5 reasoning response (first 200 chars): %s", truncateString(content, 200))
|
||||
}
|
||||
|
||||
// Check for reasoning indicators
|
||||
reasoningDetected := validateResponsesAPIReasoning(t, response)
|
||||
if !reasoningDetected {
|
||||
t.Logf("⚠️ No explicit reasoning indicators found in response structure")
|
||||
} else {
|
||||
t.Logf("🧠 Reasoning structure detected in response")
|
||||
}
|
||||
|
||||
t.Log("🎉 Opus 4.5 Responses API reasoning test passed!")
|
||||
})
|
||||
|
||||
// Test via Chat Completions API
|
||||
t.Run("ChatCompletionsAPI", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
chatMessages := []schemas.ChatMessage{
|
||||
CreateBasicChatMessage(problemPrompt),
|
||||
}
|
||||
|
||||
chatReq := &schemas.BifrostChatRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus45Model,
|
||||
Input: chatMessages,
|
||||
Params: &schemas.ChatParameters{
|
||||
MaxCompletionTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ChatReasoning{
|
||||
Effort: bifrost.Ptr("high"),
|
||||
MaxTokens: bifrost.Ptr(2000), // Budget tokens for Opus 4.5
|
||||
},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework with enhanced validation for reasoning
|
||||
retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Opus45_Reasoning_Chat",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_show_reasoning": true,
|
||||
"mathematical_problem": true,
|
||||
"step_by_step": true,
|
||||
"model_version": "opus-4.5",
|
||||
"thinking_mode": "budget_tokens",
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus45Model,
|
||||
"problem_type": "mathematical",
|
||||
"complexity": "high",
|
||||
"expects_reasoning": true,
|
||||
},
|
||||
}
|
||||
chatRetryConfig := ChatRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []ChatRetryCondition{},
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
// Enhanced validation for reasoning scenarios
|
||||
expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
|
||||
"requires_reasoning": true,
|
||||
})
|
||||
expectations = ModifyExpectationsForProvider(expectations, config.Provider)
|
||||
|
||||
response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus45_Reasoning_Chat", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ChatCompletionRequest(bfCtx, chatReq)
|
||||
})
|
||||
|
||||
if chatError != nil {
|
||||
t.Fatalf("❌ Opus 4.5 Chat Completions API reasoning test failed after retries: %v", GetErrorMessage(chatError))
|
||||
}
|
||||
|
||||
// Validate response has content
|
||||
content := GetChatContent(response)
|
||||
if content == "" {
|
||||
t.Error("Expected non-empty response content")
|
||||
} else {
|
||||
t.Logf("✅ Opus 4.5 reasoning response (first 200 chars): %s", truncateString(content, 200))
|
||||
}
|
||||
|
||||
// Check for reasoning indicators
|
||||
reasoningDetected := validateChatCompletionReasoning(t, response)
|
||||
if !reasoningDetected {
|
||||
t.Logf("⚠️ No explicit reasoning indicators found in response structure")
|
||||
} else {
|
||||
t.Logf("🧠 Reasoning structure detected in response")
|
||||
}
|
||||
|
||||
t.Log("🎉 Opus 4.5 Chat Completions API reasoning test passed!")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// RunOpus46ReasoningTest tests adaptive thinking with Opus 4.6 (adaptive mode + effort)
|
||||
func RunOpus46ReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
|
||||
if config.SkipOpus46 {
|
||||
t.Skipf("Skipping Opus 4.6 test: %s", config.SkipReason)
|
||||
return
|
||||
}
|
||||
|
||||
if config.Opus46Model == "" {
|
||||
t.Skip("No Opus 4.6 model configured")
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("Opus46_AdaptiveThinking", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
// Complex reasoning problem that benefits from adaptive thinking
|
||||
problemPrompt := "Analyze this logic puzzle: Five people (A, B, C, D, E) are sitting in a row. A is not at either end. B is somewhere to the left of C. D is not next to E. E is at one of the ends. In how many different valid arrangements can they sit? Show your reasoning."
|
||||
|
||||
// Create a test config for retry framework
|
||||
testConfig := ComprehensiveTestConfig{
|
||||
Provider: config.Provider,
|
||||
ReasoningModel: config.Opus46Model,
|
||||
Scenarios: TestScenarios{
|
||||
Reasoning: true,
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Test via Responses API with different effort levels
|
||||
effortLevels := []string{"low", "medium", "high"}
|
||||
|
||||
for _, effort := range effortLevels {
|
||||
effort := effort // capture range variable
|
||||
t.Run("ResponsesAPI_Effort_"+effort, func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
responsesMessages := []schemas.ResponsesMessage{
|
||||
CreateBasicResponsesMessage(problemPrompt),
|
||||
}
|
||||
|
||||
responsesReq := &schemas.BifrostResponsesRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus46Model,
|
||||
Input: responsesMessages,
|
||||
Params: &schemas.ResponsesParameters{
|
||||
MaxOutputTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ResponsesParametersReasoning{
|
||||
Effort: bifrost.Ptr(effort), // Adaptive thinking uses effort parameter
|
||||
},
|
||||
Include: []string{"reasoning.encrypted_content"},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework with enhanced validation for reasoning
|
||||
retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Opus46_Reasoning_Responses_" + effort,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_show_reasoning": true,
|
||||
"logic_puzzle": true,
|
||||
"step_by_step": true,
|
||||
"model_version": "opus-4.6",
|
||||
"thinking_mode": "adaptive",
|
||||
"effort_level": effort,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus46Model,
|
||||
"problem_type": "logic_puzzle",
|
||||
"complexity": "high",
|
||||
"expects_reasoning": true,
|
||||
"effort": effort,
|
||||
},
|
||||
}
|
||||
responsesRetryConfig := ResponsesRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []ResponsesRetryCondition{},
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
// Enhanced validation for reasoning scenarios
|
||||
expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
|
||||
"requires_reasoning": true,
|
||||
})
|
||||
expectations = ModifyExpectationsForProvider(expectations, config.Provider)
|
||||
|
||||
response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Opus46_Reasoning_Responses_"+effort, func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ResponsesRequest(bfCtx, responsesReq)
|
||||
})
|
||||
|
||||
if responsesError != nil {
|
||||
t.Fatalf("❌ Opus 4.6 Responses API (effort=%s) reasoning test failed after retries: %v", effort, GetErrorMessage(responsesError))
|
||||
}
|
||||
|
||||
// Validate response has content
|
||||
content := GetResponsesContent(response)
|
||||
if content == "" {
|
||||
t.Errorf("Expected non-empty response content for effort=%s", effort)
|
||||
} else {
|
||||
t.Logf("✅ Opus 4.6 (effort=%s) response (first 200 chars): %s", effort, truncateString(content, 200))
|
||||
}
|
||||
|
||||
// Check for reasoning indicators
|
||||
reasoningDetected := validateResponsesAPIReasoning(t, response)
|
||||
if !reasoningDetected {
|
||||
t.Logf("⚠️ No explicit reasoning indicators found in response structure")
|
||||
} else {
|
||||
t.Logf("🧠 Reasoning structure detected in response")
|
||||
}
|
||||
|
||||
t.Logf("🎉 Opus 4.6 Responses API (effort=%s) reasoning test passed!", effort)
|
||||
})
|
||||
}
|
||||
|
||||
// Test via Chat Completions API
|
||||
t.Run("ChatCompletionsAPI", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
chatMessages := []schemas.ChatMessage{
|
||||
CreateBasicChatMessage(problemPrompt),
|
||||
}
|
||||
|
||||
chatReq := &schemas.BifrostChatRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus46Model,
|
||||
Input: chatMessages,
|
||||
Params: &schemas.ChatParameters{
|
||||
MaxCompletionTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ChatReasoning{
|
||||
Effort: bifrost.Ptr("high"), // Opus 4.6 uses adaptive thinking with effort
|
||||
// Note: MaxTokens (budget_tokens) is NOT used for Opus 4.6
|
||||
},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework with enhanced validation for reasoning
|
||||
retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Opus46_Reasoning_Chat",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_show_reasoning": true,
|
||||
"logic_puzzle": true,
|
||||
"step_by_step": true,
|
||||
"model_version": "opus-4.6",
|
||||
"thinking_mode": "adaptive",
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus46Model,
|
||||
"problem_type": "logic_puzzle",
|
||||
"complexity": "high",
|
||||
"expects_reasoning": true,
|
||||
},
|
||||
}
|
||||
chatRetryConfig := ChatRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []ChatRetryCondition{},
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
// Enhanced validation for reasoning scenarios
|
||||
expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
|
||||
"requires_reasoning": true,
|
||||
})
|
||||
expectations = ModifyExpectationsForProvider(expectations, config.Provider)
|
||||
|
||||
response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus46_Reasoning_Chat", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ChatCompletionRequest(bfCtx, chatReq)
|
||||
})
|
||||
|
||||
if chatError != nil {
|
||||
t.Fatalf("❌ Opus 4.6 Chat Completions API reasoning test failed after retries: %v", GetErrorMessage(chatError))
|
||||
}
|
||||
|
||||
// Validate response has content
|
||||
content := GetChatContent(response)
|
||||
if content == "" {
|
||||
t.Error("Expected non-empty response content")
|
||||
} else {
|
||||
t.Logf("✅ Opus 4.6 reasoning response (first 200 chars): %s", truncateString(content, 200))
|
||||
}
|
||||
|
||||
// Check for reasoning indicators
|
||||
reasoningDetected := validateChatCompletionReasoning(t, response)
|
||||
if !reasoningDetected {
|
||||
t.Logf("⚠️ No explicit reasoning indicators found in response structure")
|
||||
} else {
|
||||
t.Logf("🧠 Reasoning structure detected in response")
|
||||
}
|
||||
|
||||
t.Log("🎉 Opus 4.6 Chat Completions API reasoning test passed!")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// RunOpus46MultiTurnReasoningTest tests multi-turn conversations with reasoning content passthrough.
|
||||
// This verifies that reasoning details (text + signature) from assistant messages are correctly
|
||||
// passed back to the model in follow-up turns.
|
||||
func RunOpus46MultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
|
||||
if config.SkipOpus46 {
|
||||
t.Skipf("Skipping Opus 4.6 multi-turn test: %s", config.SkipReason)
|
||||
return
|
||||
}
|
||||
|
||||
if config.Opus46Model == "" {
|
||||
t.Skip("No Opus 4.6 model configured")
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("Opus46_MultiTurnReasoning", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
testConfig := ComprehensiveTestConfig{
|
||||
Provider: config.Provider,
|
||||
ReasoningModel: config.Opus46Model,
|
||||
Scenarios: TestScenarios{Reasoning: true},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
// Step 1: Send initial reasoning request
|
||||
initialPrompt := "What is 15 * 17? Think step by step."
|
||||
chatMessages := []schemas.ChatMessage{
|
||||
CreateBasicChatMessage(initialPrompt),
|
||||
}
|
||||
|
||||
chatReq := &schemas.BifrostChatRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus46Model,
|
||||
Input: chatMessages,
|
||||
Params: &schemas.ChatParameters{
|
||||
MaxCompletionTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ChatReasoning{
|
||||
Effort: bifrost.Ptr("low"),
|
||||
},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Opus46_MultiTurn_Step1",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_show_reasoning": true,
|
||||
"model_version": "opus-4.6",
|
||||
"thinking_mode": "adaptive",
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus46Model,
|
||||
"step": "initial",
|
||||
},
|
||||
}
|
||||
chatRetryConfig := ChatRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []ChatRetryCondition{},
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
|
||||
"requires_reasoning": true,
|
||||
})
|
||||
expectations = ModifyExpectationsForProvider(expectations, config.Provider)
|
||||
|
||||
firstResponse, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Opus46_MultiTurn_Step1", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ChatCompletionRequest(bfCtx, chatReq)
|
||||
})
|
||||
|
||||
if chatError != nil {
|
||||
t.Fatalf("Step 1 failed: %v", GetErrorMessage(chatError))
|
||||
}
|
||||
|
||||
firstContent := GetChatContent(firstResponse)
|
||||
if firstContent == "" {
|
||||
t.Fatal("Step 1: Expected non-empty response content")
|
||||
}
|
||||
t.Logf("Step 1 response (first 200 chars): %s", truncateString(firstContent, 200))
|
||||
|
||||
// Extract reasoning details from first response
|
||||
var reasoningDetails []schemas.ChatReasoningDetails
|
||||
if len(firstResponse.Choices) > 0 {
|
||||
choice := firstResponse.Choices[0]
|
||||
if choice.ChatNonStreamResponseChoice != nil &&
|
||||
choice.ChatNonStreamResponseChoice.Message != nil &&
|
||||
choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage != nil {
|
||||
reasoningDetails = choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage.ReasoningDetails
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("Step 1: Found %d reasoning detail entries", len(reasoningDetails))
|
||||
|
||||
// Step 2: Build multi-turn conversation with reasoning details passed back
|
||||
multiTurnMessages := []schemas.ChatMessage{
|
||||
CreateBasicChatMessage(initialPrompt),
|
||||
{
|
||||
Role: schemas.ChatMessageRoleAssistant,
|
||||
Content: &schemas.ChatMessageContent{
|
||||
ContentStr: &firstContent,
|
||||
},
|
||||
ChatAssistantMessage: &schemas.ChatAssistantMessage{
|
||||
ReasoningDetails: reasoningDetails,
|
||||
},
|
||||
},
|
||||
CreateBasicChatMessage("Now multiply that result by 2."),
|
||||
}
|
||||
|
||||
multiTurnReq := &schemas.BifrostChatRequest{
|
||||
Provider: config.Provider,
|
||||
Model: config.Opus46Model,
|
||||
Input: multiTurnMessages,
|
||||
Params: &schemas.ChatParameters{
|
||||
MaxCompletionTokens: bifrost.Ptr(4000),
|
||||
Reasoning: &schemas.ChatReasoning{
|
||||
Effort: bifrost.Ptr("low"),
|
||||
},
|
||||
},
|
||||
Fallbacks: config.Fallbacks,
|
||||
}
|
||||
|
||||
retryContext2 := TestRetryContext{
|
||||
ScenarioName: "Opus46_MultiTurn_Step2",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"multi_turn": true,
|
||||
"model_version": "opus-4.6",
|
||||
"thinking_mode": "adaptive",
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": config.Provider,
|
||||
"model": config.Opus46Model,
|
||||
"step": "follow_up",
|
||||
},
|
||||
}
|
||||
|
||||
secondResponse, chatError2 := WithChatTestRetry(t, chatRetryConfig, retryContext2, expectations, "Opus46_MultiTurn_Step2", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.ChatCompletionRequest(bfCtx, multiTurnReq)
|
||||
})
|
||||
|
||||
if chatError2 != nil {
|
||||
t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(chatError2))
|
||||
}
|
||||
|
||||
secondContent := GetChatContent(secondResponse)
|
||||
if secondContent == "" {
|
||||
t.Error("Step 2: Expected non-empty response content")
|
||||
} else {
|
||||
t.Logf("Step 2 response (first 200 chars): %s", truncateString(secondContent, 200))
|
||||
}
|
||||
|
||||
t.Log("Multi-turn reasoning passthrough test passed!")
|
||||
})
|
||||
}
|
||||
|
||||
// RunAllOpusReasoningTests runs Opus 4.5 and 4.6 reasoning tests for a given provider
|
||||
func RunAllOpusReasoningTests(t *testing.T, client *bifrost.Bifrost, ctx context.Context, config OpusReasoningTestConfig) {
|
||||
t.Run(string(config.Provider)+"_OpusReasoning", func(t *testing.T) {
|
||||
t.Run("Opus45", func(t *testing.T) {
|
||||
RunOpus45ReasoningTest(t, client, ctx, config)
|
||||
})
|
||||
t.Run("Opus46", func(t *testing.T) {
|
||||
RunOpus46ReasoningTest(t, client, ctx, config)
|
||||
})
|
||||
t.Run("Opus46_MultiTurn", func(t *testing.T) {
|
||||
RunOpus46MultiTurnReasoningTest(t, client, ctx, config)
|
||||
})
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user