Files
bifrost/core/internal/llmtests/reasoning.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

584 lines
21 KiB
Go
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package llmtests
import (
"context"
"os"
"testing"
bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/schemas"
)
// RunResponsesReasoningTest executes the reasoning test scenario to test thinking capabilities via Responses API only.
//
// Nothing is run (a log line is emitted instead) when testConfig.Scenarios.Reasoning is false
// or testConfig.ReasoningModel is empty. Otherwise a "ResponsesReasoning" subtest is started,
// marked parallel unless the SKIP_PARALLEL_TESTS environment variable equals "true". The subtest
// sends a multi-step arithmetic word problem through client.ResponsesRequest, wrapped in
// WithResponsesTestRetry, and fails fatally only if that call still reports an error.
// The result of validateResponsesAPIReasoning is used for logging only.
func RunResponsesReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ResponsesReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		responsesMessages := []schemas.ResponsesMessage{
			CreateBasicResponsesMessage(problemPrompt),
		}

		// Execute Responses API test with retries
		responsesReq := &schemas.BifrostResponsesRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    responsesMessages,
			Params: &schemas.ResponsesParameters{
				// Reasoning models (o3, o4-mini) allocate tokens between reasoning and text output.
				// Note: Older o1 models may not return message output via Responses API - use o3/o4-mini.
				// OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs.
				// See: https://platform.openai.com/docs/guides/reasoning#allocating-space-for-reasoning
				MaxOutputTokens: bifrost.Ptr(25000),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ResponsesParametersReasoning{
					Effort: bifrost.Ptr("high"), // High effort for complex reasoning
					// Summary: bifrost.Ptr("detailed"), // Detailed summary of reasoning process
				},
				// Include reasoning content in response
				Include: []string{"reasoning.encrypted_content"},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]interface{}{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]interface{}{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		responsesRetryConfig := ResponsesRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ResponsesRetryCondition{}, // Add specific responses retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		// A fresh Bifrost context is derived from ctx for every attempt made by the retry wrapper.
		response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ResponsesRequest(bfCtx, responsesReq)
		})
		if responsesError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(responsesError))
		}

		// Log the response content
		responsesContent := GetResponsesContent(response)
		if responsesContent == "" {
			t.Logf("✅ Responses API reasoning result: <no content>")
		} else {
			// Cap the preview at 300 bytes, then back up over any UTF-8 continuation
			// bytes (10xxxxxx) so the log line never ends in a split multi-byte character.
			cut := len(responsesContent)
			if cut > 300 {
				cut = 300
				for cut > 0 && responsesContent[cut]&0xC0 == 0x80 {
					cut--
				}
			}
			t.Logf("✅ Responses API reasoning result: %s", responsesContent[:cut])
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateResponsesAPIReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Responses API passed Reasoning test!")
	})
}
// validateResponsesAPIReasoning performs additional validation specific to Responses API reasoning features.
//
// It walks response.Output looking for messages typed ResponsesMessageTypeReasoning and for
// content blocks typed ResponsesOutputMessageContentTypeReasoning, then checks response.Usage
// for a positive reasoning-token count. Everything it finds is reported through t.Logf; the
// function itself never fails the test.
//
// Returns true if reasoning indicators are found (a reasoning message, a reasoning content
// block, or reasoning tokens). Returns false immediately when response or response.Output is nil.
func validateResponsesAPIReasoning(t *testing.T, response *schemas.BifrostResponsesResponse) bool {
	if response == nil || response.Output == nil {
		return false
	}

	reasoningFound := false
	summaryFound := false
	reasoningContentFound := false

	// Check if response contains reasoning messages or reasoning content
	for _, message := range response.Output {
		// Check for ResponsesMessageTypeReasoning
		if message.Type != nil && *message.Type == schemas.ResponsesMessageTypeReasoning {
			reasoningFound = true
			t.Logf("🧠 Found ResponsesMessageTypeReasoning message in response")

			if reasoning := message.ResponsesReasoning; reasoning != nil {
				// Check for reasoning summary content
				if len(reasoning.Summary) > 0 {
					summaryFound = true
					t.Logf("📝 Found reasoning summary with %d content blocks", len(reasoning.Summary))

					// Log first summary block for debugging
					if summaryText := reasoning.Summary[0].Text; len(summaryText) > 0 {
						// Cap the preview at 200 bytes, then back up over UTF-8 continuation
						// bytes (10xxxxxx) so a multi-byte character is never cut in half.
						cut := len(summaryText)
						if cut > 200 {
							cut = 200
							for cut > 0 && summaryText[cut]&0xC0 == 0x80 {
								cut--
							}
						}
						t.Logf("📋 First reasoning summary: %s", summaryText[:cut])
					} else {
						t.Logf("📋 First reasoning summary: (empty)")
					}
				}

				// Check for encrypted reasoning content
				if reasoning.EncryptedContent != nil {
					t.Logf("🔐 Found encrypted reasoning content")
				}
			}
		}

		// Check for content blocks with ResponsesOutputMessageContentTypeReasoning
		if message.Content != nil && message.Content.ContentBlocks != nil {
			for _, block := range message.Content.ContentBlocks {
				if block.Type == schemas.ResponsesOutputMessageContentTypeReasoning {
					reasoningContentFound = true
					t.Logf("🔍 Found ResponsesOutputMessageContentTypeReasoning content block")
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.OutputTokensDetails != nil &&
		response.Usage.OutputTokensDetails.ReasoningTokens > 0 {
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.OutputTokensDetails.ReasoningTokens)
		reasoningFound = true // Reasoning tokens indicate reasoning was performed
	}

	// Log findings. summaryFound only adds a log line; it is set solely inside the
	// reasoning-message branch, where reasoningFound is already true.
	detected := reasoningFound || reasoningContentFound
	if detected {
		t.Logf("✅ Responses API reasoning indicators detected")
		if reasoningFound {
			t.Logf(" - ResponsesMessageTypeReasoning or reasoning tokens found")
		}
		if reasoningContentFound {
			t.Logf(" - ResponsesOutputMessageContentTypeReasoning content blocks found")
		}
		if summaryFound {
			t.Logf(" - Reasoning summary content found")
		}
	} else {
		t.Logf(" No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}
// RunChatCompletionReasoningTest executes the reasoning test scenario to test thinking capabilities via Chat Completions API.
//
// Nothing is run (a log line is emitted instead) when testConfig.Scenarios.Reasoning is false
// or testConfig.ReasoningModel is empty. Otherwise a "ChatCompletionReasoning" subtest is
// started, marked parallel unless the SKIP_PARALLEL_TESTS environment variable equals "true",
// and skipped outright for the OpenAI provider. The subtest sends a multi-step arithmetic word
// problem through client.ChatCompletionRequest, wrapped in WithChatTestRetry, and fails fatally
// only if that call still reports an error. The result of validateChatCompletionReasoning is
// used for logging only.
func RunChatCompletionReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ChatCompletionReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// OpenAI because reasoning for them in chat completions is extremely flaky.
			// t.Skip does not return (it ends the test goroutine), so no explicit return is needed.
			t.Skip("Skipping ChatCompletionReasoning test for OpenAI")
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(problemPrompt),
		}

		// Execute Chat Completions API test with retries
		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(1800),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ChatReasoning{
					Effort:    bifrost.Ptr("high"), // High effort for complex reasoning
					MaxTokens: bifrost.Ptr(1500),   // Maximum tokens for reasoning output
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]interface{}{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]interface{}{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{}, // Add specific chat retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]interface{}{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		// A fresh Bifrost context is derived from ctx for every attempt made by the retry wrapper.
		response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})
		if chatError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(chatError))
		}

		// Log the response content
		chatContent := GetChatContent(response)
		if chatContent == "" {
			t.Logf("✅ Chat Completions API reasoning result: <no content>")
		} else {
			// Cap the preview at 300 bytes, then back up over any UTF-8 continuation
			// bytes (10xxxxxx) so the log line never ends in a split multi-byte character.
			cut := len(chatContent)
			if cut > 300 {
				cut = 300
				for cut > 0 && chatContent[cut]&0xC0 == 0x80 {
					cut--
				}
			}
			t.Logf("✅ Chat Completions API reasoning result: %s", chatContent[:cut])
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateChatCompletionReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Chat Completions API passed Reasoning test!")
	})
}
// validateChatCompletionReasoning performs additional validation specific to Chat Completions API reasoning features.
//
// For every non-stream choice that carries an assistant message it looks for a non-empty
// Reasoning string and for ReasoningDetails entries, then checks response.Usage for a positive
// reasoning-token count. Everything it finds is reported through t.Logf; the function itself
// never fails the test.
//
// Returns true if reasoning indicators are found (reasoning text, reasoning details, or
// reasoning tokens). Returns false immediately when response is nil or has no choices.
func validateChatCompletionReasoning(t *testing.T, response *schemas.BifrostChatResponse) bool {
	if response == nil || len(response.Choices) == 0 {
		return false
	}

	reasoningFound := false
	reasoningDetailsFound := false
	reasoningTokensFound := false

	// Check each choice for reasoning indicators
	for _, choice := range response.Choices {
		// Reasoning data is only read from non-stream choices that carry an assistant message.
		if choice.ChatNonStreamResponseChoice == nil || choice.ChatNonStreamResponseChoice.Message == nil {
			continue
		}
		assistant := choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage
		if assistant == nil {
			continue
		}

		// Check for reasoning content in message (for backward compatibility)
		if assistant.Reasoning != nil && *assistant.Reasoning != "" {
			reasoningFound = true
			reasoningText := *assistant.Reasoning
			t.Logf("🧠 Found reasoning content in message (length: %d)", len(reasoningText))

			// Log at most the first 200 bytes for debugging, backing up over UTF-8
			// continuation bytes (10xxxxxx) so a multi-byte character is never cut in half.
			cut := len(reasoningText)
			if cut > 200 {
				cut = 200
				for cut > 0 && reasoningText[cut]&0xC0 == 0x80 {
					cut--
				}
			}
			t.Logf("📋 First reasoning content: %s", reasoningText[:cut])
		}

		// Check for reasoning details array
		if len(assistant.ReasoningDetails) > 0 {
			reasoningDetailsFound = true
			t.Logf("📝 Found %d reasoning details entries", len(assistant.ReasoningDetails))

			// Log details about each reasoning entry
			for i, detail := range assistant.ReasoningDetails {
				t.Logf(" - Entry %d: Type=%s, Index=%d", i, detail.Type, detail.Index)
				switch detail.Type {
				case schemas.BifrostReasoningDetailsTypeSummary:
					if detail.Summary != nil {
						t.Logf(" Summary length: %d", len(*detail.Summary))
					}
				case schemas.BifrostReasoningDetailsTypeText:
					if detail.Text != nil {
						detailText := *detail.Text
						t.Logf(" Text length: %d", len(detailText))
						if len(detailText) > 0 {
							// Same rune-boundary-safe cut as above, limited to 150 bytes.
							end := len(detailText)
							if end > 150 {
								end = 150
								for end > 0 && detailText[end]&0xC0 == 0x80 {
									end--
								}
							}
							t.Logf(" Text preview: %s", detailText[:end])
						}
					}
				case schemas.BifrostReasoningDetailsTypeEncrypted:
					if detail.Data != nil {
						t.Logf(" Encrypted data length: %d", len(*detail.Data))
					}
					if detail.Signature != nil {
						t.Logf(" Signature present: %d bytes", len(*detail.Signature))
					}
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.CompletionTokensDetails != nil &&
		response.Usage.CompletionTokensDetails.ReasoningTokens > 0 {
		reasoningTokensFound = true
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.CompletionTokensDetails.ReasoningTokens)
	}

	// Log findings
	detected := reasoningFound || reasoningDetailsFound || reasoningTokensFound
	if detected {
		t.Logf("✅ Chat Completions API reasoning indicators detected")
		if reasoningFound {
			t.Logf(" - Reasoning content found in message")
		}
		if reasoningDetailsFound {
			t.Logf(" - Reasoning details array found")
		}
		if reasoningTokensFound {
			t.Logf(" - Reasoning tokens usage reported")
		}
	} else {
		t.Logf(" No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}
// RunMultiTurnReasoningTest tests multi-turn conversations with reasoning content passthrough.
//
// Step 1 asks a short arithmetic question and requires a non-empty answer. Whatever
// ReasoningDetails the first choice's assistant message carries are then attached to an
// assistant turn in a second request, followed by a follow-up user question. Step 2 fails the
// test if that request errors, and records an error if its answer is empty.
//
// Nothing is run when reasoning is disabled in testConfig or no reasoning model is set; the
// subtest is skipped for the OpenAI provider.
func RunMultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	switch {
	case !testConfig.Scenarios.Reasoning:
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	case testConfig.ReasoningModel == "":
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("MultiTurnReasoning", func(t *testing.T) {
		if parallelOptOut := os.Getenv("SKIP_PARALLEL_TESTS"); parallelOptOut != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			t.Skip("Skipping MultiTurnReasoning test for OpenAI")
			return
		}

		const (
			stepOneName = "MultiTurnReasoning_Step1"
			stepTwoName = "MultiTurnReasoning_Step2"
		)

		// Both turns share one request shape; every call builds fresh parameter values.
		buildRequest := func(turns []schemas.ChatMessage) *schemas.BifrostChatRequest {
			return &schemas.BifrostChatRequest{
				Provider: testConfig.Provider,
				Model:    testConfig.ReasoningModel,
				Input:    turns,
				Params: &schemas.ChatParameters{
					MaxCompletionTokens: bifrost.Ptr(4000),
					Reasoning: &schemas.ChatReasoning{
						Effort: bifrost.Ptr("low"),
					},
				},
				Fallbacks: testConfig.Fallbacks,
			}
		}

		// dispatch adapts a request to the callback the retry wrapper invokes; a new
		// Bifrost context is derived from ctx on every attempt.
		dispatch := func(req *schemas.BifrostChatRequest) func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			return func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
				return client.ChatCompletionRequest(schemas.NewBifrostContext(ctx, schemas.NoDeadline), req)
			}
		}

		// Step 1: Send initial reasoning request
		question := "What is 15 * 17? Think step by step."
		openingReq := buildRequest([]schemas.ChatMessage{
			CreateBasicChatMessage(question),
		})

		baseRetry := GetTestRetryConfigForScenario("Reasoning", testConfig)
		stepOneCtx := TestRetryContext{
			ScenarioName: stepOneName,
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"multi_turn":            true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "initial",
			},
		}
		retryPolicy := ChatRetryConfig{
			MaxAttempts: baseRetry.MaxAttempts,
			BaseDelay:   baseRetry.BaseDelay,
			MaxDelay:    baseRetry.MaxDelay,
			Conditions:  []ChatRetryCondition{},
			OnRetry:     baseRetry.OnRetry,
			OnFinalFail: baseRetry.OnFinalFail,
		}

		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		firstResponse, stepOneErr := WithChatTestRetry(t, retryPolicy, stepOneCtx, expectations, stepOneName, dispatch(openingReq))
		if stepOneErr != nil {
			t.Fatalf("Step 1 failed: %v", GetErrorMessage(stepOneErr))
		}

		firstAnswer := GetChatContent(firstResponse)
		if firstAnswer == "" {
			t.Fatal("Step 1: Expected non-empty response content")
		}
		t.Logf("Step 1 response: %s", truncateString(firstAnswer, 200))

		// Extract reasoning details from first response (first choice only)
		var carriedReasoning []schemas.ChatReasoningDetails
		if choices := firstResponse.Choices; len(choices) > 0 {
			if nonStream := choices[0].ChatNonStreamResponseChoice; nonStream != nil &&
				nonStream.Message != nil &&
				nonStream.Message.ChatAssistantMessage != nil {
				carriedReasoning = nonStream.Message.ChatAssistantMessage.ReasoningDetails
			}
		}
		t.Logf("Step 1: Found %d reasoning detail entries", len(carriedReasoning))

		// Step 2: Build multi-turn conversation with reasoning details passed back
		followUpReq := buildRequest([]schemas.ChatMessage{
			CreateBasicChatMessage(question),
			{
				Role: schemas.ChatMessageRoleAssistant,
				Content: &schemas.ChatMessageContent{
					ContentStr: &firstAnswer,
				},
				ChatAssistantMessage: &schemas.ChatAssistantMessage{
					ReasoningDetails: carriedReasoning,
				},
			},
			CreateBasicChatMessage("Now multiply that result by 2."),
		})

		stepTwoCtx := TestRetryContext{
			ScenarioName: stepTwoName,
			ExpectedBehavior: map[string]any{
				"multi_turn":            true,
				"reasoning_passthrough": true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "follow_up",
			},
		}

		secondResponse, stepTwoErr := WithChatTestRetry(t, retryPolicy, stepTwoCtx, expectations, stepTwoName, dispatch(followUpReq))
		if stepTwoErr != nil {
			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(stepTwoErr))
		}

		if secondAnswer := GetChatContent(secondResponse); secondAnswer != "" {
			t.Logf("Step 2 response: %s", truncateString(secondAnswer, 200))
		} else {
			t.Error("Step 2: Expected non-empty response content")
		}

		t.Log("Multi-turn reasoning passthrough test passed!")
	})
}
// min reports the lesser of the two integers a and b.
// When both are equal, that shared value is returned.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}