first commit

This commit is contained in:
Beyhan Oğur
2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions

View File

@@ -0,0 +1,583 @@
package llmtests
import (
"context"
"os"
"testing"
bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/schemas"
)
// RunResponsesReasoningTest executes the reasoning test scenario to test thinking capabilities via Responses API only.
//
// The scenario is skipped with a log line (the parent test is not marked as
// skipped) when testConfig.Scenarios.Reasoning is false or when
// testConfig.ReasoningModel is empty. Otherwise a "ResponsesReasoning" subtest
// sends a multi-step arithmetic word problem through client.ResponsesRequest
// via WithResponsesTestRetry. The subtest fails only when the request still
// returns an error after retries; missing structural reasoning indicators are
// logged as a warning, not treated as a failure.
func RunResponsesReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ResponsesReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// clip returns at most limit bytes of s. Go strings are sliced by byte,
		// so the cut is moved back (by at most 3 bytes, the maximum number of
		// UTF-8 continuation bytes in one character) until it no longer lands
		// inside a multi-byte character; otherwise the log line would end in
		// an invalid byte sequence.
		clip := func(s string, limit int) string {
			if len(s) <= limit {
				return s
			}
			end := limit
			for end > 0 && limit-end < 3 && s[end]&0xC0 == 0x80 {
				end--
			}
			return s[:end]
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		responsesMessages := []schemas.ResponsesMessage{
			CreateBasicResponsesMessage(problemPrompt),
		}

		// Execute Responses API test with retries
		responsesReq := &schemas.BifrostResponsesRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    responsesMessages,
			Params: &schemas.ResponsesParameters{
				// Reasoning models (o3, o4-mini) allocate tokens between reasoning and text output.
				// Note: Older o1 models may not return message output via Responses API - use o3/o4-mini.
				// OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs.
				// See: https://platform.openai.com/docs/guides/reasoning#allocating-space-for-reasoning
				MaxOutputTokens: bifrost.Ptr(25000),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ResponsesParametersReasoning{
					Effort: bifrost.Ptr("high"), // High effort for complex reasoning
					// Summary: bifrost.Ptr("detailed"), // Detailed summary of reasoning process
				},
				// Include reasoning content in response
				Include: []string{"reasoning.encrypted_content"},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]any{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		responsesRetryConfig := ResponsesRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ResponsesRetryCondition{}, // Add specific responses retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
			// A fresh Bifrost context is built for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ResponsesRequest(bfCtx, responsesReq)
		})
		if responsesError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(responsesError))
		}

		// Log a bounded preview (at most 300 bytes) of the response content
		responsesContent := GetResponsesContent(response)
		if responsesContent == "" {
			t.Logf("✅ Responses API reasoning result: <no content>")
		} else {
			t.Logf("✅ Responses API reasoning result: %s", clip(responsesContent, 300))
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		if validateResponsesAPIReasoning(t, response) {
			t.Logf("🧠 Reasoning structure detected in response")
		} else {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		}

		t.Logf("🎉 Responses API passed Reasoning test!")
	})
}
// validateResponsesAPIReasoning performs additional validation specific to Responses API reasoning features.
//
// It never fails the test; it only logs what it finds through t. It returns
// true if at least one of the following is present:
//   - an output message whose Type is schemas.ResponsesMessageTypeReasoning,
//   - a content block of type schemas.ResponsesOutputMessageContentTypeReasoning,
//   - a positive ReasoningTokens count in response.Usage.OutputTokensDetails.
//
// A reasoning summary or encrypted reasoning content is logged when present but
// does not influence the result on its own. A nil response or nil Output
// yields false without inspecting usage.
func validateResponsesAPIReasoning(t *testing.T, response *schemas.BifrostResponsesResponse) bool {
	if response == nil || response.Output == nil {
		return false
	}

	// clip returns at most limit bytes of s. Go strings are sliced by byte, so
	// the cut is moved back (by at most 3 bytes, the maximum number of UTF-8
	// continuation bytes in one character) until it no longer lands inside a
	// multi-byte character.
	clip := func(s string, limit int) string {
		if len(s) <= limit {
			return s
		}
		end := limit
		for end > 0 && limit-end < 3 && s[end]&0xC0 == 0x80 {
			end--
		}
		return s[:end]
	}

	reasoningFound := false
	summaryFound := false
	reasoningContentFound := false

	// Check if response contains reasoning messages or reasoning content
	for _, message := range response.Output {
		// Check for ResponsesMessageTypeReasoning
		if message.Type != nil && *message.Type == schemas.ResponsesMessageTypeReasoning {
			reasoningFound = true
			t.Logf("🧠 Found ResponsesMessageTypeReasoning message in response")

			// Check for reasoning summary content
			if message.ResponsesReasoning != nil && len(message.ResponsesReasoning.Summary) > 0 {
				summaryFound = true
				t.Logf("📝 Found reasoning summary with %d content blocks", len(message.ResponsesReasoning.Summary))

				// Log first summary block for debugging; the enclosing condition
				// already guarantees that index 0 exists.
				firstSummary := message.ResponsesReasoning.Summary[0]
				if len(firstSummary.Text) > 0 {
					t.Logf("📋 First reasoning summary: %s", clip(firstSummary.Text, 200))
				} else {
					t.Logf("📋 First reasoning summary: (empty)")
				}
			}

			// Check for encrypted reasoning content
			if message.ResponsesReasoning != nil && message.ResponsesReasoning.EncryptedContent != nil {
				t.Logf("🔐 Found encrypted reasoning content")
			}
		}

		// Check for content blocks with ResponsesOutputMessageContentTypeReasoning.
		// Ranging over a nil ContentBlocks slice is a no-op, so only the
		// Content pointer needs guarding.
		if message.Content != nil {
			for _, block := range message.Content.ContentBlocks {
				if block.Type == schemas.ResponsesOutputMessageContentTypeReasoning {
					reasoningContentFound = true
					t.Logf("🔍 Found ResponsesOutputMessageContentTypeReasoning content block")
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.OutputTokensDetails != nil &&
		response.Usage.OutputTokensDetails.ReasoningTokens > 0 {
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.OutputTokensDetails.ReasoningTokens)
		reasoningFound = true // Reasoning tokens indicate reasoning was performed
	}

	// Log findings
	detected := reasoningFound || reasoningContentFound
	if !detected {
		t.Logf(" No explicit reasoning indicators found (may be provider-specific)")
		return false
	}

	t.Logf("✅ Responses API reasoning indicators detected")
	if reasoningFound {
		t.Logf(" - ResponsesMessageTypeReasoning or reasoning tokens found")
	}
	if reasoningContentFound {
		t.Logf(" - ResponsesOutputMessageContentTypeReasoning content blocks found")
	}
	if summaryFound {
		t.Logf(" - Reasoning summary content found")
	}
	return true
}
// RunChatCompletionReasoningTest executes the reasoning test scenario to test thinking capabilities via Chat Completions API.
//
// The scenario is skipped with a log line (the parent test is not marked as
// skipped) when testConfig.Scenarios.Reasoning is false or when
// testConfig.ReasoningModel is empty. Otherwise a "ChatCompletionReasoning"
// subtest is started; it is skipped via t.Skip for the OpenAI provider. For
// every other provider it sends a multi-step arithmetic word problem through
// client.ChatCompletionRequest via WithChatTestRetry. The subtest fails only
// when the request still returns an error after retries; missing structural
// reasoning indicators are logged as a warning, not treated as a failure.
func RunChatCompletionReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ChatCompletionReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// OpenAI because reasoning for them in chat completions is extremely flaky.
			// t.Skip stops this goroutine itself, so no explicit return is needed.
			t.Skip("Skipping ChatCompletionReasoning test for OpenAI")
		}

		// clip returns at most limit bytes of s. Go strings are sliced by byte,
		// so the cut is moved back (by at most 3 bytes, the maximum number of
		// UTF-8 continuation bytes in one character) until it no longer lands
		// inside a multi-byte character; otherwise the log line would end in
		// an invalid byte sequence.
		clip := func(s string, limit int) string {
			if len(s) <= limit {
				return s
			}
			end := limit
			for end > 0 && limit-end < 3 && s[end]&0xC0 == 0x80 {
				end--
			}
			return s[:end]
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(problemPrompt),
		}

		// Execute Chat Completions API test with retries
		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(1800),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ChatReasoning{
					Effort:    bifrost.Ptr("high"), // High effort for complex reasoning
					MaxTokens: bifrost.Ptr(1500),   // Maximum tokens for reasoning output
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]any{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{}, // Add specific chat retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			// A fresh Bifrost context is built for every attempt.
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})
		if chatError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(chatError))
		}

		// Log a bounded preview (at most 300 bytes) of the response content
		chatContent := GetChatContent(response)
		if chatContent == "" {
			t.Logf("✅ Chat Completions API reasoning result: <no content>")
		} else {
			t.Logf("✅ Chat Completions API reasoning result: %s", clip(chatContent, 300))
		}

		// Additional reasoning-specific validation (complementary to the main validation)
		if validateChatCompletionReasoning(t, response) {
			t.Logf("🧠 Reasoning structure detected in response")
		} else {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		}

		t.Logf("🎉 Chat Completions API passed Reasoning test!")
	})
}
// validateChatCompletionReasoning performs additional validation specific to Chat Completions API reasoning features.
//
// It never fails the test; it only logs what it finds through t. It returns
// true if at least one of the following is present:
//   - a non-empty Reasoning string on any choice's assistant message,
//   - a non-empty ReasoningDetails slice on any choice's assistant message,
//   - a positive ReasoningTokens count in response.Usage.CompletionTokensDetails.
//
// A nil response or a response without choices yields false without
// inspecting usage.
func validateChatCompletionReasoning(t *testing.T, response *schemas.BifrostChatResponse) bool {
	if response == nil || len(response.Choices) == 0 {
		return false
	}

	// clip returns at most limit bytes of s. Go strings are sliced by byte, so
	// the cut is moved back (by at most 3 bytes, the maximum number of UTF-8
	// continuation bytes in one character) until it no longer lands inside a
	// multi-byte character.
	clip := func(s string, limit int) string {
		if len(s) <= limit {
			return s
		}
		end := limit
		for end > 0 && limit-end < 3 && s[end]&0xC0 == 0x80 {
			end--
		}
		return s[:end]
	}

	reasoningFound := false
	reasoningDetailsFound := false
	reasoningTokensFound := false

	// Check each choice for reasoning indicators
	for _, choice := range response.Choices {
		// Only non-stream choices that carry an assistant message can hold
		// reasoning content; everything else is skipped.
		if choice.ChatNonStreamResponseChoice == nil || choice.ChatNonStreamResponseChoice.Message == nil {
			continue
		}
		message := choice.ChatNonStreamResponseChoice.Message
		if message.ChatAssistantMessage == nil {
			continue
		}

		// Check for reasoning content in message (for backward compatibility)
		if message.ChatAssistantMessage.Reasoning != nil && *message.ChatAssistantMessage.Reasoning != "" {
			reasoningFound = true
			reasoningText := *message.ChatAssistantMessage.Reasoning
			t.Logf("🧠 Found reasoning content in message (length: %d)", len(reasoningText))
			// Log at most the first 200 bytes for debugging
			t.Logf("📋 First reasoning content: %s", clip(reasoningText, 200))
		}

		// Check for reasoning details array
		if len(message.ChatAssistantMessage.ReasoningDetails) == 0 {
			continue
		}
		reasoningDetailsFound = true
		t.Logf("📝 Found %d reasoning details entries", len(message.ChatAssistantMessage.ReasoningDetails))

		// Log details about each reasoning entry
		for i, detail := range message.ChatAssistantMessage.ReasoningDetails {
			t.Logf(" - Entry %d: Type=%s, Index=%d", i, detail.Type, detail.Index)
			switch detail.Type {
			case schemas.BifrostReasoningDetailsTypeSummary:
				if detail.Summary != nil {
					t.Logf(" Summary length: %d", len(*detail.Summary))
				}
			case schemas.BifrostReasoningDetailsTypeText:
				if detail.Text != nil {
					textLen := len(*detail.Text)
					t.Logf(" Text length: %d", textLen)
					if textLen > 0 {
						t.Logf(" Text preview: %s", clip(*detail.Text, 150))
					}
				}
			case schemas.BifrostReasoningDetailsTypeEncrypted:
				if detail.Data != nil {
					t.Logf(" Encrypted data length: %d", len(*detail.Data))
				}
				if detail.Signature != nil {
					t.Logf(" Signature present: %d bytes", len(*detail.Signature))
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.CompletionTokensDetails != nil &&
		response.Usage.CompletionTokensDetails.ReasoningTokens > 0 {
		reasoningTokensFound = true
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.CompletionTokensDetails.ReasoningTokens)
	}

	// Log findings
	detected := reasoningFound || reasoningDetailsFound || reasoningTokensFound
	if !detected {
		t.Logf(" No explicit reasoning indicators found (may be provider-specific)")
		return false
	}

	t.Logf("✅ Chat Completions API reasoning indicators detected")
	if reasoningFound {
		t.Logf(" - Reasoning content found in message")
	}
	if reasoningDetailsFound {
		t.Logf(" - Reasoning details array found")
	}
	if reasoningTokensFound {
		t.Logf(" - Reasoning tokens usage reported")
	}
	return true
}
// RunMultiTurnReasoningTest tests multi-turn conversations with reasoning content passthrough.
// It verifies that reasoning details (text + signature) from assistant messages are correctly
// passed back to the model in follow-up turns via the Chat Completions API.
//
// The scenario is skipped with a log line (the parent test is not marked as
// skipped) when testConfig.Scenarios.Reasoning is false or when
// testConfig.ReasoningModel is empty. Otherwise a "MultiTurnReasoning" subtest
// is started; it is skipped via t.Skip for the OpenAI provider. The subtest
// runs two requests:
//
//  1. An initial prompt with low reasoning effort. The request must succeed
//     and return non-empty content.
//  2. A follow-up whose history replays the first answer together with the
//     ReasoningDetails taken from the first choice of the first response. The
//     request must succeed and return non-empty content.
//
// Any failed requirement stops the subtest immediately, so the final success
// line is only logged when both steps passed.
func RunMultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("MultiTurnReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// t.Skip stops this goroutine itself, so no explicit return is needed.
			t.Skip("Skipping MultiTurnReasoning test for OpenAI")
		}

		// Step 1: Send initial reasoning request
		initialPrompt := "What is 15 * 17? Think step by step."
		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
		}

		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step1",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"multi_turn":            true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "initial",
			},
		}
		// The same retry settings and expectations are reused for both steps.
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{},
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		firstResponse, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "MultiTurnReasoning_Step1", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})
		if chatError != nil {
			t.Fatalf("Step 1 failed: %v", GetErrorMessage(chatError))
		}

		firstContent := GetChatContent(firstResponse)
		if firstContent == "" {
			t.Fatal("Step 1: Expected non-empty response content")
		}
		t.Logf("Step 1 response: %s", truncateString(firstContent, 200))

		// Extract reasoning details from first response. The slice stays nil
		// when the first choice carries no assistant message; the follow-up
		// is still sent in that case, just without reasoning details.
		var reasoningDetails []schemas.ChatReasoningDetails
		if len(firstResponse.Choices) > 0 {
			choice := firstResponse.Choices[0]
			if choice.ChatNonStreamResponseChoice != nil &&
				choice.ChatNonStreamResponseChoice.Message != nil &&
				choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage != nil {
				reasoningDetails = choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage.ReasoningDetails
			}
		}
		t.Logf("Step 1: Found %d reasoning detail entries", len(reasoningDetails))

		// Step 2: Build multi-turn conversation with reasoning details passed back
		multiTurnMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
			{
				Role: schemas.ChatMessageRoleAssistant,
				Content: &schemas.ChatMessageContent{
					ContentStr: &firstContent,
				},
				ChatAssistantMessage: &schemas.ChatAssistantMessage{
					ReasoningDetails: reasoningDetails,
				},
			},
			CreateBasicChatMessage("Now multiply that result by 2."),
		}

		multiTurnReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    multiTurnMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		retryContext2 := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step2",
			ExpectedBehavior: map[string]any{
				"multi_turn":            true,
				"reasoning_passthrough": true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "follow_up",
			},
		}

		secondResponse, chatError2 := WithChatTestRetry(t, chatRetryConfig, retryContext2, expectations, "MultiTurnReasoning_Step2", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, multiTurnReq)
		})
		if chatError2 != nil {
			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(chatError2))
		}

		secondContent := GetChatContent(secondResponse)
		if secondContent == "" {
			// Fatal rather than Error: execution must not reach the success
			// log below once the test has been marked as failed.
			t.Fatal("Step 2: Expected non-empty response content")
		}
		t.Logf("Step 2 response: %s", truncateString(secondContent, 200))

		t.Log("Multi-turn reasoning passthrough test passed!")
	})
}
// min reports the lesser of its two integer arguments; when both are equal,
// that shared value is returned.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}