package llmtests

import (
	"context"
	"os"
	"testing"

	bifrost "github.com/maximhq/bifrost/core"
	"github.com/maximhq/bifrost/core/schemas"
)

// reasoningPreview returns at most limit bytes from the start of s for use in
// log output. Unlike a plain s[:limit] slice it never cuts through a
// multi-byte UTF-8 sequence: when the cut position lands on a continuation
// byte, the cut is moved back to the start of that rune. Strings that already
// fit within limit are returned unchanged; a non-positive limit yields "".
func reasoningPreview(s string, limit int) string {
	if limit <= 0 {
		return ""
	}
	if len(s) <= limit {
		return s
	}
	cut := limit
	// A UTF-8 sequence has at most three continuation bytes (10xxxxxx), so
	// stepping back at most three positions reaches the start of the rune.
	// The bound also keeps invalid input from collapsing the preview to "".
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut]
}

// RunResponsesReasoningTest executes the reasoning test scenario to test
// thinking capabilities via the Responses API only.
//
// The scenario is skipped (with a log line, not t.Skip) when
// testConfig.Scenarios.Reasoning is false or testConfig.ReasoningModel is
// empty. Otherwise it runs a "ResponsesReasoning" subtest that sends a
// multi-step arithmetic word problem through client.ResponsesRequest using the
// retry helpers, fails the subtest if the request still errors after retries,
// and logs which reasoning indicators were present in the response.
func RunResponsesReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ResponsesReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		responsesMessages := []schemas.ResponsesMessage{
			CreateBasicResponsesMessage(problemPrompt),
		}

		// Execute Responses API test with retries
		responsesReq := &schemas.BifrostResponsesRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    responsesMessages,
			Params: &schemas.ResponsesParameters{
				// Reasoning models (o3, o4-mini) allocate tokens between reasoning and text output.
				// Note: Older o1 models may not return message output via Responses API - use o3/o4-mini.
				// OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs.
				// See: https://platform.openai.com/docs/guides/reasoning#allocating-space-for-reasoning
				MaxOutputTokens: bifrost.Ptr(25000),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ResponsesParametersReasoning{
					Effort: bifrost.Ptr("high"), // High effort for complex reasoning
					// Summary: bifrost.Ptr("detailed"), // Detailed summary of reasoning process
				},
				// Include reasoning content in response
				Include: []string{"reasoning.encrypted_content"},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]any{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		responsesRetryConfig := ResponsesRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ResponsesRetryCondition{}, // Add specific responses retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, responsesError := WithResponsesTestRetry(t, responsesRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostResponsesResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ResponsesRequest(bfCtx, responsesReq)
		})

		if responsesError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(responsesError))
		}

		// Log the response content (first 300 bytes at most; an empty
		// response logs the bare prefix).
		responsesContent := GetResponsesContent(response)
		t.Logf("✅ Responses API reasoning result: %s", reasoningPreview(responsesContent, 300))

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateResponsesAPIReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Responses API passed Reasoning test!")
	})
}

// validateResponsesAPIReasoning performs additional validation specific to
// Responses API reasoning features.
//
// It inspects response.Output for messages typed as
// schemas.ResponsesMessageTypeReasoning, for content blocks typed as
// schemas.ResponsesOutputMessageContentTypeReasoning, and checks whether the
// usage details report a positive reasoning token count. Findings are logged
// through t; nothing is asserted.
//
// Returns true if reasoning indicators are found, and false for a nil response
// or a nil Output.
func validateResponsesAPIReasoning(t *testing.T, response *schemas.BifrostResponsesResponse) bool {
	if response == nil || response.Output == nil {
		return false
	}

	reasoningFound := false
	summaryFound := false
	reasoningContentFound := false

	// Check if response contains reasoning messages or reasoning content
	for _, message := range response.Output {
		// Check for ResponsesMessageTypeReasoning
		if message.Type != nil && *message.Type == schemas.ResponsesMessageTypeReasoning {
			reasoningFound = true
			t.Logf("🧠 Found ResponsesMessageTypeReasoning message in response")

			// Check for reasoning summary content
			if message.ResponsesReasoning != nil && len(message.ResponsesReasoning.Summary) > 0 {
				summaryFound = true
				t.Logf("📝 Found reasoning summary with %d content blocks", len(message.ResponsesReasoning.Summary))

				// Log first summary block for debugging
				firstSummary := message.ResponsesReasoning.Summary[0]
				if len(firstSummary.Text) > 0 {
					t.Logf("📋 First reasoning summary: %s", reasoningPreview(firstSummary.Text, 200))
				} else {
					t.Logf("📋 First reasoning summary: (empty)")
				}
			}

			// Check for encrypted reasoning content
			if message.ResponsesReasoning != nil && message.ResponsesReasoning.EncryptedContent != nil {
				t.Logf("🔐 Found encrypted reasoning content")
			}
		}

		// Check for content blocks with ResponsesOutputMessageContentTypeReasoning
		if message.Content != nil && message.Content.ContentBlocks != nil {
			for _, block := range message.Content.ContentBlocks {
				if block.Type == schemas.ResponsesOutputMessageContentTypeReasoning {
					reasoningContentFound = true
					t.Logf("🔍 Found ResponsesOutputMessageContentTypeReasoning content block")
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.OutputTokensDetails != nil && response.Usage.OutputTokensDetails.ReasoningTokens > 0 {
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.OutputTokensDetails.ReasoningTokens)
		reasoningFound = true // Reasoning tokens indicate reasoning was performed
	}

	// Log findings. A summary alone does not count as detection; it is only
	// reported alongside the other indicators.
	detected := reasoningFound || reasoningContentFound
	if detected {
		t.Logf("✅ Responses API reasoning indicators detected")
		if reasoningFound {
			t.Logf(" - ResponsesMessageTypeReasoning or reasoning tokens found")
		}
		if reasoningContentFound {
			t.Logf(" - ResponsesOutputMessageContentTypeReasoning content blocks found")
		}
		if summaryFound {
			t.Logf(" - Reasoning summary content found")
		}
	} else {
		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}

// RunChatCompletionReasoningTest executes the reasoning test scenario to test
// thinking capabilities via the Chat Completions API.
//
// The scenario is skipped (with a log line) when
// testConfig.Scenarios.Reasoning is false or testConfig.ReasoningModel is
// empty, and the "ChatCompletionReasoning" subtest is skipped via t.Skip for
// the OpenAI provider. Otherwise the subtest sends the arithmetic word problem
// through client.ChatCompletionRequest using the retry helpers, fails if the
// request still errors after retries, and logs the detected reasoning
// indicators.
func RunChatCompletionReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	// Skip if no reasoning model is configured
	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("ChatCompletionReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// OpenAI because reasoning for them in chat completions is extremely flaky.
			// t.Skip stops this subtest immediately, so no return is needed.
			t.Skip("Skipping ChatCompletionReasoning test for OpenAI")
		}

		// Create a complex problem that requires step-by-step reasoning
		problemPrompt := "A farmer has 100 chickens and 50 cows. Each chicken lays 5 eggs per week, and each cow produces 20 liters of milk per day. If the farmer sells eggs for $0.25 each and milk for $1.50 per liter, and it costs $2 per week to feed each chicken and $15 per week to feed each cow, what is the farmer's weekly profit? Please show your step-by-step reasoning."

		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(problemPrompt),
		}

		// Execute Chat Completions API test with retries
		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(1800),
				// Configure reasoning-specific parameters
				Reasoning: &schemas.ChatReasoning{
					Effort:    bifrost.Ptr("high"), // High effort for complex reasoning
					MaxTokens: bifrost.Ptr(1500),   // Maximum tokens for reasoning output
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		// Use retry framework with enhanced validation for reasoning
		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "Reasoning",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"mathematical_problem":  true,
				"step_by_step":          true,
			},
			TestMetadata: map[string]any{
				"provider":          testConfig.Provider,
				"model":             testConfig.ReasoningModel,
				"problem_type":      "mathematical",
				"complexity":        "high",
				"expects_reasoning": true,
			},
		}
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{}, // Add specific chat retry conditions as needed
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		// Enhanced validation for reasoning scenarios
		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		response, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "Reasoning", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})

		if chatError != nil {
			t.Fatalf("❌ Reasoning test failed after retries: %v", GetErrorMessage(chatError))
		}

		// Log the response content (first 300 bytes at most; an empty
		// response logs the bare prefix).
		chatContent := GetChatContent(response)
		t.Logf("✅ Chat Completions API reasoning result: %s", reasoningPreview(chatContent, 300))

		// Additional reasoning-specific validation (complementary to the main validation)
		reasoningDetected := validateChatCompletionReasoning(t, response)
		if !reasoningDetected {
			t.Logf("⚠️ No explicit reasoning indicators found in response structure - may still contain valid reasoning in content")
		} else {
			t.Logf("🧠 Reasoning structure detected in response")
		}

		t.Logf("🎉 Chat Completions API passed Reasoning test!")
	})
}

// validateChatCompletionReasoning performs additional validation specific to
// Chat Completions API reasoning features.
//
// For every non-stream choice it looks for a non-empty Reasoning string and a
// non-empty ReasoningDetails slice on the assistant message, and it checks
// whether the usage details report a positive reasoning token count. Findings
// are logged through t; nothing is asserted.
//
// Returns true if reasoning indicators are found, and false for a nil response
// or a response without choices.
func validateChatCompletionReasoning(t *testing.T, response *schemas.BifrostChatResponse) bool {
	if response == nil || len(response.Choices) == 0 {
		return false
	}

	reasoningFound := false
	reasoningDetailsFound := false
	reasoningTokensFound := false

	// Check each choice for reasoning indicators
	for _, choice := range response.Choices {
		// Only non-stream choices that carry a message can hold reasoning data.
		if choice.ChatNonStreamResponseChoice == nil || choice.ChatNonStreamResponseChoice.Message == nil {
			continue
		}
		message := choice.ChatNonStreamResponseChoice.Message

		// Check for reasoning content in message (for backward compatibility)
		if message.ChatAssistantMessage != nil && message.ChatAssistantMessage.Reasoning != nil && *message.ChatAssistantMessage.Reasoning != "" {
			reasoningFound = true
			reasoningText := *message.ChatAssistantMessage.Reasoning
			t.Logf("🧠 Found reasoning content in message (length: %d)", len(reasoningText))

			// Log first 200 bytes for debugging
			t.Logf("📋 First reasoning content: %s", reasoningPreview(reasoningText, 200))
		}

		// Check for reasoning details array
		if message.ChatAssistantMessage != nil && len(message.ChatAssistantMessage.ReasoningDetails) > 0 {
			reasoningDetailsFound = true
			t.Logf("📝 Found %d reasoning details entries", len(message.ChatAssistantMessage.ReasoningDetails))

			// Log details about each reasoning entry
			for i, detail := range message.ChatAssistantMessage.ReasoningDetails {
				t.Logf(" - Entry %d: Type=%s, Index=%d", i, detail.Type, detail.Index)

				switch detail.Type {
				case schemas.BifrostReasoningDetailsTypeSummary:
					if detail.Summary != nil {
						t.Logf(" Summary length: %d", len(*detail.Summary))
					}
				case schemas.BifrostReasoningDetailsTypeText:
					if detail.Text != nil {
						textLen := len(*detail.Text)
						t.Logf(" Text length: %d", textLen)
						if textLen > 0 {
							t.Logf(" Text preview: %s", reasoningPreview(*detail.Text, 150))
						}
					}
				case schemas.BifrostReasoningDetailsTypeEncrypted:
					if detail.Data != nil {
						t.Logf(" Encrypted data length: %d", len(*detail.Data))
					}
					if detail.Signature != nil {
						t.Logf(" Signature present: %d bytes", len(*detail.Signature))
					}
				}
			}
		}
	}

	// Check if reasoning tokens were used
	if response.Usage != nil && response.Usage.CompletionTokensDetails != nil && response.Usage.CompletionTokensDetails.ReasoningTokens > 0 {
		reasoningTokensFound = true
		t.Logf("🔢 Reasoning tokens used: %d", response.Usage.CompletionTokensDetails.ReasoningTokens)
	}

	// Log findings
	detected := reasoningFound || reasoningDetailsFound || reasoningTokensFound
	if detected {
		t.Logf("✅ Chat Completions API reasoning indicators detected")
		if reasoningFound {
			t.Logf(" - Reasoning content found in message")
		}
		if reasoningDetailsFound {
			t.Logf(" - Reasoning details array found")
		}
		if reasoningTokensFound {
			t.Logf(" - Reasoning tokens usage reported")
		}
	} else {
		t.Logf("ℹ️ No explicit reasoning indicators found (may be provider-specific)")
	}

	return detected
}

// RunMultiTurnReasoningTest tests multi-turn conversations with reasoning content passthrough.
// It verifies that reasoning details (text + signature) from assistant messages are correctly
// passed back to the model in follow-up turns via the Chat Completions API.
//
// Step 1 sends a short arithmetic prompt and requires non-empty content in the
// reply. Step 2 replays the prompt, the assistant reply (with the
// ReasoningDetails taken from the first choice of step 1), and a follow-up
// question; a request error is fatal, an empty reply is reported with t.Error.
// The scenario is skipped under the same conditions as the other reasoning
// tests, and the subtest is skipped via t.Skip for the OpenAI provider.
func RunMultiTurnReasoningTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.Reasoning {
		t.Logf("⏭️ Reasoning not supported for provider %s", testConfig.Provider)
		return
	}

	if testConfig.ReasoningModel == "" {
		t.Logf("⏭️ No reasoning model configured for provider %s", testConfig.Provider)
		return
	}

	t.Run("MultiTurnReasoning", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		if testConfig.Provider == schemas.OpenAI {
			// t.Skip stops this subtest immediately, so no return is needed.
			t.Skip("Skipping MultiTurnReasoning test for OpenAI")
		}

		// Step 1: Send initial reasoning request
		initialPrompt := "What is 15 * 17? Think step by step."
		chatMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
		}

		chatReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    chatMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		retryConfig := GetTestRetryConfigForScenario("Reasoning", testConfig)
		retryContext := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step1",
			ExpectedBehavior: map[string]any{
				"should_show_reasoning": true,
				"multi_turn":            true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "initial",
			},
		}
		// The same retry settings and expectations are reused for both steps.
		chatRetryConfig := ChatRetryConfig{
			MaxAttempts: retryConfig.MaxAttempts,
			BaseDelay:   retryConfig.BaseDelay,
			MaxDelay:    retryConfig.MaxDelay,
			Conditions:  []ChatRetryCondition{},
			OnRetry:     retryConfig.OnRetry,
			OnFinalFail: retryConfig.OnFinalFail,
		}

		expectations := GetExpectationsForScenario("Reasoning", testConfig, map[string]any{
			"requires_reasoning": true,
		})
		expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

		firstResponse, chatError := WithChatTestRetry(t, chatRetryConfig, retryContext, expectations, "MultiTurnReasoning_Step1", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, chatReq)
		})

		if chatError != nil {
			t.Fatalf("Step 1 failed: %v", GetErrorMessage(chatError))
		}

		firstContent := GetChatContent(firstResponse)
		if firstContent == "" {
			t.Fatal("Step 1: Expected non-empty response content")
		}
		t.Logf("Step 1 response: %s", truncateString(firstContent, 200))

		// Extract reasoning details from first response
		var reasoningDetails []schemas.ChatReasoningDetails
		if len(firstResponse.Choices) > 0 {
			choice := firstResponse.Choices[0]
			if choice.ChatNonStreamResponseChoice != nil &&
				choice.ChatNonStreamResponseChoice.Message != nil &&
				choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage != nil {
				reasoningDetails = choice.ChatNonStreamResponseChoice.Message.ChatAssistantMessage.ReasoningDetails
			}
		}

		t.Logf("Step 1: Found %d reasoning detail entries", len(reasoningDetails))

		// Step 2: Build multi-turn conversation with reasoning details passed back
		multiTurnMessages := []schemas.ChatMessage{
			CreateBasicChatMessage(initialPrompt),
			{
				Role: schemas.ChatMessageRoleAssistant,
				Content: &schemas.ChatMessageContent{
					ContentStr: &firstContent,
				},
				ChatAssistantMessage: &schemas.ChatAssistantMessage{
					ReasoningDetails: reasoningDetails,
				},
			},
			CreateBasicChatMessage("Now multiply that result by 2."),
		}

		multiTurnReq := &schemas.BifrostChatRequest{
			Provider: testConfig.Provider,
			Model:    testConfig.ReasoningModel,
			Input:    multiTurnMessages,
			Params: &schemas.ChatParameters{
				MaxCompletionTokens: bifrost.Ptr(4000),
				Reasoning: &schemas.ChatReasoning{
					Effort: bifrost.Ptr("low"),
				},
			},
			Fallbacks: testConfig.Fallbacks,
		}

		retryContext2 := TestRetryContext{
			ScenarioName: "MultiTurnReasoning_Step2",
			ExpectedBehavior: map[string]any{
				"multi_turn":            true,
				"reasoning_passthrough": true,
			},
			TestMetadata: map[string]any{
				"provider": testConfig.Provider,
				"model":    testConfig.ReasoningModel,
				"step":     "follow_up",
			},
		}

		secondResponse, chatError2 := WithChatTestRetry(t, chatRetryConfig, retryContext2, expectations, "MultiTurnReasoning_Step2", func() (*schemas.BifrostChatResponse, *schemas.BifrostError) {
			bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
			return client.ChatCompletionRequest(bfCtx, multiTurnReq)
		})

		if chatError2 != nil {
			t.Fatalf("Step 2 (multi-turn with reasoning passthrough) failed: %v", GetErrorMessage(chatError2))
		}

		secondContent := GetChatContent(secondResponse)
		if secondContent == "" {
			t.Error("Step 2: Expected non-empty response content")
		} else {
			t.Logf("Step 2 response: %s", truncateString(secondContent, 200))
		}

		t.Log("Multi-turn reasoning passthrough test passed!")
	})
}

// min returns the smaller of two integers.
//
// NOTE(review): nothing in this file calls it; it is kept because other files
// in the package may depend on it. On Go 1.21+ this declaration shadows the
// builtin min inside the package.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}