1202 lines
37 KiB
Go
1202 lines
37 KiB
Go
package llmtests
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
)
|
|
|
|
// =============================================================================
|
|
// BASIC RESPONSE CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptyResponseCondition checks for empty or missing response content
|
|
type EmptyResponseCondition struct{}
|
|
|
|
func (c *EmptyResponseCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// If there's an error, let the HTTP retry logic handle it
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
|
|
// No response at all
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
|
|
// Check if chat completions response exists
|
|
if response.TextCompletionResponse == nil && response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return true, "response has no chat completions or responses data"
|
|
}
|
|
|
|
// Check if all choices are empty (no content AND no tool calls)
|
|
hasContent := false
|
|
|
|
// Check for textual content using the already robust GetResultContent function
|
|
content := GetResultContent(response)
|
|
if strings.TrimSpace(content) != "" {
|
|
hasContent = true
|
|
}
|
|
|
|
// If no textual content, check for tool calls using the universal ExtractToolCalls function
|
|
if !hasContent {
|
|
toolCalls := ExtractToolCalls(response)
|
|
if len(toolCalls) > 0 {
|
|
// Validate that at least one tool call has a function name
|
|
for _, toolCall := range toolCalls {
|
|
if strings.TrimSpace(toolCall.Name) != "" {
|
|
hasContent = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(toolCalls) == 0 {
|
|
return true, "no tool calls found in response"
|
|
}
|
|
}
|
|
|
|
if !hasContent {
|
|
return true, "all choices have empty content and no tool calls"
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptyResponseCondition) GetConditionName() string {
|
|
return "EmptyResponse"
|
|
}
|
|
|
|
// =============================================================================
|
|
// TOOL CALLING CONDITIONS
|
|
// =============================================================================
|
|
|
|
// MissingToolCallCondition checks if expected tool call is missing
|
|
type MissingToolCallCondition struct {
|
|
ExpectedToolName string // Name of the tool that should have been called
|
|
}
|
|
|
|
func (c *MissingToolCallCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
expectedTool := c.ExpectedToolName
|
|
if expectedTool == "" {
|
|
// Try to get from context
|
|
if tool, ok := context.ExpectedBehavior["expected_tool_name"].(string); ok {
|
|
expectedTool = tool
|
|
} else {
|
|
return false, ""
|
|
}
|
|
}
|
|
|
|
// Extract tool calls from both API formats
|
|
toolCalls := ExtractToolCalls(response)
|
|
|
|
// Check if any tool call has the expected name
|
|
for _, toolCall := range toolCalls {
|
|
if toolCall.Name == expectedTool {
|
|
return false, "" // Found the expected tool call
|
|
}
|
|
}
|
|
|
|
return true, fmt.Sprintf("expected tool call '%s' not found in response", expectedTool)
|
|
}
|
|
|
|
func (c *MissingToolCallCondition) GetConditionName() string {
|
|
return "MissingToolCall"
|
|
}
|
|
|
|
// MalformedToolArgsCondition checks for malformed tool call arguments
|
|
type MalformedToolArgsCondition struct{}
|
|
|
|
func (c *MalformedToolArgsCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Extract tool calls from both API formats
|
|
toolCalls := ExtractToolCalls(response)
|
|
|
|
// Check all tool calls for malformed arguments
|
|
for i, toolCall := range toolCalls {
|
|
if toolCall.Arguments == "" {
|
|
continue // Skip empty arguments for now
|
|
}
|
|
|
|
// Try to parse arguments as JSON
|
|
var args map[string]interface{}
|
|
if err := json.Unmarshal([]byte(toolCall.Arguments), &args); err != nil {
|
|
return true, fmt.Sprintf("tool call %d has malformed JSON arguments: %s", i, err.Error())
|
|
}
|
|
|
|
// Check for empty arguments only when arguments are explicitly required
|
|
// Some tools (like get_current_time) legitimately take no arguments
|
|
requiresArgs := false
|
|
if context.ExpectedBehavior != nil {
|
|
// Check if this test expects arguments (default: false, allowing tools with no args)
|
|
if expectArgs, ok := context.ExpectedBehavior["requires_arguments"].(bool); ok {
|
|
requiresArgs = expectArgs
|
|
}
|
|
}
|
|
|
|
if requiresArgs && len(args) == 0 && toolCall.Name != "" {
|
|
return true, fmt.Sprintf("tool call %d (%s) has empty arguments but arguments are required", i, toolCall.Name)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *MalformedToolArgsCondition) GetConditionName() string {
|
|
return "MalformedToolArgs"
|
|
}
|
|
|
|
// WrongToolCalledCondition checks if the wrong tool was called
|
|
type WrongToolCalledCondition struct {
|
|
ExpectedToolName string
|
|
ForbiddenTools []string // Tools that should not be called
|
|
}
|
|
|
|
func (c *WrongToolCalledCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
expectedTool := c.ExpectedToolName
|
|
if expectedTool == "" {
|
|
if tool, ok := context.ExpectedBehavior["expected_tool_name"].(string); ok {
|
|
expectedTool = tool
|
|
}
|
|
}
|
|
|
|
// Extract tool calls from both API formats
|
|
toolCalls := ExtractToolCalls(response)
|
|
|
|
// Check all tool calls
|
|
for i, toolCall := range toolCalls {
|
|
toolName := toolCall.Name
|
|
if toolName == "" {
|
|
continue
|
|
}
|
|
|
|
// Check if forbidden tool was called
|
|
for _, forbidden := range c.ForbiddenTools {
|
|
if toolName == forbidden {
|
|
return true, fmt.Sprintf("tool call %d called forbidden tool '%s'", i, toolName)
|
|
}
|
|
}
|
|
|
|
// If we have an expected tool and this isn't it
|
|
if expectedTool != "" && toolName != expectedTool {
|
|
return true, fmt.Sprintf("tool call %d called '%s' instead of expected '%s'", i, toolName, expectedTool)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *WrongToolCalledCondition) GetConditionName() string {
|
|
return "WrongToolCalled"
|
|
}
|
|
|
|
// =============================================================================
|
|
// MULTIPLE TOOL CALL CONDITIONS
|
|
// =============================================================================
|
|
|
|
// PartialToolCallCondition checks if we got fewer tool calls than expected
|
|
type PartialToolCallCondition struct {
|
|
ExpectedCount int // Expected number of tool calls
|
|
}
|
|
|
|
func (c *PartialToolCallCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
expectedCount := c.ExpectedCount
|
|
if expectedCount == 0 {
|
|
if count, ok := context.ExpectedBehavior["expected_tool_count"].(int); ok {
|
|
expectedCount = count
|
|
} else {
|
|
return false, ""
|
|
}
|
|
}
|
|
|
|
// Extract tool calls from both API formats and count them
|
|
toolCalls := ExtractToolCalls(response)
|
|
actualCount := len(toolCalls)
|
|
|
|
if actualCount < expectedCount {
|
|
return true, fmt.Sprintf("got %d tool calls, expected %d", actualCount, expectedCount)
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *PartialToolCallCondition) GetConditionName() string {
|
|
return "PartialToolCall"
|
|
}
|
|
|
|
// WrongToolSequenceCondition checks if tools were called in wrong order
|
|
type WrongToolSequenceCondition struct {
|
|
ExpectedTools []string // Expected sequence of tool names
|
|
}
|
|
|
|
func (c *WrongToolSequenceCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
expectedTools := c.ExpectedTools
|
|
if len(expectedTools) == 0 {
|
|
if tools, ok := context.ExpectedBehavior["expected_tool_sequence"].([]string); ok {
|
|
expectedTools = tools
|
|
} else {
|
|
return false, ""
|
|
}
|
|
}
|
|
|
|
// Extract tool calls from both API formats
|
|
toolCalls := ExtractToolCalls(response)
|
|
|
|
// If we don't have enough tool calls
|
|
if len(toolCalls) < len(expectedTools) {
|
|
return true, fmt.Sprintf("got %d tool calls, expected at least %d", len(toolCalls), len(expectedTools))
|
|
}
|
|
|
|
// Check sequence
|
|
for j, expectedTool := range expectedTools {
|
|
if j >= len(toolCalls) {
|
|
break
|
|
}
|
|
|
|
actualTool := toolCalls[j].Name
|
|
if actualTool != expectedTool {
|
|
if actualTool == "" {
|
|
actualTool = "nil"
|
|
}
|
|
return true, fmt.Sprintf("position %d: got '%s', expected '%s'", j, actualTool, expectedTool)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *WrongToolSequenceCondition) GetConditionName() string {
|
|
return "WrongToolSequence"
|
|
}
|
|
|
|
// =============================================================================
|
|
// IMAGE PROCESSING CONDITIONS
|
|
// =============================================================================
|
|
|
|
// ImageNotProcessedCondition checks if image content was actually processed
|
|
type ImageNotProcessedCondition struct{}
|
|
|
|
func (c *ImageNotProcessedCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Get response content
|
|
content := strings.ToLower(GetResultContent(response))
|
|
|
|
// Check for generic responses that don't indicate image processing
|
|
genericPhrases := []string{
|
|
"i can't see",
|
|
"i cannot see",
|
|
"unable to see",
|
|
"can't view",
|
|
"cannot view",
|
|
"no image",
|
|
"not able to see",
|
|
"i don't see",
|
|
"i cannot process",
|
|
}
|
|
|
|
for _, phrase := range genericPhrases {
|
|
if strings.Contains(content, phrase) {
|
|
return true, fmt.Sprintf("response suggests image was not processed: contains '%s'", phrase)
|
|
}
|
|
}
|
|
|
|
// If content is suspiciously short for image analysis
|
|
if len(strings.TrimSpace(content)) < 20 {
|
|
return true, "response too short for meaningful image analysis"
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ImageNotProcessedCondition) GetConditionName() string {
|
|
return "ImageNotProcessed"
|
|
}
|
|
|
|
// =============================================================================
|
|
// FILE/DOCUMENT PROCESSING CONDITIONS
|
|
// =============================================================================
|
|
|
|
// FileNotProcessedCondition checks if file/document was not properly processed
|
|
type FileNotProcessedCondition struct{}
|
|
|
|
func (c *FileNotProcessedCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Get response content
|
|
content := strings.ToLower(GetResultContent(response))
|
|
|
|
// Check for generic responses that don't indicate file/document processing
|
|
fileProcessingFailurePhrases := []string{
|
|
"i can't read",
|
|
"i cannot read",
|
|
"unable to read",
|
|
"can't access",
|
|
"cannot access",
|
|
"no file",
|
|
"no document",
|
|
"not able to read",
|
|
"i don't see",
|
|
"i cannot process",
|
|
"unable to process",
|
|
"can't open",
|
|
"cannot open",
|
|
"invalid file",
|
|
"corrupted",
|
|
"unsupported format",
|
|
"failed to load",
|
|
"no pdf",
|
|
"cannot view",
|
|
}
|
|
|
|
for _, phrase := range fileProcessingFailurePhrases {
|
|
if strings.Contains(content, phrase) {
|
|
return true, fmt.Sprintf("response suggests file was not processed: contains '%s'", phrase)
|
|
}
|
|
}
|
|
|
|
// If content is suspiciously short for document analysis
|
|
if len(strings.TrimSpace(content)) < 15 {
|
|
return true, "response too short for meaningful document analysis"
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *FileNotProcessedCondition) GetConditionName() string {
|
|
return "FileNotProcessed"
|
|
}
|
|
|
|
// GenericResponseCondition checks for generic/template responses
|
|
type GenericResponseCondition struct{}
|
|
|
|
func (c *GenericResponseCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.TextCompletionResponse == nil && response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
content := strings.ToLower(GetResultContent(response))
|
|
|
|
// Generic phrases that suggest the model didn't engage with the specific request
|
|
genericPhrases := []string{
|
|
"as an ai",
|
|
"as a language model",
|
|
"i'm an ai",
|
|
"i am an ai",
|
|
"i'm a language model",
|
|
"i am a language model",
|
|
"i can help you with",
|
|
"how can i assist you",
|
|
"what would you like to know",
|
|
"is there anything else",
|
|
}
|
|
|
|
// Check if response starts with generic phrases (more concerning)
|
|
for _, phrase := range genericPhrases {
|
|
if strings.HasPrefix(content, phrase) {
|
|
return true, fmt.Sprintf("response starts with generic phrase: '%s'", phrase)
|
|
}
|
|
}
|
|
|
|
// Check for overly generic responses (short and generic)
|
|
if len(strings.TrimSpace(content)) < 30 {
|
|
for _, phrase := range genericPhrases {
|
|
if strings.Contains(content, phrase) {
|
|
return true, fmt.Sprintf("short response contains generic phrase: '%s'", phrase)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *GenericResponseCondition) GetConditionName() string {
|
|
return "GenericResponse"
|
|
}
|
|
|
|
// =============================================================================
|
|
// CONTENT VALIDATION CONDITIONS
|
|
// =============================================================================
|
|
|
|
// ContentValidationCondition checks if response fails basic content validation
|
|
// This is crucial for vision tests where the AI might give different descriptions
|
|
type ContentValidationCondition struct{}
|
|
|
|
func (c *ContentValidationCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.TextCompletionResponse == nil && response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
content := strings.ToLower(GetResultContent(response))
|
|
|
|
// Skip if response is too short or generic (other conditions will handle these)
|
|
if len(content) < 10 {
|
|
return false, ""
|
|
}
|
|
|
|
// Only check content validation for vision-related scenarios
|
|
scenarioName := strings.ToLower(context.ScenarioName)
|
|
if !strings.Contains(scenarioName, "image") && !strings.Contains(scenarioName, "vision") {
|
|
return false, ""
|
|
}
|
|
|
|
// Check if this looks like a valid vision response but might be missing keywords
|
|
// Look for vision-related indicators that suggest the AI processed the image
|
|
visionIndicators := []string{
|
|
"see", "shows", "depicts", "contains", "features", "displays",
|
|
"appears", "looks", "visible", "image", "picture", "photo",
|
|
"color", "shape", "object", "animal", "person", "building",
|
|
"in the", "there is", "there are", "this is", "i can see",
|
|
}
|
|
|
|
hasVisionContent := false
|
|
for _, indicator := range visionIndicators {
|
|
if strings.Contains(content, indicator) {
|
|
hasVisionContent = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// If it looks like a valid vision response, check if we should retry based on missing expected keywords
|
|
if hasVisionContent {
|
|
// Check if this test has expected keywords from the TestRetryContext
|
|
if testMetadata, exists := context.TestMetadata["expected_keywords"]; exists {
|
|
if expectedKeywords, ok := testMetadata.([]string); ok && len(expectedKeywords) > 0 {
|
|
// Check if ANY of the expected keywords are present
|
|
foundExpectedKeyword := false
|
|
for _, keyword := range expectedKeywords {
|
|
if strings.Contains(content, strings.ToLower(keyword)) {
|
|
foundExpectedKeyword = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// If valid vision response but missing ALL expected keywords, retry
|
|
// Allow longer responses for complex vision tasks (comparisons, detailed descriptions)
|
|
if !foundExpectedKeyword && len(content) > 20 && len(content) < 2000 {
|
|
return true, fmt.Sprintf("valid vision response but missing expected keywords %v, might include them on retry", expectedKeywords)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback: Check expected behavior fields for dynamic validation
|
|
if expectedAnimal, ok := context.ExpectedBehavior["should_identify_animal"].(string); ok && expectedAnimal != "" {
|
|
// Parse expected animal from behavior context (e.g., "lion or animal")
|
|
expectedTerms := strings.Split(strings.ToLower(expectedAnimal), " or ")
|
|
foundExpected := false
|
|
for _, term := range expectedTerms {
|
|
term = strings.TrimSpace(term)
|
|
if term != "" && strings.Contains(content, term) {
|
|
foundExpected = true
|
|
break
|
|
}
|
|
}
|
|
if !foundExpected && len(content) > 20 && len(content) < 1500 {
|
|
return true, fmt.Sprintf("valid vision response but missing expected animal terms '%s', might get more specific on retry", expectedAnimal)
|
|
}
|
|
}
|
|
|
|
if expectedObject, ok := context.ExpectedBehavior["should_identify_object"].(string); ok && expectedObject != "" {
|
|
// Parse expected object from behavior context (e.g., "ant or insect")
|
|
expectedTerms := strings.Split(strings.ToLower(expectedObject), " or ")
|
|
foundExpected := false
|
|
for _, term := range expectedTerms {
|
|
term = strings.TrimSpace(term)
|
|
if term != "" && strings.Contains(content, term) {
|
|
foundExpected = true
|
|
break
|
|
}
|
|
}
|
|
if !foundExpected && len(content) > 15 && len(content) < 1500 {
|
|
return true, fmt.Sprintf("valid vision response but missing expected object terms '%s', might get more specific on retry", expectedObject)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ContentValidationCondition) GetConditionName() string {
|
|
return "ContentValidation"
|
|
}
|
|
|
|
// =============================================================================
|
|
// STREAMING CONDITIONS
|
|
// =============================================================================
|
|
|
|
// StreamErrorCondition checks for streaming-specific errors that should trigger retries
|
|
type StreamErrorCondition struct{}
|
|
|
|
func (c *StreamErrorCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// Only retry on actual stream errors, not when stream is successful but response is nil
|
|
if err == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check for specific streaming errors that indicate retry-worthy conditions
|
|
// Check both the Message field and nested Error field
|
|
var errorMsg string
|
|
if strings.TrimSpace(err.Error.Message) != "" {
|
|
errorMsg = strings.ToLower(err.Error.Message)
|
|
} else if err.Error.Error != nil {
|
|
errorMsg = strings.ToLower(err.Error.Error.Error())
|
|
} else {
|
|
return false, ""
|
|
}
|
|
|
|
// Retry on connection/timeout issues during streaming
|
|
if strings.Contains(errorMsg, "connection reset") ||
|
|
strings.Contains(errorMsg, "connection refused") ||
|
|
strings.Contains(errorMsg, "timeout") ||
|
|
strings.Contains(errorMsg, "stream closed") ||
|
|
strings.Contains(errorMsg, "stream interrupted") {
|
|
return true, fmt.Sprintf("stream connection error: %s", errorMsg)
|
|
}
|
|
|
|
// Retry on temporary streaming API errors
|
|
if strings.Contains(errorMsg, "rate limit") ||
|
|
strings.Contains(errorMsg, "quota exceeded") ||
|
|
strings.Contains(errorMsg, "service unavailable") ||
|
|
strings.Contains(errorMsg, "server overloaded") {
|
|
return true, fmt.Sprintf("temporary API error: %s", errorMsg)
|
|
}
|
|
|
|
// Don't retry on authentication, invalid request, or other permanent errors
|
|
return false, ""
|
|
}
|
|
|
|
func (c *StreamErrorCondition) GetConditionName() string {
|
|
return "StreamError"
|
|
}
|
|
|
|
// IncompleteStreamCondition checks for incomplete streaming responses
|
|
type IncompleteStreamCondition struct{}
|
|
|
|
func (c *IncompleteStreamCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check both Chat Completions and Responses API formats
|
|
if response.TextCompletionResponse == nil && response.ChatResponse == nil && response.ResponsesResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// For Chat Completions API, check finish reasons in choices
|
|
if response.ChatResponse != nil {
|
|
for i, choice := range response.ChatResponse.Choices {
|
|
if choice.FinishReason == nil {
|
|
return true, fmt.Sprintf("choice %d has no finish reason (stream may be incomplete)", i)
|
|
}
|
|
|
|
// Check for incomplete finish reasons
|
|
finishReason := string(*choice.FinishReason)
|
|
if finishReason == "length" {
|
|
// This might be okay depending on context, but could indicate truncation
|
|
singleChoiceResponse := &schemas.BifrostResponse{
|
|
ChatResponse: &schemas.BifrostChatResponse{
|
|
Choices: []schemas.BifrostResponseChoice{choice},
|
|
},
|
|
}
|
|
choiceContent := GetResultContent(singleChoiceResponse)
|
|
if len(choiceContent) < 10 {
|
|
return true, fmt.Sprintf("choice %d finished due to length but content is very short", i)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if response.TextCompletionResponse != nil {
|
|
for i, choice := range response.TextCompletionResponse.Choices {
|
|
if choice.FinishReason == nil {
|
|
return true, fmt.Sprintf("choice %d has no finish reason (stream may be incomplete)", i)
|
|
}
|
|
|
|
finishReason := string(*choice.FinishReason)
|
|
if finishReason == "length" {
|
|
// This might be okay depending on context, but could indicate truncation
|
|
singleChoiceResponse := &schemas.BifrostResponse{
|
|
TextCompletionResponse: &schemas.BifrostTextCompletionResponse{
|
|
Choices: []schemas.BifrostResponseChoice{choice},
|
|
},
|
|
}
|
|
choiceContent := GetResultContent(singleChoiceResponse)
|
|
if len(choiceContent) < 10 {
|
|
return true, fmt.Sprintf("choice %d finished due to length but content is very short", i)
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// For Responses API, check completion status in output messages
|
|
if response.ResponsesResponse != nil {
|
|
for i, output := range response.ResponsesResponse.Output {
|
|
if output.Status == nil {
|
|
return true, fmt.Sprintf("output %d has no status (stream may be incomplete)", i)
|
|
}
|
|
|
|
status := *output.Status
|
|
if status == "incomplete" || status == "in_progress" {
|
|
return true, fmt.Sprintf("output %d has incomplete status: %s", i, status)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *IncompleteStreamCondition) GetConditionName() string {
|
|
return "IncompleteStream"
|
|
}
|
|
|
|
// =============================================================================
|
|
// SPEECH SYNTHESIS CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptySpeechCondition checks for missing or invalid audio data in speech synthesis responses
|
|
type EmptySpeechCondition struct{}
|
|
|
|
func (c *EmptySpeechCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// If there's an error, let other conditions handle it
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
|
|
// No response at all
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
|
|
// Check if speech response exists
|
|
if response.SpeechResponse == nil {
|
|
return true, "response has no speech data"
|
|
}
|
|
|
|
// Check if audio data exists and is not empty
|
|
if response.SpeechResponse.Audio == nil {
|
|
return true, "response has no audio data"
|
|
}
|
|
|
|
// Check for unreasonably small audio files (likely errors)
|
|
if len(response.SpeechResponse.Audio) < 100 {
|
|
return true, fmt.Sprintf("audio data too small (%d bytes), likely an error", len(response.SpeechResponse.Audio))
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptySpeechCondition) GetConditionName() string {
|
|
return "EmptySpeech"
|
|
}
|
|
|
|
// =============================================================================
|
|
// TRANSCRIPTION CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptyTranscriptionCondition checks for missing or invalid transcription text
|
|
type EmptyTranscriptionCondition struct{}
|
|
|
|
func (c *EmptyTranscriptionCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// If there's an error, let other conditions handle it
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
|
|
// No response at all
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
|
|
// Check if transcription response exists
|
|
if response.TranscriptionResponse == nil {
|
|
return true, "response has no transcription data"
|
|
}
|
|
|
|
// Check if text exists and is not empty
|
|
if response.TranscriptionResponse.Text == "" || strings.TrimSpace(response.TranscriptionResponse.Text) == "" {
|
|
return true, "response has no transcription text"
|
|
}
|
|
|
|
// Check for unreasonably short transcriptions (likely errors)
|
|
text := strings.TrimSpace(response.TranscriptionResponse.Text)
|
|
if len(text) < 3 {
|
|
return true, fmt.Sprintf("transcription text too short (%d chars): '%s'", len(text), text)
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptyTranscriptionCondition) GetConditionName() string {
|
|
return "EmptyTranscription"
|
|
}
|
|
|
|
// =============================================================================
|
|
// EMBEDDING CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptyEmbeddingCondition checks for missing or empty embeddings
|
|
type EmptyEmbeddingCondition struct{}
|
|
|
|
func (c *EmptyEmbeddingCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check if we have embedding data
|
|
if response.EmbeddingResponse == nil || len(response.EmbeddingResponse.Data) == 0 {
|
|
return true, "response has no embedding data"
|
|
}
|
|
|
|
// Check each embedding
|
|
for i, data := range response.EmbeddingResponse.Data {
|
|
vec, extractErr := getEmbeddingVector(data)
|
|
if extractErr != nil {
|
|
return true, fmt.Sprintf("embedding %d: failed to extract vector: %s", i, extractErr.Error())
|
|
}
|
|
|
|
if len(vec) == 0 {
|
|
return true, fmt.Sprintf("embedding %d: vector is empty", i)
|
|
}
|
|
|
|
// Check for all-zero vectors (sometimes indicates an error)
|
|
allZero := true
|
|
for _, val := range vec {
|
|
if val != 0.0 {
|
|
allZero = false
|
|
break
|
|
}
|
|
}
|
|
|
|
if allZero {
|
|
return true, fmt.Sprintf("embedding %d: vector is all zeros", i)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptyEmbeddingCondition) GetConditionName() string {
|
|
return "EmptyEmbedding"
|
|
}
|
|
|
|
// InvalidEmbeddingDimensionCondition checks for inconsistent embedding dimensions
|
|
type InvalidEmbeddingDimensionCondition struct {
|
|
ExpectedDimension int // Expected vector dimension (0 means any)
|
|
}
|
|
|
|
func (c *InvalidEmbeddingDimensionCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil || response.EmbeddingResponse == nil || len(response.EmbeddingResponse.Data) == 0 {
|
|
return false, ""
|
|
}
|
|
|
|
expectedDim := c.ExpectedDimension
|
|
if expectedDim == 0 {
|
|
if dim, ok := context.ExpectedBehavior["expected_dimension"].(int); ok {
|
|
expectedDim = dim
|
|
}
|
|
}
|
|
|
|
var firstDimension int
|
|
|
|
// Check each embedding
|
|
for i, data := range response.EmbeddingResponse.Data {
|
|
vec, extractErr := getEmbeddingVector(data)
|
|
if extractErr != nil {
|
|
return false, "" // Let EmptyEmbeddingCondition handle this
|
|
}
|
|
|
|
dimension := len(vec)
|
|
|
|
// Set expected dimension from first embedding if not specified
|
|
if i == 0 {
|
|
firstDimension = dimension
|
|
if expectedDim > 0 && dimension != expectedDim {
|
|
return true, fmt.Sprintf("embedding %d: got dimension %d, expected %d", i, dimension, expectedDim)
|
|
}
|
|
} else {
|
|
// Check consistency with first embedding
|
|
if dimension != firstDimension {
|
|
return true, fmt.Sprintf("embedding %d: dimension %d differs from first embedding dimension %d", i, dimension, firstDimension)
|
|
}
|
|
}
|
|
|
|
// Check for unreasonably small dimensions (likely an error)
|
|
if dimension < 50 {
|
|
return true, fmt.Sprintf("embedding %d: dimension %d seems too small", i, dimension)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *InvalidEmbeddingDimensionCondition) GetConditionName() string {
|
|
return "InvalidEmbeddingDimension"
|
|
}
|
|
|
|
// =============================================================================
|
|
// IMAGE GENERATION CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptyImageGenerationCondition checks for missing or invalid image data
|
|
type EmptyImageGenerationCondition struct{}
|
|
|
|
func (c *EmptyImageGenerationCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// If there's an error, let other conditions handle it
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
|
|
// No response at all
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
|
|
// Check if both response types are nil
|
|
if response.ImageGenerationResponse == nil && response.ImageGenerationStreamResponse == nil {
|
|
return true, "response has no image data"
|
|
}
|
|
|
|
// Check non-streaming response
|
|
if response.ImageGenerationResponse != nil {
|
|
if len(response.ImageGenerationResponse.Data) == 0 {
|
|
return true, "response has no image data"
|
|
}
|
|
|
|
// Check each image has either B64JSON or URL
|
|
for i, img := range response.ImageGenerationResponse.Data {
|
|
if img.B64JSON == "" && img.URL == "" {
|
|
return true, fmt.Sprintf("image %d has no B64JSON or URL", i)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check streaming response
|
|
if response.ImageGenerationStreamResponse != nil {
|
|
if response.ImageGenerationStreamResponse.B64JSON == "" && response.ImageGenerationStreamResponse.URL == "" {
|
|
return true, "stream response has no B64JSON or URL"
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptyImageGenerationCondition) GetConditionName() string {
|
|
return "EmptyImageGeneration"
|
|
}
|
|
|
|
// =============================================================================
|
|
// COUNT TOKENS CONDITIONS
|
|
// =============================================================================
|
|
|
|
// EmptyCountTokensCondition checks for missing or invalid token counts
|
|
type EmptyCountTokensCondition struct{}
|
|
|
|
func (c *EmptyCountTokensCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
// If there's an error, let other conditions handle it
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
|
|
// No response at all
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
|
|
// Check if count tokens response exists
|
|
if response.CountTokensResponse == nil {
|
|
return true, "count tokens response is nil"
|
|
}
|
|
|
|
countTokensResp := response.CountTokensResponse
|
|
|
|
// Check if token counts are valid
|
|
if countTokensResp.InputTokens <= 0 {
|
|
return true, "input_tokens is zero or negative"
|
|
}
|
|
|
|
// Check if total tokens is at least as large as input tokens
|
|
if countTokensResp.TotalTokens != nil {
|
|
if *countTokensResp.TotalTokens < countTokensResp.InputTokens {
|
|
return true, fmt.Sprintf("total_tokens (%d) is less than input_tokens (%d)", *countTokensResp.TotalTokens, countTokensResp.InputTokens)
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *EmptyCountTokensCondition) GetConditionName() string {
|
|
return "EmptyCountTokens"
|
|
}
|
|
|
|
// InvalidCountTokensCondition checks for invalid token count data
|
|
type InvalidCountTokensCondition struct{}
|
|
|
|
func (c *InvalidCountTokensCondition) ShouldRetry(response *schemas.BifrostResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
// Check if count tokens response exists
|
|
if response.CountTokensResponse == nil {
|
|
return false, ""
|
|
}
|
|
|
|
countTokensResp := response.CountTokensResponse
|
|
|
|
// Check if model is set
|
|
if strings.TrimSpace(countTokensResp.Model) == "" {
|
|
return true, "model field is empty"
|
|
}
|
|
|
|
// Check if request type is set correctly
|
|
if countTokensResp.ExtraFields.RequestType != schemas.CountTokensRequest {
|
|
return true, fmt.Sprintf("invalid request type: got %s, expected %s", countTokensResp.ExtraFields.RequestType, schemas.CountTokensRequest)
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *InvalidCountTokensCondition) GetConditionName() string {
|
|
return "InvalidCountTokens"
|
|
}
|
|
|
|
// =============================================================================
|
|
// RESPONSES API CONDITIONS
|
|
// These implement ResponsesRetryCondition for use with WithResponsesTestRetry
|
|
// =============================================================================
|
|
|
|
// ResponsesEmptyCondition checks for empty Responses API responses
|
|
type ResponsesEmptyCondition struct{}
|
|
|
|
func (c *ResponsesEmptyCondition) ShouldRetry(response *schemas.BifrostResponsesResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil {
|
|
return false, ""
|
|
}
|
|
if response == nil {
|
|
return true, "response is nil"
|
|
}
|
|
content := GetResponsesContent(response)
|
|
if strings.TrimSpace(content) == "" {
|
|
return true, "response has empty content"
|
|
}
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ResponsesEmptyCondition) GetConditionName() string {
|
|
return "ResponsesEmpty"
|
|
}
|
|
|
|
// ResponsesFileNotProcessedCondition checks if file/document was not properly processed in Responses API
|
|
type ResponsesFileNotProcessedCondition struct{}
|
|
|
|
func (c *ResponsesFileNotProcessedCondition) ShouldRetry(response *schemas.BifrostResponsesResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
content := strings.ToLower(GetResponsesContent(response))
|
|
|
|
// Check for generic responses that don't indicate file/document processing
|
|
fileProcessingFailurePhrases := []string{
|
|
"i can't read",
|
|
"i cannot read",
|
|
"unable to read",
|
|
"can't access",
|
|
"cannot access",
|
|
"no file",
|
|
"no document",
|
|
"not able to read",
|
|
"i don't see",
|
|
"i cannot process",
|
|
"unable to process",
|
|
"can't open",
|
|
"cannot open",
|
|
"invalid file",
|
|
"corrupted",
|
|
"unsupported format",
|
|
"failed to load",
|
|
"no pdf",
|
|
"cannot view",
|
|
}
|
|
|
|
for _, phrase := range fileProcessingFailurePhrases {
|
|
if strings.Contains(content, phrase) {
|
|
return true, fmt.Sprintf("response suggests file was not processed: contains '%s'", phrase)
|
|
}
|
|
}
|
|
|
|
// If content is suspiciously short for document analysis
|
|
if len(strings.TrimSpace(content)) < 15 {
|
|
return true, "response too short for meaningful document analysis"
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ResponsesFileNotProcessedCondition) GetConditionName() string {
|
|
return "ResponsesFileNotProcessed"
|
|
}
|
|
|
|
// ResponsesGenericResponseCondition checks for generic/template responses in Responses API
|
|
type ResponsesGenericResponseCondition struct{}
|
|
|
|
func (c *ResponsesGenericResponseCondition) ShouldRetry(response *schemas.BifrostResponsesResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
content := strings.ToLower(GetResponsesContent(response))
|
|
|
|
// Generic phrases that suggest the model didn't engage with the specific request
|
|
genericPhrases := []string{
|
|
"as an ai",
|
|
"as a language model",
|
|
"i'm an ai",
|
|
"i am an ai",
|
|
"i'm a language model",
|
|
"i am a language model",
|
|
"i can help you with",
|
|
"how can i assist you",
|
|
"what would you like to know",
|
|
"is there anything else",
|
|
}
|
|
|
|
// Check if response starts with generic phrases (more concerning)
|
|
for _, phrase := range genericPhrases {
|
|
if strings.HasPrefix(content, phrase) {
|
|
return true, fmt.Sprintf("response starts with generic phrase: '%s'", phrase)
|
|
}
|
|
}
|
|
|
|
// Check for overly generic responses (short and generic)
|
|
if len(strings.TrimSpace(content)) < 30 {
|
|
for _, phrase := range genericPhrases {
|
|
if strings.Contains(content, phrase) {
|
|
return true, fmt.Sprintf("short response contains generic phrase: '%s'", phrase)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ResponsesGenericResponseCondition) GetConditionName() string {
|
|
return "ResponsesGenericResponse"
|
|
}
|
|
|
|
// ResponsesContentValidationCondition checks if response fails basic content validation for Responses API
|
|
type ResponsesContentValidationCondition struct{}
|
|
|
|
func (c *ResponsesContentValidationCondition) ShouldRetry(response *schemas.BifrostResponsesResponse, err *schemas.BifrostError, context TestRetryContext) (bool, string) {
|
|
if err != nil || response == nil {
|
|
return false, ""
|
|
}
|
|
|
|
content := strings.ToLower(GetResponsesContent(response))
|
|
|
|
// Skip if response is too short (other conditions will handle these)
|
|
if len(content) < 10 {
|
|
return false, ""
|
|
}
|
|
|
|
// Check for file/document processing scenarios
|
|
scenarioName := strings.ToLower(context.ScenarioName)
|
|
if strings.Contains(scenarioName, "file") || strings.Contains(scenarioName, "document") || strings.Contains(scenarioName, "pdf") {
|
|
// Check if this test has expected keywords from the TestRetryContext
|
|
if testMetadata, exists := context.TestMetadata["expected_keywords"]; exists {
|
|
if expectedKeywords, ok := testMetadata.([]string); ok && len(expectedKeywords) > 0 {
|
|
// Check if ANY of the expected keywords are present
|
|
foundExpectedKeyword := false
|
|
for _, keyword := range expectedKeywords {
|
|
if strings.Contains(content, strings.ToLower(keyword)) {
|
|
foundExpectedKeyword = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// If valid response but missing ALL expected keywords, retry
|
|
if !foundExpectedKeyword && len(content) > 20 && len(content) < 2000 {
|
|
return true, fmt.Sprintf("response missing expected keywords %v, might include them on retry", expectedKeywords)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
func (c *ResponsesContentValidationCondition) GetConditionName() string {
|
|
return "ResponsesContentValidation"
|
|
}
|