Files
bifrost/core/internal/llmtests/validation_presets.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

666 lines
24 KiB
Go

package llmtests
import (
"regexp"
"strings"
"github.com/maximhq/bifrost/core/schemas"
)
// =============================================================================
// PRESET VALIDATION EXPECTATIONS FOR COMMON SCENARIOS
// =============================================================================
// BasicChatExpectations builds the baseline expectations used by plain chat
// scenarios and reused by most other presets in this file: content present,
// exactly one choice, usage/timestamp/model/latency metadata present, and a
// deny-list of refusal and uncertainty phrases.
func BasicChatExpectations() ResponseExpectations {
	// Phrases that must not appear in the response text.
	deniedPhrases := []string{
		"i can't", "i cannot", "i'm unable", "i am unable",
		"i don't know", "i'm not sure", "i am not sure",
	}

	var exp ResponseExpectations
	exp.ShouldHaveContent = true
	exp.ExpectedChoiceCount = 1 // usually one choice; also used on outputs for the responses API
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present
	exp.ShouldNotContainWords = deniedPhrases
	return exp
}
// ToolCallExpectations derives expectations for a scenario that should produce
// a single tool call. It starts from BasicChatExpectations, records toolName
// and requiredArgs in one ToolCallExpectation with ValidateArgsJSON enabled,
// and turns off the content requirement.
func ToolCallExpectations(toolName string, requiredArgs []string) ResponseExpectations {
	call := ToolCallExpectation{
		ValidateArgsJSON: true,
		RequiredArgs:     requiredArgs,
		FunctionName:     toolName,
	}

	exp := BasicChatExpectations()
	exp.ShouldHaveContent = false // tool calls might not have text content
	exp.ExpectedToolCalls = []ToolCallExpectation{call}
	return exp
}
// WeatherToolExpectations returns tool-call expectations for the sample
// weather tool, which must be called with a "location" argument.
func WeatherToolExpectations() ResponseExpectations {
	toolName := string(SampleToolTypeWeather)
	required := []string{"location"}
	return ToolCallExpectations(toolName, required)
}
// CalculatorToolExpectations returns tool-call expectations for the sample
// calculator tool, which must be called with an "expression" argument.
func CalculatorToolExpectations() ResponseExpectations {
	toolName := string(SampleToolTypeCalculate)
	required := []string{"expression"}
	return ToolCallExpectations(toolName, required)
}
// TimeToolExpectations returns tool-call expectations for the sample time
// tool, which must be called with a "timezone" argument.
func TimeToolExpectations() ResponseExpectations {
	toolName := string(SampleToolTypeTime)
	required := []string{"timezone"}
	return ToolCallExpectations(toolName, required)
}
// MultipleToolExpectations derives expectations for a scenario that should
// produce one tool call per entry in tools. requiredArgsPerTool is matched to
// tools by index; a tool with no corresponding entry gets no required args.
// The content requirement is turned off.
func MultipleToolExpectations(tools []string, requiredArgsPerTool [][]string) ResponseExpectations {
	exp := BasicChatExpectations()
	exp.ShouldHaveContent = false // tool calls might not have text content

	for idx := 0; idx < len(tools); idx++ {
		call := ToolCallExpectation{
			FunctionName:     tools[idx],
			ValidateArgsJSON: true,
		}
		// requiredArgsPerTool may be shorter than tools; leave RequiredArgs nil then.
		if idx < len(requiredArgsPerTool) {
			call.RequiredArgs = requiredArgsPerTool[idx]
		}
		exp.ExpectedToolCalls = append(exp.ExpectedToolCalls, call)
	}
	return exp
}
// ImageAnalysisExpectations extends BasicChatExpectations for image analysis:
// it sets image-related keywords and adds phrases indicating the image could
// not be seen to the deny-list.
func ImageAnalysisExpectations() ResponseExpectations {
	exp := BasicChatExpectations()
	exp.ShouldContainKeywords = []string{"image", "picture", "photo", "see", "shows", "contains"}
	exp.ShouldNotContainWords = append(
		exp.ShouldNotContainWords,
		"i can't see", "i cannot see", "unable to see", "can't view",
		"cannot view", "no image", "not able to see", "i don't see",
	)
	return exp
}
// TextCompletionExpectations returns expectations for text completion
// scenarios; these are currently identical to BasicChatExpectations.
func TextCompletionExpectations() ResponseExpectations {
	return BasicChatExpectations()
}
// EmbeddingExpectations returns expectations for embedding scenarios. No text
// content or choices are expected; only model and latency are required. The
// input texts and their count are stored under ProviderSpecific so that custom
// embedding validation can use them.
func EmbeddingExpectations(expectedTexts []string) ResponseExpectations {
	details := make(map[string]interface{}, 2)
	details["expected_texts"] = expectedTexts
	details["expected_embedding_count"] = len(expectedTexts)

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// embeddings have no text content and use a different response structure.
	var exp ResponseExpectations
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present
	exp.ProviderSpecific = details
	return exp
}
// CountTokensExpectations returns expectations for count-tokens scenarios:
// usage stats, model and latency are required, while text content and choices
// are not. The response type is tagged under ProviderSpecific.
func CountTokensExpectations() ResponseExpectations {
	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// CountTokens doesn't return text content or choices.
	var exp ResponseExpectations
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true
	exp.ProviderSpecific = map[string]interface{}{"response_type": "count_tokens"}
	return exp
}
// StreamingExpectations returns BasicChatExpectations with the timestamp and
// model requirements switched off.
func StreamingExpectations() ResponseExpectations {
	exp := BasicChatExpectations()
	// A consolidated streaming response is assembled from chunks, and the last
	// chunk often carries no created/model fields, so neither can be validated
	// reliably on the consolidated result.
	exp.ShouldHaveModel, exp.ShouldHaveTimestamps = false, false
	return exp
}
// ConversationExpectations returns BasicChatExpectations for multi-turn
// conversations, with contextKeywords stored as ShouldContainAnyOf so the reply
// is expected to reference the conversation context.
func ConversationExpectations(contextKeywords []string) ResponseExpectations {
	exp := BasicChatExpectations()
	exp.ShouldContainAnyOf = contextKeywords
	return exp
}
// VisionExpectations builds on ImageAnalysisExpectations for vision/image
// processing scenarios. A non-empty expectedKeywords replaces the default image
// keywords; image-failure phrases are added to the deny-list and the response
// is marked as needing to be relevant to the prompt.
func VisionExpectations(expectedKeywords []string) ResponseExpectations {
	exp := ImageAnalysisExpectations()
	exp.IsRelevantToPrompt = true

	// Keep the image-analysis defaults when the caller supplies no keywords.
	if len(expectedKeywords) != 0 {
		exp.ShouldContainKeywords = expectedKeywords
	}

	failurePhrases := []string{
		"cannot see", "unable to view", "no image", "can't see",
		"image not found", "invalid image", "corrupted image",
		"failed to load", "error processing",
	}
	exp.ShouldNotContainWords = append(exp.ShouldNotContainWords, failurePhrases...)
	return exp
}
// FileInputExpectations returns expectations for file input scenarios: a full
// chat-style response that mentions the test PDF's content and contains none of
// the listed failure phrases.
func FileInputExpectations() ResponseExpectations {
	var exp ResponseExpectations

	// Response shape and metadata.
	exp.ShouldHaveContent = true
	exp.ExpectedChoiceCount = 1
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true

	// Content checks.
	exp.IsRelevantToPrompt = true
	exp.ShouldContainKeywords = []string{"hello", "world"} // content from the test PDF
	exp.ShouldNotContainWords = []string{
		"cannot", "unable", "error", "failed",
		"unsupported", "invalid", "corrupted",
		"can't read", "cannot read", "no file",
		"no document", "cannot process",
	}
	return exp
}
// SpeechExpectations returns expectations for speech synthesis scenarios.
// No text content or choices are expected; the audio-specific checks,
// including minAudioBytes, are stored under ProviderSpecific.
func SpeechExpectations(minAudioBytes int) ResponseExpectations {
	audioChecks := map[string]interface{}{
		"response_type":     "speech_synthesis",
		"expected_format":   "audio", // general audio format
		"should_have_audio": true,
		"min_audio_bytes":   minAudioBytes,
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// speech responses have neither text content nor choices.
	var exp ResponseExpectations
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present
	exp.ProviderSpecific = audioChecks
	return exp
}
// TranscriptionExpectations returns expectations for transcription scenarios.
// No chat content or choices are expected; transcription failure phrases are
// denied, and the transcription-specific checks, including minTextLength, are
// stored under ProviderSpecific.
func TranscriptionExpectations(minTextLength int) ResponseExpectations {
	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// a transcription carries transcribed text, not chat content or choices.
	var exp ResponseExpectations
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present

	exp.ShouldNotContainWords = []string{
		"could not transcribe", "failed to process",
		"invalid audio", "corrupted audio",
		"unsupported format", "transcription error",
		"no audio detected", "silence detected",
	}

	exp.ProviderSpecific = map[string]interface{}{
		"response_type":             "transcription",
		"should_have_transcription": true,
		"min_transcription_length":  minTextLength,
	}
	return exp
}
// ImageGenerationExpectations returns expectations for image generation
// scenarios. No text content or choices are expected; minImages and
// expectedSize are stored under ProviderSpecific together with the response
// type tag.
func ImageGenerationExpectations(minImages int, expectedSize string) ResponseExpectations {
	imageChecks := map[string]interface{}{
		"response_type": "image_generation",
		"expected_size": expectedSize,
		"min_images":    minImages,
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// image responses have neither text content nor choices.
	var exp ResponseExpectations
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present
	exp.ProviderSpecific = imageChecks
	return exp
}
// ReasoningExpectations returns expectations for reasoning scenarios: content,
// usage stats, timestamps and model are required. ShouldHaveLatency and
// ExpectedChoiceCount are not set by this preset.
func ReasoningExpectations() ResponseExpectations {
	var exp ResponseExpectations
	exp.ShouldHaveContent = true
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ProviderSpecific = map[string]interface{}{
		"expects_step_by_step": true,
		"response_type":        "reasoning",
	}
	return exp
}
// ChatAudioExpectations returns expectations for chat audio scenarios: one
// choice with full metadata, but no text content requirement. The response
// type is tagged under ProviderSpecific.
func ChatAudioExpectations() ResponseExpectations {
	// ShouldHaveContent stays false: chat audio responses may have
	// audio/transcript but not text content.
	var exp ResponseExpectations
	exp.ExpectedChoiceCount = 1 // should have one choice with audio data
	exp.ShouldHaveUsageStats = true
	exp.ShouldHaveTimestamps = true
	exp.ShouldHaveModel = true
	exp.ShouldHaveLatency = true // global expectation: latency should always be present
	exp.ProviderSpecific = map[string]interface{}{"response_type": "chat_audio"}
	return exp
}
// =============================================================================
// SCENARIO-SPECIFIC EXPECTATION BUILDERS
// =============================================================================
// GetExpectationsForScenario picks the preset expectations for scenarioName and
// then layers the raw request/response expectations from testConfig on top.
//
// customParams optionally overrides preset inputs; it may be nil. A value is
// honored only when it has the exact Go type checked below, and presets that
// take two parameters use the overrides only when both are present. Unknown
// scenario names fall back to BasicChatExpectations.
func GetExpectationsForScenario(scenarioName string, testConfig ComprehensiveTestConfig, customParams map[string]interface{}) ResponseExpectations {
	var exp ResponseExpectations

	switch scenarioName {
	case "SimpleChat":
		exp = BasicChatExpectations()

	case "TextCompletion":
		exp = TextCompletionExpectations()

	case "ToolCalls":
		name, hasName := customParams["tool_name"].(string)
		args, hasArgs := customParams["required_args"].([]string)
		if hasName && hasArgs {
			exp = ToolCallExpectations(name, args)
		} else {
			exp = WeatherToolExpectations() // default to weather tool
		}

	case "MultipleToolCalls":
		names, hasNames := customParams["tool_names"].([]string)
		argsPerTool, hasArgs := customParams["required_args_per_tool"].([][]string)
		if hasNames && hasArgs {
			exp = MultipleToolExpectations(names, argsPerTool)
		} else {
			// Default to weather and calculator.
			exp = MultipleToolExpectations(
				[]string{string(SampleToolTypeWeather), string(SampleToolTypeCalculate)},
				[][]string{{"location"}, {"expression"}},
			)
		}

	case "End2EndToolCalling":
		exp = ConversationExpectations([]string{"weather", "temperature", "result"})

	case "AutomaticFunctionCalling":
		exp = WeatherToolExpectations()
		exp.ShouldHaveContent = true // should have follow-up text after tool call

	case "ImageURL", "ImageBase64":
		exp = VisionExpectations([]string{"image", "picture", "see"})

	case "MultipleImages":
		exp = VisionExpectations([]string{"compare", "similar", "different", "images"})

	case "FileInput":
		exp = FileInputExpectations()

	case "ChatCompletionStream", "TextCompletionStream":
		exp = StreamingExpectations()

	case "MultiTurnConversation":
		keywords, ok := customParams["context_keywords"].([]string)
		if !ok {
			keywords = []string{"context", "previous", "mentioned"}
		}
		exp = ConversationExpectations(keywords)

	case "Embedding":
		texts, ok := customParams["input_texts"].([]string)
		if !ok {
			texts = []string{"Hello, world!", "Hi, world!", "Goodnight, moon!"}
		}
		exp = EmbeddingExpectations(texts)

	case "CountTokens":
		exp = CountTokensExpectations()

	case "CompleteEnd2End":
		exp = ConversationExpectations([]string{"complete", "comprehensive", "full"})

	case "SpeechSynthesis":
		minBytes, ok := customParams["min_audio_bytes"].(int)
		if !ok {
			minBytes = 500 // default minimum 500 bytes
		}
		exp = SpeechExpectations(minBytes)

	case "Transcription":
		minLength, ok := customParams["min_transcription_length"].(int)
		if !ok {
			minLength = 10 // default minimum 10 characters
		}
		exp = TranscriptionExpectations(minLength)

	case "Reasoning":
		exp = ReasoningExpectations()

	case "ChatAudio":
		exp = ChatAudioExpectations()

	case "ProviderSpecific":
		exp = BasicChatExpectations()
		exp.ShouldContainKeywords = []string{"unique", "specific", "capability"}

	case "ImageGeneration", "ImageEdit", "ImageVariation":
		// Edit and variation reuse the image generation expectations since they
		// use the same response structure.
		minImages, hasMin := customParams["min_images"].(int)
		size, hasSize := customParams["expected_size"].(string)
		if !(hasMin && hasSize) {
			minImages, size = 1, "1024x1024"
		}
		exp = ImageGenerationExpectations(minImages, size)

	default:
		exp = BasicChatExpectations()
	}

	// Skip raw request/response for CountTokens - not all providers support it
	// uniformly.
	if scenarioName == "CountTokens" {
		return exp
	}

	// Multipart form-data requests are flagged so ApplyRawExpectations skips the
	// raw request expectation for them.
	var isMultipartRequest bool
	switch scenarioName {
	case "Transcription", "TranscriptionStream", "ImageEdit", "ImageEditStream", "ImageVariation":
		isMultipartRequest = true
	}

	// Streaming scenarios are recognized purely by their name suffix.
	isStreaming := strings.HasSuffix(scenarioName, "Stream") || strings.HasSuffix(scenarioName, "Streaming")

	// NOTE(review): no isBinaryResponse option is passed here, so "SpeechSynthesis"
	// still gets the raw response expectation — confirm that is intended.
	return ApplyRawExpectations(exp, testConfig, isStreaming, isMultipartRequest)
}
// =============================================================================
// PROVIDER-SPECIFIC EXPECTATION MODIFIERS
// =============================================================================
// ModifyExpectationsForProvider adjusts the metadata expectations (usage stats,
// timestamps, model, latency) to match what each provider returns. Providers
// that share the same adjustments are grouped into one case. For a provider
// that is not listed, the incoming expectations are returned unchanged.
func ModifyExpectationsForProvider(expectations ResponseExpectations, provider schemas.ModelProvider) ResponseExpectations {
	// NOTE: This function must NOT set ShouldHaveTimestamps or ShouldHaveModel to
	// true. StreamingExpectations explicitly disables those fields, and turning
	// them back on here would make streaming tests assert on fields that
	// consolidated streaming responses cannot reliably carry.
	// ShouldHaveUsageStats and ShouldHaveLatency may be enabled because no
	// scenario preset disables them, and some presets (e.g. ReasoningExpectations)
	// omit ShouldHaveLatency entirely.
	switch provider {
	case schemas.OpenAI,
		schemas.Azure, // Azure OpenAI returns the same fields as OpenAI
		schemas.Anthropic,
		schemas.Mistral,
		schemas.Groq,
		schemas.Cerebras,
		schemas.OpenRouter, // proxies to multiple providers; returns OpenAI-compatible fields
		schemas.XAI,
		schemas.Nebius,
		schemas.VLLM: // vLLM local inference — OpenAI-compatible
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveLatency = true

	case schemas.Bedrock:
		// Bedrock returns usage stats for most calls via Bifrost normalization, but
		// not all, so the usage expectation is left as provided.
		expectations.ShouldHaveTimestamps = false // Bedrock does not return created timestamps
		expectations.ShouldHaveLatency = true

	case schemas.Cohere:
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveModel = false // Cohere does not return model field in all response types
		expectations.ShouldHaveLatency = true

	case schemas.Vertex, schemas.Gemini:
		// Usage is returned, but created timestamps are not.
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveLatency = true

	case schemas.Perplexity, schemas.Parasail:
		// Neither created timestamps nor the model field are returned.
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveModel = false
		expectations.ShouldHaveLatency = true

	case schemas.Ollama, // local models may not return usage or timestamps
		schemas.SGL,        // SGLang local inference — may not return all fields
		schemas.Elevenlabs, // primarily audio — usage/timestamps may not apply to all calls
		schemas.HuggingFace,
		schemas.Replicate,
		schemas.Runway: // primarily video/image generation
		expectations.ShouldHaveUsageStats = false
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveLatency = true

	default:
		// Unlisted provider: keep the expectations exactly as passed in.
	}
	return expectations
}
// ApplyRawExpectations turns on the raw request/response expectations when
// testConfig.ExpectRawRequestResponse is set; otherwise expectations is
// returned unchanged. Call it after building expectations directly
// (SpeechExpectations, TranscriptionExpectations, etc.) when not going through
// GetExpectationsForScenario.
//
// Parameters:
//   - isStreaming: if true, the raw response is not expected (streaming has no
//     single response body).
//   - options: optional positional flags:
//   - options[0] = isMultipartRequest: if true, the raw request is not expected
//     (multipart form data can't return a raw JSON request).
//   - options[1] = isBinaryResponse: if true, the raw response is not expected
//     (binary responses like audio don't have a JSON raw response).
func ApplyRawExpectations(expectations ResponseExpectations, testConfig ComprehensiveTestConfig, isStreaming bool, options ...bool) ResponseExpectations {
	if !testConfig.ExpectRawRequestResponse {
		return expectations
	}

	// Missing options default to false.
	var isMultipartRequest, isBinaryResponse bool
	switch {
	case len(options) >= 2:
		isMultipartRequest, isBinaryResponse = options[0], options[1]
	case len(options) == 1:
		isMultipartRequest = options[0]
	}

	if !isMultipartRequest {
		expectations.ShouldHaveRawRequest = true
	}
	if !(isStreaming || isBinaryResponse) {
		expectations.ShouldHaveRawResponse = true
	}
	return expectations
}
// =============================================================================
// ADVANCED VALIDATION EXPECTATIONS
// =============================================================================
// semanticCoherenceContentPattern matches content that starts with an uppercase
// ASCII letter and ends with '.', '!' or '?'. The (?s) flag lets '.' match
// newlines; without it any multi-line response would fail the pattern no
// matter how it begins and ends. Compiled once at package scope.
var semanticCoherenceContentPattern = regexp.MustCompile(`(?s)^[A-Z].*[.!?]$`)

// SemanticCoherenceExpectations returns expectations for semantic coherence
// tests. It extends BasicChatExpectations with expectedTopics as required
// keywords, marks the response as needing to be relevant to the prompt, and
// requires the content to start with a capital letter and end with sentence
// punctuation.
//
// inputPrompt is accepted for API compatibility but is not currently used.
func SemanticCoherenceExpectations(inputPrompt string, expectedTopics []string) ResponseExpectations {
	expectations := BasicChatExpectations()
	expectations.ShouldContainKeywords = expectedTopics
	expectations.IsRelevantToPrompt = true
	// Coherent responses should start with a capital and end with punctuation,
	// including when they span several lines or paragraphs.
	expectations.ContentPattern = semanticCoherenceContentPattern
	return expectations
}
// ConsistencyExpectations returns expectations for consistency tests. It
// extends BasicChatExpectations with expectedConsistencyMarkers as required
// keywords and adds contradiction and uncertainty markers to the deny-list.
func ConsistencyExpectations(expectedConsistencyMarkers []string) ResponseExpectations {
	exp := BasicChatExpectations()
	exp.ShouldContainKeywords = expectedConsistencyMarkers
	exp.ShouldNotContainWords = append(
		exp.ShouldNotContainWords,
		// Contradiction markers.
		"however", "but", "on the other hand",
		// Uncertainty markers.
		"i'm not sure", "maybe", "possibly", "might be",
	)
	return exp
}
// =============================================================================
// UTILITY FUNCTIONS
// =============================================================================
// stringPtr returns a pointer to a fresh copy of s.
func stringPtr(s string) *string {
	p := new(string)
	*p = s
	return p
}
// CombineExpectations merges several expectations into one, starting from the
// first and folding each later one into it. With no arguments it returns
// BasicChatExpectations.
//
// Merge rules:
//   - Boolean flags are OR-ed: a later true enables the flag, but a later false
//     never clears a flag that is already set.
//   - ExpectedChoiceCount, ExpectedFinishReason and ContentPattern are replaced
//     only by a later non-zero / non-nil value.
//   - ShouldContainKeywords, ShouldNotContainWords and ExpectedToolCalls are
//     concatenated in argument order.
//   - ProviderSpecific maps are merged key by key, later values winning.
//   - Fields not handled below (e.g. ShouldContainAnyOf, ShouldHaveRawRequest,
//     ShouldHaveRawResponse) are taken from the first expectation only.
//
// The arguments are never modified: the result gets its own ProviderSpecific
// map, and appended elements never land in an argument's slice backing array.
func CombineExpectations(expectations ...ResponseExpectations) ResponseExpectations {
	if len(expectations) == 0 {
		return BasicChatExpectations()
	}

	// base is only a shallow copy of the first argument, so its slices and map
	// still share storage with the caller's value. Detach them before merging.
	base := expectations[0]

	// Clip capacity to length so every append below reallocates instead of
	// writing into spare capacity of the caller's backing array.
	base.ShouldContainKeywords = base.ShouldContainKeywords[:len(base.ShouldContainKeywords):len(base.ShouldContainKeywords)]
	base.ShouldNotContainWords = base.ShouldNotContainWords[:len(base.ShouldNotContainWords):len(base.ShouldNotContainWords)]
	base.ExpectedToolCalls = base.ExpectedToolCalls[:len(base.ExpectedToolCalls):len(base.ExpectedToolCalls)]

	// Copy the map so merged keys do not leak into the first argument.
	if base.ProviderSpecific != nil {
		cloned := make(map[string]interface{}, len(base.ProviderSpecific))
		for k, v := range base.ProviderSpecific {
			cloned[k] = v
		}
		base.ProviderSpecific = cloned
	}

	for _, exp := range expectations[1:] {
		// Override fields that are set in the new expectation.
		if exp.ShouldHaveContent {
			base.ShouldHaveContent = true
		}
		if exp.ExpectedChoiceCount > 0 {
			base.ExpectedChoiceCount = exp.ExpectedChoiceCount
		}
		if exp.ExpectedFinishReason != nil {
			base.ExpectedFinishReason = exp.ExpectedFinishReason
		}

		// Concatenate list-valued fields.
		base.ShouldContainKeywords = append(base.ShouldContainKeywords, exp.ShouldContainKeywords...)
		base.ShouldNotContainWords = append(base.ShouldNotContainWords, exp.ShouldNotContainWords...)
		base.ExpectedToolCalls = append(base.ExpectedToolCalls, exp.ExpectedToolCalls...)

		if exp.ContentPattern != nil {
			base.ContentPattern = exp.ContentPattern
		}

		// Remaining boolean flags: enable when the later expectation enables them.
		if exp.IsRelevantToPrompt {
			base.IsRelevantToPrompt = true
		}
		if exp.ShouldNotHaveFunctionCalls {
			base.ShouldNotHaveFunctionCalls = true
		}
		if exp.ShouldHaveUsageStats {
			base.ShouldHaveUsageStats = true
		}
		if exp.ShouldHaveTimestamps {
			base.ShouldHaveTimestamps = true
		}
		if exp.ShouldHaveModel {
			base.ShouldHaveModel = true
		}
		if exp.ShouldHaveLatency {
			base.ShouldHaveLatency = true
		}

		// Merge provider-specific data; later keys overwrite earlier ones.
		if len(exp.ProviderSpecific) > 0 {
			if base.ProviderSpecific == nil {
				base.ProviderSpecific = make(map[string]interface{}, len(exp.ProviderSpecific))
			}
			for k, v := range exp.ProviderSpecific {
				base.ProviderSpecific[k] = v
			}
		}
	}
	return base
}