666 lines
24 KiB
Go
666 lines
24 KiB
Go
package llmtests
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
)
|
|
|
|
// =============================================================================
|
|
// PRESET VALIDATION EXPECTATIONS FOR COMMON SCENARIOS
|
|
// =============================================================================
|
|
|
|
// BasicChatExpectations returns validation expectations for basic chat scenarios
|
|
func BasicChatExpectations() ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: true,
|
|
ExpectedChoiceCount: 1, // Usually expect one choice, will be used on outputs for responses API
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
ShouldNotContainWords: []string{
|
|
"i can't", "i cannot", "i'm unable", "i am unable",
|
|
"i don't know", "i'm not sure", "i am not sure",
|
|
},
|
|
}
|
|
}
|
|
|
|
// ToolCallExpectations returns validation expectations for tool calling scenarios
|
|
func ToolCallExpectations(toolName string, requiredArgs []string) ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ExpectedToolCalls = []ToolCallExpectation{
|
|
{
|
|
FunctionName: toolName,
|
|
RequiredArgs: requiredArgs,
|
|
ValidateArgsJSON: true,
|
|
},
|
|
}
|
|
// Tool calls might not have text content
|
|
expectations.ShouldHaveContent = false
|
|
|
|
return expectations
|
|
}
|
|
|
|
// WeatherToolExpectations returns validation expectations for weather tool calls
|
|
func WeatherToolExpectations() ResponseExpectations {
|
|
return ToolCallExpectations(string(SampleToolTypeWeather), []string{"location"})
|
|
}
|
|
|
|
// CalculatorToolExpectations returns validation expectations for calculator tool calls
|
|
func CalculatorToolExpectations() ResponseExpectations {
|
|
return ToolCallExpectations(string(SampleToolTypeCalculate), []string{"expression"})
|
|
}
|
|
|
|
// TimeToolExpectations returns validation expectations for time tool calls
|
|
func TimeToolExpectations() ResponseExpectations {
|
|
return ToolCallExpectations(string(SampleToolTypeTime), []string{"timezone"})
|
|
}
|
|
|
|
// MultipleToolExpectations returns validation expectations for multiple tool calls
|
|
func MultipleToolExpectations(tools []string, requiredArgsPerTool [][]string) ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ShouldHaveContent = false // Tool calls might not have text Content
|
|
|
|
for i, tool := range tools {
|
|
var args []string
|
|
if i < len(requiredArgsPerTool) {
|
|
args = requiredArgsPerTool[i]
|
|
}
|
|
|
|
expectations.ExpectedToolCalls = append(expectations.ExpectedToolCalls, ToolCallExpectation{
|
|
FunctionName: tool,
|
|
RequiredArgs: args,
|
|
ValidateArgsJSON: true,
|
|
})
|
|
}
|
|
|
|
return expectations
|
|
}
|
|
|
|
// ImageAnalysisExpectations returns validation expectations for image analysis scenarios
|
|
func ImageAnalysisExpectations() ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ShouldContainKeywords = []string{"image", "picture", "photo", "see", "shows", "contains"}
|
|
expectations.ShouldNotContainWords = append(expectations.ShouldNotContainWords, []string{
|
|
"i can't see", "i cannot see", "unable to see", "can't view",
|
|
"cannot view", "no image", "not able to see", "i don't see",
|
|
}...)
|
|
|
|
return expectations
|
|
}
|
|
|
|
// TextCompletionExpectations returns validation expectations for text completion scenarios
|
|
func TextCompletionExpectations() ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
|
|
return expectations
|
|
}
|
|
|
|
// EmbeddingExpectations returns validation expectations for embedding scenarios
|
|
func EmbeddingExpectations(expectedTexts []string) ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // Embeddings don't have text content
|
|
ExpectedChoiceCount: 0, // Embeddings use different structure
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
// Custom validation will be needed for embedding data
|
|
ProviderSpecific: map[string]interface{}{
|
|
"expected_embedding_count": len(expectedTexts),
|
|
"expected_texts": expectedTexts,
|
|
},
|
|
}
|
|
}
|
|
|
|
// CountTokensExpectations returns validation expectations for count tokens scenarios
|
|
func CountTokensExpectations() ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // CountTokens doesn't return text content
|
|
ExpectedChoiceCount: 0,
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true,
|
|
ProviderSpecific: map[string]interface{}{
|
|
"response_type": "count_tokens",
|
|
},
|
|
}
|
|
}
|
|
|
|
// StreamingExpectations returns validation expectations for streaming scenarios
|
|
func StreamingExpectations() ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
|
|
// Streaming consolidated responses are assembled from chunks.
|
|
// The last chunk often does not carry created/model fields,
|
|
// so we cannot reliably validate them on the consolidated response.
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveModel = false
|
|
|
|
return expectations
|
|
}
|
|
|
|
// ConversationExpectations returns validation expectations for multi-turn conversation scenarios
|
|
func ConversationExpectations(contextKeywords []string) ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ShouldContainAnyOf = contextKeywords // Should reference conversation context
|
|
|
|
return expectations
|
|
}
|
|
|
|
// VisionExpectations returns validation expectations for vision/image processing scenarios
|
|
func VisionExpectations(expectedKeywords []string) ResponseExpectations {
|
|
expectations := ImageAnalysisExpectations() // Use existing image analysis base
|
|
if len(expectedKeywords) > 0 {
|
|
expectations.ShouldContainKeywords = expectedKeywords
|
|
}
|
|
expectations.ShouldNotContainWords = append(expectations.ShouldNotContainWords,
|
|
"cannot see", "unable to view", "no image", "can't see",
|
|
"image not found", "invalid image", "corrupted image",
|
|
"failed to load", "error processing",
|
|
)
|
|
expectations.IsRelevantToPrompt = true
|
|
return expectations
|
|
}
|
|
|
|
// FileInputExpectations returns validation expectations for file input scenarios
|
|
func FileInputExpectations() ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: true,
|
|
ExpectedChoiceCount: 1,
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true,
|
|
ShouldContainKeywords: []string{"hello", "world"}, // Content from the test PDF
|
|
ShouldNotContainWords: []string{
|
|
"cannot", "unable", "error", "failed",
|
|
"unsupported", "invalid", "corrupted",
|
|
"can't read", "cannot read", "no file",
|
|
"no document", "cannot process",
|
|
},
|
|
IsRelevantToPrompt: true,
|
|
}
|
|
}
|
|
|
|
// SpeechExpectations returns validation expectations for speech synthesis scenarios
|
|
func SpeechExpectations(minAudioBytes int) ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // Speech responses don't have text content
|
|
ExpectedChoiceCount: 0, // Speech responses don't have choices
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
// Speech-specific validations stored in ProviderSpecific
|
|
ProviderSpecific: map[string]interface{}{
|
|
"min_audio_bytes": minAudioBytes,
|
|
"should_have_audio": true,
|
|
"expected_format": "audio", // General audio format
|
|
"response_type": "speech_synthesis",
|
|
},
|
|
}
|
|
}
|
|
|
|
// TranscriptionExpectations returns validation expectations for transcription scenarios
|
|
func TranscriptionExpectations(minTextLength int) ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // Transcription has transcribed text, not chat content
|
|
ExpectedChoiceCount: 0, // Transcription responses don't have choices
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
// Transcription-specific validations
|
|
ShouldNotContainWords: []string{
|
|
"could not transcribe", "failed to process",
|
|
"invalid audio", "corrupted audio",
|
|
"unsupported format", "transcription error",
|
|
"no audio detected", "silence detected",
|
|
},
|
|
ProviderSpecific: map[string]interface{}{
|
|
"min_transcription_length": minTextLength,
|
|
"should_have_transcription": true,
|
|
"response_type": "transcription",
|
|
},
|
|
}
|
|
}
|
|
|
|
func ImageGenerationExpectations(minImages int, expectedSize string) ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // Image responses don't have text content
|
|
ExpectedChoiceCount: 0, // Image responses don't have choices
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
ProviderSpecific: map[string]interface{}{
|
|
"min_images": minImages,
|
|
"expected_size": expectedSize,
|
|
"response_type": "image_generation",
|
|
},
|
|
}
|
|
}
|
|
|
|
// ReasoningExpectations returns validation expectations for reasoning scenarios
|
|
func ReasoningExpectations() ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: true,
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ProviderSpecific: map[string]interface{}{
|
|
"response_type": "reasoning",
|
|
"expects_step_by_step": true,
|
|
},
|
|
}
|
|
}
|
|
|
|
// ChatAudioExpectations returns validation expectations for chat audio scenarios
|
|
func ChatAudioExpectations() ResponseExpectations {
|
|
return ResponseExpectations{
|
|
ShouldHaveContent: false, // Chat audio responses may have audio/transcript but not text content
|
|
ExpectedChoiceCount: 1, // Should have one choice with audio data
|
|
ShouldHaveUsageStats: true,
|
|
ShouldHaveTimestamps: true,
|
|
ShouldHaveModel: true,
|
|
ShouldHaveLatency: true, // Global expectation: latency should always be present
|
|
ProviderSpecific: map[string]interface{}{
|
|
"response_type": "chat_audio",
|
|
},
|
|
}
|
|
}
|
|
|
|
// =============================================================================
|
|
// SCENARIO-SPECIFIC EXPECTATION BUILDERS
|
|
// =============================================================================
|
|
|
|
// GetExpectationsForScenario returns appropriate validation expectations for a given scenario
|
|
func GetExpectationsForScenario(scenarioName string, testConfig ComprehensiveTestConfig, customParams map[string]interface{}) ResponseExpectations {
|
|
var expectations ResponseExpectations
|
|
|
|
switch scenarioName {
|
|
case "SimpleChat":
|
|
expectations = BasicChatExpectations()
|
|
|
|
case "TextCompletion":
|
|
expectations = TextCompletionExpectations()
|
|
|
|
case "ToolCalls":
|
|
if toolName, ok := customParams["tool_name"].(string); ok {
|
|
if args, ok := customParams["required_args"].([]string); ok {
|
|
expectations = ToolCallExpectations(toolName, args)
|
|
break
|
|
}
|
|
}
|
|
expectations = WeatherToolExpectations() // Default to weather tool
|
|
|
|
case "MultipleToolCalls":
|
|
if tools, ok := customParams["tool_names"].([]string); ok {
|
|
if argsPerTool, ok := customParams["required_args_per_tool"].([][]string); ok {
|
|
expectations = MultipleToolExpectations(tools, argsPerTool)
|
|
break
|
|
}
|
|
}
|
|
// Default to weather and calculator
|
|
expectations = MultipleToolExpectations(
|
|
[]string{string(SampleToolTypeWeather), string(SampleToolTypeCalculate)},
|
|
[][]string{{"location"}, {"expression"}},
|
|
)
|
|
|
|
case "End2EndToolCalling":
|
|
expectations = ConversationExpectations([]string{"weather", "temperature", "result"})
|
|
|
|
case "AutomaticFunctionCalling":
|
|
expectations = WeatherToolExpectations()
|
|
expectations.ShouldHaveContent = true // Should have follow-up text after tool call
|
|
|
|
case "ImageURL", "ImageBase64":
|
|
expectations = VisionExpectations([]string{"image", "picture", "see"})
|
|
|
|
case "MultipleImages":
|
|
expectations = VisionExpectations([]string{"compare", "similar", "different", "images"})
|
|
|
|
case "FileInput":
|
|
expectations = FileInputExpectations()
|
|
|
|
case "ChatCompletionStream", "TextCompletionStream":
|
|
expectations = StreamingExpectations()
|
|
|
|
case "MultiTurnConversation":
|
|
if keywords, ok := customParams["context_keywords"].([]string); ok {
|
|
expectations = ConversationExpectations(keywords)
|
|
} else {
|
|
expectations = ConversationExpectations([]string{"context", "previous", "mentioned"})
|
|
}
|
|
|
|
case "Embedding":
|
|
if texts, ok := customParams["input_texts"].([]string); ok {
|
|
expectations = EmbeddingExpectations(texts)
|
|
} else {
|
|
expectations = EmbeddingExpectations([]string{"Hello, world!", "Hi, world!", "Goodnight, moon!"})
|
|
}
|
|
|
|
case "CountTokens":
|
|
expectations = CountTokensExpectations()
|
|
|
|
case "CompleteEnd2End":
|
|
expectations = ConversationExpectations([]string{"complete", "comprehensive", "full"})
|
|
|
|
case "SpeechSynthesis":
|
|
if minBytes, ok := customParams["min_audio_bytes"].(int); ok {
|
|
expectations = SpeechExpectations(minBytes)
|
|
} else {
|
|
expectations = SpeechExpectations(500) // Default minimum 500 bytes
|
|
}
|
|
|
|
case "Transcription":
|
|
if minLength, ok := customParams["min_transcription_length"].(int); ok {
|
|
expectations = TranscriptionExpectations(minLength)
|
|
} else {
|
|
expectations = TranscriptionExpectations(10) // Default minimum 10 characters
|
|
}
|
|
|
|
case "Reasoning":
|
|
expectations = ReasoningExpectations()
|
|
|
|
case "ChatAudio":
|
|
expectations = ChatAudioExpectations()
|
|
|
|
case "ProviderSpecific":
|
|
expectations = BasicChatExpectations()
|
|
expectations.ShouldContainKeywords = []string{"unique", "specific", "capability"}
|
|
|
|
case "ImageGeneration":
|
|
if minImages, ok := customParams["min_images"].(int); ok {
|
|
if expectedSize, ok := customParams["expected_size"].(string); ok {
|
|
expectations = ImageGenerationExpectations(minImages, expectedSize)
|
|
break
|
|
}
|
|
}
|
|
expectations = ImageGenerationExpectations(1, "1024x1024")
|
|
|
|
case "ImageEdit", "ImageVariation":
|
|
// Reuse image generation expectations since they use the same response structure
|
|
if minImages, ok := customParams["min_images"].(int); ok {
|
|
if expectedSize, ok := customParams["expected_size"].(string); ok {
|
|
expectations = ImageGenerationExpectations(minImages, expectedSize)
|
|
break
|
|
}
|
|
}
|
|
expectations = ImageGenerationExpectations(1, "1024x1024")
|
|
|
|
default:
|
|
// Default to basic chat expectations
|
|
expectations = BasicChatExpectations()
|
|
}
|
|
|
|
// Apply raw request/response expectations from test config
|
|
isStreaming := strings.HasSuffix(scenarioName, "Stream") || strings.HasSuffix(scenarioName, "Streaming")
|
|
isMultipartRequest := scenarioName == "Transcription" || scenarioName == "TranscriptionStream" ||
|
|
scenarioName == "ImageEdit" || scenarioName == "ImageEditStream" ||
|
|
scenarioName == "ImageVariation"
|
|
// Skip raw request/response for CountTokens - not all providers support it uniformly
|
|
if scenarioName != "CountTokens" {
|
|
expectations = ApplyRawExpectations(expectations, testConfig, isStreaming, isMultipartRequest)
|
|
}
|
|
|
|
return expectations
|
|
}
|
|
|
|
// =============================================================================
|
|
// PROVIDER-SPECIFIC EXPECTATION MODIFIERS
|
|
// =============================================================================
|
|
|
|
// ModifyExpectationsForProvider adjusts expectations based on provider capabilities.
|
|
// Each provider is explicitly configured for: usage stats, timestamps, model, and latency.
|
|
// If a provider is not listed, defaults are kept (all true from BasicChatExpectations).
|
|
func ModifyExpectationsForProvider(expectations ResponseExpectations, provider schemas.ModelProvider) ResponseExpectations {
|
|
// NOTE: This function must NOT set ShouldHaveTimestamps or ShouldHaveModel to true.
|
|
// StreamingExpectations explicitly disables those fields, and overriding them here
|
|
// would cause streaming tests to incorrectly assert on fields that consolidated
|
|
// streaming responses cannot reliably carry.
|
|
// ShouldHaveUsageStats and ShouldHaveLatency may still be enabled here because no
|
|
// scenario preset disables them, and some presets (e.g. ReasoningExpectations) omit
|
|
// ShouldHaveLatency entirely.
|
|
switch provider {
|
|
case schemas.OpenAI:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Azure:
|
|
// Azure OpenAI returns the same fields as OpenAI
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Anthropic:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Bedrock:
|
|
// Bedrock returns usage stats for most calls via Bifrost normalization, but not all
|
|
expectations.ShouldHaveTimestamps = false // Bedrock does not return created timestamps
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Cohere:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveModel = false // Cohere does not return model field in all response types
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Vertex:
|
|
// Google Vertex AI returns usage and model but may not return timestamps
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveTimestamps = false // Vertex does not return created timestamps
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Mistral:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Ollama:
|
|
// Local models may not return usage or timestamps
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Groq:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Gemini:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveTimestamps = false // Gemini does not return created timestamps
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Perplexity:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveTimestamps = false // Perplexity does not return created timestamps
|
|
expectations.ShouldHaveModel = false // Perplexity does not return model field
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Cerebras:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.OpenRouter:
|
|
// OpenRouter proxies to multiple providers; returns OpenAI-compatible fields
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.XAI:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Nebius:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.SGL:
|
|
// SGLang local inference — may not return all fields
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Parasail:
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveTimestamps = false // Parasail does not return created timestamps
|
|
expectations.ShouldHaveModel = false // Parasail does not return model field
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Elevenlabs:
|
|
// Elevenlabs is primarily audio — usage/timestamps may not apply to all calls
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.HuggingFace:
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Replicate:
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.VLLM:
|
|
// vLLM local inference — OpenAI-compatible
|
|
expectations.ShouldHaveUsageStats = true
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
case schemas.Runway:
|
|
// Runway is primarily video/image generation
|
|
expectations.ShouldHaveUsageStats = false
|
|
expectations.ShouldHaveTimestamps = false
|
|
expectations.ShouldHaveLatency = true
|
|
|
|
default:
|
|
// Keep default expectations — all true from BasicChatExpectations
|
|
}
|
|
|
|
return expectations
|
|
}
|
|
|
|
// ApplyRawExpectations applies raw request/response expectations based on test config.
|
|
// Call this after creating expectations directly (SpeechExpectations, TranscriptionExpectations, etc.)
|
|
// when not using GetExpectationsForScenario.
|
|
// Parameters:
|
|
// - isStreaming: if true, skips RawResponse expectation (streaming has no single response body)
|
|
// - options: variadic bool options:
|
|
// - options[0] = isMultipartRequest: if true, skips RawRequest expectation (multipart form data can't return raw JSON request)
|
|
// - options[1] = isBinaryResponse: if true, skips RawResponse expectation (binary responses like audio don't have JSON raw response)
|
|
func ApplyRawExpectations(expectations ResponseExpectations, testConfig ComprehensiveTestConfig, isStreaming bool, options ...bool) ResponseExpectations {
|
|
if testConfig.ExpectRawRequestResponse {
|
|
// options[0] = isMultipartRequest (skip RawRequest for multipart form data requests like transcription)
|
|
// options[1] = isBinaryResponse (skip RawResponse for binary responses like speech synthesis audio)
|
|
skipRawRequest := len(options) > 0 && options[0]
|
|
skipRawResponse := len(options) > 1 && options[1]
|
|
if !skipRawRequest {
|
|
expectations.ShouldHaveRawRequest = true
|
|
}
|
|
if !isStreaming && !skipRawResponse {
|
|
expectations.ShouldHaveRawResponse = true
|
|
}
|
|
}
|
|
return expectations
|
|
}
|
|
|
|
// =============================================================================
|
|
// ADVANCED VALIDATION EXPECTATIONS
|
|
// =============================================================================
|
|
|
|
// SemanticCoherenceExpectations returns expectations for semantic coherence tests
|
|
func SemanticCoherenceExpectations(inputPrompt string, expectedTopics []string) ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ShouldContainKeywords = expectedTopics
|
|
expectations.IsRelevantToPrompt = true
|
|
|
|
// Add pattern for coherent responses (no contradictions, proper flow)
|
|
expectations.ContentPattern = regexp.MustCompile(`^[A-Z].*[.!?]$`) // Should start with capital and end with punctuation
|
|
|
|
return expectations
|
|
}
|
|
|
|
// ConsistencyExpectations returns expectations for consistency tests
|
|
func ConsistencyExpectations(expectedConsistencyMarkers []string) ResponseExpectations {
|
|
expectations := BasicChatExpectations()
|
|
expectations.ShouldContainKeywords = expectedConsistencyMarkers
|
|
expectations.ShouldNotContainWords = append(expectations.ShouldNotContainWords, []string{
|
|
"however", "but", "on the other hand", // Contradiction markers
|
|
"i'm not sure", "maybe", "possibly", "might be", // Uncertainty markers
|
|
}...)
|
|
|
|
return expectations
|
|
}
|
|
|
|
// =============================================================================
|
|
// UTILITY FUNCTIONS
|
|
// =============================================================================
|
|
|
|
// stringPtr returns a pointer to a fresh copy of s.
func stringPtr(s string) *string {
	p := new(string)
	*p = s
	return p
}
|
|
|
|
// CombineExpectations merges multiple expectations (later ones override earlier ones)
|
|
func CombineExpectations(expectations ...ResponseExpectations) ResponseExpectations {
|
|
if len(expectations) == 0 {
|
|
return BasicChatExpectations()
|
|
}
|
|
|
|
base := expectations[0]
|
|
|
|
for _, exp := range expectations[1:] {
|
|
// Override fields that are set in the new expectation
|
|
if exp.ShouldHaveContent {
|
|
base.ShouldHaveContent = exp.ShouldHaveContent
|
|
}
|
|
if exp.ExpectedChoiceCount > 0 {
|
|
base.ExpectedChoiceCount = exp.ExpectedChoiceCount
|
|
}
|
|
if exp.ExpectedFinishReason != nil {
|
|
base.ExpectedFinishReason = exp.ExpectedFinishReason
|
|
}
|
|
|
|
// Append arrays
|
|
base.ShouldContainKeywords = append(base.ShouldContainKeywords, exp.ShouldContainKeywords...)
|
|
base.ShouldNotContainWords = append(base.ShouldNotContainWords, exp.ShouldNotContainWords...)
|
|
base.ExpectedToolCalls = append(base.ExpectedToolCalls, exp.ExpectedToolCalls...)
|
|
|
|
// Override other fields
|
|
if exp.ContentPattern != nil {
|
|
base.ContentPattern = exp.ContentPattern
|
|
}
|
|
if exp.IsRelevantToPrompt {
|
|
base.IsRelevantToPrompt = exp.IsRelevantToPrompt
|
|
}
|
|
if exp.ShouldNotHaveFunctionCalls {
|
|
base.ShouldNotHaveFunctionCalls = exp.ShouldNotHaveFunctionCalls
|
|
}
|
|
if exp.ShouldHaveUsageStats {
|
|
base.ShouldHaveUsageStats = exp.ShouldHaveUsageStats
|
|
}
|
|
if exp.ShouldHaveTimestamps {
|
|
base.ShouldHaveTimestamps = exp.ShouldHaveTimestamps
|
|
}
|
|
if exp.ShouldHaveModel {
|
|
base.ShouldHaveModel = exp.ShouldHaveModel
|
|
}
|
|
if exp.ShouldHaveLatency {
|
|
base.ShouldHaveLatency = exp.ShouldHaveLatency
|
|
}
|
|
|
|
// Merge provider specific data
|
|
if len(exp.ProviderSpecific) > 0 {
|
|
if base.ProviderSpecific == nil {
|
|
base.ProviderSpecific = make(map[string]interface{})
|
|
}
|
|
for k, v := range exp.ProviderSpecific {
|
|
base.ProviderSpecific[k] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
return base
|
|
}
|