bifrost/core/internal/llmtests/validation_presets.go

package llmtests

import (
	"regexp"
	"strings"

	"github.com/maximhq/bifrost/core/schemas"
)

// =============================================================================
// PRESET VALIDATION EXPECTATIONS FOR COMMON SCENARIOS
// =============================================================================

// BasicChatExpectations returns the baseline expectations for a plain chat
// exchange: the response must carry content, one choice, usage stats,
// timestamps, a model and latency, and must not contain any of the listed
// refusal or uncertainty phrases.
func BasicChatExpectations() ResponseExpectations {
	// Phrases that signal the model declined or hedged instead of answering.
	refusalPhrases := []string{
		"i can't", "i cannot", "i'm unable", "i am unable",
		"i don't know", "i'm not sure", "i am not sure",
	}

	var result ResponseExpectations
	result.ShouldHaveContent = true
	// Usually expect one choice, will be used on outputs for responses API
	result.ExpectedChoiceCount = 1
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ShouldNotContainWords = refusalPhrases

	return result
}

// ToolCallExpectations returns the basic chat expectations extended with a
// single expected tool call.
//
// toolName is the function the model is expected to call and requiredArgs are
// the argument names that call must include; JSON validation of the arguments
// is always requested. Text content is not required, since a tool-call
// response might not have any.
func ToolCallExpectations(toolName string, requiredArgs []string) ResponseExpectations {
	call := ToolCallExpectation{
		FunctionName:     toolName,
		RequiredArgs:     requiredArgs,
		ValidateArgsJSON: true,
	}

	result := BasicChatExpectations()
	result.ShouldHaveContent = false
	result.ExpectedToolCalls = []ToolCallExpectation{call}

	return result
}

// WeatherToolExpectations returns tool-call expectations for the sample
// weather tool, which must be called with a "location" argument.
func WeatherToolExpectations() ResponseExpectations {
	name := string(SampleToolTypeWeather)
	required := []string{"location"}
	return ToolCallExpectations(name, required)
}

// CalculatorToolExpectations returns tool-call expectations for the sample
// calculator tool, which must be called with an "expression" argument.
func CalculatorToolExpectations() ResponseExpectations {
	name := string(SampleToolTypeCalculate)
	required := []string{"expression"}
	return ToolCallExpectations(name, required)
}

// TimeToolExpectations returns tool-call expectations for the sample time
// tool, which must be called with a "timezone" argument.
func TimeToolExpectations() ResponseExpectations {
	name := string(SampleToolTypeTime)
	required := []string{"timezone"}
	return ToolCallExpectations(name, required)
}

// MultipleToolExpectations returns the basic chat expectations extended with
// one expected tool call per entry in tools.
//
// requiredArgsPerTool is matched to tools by index; a tool with no entry at
// its index gets no required arguments. Text content is not required.
func MultipleToolExpectations(tools []string, requiredArgsPerTool [][]string) ResponseExpectations {
	result := BasicChatExpectations()
	// Tool calls might not have text content
	result.ShouldHaveContent = false

	for idx := 0; idx < len(tools); idx++ {
		call := ToolCallExpectation{
			FunctionName:     tools[idx],
			ValidateArgsJSON: true,
		}
		// Only attach required args when the caller supplied a list for this tool.
		if idx < len(requiredArgsPerTool) {
			call.RequiredArgs = requiredArgsPerTool[idx]
		}
		result.ExpectedToolCalls = append(result.ExpectedToolCalls, call)
	}

	return result
}

// ImageAnalysisExpectations returns the basic chat expectations plus
// image-related keywords to look for and additional forbidden phrases that
// indicate the model could not see the image.
func ImageAnalysisExpectations() ResponseExpectations {
	result := BasicChatExpectations()

	result.ShouldContainKeywords = []string{"image", "picture", "photo", "see", "shows", "contains"}
	result.ShouldNotContainWords = append(result.ShouldNotContainWords,
		"i can't see", "i cannot see", "unable to see", "can't view",
		"cannot view", "no image", "not able to see", "i don't see",
	)

	return result
}

// TextCompletionExpectations returns expectations for text completion
// scenarios; these are currently the same as the basic chat expectations.
func TextCompletionExpectations() ResponseExpectations {
	return BasicChatExpectations()
}

// EmbeddingExpectations returns expectations for embedding scenarios.
//
// Only model and latency are required. The number of input texts and the
// texts themselves are recorded under ProviderSpecific
// ("expected_embedding_count", "expected_texts") for custom validation of the
// embedding data.
func EmbeddingExpectations(expectedTexts []string) ResponseExpectations {
	// Custom validation will be needed for embedding data
	embeddingDetails := map[string]interface{}{
		"expected_embedding_count": len(expectedTexts),
		"expected_texts":           expectedTexts,
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// embeddings don't have text content and use a different structure.
	var result ResponseExpectations
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ProviderSpecific = embeddingDetails

	return result
}

// CountTokensExpectations returns expectations for count-tokens scenarios:
// usage stats, model and latency are required, and the response is tagged as
// "count_tokens" under ProviderSpecific.
func CountTokensExpectations() ResponseExpectations {
	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// CountTokens doesn't return text content.
	var result ResponseExpectations
	result.ShouldHaveUsageStats = true
	result.ShouldHaveModel = true
	result.ShouldHaveLatency = true
	result.ProviderSpecific = map[string]interface{}{
		"response_type": "count_tokens",
	}

	return result
}

// StreamingExpectations returns the basic chat expectations with the
// timestamp and model checks turned off.
//
// Consolidated streaming responses are assembled from chunks, and the last
// chunk often does not carry created/model fields, so those two cannot be
// validated reliably on the consolidated response.
func StreamingExpectations() ResponseExpectations {
	result := BasicChatExpectations()
	result.ShouldHaveModel, result.ShouldHaveTimestamps = false, false
	return result
}

// ConversationExpectations returns the basic chat expectations for multi-turn
// conversations, with contextKeywords stored as the any-of keyword list so
// the response is expected to reference the conversation context.
func ConversationExpectations(contextKeywords []string) ResponseExpectations {
	result := BasicChatExpectations()
	// Should reference conversation context
	result.ShouldContainAnyOf = contextKeywords
	return result
}

// VisionExpectations returns expectations for vision/image processing
// scenarios, built on top of ImageAnalysisExpectations.
//
// A non-empty expectedKeywords replaces the default image keywords. Extra
// forbidden phrases covering image loading and processing failures are
// appended, and the response is flagged as needing to be relevant to the prompt.
func VisionExpectations(expectedKeywords []string) ResponseExpectations {
	result := ImageAnalysisExpectations()
	result.IsRelevantToPrompt = true

	if len(expectedKeywords) != 0 {
		result.ShouldContainKeywords = expectedKeywords
	}

	failurePhrases := []string{
		"cannot see", "unable to view", "no image", "can't see",
		"image not found", "invalid image", "corrupted image",
		"failed to load", "error processing",
	}
	result.ShouldNotContainWords = append(result.ShouldNotContainWords, failurePhrases...)

	return result
}

// FileInputExpectations returns expectations for file input scenarios: a
// full chat-style response that mentions "hello" and "world" (the content of
// the test PDF), contains none of the listed failure words, and is relevant
// to the prompt.
func FileInputExpectations() ResponseExpectations {
	// Words suggesting the file could not be read or processed.
	failureWords := []string{
		"cannot", "unable", "error", "failed",
		"unsupported", "invalid", "corrupted",
		"can't read", "cannot read", "no file",
		"no document", "cannot process",
	}

	var result ResponseExpectations
	result.ShouldHaveContent = true
	result.ExpectedChoiceCount = 1
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	result.ShouldHaveLatency = true
	// Content from the test PDF
	result.ShouldContainKeywords = []string{"hello", "world"}
	result.ShouldNotContainWords = failureWords
	result.IsRelevantToPrompt = true

	return result
}

// SpeechExpectations returns expectations for speech synthesis scenarios.
//
// Usage stats, timestamps, model and latency are required. minAudioBytes is
// recorded under ProviderSpecific ("min_audio_bytes") together with the other
// speech-specific markers.
func SpeechExpectations(minAudioBytes int) ResponseExpectations {
	// Speech-specific validations stored in ProviderSpecific
	speechDetails := map[string]interface{}{
		"min_audio_bytes":   minAudioBytes,
		"should_have_audio": true,
		"expected_format":   "audio", // General audio format
		"response_type":     "speech_synthesis",
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// speech responses don't have text content or choices.
	var result ResponseExpectations
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ProviderSpecific = speechDetails

	return result
}

// TranscriptionExpectations returns expectations for transcription scenarios.
//
// Usage stats, timestamps, model and latency are required, and a set of
// transcription failure phrases is forbidden. minTextLength is recorded under
// ProviderSpecific ("min_transcription_length") together with the other
// transcription-specific markers.
func TranscriptionExpectations(minTextLength int) ResponseExpectations {
	// Phrases indicating the audio could not be transcribed.
	failurePhrases := []string{
		"could not transcribe", "failed to process",
		"invalid audio", "corrupted audio",
		"unsupported format", "transcription error",
		"no audio detected", "silence detected",
	}
	transcriptionDetails := map[string]interface{}{
		"min_transcription_length":  minTextLength,
		"should_have_transcription": true,
		"response_type":             "transcription",
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// transcription has transcribed text, not chat content, and no choices.
	var result ResponseExpectations
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ShouldNotContainWords = failurePhrases
	result.ProviderSpecific = transcriptionDetails

	return result
}

// ImageGenerationExpectations returns expectations for image generation
// scenarios.
//
// Usage stats, timestamps, model and latency are required. minImages and
// expectedSize are recorded under ProviderSpecific ("min_images",
// "expected_size") and the response is tagged as "image_generation".
func ImageGenerationExpectations(minImages int, expectedSize string) ResponseExpectations {
	imageDetails := map[string]interface{}{
		"min_images":    minImages,
		"expected_size": expectedSize,
		"response_type": "image_generation",
	}

	// ShouldHaveContent and ExpectedChoiceCount stay at their zero values:
	// image responses don't have text content or choices.
	var result ResponseExpectations
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ProviderSpecific = imageDetails

	return result
}

// ReasoningExpectations returns expectations for reasoning scenarios:
// content, usage stats, timestamps and model are required, and the response
// is tagged as "reasoning" with step-by-step output expected.
//
// Unlike most presets, this one sets neither ExpectedChoiceCount nor
// ShouldHaveLatency.
func ReasoningExpectations() ResponseExpectations {
	reasoningDetails := map[string]interface{}{
		"response_type":        "reasoning",
		"expects_step_by_step": true,
	}

	var result ResponseExpectations
	result.ShouldHaveContent = true
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	result.ProviderSpecific = reasoningDetails

	return result
}

// ChatAudioExpectations returns expectations for chat audio scenarios: one
// choice with usage stats, timestamps, model and latency, tagged as
// "chat_audio". Text content is not required because chat audio responses
// may have audio/transcript but not text content.
func ChatAudioExpectations() ResponseExpectations {
	var result ResponseExpectations
	// Should have one choice with audio data
	result.ExpectedChoiceCount = 1
	result.ShouldHaveUsageStats = true
	result.ShouldHaveTimestamps = true
	result.ShouldHaveModel = true
	// Global expectation: latency should always be present
	result.ShouldHaveLatency = true
	result.ProviderSpecific = map[string]interface{}{
		"response_type": "chat_audio",
	}

	return result
}

// =============================================================================
// SCENARIO-SPECIFIC EXPECTATION BUILDERS
// =============================================================================

// GetExpectationsForScenario returns the preset expectations for scenarioName
// and then applies the raw request/response expectations from testConfig.
//
// customParams may override preset inputs; each value is used only when it
// has the exact Go type shown, otherwise the scenario default applies:
//   - "tool_name" (string) and "required_args" ([]string) for "ToolCalls"
//   - "tool_names" ([]string) and "required_args_per_tool" ([][]string) for "MultipleToolCalls"
//   - "context_keywords" ([]string) for "MultiTurnConversation"
//   - "input_texts" ([]string) for "Embedding"
//   - "min_audio_bytes" (int) for "SpeechSynthesis"
//   - "min_transcription_length" (int) for "Transcription"
//   - "min_images" (int) and "expected_size" (string) for the image scenarios
//
// Where two keys are listed together, both must be present for either to take
// effect. Unknown scenario names fall back to BasicChatExpectations.
func GetExpectationsForScenario(scenarioName string, testConfig ComprehensiveTestConfig, customParams map[string]interface{}) ResponseExpectations {
	var result ResponseExpectations

	switch scenarioName {
	case "SimpleChat":
		result = BasicChatExpectations()

	case "TextCompletion":
		result = TextCompletionExpectations()

	case "ToolCalls":
		toolName, hasName := customParams["tool_name"].(string)
		requiredArgs, hasArgs := customParams["required_args"].([]string)
		if hasName && hasArgs {
			result = ToolCallExpectations(toolName, requiredArgs)
		} else {
			// Default to weather tool
			result = WeatherToolExpectations()
		}

	case "MultipleToolCalls":
		toolNames, hasNames := customParams["tool_names"].([]string)
		argsPerTool, hasArgs := customParams["required_args_per_tool"].([][]string)
		if !hasNames || !hasArgs {
			// Default to weather and calculator
			toolNames = []string{string(SampleToolTypeWeather), string(SampleToolTypeCalculate)}
			argsPerTool = [][]string{{"location"}, {"expression"}}
		}
		result = MultipleToolExpectations(toolNames, argsPerTool)

	case "End2EndToolCalling":
		result = ConversationExpectations([]string{"weather", "temperature", "result"})

	case "AutomaticFunctionCalling":
		result = WeatherToolExpectations()
		// Should have follow-up text after tool call
		result.ShouldHaveContent = true

	case "ImageURL", "ImageBase64":
		result = VisionExpectations([]string{"image", "picture", "see"})

	case "MultipleImages":
		result = VisionExpectations([]string{"compare", "similar", "different", "images"})

	case "FileInput":
		result = FileInputExpectations()

	case "ChatCompletionStream", "TextCompletionStream":
		result = StreamingExpectations()

	case "MultiTurnConversation":
		contextKeywords, found := customParams["context_keywords"].([]string)
		if !found {
			contextKeywords = []string{"context", "previous", "mentioned"}
		}
		result = ConversationExpectations(contextKeywords)

	case "Embedding":
		inputTexts, found := customParams["input_texts"].([]string)
		if !found {
			inputTexts = []string{"Hello, world!", "Hi, world!", "Goodnight, moon!"}
		}
		result = EmbeddingExpectations(inputTexts)

	case "CountTokens":
		result = CountTokensExpectations()

	case "CompleteEnd2End":
		result = ConversationExpectations([]string{"complete", "comprehensive", "full"})

	case "SpeechSynthesis":
		minAudioBytes, found := customParams["min_audio_bytes"].(int)
		if !found {
			// Default minimum 500 bytes
			minAudioBytes = 500
		}
		result = SpeechExpectations(minAudioBytes)

	case "Transcription":
		minLength, found := customParams["min_transcription_length"].(int)
		if !found {
			// Default minimum 10 characters
			minLength = 10
		}
		result = TranscriptionExpectations(minLength)

	case "Reasoning":
		result = ReasoningExpectations()

	case "ChatAudio":
		result = ChatAudioExpectations()

	case "ProviderSpecific":
		result = BasicChatExpectations()
		result.ShouldContainKeywords = []string{"unique", "specific", "capability"}

	case "ImageGeneration", "ImageEdit", "ImageVariation":
		// Image edit and variation reuse the image generation expectations
		// since they use the same response structure.
		minImages, hasCount := customParams["min_images"].(int)
		expectedSize, hasSize := customParams["expected_size"].(string)
		if !hasCount || !hasSize {
			minImages, expectedSize = 1, "1024x1024"
		}
		result = ImageGenerationExpectations(minImages, expectedSize)

	default:
		// Default to basic chat expectations
		result = BasicChatExpectations()
	}

	// Skip raw request/response for CountTokens - not all providers support it uniformly
	if scenarioName == "CountTokens" {
		return result
	}

	// Apply raw request/response expectations from test config
	streaming := strings.HasSuffix(scenarioName, "Stream") || strings.HasSuffix(scenarioName, "Streaming")

	multipartRequest := false
	switch scenarioName {
	case "Transcription", "TranscriptionStream", "ImageEdit", "ImageEditStream", "ImageVariation":
		multipartRequest = true
	}

	return ApplyRawExpectations(result, testConfig, streaming, multipartRequest)
}

// =============================================================================
// PROVIDER-SPECIFIC EXPECTATION MODIFIERS
// =============================================================================

// ModifyExpectationsForProvider adjusts the metadata expectations (usage
// stats, timestamps, model, latency) to what the given provider returns and
// hands back the adjusted copy. Providers with identical adjustments share a
// case; providers that are not listed leave the expectations unchanged.
func ModifyExpectationsForProvider(expectations ResponseExpectations, provider schemas.ModelProvider) ResponseExpectations {
	// NOTE: This function must NOT set ShouldHaveTimestamps or ShouldHaveModel to true.
	// StreamingExpectations explicitly disables those fields, and overriding them here
	// would cause streaming tests to incorrectly assert on fields that consolidated
	// streaming responses cannot reliably carry.
	// ShouldHaveUsageStats and ShouldHaveLatency may still be enabled here because no
	// scenario preset disables them, and some presets (e.g. ReasoningExpectations) omit
	// ShouldHaveLatency entirely.
	switch provider {
	case schemas.OpenAI,
		schemas.Azure, // Azure OpenAI returns the same fields as OpenAI
		schemas.Anthropic,
		schemas.Mistral,
		schemas.Groq,
		schemas.Cerebras,
		schemas.OpenRouter, // Proxies to multiple providers; returns OpenAI-compatible fields
		schemas.XAI,
		schemas.Nebius,
		schemas.VLLM: // vLLM local inference — OpenAI-compatible
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveLatency = true

	case schemas.Bedrock:
		// Bedrock returns usage stats for most calls via Bifrost normalization, but not all,
		// so the usage expectation is left as the caller set it.
		// Bedrock does not return created timestamps.
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveLatency = true

	case schemas.Cohere:
		expectations.ShouldHaveUsageStats = true
		// Cohere does not return model field in all response types
		expectations.ShouldHaveModel = false
		expectations.ShouldHaveLatency = true

	case schemas.Vertex, schemas.Gemini:
		// Vertex and Gemini return usage and model but no created timestamps.
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveLatency = true

	case schemas.Perplexity, schemas.Parasail:
		// Perplexity and Parasail return neither created timestamps nor the model field.
		expectations.ShouldHaveUsageStats = true
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveModel = false
		expectations.ShouldHaveLatency = true

	case schemas.Ollama, // Local models may not return usage or timestamps
		schemas.SGL,        // SGLang local inference — may not return all fields
		schemas.Elevenlabs, // Primarily audio — usage/timestamps may not apply to all calls
		schemas.HuggingFace,
		schemas.Replicate,
		schemas.Runway: // Primarily video/image generation
		expectations.ShouldHaveUsageStats = false
		expectations.ShouldHaveTimestamps = false
		expectations.ShouldHaveLatency = true

	default:
		// Unlisted provider: keep the expectations exactly as passed in.
	}

	return expectations
}

// ApplyRawExpectations turns on the raw request/response expectations when
// testConfig.ExpectRawRequestResponse is set, and returns the expectations
// unchanged otherwise.
// Call this after creating expectations directly (SpeechExpectations,
// TranscriptionExpectations, etc.) when not using GetExpectationsForScenario.
//
// Parameters:
//   - isStreaming: if true, the raw response is not expected (streaming has no
//     single response body).
//   - options[0], isMultipartRequest: if true, the raw request is not expected
//     (multipart form data can't return a raw JSON request).
//   - options[1], isBinaryResponse: if true, the raw response is not expected
//     (binary responses like audio don't have a JSON raw response).
//
// Omitted options default to false; options beyond the second are ignored.
func ApplyRawExpectations(expectations ResponseExpectations, testConfig ComprehensiveTestConfig, isStreaming bool, options ...bool) ResponseExpectations {
	if !testConfig.ExpectRawRequestResponse {
		return expectations
	}

	var isMultipartRequest, isBinaryResponse bool
	if len(options) >= 1 {
		isMultipartRequest = options[0]
	}
	if len(options) >= 2 {
		isBinaryResponse = options[1]
	}

	if !isMultipartRequest {
		expectations.ShouldHaveRawRequest = true
	}
	if !(isStreaming || isBinaryResponse) {
		expectations.ShouldHaveRawResponse = true
	}

	return expectations
}

// =============================================================================
// ADVANCED VALIDATION EXPECTATIONS
// =============================================================================

// coherentContentPattern matches content that starts with an uppercase ASCII
// letter and ends with '.', '!' or '?'. The (?s) flag lets '.' match newlines,
// so multi-line responses can satisfy the pattern. It is compiled once at
// package scope rather than on every call.
var coherentContentPattern = regexp.MustCompile(`(?s)^[A-Z].*[.!?]$`)

// SemanticCoherenceExpectations returns expectations for semantic coherence
// tests: the basic chat expectations plus expectedTopics as required keywords,
// a prompt-relevance requirement, and a content pattern requiring the response
// to start with a capital letter and end with sentence punctuation.
//
// inputPrompt is accepted for API compatibility but is not currently used.
func SemanticCoherenceExpectations(inputPrompt string, expectedTopics []string) ResponseExpectations {
	expectations := BasicChatExpectations()
	expectations.ShouldContainKeywords = expectedTopics
	expectations.IsRelevantToPrompt = true

	// Should start with capital and end with punctuation
	expectations.ContentPattern = coherentContentPattern

	return expectations
}

// ConsistencyExpectations returns expectations for consistency tests: the
// basic chat expectations with expectedConsistencyMarkers as required
// keywords, plus extra forbidden words that mark contradiction or uncertainty.
func ConsistencyExpectations(expectedConsistencyMarkers []string) ResponseExpectations {
	contradictionMarkers := []string{"however", "but", "on the other hand"}
	uncertaintyMarkers := []string{"i'm not sure", "maybe", "possibly", "might be"}

	result := BasicChatExpectations()
	result.ShouldContainKeywords = expectedConsistencyMarkers
	result.ShouldNotContainWords = append(result.ShouldNotContainWords, contradictionMarkers...)
	result.ShouldNotContainWords = append(result.ShouldNotContainWords, uncertaintyMarkers...)

	return result
}

// =============================================================================
// UTILITY FUNCTIONS
// =============================================================================

// stringPtr returns a pointer to a fresh copy of s.
func stringPtr(s string) *string {
	p := new(string)
	*p = s
	return p
}

// CombineExpectations merges several expectations into one, starting from the
// first and folding each later one in order. With no arguments it returns
// BasicChatExpectations.
//
// Merge rules:
//   - Boolean flags are only ever turned on: a later true sets the flag, a
//     later false leaves the current value (false cannot be told apart from unset).
//   - ExpectedChoiceCount, ExpectedFinishReason and ContentPattern are
//     overridden by a later positive / non-nil value.
//   - Keyword, any-of, forbidden-word and tool-call lists are concatenated.
//   - ProviderSpecific entries are merged key by key, later values winning.
//
// The result does not share slices or the ProviderSpecific map with the
// arguments, so merging never mutates the caller's expectations.
func CombineExpectations(expectations ...ResponseExpectations) ResponseExpectations {
	if len(expectations) == 0 {
		return BasicChatExpectations()
	}

	base := expectations[0]

	// The struct copy above still shares slice backing arrays and the map with
	// expectations[0]. Detach them so the appends and map writes below cannot
	// write into the caller's data.
	base.ShouldContainKeywords = append([]string(nil), base.ShouldContainKeywords...)
	base.ShouldContainAnyOf = append([]string(nil), base.ShouldContainAnyOf...)
	base.ShouldNotContainWords = append([]string(nil), base.ShouldNotContainWords...)
	base.ExpectedToolCalls = append([]ToolCallExpectation(nil), base.ExpectedToolCalls...)
	if base.ProviderSpecific != nil {
		cloned := make(map[string]interface{}, len(base.ProviderSpecific))
		for k, v := range base.ProviderSpecific {
			cloned[k] = v
		}
		base.ProviderSpecific = cloned
	}

	for _, exp := range expectations[1:] {
		// Override fields that are set in the new expectation
		if exp.ShouldHaveContent {
			base.ShouldHaveContent = true
		}
		if exp.ExpectedChoiceCount > 0 {
			base.ExpectedChoiceCount = exp.ExpectedChoiceCount
		}
		if exp.ExpectedFinishReason != nil {
			base.ExpectedFinishReason = exp.ExpectedFinishReason
		}

		// Append arrays
		base.ShouldContainKeywords = append(base.ShouldContainKeywords, exp.ShouldContainKeywords...)
		base.ShouldContainAnyOf = append(base.ShouldContainAnyOf, exp.ShouldContainAnyOf...)
		base.ShouldNotContainWords = append(base.ShouldNotContainWords, exp.ShouldNotContainWords...)
		base.ExpectedToolCalls = append(base.ExpectedToolCalls, exp.ExpectedToolCalls...)

		// Override other fields
		if exp.ContentPattern != nil {
			base.ContentPattern = exp.ContentPattern
		}
		if exp.IsRelevantToPrompt {
			base.IsRelevantToPrompt = true
		}
		if exp.ShouldNotHaveFunctionCalls {
			base.ShouldNotHaveFunctionCalls = true
		}
		if exp.ShouldHaveUsageStats {
			base.ShouldHaveUsageStats = true
		}
		if exp.ShouldHaveTimestamps {
			base.ShouldHaveTimestamps = true
		}
		if exp.ShouldHaveModel {
			base.ShouldHaveModel = true
		}
		if exp.ShouldHaveLatency {
			base.ShouldHaveLatency = true
		}
		if exp.ShouldHaveRawRequest {
			base.ShouldHaveRawRequest = true
		}
		if exp.ShouldHaveRawResponse {
			base.ShouldHaveRawResponse = true
		}

		// Merge provider specific data
		if len(exp.ProviderSpecific) > 0 {
			if base.ProviderSpecific == nil {
				base.ProviderSpecific = make(map[string]interface{}, len(exp.ProviderSpecific))
			}
			for k, v := range exp.ProviderSpecific {
				base.ProviderSpecific[k] = v
			}
		}
	}

	return base
}