bifrost/core/internal/llmtests/speech_synthesis.go

package llmtests

import (
	"context"
	"os"
	"path/filepath"
	"testing"

	"github.com/stretchr/testify/require"

	bifrost "github.com/maximhq/bifrost/core"
	"github.com/maximhq/bifrost/core/schemas"
)

// RunSpeechSynthesisTest executes the speech synthesis test scenario
func RunSpeechSynthesisTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesis {
		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
		return
	}

	t.Run("SpeechSynthesis", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Test with shared text constants for round-trip validation with transcription
		testCases := []struct {
			name           string
			text           string
			voiceType      string
			format         string
			expectMinBytes int
			saveForSST     bool // Whether to save this audio for SST round-trip testing
		}{
			{
				name:           "BasicText_Primary_MP3",
				text:           TTSTestTextBasic,
				voiceType:      "primary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 1000,
				saveForSST:     true,
			},
			{
				name:           "MediumText_Secondary_MP3",
				text:           TTSTestTextMedium,
				voiceType:      "secondary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 2000,
				saveForSST:     true,
			},
			{
				name:           "TechnicalText_Tertiary_MP3",
				text:           TTSTestTextTechnical,
				voiceType:      "tertiary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 500,
				saveForSST:     true,
			},
		}

		for _, tc := range testCases {
			t.Run(tc.name, func(t *testing.T) {
				if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
					t.Parallel()
				}

				voice := GetProviderVoice(testConfig.Provider, tc.voiceType)
				request := &schemas.BifrostSpeechRequest{
					Provider: testConfig.Provider,
					Model:    testConfig.SpeechSynthesisModel, // Use configured model
					Input: &schemas.SpeechInput{
						Input: tc.text,
					},
					Params: &schemas.SpeechParameters{
						VoiceConfig: &schemas.SpeechVoiceInput{
							Voice: &voice,
						},
						ResponseFormat: tc.format,
					},
					Fallbacks: testConfig.SpeechSynthesisFallbacks,
				}

				// Use retry framework with enhanced validation
				retryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
				retryContext := TestRetryContext{
					ScenarioName: "SpeechSynthesis_" + tc.name,
					ExpectedBehavior: map[string]interface{}{
						"should_generate_audio": true,
					},
					TestMetadata: map[string]interface{}{
						"provider": testConfig.Provider,
						"model":    testConfig.SpeechSynthesisModel,
						"format":   tc.format,
						"voice":    voice,
					},
				}

				// Enhanced validation for speech synthesis
				// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
				expectations := ApplyRawExpectations(SpeechExpectations(tc.expectMinBytes), testConfig, false, false, true)
				expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

				// Create Speech retry config
				speechRetryConfig := SpeechRetryConfig{
					MaxAttempts: retryConfig.MaxAttempts,
					BaseDelay:   retryConfig.BaseDelay,
					MaxDelay:    retryConfig.MaxDelay,
					Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
					OnRetry:     retryConfig.OnRetry,
					OnFinalFail: retryConfig.OnFinalFail,
				}

				speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_"+tc.name, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
					requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
					return client.SpeechRequest(requestCtx, request)
				})

				if bifrostErr != nil {
					t.Fatalf("❌ SpeechSynthesis_"+tc.name+" request failed after retries: %v", GetErrorMessage(bifrostErr))
				}

				// Additional speech-specific validations (complementary to main validation)
				validateSpeechSynthesisSpecific(t, speechResponse, tc.expectMinBytes, testConfig.SpeechSynthesisModel)

				// Save audio file for SST round-trip testing if requested
				if tc.saveForSST {
					tempDir := os.TempDir()
					audioFileName := filepath.Join(tempDir, "tts_"+tc.name+"."+tc.format)

					err := os.WriteFile(audioFileName, speechResponse.Audio, 0644)
					require.NoError(t, err, "Failed to save audio file for SST testing")

					// Register cleanup to remove temp file
					t.Cleanup(func() {
						os.Remove(audioFileName)
					})

					t.Logf("💾 Audio saved for SST testing: %s (text: '%s')", audioFileName, tc.text)
				}

				t.Logf("✅ Speech synthesis successful: %d bytes of %s audio generated for voice '%s'",
					len(speechResponse.Audio), tc.format, voice)
			})
		}
	})
}

// RunSpeechSynthesisAdvancedTest executes advanced speech synthesis test scenarios
func RunSpeechSynthesisAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesis {
		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
		return
	}

	t.Run("SpeechSynthesisAdvanced", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		t.Run("LongText_HDModel", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			// Test with longer text and HD model
			longText := `
			This is a comprehensive test of the text-to-speech functionality using a longer piece of text.
			The system should be able to handle multiple sentences, proper punctuation, and maintain
			consistent voice quality throughout the entire speech generation process. This test ensures
			that the speech synthesis can handle realistic use cases with substantial content.
			`

			voice := GetProviderVoice(testConfig.Provider, "tertiary")
			request := &schemas.BifrostSpeechRequest{
				Provider: testConfig.Provider,
				Model:    testConfig.SpeechSynthesisModel,
				Input: &schemas.SpeechInput{
					Input: longText,
				},
				Params: &schemas.SpeechParameters{
					VoiceConfig: &schemas.SpeechVoiceInput{
						Voice: &voice,
					},
					ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
					Instructions:   "Speak slowly and clearly with natural intonation.",
				},
				Fallbacks: testConfig.SpeechSynthesisFallbacks,
			}

			// Groq doesn't support instructions
			if testConfig.Provider == schemas.Groq {
				request.Params.Instructions = ""
			}

			retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisHD", testConfig)
			retryContext := TestRetryContext{
				ScenarioName: "SpeechSynthesis_HD_LongText",
				ExpectedBehavior: map[string]interface{}{
					"generate_hd_audio": true,
					"handle_long_text":  true,
					"min_audio_bytes":   5000,
				},
				TestMetadata: map[string]interface{}{
					"provider":    testConfig.Provider,
					"model":       testConfig.SpeechSynthesisModel,
					"text_length": len(longText),
				},
			}

			// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
			expectations := ApplyRawExpectations(SpeechExpectations(5000), testConfig, false, false, true) // HD should produce substantial audio
			expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

			// Create Speech retry config
			speechRetryConfig := SpeechRetryConfig{
				MaxAttempts: retryConfig.MaxAttempts,
				BaseDelay:   retryConfig.BaseDelay,
				MaxDelay:    retryConfig.MaxDelay,
				Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
				OnRetry:     retryConfig.OnRetry,
				OnFinalFail: retryConfig.OnFinalFail,
			}

			speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_HD", func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
				requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.SpeechRequest(requestCtx, request)
			})
			if bifrostErr != nil {
				t.Fatalf("❌ SpeechSynthesis_HD request failed after retries: %v", GetErrorMessage(bifrostErr))
			}

			if speechResponse == nil || speechResponse.Audio == nil {
				t.Fatal("HD speech synthesis response missing audio data")
			}

			audioSize := len(speechResponse.Audio)
			if audioSize < 5000 {
				t.Fatalf("HD audio data too small: got %d bytes, expected at least 5000", audioSize)
			}

			if speechResponse.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
				t.Logf("⚠️ Expected HD model, got: %s", speechResponse.ExtraFields.OriginalModelRequested)
			}

			t.Logf("✅ HD speech synthesis successful: %d bytes generated", len(speechResponse.Audio))
		})

		t.Run("AllVoiceOptions", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			// Test provider-specific voice options
			voiceTypes := []string{"primary", "secondary", "tertiary"}
			testText := TTSTestTextBasic // Use shared constant

			for _, voiceType := range voiceTypes {
				t.Run("VoiceType_"+voiceType, func(t *testing.T) {
					if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
						t.Parallel()
					}

					voice := GetProviderVoice(testConfig.Provider, voiceType)
					request := &schemas.BifrostSpeechRequest{
						Provider: testConfig.Provider,
						Model:    testConfig.SpeechSynthesisModel,
						Input: &schemas.SpeechInput{
							Input: testText,
						},
						Params: &schemas.SpeechParameters{
							VoiceConfig: &schemas.SpeechVoiceInput{
								Voice: &voice,
							},
							ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
						},
						Fallbacks: testConfig.SpeechSynthesisFallbacks,
					}

					// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
					expectations := ApplyRawExpectations(SpeechExpectations(500), testConfig, false, false, true)
					expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

					// Use retry framework for voice test
					voiceRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
					voiceRetryContext := TestRetryContext{
						ScenarioName: "SpeechSynthesis_VoiceType_" + voiceType,
						ExpectedBehavior: map[string]interface{}{
							"should_generate_audio": true,
						},
						TestMetadata: map[string]interface{}{
							"provider":   testConfig.Provider,
							"model":      testConfig.SpeechSynthesisModel,
							"voice_type": voiceType,
							"voice":      voice,
						},
					}
					voiceSpeechRetryConfig := SpeechRetryConfig{
						MaxAttempts: voiceRetryConfig.MaxAttempts,
						BaseDelay:   voiceRetryConfig.BaseDelay,
						MaxDelay:    voiceRetryConfig.MaxDelay,
						Conditions:  []SpeechRetryCondition{},
						OnRetry:     voiceRetryConfig.OnRetry,
						OnFinalFail: voiceRetryConfig.OnFinalFail,
					}

					speechResponse, bifrostErr := WithSpeechTestRetry(t, voiceSpeechRetryConfig, voiceRetryContext, expectations, "SpeechSynthesis_VoiceType_"+voiceType, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
						requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
						return client.SpeechRequest(requestCtx, request)
					})

					if bifrostErr != nil {
						t.Fatalf("❌ SpeechSynthesis_Voice_"+voiceType+" request failed after retries: %v", GetErrorMessage(bifrostErr))
					}

					if speechResponse == nil || speechResponse.Audio == nil {
						t.Fatalf("Voice %s (%s) missing audio data after retries", voice, voiceType)
					}

					audioSize := len(speechResponse.Audio)
					if audioSize < 500 {
						t.Fatalf("Audio too small for voice %s: got %d bytes, expected at least 500", voice, audioSize)
					}
					t.Logf("✅ Voice %s (%s): %d bytes generated", voice, voiceType, len(speechResponse.Audio))
				})
			}
		})
	})
}

// validateSpeechSynthesisSpecific performs speech-specific validation
// This is complementary to the main validation framework and focuses on speech synthesis concerns
func validateSpeechSynthesisSpecific(t *testing.T, response *schemas.BifrostSpeechResponse, expectMinBytes int, expectedModel string) {
	if response == nil {
		t.Fatal("Invalid speech synthesis response structure")
	}

	if response.Audio == nil {
		t.Fatal("Speech synthesis response missing audio data")
	}

	audioSize := len(response.Audio)
	if audioSize < expectMinBytes {
		t.Fatalf("Audio data too small: got %d bytes, expected at least %d", audioSize, expectMinBytes)
	}

	if expectedModel != "" && response.ExtraFields.OriginalModelRequested != expectedModel {
		t.Logf("⚠️ Expected model, got: %s", response.ExtraFields.OriginalModelRequested)
	}

	t.Logf("✅ Audio validation passed: %d bytes generated", audioSize)
}