Files
bifrost/core/internal/llmtests/speech_synthesis.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

353 lines
13 KiB
Go

package llmtests
import (
"context"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/schemas"
)
// RunSpeechSynthesisTest executes the basic speech synthesis scenario.
//
// When testConfig.Scenarios.SpeechSynthesis is false the function only logs
// that the provider does not support the scenario and returns. Otherwise it
// runs a "SpeechSynthesis" subtest containing one nested subtest per test
// case; each case sends a speech request through the retry framework,
// validates the returned audio and, when requested, writes the audio to a
// temporary file.
//
// Subtests are marked parallel unless the SKIP_PARALLEL_TESTS environment
// variable is set to "true".
func RunSpeechSynthesisTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesis {
		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
		return
	}

	t.Run("SpeechSynthesis", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Test with shared text constants for round-trip validation with transcription
		testCases := []struct {
			name           string
			text           string
			voiceType      string
			format         string
			expectMinBytes int
			saveForSST     bool // Whether to save this audio for SST round-trip testing
		}{
			{
				name:           "BasicText_Primary_MP3",
				text:           TTSTestTextBasic,
				voiceType:      "primary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 1000,
				saveForSST:     true,
			},
			{
				name:           "MediumText_Secondary_MP3",
				text:           TTSTestTextMedium,
				voiceType:      "secondary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 2000,
				saveForSST:     true,
			},
			{
				name:           "TechnicalText_Tertiary_MP3",
				text:           TTSTestTextTechnical,
				voiceType:      "tertiary",
				format:         GetProviderDefaultFormat(testConfig.Provider),
				expectMinBytes: 500,
				saveForSST:     true,
			},
		}

		for _, tc := range testCases {
			// Per-iteration copy: the subtest closure may run after the loop has
			// advanced (t.Parallel pauses it), which before Go 1.22 would make
			// every subtest observe the last case. Harmless on newer toolchains.
			tc := tc

			t.Run(tc.name, func(t *testing.T) {
				if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
					t.Parallel()
				}

				scenario := "SpeechSynthesis_" + tc.name
				voice := GetProviderVoice(testConfig.Provider, tc.voiceType)

				request := &schemas.BifrostSpeechRequest{
					Provider: testConfig.Provider,
					Model:    testConfig.SpeechSynthesisModel, // Use configured model
					Input: &schemas.SpeechInput{
						Input: tc.text,
					},
					Params: &schemas.SpeechParameters{
						VoiceConfig: &schemas.SpeechVoiceInput{
							Voice: &voice,
						},
						ResponseFormat: tc.format,
					},
					Fallbacks: testConfig.SpeechSynthesisFallbacks,
				}

				// Use retry framework with enhanced validation
				retryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
				retryContext := TestRetryContext{
					ScenarioName: scenario,
					ExpectedBehavior: map[string]interface{}{
						"should_generate_audio": true,
					},
					TestMetadata: map[string]interface{}{
						"provider": testConfig.Provider,
						"model":    testConfig.SpeechSynthesisModel,
						"format":   tc.format,
						"voice":    voice,
					},
				}

				// Enhanced validation for speech synthesis
				// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
				expectations := ApplyRawExpectations(SpeechExpectations(tc.expectMinBytes), testConfig, false, false, true)
				expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

				// Create Speech retry config
				speechRetryConfig := SpeechRetryConfig{
					MaxAttempts: retryConfig.MaxAttempts,
					BaseDelay:   retryConfig.BaseDelay,
					MaxDelay:    retryConfig.MaxDelay,
					Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
					OnRetry:     retryConfig.OnRetry,
					OnFinalFail: retryConfig.OnFinalFail,
				}

				speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, scenario, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
					// A fresh request context is built for every attempt.
					requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
					return client.SpeechRequest(requestCtx, request)
				})
				if bifrostErr != nil {
					// Constant format string: the scenario name is passed as an
					// argument so it can never be interpreted as formatting verbs.
					t.Fatalf("❌ %s request failed after retries: %v", scenario, GetErrorMessage(bifrostErr))
				}

				// Additional speech-specific validations (complementary to main validation)
				validateSpeechSynthesisSpecific(t, speechResponse, tc.expectMinBytes, testConfig.SpeechSynthesisModel)

				// Save audio file for SST round-trip testing if requested
				if tc.saveForSST {
					// t.TempDir gives this subtest its own directory, so runs for
					// different providers cannot overwrite or delete each other's
					// file, and the testing package removes it when the subtest
					// completes.
					audioFileName := filepath.Join(t.TempDir(), "tts_"+tc.name+"."+tc.format)

					err := os.WriteFile(audioFileName, speechResponse.Audio, 0644)
					require.NoError(t, err, "Failed to save audio file for SST testing")

					t.Logf("💾 Audio saved for SST testing: %s (text: '%s')", audioFileName, tc.text)
				}

				t.Logf("✅ Speech synthesis successful: %d bytes of %s audio generated for voice '%s'",
					len(speechResponse.Audio), tc.format, voice)
			})
		}
	})
}
// RunSpeechSynthesisAdvancedTest executes the advanced speech synthesis
// scenarios.
//
// When testConfig.Scenarios.SpeechSynthesis is false the function only logs
// that the provider does not support the scenario and returns. Otherwise it
// runs a "SpeechSynthesisAdvanced" subtest with two children:
//
//   - "LongText_HDModel" synthesizes a multi-sentence text (with speaking
//     instructions, except for Groq) and requires at least 5000 bytes of audio.
//   - "AllVoiceOptions" synthesizes the shared basic text once for each of the
//     "primary", "secondary" and "tertiary" voice types and requires at least
//     500 bytes of audio per voice.
//
// Subtests are marked parallel unless the SKIP_PARALLEL_TESTS environment
// variable is set to "true".
func RunSpeechSynthesisAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesis {
		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
		return
	}

	// Minimum audio sizes, shared by the expectations handed to the retry
	// framework and by the explicit checks after the request.
	const (
		minHDAudioBytes    = 5000
		minVoiceAudioBytes = 500
	)

	t.Run("SpeechSynthesisAdvanced", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		t.Run("LongText_HDModel", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			// Test with longer text and HD model
			longText := `
This is a comprehensive test of the text-to-speech functionality using a longer piece of text.
The system should be able to handle multiple sentences, proper punctuation, and maintain
consistent voice quality throughout the entire speech generation process. This test ensures
that the speech synthesis can handle realistic use cases with substantial content.
`
			voice := GetProviderVoice(testConfig.Provider, "tertiary")

			request := &schemas.BifrostSpeechRequest{
				Provider: testConfig.Provider,
				Model:    testConfig.SpeechSynthesisModel,
				Input: &schemas.SpeechInput{
					Input: longText,
				},
				Params: &schemas.SpeechParameters{
					VoiceConfig: &schemas.SpeechVoiceInput{
						Voice: &voice,
					},
					ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
					Instructions:   "Speak slowly and clearly with natural intonation.",
				},
				Fallbacks: testConfig.SpeechSynthesisFallbacks,
			}

			// Groq doesn't support instructions
			if testConfig.Provider == schemas.Groq {
				request.Params.Instructions = ""
			}

			retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisHD", testConfig)
			retryContext := TestRetryContext{
				ScenarioName: "SpeechSynthesis_HD_LongText",
				ExpectedBehavior: map[string]interface{}{
					"generate_hd_audio": true,
					"handle_long_text":  true,
					"min_audio_bytes":   minHDAudioBytes,
				},
				TestMetadata: map[string]interface{}{
					"provider":    testConfig.Provider,
					"model":       testConfig.SpeechSynthesisModel,
					"text_length": len(longText),
				},
			}

			// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
			expectations := ApplyRawExpectations(SpeechExpectations(minHDAudioBytes), testConfig, false, false, true) // HD should produce substantial audio
			expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

			// Create Speech retry config
			speechRetryConfig := SpeechRetryConfig{
				MaxAttempts: retryConfig.MaxAttempts,
				BaseDelay:   retryConfig.BaseDelay,
				MaxDelay:    retryConfig.MaxDelay,
				Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
				OnRetry:     retryConfig.OnRetry,
				OnFinalFail: retryConfig.OnFinalFail,
			}

			speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_HD", func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
				// A fresh request context is built for every attempt.
				requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.SpeechRequest(requestCtx, request)
			})
			if bifrostErr != nil {
				t.Fatalf("❌ SpeechSynthesis_HD request failed after retries: %v", GetErrorMessage(bifrostErr))
			}

			if speechResponse == nil || speechResponse.Audio == nil {
				t.Fatal("HD speech synthesis response missing audio data")
			}

			audioSize := len(speechResponse.Audio)
			if audioSize < minHDAudioBytes {
				t.Fatalf("HD audio data too small: got %d bytes, expected at least %d", audioSize, minHDAudioBytes)
			}

			// A model mismatch is only reported, not treated as a failure.
			if got := speechResponse.ExtraFields.OriginalModelRequested; got != testConfig.SpeechSynthesisModel {
				t.Logf("⚠️ Expected HD model %s, got: %s", testConfig.SpeechSynthesisModel, got)
			}

			t.Logf("✅ HD speech synthesis successful: %d bytes generated", audioSize)
		})

		t.Run("AllVoiceOptions", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			// Test provider-specific voice options
			voiceTypes := []string{"primary", "secondary", "tertiary"}
			testText := TTSTestTextBasic // Use shared constant

			for _, voiceType := range voiceTypes {
				// Per-iteration copy: the subtest closure may run after the loop
				// has advanced (t.Parallel pauses it), which before Go 1.22 would
				// make every subtest observe the last voice type. Harmless on
				// newer toolchains.
				voiceType := voiceType

				t.Run("VoiceType_"+voiceType, func(t *testing.T) {
					if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
						t.Parallel()
					}

					// One label for the retry context, the retry call and the
					// failure message, so logs for a voice can be correlated.
					scenario := "SpeechSynthesis_VoiceType_" + voiceType
					voice := GetProviderVoice(testConfig.Provider, voiceType)

					request := &schemas.BifrostSpeechRequest{
						Provider: testConfig.Provider,
						Model:    testConfig.SpeechSynthesisModel,
						Input: &schemas.SpeechInput{
							Input: testText,
						},
						Params: &schemas.SpeechParameters{
							VoiceConfig: &schemas.SpeechVoiceInput{
								Voice: &voice,
							},
							ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
						},
						Fallbacks: testConfig.SpeechSynthesisFallbacks,
					}

					// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
					expectations := ApplyRawExpectations(SpeechExpectations(minVoiceAudioBytes), testConfig, false, false, true)
					expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)

					// Use retry framework for voice test
					voiceRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
					voiceRetryContext := TestRetryContext{
						ScenarioName: scenario,
						ExpectedBehavior: map[string]interface{}{
							"should_generate_audio": true,
						},
						TestMetadata: map[string]interface{}{
							"provider":   testConfig.Provider,
							"model":      testConfig.SpeechSynthesisModel,
							"voice_type": voiceType,
							"voice":      voice,
						},
					}
					voiceSpeechRetryConfig := SpeechRetryConfig{
						MaxAttempts: voiceRetryConfig.MaxAttempts,
						BaseDelay:   voiceRetryConfig.BaseDelay,
						MaxDelay:    voiceRetryConfig.MaxDelay,
						Conditions:  []SpeechRetryCondition{},
						OnRetry:     voiceRetryConfig.OnRetry,
						OnFinalFail: voiceRetryConfig.OnFinalFail,
					}

					speechResponse, bifrostErr := WithSpeechTestRetry(t, voiceSpeechRetryConfig, voiceRetryContext, expectations, scenario, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
						// A fresh request context is built for every attempt.
						requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
						return client.SpeechRequest(requestCtx, request)
					})
					if bifrostErr != nil {
						// Constant format string: the scenario name is passed as an
						// argument so it can never be interpreted as formatting verbs.
						t.Fatalf("❌ %s request failed after retries: %v", scenario, GetErrorMessage(bifrostErr))
					}

					if speechResponse == nil || speechResponse.Audio == nil {
						t.Fatalf("Voice %s (%s) missing audio data after retries", voice, voiceType)
					}

					audioSize := len(speechResponse.Audio)
					if audioSize < minVoiceAudioBytes {
						t.Fatalf("Audio too small for voice %s: got %d bytes, expected at least %d", voice, audioSize, minVoiceAudioBytes)
					}

					t.Logf("✅ Voice %s (%s): %d bytes generated", voice, voiceType, audioSize)
				})
			}
		})
	})
}
// validateSpeechSynthesisSpecific performs speech-specific validation.
// This is complementary to the main validation framework and focuses on speech
// synthesis concerns.
//
// It fails the test immediately when response is nil, when the response carries
// no audio bytes, or when the audio is shorter than expectMinBytes. When
// expectedModel is non-empty and differs from the model recorded in
// response.ExtraFields.OriginalModelRequested, the mismatch is logged as a
// warning and does not fail the test.
func validateSpeechSynthesisSpecific(t *testing.T, response *schemas.BifrostSpeechResponse, expectMinBytes int, expectedModel string) {
	// Attribute failures and logs to the calling test rather than this helper.
	t.Helper()

	if response == nil {
		t.Fatal("Invalid speech synthesis response structure")
	}

	// len covers both a nil and an empty slice, so zero bytes of audio is
	// rejected even when expectMinBytes is zero or negative.
	audioSize := len(response.Audio)
	if audioSize == 0 {
		t.Fatal("Speech synthesis response missing audio data")
	}

	if audioSize < expectMinBytes {
		t.Fatalf("Audio data too small: got %d bytes, expected at least %d", audioSize, expectMinBytes)
	}

	// Report both sides of the comparison so the warning is actionable.
	if got := response.ExtraFields.OriginalModelRequested; expectedModel != "" && got != expectedModel {
		t.Logf("⚠️ Expected model %s, got: %s", expectedModel, got)
	}

	t.Logf("✅ Audio validation passed: %d bytes generated", audioSize)
}