first commit
This commit is contained in:
352
core/internal/llmtests/speech_synthesis.go
Normal file
352
core/internal/llmtests/speech_synthesis.go
Normal file
@@ -0,0 +1,352 @@
|
||||
package llmtests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
bifrost "github.com/maximhq/bifrost/core"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
)
|
||||
|
||||
// RunSpeechSynthesisTest executes the speech synthesis test scenario
|
||||
func RunSpeechSynthesisTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
||||
if !testConfig.Scenarios.SpeechSynthesis {
|
||||
t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("SpeechSynthesis", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
// Test with shared text constants for round-trip validation with transcription
|
||||
testCases := []struct {
|
||||
name string
|
||||
text string
|
||||
voiceType string
|
||||
format string
|
||||
expectMinBytes int
|
||||
saveForSST bool // Whether to save this audio for SST round-trip testing
|
||||
}{
|
||||
{
|
||||
name: "BasicText_Primary_MP3",
|
||||
text: TTSTestTextBasic,
|
||||
voiceType: "primary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
expectMinBytes: 1000,
|
||||
saveForSST: true,
|
||||
},
|
||||
{
|
||||
name: "MediumText_Secondary_MP3",
|
||||
text: TTSTestTextMedium,
|
||||
voiceType: "secondary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
expectMinBytes: 2000,
|
||||
saveForSST: true,
|
||||
},
|
||||
{
|
||||
name: "TechnicalText_Tertiary_MP3",
|
||||
text: TTSTestTextTechnical,
|
||||
voiceType: "tertiary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
expectMinBytes: 500,
|
||||
saveForSST: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
voice := GetProviderVoice(testConfig.Provider, tc.voiceType)
|
||||
request := &schemas.BifrostSpeechRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.SpeechSynthesisModel, // Use configured model
|
||||
Input: &schemas.SpeechInput{
|
||||
Input: tc.text,
|
||||
},
|
||||
Params: &schemas.SpeechParameters{
|
||||
VoiceConfig: &schemas.SpeechVoiceInput{
|
||||
Voice: &voice,
|
||||
},
|
||||
ResponseFormat: tc.format,
|
||||
},
|
||||
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework with enhanced validation
|
||||
retryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "SpeechSynthesis_" + tc.name,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_generate_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.SpeechSynthesisModel,
|
||||
"format": tc.format,
|
||||
"voice": voice,
|
||||
},
|
||||
}
|
||||
|
||||
// Enhanced validation for speech synthesis
|
||||
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
||||
expectations := ApplyRawExpectations(SpeechExpectations(tc.expectMinBytes), testConfig, false, false, true)
|
||||
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
||||
|
||||
// Create Speech retry config
|
||||
speechRetryConfig := SpeechRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_"+tc.name, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
||||
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.SpeechRequest(requestCtx, request)
|
||||
})
|
||||
|
||||
if bifrostErr != nil {
|
||||
t.Fatalf("❌ SpeechSynthesis_"+tc.name+" request failed after retries: %v", GetErrorMessage(bifrostErr))
|
||||
}
|
||||
|
||||
// Additional speech-specific validations (complementary to main validation)
|
||||
validateSpeechSynthesisSpecific(t, speechResponse, tc.expectMinBytes, testConfig.SpeechSynthesisModel)
|
||||
|
||||
// Save audio file for SST round-trip testing if requested
|
||||
if tc.saveForSST {
|
||||
tempDir := os.TempDir()
|
||||
audioFileName := filepath.Join(tempDir, "tts_"+tc.name+"."+tc.format)
|
||||
|
||||
err := os.WriteFile(audioFileName, speechResponse.Audio, 0644)
|
||||
require.NoError(t, err, "Failed to save audio file for SST testing")
|
||||
|
||||
// Register cleanup to remove temp file
|
||||
t.Cleanup(func() {
|
||||
os.Remove(audioFileName)
|
||||
})
|
||||
|
||||
t.Logf("💾 Audio saved for SST testing: %s (text: '%s')", audioFileName, tc.text)
|
||||
}
|
||||
|
||||
t.Logf("✅ Speech synthesis successful: %d bytes of %s audio generated for voice '%s'",
|
||||
len(speechResponse.Audio), tc.format, voice)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// RunSpeechSynthesisAdvancedTest executes advanced speech synthesis test scenarios
|
||||
func RunSpeechSynthesisAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
||||
if !testConfig.Scenarios.SpeechSynthesis {
|
||||
t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("SpeechSynthesisAdvanced", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
t.Run("LongText_HDModel", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
// Test with longer text and HD model
|
||||
longText := `
|
||||
This is a comprehensive test of the text-to-speech functionality using a longer piece of text.
|
||||
The system should be able to handle multiple sentences, proper punctuation, and maintain
|
||||
consistent voice quality throughout the entire speech generation process. This test ensures
|
||||
that the speech synthesis can handle realistic use cases with substantial content.
|
||||
`
|
||||
|
||||
voice := GetProviderVoice(testConfig.Provider, "tertiary")
|
||||
request := &schemas.BifrostSpeechRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.SpeechSynthesisModel,
|
||||
Input: &schemas.SpeechInput{
|
||||
Input: longText,
|
||||
},
|
||||
Params: &schemas.SpeechParameters{
|
||||
VoiceConfig: &schemas.SpeechVoiceInput{
|
||||
Voice: &voice,
|
||||
},
|
||||
ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
|
||||
Instructions: "Speak slowly and clearly with natural intonation.",
|
||||
},
|
||||
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
||||
}
|
||||
|
||||
// Groq doesn't support instructions
|
||||
if testConfig.Provider == schemas.Groq {
|
||||
request.Params.Instructions = ""
|
||||
}
|
||||
|
||||
retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisHD", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "SpeechSynthesis_HD_LongText",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"generate_hd_audio": true,
|
||||
"handle_long_text": true,
|
||||
"min_audio_bytes": 5000,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.SpeechSynthesisModel,
|
||||
"text_length": len(longText),
|
||||
},
|
||||
}
|
||||
|
||||
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
||||
expectations := ApplyRawExpectations(SpeechExpectations(5000), testConfig, false, false, true) // HD should produce substantial audio
|
||||
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
||||
|
||||
// Create Speech retry config
|
||||
speechRetryConfig := SpeechRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_HD", func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
||||
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.SpeechRequest(requestCtx, request)
|
||||
})
|
||||
if bifrostErr != nil {
|
||||
t.Fatalf("❌ SpeechSynthesis_HD request failed after retries: %v", GetErrorMessage(bifrostErr))
|
||||
}
|
||||
|
||||
if speechResponse == nil || speechResponse.Audio == nil {
|
||||
t.Fatal("HD speech synthesis response missing audio data")
|
||||
}
|
||||
|
||||
audioSize := len(speechResponse.Audio)
|
||||
if audioSize < 5000 {
|
||||
t.Fatalf("HD audio data too small: got %d bytes, expected at least 5000", audioSize)
|
||||
}
|
||||
|
||||
if speechResponse.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
|
||||
t.Logf("⚠️ Expected HD model, got: %s", speechResponse.ExtraFields.OriginalModelRequested)
|
||||
}
|
||||
|
||||
t.Logf("✅ HD speech synthesis successful: %d bytes generated", len(speechResponse.Audio))
|
||||
})
|
||||
|
||||
t.Run("AllVoiceOptions", func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
// Test provider-specific voice options
|
||||
voiceTypes := []string{"primary", "secondary", "tertiary"}
|
||||
testText := TTSTestTextBasic // Use shared constant
|
||||
|
||||
for _, voiceType := range voiceTypes {
|
||||
t.Run("VoiceType_"+voiceType, func(t *testing.T) {
|
||||
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
||||
t.Parallel()
|
||||
}
|
||||
|
||||
voice := GetProviderVoice(testConfig.Provider, voiceType)
|
||||
request := &schemas.BifrostSpeechRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.SpeechSynthesisModel,
|
||||
Input: &schemas.SpeechInput{
|
||||
Input: testText,
|
||||
},
|
||||
Params: &schemas.SpeechParameters{
|
||||
VoiceConfig: &schemas.SpeechVoiceInput{
|
||||
Voice: &voice,
|
||||
},
|
||||
ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
|
||||
},
|
||||
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
||||
}
|
||||
|
||||
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
||||
expectations := ApplyRawExpectations(SpeechExpectations(500), testConfig, false, false, true)
|
||||
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
||||
|
||||
// Use retry framework for voice test
|
||||
voiceRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
|
||||
voiceRetryContext := TestRetryContext{
|
||||
ScenarioName: "SpeechSynthesis_VoiceType_" + voiceType,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_generate_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.SpeechSynthesisModel,
|
||||
"voice_type": voiceType,
|
||||
"voice": voice,
|
||||
},
|
||||
}
|
||||
voiceSpeechRetryConfig := SpeechRetryConfig{
|
||||
MaxAttempts: voiceRetryConfig.MaxAttempts,
|
||||
BaseDelay: voiceRetryConfig.BaseDelay,
|
||||
MaxDelay: voiceRetryConfig.MaxDelay,
|
||||
Conditions: []SpeechRetryCondition{},
|
||||
OnRetry: voiceRetryConfig.OnRetry,
|
||||
OnFinalFail: voiceRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
speechResponse, bifrostErr := WithSpeechTestRetry(t, voiceSpeechRetryConfig, voiceRetryContext, expectations, "SpeechSynthesis_VoiceType_"+voiceType, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
||||
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.SpeechRequest(requestCtx, request)
|
||||
})
|
||||
|
||||
if bifrostErr != nil {
|
||||
t.Fatalf("❌ SpeechSynthesis_Voice_"+voiceType+" request failed after retries: %v", GetErrorMessage(bifrostErr))
|
||||
}
|
||||
|
||||
if speechResponse == nil || speechResponse.Audio == nil {
|
||||
t.Fatalf("Voice %s (%s) missing audio data after retries", voice, voiceType)
|
||||
}
|
||||
|
||||
audioSize := len(speechResponse.Audio)
|
||||
if audioSize < 500 {
|
||||
t.Fatalf("Audio too small for voice %s: got %d bytes, expected at least 500", voice, audioSize)
|
||||
}
|
||||
t.Logf("✅ Voice %s (%s): %d bytes generated", voice, voiceType, len(speechResponse.Audio))
|
||||
})
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// validateSpeechSynthesisSpecific performs speech-specific validation
|
||||
// This is complementary to the main validation framework and focuses on speech synthesis concerns
|
||||
func validateSpeechSynthesisSpecific(t *testing.T, response *schemas.BifrostSpeechResponse, expectMinBytes int, expectedModel string) {
|
||||
if response == nil {
|
||||
t.Fatal("Invalid speech synthesis response structure")
|
||||
}
|
||||
|
||||
if response.Audio == nil {
|
||||
t.Fatal("Speech synthesis response missing audio data")
|
||||
}
|
||||
|
||||
audioSize := len(response.Audio)
|
||||
if audioSize < expectMinBytes {
|
||||
t.Fatalf("Audio data too small: got %d bytes, expected at least %d", audioSize, expectMinBytes)
|
||||
}
|
||||
|
||||
if expectedModel != "" && response.ExtraFields.OriginalModelRequested != expectedModel {
|
||||
t.Logf("⚠️ Expected model, got: %s", response.ExtraFields.OriginalModelRequested)
|
||||
}
|
||||
|
||||
t.Logf("✅ Audio validation passed: %d bytes generated", audioSize)
|
||||
}
|
||||
Reference in New Issue
Block a user