353 lines
13 KiB
Go
353 lines
13 KiB
Go
package llmtests
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
bifrost "github.com/maximhq/bifrost/core"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
)
|
|
|
|
// RunSpeechSynthesisTest executes the speech synthesis test scenario
|
|
func RunSpeechSynthesisTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
|
if !testConfig.Scenarios.SpeechSynthesis {
|
|
t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
|
|
return
|
|
}
|
|
|
|
t.Run("SpeechSynthesis", func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
// Test with shared text constants for round-trip validation with transcription
|
|
testCases := []struct {
|
|
name string
|
|
text string
|
|
voiceType string
|
|
format string
|
|
expectMinBytes int
|
|
saveForSST bool // Whether to save this audio for SST round-trip testing
|
|
}{
|
|
{
|
|
name: "BasicText_Primary_MP3",
|
|
text: TTSTestTextBasic,
|
|
voiceType: "primary",
|
|
format: GetProviderDefaultFormat(testConfig.Provider),
|
|
expectMinBytes: 1000,
|
|
saveForSST: true,
|
|
},
|
|
{
|
|
name: "MediumText_Secondary_MP3",
|
|
text: TTSTestTextMedium,
|
|
voiceType: "secondary",
|
|
format: GetProviderDefaultFormat(testConfig.Provider),
|
|
expectMinBytes: 2000,
|
|
saveForSST: true,
|
|
},
|
|
{
|
|
name: "TechnicalText_Tertiary_MP3",
|
|
text: TTSTestTextTechnical,
|
|
voiceType: "tertiary",
|
|
format: GetProviderDefaultFormat(testConfig.Provider),
|
|
expectMinBytes: 500,
|
|
saveForSST: true,
|
|
},
|
|
}
|
|
|
|
for _, tc := range testCases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
voice := GetProviderVoice(testConfig.Provider, tc.voiceType)
|
|
request := &schemas.BifrostSpeechRequest{
|
|
Provider: testConfig.Provider,
|
|
Model: testConfig.SpeechSynthesisModel, // Use configured model
|
|
Input: &schemas.SpeechInput{
|
|
Input: tc.text,
|
|
},
|
|
Params: &schemas.SpeechParameters{
|
|
VoiceConfig: &schemas.SpeechVoiceInput{
|
|
Voice: &voice,
|
|
},
|
|
ResponseFormat: tc.format,
|
|
},
|
|
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
|
}
|
|
|
|
// Use retry framework with enhanced validation
|
|
retryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
|
|
retryContext := TestRetryContext{
|
|
ScenarioName: "SpeechSynthesis_" + tc.name,
|
|
ExpectedBehavior: map[string]interface{}{
|
|
"should_generate_audio": true,
|
|
},
|
|
TestMetadata: map[string]interface{}{
|
|
"provider": testConfig.Provider,
|
|
"model": testConfig.SpeechSynthesisModel,
|
|
"format": tc.format,
|
|
"voice": voice,
|
|
},
|
|
}
|
|
|
|
// Enhanced validation for speech synthesis
|
|
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
|
expectations := ApplyRawExpectations(SpeechExpectations(tc.expectMinBytes), testConfig, false, false, true)
|
|
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
|
|
|
// Create Speech retry config
|
|
speechRetryConfig := SpeechRetryConfig{
|
|
MaxAttempts: retryConfig.MaxAttempts,
|
|
BaseDelay: retryConfig.BaseDelay,
|
|
MaxDelay: retryConfig.MaxDelay,
|
|
Conditions: []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
|
|
OnRetry: retryConfig.OnRetry,
|
|
OnFinalFail: retryConfig.OnFinalFail,
|
|
}
|
|
|
|
speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_"+tc.name, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
|
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
|
return client.SpeechRequest(requestCtx, request)
|
|
})
|
|
|
|
if bifrostErr != nil {
|
|
t.Fatalf("❌ SpeechSynthesis_"+tc.name+" request failed after retries: %v", GetErrorMessage(bifrostErr))
|
|
}
|
|
|
|
// Additional speech-specific validations (complementary to main validation)
|
|
validateSpeechSynthesisSpecific(t, speechResponse, tc.expectMinBytes, testConfig.SpeechSynthesisModel)
|
|
|
|
// Save audio file for SST round-trip testing if requested
|
|
if tc.saveForSST {
|
|
tempDir := os.TempDir()
|
|
audioFileName := filepath.Join(tempDir, "tts_"+tc.name+"."+tc.format)
|
|
|
|
err := os.WriteFile(audioFileName, speechResponse.Audio, 0644)
|
|
require.NoError(t, err, "Failed to save audio file for SST testing")
|
|
|
|
// Register cleanup to remove temp file
|
|
t.Cleanup(func() {
|
|
os.Remove(audioFileName)
|
|
})
|
|
|
|
t.Logf("💾 Audio saved for SST testing: %s (text: '%s')", audioFileName, tc.text)
|
|
}
|
|
|
|
t.Logf("✅ Speech synthesis successful: %d bytes of %s audio generated for voice '%s'",
|
|
len(speechResponse.Audio), tc.format, voice)
|
|
})
|
|
}
|
|
})
|
|
}
|
|
|
|
// RunSpeechSynthesisAdvancedTest executes advanced speech synthesis test scenarios
|
|
func RunSpeechSynthesisAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
|
if !testConfig.Scenarios.SpeechSynthesis {
|
|
t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
|
|
return
|
|
}
|
|
|
|
t.Run("SpeechSynthesisAdvanced", func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
t.Run("LongText_HDModel", func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
// Test with longer text and HD model
|
|
longText := `
|
|
This is a comprehensive test of the text-to-speech functionality using a longer piece of text.
|
|
The system should be able to handle multiple sentences, proper punctuation, and maintain
|
|
consistent voice quality throughout the entire speech generation process. This test ensures
|
|
that the speech synthesis can handle realistic use cases with substantial content.
|
|
`
|
|
|
|
voice := GetProviderVoice(testConfig.Provider, "tertiary")
|
|
request := &schemas.BifrostSpeechRequest{
|
|
Provider: testConfig.Provider,
|
|
Model: testConfig.SpeechSynthesisModel,
|
|
Input: &schemas.SpeechInput{
|
|
Input: longText,
|
|
},
|
|
Params: &schemas.SpeechParameters{
|
|
VoiceConfig: &schemas.SpeechVoiceInput{
|
|
Voice: &voice,
|
|
},
|
|
ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
|
|
Instructions: "Speak slowly and clearly with natural intonation.",
|
|
},
|
|
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
|
}
|
|
|
|
// Groq doesn't support instructions
|
|
if testConfig.Provider == schemas.Groq {
|
|
request.Params.Instructions = ""
|
|
}
|
|
|
|
retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisHD", testConfig)
|
|
retryContext := TestRetryContext{
|
|
ScenarioName: "SpeechSynthesis_HD_LongText",
|
|
ExpectedBehavior: map[string]interface{}{
|
|
"generate_hd_audio": true,
|
|
"handle_long_text": true,
|
|
"min_audio_bytes": 5000,
|
|
},
|
|
TestMetadata: map[string]interface{}{
|
|
"provider": testConfig.Provider,
|
|
"model": testConfig.SpeechSynthesisModel,
|
|
"text_length": len(longText),
|
|
},
|
|
}
|
|
|
|
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
|
expectations := ApplyRawExpectations(SpeechExpectations(5000), testConfig, false, false, true) // HD should produce substantial audio
|
|
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
|
|
|
// Create Speech retry config
|
|
speechRetryConfig := SpeechRetryConfig{
|
|
MaxAttempts: retryConfig.MaxAttempts,
|
|
BaseDelay: retryConfig.BaseDelay,
|
|
MaxDelay: retryConfig.MaxDelay,
|
|
Conditions: []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
|
|
OnRetry: retryConfig.OnRetry,
|
|
OnFinalFail: retryConfig.OnFinalFail,
|
|
}
|
|
|
|
speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_HD", func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
|
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
|
return client.SpeechRequest(requestCtx, request)
|
|
})
|
|
if bifrostErr != nil {
|
|
t.Fatalf("❌ SpeechSynthesis_HD request failed after retries: %v", GetErrorMessage(bifrostErr))
|
|
}
|
|
|
|
if speechResponse == nil || speechResponse.Audio == nil {
|
|
t.Fatal("HD speech synthesis response missing audio data")
|
|
}
|
|
|
|
audioSize := len(speechResponse.Audio)
|
|
if audioSize < 5000 {
|
|
t.Fatalf("HD audio data too small: got %d bytes, expected at least 5000", audioSize)
|
|
}
|
|
|
|
if speechResponse.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
|
|
t.Logf("⚠️ Expected HD model, got: %s", speechResponse.ExtraFields.OriginalModelRequested)
|
|
}
|
|
|
|
t.Logf("✅ HD speech synthesis successful: %d bytes generated", len(speechResponse.Audio))
|
|
})
|
|
|
|
t.Run("AllVoiceOptions", func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
// Test provider-specific voice options
|
|
voiceTypes := []string{"primary", "secondary", "tertiary"}
|
|
testText := TTSTestTextBasic // Use shared constant
|
|
|
|
for _, voiceType := range voiceTypes {
|
|
t.Run("VoiceType_"+voiceType, func(t *testing.T) {
|
|
if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
|
|
t.Parallel()
|
|
}
|
|
|
|
voice := GetProviderVoice(testConfig.Provider, voiceType)
|
|
request := &schemas.BifrostSpeechRequest{
|
|
Provider: testConfig.Provider,
|
|
Model: testConfig.SpeechSynthesisModel,
|
|
Input: &schemas.SpeechInput{
|
|
Input: testText,
|
|
},
|
|
Params: &schemas.SpeechParameters{
|
|
VoiceConfig: &schemas.SpeechVoiceInput{
|
|
Voice: &voice,
|
|
},
|
|
ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
|
|
},
|
|
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
|
}
|
|
|
|
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
|
expectations := ApplyRawExpectations(SpeechExpectations(500), testConfig, false, false, true)
|
|
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
|
|
|
// Use retry framework for voice test
|
|
voiceRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
|
|
voiceRetryContext := TestRetryContext{
|
|
ScenarioName: "SpeechSynthesis_VoiceType_" + voiceType,
|
|
ExpectedBehavior: map[string]interface{}{
|
|
"should_generate_audio": true,
|
|
},
|
|
TestMetadata: map[string]interface{}{
|
|
"provider": testConfig.Provider,
|
|
"model": testConfig.SpeechSynthesisModel,
|
|
"voice_type": voiceType,
|
|
"voice": voice,
|
|
},
|
|
}
|
|
voiceSpeechRetryConfig := SpeechRetryConfig{
|
|
MaxAttempts: voiceRetryConfig.MaxAttempts,
|
|
BaseDelay: voiceRetryConfig.BaseDelay,
|
|
MaxDelay: voiceRetryConfig.MaxDelay,
|
|
Conditions: []SpeechRetryCondition{},
|
|
OnRetry: voiceRetryConfig.OnRetry,
|
|
OnFinalFail: voiceRetryConfig.OnFinalFail,
|
|
}
|
|
|
|
speechResponse, bifrostErr := WithSpeechTestRetry(t, voiceSpeechRetryConfig, voiceRetryContext, expectations, "SpeechSynthesis_VoiceType_"+voiceType, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
|
requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
|
return client.SpeechRequest(requestCtx, request)
|
|
})
|
|
|
|
if bifrostErr != nil {
|
|
t.Fatalf("❌ SpeechSynthesis_Voice_"+voiceType+" request failed after retries: %v", GetErrorMessage(bifrostErr))
|
|
}
|
|
|
|
if speechResponse == nil || speechResponse.Audio == nil {
|
|
t.Fatalf("Voice %s (%s) missing audio data after retries", voice, voiceType)
|
|
}
|
|
|
|
audioSize := len(speechResponse.Audio)
|
|
if audioSize < 500 {
|
|
t.Fatalf("Audio too small for voice %s: got %d bytes, expected at least 500", voice, audioSize)
|
|
}
|
|
t.Logf("✅ Voice %s (%s): %d bytes generated", voice, voiceType, len(speechResponse.Audio))
|
|
})
|
|
}
|
|
})
|
|
})
|
|
}
|
|
|
|
// validateSpeechSynthesisSpecific performs speech-specific validation
|
|
// This is complementary to the main validation framework and focuses on speech synthesis concerns
|
|
func validateSpeechSynthesisSpecific(t *testing.T, response *schemas.BifrostSpeechResponse, expectMinBytes int, expectedModel string) {
|
|
if response == nil {
|
|
t.Fatal("Invalid speech synthesis response structure")
|
|
}
|
|
|
|
if response.Audio == nil {
|
|
t.Fatal("Speech synthesis response missing audio data")
|
|
}
|
|
|
|
audioSize := len(response.Audio)
|
|
if audioSize < expectMinBytes {
|
|
t.Fatalf("Audio data too small: got %d bytes, expected at least %d", audioSize, expectMinBytes)
|
|
}
|
|
|
|
if expectedModel != "" && response.ExtraFields.OriginalModelRequested != expectedModel {
|
|
t.Logf("⚠️ Expected model, got: %s", response.ExtraFields.OriginalModelRequested)
|
|
}
|
|
|
|
t.Logf("✅ Audio validation passed: %d bytes generated", audioSize)
|
|
}
|