Files
bifrost/core/internal/llmtests/speech_synthesis_stream.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

551 lines
20 KiB
Go

package llmtests
import (
"bytes"
"context"
"fmt"
"os"
"strings"
"testing"
bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/providers/utils"
"github.com/maximhq/bifrost/core/schemas"
)
// RunSpeechSynthesisStreamTest executes the streaming speech synthesis test scenario.
//
// The function logs and returns immediately when
// testConfig.Scenarios.SpeechSynthesisStream is false. Otherwise it runs one
// subtest per table entry: each issues a streaming speech request through
// client (wrapped in WithStreamRetry), drains the returned channel, and fails
// when fewer chunks or bytes than the case expects are received, when the last
// observed latency is zero, or when SaveAndValidateAudio rejects the
// accumulated audio. Errors delivered inside the stream are collected and
// logged, not treated as failures on their own.
//
// Subtests run in parallel unless the SKIP_PARALLEL_TESTS environment variable
// is set to "true".
func RunSpeechSynthesisStreamTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesisStream {
		t.Logf("Speech synthesis streaming not supported for provider %s", testConfig.Provider)
		return
	}

	t.Run("SpeechSynthesisStream", func(t *testing.T) {
		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
			t.Parallel()
		}

		// Test streaming with different text lengths
		testCases := []struct {
			name            string
			text            string
			voice           string
			format          string
			expectMinChunks int
			expectMinBytes  int
			skip            bool
		}{
			{
				name:            "ShortText_Streaming",
				text:            "This is a short text for streaming speech synthesis test.",
				voice:           GetProviderVoice(testConfig.Provider, "primary"),
				format:          GetProviderDefaultFormat(testConfig.Provider),
				expectMinChunks: 1,
				expectMinBytes:  1000,
				skip:            false,
			},
			{
				name: "LongText_Streaming",
				text: `This is a longer text to test streaming speech synthesis functionality.
The streaming should provide audio chunks as they are generated, allowing for
real-time playback while the rest of the audio is still being processed.
This enables better user experience with reduced latency.`,
				voice:           GetProviderVoice(testConfig.Provider, "secondary"),
				format:          GetProviderDefaultFormat(testConfig.Provider),
				expectMinChunks: 2,
				expectMinBytes:  3000,
				skip:            testConfig.Provider == schemas.Gemini,
			},
			// This flow is allowed to only pro accounts
			// {
			// 	name:            "MediumText_Echo_WAV",
			// 	text:            "Testing streaming with WAV format. This should produce multiple audio chunks in WAV format for streaming playback.",
			// 	voice:           GetProviderVoice(testConfig.Provider, "tertiary"),
			// 	format:          "wav",
			// 	expectMinChunks: 1,
			// 	expectMinBytes:  2000,
			// 	skip:            false,
			// },
		}

		for _, tc := range testCases {
			// Per-iteration copy: the closure below resumes after t.Parallel(),
			// by which time a shared loop variable (pre-Go 1.22 semantics) would
			// already hold the last table entry.
			tc := tc
			t.Run(tc.name, func(t *testing.T) {
				if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
					t.Parallel()
				}

				if tc.skip {
					// Skipf stops the subtest; no explicit return is needed.
					t.Skipf("Skipping %s test", tc.name)
				}

				// The request takes a *string, so bind the voice to an addressable local.
				voice := tc.voice
				request := &schemas.BifrostSpeechRequest{
					Provider: testConfig.Provider,
					Model:    testConfig.SpeechSynthesisModel,
					Input: &schemas.SpeechInput{
						Input: tc.text,
					},
					Params: &schemas.SpeechParameters{
						VoiceConfig: &schemas.SpeechVoiceInput{
							Voice: &voice,
						},
						ResponseFormat: tc.format,
					},
					Fallbacks: testConfig.SpeechSynthesisFallbacks,
				}

				// Use retry framework for streaming speech synthesis
				retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStream", testConfig)
				retryContext := TestRetryContext{
					ScenarioName: "SpeechSynthesisStream_" + tc.name,
					ExpectedBehavior: map[string]interface{}{
						"generate_streaming_audio": true,
						"voice_type":               tc.voice,
						"format":                   tc.format,
						"min_chunks":               tc.expectMinChunks,
						"min_total_bytes":          tc.expectMinBytes,
					},
					TestMetadata: map[string]interface{}{
						"provider":    testConfig.Provider,
						"model":       testConfig.SpeechSynthesisModel,
						"text_length": len(tc.text),
						"voice":       tc.voice,
						"format":      tc.format,
					},
				}

				responseChannel, err := WithStreamRetry(t, retryConfig, retryContext, func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
					// A fresh request context is built for every attempt.
					requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
					return client.SpeechStreamRequest(requestCtx, request)
				})

				// Enhanced validation for streaming speech synthesis
				if err != nil {
					RequireNoError(t, err, "Speech synthesis stream initiation failed")
				}
				// Ranging over a nil channel would block forever, so fail fast.
				if responseChannel == nil {
					t.Fatal("Response channel should not be nil")
				}

				var (
					totalBytes       int
					chunkCount       int
					lastResponse     *schemas.BifrostStreamChunk
					streamErrors     []string
					lastTokenLatency int64
					audioBuffer      bytes.Buffer // Accumulate audio chunks for validation
				)

				// Read streaming chunks with enhanced validation
				for response := range responseChannel {
					if response == nil {
						streamErrors = append(streamErrors, "Received nil stream response")
						continue
					}

					// Check for errors in stream
					if response.BifrostError != nil {
						streamErrors = append(streamErrors, FormatErrorConcise(ParseBifrostError(response.BifrostError)))
						continue
					}

					speech := response.BifrostSpeechStreamResponse
					if speech == nil {
						streamErrors = append(streamErrors, "Stream response missing speech stream payload")
						continue
					}

					// Latency is recorded for every speech payload, including ones without audio.
					lastTokenLatency = speech.ExtraFields.Latency

					if speech.Audio == nil {
						streamErrors = append(streamErrors, "Stream response missing audio data")
						continue
					}

					// Log latency for each chunk (can be 0 for inter-chunks)
					t.Logf("📊 Speech chunk %d latency: %d ms", chunkCount+1, speech.ExtraFields.Latency)

					// Collect audio chunks
					chunkSize := len(speech.Audio)
					if chunkSize == 0 {
						t.Logf("⚠️ Skipping zero-length audio chunk")
						continue
					}

					// Accumulate audio data for codec validation
					audioBuffer.Write(speech.Audio)
					totalBytes += chunkSize
					chunkCount++
					t.Logf("✅ Received audio chunk %d: %d bytes", chunkCount, chunkSize)

					// Validate chunk structure (mismatches are only logged)
					if speech.Type != "" && (speech.Type != schemas.SpeechStreamResponseTypeDelta && speech.Type != schemas.SpeechStreamResponseTypeDone) {
						t.Logf("⚠️ Unexpected object type in stream: %s", speech.Type)
					}
					if speech.ExtraFields.OriginalModelRequested != "" && speech.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
						t.Logf("⚠️ Unexpected model in stream: %s", speech.ExtraFields.OriginalModelRequested)
					}

					lastResponse = DeepCopyBifrostStreamChunk(response)
				}

				// Enhanced validation of streaming results
				if len(streamErrors) > 0 {
					t.Logf("⚠️ Stream errors encountered: %v", streamErrors)
				}
				if chunkCount < tc.expectMinChunks {
					t.Fatalf("Insufficient chunks received: got %d, expected at least %d", chunkCount, tc.expectMinChunks)
				}
				if totalBytes < tc.expectMinBytes {
					t.Fatalf("Insufficient audio data: got %d bytes, expected at least %d", totalBytes, tc.expectMinBytes)
				}
				if lastResponse == nil {
					t.Fatal("Should have received at least one response")
				}

				// Additional streaming-specific validations.
				// This check also protects the division below.
				if chunkCount == 0 {
					t.Fatal("No audio chunks received from stream")
				}
				averageChunkSize := totalBytes / chunkCount
				if averageChunkSize < 100 {
					t.Logf("Average chunk size seems small: %d bytes", averageChunkSize)
				}
				if lastTokenLatency == 0 {
					t.Fatalf("❌ Last token latency is 0")
				}

				// Save audio to temp file, validate codec, and cleanup after test
				if audioBuffer.Len() == 0 {
					t.Fatal("No audio data accumulated for codec validation")
				}
				audioData := audioBuffer.Bytes()
				if testConfig.Provider == schemas.Gemini {
					// Gemini audio is wrapped as WAV before validation.
					wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
					if convErr != nil {
						t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
					}
					audioData = wavData
				}
				filePath, validationErr := SaveAndValidateAudio(t, audioData)
				if validationErr != nil {
					t.Fatalf("Audio codec validation failed: %v", validationErr)
				}
				t.Logf("Audio file validated successfully: %s", filePath)

				t.Logf("✅ Streaming speech synthesis successful: %d chunks, %d total bytes for voice '%s' in %s format",
					chunkCount, totalBytes, tc.voice, tc.format)
			})
		}
	})
}
// RunSpeechSynthesisStreamAdvancedTest executes advanced streaming speech synthesis test scenarios.
//
// The function logs and returns immediately when
// testConfig.Scenarios.SpeechSynthesisStream is false. Otherwise it runs two
// subtests under "SpeechSynthesisStreamAdvanced":
//
//   - LongText_HDModel_Streaming streams a 20-sentence text (skipped for
//     Gemini) and requires more than 3 chunks, more than 10000 bytes, a
//     non-zero final latency, and audio accepted by SaveAndValidateAudio.
//   - MultipleVoices_Streaming streams a short text once per voice in a
//     provider-specific list (OpenAI, Gemini, Elevenlabs; no voices for other
//     providers) using WithSpeechStreamValidationRetry.
//
// Subtests run in parallel unless the SKIP_PARALLEL_TESTS environment variable
// is set to "true".
func RunSpeechSynthesisStreamAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
	if !testConfig.Scenarios.SpeechSynthesisStream {
		t.Logf("Speech synthesis streaming not supported for provider %s", testConfig.Provider)
		return
	}

	t.Run("SpeechSynthesisStreamAdvanced", func(t *testing.T) {
		t.Run("LongText_HDModel_Streaming", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			if testConfig.Provider == schemas.Gemini {
				// Skipf stops the subtest; no explicit return is needed.
				t.Skipf("Skipping %s test", "LongText_HDModel_Streaming")
			}

			// Test streaming with HD model and very long text.
			// Each sentence carries its real ordinal (1..20).
			var textBuilder strings.Builder
			for i := 1; i <= 20; i++ {
				fmt.Fprintf(&textBuilder, "This is sentence number %d in a very long text for testing streaming speech synthesis with the HD model. ", i)
			}
			finalText := textBuilder.String()

			voice := GetProviderVoice(testConfig.Provider, "tertiary")
			request := &schemas.BifrostSpeechRequest{
				Provider: testConfig.Provider,
				Model:    testConfig.SpeechSynthesisModel,
				Input: &schemas.SpeechInput{
					Input: finalText,
				},
				Params: &schemas.SpeechParameters{
					VoiceConfig: &schemas.SpeechVoiceInput{
						Voice: &voice,
					},
					ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
					Instructions:   "Speak at a natural pace with clear pronunciation.",
				},
				Fallbacks: testConfig.SpeechSynthesisFallbacks,
			}

			retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStreamHD", testConfig)
			retryContext := TestRetryContext{
				ScenarioName: "SpeechSynthesisStreamHD_LongText",
				ExpectedBehavior: map[string]interface{}{
					"generate_hd_streaming_audio": true,
					"handle_long_text":            true,
					"min_chunks":                  3,
					"min_total_bytes":             10000,
				},
				TestMetadata: map[string]interface{}{
					"provider":    testConfig.Provider,
					"model":       testConfig.SpeechSynthesisModel,
					"text_length": len(finalText),
					"voice":       voice,
				},
			}

			responseChannel, err := WithStreamRetry(t, retryConfig, retryContext, func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
				// A fresh request context is built for every attempt.
				requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
				return client.SpeechStreamRequest(requestCtx, request)
			})
			if err != nil {
				RequireNoError(t, err, "HD streaming speech synthesis failed")
			}
			// Ranging over a nil channel would block forever, so fail fast.
			if responseChannel == nil {
				t.Fatal("Response channel should not be nil")
			}

			var (
				totalBytes       int
				chunkCount       int
				streamErrors     []string
				lastTokenLatency int64
				audioBuffer      bytes.Buffer // Accumulate audio chunks for validation
			)

			for response := range responseChannel {
				if response == nil {
					streamErrors = append(streamErrors, "Received nil HD stream response")
					continue
				}
				if response.BifrostError != nil {
					streamErrors = append(streamErrors, FormatErrorConcise(ParseBifrostError(response.BifrostError)))
					continue
				}

				speech := response.BifrostSpeechStreamResponse
				if speech == nil {
					// Nothing to record for a chunk without a speech payload.
					continue
				}
				lastTokenLatency = speech.ExtraFields.Latency

				if speech.Audio != nil {
					chunkSize := len(speech.Audio)
					if chunkSize == 0 {
						t.Logf("⚠️ Skipping zero-length HD audio chunk")
						continue
					}
					// Accumulate audio data for codec validation
					audioBuffer.Write(speech.Audio)
					totalBytes += chunkSize
					chunkCount++
					t.Logf("✅ HD chunk %d: %d bytes", chunkCount, chunkSize)
				}

				if speech.ExtraFields.OriginalModelRequested != "" && speech.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
					t.Logf("⚠️ Unexpected HD model: %s", speech.ExtraFields.OriginalModelRequested)
				}
			}

			if len(streamErrors) > 0 {
				t.Logf("⚠️ HD stream errors: %v", streamErrors)
			}
			if chunkCount <= 3 {
				t.Fatalf("HD model should produce more chunks for long text: got %d, expected > 3", chunkCount)
			}
			if totalBytes <= 10000 {
				t.Fatalf("HD model should produce substantial audio data: got %d bytes, expected > 10000", totalBytes)
			}
			if lastTokenLatency == 0 {
				t.Fatalf("❌ Last token latency is 0")
			}

			// Save audio to temp file, validate codec, and cleanup after test
			if audioBuffer.Len() == 0 {
				t.Fatal("No audio data accumulated for codec validation")
			}
			audioData := audioBuffer.Bytes()
			if testConfig.Provider == schemas.Gemini {
				// If provider is Gemini, we will have to convert the PCM bytes to WAV bytes
				wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
				if convErr != nil {
					t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
				}
				audioData = wavData
			}
			filePath, validationErr := SaveAndValidateAudio(t, audioData)
			if validationErr != nil {
				t.Fatalf("Audio codec validation failed: %v", validationErr)
			}
			t.Logf("Audio file validated successfully (detected format: %s): %s", GetProviderDefaultFormat(testConfig.Provider), filePath)

			t.Logf("✅ HD streaming successful: %d chunks, %d total bytes", chunkCount, totalBytes)
		})

		t.Run("MultipleVoices_Streaming", func(t *testing.T) {
			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
				t.Parallel()
			}

			// Test streaming with the voices known for the provider under test.
			// Providers without an entry get no voice subtests.
			var voices []string
			switch testConfig.Provider {
			case schemas.OpenAI:
				voices = []string{"alloy", "echo", "fable", "onyx", "nova", "shimmer"}
			case schemas.Gemini:
				voices = []string{"achernar", "achird", "erinome"}
			case schemas.Elevenlabs:
				// it's not possible to test all voices with Elevenlabs, we are using a few
				voices = []string{"21m00Tcm4TlvDq8ikWAM", "29vD33N1CtxCmqQRPOHJ", "2EiwWnXFnvU5JabPnv8n"}
			}

			testText := "Testing streaming speech synthesis with different voice options."

			for _, voice := range voices {
				// Per-iteration copy: captured by a parallel subtest and taken by address below.
				voiceCopy := voice
				t.Run("StreamingVoice_"+voiceCopy, func(t *testing.T) {
					if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
						t.Parallel()
					}

					request := &schemas.BifrostSpeechRequest{
						Provider: testConfig.Provider,
						Model:    testConfig.SpeechSynthesisModel,
						Input: &schemas.SpeechInput{
							Input: testText,
						},
						Params: &schemas.SpeechParameters{
							VoiceConfig: &schemas.SpeechVoiceInput{
								Voice: &voiceCopy,
							},
							ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
						},
						Fallbacks: testConfig.SpeechSynthesisFallbacks,
					}

					retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStreamVoice", testConfig)
					retryContext := TestRetryContext{
						ScenarioName: "SpeechSynthesisStream_Voice_" + voiceCopy,
						ExpectedBehavior: map[string]interface{}{
							"generate_streaming_audio": true,
							"voice_type":               voiceCopy,
						},
						TestMetadata: map[string]interface{}{
							"provider": testConfig.Provider,
							"voice":    voiceCopy,
						},
					}

					// Use retry framework with stream validation
					var accumulatedAudio bytes.Buffer // Accumulate audio for codec validation
					validationResult := WithSpeechStreamValidationRetry(
						t,
						retryConfig,
						retryContext,
						func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
							accumulatedAudio.Reset() // Reset buffer on retry
							requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
							return client.SpeechStreamRequest(requestCtx, request)
						},
						func(responseChannel chan *schemas.BifrostStreamChunk) SpeechStreamValidationResult {
							// Validate stream content
							var (
								receivedData     bool
								streamErrors     []string
								lastTokenLatency int64
								validationErrors []string
							)

							for response := range responseChannel {
								if response == nil {
									streamErrors = append(streamErrors, fmt.Sprintf("Received nil stream response for voice %s", voiceCopy))
									continue
								}
								if response.BifrostError != nil {
									streamErrors = append(streamErrors, fmt.Sprintf("Error in stream for voice %s: %s", voiceCopy, FormatErrorConcise(ParseBifrostError(response.BifrostError))))
									continue
								}

								speech := response.BifrostSpeechStreamResponse
								if speech == nil {
									continue
								}
								lastTokenLatency = speech.ExtraFields.Latency

								// len of a nil slice is 0, so this also covers missing audio.
								if len(speech.Audio) > 0 {
									receivedData = true
									// Accumulate audio data for codec validation
									accumulatedAudio.Write(speech.Audio)
									t.Logf("✅ Received data for voice %s: %d bytes", voiceCopy, len(speech.Audio))
								}
							}

							// Build validation errors
							if len(streamErrors) > 0 {
								validationErrors = append(validationErrors, fmt.Sprintf("Stream errors: %v", streamErrors))
							}
							if !receivedData {
								validationErrors = append(validationErrors, fmt.Sprintf("Should receive audio data for voice %s", voiceCopy))
							}
							if lastTokenLatency == 0 {
								validationErrors = append(validationErrors, "Last token latency is 0")
							}

							return SpeechStreamValidationResult{
								Passed:       len(validationErrors) == 0,
								Errors:       validationErrors,
								ReceivedData: receivedData,
								StreamErrors: streamErrors,
								LastLatency:  lastTokenLatency,
							}
						},
					)

					// Check validation result
					if !validationResult.Passed {
						// Build the combined list in a fresh slice so the append cannot
						// write into the backing array of validationResult.Errors.
						allErrors := make([]string, 0, len(validationResult.Errors)+len(validationResult.StreamErrors))
						allErrors = append(allErrors, validationResult.Errors...)
						allErrors = append(allErrors, validationResult.StreamErrors...)
						t.Fatalf("❌ Speech streaming validation failed for voice %s: %s", voiceCopy, strings.Join(allErrors, "; "))
					}

					// Save audio to temp file, validate codec, and cleanup after test
					if accumulatedAudio.Len() == 0 {
						t.Fatalf("❌ No audio data accumulated for codec validation (voice: %s)", voiceCopy)
					}
					audioData := accumulatedAudio.Bytes()
					if testConfig.Provider == schemas.Gemini {
						// Gemini audio is wrapped as WAV before validation.
						wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
						if convErr != nil {
							t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
						}
						audioData = wavData
					}
					filePath, validationErr := SaveAndValidateAudio(t, audioData)
					if validationErr != nil {
						t.Fatalf("❌ Audio codec validation failed for voice %s: %v", voiceCopy, validationErr)
					}
					t.Logf("🎵 Audio file validated successfully for voice %s: %s", voiceCopy, filePath)

					t.Logf("✅ Streaming successful for voice: %s", voiceCopy)
				})
			}
		})
	})
}