first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/core/internal/llmtests/speech_synthesis_stream.go
+++ b/core/internal/llmtests/speech_synthesis_stream.go
@@ -0,0 +1,550 @@
+package llmtests
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/providers/utils"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// RunSpeechSynthesisStreamTest executes the streaming speech synthesis test scenario.
+//
+// When testConfig.Scenarios.SpeechSynthesisStream is false the function logs and
+// returns without running anything. Otherwise it runs one subtest per table entry:
+// each opens a speech stream through WithStreamRetry, drains the channel while
+// accumulating the audio bytes, and then checks the chunk count, the total byte
+// count, the last reported latency and finally the codec of the accumulated audio.
+//
+// Subtests run in parallel unless the SKIP_PARALLEL_TESTS environment variable is "true".
+func RunSpeechSynthesisStreamTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.SpeechSynthesisStream {
+		t.Logf("Speech synthesis streaming not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("SpeechSynthesisStream", func(t *testing.T) {
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		// Test streaming with different text lengths
+		testCases := []struct {
+			name            string
+			text            string
+			voice           string
+			format          string
+			expectMinChunks int
+			expectMinBytes  int
+			skip            bool
+		}{
+			{
+				name:            "ShortText_Streaming",
+				text:            "This is a short text for streaming speech synthesis test.",
+				voice:           GetProviderVoice(testConfig.Provider, "primary"),
+				format:          GetProviderDefaultFormat(testConfig.Provider),
+				expectMinChunks: 1,
+				expectMinBytes:  1000,
+				skip:            false,
+			},
+			{
+				name: "LongText_Streaming",
+				text: `This is a longer text to test streaming speech synthesis functionality. 
+				       The streaming should provide audio chunks as they are generated, allowing for 
+				       real-time playback while the rest of the audio is still being processed. 
+				       This enables better user experience with reduced latency.`,
+				voice:           GetProviderVoice(testConfig.Provider, "secondary"),
+				format:          GetProviderDefaultFormat(testConfig.Provider),
+				expectMinChunks: 2,
+				expectMinBytes:  3000,
+				skip:            testConfig.Provider == schemas.Gemini,
+			},
+			// This flow is only available to pro accounts
+			// {
+			// 	name:            "MediumText_Echo_WAV",
+			// 	text:            "Testing streaming with WAV format. This should produce multiple audio chunks in WAV format for streaming playback.",
+			// 	voice:           GetProviderVoice(testConfig.Provider, "tertiary"),
+			// 	format:          "wav",
+			// 	expectMinChunks: 1,
+			// 	expectMinBytes:  2000,
+			// 	skip:            false,
+			// },
+		}
+
+		for _, tc := range testCases {
+			// Shadow the range variable: the subtest may call t.Parallel, which
+			// postpones its body until the enclosing function returns. On toolchains
+			// older than Go 1.22 every closure would otherwise see the last entry.
+			tc := tc
+			t.Run(tc.name, func(t *testing.T) {
+				if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+					t.Parallel()
+				}
+
+				if tc.skip {
+					t.Skipf("Skipping %s test", tc.name)
+				}
+
+				// The request needs a *string, so take the address of a local copy.
+				voice := tc.voice
+				request := &schemas.BifrostSpeechRequest{
+					Provider: testConfig.Provider,
+					Model:    testConfig.SpeechSynthesisModel,
+					Input: &schemas.SpeechInput{
+						Input: tc.text,
+					},
+					Params: &schemas.SpeechParameters{
+						VoiceConfig: &schemas.SpeechVoiceInput{
+							Voice: &voice,
+						},
+						ResponseFormat: tc.format,
+					},
+					Fallbacks: testConfig.SpeechSynthesisFallbacks,
+				}
+
+				// Use retry framework for streaming speech synthesis
+				retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStream", testConfig)
+				retryContext := TestRetryContext{
+					ScenarioName: "SpeechSynthesisStream_" + tc.name,
+					ExpectedBehavior: map[string]interface{}{
+						"generate_streaming_audio": true,
+						"voice_type":               tc.voice,
+						"format":                   tc.format,
+						"min_chunks":               tc.expectMinChunks,
+						"min_total_bytes":          tc.expectMinBytes,
+					},
+					TestMetadata: map[string]interface{}{
+						"provider":    testConfig.Provider,
+						"model":       testConfig.SpeechSynthesisModel,
+						"text_length": len(tc.text),
+						"voice":       tc.voice,
+						"format":      tc.format,
+					},
+				}
+
+				responseChannel, err := WithStreamRetry(t, retryConfig, retryContext, func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
+					requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+					return client.SpeechStreamRequest(requestCtx, request)
+				})
+
+				// Enhanced validation for streaming speech synthesis
+				RequireNoError(t, err, "Speech synthesis stream initiation failed")
+				// Ranging over a nil channel would block forever, so fail fast instead.
+				if responseChannel == nil {
+					t.Fatal("Response channel should not be nil")
+				}
+
+				var totalBytes int
+				var chunkCount int
+				var lastResponse *schemas.BifrostStreamChunk
+				var streamErrors []string
+				var lastTokenLatency int64
+				var audioBuffer bytes.Buffer // Accumulate audio chunks for validation
+
+				// Read streaming chunks with enhanced validation
+				for response := range responseChannel {
+					if response == nil {
+						streamErrors = append(streamErrors, "Received nil stream response")
+						continue
+					}
+
+					// Check for errors in stream
+					if response.BifrostError != nil {
+						streamErrors = append(streamErrors, FormatErrorConcise(ParseBifrostError(response.BifrostError)))
+						continue
+					}
+
+					speech := response.BifrostSpeechStreamResponse
+					if speech == nil {
+						streamErrors = append(streamErrors, "Stream response missing speech stream payload")
+						continue
+					}
+
+					// Track the latency of the most recent speech payload, even if it
+					// turns out to carry no audio.
+					lastTokenLatency = speech.ExtraFields.Latency
+
+					if speech.Audio == nil {
+						streamErrors = append(streamErrors, "Stream response missing audio data")
+						continue
+					}
+
+					// Log latency for each chunk (can be 0 for inter-chunks)
+					t.Logf("📊 Speech chunk %d latency: %d ms", chunkCount+1, speech.ExtraFields.Latency)
+
+					// Collect audio chunks
+					chunkSize := len(speech.Audio)
+					if chunkSize == 0 {
+						t.Logf("⚠️ Skipping zero-length audio chunk")
+						continue
+					}
+					// Accumulate audio data for codec validation
+					audioBuffer.Write(speech.Audio)
+					totalBytes += chunkSize
+					chunkCount++
+					t.Logf("✅ Received audio chunk %d: %d bytes", chunkCount, chunkSize)
+
+					// Validate chunk structure (warnings only, never fatal)
+					if speech.Type != "" && speech.Type != schemas.SpeechStreamResponseTypeDelta && speech.Type != schemas.SpeechStreamResponseTypeDone {
+						t.Logf("⚠️ Unexpected object type in stream: %s", speech.Type)
+					}
+					if requested := speech.ExtraFields.OriginalModelRequested; requested != "" && requested != testConfig.SpeechSynthesisModel {
+						t.Logf("⚠️ Unexpected model in stream: %s", requested)
+					}
+
+					lastResponse = DeepCopyBifrostStreamChunk(response)
+				}
+
+				// Enhanced validation of streaming results
+				if len(streamErrors) > 0 {
+					t.Logf("⚠️ Stream errors encountered: %v", streamErrors)
+				}
+
+				if chunkCount < tc.expectMinChunks {
+					t.Fatalf("Insufficient chunks received: got %d, expected at least %d", chunkCount, tc.expectMinChunks)
+				}
+
+				if totalBytes < tc.expectMinBytes {
+					t.Fatalf("Insufficient audio data: got %d bytes, expected at least %d", totalBytes, tc.expectMinBytes)
+				}
+
+				if lastResponse == nil {
+					t.Fatal("Should have received at least one response")
+				}
+
+				// Additional streaming-specific validations; this also guards the
+				// division below against a zero chunk count.
+				if chunkCount == 0 {
+					t.Fatal("No audio chunks received from stream")
+				}
+
+				averageChunkSize := totalBytes / chunkCount
+				if averageChunkSize < 100 {
+					t.Logf("Average chunk size seems small: %d bytes", averageChunkSize)
+				}
+
+				if lastTokenLatency == 0 {
+					t.Fatalf("❌ Last token latency is 0")
+				}
+
+				// Save audio to temp file, validate codec, and cleanup after test
+				if audioBuffer.Len() == 0 {
+					t.Fatal("No audio data accumulated for codec validation")
+				}
+				audioData := audioBuffer.Bytes()
+				if testConfig.Provider == schemas.Gemini {
+					// Gemini audio is converted from PCM to WAV before validation.
+					wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
+					if convErr != nil {
+						t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
+					}
+					audioData = wavData
+				}
+				filePath, validationErr := SaveAndValidateAudio(t, audioData)
+				if validationErr != nil {
+					t.Fatalf("Audio codec validation failed: %v", validationErr)
+				}
+				t.Logf("Audio file validated successfully: %s", filePath)
+
+				t.Logf("✅ Streaming speech synthesis successful: %d chunks, %d total bytes for voice '%s' in %s format",
+					chunkCount, totalBytes, tc.voice, tc.format)
+			})
+		}
+	})
+}
+
+// RunSpeechSynthesisStreamAdvancedTest executes advanced streaming speech synthesis test scenarios.
+//
+// When testConfig.Scenarios.SpeechSynthesisStream is false the function logs and
+// returns without running anything. Otherwise it runs two subtests:
+//
+//   - LongText_HDModel_Streaming streams a 20-sentence text and requires more than
+//     3 audio chunks, more than 10000 bytes, a non-zero last latency and audio that
+//     passes codec validation. It is skipped for the Gemini provider.
+//   - MultipleVoices_Streaming streams a short text once per voice in a
+//     provider-specific list, using WithSpeechStreamValidationRetry so that the
+//     stream content is validated as part of each attempt.
+//
+// Subtests run in parallel unless the SKIP_PARALLEL_TESTS environment variable is "true".
+func RunSpeechSynthesisStreamAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.SpeechSynthesisStream {
+		t.Logf("Speech synthesis streaming not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("SpeechSynthesisStreamAdvanced", func(t *testing.T) {
+		t.Run("LongText_HDModel_Streaming", func(t *testing.T) {
+			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+				t.Parallel()
+			}
+
+			if testConfig.Provider == schemas.Gemini {
+				t.Skipf("Skipping %s test", "LongText_HDModel_Streaming")
+			}
+
+			// Test streaming with HD model and very long text. Each sentence carries
+			// its real index (1..20) so the sentences stay distinguishable.
+			var textBuilder strings.Builder
+			for i := 1; i <= 20; i++ {
+				fmt.Fprintf(&textBuilder, "This is sentence number %d in a very long text for testing streaming speech synthesis with the HD model. ", i)
+			}
+			finalText := textBuilder.String()
+
+			voice := GetProviderVoice(testConfig.Provider, "tertiary")
+			request := &schemas.BifrostSpeechRequest{
+				Provider: testConfig.Provider,
+				Model:    testConfig.SpeechSynthesisModel,
+				Input: &schemas.SpeechInput{
+					Input: finalText,
+				},
+				Params: &schemas.SpeechParameters{
+					VoiceConfig: &schemas.SpeechVoiceInput{
+						Voice: &voice,
+					},
+					ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
+					Instructions:   "Speak at a natural pace with clear pronunciation.",
+				},
+				Fallbacks: testConfig.SpeechSynthesisFallbacks,
+			}
+
+			retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStreamHD", testConfig)
+			retryContext := TestRetryContext{
+				ScenarioName: "SpeechSynthesisStreamHD_LongText",
+				ExpectedBehavior: map[string]interface{}{
+					"generate_hd_streaming_audio": true,
+					"handle_long_text":            true,
+					"min_chunks":                  3,
+					"min_total_bytes":             10000,
+				},
+				TestMetadata: map[string]interface{}{
+					"provider":    testConfig.Provider,
+					"model":       testConfig.SpeechSynthesisModel,
+					"text_length": len(finalText),
+					"voice":       voice,
+				},
+			}
+
+			responseChannel, err := WithStreamRetry(t, retryConfig, retryContext, func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
+				requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+				return client.SpeechStreamRequest(requestCtx, request)
+			})
+
+			RequireNoError(t, err, "HD streaming speech synthesis failed")
+			// Ranging over a nil channel would block forever, so fail fast instead.
+			if responseChannel == nil {
+				t.Fatal("Response channel should not be nil")
+			}
+
+			var totalBytes int
+			var chunkCount int
+			var streamErrors []string
+			var lastTokenLatency int64
+			var audioBuffer bytes.Buffer // Accumulate audio chunks for validation
+
+			for response := range responseChannel {
+				if response == nil {
+					streamErrors = append(streamErrors, "Received nil HD stream response")
+					continue
+				}
+
+				if response.BifrostError != nil {
+					streamErrors = append(streamErrors, FormatErrorConcise(ParseBifrostError(response.BifrostError)))
+					continue
+				}
+
+				speech := response.BifrostSpeechStreamResponse
+				if speech == nil {
+					continue
+				}
+				lastTokenLatency = speech.ExtraFields.Latency
+
+				if speech.Audio != nil {
+					chunkSize := len(speech.Audio)
+					if chunkSize == 0 {
+						t.Logf("⚠️ Skipping zero-length HD audio chunk")
+						continue
+					}
+					// Accumulate audio data for codec validation
+					audioBuffer.Write(speech.Audio)
+					totalBytes += chunkSize
+					chunkCount++
+					t.Logf("✅ HD chunk %d: %d bytes", chunkCount, chunkSize)
+				}
+
+				if requested := speech.ExtraFields.OriginalModelRequested; requested != "" && requested != testConfig.SpeechSynthesisModel {
+					t.Logf("⚠️ Unexpected HD model: %s", requested)
+				}
+			}
+
+			if len(streamErrors) > 0 {
+				t.Logf("⚠️ HD stream errors: %v", streamErrors)
+			}
+
+			if chunkCount <= 3 {
+				t.Fatalf("HD model should produce more chunks for long text: got %d, expected > 3", chunkCount)
+			}
+
+			if totalBytes <= 10000 {
+				t.Fatalf("HD model should produce substantial audio data: got %d bytes, expected > 10000", totalBytes)
+			}
+
+			if lastTokenLatency == 0 {
+				t.Fatalf("❌ Last token latency is 0")
+			}
+
+			// Save audio to temp file, validate codec, and cleanup after test
+			if audioBuffer.Len() == 0 {
+				t.Fatal("No audio data accumulated for codec validation")
+			}
+			audioData := audioBuffer.Bytes()
+			if testConfig.Provider == schemas.Gemini {
+				// If provider is Gemini, we will have to convert the PCM bytes to WAV bytes
+				wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
+				if convErr != nil {
+					t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
+				}
+				audioData = wavData
+			}
+			filePath, validationErr := SaveAndValidateAudio(t, audioData)
+			if validationErr != nil {
+				t.Fatalf("Audio codec validation failed: %v", validationErr)
+			}
+			t.Logf("Audio file validated successfully (detected format: %s): %s", GetProviderDefaultFormat(testConfig.Provider), filePath)
+
+			t.Logf("✅ HD streaming successful: %d chunks, %d total bytes", chunkCount, totalBytes)
+		})
+
+		t.Run("MultipleVoices_Streaming", func(t *testing.T) {
+			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+				t.Parallel()
+			}
+
+			testText := "Testing streaming speech synthesis with different voice options."
+
+			// Pick the voice list for the provider under test.
+			var voices []string
+			switch testConfig.Provider {
+			case schemas.OpenAI:
+				voices = []string{"alloy", "echo", "fable", "onyx", "nova", "shimmer"}
+			case schemas.Gemini:
+				voices = []string{"achernar", "achird", "erinome"}
+			case schemas.Elevenlabs:
+				// it's not possible to test all voices with Elevenlabs, we are using a few
+				voices = []string{"21m00Tcm4TlvDq8ikWAM", "29vD33N1CtxCmqQRPOHJ", "2EiwWnXFnvU5JabPnv8n"}
+			}
+
+			// Make the "nothing to run" case visible instead of passing silently.
+			if len(voices) == 0 {
+				t.Logf("No streaming voices configured for provider %s", testConfig.Provider)
+				return
+			}
+
+			for _, voice := range voices {
+				// Per-iteration copy: the parallel subtest outlives the loop iteration
+				// and the request stores a pointer to the voice.
+				voiceCopy := voice
+				t.Run("StreamingVoice_"+voiceCopy, func(t *testing.T) {
+					if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+						t.Parallel()
+					}
+
+					request := &schemas.BifrostSpeechRequest{
+						Provider: testConfig.Provider,
+						Model:    testConfig.SpeechSynthesisModel,
+						Input: &schemas.SpeechInput{
+							Input: testText,
+						},
+						Params: &schemas.SpeechParameters{
+							VoiceConfig: &schemas.SpeechVoiceInput{
+								Voice: &voiceCopy,
+							},
+							ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
+						},
+						Fallbacks: testConfig.SpeechSynthesisFallbacks,
+					}
+
+					retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisStreamVoice", testConfig)
+					retryContext := TestRetryContext{
+						ScenarioName: "SpeechSynthesisStream_Voice_" + voiceCopy,
+						ExpectedBehavior: map[string]interface{}{
+							"generate_streaming_audio": true,
+							"voice_type":               voiceCopy,
+						},
+						TestMetadata: map[string]interface{}{
+							"provider": testConfig.Provider,
+							"voice":    voiceCopy,
+						},
+					}
+
+					// Use retry framework with stream validation
+					var accumulatedAudio bytes.Buffer // Accumulate audio for codec validation
+					validationResult := WithSpeechStreamValidationRetry(
+						t,
+						retryConfig,
+						retryContext,
+						func() (chan *schemas.BifrostStreamChunk, *schemas.BifrostError) {
+							accumulatedAudio.Reset() // Reset buffer on retry
+							requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+							return client.SpeechStreamRequest(requestCtx, request)
+						},
+						func(responseChannel chan *schemas.BifrostStreamChunk) SpeechStreamValidationResult {
+							// Validate stream content
+							var receivedData bool
+							var streamErrors []string
+							var lastTokenLatency int64
+							var validationErrors []string
+
+							for response := range responseChannel {
+								if response == nil {
+									streamErrors = append(streamErrors, fmt.Sprintf("Received nil stream response for voice %s", voiceCopy))
+									continue
+								}
+
+								if response.BifrostError != nil {
+									streamErrors = append(streamErrors, fmt.Sprintf("Error in stream for voice %s: %s", voiceCopy, FormatErrorConcise(ParseBifrostError(response.BifrostError))))
+									continue
+								}
+
+								speech := response.BifrostSpeechStreamResponse
+								if speech == nil {
+									continue
+								}
+								lastTokenLatency = speech.ExtraFields.Latency
+
+								if len(speech.Audio) > 0 {
+									receivedData = true
+									// Accumulate audio data for codec validation
+									accumulatedAudio.Write(speech.Audio)
+									t.Logf("✅ Received data for voice %s: %d bytes", voiceCopy, len(speech.Audio))
+								}
+							}
+
+							// Build validation errors
+							if len(streamErrors) > 0 {
+								validationErrors = append(validationErrors, fmt.Sprintf("Stream errors: %v", streamErrors))
+							}
+
+							if !receivedData {
+								validationErrors = append(validationErrors, fmt.Sprintf("Should receive audio data for voice %s", voiceCopy))
+							}
+
+							if lastTokenLatency == 0 {
+								validationErrors = append(validationErrors, "Last token latency is 0")
+							}
+
+							return SpeechStreamValidationResult{
+								Passed:       len(validationErrors) == 0,
+								Errors:       validationErrors,
+								ReceivedData: receivedData,
+								StreamErrors: streamErrors,
+								LastLatency:  lastTokenLatency,
+							}
+						},
+					)
+
+					// Check validation result
+					if !validationResult.Passed {
+						// Copy into a fresh slice so appending cannot write into the
+						// backing array of validationResult.Errors.
+						allErrors := make([]string, 0, len(validationResult.Errors)+len(validationResult.StreamErrors))
+						allErrors = append(allErrors, validationResult.Errors...)
+						allErrors = append(allErrors, validationResult.StreamErrors...)
+						t.Fatalf("❌ Speech streaming validation failed for voice %s: %s", voiceCopy, strings.Join(allErrors, "; "))
+					}
+
+					// Save audio to temp file, validate codec, and cleanup after test
+					if accumulatedAudio.Len() == 0 {
+						t.Fatalf("❌ No audio data accumulated for codec validation (voice: %s)", voiceCopy)
+					}
+					audioData := accumulatedAudio.Bytes()
+					if testConfig.Provider == schemas.Gemini {
+						// Gemini audio is converted from PCM to WAV before validation.
+						wavData, convErr := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig())
+						if convErr != nil {
+							t.Fatalf("Failed to convert PCM to WAV: %v", convErr)
+						}
+						audioData = wavData
+					}
+					filePath, validationErr := SaveAndValidateAudio(t, audioData)
+					if validationErr != nil {
+						t.Fatalf("❌ Audio codec validation failed for voice %s: %v", voiceCopy, validationErr)
+					}
+					t.Logf("🎵 Audio file validated successfully for voice %s: %s", voiceCopy, filePath)
+
+					t.Logf("✅ Streaming successful for voice: %s", voiceCopy)
+				})
+			}
+		})
+	})
+}