first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/core/internal/llmtests/speech_synthesis.go
+++ b/core/internal/llmtests/speech_synthesis.go
@@ -0,0 +1,352 @@
+package llmtests
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// RunSpeechSynthesisTest executes the speech synthesis test scenario.
+//
+// When testConfig.Scenarios.SpeechSynthesis is false it only logs that the
+// provider does not support the scenario and returns. Otherwise it runs a
+// "SpeechSynthesis" subtest containing one child subtest per table entry. Each
+// child issues a speech request through client (wrapped in WithSpeechTestRetry),
+// validates the returned audio, and, when saveForSST is set, writes the audio
+// to a file in os.TempDir() that is removed again by the child's cleanup.
+//
+// The parent and child subtests call t.Parallel() unless the
+// SKIP_PARALLEL_TESTS environment variable is "true".
+func RunSpeechSynthesisTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.SpeechSynthesis {
+		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("SpeechSynthesis", func(t *testing.T) {
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		// Test with shared text constants for round-trip validation with transcription.
+		// Note: the case names end in "_MP3", but the format actually requested is
+		// whatever GetProviderDefaultFormat returns for the provider.
+		testCases := []struct {
+			name           string
+			text           string
+			voiceType      string
+			format         string
+			expectMinBytes int
+			saveForSST     bool // Whether to save this audio for SST round-trip testing
+		}{
+			{
+				name:           "BasicText_Primary_MP3",
+				text:           TTSTestTextBasic,
+				voiceType:      "primary",
+				format:         GetProviderDefaultFormat(testConfig.Provider),
+				expectMinBytes: 1000,
+				saveForSST:     true,
+			},
+			{
+				name:           "MediumText_Secondary_MP3",
+				text:           TTSTestTextMedium,
+				voiceType:      "secondary",
+				format:         GetProviderDefaultFormat(testConfig.Provider),
+				expectMinBytes: 2000,
+				saveForSST:     true,
+			},
+			{
+				name:           "TechnicalText_Tertiary_MP3",
+				text:           TTSTestTextTechnical,
+				voiceType:      "tertiary",
+				format:         GetProviderDefaultFormat(testConfig.Provider),
+				expectMinBytes: 500,
+				saveForSST:     true,
+			},
+		}
+
+		for _, tc := range testCases {
+			t.Run(tc.name, func(t *testing.T) {
+				if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+					t.Parallel()
+				}
+
+				// Single label shared by the retry context, the retry wrapper and
+				// the failure message so the three can never drift apart.
+				scenarioName := "SpeechSynthesis_" + tc.name
+
+				voice := GetProviderVoice(testConfig.Provider, tc.voiceType)
+				request := &schemas.BifrostSpeechRequest{
+					Provider: testConfig.Provider,
+					Model:    testConfig.SpeechSynthesisModel, // Use configured model
+					Input: &schemas.SpeechInput{
+						Input: tc.text,
+					},
+					Params: &schemas.SpeechParameters{
+						VoiceConfig: &schemas.SpeechVoiceInput{
+							Voice: &voice,
+						},
+						ResponseFormat: tc.format,
+					},
+					Fallbacks: testConfig.SpeechSynthesisFallbacks,
+				}
+
+				// Use retry framework with enhanced validation
+				retryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
+				retryContext := TestRetryContext{
+					ScenarioName: scenarioName,
+					ExpectedBehavior: map[string]interface{}{
+						"should_generate_audio": true,
+					},
+					TestMetadata: map[string]interface{}{
+						"provider": testConfig.Provider,
+						"model":    testConfig.SpeechSynthesisModel,
+						"format":   tc.format,
+						"voice":    voice,
+					},
+				}
+
+				// Enhanced validation for speech synthesis
+				// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
+				expectations := ApplyRawExpectations(SpeechExpectations(tc.expectMinBytes), testConfig, false, false, true)
+				expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+				// Copy the generic retry settings into the speech-specific retry config.
+				speechRetryConfig := SpeechRetryConfig{
+					MaxAttempts: retryConfig.MaxAttempts,
+					BaseDelay:   retryConfig.BaseDelay,
+					MaxDelay:    retryConfig.MaxDelay,
+					Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
+					OnRetry:     retryConfig.OnRetry,
+					OnFinalFail: retryConfig.OnFinalFail,
+				}
+
+				// A fresh request context is created for every attempt.
+				speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, scenarioName, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
+					requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+					return client.SpeechRequest(requestCtx, request)
+				})
+
+				if bifrostErr != nil {
+					// Constant format string: the case name is passed as an argument
+					// so a '%' in it can never be interpreted as a formatting verb.
+					t.Fatalf("❌ %s request failed after retries: %v", scenarioName, GetErrorMessage(bifrostErr))
+				}
+
+				// Additional speech-specific validations (complementary to main validation).
+				// This also guarantees speechResponse is non-nil for the code below.
+				validateSpeechSynthesisSpecific(t, speechResponse, tc.expectMinBytes, testConfig.SpeechSynthesisModel)
+
+				// Save audio file for SST round-trip testing if requested
+				if tc.saveForSST {
+					tempDir := os.TempDir()
+					audioFileName := filepath.Join(tempDir, "tts_"+tc.name+"."+tc.format)
+
+					err := os.WriteFile(audioFileName, speechResponse.Audio, 0644)
+					require.NoError(t, err, "Failed to save audio file for SST testing")
+
+					// Register cleanup to remove the temp file once this subtest finishes.
+					// Removal is best-effort: a leftover temp file must not fail the test.
+					t.Cleanup(func() {
+						_ = os.Remove(audioFileName)
+					})
+
+					t.Logf("💾 Audio saved for SST testing: %s (text: '%s')", audioFileName, tc.text)
+				}
+
+				t.Logf("✅ Speech synthesis successful: %d bytes of %s audio generated for voice '%s'",
+					len(speechResponse.Audio), tc.format, voice)
+			})
+		}
+	})
+}
+
+// RunSpeechSynthesisAdvancedTest executes advanced speech synthesis test scenarios
+func RunSpeechSynthesisAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
+	if !testConfig.Scenarios.SpeechSynthesis {
+		t.Logf("Speech synthesis not supported for provider %s", testConfig.Provider)
+		return
+	}
+
+	t.Run("SpeechSynthesisAdvanced", func(t *testing.T) {
+		if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+			t.Parallel()
+		}
+
+		t.Run("LongText_HDModel", func(t *testing.T) {
+			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+				t.Parallel()
+			}
+
+			// Test with longer text and HD model
+			longText := `
+			This is a comprehensive test of the text-to-speech functionality using a longer piece of text.
+			The system should be able to handle multiple sentences, proper punctuation, and maintain 
+			consistent voice quality throughout the entire speech generation process. This test ensures
+			that the speech synthesis can handle realistic use cases with substantial content.
+			`
+
+			voice := GetProviderVoice(testConfig.Provider, "tertiary")
+			request := &schemas.BifrostSpeechRequest{
+				Provider: testConfig.Provider,
+				Model:    testConfig.SpeechSynthesisModel,
+				Input: &schemas.SpeechInput{
+					Input: longText,
+				},
+				Params: &schemas.SpeechParameters{
+					VoiceConfig: &schemas.SpeechVoiceInput{
+						Voice: &voice,
+					},
+					ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
+					Instructions:   "Speak slowly and clearly with natural intonation.",
+				},
+				Fallbacks: testConfig.SpeechSynthesisFallbacks,
+			}
+
+			// Groq doesn't support instructions
+			if testConfig.Provider == schemas.Groq {
+				request.Params.Instructions = ""
+			}
+
+			retryConfig := GetTestRetryConfigForScenario("SpeechSynthesisHD", testConfig)
+			retryContext := TestRetryContext{
+				ScenarioName: "SpeechSynthesis_HD_LongText",
+				ExpectedBehavior: map[string]interface{}{
+					"generate_hd_audio": true,
+					"handle_long_text":  true,
+					"min_audio_bytes":   5000,
+				},
+				TestMetadata: map[string]interface{}{
+					"provider":    testConfig.Provider,
+					"model":       testConfig.SpeechSynthesisModel,
+					"text_length": len(longText),
+				},
+			}
+
+			// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
+			expectations := ApplyRawExpectations(SpeechExpectations(5000), testConfig, false, false, true) // HD should produce substantial audio
+			expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+			// Create Speech retry config
+			speechRetryConfig := SpeechRetryConfig{
+				MaxAttempts: retryConfig.MaxAttempts,
+				BaseDelay:   retryConfig.BaseDelay,
+				MaxDelay:    retryConfig.MaxDelay,
+				Conditions:  []SpeechRetryCondition{}, // Add specific speech retry conditions as needed
+				OnRetry:     retryConfig.OnRetry,
+				OnFinalFail: retryConfig.OnFinalFail,
+			}
+
+			speechResponse, bifrostErr := WithSpeechTestRetry(t, speechRetryConfig, retryContext, expectations, "SpeechSynthesis_HD", func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
+				requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+				return client.SpeechRequest(requestCtx, request)
+			})
+			if bifrostErr != nil {
+				t.Fatalf("❌ SpeechSynthesis_HD request failed after retries: %v", GetErrorMessage(bifrostErr))
+			}
+
+			if speechResponse == nil || speechResponse.Audio == nil {
+				t.Fatal("HD speech synthesis response missing audio data")
+			}
+
+			audioSize := len(speechResponse.Audio)
+			if audioSize < 5000 {
+				t.Fatalf("HD audio data too small: got %d bytes, expected at least 5000", audioSize)
+			}
+
+			if speechResponse.ExtraFields.OriginalModelRequested != testConfig.SpeechSynthesisModel {
+				t.Logf("⚠️ Expected HD model, got: %s", speechResponse.ExtraFields.OriginalModelRequested)
+			}
+
+			t.Logf("✅ HD speech synthesis successful: %d bytes generated", len(speechResponse.Audio))
+		})
+
+		t.Run("AllVoiceOptions", func(t *testing.T) {
+			if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+				t.Parallel()
+			}
+
+			// Test provider-specific voice options
+			voiceTypes := []string{"primary", "secondary", "tertiary"}
+			testText := TTSTestTextBasic // Use shared constant
+
+			for _, voiceType := range voiceTypes {
+				t.Run("VoiceType_"+voiceType, func(t *testing.T) {
+					if os.Getenv("SKIP_PARALLEL_TESTS") != "true" {
+						t.Parallel()
+					}
+
+					voice := GetProviderVoice(testConfig.Provider, voiceType)
+					request := &schemas.BifrostSpeechRequest{
+						Provider: testConfig.Provider,
+						Model:    testConfig.SpeechSynthesisModel,
+						Input: &schemas.SpeechInput{
+							Input: testText,
+						},
+						Params: &schemas.SpeechParameters{
+							VoiceConfig: &schemas.SpeechVoiceInput{
+								Voice: &voice,
+							},
+							ResponseFormat: GetProviderDefaultFormat(testConfig.Provider),
+						},
+						Fallbacks: testConfig.SpeechSynthesisFallbacks,
+					}
+
+					// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
+					expectations := ApplyRawExpectations(SpeechExpectations(500), testConfig, false, false, true)
+					expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
+
+					// Use retry framework for voice test
+					voiceRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
+					voiceRetryContext := TestRetryContext{
+						ScenarioName: "SpeechSynthesis_VoiceType_" + voiceType,
+						ExpectedBehavior: map[string]interface{}{
+							"should_generate_audio": true,
+						},
+						TestMetadata: map[string]interface{}{
+							"provider":   testConfig.Provider,
+							"model":      testConfig.SpeechSynthesisModel,
+							"voice_type": voiceType,
+							"voice":      voice,
+						},
+					}
+					voiceSpeechRetryConfig := SpeechRetryConfig{
+						MaxAttempts: voiceRetryConfig.MaxAttempts,
+						BaseDelay:   voiceRetryConfig.BaseDelay,
+						MaxDelay:    voiceRetryConfig.MaxDelay,
+						Conditions:  []SpeechRetryCondition{},
+						OnRetry:     voiceRetryConfig.OnRetry,
+						OnFinalFail: voiceRetryConfig.OnFinalFail,
+					}
+
+					speechResponse, bifrostErr := WithSpeechTestRetry(t, voiceSpeechRetryConfig, voiceRetryContext, expectations, "SpeechSynthesis_VoiceType_"+voiceType, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
+						requestCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
+						return client.SpeechRequest(requestCtx, request)
+					})
+
+					if bifrostErr != nil {
+						t.Fatalf("❌ SpeechSynthesis_Voice_"+voiceType+" request failed after retries: %v", GetErrorMessage(bifrostErr))
+					}
+
+					if speechResponse == nil || speechResponse.Audio == nil {
+						t.Fatalf("Voice %s (%s) missing audio data after retries", voice, voiceType)
+					}
+
+					audioSize := len(speechResponse.Audio)
+					if audioSize < 500 {
+						t.Fatalf("Audio too small for voice %s: got %d bytes, expected at least 500", voice, audioSize)
+					}
+					t.Logf("✅ Voice %s (%s): %d bytes generated", voice, voiceType, len(speechResponse.Audio))
+				})
+			}
+		})
+	})
+}
+
+// validateSpeechSynthesisSpecific performs speech-specific validation.
+// This is complementary to the main validation framework and focuses on speech synthesis concerns.
+//
+// It fails the test immediately (t.Fatal) when response is nil, when
+// response.Audio is nil, or when the audio is shorter than expectMinBytes.
+// When expectedModel is non-empty and differs from
+// response.ExtraFields.OriginalModelRequested, the mismatch is only logged.
+// On success it logs the number of audio bytes.
+func validateSpeechSynthesisSpecific(t *testing.T, response *schemas.BifrostSpeechResponse, expectMinBytes int, expectedModel string) {
+	// Attribute failures and logs to the calling test line rather than this helper.
+	t.Helper()
+
+	if response == nil {
+		t.Fatal("Invalid speech synthesis response structure")
+	}
+
+	if response.Audio == nil {
+		t.Fatal("Speech synthesis response missing audio data")
+	}
+
+	audioSize := len(response.Audio)
+	if audioSize < expectMinBytes {
+		t.Fatalf("Audio data too small: got %d bytes, expected at least %d", audioSize, expectMinBytes)
+	}
+
+	// A model mismatch is a warning, not a failure; report both sides so the
+	// log line is actionable on its own.
+	if expectedModel != "" && response.ExtraFields.OriginalModelRequested != expectedModel {
+		t.Logf("⚠️ Expected model %s, got: %s", expectedModel, response.ExtraFields.OriginalModelRequested)
+	}
+
+	t.Logf("✅ Audio validation passed: %d bytes generated", audioSize)
+}