first commit
This commit is contained in:
698
core/internal/llmtests/transcription.go
Normal file
698
core/internal/llmtests/transcription.go
Normal file
@@ -0,0 +1,698 @@
|
||||
package llmtests
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
bifrost "github.com/maximhq/bifrost/core"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
)
|
||||
|
||||
// RunTranscriptionTest executes the transcription test scenario
|
||||
func RunTranscriptionTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
||||
if !testConfig.Scenarios.Transcription {
|
||||
t.Logf("Transcription not supported for provider %s", testConfig.Provider)
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("Transcription", func(t *testing.T) {
|
||||
// First generate TTS audio for round-trip validation
|
||||
roundTripCases := []struct {
|
||||
name string
|
||||
text string
|
||||
voiceType string
|
||||
format string
|
||||
responseFormat *string
|
||||
}{
|
||||
{
|
||||
name: "RoundTrip_Basic_MP3",
|
||||
text: TTSTestTextBasic,
|
||||
voiceType: "primary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
responseFormat: bifrost.Ptr("json"),
|
||||
},
|
||||
{
|
||||
name: "RoundTrip_Medium_MP3",
|
||||
text: TTSTestTextMedium,
|
||||
voiceType: "secondary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
responseFormat: bifrost.Ptr("json"),
|
||||
},
|
||||
{
|
||||
name: "RoundTrip_Technical_MP3",
|
||||
text: TTSTestTextTechnical,
|
||||
voiceType: "tertiary",
|
||||
format: GetProviderDefaultFormat(testConfig.Provider),
|
||||
responseFormat: bifrost.Ptr("json"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range roundTripCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ShouldRunParallel(t, testConfig, "Transcription")
|
||||
|
||||
speechSynthesisProvider := testConfig.Provider
|
||||
if testConfig.ExternalTTSProvider != "" {
|
||||
speechSynthesisProvider = testConfig.ExternalTTSProvider
|
||||
}
|
||||
|
||||
speechSynthesisModel := testConfig.SpeechSynthesisModel
|
||||
if testConfig.ExternalTTSModel != "" {
|
||||
speechSynthesisModel = testConfig.ExternalTTSModel
|
||||
}
|
||||
|
||||
var transcriptionRequest *schemas.BifrostTranscriptionRequest
|
||||
if testConfig.Provider == schemas.HuggingFace && strings.HasPrefix(testConfig.TranscriptionModel, "fal-ai/") {
|
||||
|
||||
// For Fal-AI models on HuggingFace, we have to use mp3 but fal-ai speech models only return wav
|
||||
// So we read from a pre-generated mp3 file to avoid format issues
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
filePath := filepath.Join(dir, "scenarios", "media", fmt.Sprintf("%s.mp3", tc.name))
|
||||
fileContent, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read audio fixture %s: %v", filePath, err)
|
||||
}
|
||||
transcriptionRequest = &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: fileContent,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Language: bifrost.Ptr("en"),
|
||||
Format: bifrost.Ptr("mp3"),
|
||||
ResponseFormat: tc.responseFormat,
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
} else {
|
||||
|
||||
// Step 1: Generate TTS audio
|
||||
voice := GetProviderVoice(speechSynthesisProvider, tc.voiceType)
|
||||
ttsRequest := &schemas.BifrostSpeechRequest{
|
||||
Provider: speechSynthesisProvider,
|
||||
Model: speechSynthesisModel,
|
||||
Input: &schemas.SpeechInput{
|
||||
Input: tc.text,
|
||||
},
|
||||
Params: &schemas.SpeechParameters{
|
||||
VoiceConfig: &schemas.SpeechVoiceInput{
|
||||
Voice: &voice,
|
||||
},
|
||||
ResponseFormat: tc.format,
|
||||
},
|
||||
Fallbacks: testConfig.SpeechSynthesisFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework for TTS generation
|
||||
ttsRetryConfig := GetTestRetryConfigForScenario("SpeechSynthesis", testConfig)
|
||||
ttsRetryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_RoundTrip_TTS_" + tc.name,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_generate_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": speechSynthesisProvider,
|
||||
"model": speechSynthesisModel,
|
||||
"format": tc.format,
|
||||
},
|
||||
}
|
||||
// isStreaming=false, isMultipartRequest=false, isBinaryResponse=true (audio bytes don't have JSON raw response)
|
||||
ttsExpectations := ApplyRawExpectations(SpeechExpectations(100), testConfig, false, false, true) // Minimum expected bytes
|
||||
ttsExpectations = ModifyExpectationsForProvider(ttsExpectations, testConfig.Provider)
|
||||
speechRetryConfig := SpeechRetryConfig{
|
||||
MaxAttempts: ttsRetryConfig.MaxAttempts,
|
||||
BaseDelay: ttsRetryConfig.BaseDelay,
|
||||
MaxDelay: ttsRetryConfig.MaxDelay,
|
||||
Conditions: []SpeechRetryCondition{},
|
||||
OnRetry: ttsRetryConfig.OnRetry,
|
||||
OnFinalFail: ttsRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
ttsResponse, err := WithSpeechTestRetry(t, speechRetryConfig, ttsRetryContext, ttsExpectations, "Transcription_RoundTrip_TTS_"+tc.name, func() (*schemas.BifrostSpeechResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.SpeechRequest(bfCtx, ttsRequest)
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("❌ TTS generation failed for round-trip test after retries: %v", GetErrorMessage(err))
|
||||
}
|
||||
if ttsResponse == nil || len(ttsResponse.Audio) == 0 {
|
||||
t.Fatal("❌ TTS returned invalid or empty audio for round-trip test after retries")
|
||||
}
|
||||
|
||||
// Save temp audio file
|
||||
tempDir := os.TempDir()
|
||||
audioFileName := filepath.Join(tempDir, "roundtrip_"+tc.name+"."+tc.format)
|
||||
writeErr := os.WriteFile(audioFileName, ttsResponse.Audio, 0644)
|
||||
require.NoError(t, writeErr, "Failed to save temp audio file")
|
||||
|
||||
// Register cleanup
|
||||
t.Cleanup(func() {
|
||||
os.Remove(audioFileName)
|
||||
})
|
||||
|
||||
t.Logf("Generated TTS audio for round-trip: %s (%d bytes)", audioFileName, len(ttsResponse.Audio))
|
||||
|
||||
// Step 2: Transcribe the generated audio
|
||||
transcriptionRequest = &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: ttsResponse.Audio,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Language: bifrost.Ptr("en"),
|
||||
Format: schemas.Ptr(tc.format),
|
||||
ResponseFormat: tc.responseFormat,
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
}
|
||||
|
||||
// Use retry framework for transcription
|
||||
retryConfig := GetTestRetryConfigForScenario("Transcription", testConfig)
|
||||
retryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_RoundTrip_" + tc.name,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_transcribe_audio": true,
|
||||
"round_trip_test": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.TranscriptionModel,
|
||||
"format": tc.format,
|
||||
},
|
||||
}
|
||||
|
||||
// Enhanced validation for transcription
|
||||
// Note: isMultipartRequest=true because transcription uses multipart form data, not JSON body
|
||||
expectations := ApplyRawExpectations(TranscriptionExpectations(10), testConfig, false, true) // Expect at least some content
|
||||
expectations = ModifyExpectationsForProvider(expectations, testConfig.Provider)
|
||||
|
||||
// Create Transcription retry config
|
||||
transcriptionRetryConfig := TranscriptionRetryConfig{
|
||||
MaxAttempts: retryConfig.MaxAttempts,
|
||||
BaseDelay: retryConfig.BaseDelay,
|
||||
MaxDelay: retryConfig.MaxDelay,
|
||||
Conditions: []TranscriptionRetryCondition{}, // Add specific transcription retry conditions as needed
|
||||
OnRetry: retryConfig.OnRetry,
|
||||
OnFinalFail: retryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
transcriptionResponse, bifrostErr := WithTranscriptionTestRetry(t, transcriptionRetryConfig, retryContext, expectations, "Transcription_RoundTrip_"+tc.name, func() (*schemas.BifrostTranscriptionResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.TranscriptionRequest(bfCtx, transcriptionRequest)
|
||||
})
|
||||
|
||||
if bifrostErr != nil {
|
||||
t.Fatalf("❌ Transcription_RoundTrip_"+tc.name+" request failed after retries: %v", GetErrorMessage(bifrostErr))
|
||||
}
|
||||
|
||||
// Validate round-trip transcription (complementary to main validation)
|
||||
validateTranscriptionRoundTrip(t, transcriptionResponse, tc.text, tc.name, testConfig)
|
||||
})
|
||||
}
|
||||
|
||||
// Additional test cases using the utility function for edge cases
|
||||
t.Run("AdditionalAudioTests", func(t *testing.T) {
|
||||
// Test with custom generated audio for specific scenarios
|
||||
customCases := []struct {
|
||||
name string
|
||||
text string
|
||||
language *string
|
||||
responseFormat *string
|
||||
}{
|
||||
{
|
||||
name: "Numbers_And_Punctuation",
|
||||
text: "Testing numbers 1, 2, 3 and punctuation marks! Question?",
|
||||
language: bifrost.Ptr("en"),
|
||||
responseFormat: bifrost.Ptr("json"),
|
||||
},
|
||||
{
|
||||
name: "Technical_Terms",
|
||||
text: "API gateway processes HTTP requests with JSON payloads",
|
||||
language: bifrost.Ptr("en"),
|
||||
responseFormat: bifrost.Ptr("json"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range customCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ShouldRunParallel(t, testConfig, "Transcription")
|
||||
|
||||
speechSynthesisProvider := testConfig.Provider
|
||||
if testConfig.ExternalTTSProvider != "" {
|
||||
speechSynthesisProvider = testConfig.ExternalTTSProvider
|
||||
}
|
||||
|
||||
speechSynthesisModel := testConfig.SpeechSynthesisModel
|
||||
if testConfig.ExternalTTSModel != "" {
|
||||
speechSynthesisModel = testConfig.ExternalTTSModel
|
||||
}
|
||||
|
||||
audioFormat := GetProviderDefaultFormat(testConfig.Provider)
|
||||
|
||||
var audioData []byte
|
||||
var readErr error
|
||||
if testConfig.Provider == schemas.HuggingFace && strings.HasPrefix(testConfig.TranscriptionModel, "fal-ai/") {
|
||||
|
||||
// For Fal-AI models on HuggingFace, we have to use mp3 but fal-ai speech models only return wav
|
||||
// So we read from a pre-generated mp3 file to avoid format issues
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
filePath := filepath.Join(dir, "scenarios", "media", fmt.Sprintf("%s.mp3", tc.name))
|
||||
audioData, readErr = os.ReadFile(filePath)
|
||||
if readErr != nil {
|
||||
t.Fatalf("failed to read audio fixture %s: %v", filePath, readErr)
|
||||
}
|
||||
audioFormat = "mp3"
|
||||
} else {
|
||||
// Use the utility function to generate audio
|
||||
audioData, _ = GenerateTTSAudioForTest(ctx, t, client, speechSynthesisProvider, speechSynthesisModel, tc.text, "primary", audioFormat)
|
||||
}
|
||||
// Test transcription
|
||||
request := &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: audioData,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Language: tc.language,
|
||||
Format: &audioFormat,
|
||||
ResponseFormat: tc.responseFormat,
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework for custom transcription
|
||||
customRetryConfig := GetTestRetryConfigForScenario("Transcription", testConfig)
|
||||
customRetryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_Custom_" + tc.name,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_transcribe_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.TranscriptionModel,
|
||||
},
|
||||
}
|
||||
customExpectations := ApplyRawExpectations(TranscriptionExpectations(5), testConfig, false, true)
|
||||
customExpectations = ModifyExpectationsForProvider(customExpectations, testConfig.Provider)
|
||||
customTranscriptionRetryConfig := TranscriptionRetryConfig{
|
||||
MaxAttempts: customRetryConfig.MaxAttempts,
|
||||
BaseDelay: customRetryConfig.BaseDelay,
|
||||
MaxDelay: customRetryConfig.MaxDelay,
|
||||
Conditions: []TranscriptionRetryCondition{},
|
||||
OnRetry: customRetryConfig.OnRetry,
|
||||
OnFinalFail: customRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
response, err := WithTranscriptionTestRetry(t, customTranscriptionRetryConfig, customRetryContext, customExpectations, "Transcription_Custom_"+tc.name, func() (*schemas.BifrostTranscriptionResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.TranscriptionRequest(bfCtx, request)
|
||||
})
|
||||
if err != nil {
|
||||
errorMsg := GetErrorMessage(err)
|
||||
if !strings.Contains(errorMsg, "❌") {
|
||||
errorMsg = fmt.Sprintf("❌ %s", errorMsg)
|
||||
}
|
||||
t.Fatalf("❌ Custom transcription failed after retries: %s", errorMsg)
|
||||
}
|
||||
if response == nil {
|
||||
t.Fatalf("❌ Custom transcription returned nil response after retries")
|
||||
}
|
||||
if response.Text == "" {
|
||||
t.Fatalf("❌ Custom transcription returned empty text after retries")
|
||||
}
|
||||
|
||||
t.Logf("✅ Custom transcription successful: '%s' → '%s'", tc.text, response.Text)
|
||||
})
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// RunTranscriptionAdvancedTest executes advanced transcription test scenarios
|
||||
func RunTranscriptionAdvancedTest(t *testing.T, client *bifrost.Bifrost, ctx context.Context, testConfig ComprehensiveTestConfig) {
|
||||
if !testConfig.Scenarios.Transcription {
|
||||
t.Logf("Transcription not supported for provider %s", testConfig.Provider)
|
||||
return
|
||||
}
|
||||
|
||||
t.Run("TranscriptionAdvanced", func(t *testing.T) {
|
||||
t.Run("AllResponseFormats", func(t *testing.T) {
|
||||
// Test supported response formats (excluding text to avoid JSON parsing issues)
|
||||
formats := []string{"json"}
|
||||
|
||||
for _, format := range formats {
|
||||
t.Run("Format_"+format, func(t *testing.T) {
|
||||
ShouldRunParallel(t, testConfig, "Transcription")
|
||||
|
||||
speechSynthesisProvider := testConfig.Provider
|
||||
if testConfig.ExternalTTSProvider != "" {
|
||||
speechSynthesisProvider = testConfig.ExternalTTSProvider
|
||||
}
|
||||
|
||||
speechSynthesisModel := testConfig.SpeechSynthesisModel
|
||||
if testConfig.ExternalTTSModel != "" {
|
||||
speechSynthesisModel = testConfig.ExternalTTSModel
|
||||
}
|
||||
|
||||
audioFormat := GetProviderDefaultFormat(testConfig.Provider)
|
||||
|
||||
var audioData []byte
|
||||
var readErr error
|
||||
if testConfig.Provider == schemas.HuggingFace && strings.HasPrefix(testConfig.TranscriptionModel, "fal-ai/") {
|
||||
|
||||
// For Fal-AI models on HuggingFace, we have to use mp3 but fal-ai speech models only return wav
|
||||
// So we read from a pre-generated mp3 file to avoid format issues
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
filePath := filepath.Join(dir, "scenarios", "media", "RoundTrip_Basic_MP3.mp3")
|
||||
audioData, readErr = os.ReadFile(filePath)
|
||||
if readErr != nil {
|
||||
t.Fatalf("failed to read audio fixture %s: %v", filePath, readErr)
|
||||
}
|
||||
audioFormat = "mp3"
|
||||
} else {
|
||||
// Use the utility function to generate audio
|
||||
audioData, _ = GenerateTTSAudioForTest(ctx, t, client, speechSynthesisProvider, speechSynthesisModel, TTSTestTextBasic, "primary", audioFormat)
|
||||
}
|
||||
|
||||
formatCopy := format
|
||||
request := &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: audioData,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Format: &audioFormat,
|
||||
ResponseFormat: &formatCopy,
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework for format test
|
||||
formatRetryConfig := GetTestRetryConfigForScenario("Transcription", testConfig)
|
||||
formatRetryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_Format_" + format,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_transcribe_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.TranscriptionModel,
|
||||
"format": format,
|
||||
},
|
||||
}
|
||||
formatExpectations := ApplyRawExpectations(TranscriptionExpectations(5), testConfig, false, true)
|
||||
formatExpectations = ModifyExpectationsForProvider(formatExpectations, testConfig.Provider)
|
||||
formatTranscriptionRetryConfig := TranscriptionRetryConfig{
|
||||
MaxAttempts: formatRetryConfig.MaxAttempts,
|
||||
BaseDelay: formatRetryConfig.BaseDelay,
|
||||
MaxDelay: formatRetryConfig.MaxDelay,
|
||||
Conditions: []TranscriptionRetryCondition{},
|
||||
OnRetry: formatRetryConfig.OnRetry,
|
||||
OnFinalFail: formatRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
response, err := WithTranscriptionTestRetry(t, formatTranscriptionRetryConfig, formatRetryContext, formatExpectations, "Transcription_Format_"+format, func() (*schemas.BifrostTranscriptionResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.TranscriptionRequest(bfCtx, request)
|
||||
})
|
||||
if err != nil {
|
||||
errorMsg := GetErrorMessage(err)
|
||||
if !strings.Contains(errorMsg, "❌") {
|
||||
errorMsg = fmt.Sprintf("❌ %s", errorMsg)
|
||||
}
|
||||
t.Fatalf("❌ Transcription failed for format %s after retries: %s", format, errorMsg)
|
||||
}
|
||||
if response == nil {
|
||||
t.Fatalf("❌ Transcription returned nil response for format %s after retries", format)
|
||||
}
|
||||
if response.Text == "" {
|
||||
t.Fatalf("❌ Transcription returned empty text for format %s after retries", format)
|
||||
}
|
||||
|
||||
t.Logf("✅ Format %s successful: '%s'", format, response.Text)
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("WithCustomParameters", func(t *testing.T) {
|
||||
ShouldRunParallel(t, testConfig, "Transcription")
|
||||
|
||||
speechSynthesisProvider := testConfig.Provider
|
||||
if testConfig.ExternalTTSProvider != "" {
|
||||
speechSynthesisProvider = testConfig.ExternalTTSProvider
|
||||
}
|
||||
|
||||
speechSynthesisModel := testConfig.SpeechSynthesisModel
|
||||
if testConfig.ExternalTTSModel != "" {
|
||||
speechSynthesisModel = testConfig.ExternalTTSModel
|
||||
}
|
||||
|
||||
audioFormat := GetProviderDefaultFormat(testConfig.Provider)
|
||||
|
||||
var audioData []byte
|
||||
var readErr error
|
||||
if testConfig.Provider == schemas.HuggingFace && strings.HasPrefix(testConfig.TranscriptionModel, "fal-ai/") {
|
||||
|
||||
// For Fal-AI models on HuggingFace, we have to use mp3 but fal-ai speech models only return wav
|
||||
// So we read from a pre-generated mp3 file to avoid format issues
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
filePath := filepath.Join(dir, "scenarios", "media", "RoundTrip_Medium_MP3.mp3")
|
||||
audioData, readErr = os.ReadFile(filePath)
|
||||
if readErr != nil {
|
||||
t.Fatalf("failed to read audio fixture %s: %v", filePath, readErr)
|
||||
}
|
||||
audioFormat = "mp3"
|
||||
} else {
|
||||
// Generate audio for custom parameters test
|
||||
audioData, _ = GenerateTTSAudioForTest(ctx, t, client, speechSynthesisProvider, speechSynthesisModel, TTSTestTextMedium, "secondary", audioFormat)
|
||||
}
|
||||
|
||||
// Test with custom parameters and temperature
|
||||
request := &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: audioData,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Language: bifrost.Ptr("en"),
|
||||
Format: &audioFormat,
|
||||
Prompt: bifrost.Ptr("This audio contains technical terminology and proper nouns."),
|
||||
ResponseFormat: bifrost.Ptr("json"), // Use json instead of verbose_json for whisper-1
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework for advanced transcription
|
||||
advancedRetryConfig := GetTestRetryConfigForScenario("Transcription", testConfig)
|
||||
advancedRetryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_Advanced_CustomParams",
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_transcribe_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.TranscriptionModel,
|
||||
},
|
||||
}
|
||||
advancedExpectations := ApplyRawExpectations(TranscriptionExpectations(5), testConfig, false, true)
|
||||
advancedExpectations = ModifyExpectationsForProvider(advancedExpectations, testConfig.Provider)
|
||||
advancedTranscriptionRetryConfig := TranscriptionRetryConfig{
|
||||
MaxAttempts: advancedRetryConfig.MaxAttempts,
|
||||
BaseDelay: advancedRetryConfig.BaseDelay,
|
||||
MaxDelay: advancedRetryConfig.MaxDelay,
|
||||
Conditions: []TranscriptionRetryCondition{},
|
||||
OnRetry: advancedRetryConfig.OnRetry,
|
||||
OnFinalFail: advancedRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
response, err := WithTranscriptionTestRetry(t, advancedTranscriptionRetryConfig, advancedRetryContext, advancedExpectations, "Transcription_Advanced_CustomParams", func() (*schemas.BifrostTranscriptionResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.TranscriptionRequest(bfCtx, request)
|
||||
})
|
||||
if err != nil {
|
||||
errorMsg := GetErrorMessage(err)
|
||||
if !strings.Contains(errorMsg, "❌") {
|
||||
errorMsg = fmt.Sprintf("❌ %s", errorMsg)
|
||||
}
|
||||
t.Fatalf("❌ Advanced transcription failed after retries: %s", errorMsg)
|
||||
}
|
||||
if response == nil {
|
||||
t.Fatalf("❌ Advanced transcription returned nil response after retries")
|
||||
}
|
||||
if response.Text == "" {
|
||||
t.Fatalf("❌ Advanced transcription returned empty text after retries")
|
||||
}
|
||||
|
||||
t.Logf("✅ Advanced transcription successful: '%s'", response.Text)
|
||||
})
|
||||
|
||||
t.Run("MultipleLanguages", func(t *testing.T) {
|
||||
// Test with different language hints (only English for now since our TTS is English)
|
||||
languages := []string{"en"}
|
||||
|
||||
for _, lang := range languages {
|
||||
t.Run("Language_"+lang, func(t *testing.T) {
|
||||
ShouldRunParallel(t, testConfig, "Transcription")
|
||||
|
||||
speechSynthesisProvider := testConfig.Provider
|
||||
if testConfig.ExternalTTSProvider != "" {
|
||||
speechSynthesisProvider = testConfig.ExternalTTSProvider
|
||||
}
|
||||
|
||||
speechSynthesisModel := testConfig.SpeechSynthesisModel
|
||||
if testConfig.ExternalTTSModel != "" {
|
||||
speechSynthesisModel = testConfig.ExternalTTSModel
|
||||
}
|
||||
|
||||
audioFormat := GetProviderDefaultFormat(testConfig.Provider)
|
||||
|
||||
var audioData []byte
|
||||
var readErr error
|
||||
if testConfig.Provider == schemas.HuggingFace && strings.HasPrefix(testConfig.TranscriptionModel, "fal-ai/") {
|
||||
|
||||
// For Fal-AI models on HuggingFace, we have to use mp3 but fal-ai speech models only return wav
|
||||
// So we read from a pre-generated mp3 file to avoid format issues
|
||||
_, filename, _, _ := runtime.Caller(0)
|
||||
dir := filepath.Dir(filename)
|
||||
filePath := filepath.Join(dir, "scenarios", "media", "RoundTrip_Basic_MP3.mp3")
|
||||
audioData, readErr = os.ReadFile(filePath)
|
||||
if readErr != nil {
|
||||
t.Fatalf("failed to read audio fixture %s: %v", filePath, readErr)
|
||||
}
|
||||
audioFormat = "mp3"
|
||||
} else {
|
||||
// Use the utility function to generate audio
|
||||
audioData, _ = GenerateTTSAudioForTest(ctx, t, client, speechSynthesisProvider, speechSynthesisModel, TTSTestTextBasic, "primary", audioFormat)
|
||||
}
|
||||
|
||||
langCopy := lang
|
||||
request := &schemas.BifrostTranscriptionRequest{
|
||||
Provider: testConfig.Provider,
|
||||
Model: testConfig.TranscriptionModel,
|
||||
Input: &schemas.TranscriptionInput{
|
||||
File: audioData,
|
||||
},
|
||||
Params: &schemas.TranscriptionParameters{
|
||||
Format: &audioFormat,
|
||||
Language: &langCopy,
|
||||
},
|
||||
Fallbacks: testConfig.TranscriptionFallbacks,
|
||||
}
|
||||
|
||||
// Use retry framework for language test
|
||||
langRetryConfig := GetTestRetryConfigForScenario("Transcription", testConfig)
|
||||
langRetryContext := TestRetryContext{
|
||||
ScenarioName: "Transcription_Language_" + lang,
|
||||
ExpectedBehavior: map[string]interface{}{
|
||||
"should_transcribe_audio": true,
|
||||
},
|
||||
TestMetadata: map[string]interface{}{
|
||||
"provider": testConfig.Provider,
|
||||
"model": testConfig.TranscriptionModel,
|
||||
"language": lang,
|
||||
},
|
||||
}
|
||||
langExpectations := ApplyRawExpectations(TranscriptionExpectations(5), testConfig, false, true)
|
||||
langExpectations = ModifyExpectationsForProvider(langExpectations, testConfig.Provider)
|
||||
langTranscriptionRetryConfig := TranscriptionRetryConfig{
|
||||
MaxAttempts: langRetryConfig.MaxAttempts,
|
||||
BaseDelay: langRetryConfig.BaseDelay,
|
||||
MaxDelay: langRetryConfig.MaxDelay,
|
||||
Conditions: []TranscriptionRetryCondition{},
|
||||
OnRetry: langRetryConfig.OnRetry,
|
||||
OnFinalFail: langRetryConfig.OnFinalFail,
|
||||
}
|
||||
|
||||
response, err := WithTranscriptionTestRetry(t, langTranscriptionRetryConfig, langRetryContext, langExpectations, "Transcription_Language_"+lang, func() (*schemas.BifrostTranscriptionResponse, *schemas.BifrostError) {
|
||||
bfCtx := schemas.NewBifrostContext(ctx, schemas.NoDeadline)
|
||||
return client.TranscriptionRequest(bfCtx, request)
|
||||
})
|
||||
if err != nil {
|
||||
errorMsg := GetErrorMessage(err)
|
||||
if !strings.Contains(errorMsg, "❌") {
|
||||
errorMsg = fmt.Sprintf("❌ %s", errorMsg)
|
||||
}
|
||||
t.Fatalf("❌ Transcription failed for language %s after retries: %s", lang, errorMsg)
|
||||
}
|
||||
if response == nil {
|
||||
t.Fatalf("❌ Transcription returned nil response for language %s after retries", lang)
|
||||
}
|
||||
if response.Text == "" {
|
||||
t.Fatalf("❌ Transcription returned empty text for language %s after retries", lang)
|
||||
}
|
||||
t.Logf("✅ Language %s transcription successful: '%s'", lang, response.Text)
|
||||
})
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// validateTranscriptionRoundTrip performs round-trip validation for transcription responses
|
||||
// This is complementary to the main validation framework and focuses on transcription accuracy
|
||||
func validateTranscriptionRoundTrip(t *testing.T, response *schemas.BifrostTranscriptionResponse, originalText string, testName string, testConfig ComprehensiveTestConfig) {
|
||||
if response == nil || response.Text == "" {
|
||||
t.Fatal("Transcription response missing transcribed text")
|
||||
}
|
||||
|
||||
transcribedText := response.Text
|
||||
|
||||
// Normalize for comparison (lowercase, remove punctuation)
|
||||
originalWords := strings.Fields(strings.ToLower(originalText))
|
||||
transcribedWords := strings.Fields(strings.ToLower(transcribedText))
|
||||
|
||||
// Check that at least 50% of original words are found in transcription
|
||||
foundWords := 0
|
||||
for _, originalWord := range originalWords {
|
||||
// Remove punctuation for comparison
|
||||
cleanOriginal := strings.Trim(originalWord, ".,!?;:")
|
||||
if len(cleanOriginal) < 3 { // Skip very short words
|
||||
continue
|
||||
}
|
||||
|
||||
for _, transcribedWord := range transcribedWords {
|
||||
cleanTranscribed := strings.Trim(transcribedWord, ".,!?;:")
|
||||
if strings.Contains(cleanTranscribed, cleanOriginal) || strings.Contains(cleanOriginal, cleanTranscribed) {
|
||||
foundWords++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Expect at least 50% word match for successful round-trip
|
||||
minExpectedWords := len(originalWords) / 2
|
||||
if foundWords < minExpectedWords {
|
||||
t.Logf("⚠️ Round-trip validation concern:")
|
||||
t.Logf(" Original: '%s'", originalText)
|
||||
t.Logf(" Transcribed: '%s'", transcribedText)
|
||||
t.Logf(" Found %d/%d words (%.1f%%), expected ≥ %d (50%%)",
|
||||
foundWords, len(originalWords), float64(foundWords)/float64(len(originalWords))*100, minExpectedWords)
|
||||
// Note: Not failing test as this can be provider/model dependent
|
||||
} else {
|
||||
t.Logf("✅ Round-trip validation passed: found %d/%d words (%.1f%%)",
|
||||
foundWords, len(originalWords), float64(foundWords)/float64(len(originalWords))*100)
|
||||
}
|
||||
|
||||
// Check provider field
|
||||
if response.ExtraFields.Provider != testConfig.Provider {
|
||||
t.Logf("⚠️ Provider mismatch: expected %s, got %s", testConfig.Provider, response.ExtraFields.Provider)
|
||||
}
|
||||
|
||||
t.Logf("Round-trip test '%s' completed successfully", testName)
|
||||
}
|
||||
Reference in New Issue
Block a user