package gemini import ( "context" "fmt" "strings" "github.com/maximhq/bifrost/core/providers/utils" "github.com/maximhq/bifrost/core/schemas" ) // ToBifrostSpeechRequest converts a GeminiGenerationRequest to a BifrostSpeechRequest func (request *GeminiGenerationRequest) ToBifrostSpeechRequest(ctx *schemas.BifrostContext) *schemas.BifrostSpeechRequest { provider, model := schemas.ParseModelString(request.Model, utils.CheckAndSetDefaultProvider(ctx, schemas.Gemini)) bifrostReq := &schemas.BifrostSpeechRequest{ Provider: provider, Model: model, } // Extract text input from contents var textInput string for _, content := range request.Contents { for _, part := range content.Parts { if part.Text != "" { textInput += part.Text } } } bifrostReq.Input = &schemas.SpeechInput{ Input: textInput, } // Convert generation config to parameters if request.GenerationConfig.SpeechConfig != nil || len(request.GenerationConfig.ResponseModalities) > 0 { bifrostReq.Params = &schemas.SpeechParameters{} // Extract voice config from speech config if request.GenerationConfig.SpeechConfig != nil { // Handle single-speaker voice config if request.GenerationConfig.SpeechConfig.VoiceConfig != nil { bifrostReq.Params.VoiceConfig = &schemas.SpeechVoiceInput{} if request.GenerationConfig.SpeechConfig.VoiceConfig.PrebuiltVoiceConfig != nil { voiceName := request.GenerationConfig.SpeechConfig.VoiceConfig.PrebuiltVoiceConfig.VoiceName bifrostReq.Params.VoiceConfig.Voice = &voiceName } } else if request.GenerationConfig.SpeechConfig.MultiSpeakerVoiceConfig != nil { // Handle multi-speaker voice config // Convert to Bifrost's MultiVoiceConfig format if len(request.GenerationConfig.SpeechConfig.MultiSpeakerVoiceConfig.SpeakerVoiceConfigs) > 0 { bifrostReq.Params.VoiceConfig = &schemas.SpeechVoiceInput{} multiVoiceConfig := make([]schemas.VoiceConfig, 0, len(request.GenerationConfig.SpeechConfig.MultiSpeakerVoiceConfig.SpeakerVoiceConfigs)) for _, speakerConfig := range request.GenerationConfig.SpeechConfig.MultiSpeakerVoiceConfig.SpeakerVoiceConfigs { if speakerConfig.VoiceConfig != nil && speakerConfig.VoiceConfig.PrebuiltVoiceConfig != nil { multiVoiceConfig = append(multiVoiceConfig, schemas.VoiceConfig{ Speaker: speakerConfig.Speaker, Voice: speakerConfig.VoiceConfig.PrebuiltVoiceConfig.VoiceName, }) } } bifrostReq.Params.VoiceConfig.MultiVoiceConfig = multiVoiceConfig } } } // Store response modalities in extra params if needed if len(request.GenerationConfig.ResponseModalities) > 0 { if bifrostReq.Params.ExtraParams == nil { bifrostReq.Params.ExtraParams = make(map[string]interface{}) } modalities := make([]string, len(request.GenerationConfig.ResponseModalities)) for i, mod := range request.GenerationConfig.ResponseModalities { modalities[i] = string(mod) } bifrostReq.Params.ExtraParams["response_modalities"] = modalities } } return bifrostReq } // ToGeminiSpeechRequest converts a BifrostSpeechRequest to a GeminiGenerationRequest func ToGeminiSpeechRequest(bifrostReq *schemas.BifrostSpeechRequest) (*GeminiGenerationRequest, error) { if bifrostReq == nil { return nil, fmt.Errorf("bifrostReq is nil") } // Here we confirm if the response_format is wav or empty string // If its anything else, we will return an error if bifrostReq.Params != nil && bifrostReq.Params.ResponseFormat != "" && bifrostReq.Params.ResponseFormat != "wav" { return nil, fmt.Errorf("gemini does not support response_format: %s. Only wav or empty string is supported which defaults to wav", bifrostReq.Params.ResponseFormat) } // Create the base Gemini generation request geminiReq := &GeminiGenerationRequest{ Model: bifrostReq.Model, } // Convert parameters to generation config geminiReq.GenerationConfig.ResponseModalities = []Modality{ModalityAudio} // Convert speech input to Gemini format if bifrostReq.Input != nil && bifrostReq.Input.Input != "" { geminiReq.Contents = []Content{ { Parts: []*Part{ { Text: bifrostReq.Input.Input, }, }, }, } // Add speech config to generation config if voice config is provided if bifrostReq.Params != nil && bifrostReq.Params.VoiceConfig != nil { // Handle both single voice and multi-voice configurations if bifrostReq.Params.VoiceConfig.Voice != nil || len(bifrostReq.Params.VoiceConfig.MultiVoiceConfig) > 0 { addSpeechConfigToGenerationConfig(&geminiReq.GenerationConfig, bifrostReq.Params.VoiceConfig) } geminiReq.ExtraParams = bifrostReq.Params.ExtraParams } } return geminiReq, nil } // ToBifrostSpeechResponse converts a GenerateContentResponse to a BifrostSpeechResponse func (response *GenerateContentResponse) ToBifrostSpeechResponse(ctx context.Context) (*schemas.BifrostSpeechResponse, error) { bifrostResp := &schemas.BifrostSpeechResponse{} // Process candidates to extract audio content if len(response.Candidates) > 0 { candidate := response.Candidates[0] if candidate.Content != nil && len(candidate.Content.Parts) > 0 { var audioData []byte // Extract audio data from all parts for _, part := range candidate.Content.Parts { if part.InlineData != nil && len(part.InlineData.Data) > 0 { // Check if this is audio data if strings.HasPrefix(part.InlineData.MIMEType, "audio/") { decodedData, err := decodeBase64StringToBytes(part.InlineData.Data) if err != nil { return nil, fmt.Errorf("failed to decode base64 audio data: %v", err) } audioData = append(audioData, decodedData...) } } } if len(audioData) > 0 { responseFormat := ctx.Value(BifrostContextKeyResponseFormat).(string) // Gemini returns PCM audio (s16le, 24000 Hz, mono) // Convert to WAV for standard playable output format if responseFormat == "wav" { wavData, err := utils.ConvertPCMToWAV(audioData, utils.DefaultGeminiPCMConfig()) if err != nil { return nil, fmt.Errorf("failed to convert PCM to WAV: %v", err) } bifrostResp.Audio = wavData } else { bifrostResp.Audio = audioData } } // Set usage information if response.UsageMetadata != nil { bifrostResp.Usage = convertGeminiUsageMetadataToSpeechUsage(response.UsageMetadata) } } } return bifrostResp, nil } // ToGeminiSpeechResponse converts a BifrostSpeechResponse to Gemini's GenerateContentResponse func ToGeminiSpeechResponse(bifrostResp *schemas.BifrostSpeechResponse) *GenerateContentResponse { if bifrostResp == nil { return nil } genaiResp := &GenerateContentResponse{} candidate := &Candidate{ Content: &Content{ Parts: []*Part{ { InlineData: &Blob{ Data: encodeBytesToBase64String(bifrostResp.Audio), MIMEType: utils.DetectAudioMimeType(bifrostResp.Audio), }, }, }, Role: string(RoleModel), }, } // Set usage metadata if present if bifrostResp.Usage != nil { genaiResp.UsageMetadata = convertBifrostSpeechUsageToGeminiUsageMetadata(bifrostResp.Usage) } genaiResp.Candidates = []*Candidate{candidate} return genaiResp }