Files
bifrost/core/providers/elevenlabs/transcription.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

270 lines
7.6 KiB
Go

package elevenlabs
import (
"errors"
"strings"
"github.com/bytedance/sonic"
"github.com/maximhq/bifrost/core/schemas"
)
// ToElevenlabsTranscriptionRequest converts a Bifrost transcription request into
// an ElevenlabsTranscriptionRequest.
//
// It returns nil when bifrostReq is nil. The model and, when a non-empty file is
// present, the file bytes and filename are copied first; if Params is nil the
// request is returned with only those fields set. Otherwise the language,
// additional formats and webhook metadata are mapped, recognised keys found in
// Params.ExtraParams are lifted into their typed request fields, and the keys
// that remain are forwarded through req.ExtraParams.
//
// The caller's ExtraParams map is never modified: recognised keys are removed
// from a private copy only, so converting the same Bifrost request again yields
// the same result.
func ToElevenlabsTranscriptionRequest(bifrostReq *schemas.BifrostTranscriptionRequest) *ElevenlabsTranscriptionRequest {
	if bifrostReq == nil {
		return nil
	}

	req := &ElevenlabsTranscriptionRequest{
		ModelID: bifrostReq.Model,
	}
	if bifrostReq.Input != nil && len(bifrostReq.Input.File) > 0 {
		req.File = bifrostReq.Input.File
		req.Filename = bifrostReq.Input.Filename
	}

	params := bifrostReq.Params
	if params == nil {
		return req
	}

	if params.Language != nil {
		req.LanguageCode = params.Language
	}

	if params.ExtraParams != nil {
		// Work on a copy so that deleting recognised keys below does not strip
		// them from the map owned by the caller.
		extra := make(map[string]interface{}, len(params.ExtraParams))
		for key, value := range params.ExtraParams {
			extra[key] = value
		}

		if tagAudioEvents, ok := schemas.SafeExtractBoolPointer(extra["tag_audio_events"]); ok {
			delete(extra, "tag_audio_events")
			req.TagAudioEvents = tagAudioEvents
		}
		if numSpeakers, ok := schemas.SafeExtractIntPointer(extra["num_speakers"]); ok {
			delete(extra, "num_speakers")
			req.NumSpeakers = numSpeakers
		}
		if timestampsGranularity, ok := schemas.SafeExtractStringPointer(extra["timestamps_granularity"]); ok {
			granularity := ElevenlabsTimestampsGranularity(*timestampsGranularity)
			delete(extra, "timestamps_granularity")
			req.TimestampsGranularity = &granularity
		}
		if diarize, ok := schemas.SafeExtractBoolPointer(extra["diarize"]); ok {
			delete(extra, "diarize")
			req.Diarize = diarize
		}
		if diarizationThreshold, ok := schemas.SafeExtractFloat64Pointer(extra["diarization_threshold"]); ok {
			delete(extra, "diarization_threshold")
			req.DiarizationThreshold = diarizationThreshold
		}
		if rawFormat, ok := schemas.SafeExtractStringPointer(extra["file_format"]); ok {
			format := ElevenlabsFileFormat(*rawFormat)
			delete(extra, "file_format")
			req.FileFormat = &format
		}
		if cloudStorageURL, ok := schemas.SafeExtractStringPointer(extra["cloud_storage_url"]); ok {
			delete(extra, "cloud_storage_url")
			req.CloudStorageURL = cloudStorageURL
		}
		if webhook, ok := schemas.SafeExtractBoolPointer(extra["webhook"]); ok {
			delete(extra, "webhook")
			req.Webhook = webhook
		}
		if webhookID, ok := schemas.SafeExtractStringPointer(extra["webhook_id"]); ok {
			delete(extra, "webhook_id")
			req.WebhookID = webhookID
		}
		if temperature, ok := schemas.SafeExtractFloat64Pointer(extra["temperature"]); ok {
			delete(extra, "temperature")
			req.Temperature = temperature
		}
		if seed, ok := schemas.SafeExtractIntPointer(extra["seed"]); ok {
			delete(extra, "seed")
			req.Seed = seed
		}
		if useMultiChannel, ok := schemas.SafeExtractBoolPointer(extra["use_multi_channel"]); ok {
			delete(extra, "use_multi_channel")
			req.UseMultiChannel = useMultiChannel
		}

		// Whatever was not recognised (or failed extraction) is passed through.
		req.ExtraParams = extra
	}

	if len(params.AdditionalFormats) > 0 {
		additionalFormats := make([]ElevenlabsAdditionalFormat, 0, len(params.AdditionalFormats))
		for _, format := range params.AdditionalFormats {
			if converted, ok := convertAdditionalFormat(format); ok {
				additionalFormats = append(additionalFormats, converted)
			}
		}
		// Leave the field unset when every entry was rejected by the converter.
		if len(additionalFormats) > 0 {
			req.AdditionalFormats = additionalFormats
		}
	}

	if params.WebhookMetadata != nil {
		if metadataMap, ok := params.WebhookMetadata.(map[string]interface{}); ok {
			// An empty metadata map is dropped rather than forwarded.
			if len(metadataMap) > 0 {
				req.WebhookMetadata = metadataMap
			}
		} else {
			req.WebhookMetadata = params.WebhookMetadata
		}
	}

	return req
}
// ToBifrostTranscriptionResponse merges one or more ElevenLabs transcript chunks
// into a single Bifrost transcription response.
//
// It returns nil when chunks is empty. Chunk texts are joined with "\n" in input
// order, and the words and log-probabilities produced by convertWords for each
// chunk are concatenated. Language is taken from the first chunk that has a
// non-empty LanguageCode, and Duration is the largest per-chunk duration that
// convertWords reported; both stay unset when no chunk supplies them.
func ToBifrostTranscriptionResponse(chunks []ElevenlabsSpeechToTextChunkResponse) *schemas.BifrostTranscriptionResponse {
	if len(chunks) == 0 {
		return nil
	}

	var (
		texts    = make([]string, 0, len(chunks))
		words    = make([]schemas.TranscriptionWord, 0)
		logProbs = make([]schemas.TranscriptionLogProb, 0)
		lang     *string
		longest  *float64
	)

	for i := range chunks {
		chunk := &chunks[i]
		texts = append(texts, chunk.Text)

		chunkWords, chunkLogProbs, chunkDuration := convertWords(chunk.Words)
		words = append(words, chunkWords...)
		logProbs = append(logProbs, chunkLogProbs...)

		// First non-empty language code wins.
		if lang == nil && chunk.LanguageCode != "" {
			code := chunk.LanguageCode
			lang = &code
		}

		if chunkDuration == nil {
			continue
		}
		if longest == nil || *chunkDuration > *longest {
			d := *chunkDuration
			longest = &d
		}
	}

	resp := &schemas.BifrostTranscriptionResponse{
		Text:     strings.Join(texts, "\n"),
		Words:    words,
		LogProbs: logProbs,
	}
	if lang != nil {
		resp.Language = lang
	}
	if longest != nil {
		resp.Duration = longest
	}
	return resp
}
// convertAdditionalFormat maps a Bifrost additional-format entry onto its
// ElevenLabs counterpart.
//
// The boolean result is false, with a zero-value format, when format.Format is
// empty. Otherwise Format is converted to ElevenlabsExportOptions and each
// optional setting is copied across only when it is non-nil.
func convertAdditionalFormat(format schemas.TranscriptionAdditionalFormat) (ElevenlabsAdditionalFormat, bool) {
	var out ElevenlabsAdditionalFormat
	if format.Format == "" {
		return out, false
	}
	out.Format = ElevenlabsExportOptions(format.Format)

	if v := format.IncludeSpeakers; v != nil {
		out.IncludeSpeakers = v
	}
	if v := format.IncludeTimestamps; v != nil {
		out.IncludeTimestamps = v
	}
	if v := format.SegmentOnSilenceLongerThanS; v != nil {
		out.SegmentOnSilenceLongerThanS = v
	}
	if v := format.MaxSegmentDurationS; v != nil {
		out.MaxSegmentDurationS = v
	}
	if v := format.MaxSegmentChars; v != nil {
		out.MaxSegmentChars = v
	}
	if v := format.MaxCharactersPerLine; v != nil {
		out.MaxCharactersPerLine = v
	}

	return out, true
}
// convertWords turns ElevenLabs word entries into Bifrost transcription words and
// matching log-probability entries.
//
// All three results are nil when words is empty. Entries whose Type is "spacing"
// and whose Text is blank after trimming are skipped. Start and End are copied
// when present. The third result is the largest End value seen among the kept
// entries, or nil if none of them carried an End.
func convertWords(words []ElevenlabsSpeechToTextWord) ([]schemas.TranscriptionWord, []schemas.TranscriptionLogProb, *float64) {
	if len(words) == 0 {
		return nil, nil, nil
	}

	var (
		out       = make([]schemas.TranscriptionWord, 0, len(words))
		probs     = make([]schemas.TranscriptionLogProb, 0, len(words))
		latestEnd float64
		sawEnd    bool
	)

	for _, w := range words {
		if w.Type == "spacing" && strings.TrimSpace(w.Text) == "" {
			continue
		}

		entry := schemas.TranscriptionWord{Word: w.Text}
		if w.Start != nil {
			entry.Start = *w.Start
		}
		if end := w.End; end != nil {
			entry.End = *end
			// The first End seen always seeds the maximum, even if it is not positive.
			if !sawEnd || *end > latestEnd {
				latestEnd, sawEnd = *end, true
			}
		}

		out = append(out, entry)
		probs = append(probs, schemas.TranscriptionLogProb{
			Token:   w.Text,
			LogProb: w.LogProb,
		})
	}

	if sawEnd {
		return out, probs, &latestEnd
	}
	return out, probs, nil
}
// parseTranscriptionResponse decodes a transcription response body into a list
// of transcript chunks.
//
// The body is tried against three shapes in order:
//  1. a multichannel response, accepted when it has at least one transcript;
//  2. a single chunk, accepted when it has a language code, text, or words;
//  3. a webhook response, whose non-blank Message is returned as the error.
//
// If none of the shapes applies, a generic "unexpected format" error is
// returned. Decode errors from the individual attempts are not surfaced.
func parseTranscriptionResponse(body []byte) ([]ElevenlabsSpeechToTextChunkResponse, error) {
	var multi ElevenlabsMultichannelSpeechToTextResponse
	if sonic.Unmarshal(body, &multi) == nil && len(multi.Transcripts) > 0 {
		return multi.Transcripts, nil
	}

	var chunk ElevenlabsSpeechToTextChunkResponse
	if sonic.Unmarshal(body, &chunk) == nil {
		hasContent := chunk.LanguageCode != "" || chunk.Text != "" || len(chunk.Words) > 0
		if hasContent {
			return []ElevenlabsSpeechToTextChunkResponse{chunk}, nil
		}
	}

	var hook ElevenlabsSpeechToTextWebhookResponse
	if sonic.Unmarshal(body, &hook) == nil && strings.TrimSpace(hook.Message) != "" {
		return nil, errors.New(hook.Message)
	}

	return nil, errors.New("unexpected Elevenlabs transcription response format")
}