first commit
This commit is contained in:
269
core/providers/elevenlabs/transcription.go
Normal file
269
core/providers/elevenlabs/transcription.go
Normal file
@@ -0,0 +1,269 @@
|
||||
package elevenlabs
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"github.com/bytedance/sonic"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
)
|
||||
|
||||
func ToElevenlabsTranscriptionRequest(bifrostReq *schemas.BifrostTranscriptionRequest) *ElevenlabsTranscriptionRequest {
|
||||
if bifrostReq == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
req := &ElevenlabsTranscriptionRequest{
|
||||
ModelID: bifrostReq.Model,
|
||||
}
|
||||
|
||||
if bifrostReq.Input != nil && len(bifrostReq.Input.File) > 0 {
|
||||
req.File = bifrostReq.Input.File
|
||||
req.Filename = bifrostReq.Input.Filename
|
||||
}
|
||||
|
||||
if bifrostReq.Params == nil {
|
||||
return req
|
||||
}
|
||||
|
||||
params := bifrostReq.Params
|
||||
|
||||
if params.Language != nil {
|
||||
req.LanguageCode = params.Language
|
||||
}
|
||||
|
||||
if params.ExtraParams != nil {
|
||||
if tagAudioEvents, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["tag_audio_events"]); ok {
|
||||
delete(params.ExtraParams, "tag_audio_events")
|
||||
req.TagAudioEvents = tagAudioEvents
|
||||
}
|
||||
if numSpeakers, ok := schemas.SafeExtractIntPointer(params.ExtraParams["num_speakers"]); ok {
|
||||
delete(params.ExtraParams, "num_speakers")
|
||||
req.NumSpeakers = numSpeakers
|
||||
}
|
||||
if timestampsGranularity, ok := schemas.SafeExtractStringPointer(params.ExtraParams["timestamps_granularity"]); ok {
|
||||
granularity := ElevenlabsTimestampsGranularity(*timestampsGranularity)
|
||||
delete(params.ExtraParams, "timestamps_granularity")
|
||||
req.TimestampsGranularity = &granularity
|
||||
}
|
||||
if diarize, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["diarize"]); ok {
|
||||
delete(params.ExtraParams, "diarize")
|
||||
req.Diarize = diarize
|
||||
}
|
||||
if diarizationThreshold, ok := schemas.SafeExtractFloat64Pointer(params.ExtraParams["diarization_threshold"]); ok {
|
||||
delete(params.ExtraParams, "diarization_threshold")
|
||||
req.DiarizationThreshold = diarizationThreshold
|
||||
}
|
||||
if fileFormat, ok := schemas.SafeExtractStringPointer(params.ExtraParams["file_format"]); ok {
|
||||
fileFormat := ElevenlabsFileFormat(*fileFormat)
|
||||
delete(params.ExtraParams, "file_format")
|
||||
req.FileFormat = &fileFormat
|
||||
}
|
||||
if cloudStorageURL, ok := schemas.SafeExtractStringPointer(params.ExtraParams["cloud_storage_url"]); ok {
|
||||
delete(params.ExtraParams, "cloud_storage_url")
|
||||
req.CloudStorageURL = cloudStorageURL
|
||||
}
|
||||
if webhook, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["webhook"]); ok {
|
||||
delete(params.ExtraParams, "webhook")
|
||||
req.Webhook = webhook
|
||||
}
|
||||
if webhookID, ok := schemas.SafeExtractStringPointer(params.ExtraParams["webhook_id"]); ok {
|
||||
delete(params.ExtraParams, "webhook_id")
|
||||
req.WebhookID = webhookID
|
||||
}
|
||||
if temperature, ok := schemas.SafeExtractFloat64Pointer(params.ExtraParams["temperature"]); ok {
|
||||
delete(params.ExtraParams, "temperature")
|
||||
req.Temperature = temperature
|
||||
}
|
||||
if seed, ok := schemas.SafeExtractIntPointer(params.ExtraParams["seed"]); ok {
|
||||
delete(params.ExtraParams, "seed")
|
||||
req.Seed = seed
|
||||
}
|
||||
if useMultiChannel, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["use_multi_channel"]); ok {
|
||||
delete(params.ExtraParams, "use_multi_channel")
|
||||
req.UseMultiChannel = useMultiChannel
|
||||
}
|
||||
req.ExtraParams = bifrostReq.Params.ExtraParams
|
||||
}
|
||||
|
||||
if len(params.AdditionalFormats) > 0 {
|
||||
additionalFormats := make([]ElevenlabsAdditionalFormat, 0, len(params.AdditionalFormats))
|
||||
for _, format := range params.AdditionalFormats {
|
||||
if converted, ok := convertAdditionalFormat(format); ok {
|
||||
additionalFormats = append(additionalFormats, converted)
|
||||
}
|
||||
}
|
||||
if len(additionalFormats) > 0 {
|
||||
req.AdditionalFormats = additionalFormats
|
||||
}
|
||||
}
|
||||
|
||||
if params.WebhookMetadata != nil {
|
||||
if metadataMap, ok := params.WebhookMetadata.(map[string]interface{}); ok {
|
||||
if len(metadataMap) > 0 {
|
||||
req.WebhookMetadata = metadataMap
|
||||
}
|
||||
} else {
|
||||
req.WebhookMetadata = params.WebhookMetadata
|
||||
}
|
||||
}
|
||||
|
||||
return req
|
||||
}
|
||||
|
||||
func ToBifrostTranscriptionResponse(chunks []ElevenlabsSpeechToTextChunkResponse) *schemas.BifrostTranscriptionResponse {
|
||||
if len(chunks) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
textParts := make([]string, 0, len(chunks))
|
||||
allWords := make([]schemas.TranscriptionWord, 0)
|
||||
allLogProbs := make([]schemas.TranscriptionLogProb, 0)
|
||||
|
||||
var language *string
|
||||
var overallDuration *float64
|
||||
|
||||
for _, chunk := range chunks {
|
||||
textParts = append(textParts, chunk.Text)
|
||||
|
||||
words, logProbs, chunkDuration := convertWords(chunk.Words)
|
||||
allWords = append(allWords, words...)
|
||||
allLogProbs = append(allLogProbs, logProbs...)
|
||||
|
||||
if language == nil && chunk.LanguageCode != "" {
|
||||
lc := chunk.LanguageCode
|
||||
language = &lc
|
||||
}
|
||||
|
||||
if chunkDuration != nil {
|
||||
if overallDuration == nil || *chunkDuration > *overallDuration {
|
||||
val := *chunkDuration
|
||||
overallDuration = &val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
text := strings.Join(textParts, "\n")
|
||||
|
||||
response := &schemas.BifrostTranscriptionResponse{
|
||||
Text: text,
|
||||
Words: allWords,
|
||||
LogProbs: allLogProbs,
|
||||
}
|
||||
|
||||
if language != nil {
|
||||
response.Language = language
|
||||
}
|
||||
|
||||
if overallDuration != nil {
|
||||
response.Duration = overallDuration
|
||||
}
|
||||
|
||||
return response
|
||||
|
||||
}
|
||||
|
||||
func convertAdditionalFormat(format schemas.TranscriptionAdditionalFormat) (ElevenlabsAdditionalFormat, bool) {
|
||||
if format.Format == "" {
|
||||
return ElevenlabsAdditionalFormat{}, false
|
||||
}
|
||||
|
||||
converted := ElevenlabsAdditionalFormat{
|
||||
Format: ElevenlabsExportOptions(format.Format),
|
||||
}
|
||||
|
||||
if format.IncludeSpeakers != nil {
|
||||
converted.IncludeSpeakers = format.IncludeSpeakers
|
||||
}
|
||||
|
||||
if format.IncludeTimestamps != nil {
|
||||
converted.IncludeTimestamps = format.IncludeTimestamps
|
||||
}
|
||||
|
||||
if format.SegmentOnSilenceLongerThanS != nil {
|
||||
converted.SegmentOnSilenceLongerThanS = format.SegmentOnSilenceLongerThanS
|
||||
}
|
||||
|
||||
if format.MaxSegmentDurationS != nil {
|
||||
converted.MaxSegmentDurationS = format.MaxSegmentDurationS
|
||||
}
|
||||
|
||||
if format.MaxSegmentChars != nil {
|
||||
converted.MaxSegmentChars = format.MaxSegmentChars
|
||||
}
|
||||
|
||||
if format.MaxCharactersPerLine != nil {
|
||||
converted.MaxCharactersPerLine = format.MaxCharactersPerLine
|
||||
}
|
||||
|
||||
return converted, true
|
||||
}
|
||||
|
||||
func convertWords(words []ElevenlabsSpeechToTextWord) ([]schemas.TranscriptionWord, []schemas.TranscriptionLogProb, *float64) {
|
||||
if len(words) == 0 {
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
convertedWords := make([]schemas.TranscriptionWord, 0, len(words))
|
||||
logProbs := make([]schemas.TranscriptionLogProb, 0, len(words))
|
||||
|
||||
var maxEnd float64
|
||||
var hasEnd bool
|
||||
|
||||
for _, word := range words {
|
||||
trimmed := strings.TrimSpace(word.Text)
|
||||
if word.Type == "spacing" && trimmed == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
transcriptionWord := schemas.TranscriptionWord{
|
||||
Word: word.Text,
|
||||
}
|
||||
|
||||
if word.Start != nil {
|
||||
transcriptionWord.Start = *word.Start
|
||||
}
|
||||
|
||||
if word.End != nil {
|
||||
transcriptionWord.End = *word.End
|
||||
if !hasEnd || *word.End > maxEnd {
|
||||
maxEnd = *word.End
|
||||
hasEnd = true
|
||||
}
|
||||
}
|
||||
|
||||
convertedWords = append(convertedWords, transcriptionWord)
|
||||
logProbs = append(logProbs, schemas.TranscriptionLogProb{
|
||||
Token: word.Text,
|
||||
LogProb: word.LogProb,
|
||||
})
|
||||
}
|
||||
|
||||
if !hasEnd {
|
||||
return convertedWords, logProbs, nil
|
||||
}
|
||||
|
||||
duration := maxEnd
|
||||
return convertedWords, logProbs, &duration
|
||||
}
|
||||
|
||||
func parseTranscriptionResponse(body []byte) ([]ElevenlabsSpeechToTextChunkResponse, error) {
|
||||
var multichannel ElevenlabsMultichannelSpeechToTextResponse
|
||||
if err := sonic.Unmarshal(body, &multichannel); err == nil && len(multichannel.Transcripts) > 0 {
|
||||
return multichannel.Transcripts, nil
|
||||
}
|
||||
|
||||
var single ElevenlabsSpeechToTextChunkResponse
|
||||
if err := sonic.Unmarshal(body, &single); err == nil {
|
||||
if single.LanguageCode != "" || single.Text != "" || len(single.Words) > 0 {
|
||||
return []ElevenlabsSpeechToTextChunkResponse{single}, nil
|
||||
}
|
||||
}
|
||||
|
||||
var webhook ElevenlabsSpeechToTextWebhookResponse
|
||||
if err := sonic.Unmarshal(body, &webhook); err == nil && strings.TrimSpace(webhook.Message) != "" {
|
||||
return nil, errors.New(webhook.Message)
|
||||
}
|
||||
|
||||
return nil, errors.New("unexpected Elevenlabs transcription response format")
|
||||
}
|
||||
Reference in New Issue
Block a user