270 lines
7.6 KiB
Go
270 lines
7.6 KiB
Go
package elevenlabs
|
|
|
|
import (
|
|
"errors"
|
|
"strings"
|
|
|
|
"github.com/bytedance/sonic"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
)
|
|
|
|
func ToElevenlabsTranscriptionRequest(bifrostReq *schemas.BifrostTranscriptionRequest) *ElevenlabsTranscriptionRequest {
|
|
if bifrostReq == nil {
|
|
return nil
|
|
}
|
|
|
|
req := &ElevenlabsTranscriptionRequest{
|
|
ModelID: bifrostReq.Model,
|
|
}
|
|
|
|
if bifrostReq.Input != nil && len(bifrostReq.Input.File) > 0 {
|
|
req.File = bifrostReq.Input.File
|
|
req.Filename = bifrostReq.Input.Filename
|
|
}
|
|
|
|
if bifrostReq.Params == nil {
|
|
return req
|
|
}
|
|
|
|
params := bifrostReq.Params
|
|
|
|
if params.Language != nil {
|
|
req.LanguageCode = params.Language
|
|
}
|
|
|
|
if params.ExtraParams != nil {
|
|
if tagAudioEvents, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["tag_audio_events"]); ok {
|
|
delete(params.ExtraParams, "tag_audio_events")
|
|
req.TagAudioEvents = tagAudioEvents
|
|
}
|
|
if numSpeakers, ok := schemas.SafeExtractIntPointer(params.ExtraParams["num_speakers"]); ok {
|
|
delete(params.ExtraParams, "num_speakers")
|
|
req.NumSpeakers = numSpeakers
|
|
}
|
|
if timestampsGranularity, ok := schemas.SafeExtractStringPointer(params.ExtraParams["timestamps_granularity"]); ok {
|
|
granularity := ElevenlabsTimestampsGranularity(*timestampsGranularity)
|
|
delete(params.ExtraParams, "timestamps_granularity")
|
|
req.TimestampsGranularity = &granularity
|
|
}
|
|
if diarize, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["diarize"]); ok {
|
|
delete(params.ExtraParams, "diarize")
|
|
req.Diarize = diarize
|
|
}
|
|
if diarizationThreshold, ok := schemas.SafeExtractFloat64Pointer(params.ExtraParams["diarization_threshold"]); ok {
|
|
delete(params.ExtraParams, "diarization_threshold")
|
|
req.DiarizationThreshold = diarizationThreshold
|
|
}
|
|
if fileFormat, ok := schemas.SafeExtractStringPointer(params.ExtraParams["file_format"]); ok {
|
|
fileFormat := ElevenlabsFileFormat(*fileFormat)
|
|
delete(params.ExtraParams, "file_format")
|
|
req.FileFormat = &fileFormat
|
|
}
|
|
if cloudStorageURL, ok := schemas.SafeExtractStringPointer(params.ExtraParams["cloud_storage_url"]); ok {
|
|
delete(params.ExtraParams, "cloud_storage_url")
|
|
req.CloudStorageURL = cloudStorageURL
|
|
}
|
|
if webhook, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["webhook"]); ok {
|
|
delete(params.ExtraParams, "webhook")
|
|
req.Webhook = webhook
|
|
}
|
|
if webhookID, ok := schemas.SafeExtractStringPointer(params.ExtraParams["webhook_id"]); ok {
|
|
delete(params.ExtraParams, "webhook_id")
|
|
req.WebhookID = webhookID
|
|
}
|
|
if temperature, ok := schemas.SafeExtractFloat64Pointer(params.ExtraParams["temperature"]); ok {
|
|
delete(params.ExtraParams, "temperature")
|
|
req.Temperature = temperature
|
|
}
|
|
if seed, ok := schemas.SafeExtractIntPointer(params.ExtraParams["seed"]); ok {
|
|
delete(params.ExtraParams, "seed")
|
|
req.Seed = seed
|
|
}
|
|
if useMultiChannel, ok := schemas.SafeExtractBoolPointer(params.ExtraParams["use_multi_channel"]); ok {
|
|
delete(params.ExtraParams, "use_multi_channel")
|
|
req.UseMultiChannel = useMultiChannel
|
|
}
|
|
req.ExtraParams = bifrostReq.Params.ExtraParams
|
|
}
|
|
|
|
if len(params.AdditionalFormats) > 0 {
|
|
additionalFormats := make([]ElevenlabsAdditionalFormat, 0, len(params.AdditionalFormats))
|
|
for _, format := range params.AdditionalFormats {
|
|
if converted, ok := convertAdditionalFormat(format); ok {
|
|
additionalFormats = append(additionalFormats, converted)
|
|
}
|
|
}
|
|
if len(additionalFormats) > 0 {
|
|
req.AdditionalFormats = additionalFormats
|
|
}
|
|
}
|
|
|
|
if params.WebhookMetadata != nil {
|
|
if metadataMap, ok := params.WebhookMetadata.(map[string]interface{}); ok {
|
|
if len(metadataMap) > 0 {
|
|
req.WebhookMetadata = metadataMap
|
|
}
|
|
} else {
|
|
req.WebhookMetadata = params.WebhookMetadata
|
|
}
|
|
}
|
|
|
|
return req
|
|
}
|
|
|
|
func ToBifrostTranscriptionResponse(chunks []ElevenlabsSpeechToTextChunkResponse) *schemas.BifrostTranscriptionResponse {
|
|
if len(chunks) == 0 {
|
|
return nil
|
|
}
|
|
|
|
textParts := make([]string, 0, len(chunks))
|
|
allWords := make([]schemas.TranscriptionWord, 0)
|
|
allLogProbs := make([]schemas.TranscriptionLogProb, 0)
|
|
|
|
var language *string
|
|
var overallDuration *float64
|
|
|
|
for _, chunk := range chunks {
|
|
textParts = append(textParts, chunk.Text)
|
|
|
|
words, logProbs, chunkDuration := convertWords(chunk.Words)
|
|
allWords = append(allWords, words...)
|
|
allLogProbs = append(allLogProbs, logProbs...)
|
|
|
|
if language == nil && chunk.LanguageCode != "" {
|
|
lc := chunk.LanguageCode
|
|
language = &lc
|
|
}
|
|
|
|
if chunkDuration != nil {
|
|
if overallDuration == nil || *chunkDuration > *overallDuration {
|
|
val := *chunkDuration
|
|
overallDuration = &val
|
|
}
|
|
}
|
|
}
|
|
|
|
text := strings.Join(textParts, "\n")
|
|
|
|
response := &schemas.BifrostTranscriptionResponse{
|
|
Text: text,
|
|
Words: allWords,
|
|
LogProbs: allLogProbs,
|
|
}
|
|
|
|
if language != nil {
|
|
response.Language = language
|
|
}
|
|
|
|
if overallDuration != nil {
|
|
response.Duration = overallDuration
|
|
}
|
|
|
|
return response
|
|
|
|
}
|
|
|
|
func convertAdditionalFormat(format schemas.TranscriptionAdditionalFormat) (ElevenlabsAdditionalFormat, bool) {
|
|
if format.Format == "" {
|
|
return ElevenlabsAdditionalFormat{}, false
|
|
}
|
|
|
|
converted := ElevenlabsAdditionalFormat{
|
|
Format: ElevenlabsExportOptions(format.Format),
|
|
}
|
|
|
|
if format.IncludeSpeakers != nil {
|
|
converted.IncludeSpeakers = format.IncludeSpeakers
|
|
}
|
|
|
|
if format.IncludeTimestamps != nil {
|
|
converted.IncludeTimestamps = format.IncludeTimestamps
|
|
}
|
|
|
|
if format.SegmentOnSilenceLongerThanS != nil {
|
|
converted.SegmentOnSilenceLongerThanS = format.SegmentOnSilenceLongerThanS
|
|
}
|
|
|
|
if format.MaxSegmentDurationS != nil {
|
|
converted.MaxSegmentDurationS = format.MaxSegmentDurationS
|
|
}
|
|
|
|
if format.MaxSegmentChars != nil {
|
|
converted.MaxSegmentChars = format.MaxSegmentChars
|
|
}
|
|
|
|
if format.MaxCharactersPerLine != nil {
|
|
converted.MaxCharactersPerLine = format.MaxCharactersPerLine
|
|
}
|
|
|
|
return converted, true
|
|
}
|
|
|
|
func convertWords(words []ElevenlabsSpeechToTextWord) ([]schemas.TranscriptionWord, []schemas.TranscriptionLogProb, *float64) {
|
|
if len(words) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
|
|
convertedWords := make([]schemas.TranscriptionWord, 0, len(words))
|
|
logProbs := make([]schemas.TranscriptionLogProb, 0, len(words))
|
|
|
|
var maxEnd float64
|
|
var hasEnd bool
|
|
|
|
for _, word := range words {
|
|
trimmed := strings.TrimSpace(word.Text)
|
|
if word.Type == "spacing" && trimmed == "" {
|
|
continue
|
|
}
|
|
|
|
transcriptionWord := schemas.TranscriptionWord{
|
|
Word: word.Text,
|
|
}
|
|
|
|
if word.Start != nil {
|
|
transcriptionWord.Start = *word.Start
|
|
}
|
|
|
|
if word.End != nil {
|
|
transcriptionWord.End = *word.End
|
|
if !hasEnd || *word.End > maxEnd {
|
|
maxEnd = *word.End
|
|
hasEnd = true
|
|
}
|
|
}
|
|
|
|
convertedWords = append(convertedWords, transcriptionWord)
|
|
logProbs = append(logProbs, schemas.TranscriptionLogProb{
|
|
Token: word.Text,
|
|
LogProb: word.LogProb,
|
|
})
|
|
}
|
|
|
|
if !hasEnd {
|
|
return convertedWords, logProbs, nil
|
|
}
|
|
|
|
duration := maxEnd
|
|
return convertedWords, logProbs, &duration
|
|
}
|
|
|
|
func parseTranscriptionResponse(body []byte) ([]ElevenlabsSpeechToTextChunkResponse, error) {
|
|
var multichannel ElevenlabsMultichannelSpeechToTextResponse
|
|
if err := sonic.Unmarshal(body, &multichannel); err == nil && len(multichannel.Transcripts) > 0 {
|
|
return multichannel.Transcripts, nil
|
|
}
|
|
|
|
var single ElevenlabsSpeechToTextChunkResponse
|
|
if err := sonic.Unmarshal(body, &single); err == nil {
|
|
if single.LanguageCode != "" || single.Text != "" || len(single.Words) > 0 {
|
|
return []ElevenlabsSpeechToTextChunkResponse{single}, nil
|
|
}
|
|
}
|
|
|
|
var webhook ElevenlabsSpeechToTextWebhookResponse
|
|
if err := sonic.Unmarshal(body, &webhook); err == nil && strings.TrimSpace(webhook.Message) != "" {
|
|
return nil, errors.New(webhook.Message)
|
|
}
|
|
|
|
return nil, errors.New("unexpected Elevenlabs transcription response format")
|
|
}
|