bifrost/core/schemas/transcriptions.go

package schemas

// BifrostTranscriptionRequest describes a single transcription call: the
// target provider and model, the file to transcribe, optional tuning
// parameters, and an optional list of Fallback entries.
//
// Input, Params and Fallbacks are dropped from the JSON encoding when unset.
// RawRequestBody is never part of the JSON encoding (json:"-"); see the field
// comment for how it is used.
type BifrostTranscriptionRequest struct {
	Provider       ModelProvider            `json:"provider"`
	Model          string                   `json:"model"`
	Input          *TranscriptionInput      `json:"input,omitempty"`
	Params         *TranscriptionParameters `json:"params,omitempty"`
	Fallbacks      []Fallback               `json:"fallbacks,omitempty"`
	RawRequestBody []byte                   `json:"-"` // set bifrost-use-raw-request-body to true in ctx to use the raw request body. Bifrost will directly send this to the downstream provider.
}

// GetRawRequestBody returns the raw request body attached to the request, or
// nil when none was set.
//
// It is safe to call on a nil receiver, in which case it returns nil instead
// of panicking on the field access.
func (r *BifrostTranscriptionRequest) GetRawRequestBody() []byte {
	if r == nil {
		return nil
	}
	return r.RawRequestBody
}

// BifrostTranscriptionResponse is the result of a transcription call.
//
// Text and ExtraFields are always present in the JSON encoding; every other
// serialized field is omitted when unset. ResponseFormat is never serialized
// (json:"-"); it only travels in-process, as described in its field comment.
type BifrostTranscriptionResponse struct {
	Duration       *float64                   `json:"duration,omitempty"` // Duration in seconds
	Language       *string                    `json:"language,omitempty"` // e.g., "english"
	LogProbs       []TranscriptionLogProb     `json:"logprobs,omitempty"`
	Segments       []TranscriptionSegment     `json:"segments,omitempty"`
	Task           *string                    `json:"task,omitempty"` // e.g., "transcribe"
	Text           string                     `json:"text"`
	Usage          *TranscriptionUsage        `json:"usage,omitempty"`
	Words          []TranscriptionWord        `json:"words,omitempty"`
	ResponseFormat *string                    `json:"-"` // Set by provider for non-JSON formats (text, srt, vtt); used by integration response converters
	ExtraFields    BifrostResponseExtraFields `json:"extra_fields"`
}

// BackfillParams copies the response format requested in req onto the
// response, so the response carries the format the caller asked for.
//
// It does nothing when the receiver, the request, the request parameters, or
// the requested response format is nil. When a format is present it replaces
// whatever ResponseFormat the response already held; the pointer itself is
// shared with the request, not copied.
func (r *BifrostTranscriptionResponse) BackfillParams(req *BifrostTranscriptionRequest) {
	if r == nil || req == nil {
		return
	}
	params := req.Params
	if params == nil {
		return
	}
	if requested := params.ResponseFormat; requested != nil {
		r.ResponseFormat = requested
	}
}

// IsPlainTextTranscriptionFormat reports whether format names a response
// format whose body is plain text rather than JSON, i.e. exactly "text",
// "srt" or "vtt". The comparison is case-sensitive. A nil format and any
// other value yield false.
func IsPlainTextTranscriptionFormat(format *string) bool {
	if format == nil {
		return false
	}
	name := *format
	return name == "text" || name == "srt" || name == "vtt"
}

// TranscriptionInput carries the file to transcribe: its raw bytes and,
// optionally, the name it was uploaded under. File is always serialized;
// Filename is omitted from JSON when empty.
type TranscriptionInput struct {
	File     []byte `json:"file"`
	Filename string `json:"filename,omitempty"` // Original filename, used to preserve file format extension
}

// TranscriptionParameters holds the optional settings of a transcription
// request. Every serialized field is omitted from JSON when unset, so a zero
// value encodes to an empty object.
//
// Note that the Format field is serialized under the key "file_format", and
// that ExtraParams is never serialized (json:"-").
type TranscriptionParameters struct {
	Language               *string  `json:"language,omitempty"`
	Prompt                 *string  `json:"prompt,omitempty"`
	ResponseFormat         *string  `json:"response_format,omitempty"`         // Default is "json"
	Temperature            *float64 `json:"temperature,omitempty"`             // Sampling temperature (0.0-1.0)
	TimestampGranularities []string `json:"timestamp_granularities,omitempty"` // "word" and/or "segment"; requires response_format=verbose_json
	Include                []string `json:"include,omitempty"`                 // Additional response info (e.g., logprobs)
	Format                 *string  `json:"file_format,omitempty"`             // Type of file, not required in openai, but required in gemini
	MaxLength              *int     `json:"max_length,omitempty"`              // Maximum length of the transcription used by HuggingFace
	MinLength              *int     `json:"min_length,omitempty"`              // Minimum length of the transcription used by HuggingFace
	MaxNewTokens           *int     `json:"max_new_tokens,omitempty"`          // Maximum new tokens to generate used by HuggingFace
	MinNewTokens           *int     `json:"min_new_tokens,omitempty"`          // Minimum new tokens to generate used by HuggingFace

	// Elevenlabs-specific fields
	AdditionalFormats []TranscriptionAdditionalFormat `json:"additional_formats,omitempty"`
	WebhookMetadata   interface{}                     `json:"webhook_metadata,omitempty"`

	// Dynamic parameters that can be provider-specific, they are directly
	// added to the request as is.
	ExtraParams map[string]interface{} `json:"-"`
}

// TranscriptionAdditionalFormat describes one extra export format requested
// through TranscriptionParameters.AdditionalFormats (listed there among the
// Elevenlabs-specific fields).
//
// Format is required and always serialized; the remaining options are
// pointers and are omitted from JSON when nil.
// NOTE(review): the "S" suffix on SegmentOnSilenceLongerThanS and
// MaxSegmentDurationS presumably denotes seconds — confirm against the
// provider documentation.
type TranscriptionAdditionalFormat struct {
	Format                      TranscriptionExportOptions `json:"format"`
	IncludeSpeakers             *bool                      `json:"include_speakers,omitempty"`
	IncludeTimestamps           *bool                      `json:"include_timestamps,omitempty"`
	SegmentOnSilenceLongerThanS *float64                   `json:"segment_on_silence_longer_than_s,omitempty"`
	MaxSegmentDurationS         *float64                   `json:"max_segment_duration_s,omitempty"`
	MaxSegmentChars             *int                       `json:"max_segment_chars,omitempty"`
	MaxCharactersPerLine        *int                       `json:"max_characters_per_line,omitempty"`
}

// TranscriptionExportOptions names an export format for
// TranscriptionAdditionalFormat.Format. It is a plain string type, so the
// constants below serialize as their literal string values.
type TranscriptionExportOptions string

// Export format identifiers declared for TranscriptionExportOptions.
const (
	TranscriptionExportOptionsSegmentedJson TranscriptionExportOptions = "segmented_json"
	TranscriptionExportOptionsDocx          TranscriptionExportOptions = "docx"
	TranscriptionExportOptionsPdf           TranscriptionExportOptions = "pdf"
	TranscriptionExportOptionsTxt           TranscriptionExportOptions = "txt"
	TranscriptionExportOptionsHtml          TranscriptionExportOptions = "html"
	TranscriptionExportOptionsSrt           TranscriptionExportOptions = "srt"
)

// TranscriptionLogProb represents log probability information for a single
// token of a transcription. All three fields are always serialized (none use
// omitempty), so a nil Bytes slice encodes as JSON null.
// NOTE(review): Bytes presumably holds the byte values of Token — confirm.
type TranscriptionLogProb struct {
	Token   string  `json:"token"`
	LogProb float64 `json:"logprob"`
	Bytes   []int   `json:"bytes"`
}

// TranscriptionWord represents word-level timing information: the word and
// the start and end of its span. All fields are always serialized.
// NOTE(review): Start and End are presumably offsets in seconds — confirm.
type TranscriptionWord struct {
	Word  string  `json:"word"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}

// TranscriptionSegment represents segment-level transcription information:
// the segment's identifier and position, its text and tokens, and several
// per-segment statistics. All fields are always serialized (none use
// omitempty), so zero values and a nil Tokens slice still appear in the JSON.
// NOTE(review): Start and End are presumably offsets in seconds — confirm.
type TranscriptionSegment struct {
	ID               int     `json:"id"`
	Seek             int     `json:"seek"`
	Start            float64 `json:"start"`
	End              float64 `json:"end"`
	Text             string  `json:"text"`
	Tokens           []int   `json:"tokens"`
	Temperature      float64 `json:"temperature"`
	AvgLogProb       float64 `json:"avg_logprob"`
	CompressionRatio float64 `json:"compression_ratio"`
	NoSpeechProb     float64 `json:"no_speech_prob"`
}

// TranscriptionUsage represents usage information for transcription.
//
// Type is always serialized and distinguishes the two usage shapes noted on
// the fields: token-based ("tokens") and duration-based ("duration"). The
// remaining fields are pointers and are omitted from JSON when nil.
type TranscriptionUsage struct {
	Type              string                               `json:"type"` // "tokens" or "duration"
	InputTokens       *int                                 `json:"input_tokens,omitempty"`
	InputTokenDetails *TranscriptionUsageInputTokenDetails `json:"input_token_details,omitempty"`
	OutputTokens      *int                                 `json:"output_tokens,omitempty"`
	TotalTokens       *int                                 `json:"total_tokens,omitempty"`
	Seconds           *int                                 `json:"seconds,omitempty"` // For duration-based usage
}

// TranscriptionUsageInputTokenDetails breaks the input token count of a
// TranscriptionUsage down into text tokens and audio tokens. Both counts are
// always serialized, including when they are zero.
type TranscriptionUsageInputTokenDetails struct {
	TextTokens  int `json:"text_tokens"`
	AudioTokens int `json:"audio_tokens"`
}

// TranscriptionStreamResponseType identifies the kind of event carried by a
// BifrostTranscriptionStreamResponse. It is a plain string type, so values
// serialize as their literal strings.
type TranscriptionStreamResponseType string

// Event types declared for transcription streams: a delta event
// ("transcript.text.delta") and a done event ("transcript.text.done").
const (
	TranscriptionStreamResponseTypeDelta TranscriptionStreamResponseType = "transcript.text.delta"
	TranscriptionStreamResponseTypeDone  TranscriptionStreamResponseType = "transcript.text.done"
)

// BifrostTranscriptionStreamResponse represents streaming specific fields only.
//
// Text, Type and ExtraFields are always serialized; Delta, LogProbs and Usage
// are omitted from JSON when unset. Type holds a
// TranscriptionStreamResponseType value identifying the event.
type BifrostTranscriptionStreamResponse struct {
	Delta       *string                         `json:"delta,omitempty"` // For delta events
	LogProbs    []TranscriptionLogProb          `json:"logprobs,omitempty"`
	Text        string                          `json:"text"`
	Type        TranscriptionStreamResponseType `json:"type"`
	Usage       *TranscriptionUsage             `json:"usage,omitempty"`
	ExtraFields BifrostResponseExtraFields      `json:"extra_fields"`
}