Files
bifrost/core/schemas/transcriptions.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

157 lines
7.5 KiB
Go

package schemas
// BifrostTranscriptionRequest is the request payload for a transcription call.
// It names the target provider and model and carries the optional input,
// parameters, and fallbacks.
type BifrostTranscriptionRequest struct {
	Provider  ModelProvider            `json:"provider"`
	Model     string                   `json:"model"`
	Input     *TranscriptionInput      `json:"input,omitempty"`
	Params    *TranscriptionParameters `json:"params,omitempty"`
	Fallbacks []Fallback               `json:"fallbacks,omitempty"`

	// RawRequestBody is never serialized to JSON. Set
	// bifrost-use-raw-request-body to true in ctx to use the raw request body;
	// Bifrost will directly send this to the downstream provider.
	RawRequestBody []byte `json:"-"`
}
// GetRawRequestBody returns the RawRequestBody stored on the request.
//
// It is safe to call on a nil receiver, in which case it returns nil rather
// than panicking on the field access.
func (r *BifrostTranscriptionRequest) GetRawRequestBody() []byte {
	if r == nil {
		return nil
	}
	return r.RawRequestBody
}
// BifrostTranscriptionResponse is the response payload for a transcription
// call. Text and ExtraFields are always present in the JSON encoding, every
// other serialized field is omitted when empty, and ResponseFormat is never
// serialized.
type BifrostTranscriptionResponse struct {
	// Duration in seconds.
	Duration *float64 `json:"duration,omitempty"`

	// Language, e.g., "english".
	Language *string `json:"language,omitempty"`

	// LogProbs carries log probability information for the transcription.
	LogProbs []TranscriptionLogProb `json:"logprobs,omitempty"`

	// Segments carries segment-level transcription information.
	Segments []TranscriptionSegment `json:"segments,omitempty"`

	// Task, e.g., "transcribe".
	Task *string `json:"task,omitempty"`

	// Text is always emitted under the "text" key.
	Text string `json:"text"`

	// Usage carries usage information for the transcription.
	Usage *TranscriptionUsage `json:"usage,omitempty"`

	// Words carries word-level timing information.
	Words []TranscriptionWord `json:"words,omitempty"`

	// ResponseFormat is set by the provider for non-JSON formats (text, srt,
	// vtt); it is used by integration response converters.
	ResponseFormat *string `json:"-"`

	// ExtraFields is always emitted under the "extra_fields" key.
	ExtraFields BifrostResponseExtraFields `json:"extra_fields"`
}
// BackfillParams copies the response format requested in req.Params onto the
// response's ResponseFormat field.
//
// It does nothing when the receiver, req, req.Params, or the requested
// response format is nil.
func (r *BifrostTranscriptionResponse) BackfillParams(req *BifrostTranscriptionRequest) {
	if r == nil || req == nil {
		return
	}
	params := req.Params
	if params == nil {
		return
	}
	if requested := params.ResponseFormat; requested != nil {
		r.ResponseFormat = requested
	}
}
// IsPlainTextTranscriptionFormat reports whether the given response format
// produces a plain-text response body (not JSON).
//
// The plain-text formats are exactly "text", "srt", and "vtt" (matched
// case-sensitively). A nil format is not plain text.
func IsPlainTextTranscriptionFormat(format *string) bool {
	if format == nil {
		return false
	}
	name := *format
	return name == "text" || name == "srt" || name == "vtt"
}
// TranscriptionInput is the input of a transcription request: the file
// contents and, optionally, the file's original name.
type TranscriptionInput struct {
	// File is always emitted under the "file" key.
	File []byte `json:"file"`

	// Filename is the original filename, used to preserve the file format
	// extension.
	Filename string `json:"filename,omitempty"`
}
// TranscriptionParameters holds the optional parameters of a transcription
// request. Every serialized field is omitted from JSON when unset;
// ExtraParams is never serialized.
type TranscriptionParameters struct {
	Language *string `json:"language,omitempty"`
	Prompt   *string `json:"prompt,omitempty"`

	// ResponseFormat defaults to "json".
	ResponseFormat *string `json:"response_format,omitempty"`

	// Temperature is the sampling temperature (0.0-1.0).
	Temperature *float64 `json:"temperature,omitempty"`

	// TimestampGranularities accepts "word" and/or "segment"; it requires
	// response_format=verbose_json.
	TimestampGranularities []string `json:"timestamp_granularities,omitempty"`

	// Include requests additional response info (e.g., logprobs).
	Include []string `json:"include,omitempty"`

	// Format is the type of file; not required in openai, but required in
	// gemini. Note that it is serialized under the "file_format" key.
	Format *string `json:"file_format,omitempty"`

	// MaxLength is the maximum length of the transcription, used by
	// HuggingFace.
	MaxLength *int `json:"max_length,omitempty"`

	// MinLength is the minimum length of the transcription, used by
	// HuggingFace.
	MinLength *int `json:"min_length,omitempty"`

	// MaxNewTokens is the maximum number of new tokens to generate, used by
	// HuggingFace.
	MaxNewTokens *int `json:"max_new_tokens,omitempty"`

	// MinNewTokens is the minimum number of new tokens to generate, used by
	// HuggingFace.
	MinNewTokens *int `json:"min_new_tokens,omitempty"`

	// Elevenlabs-specific fields.
	AdditionalFormats []TranscriptionAdditionalFormat `json:"additional_formats,omitempty"`
	WebhookMetadata   interface{}                     `json:"webhook_metadata,omitempty"`

	// ExtraParams holds dynamic parameters that can be provider-specific;
	// they are directly added to the request as is.
	ExtraParams map[string]interface{} `json:"-"`
}
// TranscriptionAdditionalFormat is one entry of
// TranscriptionParameters.AdditionalFormats (grouped there with the
// Elevenlabs-specific fields): a required export format plus optional
// settings, each omitted from JSON when unset.
type TranscriptionAdditionalFormat struct {
	Format                      TranscriptionExportOptions `json:"format"`
	IncludeSpeakers             *bool                      `json:"include_speakers,omitempty"`
	IncludeTimestamps           *bool                      `json:"include_timestamps,omitempty"`
	SegmentOnSilenceLongerThanS *float64                   `json:"segment_on_silence_longer_than_s,omitempty"`
	MaxSegmentDurationS         *float64                   `json:"max_segment_duration_s,omitempty"`
	MaxSegmentChars             *int                       `json:"max_segment_chars,omitempty"`
	MaxCharactersPerLine        *int                       `json:"max_characters_per_line,omitempty"`
}
// TranscriptionExportOptions is the string-typed export format used by
// TranscriptionAdditionalFormat.Format.
type TranscriptionExportOptions string

// Declared TranscriptionExportOptions values.
const (
	TranscriptionExportOptionsSegmentedJson TranscriptionExportOptions = "segmented_json"
	TranscriptionExportOptionsDocx          TranscriptionExportOptions = "docx"
	TranscriptionExportOptionsPdf           TranscriptionExportOptions = "pdf"
	TranscriptionExportOptionsTxt           TranscriptionExportOptions = "txt"
	TranscriptionExportOptionsHtml          TranscriptionExportOptions = "html"
	TranscriptionExportOptionsSrt           TranscriptionExportOptions = "srt"
)
// TranscriptionLogProb represents log probability information for
// transcription: a token, its log probability, and its bytes. All three
// fields are always present in the JSON encoding.
type TranscriptionLogProb struct {
	Token   string  `json:"token"`
	LogProb float64 `json:"logprob"`
	Bytes   []int   `json:"bytes"`
}
// TranscriptionWord represents word-level timing information: a word together
// with its start and end values. All three fields are always present in the
// JSON encoding.
type TranscriptionWord struct {
	Word  string  `json:"word"`
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}
// TranscriptionSegment represents segment-level transcription information.
// Every field is always present in the JSON encoding.
type TranscriptionSegment struct {
	ID               int     `json:"id"`
	Seek             int     `json:"seek"`
	Start            float64 `json:"start"`
	End              float64 `json:"end"`
	Text             string  `json:"text"`
	Tokens           []int   `json:"tokens"`
	Temperature      float64 `json:"temperature"`
	AvgLogProb       float64 `json:"avg_logprob"`
	CompressionRatio float64 `json:"compression_ratio"`
	NoSpeechProb     float64 `json:"no_speech_prob"`
}
// TranscriptionUsage represents usage information for transcription. Type is
// always present in the JSON encoding; the remaining fields are omitted when
// unset.
type TranscriptionUsage struct {
	// Type is "tokens" or "duration".
	Type string `json:"type"`

	InputTokens       *int                                 `json:"input_tokens,omitempty"`
	InputTokenDetails *TranscriptionUsageInputTokenDetails `json:"input_token_details,omitempty"`
	OutputTokens      *int                                 `json:"output_tokens,omitempty"`
	TotalTokens       *int                                 `json:"total_tokens,omitempty"`

	// Seconds is for duration-based usage.
	Seconds *int `json:"seconds,omitempty"`
}
// TranscriptionUsageInputTokenDetails splits an input token count into text
// tokens and audio tokens; it is referenced by
// TranscriptionUsage.InputTokenDetails. Both fields are always present in the
// JSON encoding.
type TranscriptionUsageInputTokenDetails struct {
	TextTokens  int `json:"text_tokens"`
	AudioTokens int `json:"audio_tokens"`
}
// TranscriptionStreamResponseType is the string-typed event type carried in
// BifrostTranscriptionStreamResponse.Type.
type TranscriptionStreamResponseType string

// Declared TranscriptionStreamResponseType values.
const (
	TranscriptionStreamResponseTypeDelta TranscriptionStreamResponseType = "transcript.text.delta"
	TranscriptionStreamResponseTypeDone  TranscriptionStreamResponseType = "transcript.text.done"
)
// BifrostTranscriptionStreamResponse represents streaming specific fields
// only. Text, Type, and ExtraFields are always present in the JSON encoding;
// the remaining fields are omitted when empty.
type BifrostTranscriptionStreamResponse struct {
	// Delta is for delta events.
	Delta *string `json:"delta,omitempty"`

	LogProbs    []TranscriptionLogProb          `json:"logprobs,omitempty"`
	Text        string                          `json:"text"`
	Type        TranscriptionStreamResponseType `json:"type"`
	Usage       *TranscriptionUsage             `json:"usage,omitempty"`
	ExtraFields BifrostResponseExtraFields      `json:"extra_fields"`
}