157 lines
7.5 KiB
Go
157 lines
7.5 KiB
Go
package schemas
|
|
|
|
type BifrostTranscriptionRequest struct {
|
|
Provider ModelProvider `json:"provider"`
|
|
Model string `json:"model"`
|
|
Input *TranscriptionInput `json:"input,omitempty"`
|
|
Params *TranscriptionParameters `json:"params,omitempty"`
|
|
Fallbacks []Fallback `json:"fallbacks,omitempty"`
|
|
RawRequestBody []byte `json:"-"` // set bifrost-use-raw-request-body to true in ctx to use the raw request body. Bifrost will directly send this to the downstream provider.
|
|
}
|
|
|
|
func (r *BifrostTranscriptionRequest) GetRawRequestBody() []byte {
|
|
return r.RawRequestBody
|
|
}
|
|
|
|
type BifrostTranscriptionResponse struct {
|
|
Duration *float64 `json:"duration,omitempty"` // Duration in seconds
|
|
Language *string `json:"language,omitempty"` // e.g., "english"
|
|
LogProbs []TranscriptionLogProb `json:"logprobs,omitempty"`
|
|
Segments []TranscriptionSegment `json:"segments,omitempty"`
|
|
Task *string `json:"task,omitempty"` // e.g., "transcribe"
|
|
Text string `json:"text"`
|
|
Usage *TranscriptionUsage `json:"usage,omitempty"`
|
|
Words []TranscriptionWord `json:"words,omitempty"`
|
|
ResponseFormat *string `json:"-"` // Set by provider for non-JSON formats (text, srt, vtt); used by integration response converters
|
|
ExtraFields BifrostResponseExtraFields `json:"extra_fields"`
|
|
}
|
|
|
|
func (r *BifrostTranscriptionResponse) BackfillParams(req *BifrostTranscriptionRequest) {
|
|
if r == nil || req == nil || req.Params == nil || req.Params.ResponseFormat == nil {
|
|
return
|
|
}
|
|
r.ResponseFormat = req.Params.ResponseFormat
|
|
}
|
|
|
|
// IsPlainTextTranscriptionFormat returns true if the given response format
|
|
// produces a plain-text response body (not JSON).
|
|
func IsPlainTextTranscriptionFormat(format *string) bool {
|
|
if format == nil {
|
|
return false
|
|
}
|
|
switch *format {
|
|
case "text", "srt", "vtt":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
type TranscriptionInput struct {
|
|
File []byte `json:"file"`
|
|
Filename string `json:"filename,omitempty"` // Original filename, used to preserve file format extension
|
|
}
|
|
|
|
type TranscriptionParameters struct {
|
|
Language *string `json:"language,omitempty"`
|
|
Prompt *string `json:"prompt,omitempty"`
|
|
ResponseFormat *string `json:"response_format,omitempty"` // Default is "json"
|
|
Temperature *float64 `json:"temperature,omitempty"` // Sampling temperature (0.0-1.0)
|
|
TimestampGranularities []string `json:"timestamp_granularities,omitempty"` // "word" and/or "segment"; requires response_format=verbose_json
|
|
Include []string `json:"include,omitempty"` // Additional response info (e.g., logprobs)
|
|
Format *string `json:"file_format,omitempty"` // Type of file, not required in openai, but required in gemini
|
|
MaxLength *int `json:"max_length,omitempty"` // Maximum length of the transcription used by HuggingFace
|
|
MinLength *int `json:"min_length,omitempty"` // Minimum length of the transcription used by HuggingFace
|
|
MaxNewTokens *int `json:"max_new_tokens,omitempty"` // Maximum new tokens to generate used by HuggingFace
|
|
MinNewTokens *int `json:"min_new_tokens,omitempty"` // Minimum new tokens to generate used by HuggingFace
|
|
|
|
// Elevenlabs-specific fields
|
|
AdditionalFormats []TranscriptionAdditionalFormat `json:"additional_formats,omitempty"`
|
|
WebhookMetadata interface{} `json:"webhook_metadata,omitempty"`
|
|
|
|
// Dynamic parameters that can be provider-specific, they are directly
|
|
// added to the request as is.
|
|
ExtraParams map[string]interface{} `json:"-"`
|
|
}
|
|
|
|
type TranscriptionAdditionalFormat struct {
|
|
Format TranscriptionExportOptions `json:"format"`
|
|
IncludeSpeakers *bool `json:"include_speakers,omitempty"`
|
|
IncludeTimestamps *bool `json:"include_timestamps,omitempty"`
|
|
SegmentOnSilenceLongerThanS *float64 `json:"segment_on_silence_longer_than_s,omitempty"`
|
|
MaxSegmentDurationS *float64 `json:"max_segment_duration_s,omitempty"`
|
|
MaxSegmentChars *int `json:"max_segment_chars,omitempty"`
|
|
MaxCharactersPerLine *int `json:"max_characters_per_line,omitempty"`
|
|
}
|
|
|
|
type TranscriptionExportOptions string
|
|
|
|
const (
|
|
TranscriptionExportOptionsSegmentedJson TranscriptionExportOptions = "segmented_json"
|
|
TranscriptionExportOptionsDocx TranscriptionExportOptions = "docx"
|
|
TranscriptionExportOptionsPdf TranscriptionExportOptions = "pdf"
|
|
TranscriptionExportOptionsTxt TranscriptionExportOptions = "txt"
|
|
TranscriptionExportOptionsHtml TranscriptionExportOptions = "html"
|
|
TranscriptionExportOptionsSrt TranscriptionExportOptions = "srt"
|
|
)
|
|
|
|
// TranscriptionLogProb represents log probability information for transcription
|
|
type TranscriptionLogProb struct {
|
|
Token string `json:"token"`
|
|
LogProb float64 `json:"logprob"`
|
|
Bytes []int `json:"bytes"`
|
|
}
|
|
|
|
// TranscriptionWord represents word-level timing information
|
|
type TranscriptionWord struct {
|
|
Word string `json:"word"`
|
|
Start float64 `json:"start"`
|
|
End float64 `json:"end"`
|
|
}
|
|
|
|
// TranscriptionSegment represents segment-level transcription information
|
|
type TranscriptionSegment struct {
|
|
ID int `json:"id"`
|
|
Seek int `json:"seek"`
|
|
Start float64 `json:"start"`
|
|
End float64 `json:"end"`
|
|
Text string `json:"text"`
|
|
Tokens []int `json:"tokens"`
|
|
Temperature float64 `json:"temperature"`
|
|
AvgLogProb float64 `json:"avg_logprob"`
|
|
CompressionRatio float64 `json:"compression_ratio"`
|
|
NoSpeechProb float64 `json:"no_speech_prob"`
|
|
}
|
|
|
|
// TranscriptionUsage represents usage information for transcription
|
|
type TranscriptionUsage struct {
|
|
Type string `json:"type"` // "tokens" or "duration"
|
|
InputTokens *int `json:"input_tokens,omitempty"`
|
|
InputTokenDetails *TranscriptionUsageInputTokenDetails `json:"input_token_details,omitempty"`
|
|
OutputTokens *int `json:"output_tokens,omitempty"`
|
|
TotalTokens *int `json:"total_tokens,omitempty"`
|
|
Seconds *int `json:"seconds,omitempty"` // For duration-based usage
|
|
}
|
|
|
|
type TranscriptionUsageInputTokenDetails struct {
|
|
TextTokens int `json:"text_tokens"`
|
|
AudioTokens int `json:"audio_tokens"`
|
|
}
|
|
|
|
type TranscriptionStreamResponseType string
|
|
|
|
const (
|
|
TranscriptionStreamResponseTypeDelta TranscriptionStreamResponseType = "transcript.text.delta"
|
|
TranscriptionStreamResponseTypeDone TranscriptionStreamResponseType = "transcript.text.done"
|
|
)
|
|
|
|
// BifrostTranscriptionStreamResponse represents streaming specific fields only
|
|
type BifrostTranscriptionStreamResponse struct {
|
|
Delta *string `json:"delta,omitempty"` // For delta events
|
|
LogProbs []TranscriptionLogProb `json:"logprobs,omitempty"`
|
|
Text string `json:"text"`
|
|
Type TranscriptionStreamResponseType `json:"type"`
|
|
Usage *TranscriptionUsage `json:"usage,omitempty"`
|
|
ExtraFields BifrostResponseExtraFields `json:"extra_fields"`
|
|
}
|