// Source listing: bifrost/core/schemas/realtime.go
// Commit 880f412e2c ("first commit") by Beyhan Oğur, 2026-04-26 21:52:23 +03:00.
// Listing metadata: 311 lines, 14 KiB, Go.

package schemas
import "encoding/json"
// RealtimeEventType represents the type of a Bifrost unified Realtime event.
// It is a plain string type; the RTEvent* constants in this file enumerate the
// values this package names, and BifrostRealtimeEvent carries one in its
// "type" JSON field.
type RealtimeEventType string
// Client-to-server event types (sent by the client through Bifrost).
// Each value is the literal string carried in an event's "type" field.
const (
// session.* events.
RTEventSessionUpdate RealtimeEventType = "session.update"
// conversation.item.* events. Of these two, only the create event is
// recognised by IsRealtimeConversationItemEventType.
RTEventConversationItemCreate RealtimeEventType = "conversation.item.create"
RTEventConversationItemDelete RealtimeEventType = "conversation.item.delete"
// response.* events.
RTEventResponseCreate RealtimeEventType = "response.create"
RTEventResponseCancel RealtimeEventType = "response.cancel"
// input_audio_buffer.* events.
RTEventInputAudioAppend RealtimeEventType = "input_audio_buffer.append"
RTEventInputAudioCommit RealtimeEventType = "input_audio_buffer.commit"
RTEventInputAudioClear RealtimeEventType = "input_audio_buffer.clear"
)
// Server-to-client event types (received from the provider, forwarded to client).
// Each value is the literal string carried in an event's "type" field.
const (
// session.* events.
RTEventSessionCreated RealtimeEventType = "session.created"
RTEventSessionUpdated RealtimeEventType = "session.updated"
// conversation.* events. The four conversation.item.* values below are all
// accepted by IsRealtimeConversationItemEventType.
RTEventConversationCreated RealtimeEventType = "conversation.created"
RTEventConversationItemAdded RealtimeEventType = "conversation.item.added"
RTEventConversationItemCreated RealtimeEventType = "conversation.item.created"
RTEventConversationItemRetrieved RealtimeEventType = "conversation.item.retrieved"
RTEventConversationItemDone RealtimeEventType = "conversation.item.done"
// response.* events: whole-response, then text, audio, audio transcript,
// output item and content part, each with a paired start/delta and done value.
RTEventResponseCreated RealtimeEventType = "response.created"
RTEventResponseDone RealtimeEventType = "response.done"
RTEventResponseTextDelta RealtimeEventType = "response.text.delta"
RTEventResponseTextDone RealtimeEventType = "response.text.done"
RTEventResponseAudioDelta RealtimeEventType = "response.audio.delta"
RTEventResponseAudioDone RealtimeEventType = "response.audio.done"
RTEventResponseAudioTransDelta RealtimeEventType = "response.audio_transcript.delta"
RTEventResponseAudioTransDone RealtimeEventType = "response.audio_transcript.done"
RTEventResponseOutputItemAdded RealtimeEventType = "response.output_item.added"
RTEventResponseOutputItemDone RealtimeEventType = "response.output_item.done"
RTEventResponseContentPartAdded RealtimeEventType = "response.content_part.added"
RTEventResponseContentPartDone RealtimeEventType = "response.content_part.done"
// rate_limits.* events.
RTEventRateLimitsUpdated RealtimeEventType = "rate_limits.updated"
// Input-audio transcription events. Note the wire strings live under
// "conversation.item.", not "input_audio_buffer.", despite the constant names.
// The "completed" value is the one IsRealtimeInputTranscriptEvent matches.
RTEventInputAudioTransCompleted RealtimeEventType = "conversation.item.input_audio_transcription.completed"
RTEventInputAudioTransDelta RealtimeEventType = "conversation.item.input_audio_transcription.delta"
RTEventInputAudioTransFailed RealtimeEventType = "conversation.item.input_audio_transcription.failed"
// input_audio_buffer.* events.
RTEventInputAudioBufferCommitted RealtimeEventType = "input_audio_buffer.committed"
RTEventInputAudioBufferCleared RealtimeEventType = "input_audio_buffer.cleared"
RTEventInputAudioSpeechStarted RealtimeEventType = "input_audio_buffer.speech_started"
RTEventInputAudioSpeechStopped RealtimeEventType = "input_audio_buffer.speech_stopped"
// Error event.
RTEventError RealtimeEventType = "error"
)
// IsRealtimeConversationItemEventType reports whether eventType is one of the
// conversation-item events that carry a canonical item payload after provider
// translation: conversation.item.create, .added, .created, .retrieved or .done.
// Every other value, including conversation.item.delete, yields false.
func IsRealtimeConversationItemEventType(eventType RealtimeEventType) bool {
	return eventType == RTEventConversationItemCreate ||
		eventType == RTEventConversationItemAdded ||
		eventType == RTEventConversationItemCreated ||
		eventType == RTEventConversationItemRetrieved ||
		eventType == RTEventConversationItemDone
}
// IsRealtimeUserInputEvent reports whether event is a user input item in the
// canonical Bifrost realtime schema: a conversation-item event (as decided by
// IsRealtimeConversationItemEventType) whose item has the role "user".
// A nil event, or an event without an item, yields false.
func IsRealtimeUserInputEvent(event *BifrostRealtimeEvent) bool {
	if event == nil || event.Item == nil {
		return false
	}
	if event.Item.Role != "user" {
		return false
	}
	return IsRealtimeConversationItemEventType(event.Type)
}
// IsRealtimeToolOutputEvent reports whether event is a tool output item in the
// canonical Bifrost realtime schema: a conversation-item event (as decided by
// IsRealtimeConversationItemEventType) whose item type is
// "function_call_output". A nil event, or an event without an item, yields false.
func IsRealtimeToolOutputEvent(event *BifrostRealtimeEvent) bool {
	if event == nil || event.Item == nil {
		return false
	}
	if event.Item.Type != "function_call_output" {
		return false
	}
	return IsRealtimeConversationItemEventType(event.Type)
}
// IsRealtimeInputTranscriptEvent reports whether event carries a finalized
// input-audio transcript in the canonical Bifrost realtime schema, i.e. its
// type is RTEventInputAudioTransCompleted. A nil event yields false.
func IsRealtimeInputTranscriptEvent(event *BifrostRealtimeEvent) bool {
	if event == nil {
		return false
	}
	return event.Type == RTEventInputAudioTransCompleted
}
// BifrostRealtimeEvent is the unified Bifrost envelope for all Realtime events.
// Provider converters translate between this format and the provider-native protocol.
// Every field except Type is tagged omitempty.
type BifrostRealtimeEvent struct {
// Type is the event discriminator; see the RTEvent* constants.
Type RealtimeEventType `json:"type"`
EventID string `json:"event_id,omitempty"`
// Session, Item, Delta and Error are optional payloads, nil when absent.
Session *RealtimeSession `json:"session,omitempty"`
Item *RealtimeItem `json:"item,omitempty"`
Delta *RealtimeDelta `json:"delta,omitempty"`
// Audio holds raw bytes. NOTE(review): encoding/json renders []byte as a
// base64 string — confirm the package's own Marshal/Unmarshal helpers agree.
Audio []byte `json:"audio,omitempty"`
Error *RealtimeError `json:"error,omitempty"`
// ExtraParams preserves provider-specific top-level event fields that are not
// promoted into the common Bifrost schema. ParseRealtimeEvent fills it with
// every top-level key other than the promoted ones and "raw_data".
ExtraParams map[string]json.RawMessage `json:"extra_params,omitempty"`
// RawData preserves the original provider event for pass-through or debugging.
// ParseRealtimeEvent never sets it and discards an incoming "raw_data" key.
RawData json.RawMessage `json:"raw_data,omitempty"`
}
// RealtimeSession describes session configuration for the Realtime connection.
// MaxOutputTokens, TurnDetection and Tools are kept as raw JSON, so this type
// does not constrain or interpret their shape.
type RealtimeSession struct {
ID string `json:"id,omitempty"`
Model string `json:"model,omitempty"`
Modalities []string `json:"modalities,omitempty"`
Instructions string `json:"instructions,omitempty"`
Voice string `json:"voice,omitempty"`
// Temperature is a pointer, so an unset value (nil) can be told apart from
// an explicit 0.
Temperature *float64 `json:"temperature,omitempty"`
MaxOutputTokens json.RawMessage `json:"max_output_tokens,omitempty"`
TurnDetection json.RawMessage `json:"turn_detection,omitempty"`
InputAudioFormat string `json:"input_audio_format,omitempty"`
OutputAudioType string `json:"output_audio_type,omitempty"`
Tools json.RawMessage `json:"tools,omitempty"`
// ExtraParams is filled by ParseRealtimeEvent with the keys of the incoming
// "session" object that do not match any JSON name listed above.
ExtraParams map[string]json.RawMessage `json:"extra_params,omitempty"`
}
// RealtimeItem represents a conversation item in the Realtime protocol.
// All fields are optional on the wire (omitempty).
type RealtimeItem struct {
ID string `json:"id,omitempty"`
// Type is compared against "function_call_output" by IsRealtimeToolOutputEvent.
Type string `json:"type,omitempty"`
// Role is compared against "user" by IsRealtimeUserInputEvent.
Role string `json:"role,omitempty"`
Status string `json:"status,omitempty"`
// Content is kept as raw JSON; this type does not interpret its shape.
Content json.RawMessage `json:"content,omitempty"`
// NOTE(review): Name, CallID, Arguments and Output look like the function
// call / function output fields of an item — confirm against the provider
// protocol before relying on that.
Name string `json:"name,omitempty"`
CallID string `json:"call_id,omitempty"`
Arguments string `json:"arguments,omitempty"`
Output string `json:"output,omitempty"`
// ExtraParams is filled by ParseRealtimeEvent with the keys of the incoming
// "item" object that do not match any JSON name listed above.
ExtraParams map[string]json.RawMessage `json:"extra_params,omitempty"`
}
// RealtimeDelta carries incremental content for streaming events.
// Unlike the session, item and error types it has no ExtraParams field, and
// ParseRealtimeEvent does not collect unknown keys from a "delta" object.
type RealtimeDelta struct {
Text string `json:"text,omitempty"`
// Audio is a string here, whereas BifrostRealtimeEvent.Audio is a []byte.
// NOTE(review): presumably already-encoded audio — confirm the encoding.
Audio string `json:"audio,omitempty"`
Transcript string `json:"transcript,omitempty"`
ItemID string `json:"item_id,omitempty"`
// OutputIdx and ContentIdx are pointers, so nil (absent) is distinct from
// index 0. Note the Go names are abbreviated while the JSON names are not.
OutputIdx *int `json:"output_index,omitempty"`
ContentIdx *int `json:"content_index,omitempty"`
ResponseID string `json:"response_id,omitempty"`
}
// RealtimeError describes an error from the Realtime API.
// It is a data carrier only; it declares no methods here and so does not
// implement the error interface in this file.
type RealtimeError struct {
Type string `json:"type,omitempty"`
Code string `json:"code,omitempty"`
Message string `json:"message,omitempty"`
Param string `json:"param,omitempty"`
// ExtraParams is filled by ParseRealtimeEvent with the keys of the incoming
// "error" object other than type, code, message and param.
ExtraParams map[string]json.RawMessage `json:"extra_params,omitempty"`
}
// RealtimeSessionEndpointType identifies the public ephemeral-token endpoint
// shape the client called so providers can preserve versioned behavior.
// It appears on RealtimeSessionRoute and is passed to
// RealtimeSessionProvider.CreateRealtimeClientSecret.
type RealtimeSessionEndpointType string
// Endpoint shapes named by this package.
// NOTE(review): the values look like the trailing path segment of the public
// route — confirm against where the routes are registered.
const (
RealtimeSessionEndpointClientSecrets RealtimeSessionEndpointType = "client_secrets"
RealtimeSessionEndpointSessions RealtimeSessionEndpointType = "sessions"
)
// RealtimeSessionRoute describes a provider-registered public route for
// ephemeral-token creation. The fields carry no JSON tags.
type RealtimeSessionRoute struct {
// Path is the route path. NOTE(review): format (leading slash, prefix) is not
// visible here — confirm against the router that consumes it.
Path string
// EndpointType is the endpoint shape this route exposes.
EndpointType RealtimeSessionEndpointType
// DefaultProvider is presumably the provider used when the request names
// none — TODO confirm with the route handler.
DefaultProvider ModelProvider
}
// RealtimeProvider is an optional interface that providers can implement to
// indicate support for bidirectional Realtime API (audio/text streaming).
// Checked via type assertion: provider.(RealtimeProvider).
//
// Notes marked NOTE(review) below are inferred from method names and
// signatures only and should be confirmed against a concrete provider.
type RealtimeProvider interface {
// NOTE(review): presumably a runtime switch for providers that satisfy the
// interface but may not have Realtime enabled — confirm how callers use it.
SupportsRealtimeAPI() bool
// NOTE(review): these two look like the WebSocket endpoint and the headers to
// send when connecting with the given key — confirm.
RealtimeWebSocketURL(key Key, model string) string
RealtimeHeaders(key Key) map[string]string
// SupportsRealtimeWebRTC reports whether the provider supports WebRTC SDP exchange.
SupportsRealtimeWebRTC() bool
// ExchangeRealtimeWebRTCSDP performs the provider-specific SDP signaling exchange.
// The provider owns the HTTP specifics (URL, headers, body format).
// session may be nil if the signaling format doesn't include session config.
// Argument order is (model, sdp, session); the legacy variant on
// RealtimeLegacyWebRTCProvider takes (sdp, session, model).
ExchangeRealtimeWebRTCSDP(ctx *BifrostContext, key Key, model string, sdp string, session json.RawMessage) (string, *BifrostError)
// ToBifrostRealtimeEvent and ToProviderRealtimeEvent convert a single event
// between the provider-native JSON and the BifrostRealtimeEvent envelope.
ToBifrostRealtimeEvent(providerEvent json.RawMessage) (*BifrostRealtimeEvent, error)
ToProviderRealtimeEvent(bifrostEvent *BifrostRealtimeEvent) (json.RawMessage, error)
// ShouldStartRealtimeTurn reports whether the canonical client-side event
// should start pre-hooks. Providers without an explicit turn-start signal
// return false and rely on finalize-time fallback hooks.
ShouldStartRealtimeTurn(event *BifrostRealtimeEvent) bool
// RealtimeTurnFinalEvent returns the canonical provider event that completes
// a turn and should trigger post-hooks.
RealtimeTurnFinalEvent() RealtimeEventType
// NOTE(review): presumably the WebRTC data-channel label and the WebSocket
// subprotocol to negotiate; whether an empty string means "none" is not
// visible here — confirm.
RealtimeWebRTCDataChannelLabel() string
RealtimeWebSocketSubprotocol() string
// NOTE(review): presumably filters which events are relayed to the client and
// which event types contribute to accumulated turn output — confirm.
ShouldForwardRealtimeEvent(event *BifrostRealtimeEvent) bool
ShouldAccumulateRealtimeOutput(eventType RealtimeEventType) bool
}
// RealtimeLegacyWebRTCProvider is an optional interface for providers that
// support the beta WebRTC handshake (e.g., OpenAI's /v1/realtime).
// Only checked for legacy integration routes via type assertion.
// Takes SDP offer + optional session JSON, same as ExchangeRealtimeWebRTCSDP
// but targets the provider's legacy/beta endpoint.
//
// Beware when adapting calls between the two: here the parameters run
// (ctx, key, sdp, session, model), whereas ExchangeRealtimeWebRTCSDP takes
// (ctx, key, model, sdp, session). model and sdp are both plain strings, so
// the compiler will not catch a swap.
type RealtimeLegacyWebRTCProvider interface {
ExchangeLegacyRealtimeWebRTCSDP(ctx *BifrostContext, key Key, sdp string, session json.RawMessage, model string) (string, *BifrostError)
}
// RealtimeUsageExtractor lets providers parse terminal-turn usage/output from
// their native wire payloads without coupling handlers to a specific protocol.
// Both methods take the raw bytes of the terminal event and return a pointer.
// NOTE(review): presumably nil means "nothing could be extracted", since
// neither method returns an error — confirm with implementations.
type RealtimeUsageExtractor interface {
ExtractRealtimeTurnUsage(terminalEventRaw []byte) *BifrostLLMUsage
ExtractRealtimeTurnOutput(terminalEventRaw []byte) *ChatMessage
}
// RealtimeSessionProvider is an optional interface for providers that can mint
// short-lived client secrets for browser/client-side Realtime connections.
// Checked via type assertion: provider.(RealtimeSessionProvider).
//
// CreateRealtimeClientSecret receives the endpoint shape the client called
// (endpointType) and the request body as undecoded JSON (rawRequest), and
// returns a passthrough response or a BifrostError.
type RealtimeSessionProvider interface {
CreateRealtimeClientSecret(ctx *BifrostContext, key Key, endpointType RealtimeSessionEndpointType, rawRequest json.RawMessage) (*BifrostPassthroughResponse, *BifrostError)
}
// ParseRealtimeEvent decodes a client/provider realtime event while keeping
// every field the common schema does not promote, so provider-specific data
// survives a round trip.
//
// raw is decoded twice: once into the promoted fields (type, event_id,
// session, item, delta, audio, error) and once into a generic key/value map.
// Whatever remains in that map after the promoted keys and "raw_data" are
// removed becomes the event's ExtraParams. The same is done one level down for
// the session, item and error objects, whose leftover keys are stored in their
// own ExtraParams.
//
// RawData is never populated here, and a top-level "extra_params" key is not
// decoded into the ExtraParams field; it is kept as an ordinary leftover key.
//
// A decode error on raw is returned unchanged together with a nil event.
func ParseRealtimeEvent(raw []byte) (*BifrostRealtimeEvent, error) {
	// Only the promoted fields are listed, so ExtraParams and RawData can never
	// be filled straight from the wire.
	var promoted struct {
		Type    RealtimeEventType `json:"type"`
		EventID string            `json:"event_id,omitempty"`
		Session *RealtimeSession  `json:"session,omitempty"`
		Item    *RealtimeItem     `json:"item,omitempty"`
		Delta   *RealtimeDelta    `json:"delta,omitempty"`
		Audio   []byte            `json:"audio,omitempty"`
		Error   *RealtimeError    `json:"error,omitempty"`
	}
	if err := Unmarshal(raw, &promoted); err != nil {
		return nil, err
	}

	var fields map[string]json.RawMessage
	if err := Unmarshal(raw, &fields); err != nil {
		return nil, err
	}

	// unpromoted returns the keys of the JSON object in payload that are not
	// named in known. It returns nil when payload is empty, does not decode as
	// an object, or has no keys left over.
	unpromoted := func(payload json.RawMessage, known ...string) map[string]json.RawMessage {
		if len(payload) == 0 {
			return nil
		}
		var rest map[string]json.RawMessage
		if Unmarshal(payload, &rest) != nil {
			return nil
		}
		for _, name := range known {
			delete(rest, name)
		}
		if len(rest) == 0 {
			return nil
		}
		return rest
	}

	parsed := &BifrostRealtimeEvent{
		Type:    promoted.Type,
		EventID: promoted.EventID,
		Session: promoted.Session,
		Item:    promoted.Item,
		Delta:   promoted.Delta,
		Audio:   promoted.Audio,
		Error:   promoted.Error,
	}

	if parsed.Session != nil {
		rest := unpromoted(fields["session"],
			"id", "model", "modalities", "instructions", "voice", "temperature",
			"max_output_tokens", "turn_detection", "input_audio_format", "output_audio_type", "tools",
		)
		if rest != nil {
			parsed.Session.ExtraParams = rest
		}
	}

	if parsed.Item != nil {
		rest := unpromoted(fields["item"],
			"id", "type", "role", "status", "content", "name", "call_id", "arguments", "output",
		)
		if rest != nil {
			parsed.Item.ExtraParams = rest
		}
	}

	if parsed.Error != nil {
		rest := unpromoted(fields["error"], "type", "code", "message", "param")
		if rest != nil {
			parsed.Error.ExtraParams = rest
		}
	}

	// Strip the promoted keys last: the nested lookups above still read them.
	for _, name := range []string{"type", "event_id", "session", "item", "delta", "audio", "error", "raw_data"} {
		delete(fields, name)
	}
	if len(fields) > 0 {
		parsed.ExtraParams = fields
	}

	return parsed, nil
}