first commit
This commit is contained in:
257
core/providers/elevenlabs/realtime.go
Normal file
257
core/providers/elevenlabs/realtime.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package elevenlabs
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
|
||||
providerUtils "github.com/maximhq/bifrost/core/providers/utils"
|
||||
)
|
||||
|
||||
// SupportsRealtimeAPI returns true since ElevenLabs supports Conversational AI via WebSocket.
|
||||
func (provider *ElevenlabsProvider) SupportsRealtimeAPI() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// RealtimeWebSocketURL returns the WSS URL for the ElevenLabs Conversational AI endpoint.
|
||||
// The model parameter is used as the agent_id query parameter.
|
||||
// Format: wss://api.elevenlabs.io/v1/convai/conversation?agent_id=<model>
|
||||
func (provider *ElevenlabsProvider) RealtimeWebSocketURL(key schemas.Key, model string) string {
|
||||
base := provider.networkConfig.BaseURL
|
||||
base = strings.Replace(base, "https://", "wss://", 1)
|
||||
base = strings.Replace(base, "http://", "ws://", 1)
|
||||
return base + "/v1/convai/conversation?agent_id=" + model
|
||||
}
|
||||
|
||||
// RealtimeHeaders returns the headers required for the ElevenLabs Conversational AI WebSocket.
|
||||
func (provider *ElevenlabsProvider) RealtimeHeaders(key schemas.Key) map[string]string {
|
||||
headers := map[string]string{
|
||||
"xi-api-key": key.Value.GetValue(),
|
||||
}
|
||||
for k, v := range provider.networkConfig.ExtraHeaders {
|
||||
if strings.EqualFold(k, "xi-api-key") {
|
||||
continue
|
||||
}
|
||||
headers[k] = v
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
||||
// SupportsRealtimeWebRTC returns false — ElevenLabs WebRTC SDP exchange is not yet implemented.
|
||||
func (provider *ElevenlabsProvider) SupportsRealtimeWebRTC() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ExchangeRealtimeWebRTCSDP is not yet implemented for ElevenLabs.
|
||||
func (provider *ElevenlabsProvider) ExchangeRealtimeWebRTCSDP(_ *schemas.BifrostContext, _ schemas.Key, _ string, _ string, _ json.RawMessage) (string, *schemas.BifrostError) {
|
||||
return "", &schemas.BifrostError{
|
||||
IsBifrostError: true,
|
||||
StatusCode: schemas.Ptr(400),
|
||||
Error: &schemas.ErrorField{Type: schemas.Ptr("invalid_request_error"), Message: "WebRTC SDP exchange is not yet implemented for ElevenLabs"},
|
||||
}
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) ShouldStartRealtimeTurn(event *schemas.BifrostRealtimeEvent) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) RealtimeTurnFinalEvent() schemas.RealtimeEventType {
|
||||
return schemas.RTEventResponseDone
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) RealtimeWebRTCDataChannelLabel() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) RealtimeWebSocketSubprotocol() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) ShouldForwardRealtimeEvent(event *schemas.BifrostRealtimeEvent) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (provider *ElevenlabsProvider) ShouldAccumulateRealtimeOutput(eventType schemas.RealtimeEventType) bool {
|
||||
return eventType == schemas.RTEventResponseDone
|
||||
}
|
||||
|
||||
// ElevenLabs Conversational AI WebSocket event types
|
||||
const (
|
||||
elConversationInitMetadata = "conversation_initiation_metadata"
|
||||
elPing = "ping"
|
||||
elAudio = "audio"
|
||||
elUserTranscript = "user_transcript"
|
||||
elAgentResponse = "agent_response"
|
||||
elAgentResponseCorrection = "agent_response_correction"
|
||||
elInterruption = "interruption"
|
||||
elClientToolCall = "client_tool_call"
|
||||
|
||||
elUserAudioChunk = "user_audio_chunk"
|
||||
elPong = "pong"
|
||||
elClientToolResult = "client_tool_result"
|
||||
elContextualUpdate = "contextual_update"
|
||||
)
|
||||
|
||||
// elevenlabsEvent represents a raw ElevenLabs Conversational AI WebSocket event.
|
||||
type elevenlabsEvent struct {
|
||||
Type string `json:"type"`
|
||||
|
||||
// Server events
|
||||
ConversationInitMetadata json.RawMessage `json:"conversation_initiation_metadata_event,omitempty"`
|
||||
Audio json.RawMessage `json:"audio_event,omitempty"`
|
||||
UserTranscript json.RawMessage `json:"user_transcription_event,omitempty"`
|
||||
AgentResponse json.RawMessage `json:"agent_response_event,omitempty"`
|
||||
AgentResponseCorrection json.RawMessage `json:"agent_response_correction_event,omitempty"`
|
||||
ClientToolCall json.RawMessage `json:"client_tool_call,omitempty"`
|
||||
PingEvent json.RawMessage `json:"ping_event,omitempty"`
|
||||
|
||||
// Client events
|
||||
UserAudioChunk json.RawMessage `json:"user_audio_chunk,omitempty"`
|
||||
}
|
||||
|
||||
// elevenlabsAudioEvent is the audio event structure from ElevenLabs.
|
||||
type elevenlabsAudioEvent struct {
|
||||
Audio string `json:"audio_base_64,omitempty"`
|
||||
Alignment json.RawMessage `json:"alignment,omitempty"`
|
||||
}
|
||||
|
||||
// elevenlabsTranscriptEvent is the user/agent transcript event from ElevenLabs.
|
||||
type elevenlabsTranscriptEvent struct {
|
||||
UserTranscript string `json:"user_transcript,omitempty"`
|
||||
AgentResponse string `json:"agent_response,omitempty"`
|
||||
AgentResponseID string `json:"agent_response_id,omitempty"`
|
||||
}
|
||||
|
||||
// elevenlabsCorrectionEvent is the agent response correction event from ElevenLabs.
|
||||
type elevenlabsCorrectionEvent struct {
|
||||
OriginalAgentResponse string `json:"original_agent_response,omitempty"`
|
||||
CorrectedAgentResponse string `json:"corrected_agent_response,omitempty"`
|
||||
}
|
||||
|
||||
// ToBifrostRealtimeEvent converts an ElevenLabs Conversational AI event to the unified Bifrost format.
|
||||
func (provider *ElevenlabsProvider) ToBifrostRealtimeEvent(providerEvent json.RawMessage) (*schemas.BifrostRealtimeEvent, error) {
|
||||
var raw elevenlabsEvent
|
||||
if err := json.Unmarshal(providerEvent, &raw); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal ElevenLabs realtime event: %w", err)
|
||||
}
|
||||
|
||||
event := &schemas.BifrostRealtimeEvent{
|
||||
RawData: providerEvent,
|
||||
}
|
||||
|
||||
switch raw.Type {
|
||||
case elConversationInitMetadata:
|
||||
event.Type = schemas.RTEventSessionCreated
|
||||
event.Session = &schemas.RealtimeSession{}
|
||||
|
||||
case elPing:
|
||||
event.Type = schemas.RealtimeEventType("ping")
|
||||
|
||||
case elAudio:
|
||||
event.Type = schemas.RTEventResponseAudioDelta
|
||||
if raw.Audio != nil {
|
||||
var audioEvt elevenlabsAudioEvent
|
||||
if err := json.Unmarshal(raw.Audio, &audioEvt); err == nil {
|
||||
event.Delta = &schemas.RealtimeDelta{
|
||||
Audio: audioEvt.Audio,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case elUserTranscript:
|
||||
event.Type = schemas.RTEventInputAudioTransCompleted
|
||||
if raw.UserTranscript != nil {
|
||||
var transcript elevenlabsTranscriptEvent
|
||||
if err := json.Unmarshal(raw.UserTranscript, &transcript); err == nil {
|
||||
event.Delta = &schemas.RealtimeDelta{
|
||||
Transcript: transcript.UserTranscript,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case elAgentResponse:
|
||||
event.Type = schemas.RTEventResponseDone
|
||||
if raw.AgentResponse != nil {
|
||||
var agentResp elevenlabsTranscriptEvent
|
||||
if err := json.Unmarshal(raw.AgentResponse, &agentResp); err == nil {
|
||||
event.Delta = &schemas.RealtimeDelta{
|
||||
Text: agentResp.AgentResponse,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case elAgentResponseCorrection:
|
||||
event.Type = schemas.RTEventResponseTextDelta
|
||||
if raw.AgentResponseCorrection != nil {
|
||||
var correction elevenlabsCorrectionEvent
|
||||
if err := json.Unmarshal(raw.AgentResponseCorrection, &correction); err == nil {
|
||||
event.Delta = &schemas.RealtimeDelta{
|
||||
Text: correction.CorrectedAgentResponse,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case elInterruption:
|
||||
event.Type = schemas.RTEventResponseCancel
|
||||
|
||||
case elClientToolCall:
|
||||
event.Type = schemas.RealtimeEventType("client_tool_call")
|
||||
if raw.ClientToolCall != nil {
|
||||
var toolCall struct {
|
||||
ToolName string `json:"tool_name"`
|
||||
Parameters json.RawMessage `json:"parameters"`
|
||||
ToolCallID string `json:"tool_call_id"`
|
||||
}
|
||||
if err := json.Unmarshal(raw.ClientToolCall, &toolCall); err == nil {
|
||||
args := string(toolCall.Parameters)
|
||||
if len(toolCall.Parameters) > 0 {
|
||||
var parsed interface{}
|
||||
if err := json.Unmarshal(toolCall.Parameters, &parsed); err == nil {
|
||||
if sorted, err := providerUtils.MarshalSorted(parsed); err == nil {
|
||||
args = string(sorted)
|
||||
}
|
||||
}
|
||||
}
|
||||
event.Item = &schemas.RealtimeItem{
|
||||
Type: "function_call",
|
||||
Name: toolCall.ToolName,
|
||||
CallID: toolCall.ToolCallID,
|
||||
Arguments: args,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
event.Type = schemas.RealtimeEventType(raw.Type)
|
||||
}
|
||||
|
||||
return event, nil
|
||||
}
|
||||
|
||||
// ToProviderRealtimeEvent converts a unified Bifrost Realtime event to ElevenLabs' native JSON.
|
||||
func (provider *ElevenlabsProvider) ToProviderRealtimeEvent(bifrostEvent *schemas.BifrostRealtimeEvent) (json.RawMessage, error) {
|
||||
switch bifrostEvent.Type {
|
||||
case schemas.RTEventInputAudioAppend:
|
||||
if bifrostEvent.Delta == nil {
|
||||
return nil, fmt.Errorf("delta must be set for input_audio_buffer.append events")
|
||||
}
|
||||
out := map[string]interface{}{
|
||||
"type": elUserAudioChunk,
|
||||
"user_audio_chunk": bifrostEvent.Delta.Audio,
|
||||
}
|
||||
return schemas.MarshalSorted(out)
|
||||
|
||||
case schemas.RealtimeEventType("pong"):
|
||||
return schemas.MarshalSorted(map[string]interface{}{
|
||||
"type": "pong",
|
||||
})
|
||||
|
||||
default:
|
||||
out := map[string]interface{}{
|
||||
"type": string(bifrostEvent.Type),
|
||||
}
|
||||
return schemas.MarshalSorted(out)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user