968 lines
32 KiB
Go
968 lines
32 KiB
Go
package openai
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
providerUtils "github.com/maximhq/bifrost/core/providers/utils"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
"github.com/valyala/fasthttp"
|
|
)
|
|
|
|
// SupportsRealtimeAPI returns true since OpenAI natively supports the Realtime API.
|
|
func (provider *OpenAIProvider) SupportsRealtimeAPI() bool {
|
|
return true
|
|
}
|
|
|
|
// RealtimeWebSocketURL returns the WSS URL for the OpenAI Realtime API.
|
|
// Format: wss://api.openai.com/v1/realtime?model=<model>
|
|
func (provider *OpenAIProvider) RealtimeWebSocketURL(key schemas.Key, model string) string {
|
|
base := provider.networkConfig.BaseURL
|
|
base = strings.Replace(base, "https://", "wss://", 1)
|
|
base = strings.Replace(base, "http://", "ws://", 1)
|
|
return base + "/v1/realtime?model=" + url.QueryEscape(model)
|
|
}
|
|
|
|
// RealtimeHeaders returns the headers required for the OpenAI Realtime WebSocket connection.
|
|
func (provider *OpenAIProvider) RealtimeHeaders(key schemas.Key) map[string]string {
|
|
headers := map[string]string{
|
|
"Authorization": "Bearer " + key.Value.GetValue(),
|
|
}
|
|
for k, v := range provider.networkConfig.ExtraHeaders {
|
|
headers[k] = v
|
|
}
|
|
return headers
|
|
}
|
|
|
|
// SupportsRealtimeWebRTC reports that OpenAI supports WebRTC SDP exchange.
|
|
func (provider *OpenAIProvider) SupportsRealtimeWebRTC() bool {
|
|
return true
|
|
}
|
|
|
|
// ExchangeRealtimeWebRTCSDP performs the GA SDP exchange via multipart POST to /v1/realtime/calls.
|
|
func (provider *OpenAIProvider) ExchangeRealtimeWebRTCSDP(
|
|
ctx *schemas.BifrostContext,
|
|
key schemas.Key,
|
|
model string,
|
|
sdp string,
|
|
session json.RawMessage,
|
|
) (string, *schemas.BifrostError) {
|
|
path := "/v1/realtime/calls"
|
|
if session == nil && strings.TrimSpace(model) != "" {
|
|
path += "?model=" + url.QueryEscape(model)
|
|
}
|
|
return provider.exchangeWebRTCSDP(ctx, key, path, sdp, session)
|
|
}
|
|
|
|
// ExchangeLegacyRealtimeWebRTCSDP performs the beta SDP exchange via multipart POST to /v1/realtime.
|
|
// Same multipart format but targets the legacy endpoint with model in the URL.
|
|
func (provider *OpenAIProvider) ExchangeLegacyRealtimeWebRTCSDP(
|
|
ctx *schemas.BifrostContext,
|
|
key schemas.Key,
|
|
sdp string,
|
|
session json.RawMessage,
|
|
model string,
|
|
) (string, *schemas.BifrostError) {
|
|
return provider.exchangeWebRTCSDP(ctx, key, "/v1/realtime?model="+url.QueryEscape(model), sdp, session)
|
|
}
|
|
|
|
// exchangeWebRTCSDP is the shared multipart SDP exchange implementation.
|
|
// Builds a multipart body with sdp + optional session, POSTs to the given path.
|
|
func (provider *OpenAIProvider) exchangeWebRTCSDP(
|
|
ctx *schemas.BifrostContext,
|
|
key schemas.Key,
|
|
path string,
|
|
sdp string,
|
|
session json.RawMessage,
|
|
) (string, *schemas.BifrostError) {
|
|
bodyBuf := &bytes.Buffer{}
|
|
writer := multipart.NewWriter(bodyBuf)
|
|
if err := writer.WriteField("sdp", sdp); err != nil {
|
|
return "", newRealtimeWebRTCSDPError(fasthttp.StatusInternalServerError, "server_error", "failed to encode upstream SDP body", err)
|
|
}
|
|
if session != nil {
|
|
if err := writer.WriteField("session", string(session)); err != nil {
|
|
return "", newRealtimeWebRTCSDPError(fasthttp.StatusInternalServerError, "server_error", "failed to encode upstream session body", err)
|
|
}
|
|
}
|
|
if err := writer.Close(); err != nil {
|
|
return "", newRealtimeWebRTCSDPError(fasthttp.StatusInternalServerError, "server_error", "failed to finalize upstream SDP body", err)
|
|
}
|
|
|
|
req := fasthttp.AcquireRequest()
|
|
resp := fasthttp.AcquireResponse()
|
|
defer fasthttp.ReleaseRequest(req)
|
|
defer fasthttp.ReleaseResponse(resp)
|
|
|
|
req.SetRequestURI(provider.buildRequestURL(ctx, path, schemas.RealtimeRequest))
|
|
req.Header.SetMethod(http.MethodPost)
|
|
req.Header.SetContentType(writer.FormDataContentType())
|
|
req.Header.Set("Authorization", "Bearer "+key.Value.GetValue())
|
|
for k, v := range provider.networkConfig.ExtraHeaders {
|
|
req.Header.Set(k, v)
|
|
}
|
|
if headers, _ := ctx.Value(schemas.BifrostContextKeyRequestHeaders).(map[string]string); headers != nil {
|
|
if agentsSDK := headers["x-openai-agents-sdk"]; agentsSDK != "" {
|
|
req.Header.Set("X-OpenAI-Agents-SDK", agentsSDK)
|
|
}
|
|
}
|
|
req.SetBody(bodyBuf.Bytes())
|
|
|
|
_, bifrostErr, wait := providerUtils.MakeRequestWithContext(ctx, provider.client, req, resp)
|
|
defer wait()
|
|
if bifrostErr != nil {
|
|
return "", bifrostErr
|
|
}
|
|
|
|
answerBody := resp.Body()
|
|
if resp.StatusCode() < fasthttp.StatusOK || resp.StatusCode() >= fasthttp.StatusMultipleChoices {
|
|
return "", provider.realtimeWebRTCUpstreamError(ctx, resp.StatusCode(), answerBody)
|
|
}
|
|
|
|
return string(answerBody), nil
|
|
}
|
|
|
|
func (provider *OpenAIProvider) realtimeWebRTCUpstreamError(ctx *schemas.BifrostContext, statusCode int, body []byte) *schemas.BifrostError {
|
|
bifrostErr := &schemas.BifrostError{
|
|
IsBifrostError: false,
|
|
StatusCode: schemas.Ptr(fasthttp.StatusBadGateway),
|
|
Error: &schemas.ErrorField{
|
|
Type: schemas.Ptr("upstream_connection_error"),
|
|
Message: fmt.Sprintf("upstream realtime WebRTC handshake failed for %s", provider.GetProviderKey()),
|
|
},
|
|
ExtraFields: schemas.BifrostErrorExtraFields{
|
|
RequestType: schemas.RealtimeRequest,
|
|
Provider: provider.GetProviderKey(),
|
|
},
|
|
}
|
|
if providerUtils.ShouldSendBackRawResponse(ctx, provider.sendBackRawResponse) {
|
|
bifrostErr.ExtraFields.RawResponse = map[string]any{
|
|
"status": statusCode,
|
|
"body": string(body),
|
|
}
|
|
}
|
|
return bifrostErr
|
|
}
|
|
|
|
func newRealtimeWebRTCSDPError(status int, errorType, message string, err error) *schemas.BifrostError {
|
|
bifrostErr := &schemas.BifrostError{
|
|
IsBifrostError: true,
|
|
StatusCode: schemas.Ptr(status),
|
|
Error: &schemas.ErrorField{
|
|
Type: schemas.Ptr(errorType),
|
|
Message: message,
|
|
},
|
|
}
|
|
if err != nil {
|
|
bifrostErr.Error.Error = err
|
|
}
|
|
return bifrostErr
|
|
}
|
|
|
|
func (provider *OpenAIProvider) ShouldStartRealtimeTurn(event *schemas.BifrostRealtimeEvent) bool {
|
|
if event == nil {
|
|
return false
|
|
}
|
|
switch event.Type {
|
|
case schemas.RTEventResponseCreate, schemas.RTEventInputAudioBufferCommitted:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (provider *OpenAIProvider) RealtimeTurnFinalEvent() schemas.RealtimeEventType {
|
|
return schemas.RTEventResponseDone
|
|
}
|
|
|
|
func (provider *OpenAIProvider) RealtimeWebRTCDataChannelLabel() string {
|
|
return "oai-events"
|
|
}
|
|
|
|
func (provider *OpenAIProvider) RealtimeWebSocketSubprotocol() string {
|
|
return "realtime"
|
|
}
|
|
|
|
func (provider *OpenAIProvider) ShouldForwardRealtimeEvent(event *schemas.BifrostRealtimeEvent) bool {
|
|
return true
|
|
}
|
|
|
|
func (provider *OpenAIProvider) ShouldAccumulateRealtimeOutput(eventType schemas.RealtimeEventType) bool {
|
|
switch eventType {
|
|
case schemas.RTEventResponseTextDelta,
|
|
schemas.RTEventResponseAudioTransDelta,
|
|
schemas.RealtimeEventType("response.output_text.delta"),
|
|
schemas.RealtimeEventType("response.output_audio_transcript.delta"):
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// CreateRealtimeClientSecret mints an OpenAI Realtime client secret and returns
|
|
// the native OpenAI response body unchanged.
|
|
func (provider *OpenAIProvider) CreateRealtimeClientSecret(
|
|
ctx *schemas.BifrostContext,
|
|
key schemas.Key,
|
|
endpointType schemas.RealtimeSessionEndpointType,
|
|
rawRequest json.RawMessage,
|
|
) (*schemas.BifrostPassthroughResponse, *schemas.BifrostError) {
|
|
if err := providerUtils.CheckOperationAllowed(schemas.OpenAI, provider.customProviderConfig, schemas.RealtimeRequest); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
normalizedBody, requestedModel, bifrostErr := normalizeRealtimeClientSecretRequest(rawRequest, provider.GetProviderKey(), endpointType)
|
|
if bifrostErr != nil {
|
|
return nil, bifrostErr
|
|
}
|
|
req := fasthttp.AcquireRequest()
|
|
resp := fasthttp.AcquireResponse()
|
|
defer fasthttp.ReleaseRequest(req)
|
|
defer fasthttp.ReleaseResponse(resp)
|
|
|
|
req.SetRequestURI(provider.buildRequestURL(ctx, realtimeSessionUpstreamPath(endpointType), schemas.RealtimeRequest))
|
|
req.Header.SetMethod(http.MethodPost)
|
|
req.Header.SetContentType("application/json")
|
|
for k, v := range provider.realtimeSessionHeaders(key, endpointType) {
|
|
req.Header.Set(k, v)
|
|
}
|
|
req.SetBody(normalizedBody)
|
|
|
|
latency, bifrostErr, wait := providerUtils.MakeRequestWithContext(ctx, provider.client, req, resp)
|
|
defer wait()
|
|
if bifrostErr != nil {
|
|
return nil, bifrostErr
|
|
}
|
|
|
|
headers := providerUtils.ExtractProviderResponseHeaders(resp)
|
|
ctx.SetValue(schemas.BifrostContextKeyProviderResponseHeaders, headers)
|
|
|
|
if resp.StatusCode() < fasthttp.StatusOK || resp.StatusCode() >= fasthttp.StatusMultipleChoices {
|
|
return nil, ParseOpenAIError(resp)
|
|
}
|
|
|
|
body, err := providerUtils.CheckAndDecodeBody(resp)
|
|
if err != nil {
|
|
return nil, providerUtils.NewBifrostOperationError("failed to decode response body", err)
|
|
}
|
|
for k := range headers {
|
|
if strings.EqualFold(k, "Content-Encoding") || strings.EqualFold(k, "Content-Length") {
|
|
delete(headers, k)
|
|
}
|
|
}
|
|
|
|
out := &schemas.BifrostPassthroughResponse{
|
|
StatusCode: resp.StatusCode(),
|
|
Headers: headers,
|
|
Body: body,
|
|
}
|
|
out.ExtraFields.Provider = provider.GetProviderKey()
|
|
out.ExtraFields.OriginalModelRequested = requestedModel
|
|
out.ExtraFields.RequestType = schemas.RealtimeRequest
|
|
out.ExtraFields.Latency = latency.Milliseconds()
|
|
if providerUtils.ShouldSendBackRawRequest(ctx, provider.sendBackRawRequest) {
|
|
providerUtils.ParseAndSetRawRequestIfJSON(req, &out.ExtraFields)
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
func normalizeRealtimeClientSecretRequest(
|
|
rawRequest json.RawMessage,
|
|
defaultProvider schemas.ModelProvider,
|
|
endpointType schemas.RealtimeSessionEndpointType,
|
|
) ([]byte, string, *schemas.BifrostError) {
|
|
root, bifrostErr := schemas.ParseRealtimeClientSecretBody(rawRequest)
|
|
if bifrostErr != nil {
|
|
return nil, "", bifrostErr
|
|
}
|
|
|
|
modelValue, bifrostErr := schemas.ExtractRealtimeClientSecretModel(root)
|
|
if bifrostErr != nil {
|
|
return nil, "", bifrostErr
|
|
}
|
|
providerKey, normalizedModel := schemas.ParseModelString(modelValue, defaultProvider)
|
|
if normalizedModel == "" {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusBadRequest, "invalid_request_error", "session.model is required", nil)
|
|
}
|
|
if providerKey == "" {
|
|
providerKey = defaultProvider
|
|
}
|
|
if providerKey == "" {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusBadRequest, "invalid_request_error", "unable to determine provider from model", nil)
|
|
}
|
|
|
|
if endpointType == schemas.RealtimeSessionEndpointSessions {
|
|
return normalizeRealtimeSessionsRequest(root, normalizedModel)
|
|
}
|
|
|
|
return normalizeRealtimeClientSecretsRequest(root, normalizedModel)
|
|
}
|
|
|
|
func normalizeRealtimeClientSecretsRequest(
|
|
root map[string]json.RawMessage,
|
|
normalizedModel string,
|
|
) ([]byte, string, *schemas.BifrostError) {
|
|
session := map[string]json.RawMessage{}
|
|
if existingSession, ok := root["session"]; ok && len(existingSession) > 0 && !bytes.Equal(existingSession, []byte("null")) {
|
|
if err := json.Unmarshal(existingSession, &session); err != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusBadRequest, "invalid_request_error", "session must be an object", err)
|
|
}
|
|
}
|
|
|
|
modelJSON, marshalErr := json.Marshal(normalizedModel)
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode normalized model", marshalErr)
|
|
}
|
|
session["model"] = modelJSON
|
|
if _, ok := session["type"]; !ok {
|
|
typeJSON, marshalErr := json.Marshal("realtime")
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode realtime session type", marshalErr)
|
|
}
|
|
session["type"] = typeJSON
|
|
}
|
|
delete(root, "model")
|
|
|
|
sessionJSON, marshalErr := json.Marshal(session)
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode realtime session", marshalErr)
|
|
}
|
|
root["session"] = sessionJSON
|
|
|
|
normalizedBody, marshalErr := json.Marshal(root)
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode realtime request", marshalErr)
|
|
}
|
|
|
|
return normalizedBody, normalizedModel, nil
|
|
}
|
|
|
|
func normalizeRealtimeSessionsRequest(
|
|
root map[string]json.RawMessage,
|
|
normalizedModel string,
|
|
) ([]byte, string, *schemas.BifrostError) {
|
|
if existingSession, ok := root["session"]; ok && len(existingSession) > 0 && !bytes.Equal(existingSession, []byte("null")) {
|
|
session := map[string]json.RawMessage{}
|
|
if err := json.Unmarshal(existingSession, &session); err != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusBadRequest, "invalid_request_error", "session must be an object", err)
|
|
}
|
|
for key, value := range session {
|
|
if _, exists := root[key]; !exists {
|
|
root[key] = value
|
|
}
|
|
}
|
|
}
|
|
|
|
modelJSON, marshalErr := json.Marshal(normalizedModel)
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode normalized model", marshalErr)
|
|
}
|
|
root["model"] = modelJSON
|
|
delete(root, "session")
|
|
|
|
normalizedBody, marshalErr := json.Marshal(root)
|
|
if marshalErr != nil {
|
|
return nil, "", newRealtimeClientSecretError(fasthttp.StatusInternalServerError, "server_error", "failed to encode realtime request", marshalErr)
|
|
}
|
|
|
|
return normalizedBody, normalizedModel, nil
|
|
}
|
|
|
|
func (provider *OpenAIProvider) realtimeSessionHeaders(
|
|
key schemas.Key,
|
|
endpointType schemas.RealtimeSessionEndpointType,
|
|
) map[string]string {
|
|
headers := map[string]string{
|
|
"Authorization": "Bearer " + key.Value.GetValue(),
|
|
}
|
|
if endpointType == schemas.RealtimeSessionEndpointSessions {
|
|
headers["OpenAI-Beta"] = "realtime=v1"
|
|
}
|
|
for k, v := range provider.networkConfig.ExtraHeaders {
|
|
headers[k] = v
|
|
}
|
|
return headers
|
|
}
|
|
|
|
func realtimeSessionUpstreamPath(endpointType schemas.RealtimeSessionEndpointType) string {
|
|
if endpointType == schemas.RealtimeSessionEndpointSessions {
|
|
return "/v1/realtime/sessions"
|
|
}
|
|
return "/v1/realtime/client_secrets"
|
|
}
|
|
|
|
func newRealtimeClientSecretError(status int, errorType, message string, err error) *schemas.BifrostError {
|
|
return &schemas.BifrostError{
|
|
IsBifrostError: false,
|
|
StatusCode: schemas.Ptr(status),
|
|
Error: &schemas.ErrorField{
|
|
Type: schemas.Ptr(errorType),
|
|
Message: message,
|
|
Error: err,
|
|
},
|
|
ExtraFields: schemas.BifrostErrorExtraFields{
|
|
RequestType: schemas.RealtimeRequest,
|
|
Provider: schemas.OpenAI,
|
|
},
|
|
}
|
|
}
|
|
|
|
// openAIRealtimeEvent mirrors the top-level JSON shape of an OpenAI Realtime
// protocol event. Nested objects are kept as raw JSON so they can be decoded
// (or passed through) separately.
type openAIRealtimeEvent struct {
	// Envelope.
	Type    string `json:"type"`
	EventID string `json:"event_id,omitempty"`

	// Nested objects, left undecoded.
	Session      json.RawMessage `json:"session,omitempty"`
	Conversation json.RawMessage `json:"conversation,omitempty"`
	Item         json.RawMessage `json:"item,omitempty"`
	Response     json.RawMessage `json:"response,omitempty"`
	Part         json.RawMessage `json:"part,omitempty"`

	// Scalar payload fields.
	Delta      string `json:"delta,omitempty"`
	Audio      string `json:"audio,omitempty"`
	Transcript string `json:"transcript,omitempty"`
	Text       string `json:"text,omitempty"`

	Error json.RawMessage `json:"error,omitempty"`

	// Identifiers and positions; the pointer fields distinguish "absent"
	// from zero.
	ItemID       string `json:"item_id,omitempty"`
	OutputIndex  *int   `json:"output_index,omitempty"`
	ContentIndex *int   `json:"content_index,omitempty"`
	ResponseID   string `json:"response_id,omitempty"`
	AudioEndMS   *int   `json:"audio_end_ms,omitempty"`

	PreviousItemID string `json:"previous_item_id,omitempty"`
}
|
|
|
|
// openAIRealtimeSession mirrors the "session" object carried by an OpenAI
// Realtime event. Fields whose JSON shape is not fixed here (token limit, turn
// detection, tools) stay as raw JSON.
type openAIRealtimeSession struct {
	ID           string   `json:"id,omitempty"`
	Model        string   `json:"model,omitempty"`
	Modalities   []string `json:"modalities,omitempty"`
	Instructions string   `json:"instructions,omitempty"`
	Voice        string   `json:"voice,omitempty"`
	// Temperature is a pointer so an absent value differs from 0.
	Temperature      *float64        `json:"temperature,omitempty"`
	MaxOutputTokens  json.RawMessage `json:"max_output_tokens,omitempty"`
	TurnDetection    json.RawMessage `json:"turn_detection,omitempty"`
	InputAudioFormat string          `json:"input_audio_format,omitempty"`
	OutputAudioType  string          `json:"output_audio_type,omitempty"`
	Tools            json.RawMessage `json:"tools,omitempty"`
}
|
|
|
|
// openAIRealtimeItem mirrors the "item" object carried by an OpenAI Realtime
// event. Content is kept as raw JSON; the remaining fields are plain strings.
type openAIRealtimeItem struct {
	ID      string          `json:"id,omitempty"`
	Type    string          `json:"type,omitempty"`
	Role    string          `json:"role,omitempty"`
	Status  string          `json:"status,omitempty"`
	Content json.RawMessage `json:"content,omitempty"`

	// Fields tagged name / call_id / arguments / output.
	Name      string `json:"name,omitempty"`
	CallID    string `json:"call_id,omitempty"`
	Arguments string `json:"arguments,omitempty"`
	Output    string `json:"output,omitempty"`
}
|
|
|
|
// openAIRealtimeError mirrors the "error" object carried by an OpenAI Realtime
// event. Every field is optional and omitted from JSON when empty.
type openAIRealtimeError struct {
	Type    string `json:"type,omitempty"`
	Code    string `json:"code,omitempty"`
	Message string `json:"message,omitempty"`
	Param   string `json:"param,omitempty"`
}
|
|
|
|
// ToBifrostRealtimeEvent converts an OpenAI Realtime event (raw JSON) to the unified Bifrost format.
|
|
func (provider *OpenAIProvider) ToBifrostRealtimeEvent(providerEvent json.RawMessage) (*schemas.BifrostRealtimeEvent, error) {
|
|
var raw openAIRealtimeEvent
|
|
if err := json.Unmarshal(providerEvent, &raw); err != nil {
|
|
return nil, fmt.Errorf("failed to unmarshal OpenAI realtime event: %w", err)
|
|
}
|
|
|
|
event := &schemas.BifrostRealtimeEvent{
|
|
Type: schemas.RealtimeEventType(raw.Type),
|
|
EventID: raw.EventID,
|
|
RawData: providerEvent,
|
|
}
|
|
setRealtimeExtraParam(event, "item_id", raw.ItemID)
|
|
setRealtimeExtraParam(event, "previous_item_id", raw.PreviousItemID)
|
|
setRealtimeExtraParam(event, "output_index", raw.OutputIndex)
|
|
setRealtimeExtraParam(event, "content_index", raw.ContentIndex)
|
|
setRealtimeExtraParam(event, "response_id", raw.ResponseID)
|
|
setRealtimeExtraParam(event, "audio_end_ms", raw.AudioEndMS)
|
|
setRealtimeExtraParam(event, "transcript", raw.Transcript)
|
|
setRealtimeExtraParam(event, "text", raw.Text)
|
|
setRealtimeExtraParam(event, "conversation", raw.Conversation)
|
|
setRealtimeExtraParam(event, "response", raw.Response)
|
|
setRealtimeExtraParam(event, "part", raw.Part)
|
|
|
|
switch {
|
|
case raw.Session != nil:
|
|
var sess openAIRealtimeSession
|
|
if err := json.Unmarshal(raw.Session, &sess); err == nil {
|
|
event.Session = &schemas.RealtimeSession{
|
|
ID: sess.ID,
|
|
Model: sess.Model,
|
|
Modalities: sess.Modalities,
|
|
Instructions: sess.Instructions,
|
|
Voice: sess.Voice,
|
|
Temperature: sess.Temperature,
|
|
MaxOutputTokens: sess.MaxOutputTokens,
|
|
TurnDetection: sess.TurnDetection,
|
|
InputAudioFormat: sess.InputAudioFormat,
|
|
OutputAudioType: sess.OutputAudioType,
|
|
Tools: sess.Tools,
|
|
}
|
|
if extra := extractRealtimeNestedParams(raw.Session, "id", "model", "modalities", "instructions", "voice", "temperature", "max_output_tokens", "turn_detection", "input_audio_format", "output_audio_type", "tools"); len(extra) > 0 {
|
|
event.Session.ExtraParams = extra
|
|
}
|
|
}
|
|
case raw.Item != nil:
|
|
var item openAIRealtimeItem
|
|
if err := json.Unmarshal(raw.Item, &item); err == nil {
|
|
event.Item = &schemas.RealtimeItem{
|
|
ID: item.ID,
|
|
Type: item.Type,
|
|
Role: item.Role,
|
|
Status: item.Status,
|
|
Content: item.Content,
|
|
Name: item.Name,
|
|
CallID: item.CallID,
|
|
Arguments: item.Arguments,
|
|
Output: item.Output,
|
|
}
|
|
if extra := extractRealtimeNestedParams(raw.Item, "id", "type", "role", "status", "content", "name", "call_id", "arguments", "output"); len(extra) > 0 {
|
|
event.Item.ExtraParams = extra
|
|
}
|
|
}
|
|
|
|
case raw.Error != nil:
|
|
var rtErr openAIRealtimeError
|
|
if err := json.Unmarshal(raw.Error, &rtErr); err == nil {
|
|
event.Error = &schemas.RealtimeError{
|
|
Type: rtErr.Type,
|
|
Code: rtErr.Code,
|
|
Message: rtErr.Message,
|
|
Param: rtErr.Param,
|
|
}
|
|
if extra := extractRealtimeNestedParams(raw.Error, "type", "code", "message", "param"); len(extra) > 0 {
|
|
event.Error.ExtraParams = extra
|
|
}
|
|
}
|
|
}
|
|
|
|
if isRealtimeDeltaEvent(raw.Type) {
|
|
event.Delta = &schemas.RealtimeDelta{
|
|
Text: raw.Text,
|
|
Audio: raw.Audio,
|
|
Transcript: raw.Transcript,
|
|
ItemID: raw.ItemID,
|
|
OutputIdx: raw.OutputIndex,
|
|
ContentIdx: raw.ContentIndex,
|
|
ResponseID: raw.ResponseID,
|
|
}
|
|
if raw.Delta != "" {
|
|
if event.Delta.Text == "" {
|
|
event.Delta.Text = raw.Delta
|
|
}
|
|
}
|
|
}
|
|
|
|
return event, nil
|
|
}
|
|
|
|
// ToProviderRealtimeEvent converts a unified Bifrost Realtime event back to OpenAI's native JSON.
|
|
func (provider *OpenAIProvider) ToProviderRealtimeEvent(bifrostEvent *schemas.BifrostRealtimeEvent) (json.RawMessage, error) {
|
|
out := map[string]interface{}{
|
|
"type": string(bifrostEvent.Type),
|
|
}
|
|
if bifrostEvent.EventID != "" {
|
|
out["event_id"] = bifrostEvent.EventID
|
|
}
|
|
mergeRealtimeExtraParams(out, bifrostEvent.ExtraParams)
|
|
|
|
if bifrostEvent.Session != nil {
|
|
sess := map[string]interface{}{}
|
|
if bifrostEvent.Session.ID != "" && bifrostEvent.Type != schemas.RTEventSessionUpdate {
|
|
sess["id"] = bifrostEvent.Session.ID
|
|
}
|
|
if bifrostEvent.Session.Model != "" {
|
|
sess["model"] = bifrostEvent.Session.Model
|
|
}
|
|
if len(bifrostEvent.Session.Modalities) > 0 {
|
|
sess["modalities"] = bifrostEvent.Session.Modalities
|
|
}
|
|
if bifrostEvent.Session.Instructions != "" {
|
|
sess["instructions"] = bifrostEvent.Session.Instructions
|
|
}
|
|
if bifrostEvent.Session.Voice != "" {
|
|
sess["voice"] = bifrostEvent.Session.Voice
|
|
}
|
|
if bifrostEvent.Session.Temperature != nil {
|
|
sess["temperature"] = *bifrostEvent.Session.Temperature
|
|
}
|
|
if bifrostEvent.Session.MaxOutputTokens != nil {
|
|
sess["max_output_tokens"] = bifrostEvent.Session.MaxOutputTokens
|
|
}
|
|
if bifrostEvent.Session.TurnDetection != nil {
|
|
sess["turn_detection"] = bifrostEvent.Session.TurnDetection
|
|
}
|
|
if bifrostEvent.Session.InputAudioFormat != "" {
|
|
sess["input_audio_format"] = bifrostEvent.Session.InputAudioFormat
|
|
}
|
|
if bifrostEvent.Session.OutputAudioType != "" {
|
|
sess["output_audio_type"] = bifrostEvent.Session.OutputAudioType
|
|
}
|
|
if bifrostEvent.Session.Tools != nil {
|
|
sess["tools"] = bifrostEvent.Session.Tools
|
|
}
|
|
mergeRealtimeSessionExtraParams(sess, bifrostEvent.Session.ExtraParams, bifrostEvent.Type)
|
|
out["session"] = sess
|
|
}
|
|
|
|
if bifrostEvent.Item != nil {
|
|
item := map[string]interface{}{
|
|
"type": bifrostEvent.Item.Type,
|
|
}
|
|
if bifrostEvent.Item.ID != "" {
|
|
item["id"] = bifrostEvent.Item.ID
|
|
}
|
|
if bifrostEvent.Item.Role != "" {
|
|
item["role"] = bifrostEvent.Item.Role
|
|
}
|
|
if bifrostEvent.Item.Status != "" {
|
|
item["status"] = bifrostEvent.Item.Status
|
|
}
|
|
if bifrostEvent.Item.Content != nil {
|
|
item["content"] = bifrostEvent.Item.Content
|
|
}
|
|
if bifrostEvent.Item.Name != "" {
|
|
item["name"] = bifrostEvent.Item.Name
|
|
}
|
|
if bifrostEvent.Item.CallID != "" {
|
|
item["call_id"] = bifrostEvent.Item.CallID
|
|
}
|
|
if bifrostEvent.Item.Arguments != "" {
|
|
item["arguments"] = bifrostEvent.Item.Arguments
|
|
}
|
|
if bifrostEvent.Item.Output != "" {
|
|
item["output"] = bifrostEvent.Item.Output
|
|
}
|
|
mergeRealtimeExtraParams(item, bifrostEvent.Item.ExtraParams)
|
|
out["item"] = item
|
|
}
|
|
|
|
if bifrostEvent.Error != nil {
|
|
rtErr := map[string]interface{}{}
|
|
if bifrostEvent.Error.Type != "" {
|
|
rtErr["type"] = bifrostEvent.Error.Type
|
|
}
|
|
if bifrostEvent.Error.Code != "" {
|
|
rtErr["code"] = bifrostEvent.Error.Code
|
|
}
|
|
if bifrostEvent.Error.Message != "" {
|
|
rtErr["message"] = bifrostEvent.Error.Message
|
|
}
|
|
if bifrostEvent.Error.Param != "" {
|
|
rtErr["param"] = bifrostEvent.Error.Param
|
|
}
|
|
mergeRealtimeExtraParams(rtErr, bifrostEvent.Error.ExtraParams)
|
|
out["error"] = rtErr
|
|
}
|
|
|
|
if bifrostEvent.Delta != nil {
|
|
if bifrostEvent.Delta.Text != "" {
|
|
out["delta"] = bifrostEvent.Delta.Text
|
|
}
|
|
if bifrostEvent.Delta.Audio != "" {
|
|
out["audio"] = bifrostEvent.Delta.Audio
|
|
}
|
|
if bifrostEvent.Delta.Transcript != "" {
|
|
out["transcript"] = bifrostEvent.Delta.Transcript
|
|
}
|
|
if bifrostEvent.Delta.ItemID != "" && !hasRealtimeExtraParam(bifrostEvent.ExtraParams, "item_id") {
|
|
out["item_id"] = bifrostEvent.Delta.ItemID
|
|
}
|
|
if bifrostEvent.Delta.OutputIdx != nil && !hasRealtimeExtraParam(bifrostEvent.ExtraParams, "output_index") {
|
|
out["output_index"] = *bifrostEvent.Delta.OutputIdx
|
|
}
|
|
if bifrostEvent.Delta.ContentIdx != nil && !hasRealtimeExtraParam(bifrostEvent.ExtraParams, "content_index") {
|
|
out["content_index"] = *bifrostEvent.Delta.ContentIdx
|
|
}
|
|
if bifrostEvent.Delta.ResponseID != "" && !hasRealtimeExtraParam(bifrostEvent.ExtraParams, "response_id") {
|
|
out["response_id"] = bifrostEvent.Delta.ResponseID
|
|
}
|
|
}
|
|
|
|
return providerUtils.MarshalSorted(out)
|
|
}
|
|
|
|
func mergeRealtimeSessionExtraParams(out map[string]interface{}, params map[string]json.RawMessage, eventType schemas.RealtimeEventType) {
|
|
filtered := params
|
|
if eventType == schemas.RTEventSessionUpdate && len(params) > 0 {
|
|
filtered = make(map[string]json.RawMessage, len(params))
|
|
for key, value := range params {
|
|
switch key {
|
|
case "id", "object", "expires_at", "client_secret":
|
|
continue
|
|
default:
|
|
filtered[key] = value
|
|
}
|
|
}
|
|
}
|
|
mergeRealtimeExtraParams(out, filtered)
|
|
}
|
|
|
|
func (provider *OpenAIProvider) ExtractRealtimeTurnUsage(terminalEventRaw []byte) *schemas.BifrostLLMUsage {
|
|
if len(terminalEventRaw) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var parsed openAIRealtimeResponseDoneEnvelope
|
|
if err := json.Unmarshal(terminalEventRaw, &parsed); err != nil || parsed.Response.Usage == nil {
|
|
return nil
|
|
}
|
|
|
|
usage := &schemas.BifrostLLMUsage{
|
|
PromptTokens: parsed.Response.Usage.InputTokens,
|
|
CompletionTokens: parsed.Response.Usage.OutputTokens,
|
|
TotalTokens: parsed.Response.Usage.TotalTokens,
|
|
}
|
|
|
|
if parsed.Response.Usage.InputTokenDetails != nil {
|
|
usage.PromptTokensDetails = &schemas.ChatPromptTokensDetails{
|
|
TextTokens: parsed.Response.Usage.InputTokenDetails.TextTokens,
|
|
AudioTokens: parsed.Response.Usage.InputTokenDetails.AudioTokens,
|
|
ImageTokens: parsed.Response.Usage.InputTokenDetails.ImageTokens,
|
|
CachedReadTokens: parsed.Response.Usage.InputTokenDetails.CachedTokens,
|
|
}
|
|
}
|
|
|
|
if parsed.Response.Usage.OutputTokenDetails != nil {
|
|
usage.CompletionTokensDetails = &schemas.ChatCompletionTokensDetails{
|
|
TextTokens: parsed.Response.Usage.OutputTokenDetails.TextTokens,
|
|
AudioTokens: parsed.Response.Usage.OutputTokenDetails.AudioTokens,
|
|
ReasoningTokens: parsed.Response.Usage.OutputTokenDetails.ReasoningTokens,
|
|
ImageTokens: parsed.Response.Usage.OutputTokenDetails.ImageTokens,
|
|
CitationTokens: parsed.Response.Usage.OutputTokenDetails.CitationTokens,
|
|
NumSearchQueries: parsed.Response.Usage.OutputTokenDetails.NumSearchQueries,
|
|
AcceptedPredictionTokens: parsed.Response.Usage.OutputTokenDetails.AcceptedPredictionTokens,
|
|
RejectedPredictionTokens: parsed.Response.Usage.OutputTokenDetails.RejectedPredictionTokens,
|
|
}
|
|
}
|
|
|
|
return usage
|
|
}
|
|
|
|
func (provider *OpenAIProvider) ExtractRealtimeTurnOutput(terminalEventRaw []byte) *schemas.ChatMessage {
|
|
if len(terminalEventRaw) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var parsed openAIRealtimeResponseDoneEnvelope
|
|
if err := json.Unmarshal(terminalEventRaw, &parsed); err != nil {
|
|
return nil
|
|
}
|
|
|
|
content := extractOpenAIRealtimeResponseDoneAssistantText(parsed.Response.Output)
|
|
toolCalls := extractOpenAIRealtimeResponseDoneToolCalls(parsed.Response.Output)
|
|
if content == "" && len(toolCalls) == 0 {
|
|
return nil
|
|
}
|
|
|
|
message := &schemas.ChatMessage{Role: schemas.ChatMessageRoleAssistant}
|
|
if content != "" {
|
|
message.Content = &schemas.ChatMessageContent{ContentStr: schemas.Ptr(content)}
|
|
}
|
|
if len(toolCalls) > 0 {
|
|
message.ChatAssistantMessage = &schemas.ChatAssistantMessage{ToolCalls: toolCalls}
|
|
}
|
|
|
|
return message
|
|
}
|
|
|
|
type openAIRealtimeResponseDoneEnvelope struct {
|
|
Response struct {
|
|
Output []openAIRealtimeResponseDoneOutput `json:"output"`
|
|
Usage *openAIRealtimeResponseDoneUsage `json:"usage"`
|
|
} `json:"response"`
|
|
}
|
|
|
|
type openAIRealtimeResponseDoneOutput struct {
|
|
ID string `json:"id"`
|
|
Type string `json:"type"`
|
|
Name string `json:"name"`
|
|
CallID string `json:"call_id"`
|
|
Arguments string `json:"arguments"`
|
|
Content []openAIRealtimeResponseDoneBlock `json:"content"`
|
|
}
|
|
|
|
// openAIRealtimeResponseDoneBlock is one content block of a message output.
// It exposes the three textual fields a block may carry: text, transcript and
// refusal.
type openAIRealtimeResponseDoneBlock struct {
	Text       string `json:"text"`
	Transcript string `json:"transcript"`
	Refusal    string `json:"refusal"`
}
|
|
|
|
type openAIRealtimeResponseDoneUsage struct {
|
|
TotalTokens int `json:"total_tokens"`
|
|
InputTokens int `json:"input_tokens"`
|
|
OutputTokens int `json:"output_tokens"`
|
|
InputTokenDetails *openAIRealtimeResponseDoneInputTokenUsage `json:"input_token_details"`
|
|
OutputTokenDetails *openAIRealtimeResponseDoneOutputTokenUsage `json:"output_token_details"`
|
|
}
|
|
|
|
// openAIRealtimeResponseDoneInputTokenUsage is the input_token_details block
// of a terminal event's usage: input token counts for text, audio, image and
// cached tokens.
type openAIRealtimeResponseDoneInputTokenUsage struct {
	TextTokens   int `json:"text_tokens"`
	AudioTokens  int `json:"audio_tokens"`
	ImageTokens  int `json:"image_tokens"`
	CachedTokens int `json:"cached_tokens"`
}
|
|
|
|
// openAIRealtimeResponseDoneOutputTokenUsage is the output_token_details block
// of a terminal event's usage. The pointer-typed counters stay nil when the
// upstream omits them.
type openAIRealtimeResponseDoneOutputTokenUsage struct {
	TextTokens      int `json:"text_tokens"`
	AudioTokens     int `json:"audio_tokens"`
	ReasoningTokens int `json:"reasoning_tokens"`

	// Optional counters: nil means "not reported".
	ImageTokens      *int `json:"image_tokens"`
	CitationTokens   *int `json:"citation_tokens"`
	NumSearchQueries *int `json:"num_search_queries"`

	AcceptedPredictionTokens int `json:"accepted_prediction_tokens"`
	RejectedPredictionTokens int `json:"rejected_prediction_tokens"`
}
|
|
|
|
func extractOpenAIRealtimeResponseDoneAssistantText(outputs []openAIRealtimeResponseDoneOutput) string {
|
|
var sb strings.Builder
|
|
for _, output := range outputs {
|
|
if output.Type != "message" {
|
|
continue
|
|
}
|
|
for _, block := range output.Content {
|
|
switch {
|
|
case strings.TrimSpace(block.Text) != "":
|
|
sb.WriteString(block.Text)
|
|
case strings.TrimSpace(block.Transcript) != "":
|
|
sb.WriteString(block.Transcript)
|
|
case strings.TrimSpace(block.Refusal) != "":
|
|
sb.WriteString(block.Refusal)
|
|
}
|
|
}
|
|
}
|
|
return strings.TrimSpace(sb.String())
|
|
}
|
|
|
|
func extractOpenAIRealtimeResponseDoneToolCalls(outputs []openAIRealtimeResponseDoneOutput) []schemas.ChatAssistantMessageToolCall {
|
|
toolCalls := make([]schemas.ChatAssistantMessageToolCall, 0)
|
|
for _, output := range outputs {
|
|
if output.Type != "function_call" {
|
|
continue
|
|
}
|
|
|
|
name := strings.TrimSpace(output.Name)
|
|
if name == "" {
|
|
continue
|
|
}
|
|
|
|
toolType := "function"
|
|
id := strings.TrimSpace(output.CallID)
|
|
if id == "" {
|
|
id = strings.TrimSpace(output.ID)
|
|
}
|
|
|
|
toolCall := schemas.ChatAssistantMessageToolCall{
|
|
Index: uint16(len(toolCalls)),
|
|
Type: &toolType,
|
|
Function: schemas.ChatAssistantMessageToolCallFunction{
|
|
Name: schemas.Ptr(name),
|
|
Arguments: output.Arguments,
|
|
},
|
|
}
|
|
if id != "" {
|
|
toolCall.ID = schemas.Ptr(id)
|
|
}
|
|
|
|
toolCalls = append(toolCalls, toolCall)
|
|
}
|
|
return toolCalls
|
|
}
|
|
|
|
func setRealtimeExtraParam(event *schemas.BifrostRealtimeEvent, key string, value any) {
|
|
if event == nil || key == "" || value == nil {
|
|
return
|
|
}
|
|
|
|
switch v := value.(type) {
|
|
case string:
|
|
if v == "" {
|
|
return
|
|
}
|
|
case *int:
|
|
if v == nil {
|
|
return
|
|
}
|
|
case json.RawMessage:
|
|
if len(v) == 0 || string(v) == "null" {
|
|
return
|
|
}
|
|
}
|
|
|
|
raw, err := json.Marshal(value)
|
|
if err != nil {
|
|
return
|
|
}
|
|
if event.ExtraParams == nil {
|
|
event.ExtraParams = make(map[string]json.RawMessage)
|
|
}
|
|
event.ExtraParams[key] = raw
|
|
}
|
|
|
|
// mergeRealtimeExtraParams decodes each raw JSON value in params and writes it
// into out under the same key, replacing any existing entry. Empty values and
// values that are not valid JSON are skipped; a JSON null is stored as nil.
func mergeRealtimeExtraParams(out map[string]interface{}, params map[string]json.RawMessage) {
	for name, encoded := range params {
		if len(encoded) > 0 {
			var decoded any
			if json.Unmarshal(encoded, &decoded) == nil {
				out[name] = decoded
			}
		}
	}
}
|
|
|
|
// hasRealtimeExtraParam reports whether params holds a non-empty raw value
// under key. A nil map, a missing key and an empty value all yield false.
func hasRealtimeExtraParam(params map[string]json.RawMessage, key string) bool {
	// Indexing a nil map or a missing key yields a nil RawMessage of length 0.
	return len(params[key]) > 0
}
|
|
|
|
// extractRealtimeNestedParams decodes raw as a JSON object and returns its
// fields minus knownKeys. It returns nil when raw is empty, is not a JSON
// object, or has no fields left after the known keys are removed.
func extractRealtimeNestedParams(raw json.RawMessage, knownKeys ...string) map[string]json.RawMessage {
	if len(raw) == 0 {
		return nil
	}

	var fields map[string]json.RawMessage
	if json.Unmarshal(raw, &fields) != nil {
		return nil
	}
	for _, known := range knownKeys {
		delete(fields, known)
	}
	if len(fields) == 0 {
		return nil
	}
	return fields
}
|
|
|
|
// isRealtimeDeltaEvent reports whether eventType is one of the recognized
// delta event names (text, audio and audio-transcript deltas under both naming
// schemes, plus input audio transcription deltas). The match is exact and
// case-sensitive.
func isRealtimeDeltaEvent(eventType string) bool {
	deltaEvents := [...]string{
		"response.text.delta",
		"response.output_text.delta",
		"response.audio.delta",
		"response.output_audio.delta",
		"response.audio_transcript.delta",
		"response.output_audio_transcript.delta",
		"conversation.item.input_audio_transcription.delta",
	}
	for _, name := range deltaEvents {
		if eventType == name {
			return true
		}
	}
	return false
}
|