package handlers

import (
	"encoding/json"
	"sort"
	"strings"

	"github.com/bytedance/sonic"
	"github.com/maximhq/bifrost/core/schemas"
	bfws "github.com/maximhq/bifrost/transports/bifrost-http/websocket"
)

// realtimeTurnSource is a string tag identifying where a realtime turn came from.
// NOTE(review): the meaning of the "ei" and "lm" values is not evident from this
// file; confirm against the callers before documenting them further.
type realtimeTurnSource string

const (
	realtimeTurnSourceEI realtimeTurnSource = "ei"
	realtimeTurnSourceLM realtimeTurnSource = "lm"
)

const (
	// realtimeMissingTranscriptText is the placeholder summary returned when an
	// audio input has no usable transcript.
	realtimeMissingTranscriptText = "[Audio transcription unavailable]"
)

// extractRealtimeTurnSummary returns a trimmed, human-readable summary for a
// realtime event.
//
// The first non-blank candidate wins, in this order: contentOverride, the
// event's error message, the delta text, the delta transcript, the item
// summary, the session instructions, and finally the raw event payload.
// It returns "" when event is nil (and no override is given) or when nothing
// usable is found.
func extractRealtimeTurnSummary(event *schemas.BifrostRealtimeEvent, contentOverride string) string {
	if override := strings.TrimSpace(contentOverride); override != "" {
		return override
	}
	if event == nil {
		return ""
	}
	if event.Error != nil {
		if message := strings.TrimSpace(event.Error.Message); message != "" {
			return message
		}
	}
	if event.Delta != nil {
		if text := strings.TrimSpace(event.Delta.Text); text != "" {
			return text
		}
		if transcript := strings.TrimSpace(event.Delta.Transcript); transcript != "" {
			return transcript
		}
	}
	if event.Item != nil {
		if summary := extractRealtimeItemSummary(event.Item); summary != "" {
			return summary
		}
	}
	if event.Session != nil {
		if instructions := strings.TrimSpace(event.Session.Instructions); instructions != "" {
			return instructions
		}
	}
	if len(event.RawData) > 0 {
		return strings.TrimSpace(string(event.RawData))
	}
	return ""
}

// extractRealtimeItemSummary summarizes a realtime item. It prefers the text
// found in the item's content and otherwise falls back to the first non-blank
// value among Output, Arguments and Name. A nil item yields "".
func extractRealtimeItemSummary(item *schemas.RealtimeItem) string {
	if item == nil {
		return ""
	}
	if summary := extractRealtimeContentSummary(item.Content); summary != "" {
		return summary
	}
	for _, candidate := range []string{item.Output, item.Arguments, item.Name} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return ""
}

// extractRealtimeContentSummary decodes raw as JSON and joins every text
// fragment found in it with single spaces. When raw is not valid JSON the
// trimmed raw bytes are returned as-is. Empty input yields "".
func extractRealtimeContentSummary(raw []byte) string {
	if len(raw) == 0 {
		return ""
	}
	var decoded any
	if err := sonic.Unmarshal(raw, &decoded); err != nil {
		return strings.TrimSpace(string(raw))
	}
	var parts []string
	collectRealtimeTextFragments(decoded, &parts)
	return strings.Join(parts, " ")
}

// collectRealtimeTextFragments walks a decoded JSON value and appends every
// non-blank string stored under a text-bearing key ("text", "transcript",
// "input_text", "output_text", "output", "arguments") to parts. Any other
// value, including a non-string value under one of those keys, is descended
// into recursively.
//
// Object keys are visited in sorted order: Go randomizes map iteration, so
// ranging over the map directly would make the order of the collected
// fragments (and therefore the joined summary) vary between runs.
func collectRealtimeTextFragments(value any, parts *[]string) {
	switch v := value.(type) {
	case map[string]any:
		keys := make([]string, 0, len(v))
		for key := range v {
			keys = append(keys, key)
		}
		sort.Strings(keys)

		for _, key := range keys {
			field := v[key]
			switch key {
			case "text", "transcript", "input_text", "output_text", "output", "arguments":
				if text, ok := field.(string); ok {
					if text = strings.TrimSpace(text); text != "" {
						*parts = append(*parts, text)
					}
					// A string under a text key is a leaf; nothing to recurse into.
					continue
				}
			}
			collectRealtimeTextFragments(field, parts)
		}
	case []any:
		for _, item := range v {
			collectRealtimeTextFragments(item, parts)
		}
	}
}

// finalizedRealtimeInputSummary returns the summary of a user-input event.
//
// For an input-audio-transcription-completed event it returns the "transcript"
// extra parameter, or the missing-transcript placeholder when that is blank.
// For any other event recognized by schemas.IsRealtimeUserInputEvent it returns
// the item summary; if that summary is empty on a conversation-item-done event
// whose audio content lacks a transcript, the placeholder is returned instead.
// Everything else (including a nil event) yields "".
func finalizedRealtimeInputSummary(event *schemas.BifrostRealtimeEvent) string {
	if event == nil {
		return ""
	}
	if event.Type == schemas.RTEventInputAudioTransCompleted {
		if transcript := extractRealtimeExtraParamString(event, "transcript"); transcript != "" {
			return transcript
		}
		return realtimeMissingTranscriptText
	}
	if !schemas.IsRealtimeUserInputEvent(event) {
		return ""
	}

	summary := extractRealtimeItemSummary(event.Item)
	if summary == "" &&
		event.Type == schemas.RTEventConversationItemDone &&
		realtimeItemHasMissingAudioTranscript(event.Item) {
		return realtimeMissingTranscriptText
	}
	return summary
}

// pendingRealtimeInputUpdate returns the item ID and finalized input summary
// for events that carry user input. It returns two empty strings for a nil
// event, for conversation-item-retrieved events, and for events that are
// neither a completed input-audio transcription nor a user-input event.
func pendingRealtimeInputUpdate(event *schemas.BifrostRealtimeEvent) (string, string) {
	if event == nil || event.Type == schemas.RTEventConversationItemRetrieved {
		return "", ""
	}
	if event.Type == schemas.RTEventInputAudioTransCompleted || schemas.IsRealtimeUserInputEvent(event) {
		return realtimeEventItemID(event), finalizedRealtimeInputSummary(event)
	}
	return "", ""
}

// realtimeItemHasMissingAudioTranscript reports whether the item's content,
// decoded as a JSON array of objects, contains an "input_audio" part whose
// "transcript" is absent, null, or a blank string. It returns false for a nil
// item, empty content, or content that does not decode into that shape.
func realtimeItemHasMissingAudioTranscript(item *schemas.RealtimeItem) bool {
	if item == nil || len(item.Content) == 0 {
		return false
	}
	var decoded []map[string]any
	if err := sonic.Unmarshal(item.Content, &decoded); err != nil {
		return false
	}
	for _, part := range decoded {
		if partType, _ := part["type"].(string); partType != "input_audio" {
			continue
		}
		transcript, exists := part["transcript"]
		if !exists || transcript == nil {
			return true
		}
		if text, ok := transcript.(string); ok && strings.TrimSpace(text) == "" {
			return true
		}
	}
	return false
}

// finalizedRealtimeToolOutputSummary returns the item summary for events
// recognized by schemas.IsRealtimeToolOutputEvent, and "" otherwise.
func finalizedRealtimeToolOutputSummary(event *schemas.BifrostRealtimeEvent) string {
	if !schemas.IsRealtimeToolOutputEvent(event) {
		return ""
	}
	return extractRealtimeItemSummary(event.Item)
}

// pendingRealtimeToolOutputUpdate returns the item ID and tool-output summary
// for a tool-output event. It returns two empty strings for a nil event, a
// conversation-item-retrieved event, or an event that is not a tool output.
func pendingRealtimeToolOutputUpdate(event *schemas.BifrostRealtimeEvent) (string, string) {
	if event == nil ||
		event.Type == schemas.RTEventConversationItemRetrieved ||
		!schemas.IsRealtimeToolOutputEvent(event) {
		return "", ""
	}
	return realtimeEventItemID(event), finalizedRealtimeToolOutputSummary(event)
}

// extractRealtimeExtraParamString reads event.ExtraParams[key], decodes it as a
// JSON string and returns it trimmed. It returns "" when the event or its extra
// params are nil, the key is missing or empty, or the value is not a JSON
// string.
func extractRealtimeExtraParamString(event *schemas.BifrostRealtimeEvent, key string) string {
	if event == nil || event.ExtraParams == nil {
		return ""
	}
	raw, ok := event.ExtraParams[key]
	if !ok || len(raw) == 0 {
		return ""
	}
	var value string
	if err := json.Unmarshal(raw, &value); err != nil {
		// Best effort: a non-string value is treated the same as a missing one.
		return ""
	}
	return strings.TrimSpace(value)
}

// realtimeEventItemID returns the trimmed item ID associated with an event,
// taking the first non-blank value among the item's ID, the delta's item ID and
// the "item_id" extra parameter. A nil event yields "".
func realtimeEventItemID(event *schemas.BifrostRealtimeEvent) string {
	if event == nil {
		return ""
	}
	if event.Item != nil {
		if id := strings.TrimSpace(event.Item.ID); id != "" {
			return id
		}
	}
	if event.Delta != nil {
		if id := strings.TrimSpace(event.Delta.ItemID); id != "" {
			return id
		}
	}
	return extractRealtimeExtraParamString(event, "item_id")
}

// combineRealtimeInputRaw joins the trimmed, non-blank Raw values of the given
// turn inputs, separated by a blank line.
func combineRealtimeInputRaw(turnInputs []bfws.RealtimeTurnInput) string {
	parts := make([]string, 0, len(turnInputs))
	for _, turnInput := range turnInputs {
		if trimmed := strings.TrimSpace(turnInput.Raw); trimmed != "" {
			parts = append(parts, trimmed)
		}
	}
	return strings.Join(parts, "\n\n")
}

// realtimeResponseDoneEnvelope is the subset of a message with a top-level
// "response" object (its "output" and "usage") that this file decodes.
type realtimeResponseDoneEnvelope struct {
	Response struct {
		Output []realtimeResponseDoneOutput `json:"output"`
		Usage  *realtimeResponseDoneUsage   `json:"usage"`
	} `json:"response"`
}

// realtimeResponseDoneOutput is one entry of the response "output" array. This
// file reads entries whose Type is "message" or "function_call".
type realtimeResponseDoneOutput struct {
	ID        string                        `json:"id"`
	Type      string                        `json:"type"`
	Name      string                        `json:"name"`
	CallID    string                        `json:"call_id"`
	Arguments string                        `json:"arguments"`
	Content   []realtimeResponseDoneContent `json:"content"`
}

// realtimeResponseDoneContent is one content block of an output entry.
type realtimeResponseDoneContent struct {
	Type       string `json:"type"`
	Text       string `json:"text"`
	Transcript string `json:"transcript"`
	Refusal    string `json:"refusal"`
}

// realtimeResponseDoneUsage mirrors the response "usage" object.
type realtimeResponseDoneUsage struct {
	TotalTokens        int                                   `json:"total_tokens"`
	InputTokens        int                                   `json:"input_tokens"`
	OutputTokens       int                                   `json:"output_tokens"`
	InputTokenDetails  *realtimeResponseDoneInputTokenUsage  `json:"input_token_details"`
	OutputTokenDetails *realtimeResponseDoneOutputTokenUsage `json:"output_token_details"`
}

// realtimeResponseDoneInputTokenUsage mirrors "usage.input_token_details".
type realtimeResponseDoneInputTokenUsage struct {
	TextTokens   int `json:"text_tokens"`
	AudioTokens  int `json:"audio_tokens"`
	ImageTokens  int `json:"image_tokens"`
	CachedTokens int `json:"cached_tokens"`
}

// realtimeResponseDoneOutputTokenUsage mirrors "usage.output_token_details".
type realtimeResponseDoneOutputTokenUsage struct {
	TextTokens               int  `json:"text_tokens"`
	AudioTokens              int  `json:"audio_tokens"`
	ReasoningTokens          int  `json:"reasoning_tokens"`
	ImageTokens              *int `json:"image_tokens"`
	CitationTokens           *int `json:"citation_tokens"`
	NumSearchQueries         *int `json:"num_search_queries"`
	AcceptedPredictionTokens int  `json:"accepted_prediction_tokens"`
	RejectedPredictionTokens int  `json:"rejected_prediction_tokens"`
}

// extractRealtimeTurnUsage returns the token usage for a turn. If provider
// implements schemas.RealtimeUsageExtractor and yields a non-nil usage, that
// value is returned; otherwise the usage is parsed from rawMessage with
// extractRealtimeResponseDoneUsage.
func extractRealtimeTurnUsage(provider schemas.RealtimeProvider, rawMessage []byte) *schemas.BifrostLLMUsage {
	if extractor, ok := provider.(schemas.RealtimeUsageExtractor); ok {
		if usage := extractor.ExtractRealtimeTurnUsage(rawMessage); usage != nil {
			return usage
		}
	}
	return extractRealtimeResponseDoneUsage(rawMessage)
}

// extractRealtimeTurnOutputMessage returns the assistant message for a turn.
// If provider implements schemas.RealtimeUsageExtractor and yields a non-nil
// message, that message is returned; when its string content is missing or
// blank and contentSummary is not, the content is replaced with the trimmed
// summary. Otherwise the message is built by buildRealtimeAssistantLogMessage.
func extractRealtimeTurnOutputMessage(provider schemas.RealtimeProvider, rawMessage []byte, contentSummary string) *schemas.ChatMessage {
	if extractor, ok := provider.(schemas.RealtimeUsageExtractor); ok {
		if message := extractor.ExtractRealtimeTurnOutput(rawMessage); message != nil {
			summary := strings.TrimSpace(contentSummary)
			hasText := message.Content != nil &&
				message.Content.ContentStr != nil &&
				strings.TrimSpace(*message.Content.ContentStr) != ""
			if summary != "" && !hasText {
				message.Content = &schemas.ChatMessageContent{ContentStr: schemas.Ptr(summary)}
			}
			return message
		}
	}
	return buildRealtimeAssistantLogMessage(rawMessage, contentSummary)
}

// buildRealtimeAssistantLogMessage builds an assistant chat message from a raw
// message and an optional content summary.
//
// When rawMessage decodes successfully, the message text is contentSummary (or,
// if that is blank, the text extracted from the response output) and any
// function calls in the output become tool calls. If decoding fails or produces
// neither text nor tool calls, a text-only message is built from contentSummary.
// It returns nil when there is nothing to report.
func buildRealtimeAssistantLogMessage(rawMessage []byte, contentSummary string) *schemas.ChatMessage {
	contentSummary = strings.TrimSpace(contentSummary)

	var parsed realtimeResponseDoneEnvelope
	if len(rawMessage) > 0 && sonic.Unmarshal(rawMessage, &parsed) == nil {
		message := &schemas.ChatMessage{Role: schemas.ChatMessageRoleAssistant}
		if contentSummary == "" {
			contentSummary = extractRealtimeResponseDoneAssistantText(parsed.Response.Output)
		}
		if contentSummary != "" {
			message.Content = &schemas.ChatMessageContent{ContentStr: schemas.Ptr(contentSummary)}
		}
		if toolCalls := extractRealtimeResponseDoneToolCalls(parsed.Response.Output); len(toolCalls) > 0 {
			message.ChatAssistantMessage = &schemas.ChatAssistantMessage{
				ToolCalls: toolCalls,
			}
		}
		if message.Content != nil || message.ChatAssistantMessage != nil {
			return message
		}
	}

	if contentSummary == "" {
		return nil
	}
	return &schemas.ChatMessage{
		Role:    schemas.ChatMessageRoleAssistant,
		Content: &schemas.ChatMessageContent{ContentStr: schemas.Ptr(contentSummary)},
	}
}

// extractRealtimeResponseDoneAssistantText joins, with single spaces, one
// fragment per content block of every "message" output: the block's trimmed
// Text, else Transcript, else Refusal. Blocks with none of these are skipped.
func extractRealtimeResponseDoneAssistantText(outputs []realtimeResponseDoneOutput) string {
	var parts []string
	for _, output := range outputs {
		if output.Type != "message" {
			continue
		}
		for _, block := range output.Content {
			switch {
			case strings.TrimSpace(block.Text) != "":
				parts = append(parts, strings.TrimSpace(block.Text))
			case strings.TrimSpace(block.Transcript) != "":
				parts = append(parts, strings.TrimSpace(block.Transcript))
			case strings.TrimSpace(block.Refusal) != "":
				parts = append(parts, strings.TrimSpace(block.Refusal))
			}
		}
	}
	return strings.Join(parts, " ")
}

// extractRealtimeResponseDoneToolCalls converts every "function_call" output
// with a non-blank name into a tool call. The tool call ID is the trimmed
// CallID, falling back to the trimmed ID, and is left unset when both are
// blank. Index counts the tool calls emitted so far. The returned slice is
// never nil.
func extractRealtimeResponseDoneToolCalls(outputs []realtimeResponseDoneOutput) []schemas.ChatAssistantMessageToolCall {
	toolCalls := make([]schemas.ChatAssistantMessageToolCall, 0, len(outputs))
	for _, output := range outputs {
		if output.Type != "function_call" {
			continue
		}
		name := strings.TrimSpace(output.Name)
		if name == "" {
			continue
		}

		// Declared per iteration so each tool call points at its own string.
		toolType := "function"
		toolCall := schemas.ChatAssistantMessageToolCall{
			Index: uint16(len(toolCalls)),
			Type:  &toolType,
			Function: schemas.ChatAssistantMessageToolCallFunction{
				Name:      schemas.Ptr(name),
				Arguments: output.Arguments,
			},
		}

		id := strings.TrimSpace(output.CallID)
		if id == "" {
			id = strings.TrimSpace(output.ID)
		}
		if id != "" {
			toolCall.ID = schemas.Ptr(id)
		}
		toolCalls = append(toolCalls, toolCall)
	}
	return toolCalls
}

// extractRealtimeResponseDoneUsage parses the "response.usage" object of
// rawMessage into a BifrostLLMUsage. It returns nil when rawMessage is empty,
// is not valid JSON, or carries no usage. When the reported total is zero but
// input or output tokens are present, the total is computed as their sum.
func extractRealtimeResponseDoneUsage(rawMessage []byte) *schemas.BifrostLLMUsage {
	if len(rawMessage) == 0 {
		return nil
	}
	var parsed realtimeResponseDoneEnvelope
	if err := sonic.Unmarshal(rawMessage, &parsed); err != nil || parsed.Response.Usage == nil {
		return nil
	}
	reported := parsed.Response.Usage

	totalTokens := reported.TotalTokens
	if totalTokens == 0 && (reported.InputTokens > 0 || reported.OutputTokens > 0) {
		totalTokens = reported.InputTokens + reported.OutputTokens
	}

	usage := &schemas.BifrostLLMUsage{
		PromptTokens:     reported.InputTokens,
		CompletionTokens: reported.OutputTokens,
		TotalTokens:      totalTokens,
	}
	if in := reported.InputTokenDetails; in != nil {
		usage.PromptTokensDetails = &schemas.ChatPromptTokensDetails{
			TextTokens:       in.TextTokens,
			AudioTokens:      in.AudioTokens,
			ImageTokens:      in.ImageTokens,
			CachedReadTokens: in.CachedTokens,
		}
	}
	if out := reported.OutputTokenDetails; out != nil {
		usage.CompletionTokensDetails = &schemas.ChatCompletionTokensDetails{
			TextTokens:               out.TextTokens,
			AudioTokens:              out.AudioTokens,
			ReasoningTokens:          out.ReasoningTokens,
			ImageTokens:              out.ImageTokens,
			CitationTokens:           out.CitationTokens,
			NumSearchQueries:         out.NumSearchQueries,
			AcceptedPredictionTokens: out.AcceptedPredictionTokens,
			RejectedPredictionTokens: out.RejectedPredictionTokens,
		}
	}
	return usage
}