// Package tracing provides distributed tracing infrastructure for Bifrost package tracing import ( "context" "strings" "sync" "sync/atomic" "time" "github.com/maximhq/bifrost/core/schemas" "github.com/maximhq/bifrost/framework/modelcatalog" "github.com/maximhq/bifrost/framework/streaming" ) // Tracer implements schemas.Tracer using TraceStore. // It provides the bridge between the core Tracer interface and the // framework's TraceStore implementation. // It also embeds a streaming.Accumulator for centralized streaming chunk accumulation. type Tracer struct { store *TraceStore accumulator *streaming.Accumulator pricingManager *modelcatalog.ModelCatalog logger schemas.Logger obsPlugins atomic.Pointer[[]schemas.ObservabilityPlugin] flushWG sync.WaitGroup } // NewTracer creates a new Tracer wrapping the given TraceStore. // The accumulator is embedded for centralized streaming chunk accumulation. // The pricingManager is used for cost calculation in span attributes. func NewTracer(store *TraceStore, pricingManager *modelcatalog.ModelCatalog, logger schemas.Logger) *Tracer { return &Tracer{ store: store, accumulator: streaming.NewAccumulator(pricingManager, logger), pricingManager: pricingManager, logger: logger, obsPlugins: atomic.Pointer[[]schemas.ObservabilityPlugin]{}, } } // SetObservabilityPlugins updates the plugins that receive completed traces. func (t *Tracer) SetObservabilityPlugins(obsPlugins []schemas.ObservabilityPlugin) { if t == nil { return } t.obsPlugins.Store(&obsPlugins) } // CreateTrace creates a new trace with optional parent ID and returns the trace ID. func (t *Tracer) CreateTrace(parentID string, requestID ...string) string { return t.store.CreateTrace(parentID, requestID...) } // EndTrace completes a trace and returns the trace data for observation/export. // The returned trace should be released after use by calling ReleaseTrace. func (t *Tracer) EndTrace(traceID string) *schemas.Trace { trace := t.store.CompleteTrace(traceID) if trace == nil { return nil } // Note: Caller is responsible for releasing the trace after plugin processing // by calling ReleaseTrace on the store or letting GC handle it return trace } // ReleaseTrace returns the trace to the pool for reuse. // Should be called after EndTrace when the trace data is no longer needed. func (t *Tracer) ReleaseTrace(trace *schemas.Trace) { t.store.ReleaseTrace(trace) } // spanHandle is the concrete implementation of schemas.SpanHandle for Tracer. // It contains the trace and span IDs needed to reference the span in the store. type spanHandle struct { traceID string spanID string } // StartSpan creates a new span as a child of the current span in context. // It reads the trace ID and parent span ID from context, creates the span, // and returns an updated context with the new span ID. // // Parent span resolution order: // 1. BifrostContextKeySpanID - existing span in this service (for child spans) // 2. BifrostContextKeyParentSpanID - incoming parent from W3C traceparent (for root spans) // 3. No parent - creates a root span with no parent func (t *Tracer) StartSpan(ctx context.Context, name string, kind schemas.SpanKind) (context.Context, schemas.SpanHandle) { traceID := GetTraceID(ctx) if traceID == "" { return ctx, nil } // Get parent span ID from context - first check for existing span in this service parentSpanID, _ := ctx.Value(schemas.BifrostContextKeySpanID).(string) // If no existing span, check for incoming parent span ID from W3C traceparent header // This links the root span of this service to the upstream service's span if parentSpanID == "" { parentSpanID, _ = ctx.Value(schemas.BifrostContextKeyParentSpanID).(string) } var span *schemas.Span if parentSpanID != "" { span = t.store.StartChildSpan(traceID, parentSpanID, name, kind) } else { span = t.store.StartSpan(traceID, name, kind) } if span == nil { return ctx, nil } // Update context with new span ID newCtx := context.WithValue(ctx, schemas.BifrostContextKeySpanID, span.SpanID) return newCtx, &spanHandle{traceID: traceID, spanID: span.SpanID} } // EndSpan completes a span with the given status and message. func (t *Tracer) EndSpan(handle schemas.SpanHandle, status schemas.SpanStatus, statusMsg string) { h, ok := handle.(*spanHandle) if !ok || h == nil { return } t.store.EndSpan(h.traceID, h.spanID, status, statusMsg, nil) } // SetAttribute sets an attribute on the span identified by the handle. func (t *Tracer) SetAttribute(handle schemas.SpanHandle, key string, value any) { h, ok := handle.(*spanHandle) if !ok || h == nil { return } trace := t.store.GetTrace(h.traceID) if trace == nil { return } span := trace.GetSpan(h.spanID) if span != nil { span.SetAttribute(key, value) } } // AddEvent adds a timestamped event to the span identified by the handle. func (t *Tracer) AddEvent(handle schemas.SpanHandle, name string, attrs map[string]any) { h, ok := handle.(*spanHandle) if !ok || h == nil { return } trace := t.store.GetTrace(h.traceID) if trace == nil { return } span := trace.GetSpan(h.spanID) if span != nil { span.AddEvent(schemas.SpanEvent{ Name: name, Timestamp: time.Now(), Attributes: attrs, }) } } // PopulateLLMRequestAttributes populates all LLM-specific request attributes on the span. func (t *Tracer) PopulateLLMRequestAttributes(handle schemas.SpanHandle, req *schemas.BifrostRequest) { h, ok := handle.(*spanHandle) if !ok || h == nil || req == nil { return } trace := t.store.GetTrace(h.traceID) if trace == nil { return } span := trace.GetSpan(h.spanID) if span == nil { return } for k, v := range PopulateRequestAttributes(req) { span.SetAttribute(k, v) } } // PopulateLLMResponseAttributes populates all LLM-specific response attributes on the span. func (t *Tracer) PopulateLLMResponseAttributes(ctx *schemas.BifrostContext, handle schemas.SpanHandle, resp *schemas.BifrostResponse, err *schemas.BifrostError) { h, ok := handle.(*spanHandle) if !ok || h == nil { return } trace := t.store.GetTrace(h.traceID) if trace == nil { return } span := trace.GetSpan(h.spanID) if span == nil { return } for k, v := range PopulateResponseAttributes(resp) { span.SetAttribute(k, v) } for k, v := range PopulateErrorAttributes(err) { span.SetAttribute(k, v) } // Populate cost attribute using pricing manager if t.pricingManager != nil && resp != nil { cost := t.pricingManager.CalculateCost(resp, modelcatalog.PricingLookupScopesFromContext(ctx, string(resp.GetExtraFields().Provider))) span.SetAttribute(schemas.AttrUsageCost, cost) } } // StoreDeferredSpan stores a span handle for later completion (used for streaming requests). // The span handle is stored keyed by trace ID so it can be retrieved when the stream completes. func (t *Tracer) StoreDeferredSpan(traceID string, handle schemas.SpanHandle) { h, ok := handle.(*spanHandle) if !ok || h == nil { return } t.store.StoreDeferredSpan(traceID, h.spanID) } // GetDeferredSpanHandle retrieves a deferred span handle by trace ID. // Returns nil if no deferred span exists for the given trace ID. func (t *Tracer) GetDeferredSpanHandle(traceID string) schemas.SpanHandle { info := t.store.GetDeferredSpan(traceID) if info == nil { return nil } return &spanHandle{traceID: traceID, spanID: info.SpanID} } // ClearDeferredSpan removes the deferred span handle for a trace ID. // Should be called after the deferred span has been completed. func (t *Tracer) ClearDeferredSpan(traceID string) { t.store.ClearDeferredSpan(traceID) } // GetDeferredSpanID returns the span ID for the deferred span. // Returns empty string if no deferred span exists. func (t *Tracer) GetDeferredSpanID(traceID string) string { info := t.store.GetDeferredSpan(traceID) if info == nil { return "" } return info.SpanID } // AddStreamingChunk tracks TTFT and chunk count for the deferred span. // Chunk contents are no longer stored here; full content accumulation is handled // by the embedded streaming.Accumulator (via ProcessStreamingChunk) for plugins. func (t *Tracer) AddStreamingChunk(traceID string, response *schemas.BifrostResponse) { if traceID == "" || response == nil { return } t.store.AppendStreamingChunk(traceID, response) } // GetAccumulatedChunks returns the accumulated response, TTFT, and chunk count for the deferred span. // The response is built from the streaming accumulator during the final ProcessStreamingChunk call // and stored on the DeferredSpanInfo. Returns nil response if no accumulated data is available // (e.g., when no plugin calls ProcessStreamingChunk). func (t *Tracer) GetAccumulatedChunks(traceID string) (*schemas.BifrostResponse, int64, int) { ttftNs, chunkCount := t.store.GetAccumulatedData(traceID) resp := t.store.GetAccumulatedResponse(traceID) return resp, ttftNs, chunkCount } // CreateStreamAccumulator creates a new stream accumulator for the given trace ID. // This should be called at the start of a streaming request. func (t *Tracer) CreateStreamAccumulator(traceID string, startTime time.Time) { if traceID == "" || t.accumulator == nil { return } t.accumulator.CreateStreamAccumulator(traceID, startTime) } // CleanupStreamAccumulator removes the stream accumulator for the given trace ID. // This should be called after the streaming request is complete. func (t *Tracer) CleanupStreamAccumulator(traceID string) { if traceID == "" || t.accumulator == nil { if t.store != nil && t.store.logger != nil { t.store.logger.Error("traceID or accumulator is nil in CleanupStreamAccumulator") } return } if err := t.accumulator.CleanupStreamAccumulator(traceID); err != nil { if t.store != nil && t.store.logger != nil { t.store.logger.Error("error in CleanupStreamAccumulator: %v", err) } } } // ProcessStreamingChunk processes a streaming chunk and accumulates it. // Returns the accumulated result. IsFinal will be true when the stream is complete. // This method is used by plugins to access accumulated streaming data. // The ctx parameter must contain the stream end indicator for proper final chunk detection. func (t *Tracer) ProcessStreamingChunk(traceID string, isFinalChunk bool, result *schemas.BifrostResponse, err *schemas.BifrostError) *schemas.StreamAccumulatorResult { if traceID == "" || t.accumulator == nil { return nil } // Create a new context for accumulator that sets the traceID as the accumulator lookup ID. accumCtx := schemas.NewBifrostContext(context.Background(), time.Time{}) accumCtx.SetValue(schemas.BifrostContextKeyAccumulatorID, traceID) accumCtx.SetValue(schemas.BifrostContextKeyStreamEndIndicator, isFinalChunk) processedResp, processErr := t.accumulator.ProcessStreamingResponse(accumCtx, result, err) if processErr != nil || processedResp == nil { return nil } // On final chunk, store the accumulated BifrostResponse on the deferred span // so that completeDeferredSpan can populate span attributes (e.g., gen_ai.output.messages) if isFinalChunk { if bifrostResp := processedResp.ToBifrostResponse(); bifrostResp != nil && (bifrostResp.ChatResponse != nil || bifrostResp.TextCompletionResponse != nil || bifrostResp.SpeechResponse != nil || bifrostResp.TranscriptionResponse != nil || bifrostResp.ImageGenerationResponse != nil || bifrostResp.ResponsesResponse != nil) { t.store.SetAccumulatedResponse(traceID, bifrostResp) } } // Convert ProcessedStreamResponse to StreamAccumulatorResult accResult := &schemas.StreamAccumulatorResult{ RequestID: processedResp.RequestID, RequestedModel: processedResp.RequestedModel, ResolvedModel: processedResp.ResolvedModel, Provider: processedResp.Provider, } if processedResp.Data != nil { accResult.Status = processedResp.Data.Status accResult.Latency = processedResp.Data.Latency accResult.TimeToFirstToken = processedResp.Data.TimeToFirstToken accResult.OutputMessage = processedResp.Data.OutputMessage accResult.OutputMessages = processedResp.Data.OutputMessages accResult.TokenUsage = processedResp.Data.TokenUsage accResult.Cost = processedResp.Data.Cost accResult.ErrorDetails = processedResp.Data.ErrorDetails accResult.AudioOutput = processedResp.Data.AudioOutput accResult.TranscriptionOutput = processedResp.Data.TranscriptionOutput accResult.ImageGenerationOutput = processedResp.Data.ImageGenerationOutput accResult.FinishReason = processedResp.Data.FinishReason accResult.RawResponse = processedResp.Data.RawResponse if (accResult.Cost == nil || *accResult.Cost == 0.0) && accResult.TokenUsage != nil && accResult.TokenUsage.Cost != nil { accResult.Cost = &accResult.TokenUsage.Cost.TotalCost } } if processedResp.RawRequest != nil { accResult.RawRequest = *processedResp.RawRequest } return accResult } // GetAccumulator returns the embedded streaming accumulator. // This is useful for plugins that need direct access to accumulator methods. func (t *Tracer) GetAccumulator() *streaming.Accumulator { return t.accumulator } // AttachPluginLogs appends plugin log entries to the trace identified by traceID. func (t *Tracer) AttachPluginLogs(traceID string, logs []schemas.PluginLogEntry) { if len(logs) == 0 || traceID == "" { return } trace := t.store.GetTrace(traceID) if trace == nil { return } trace.AppendPluginLogs(logs) } // Stop stops the tracer and releases its resources. // This stops the internal TraceStore's cleanup goroutine. func (t *Tracer) Stop() { t.flushWG.Wait() if t.store != nil { t.store.Stop() } if t.accumulator != nil { t.accumulator.Cleanup() } } // CompleteAndFlushTrace ends a trace and forwards it to any observability // plugins asynchronously. Realtime transports need this explicit flush because // they bypass the HTTP tracing middleware that normally injects completed traces. func (t *Tracer) CompleteAndFlushTrace(traceID string) { if t == nil { return } if strings.TrimSpace(traceID) == "" { return } t.flushWG.Go(func() { completedTrace := t.EndTrace(strings.TrimSpace(traceID)) if completedTrace == nil { return } // Defer release so the pooled trace is returned even if a plugin panics; // otherwise an unrecovered panic in this detached goroutine leaks the // trace object and takes down the whole process. defer t.ReleaseTrace(completedTrace) var obsPlugins []schemas.ObservabilityPlugin if loaded := t.obsPlugins.Load(); loaded != nil { obsPlugins = *loaded } seen := make(map[string]struct{}, len(obsPlugins)) for _, plugin := range obsPlugins { if plugin == nil { continue } // Isolate each plugin callback — one bad observability backend should // not crash the server or prevent other plugins from receiving the trace. func(plugin schemas.ObservabilityPlugin) { name := "" defer func() { if r := recover(); r != nil && t.logger != nil { t.logger.Error("observability plugin %s panicked during trace injection: %v", name, r) } }() name = plugin.GetName() if _, exists := seen[name]; exists { return } seen[name] = struct{}{} if err := plugin.Inject(context.Background(), completedTrace); err != nil && t.logger != nil { t.logger.Warn("observability plugin %s failed to inject trace: %v", name, err) } }(plugin) } }) } // Ensure Tracer implements schemas.Tracer at compile time var _ schemas.Tracer = (*Tracer)(nil)