first commit
This commit is contained in:
440
framework/tracing/tracer.go
Normal file
440
framework/tracing/tracer.go
Normal file
@@ -0,0 +1,440 @@
|
||||
// Package tracing provides distributed tracing infrastructure for Bifrost
|
||||
package tracing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
"github.com/maximhq/bifrost/framework/modelcatalog"
|
||||
"github.com/maximhq/bifrost/framework/streaming"
|
||||
)
|
||||
|
||||
// Tracer implements schemas.Tracer using TraceStore.
|
||||
// It provides the bridge between the core Tracer interface and the
|
||||
// framework's TraceStore implementation.
|
||||
// It also embeds a streaming.Accumulator for centralized streaming chunk accumulation.
|
||||
type Tracer struct {
|
||||
store *TraceStore
|
||||
accumulator *streaming.Accumulator
|
||||
pricingManager *modelcatalog.ModelCatalog
|
||||
logger schemas.Logger
|
||||
obsPlugins atomic.Pointer[[]schemas.ObservabilityPlugin]
|
||||
flushWG sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewTracer creates a new Tracer wrapping the given TraceStore.
|
||||
// The accumulator is embedded for centralized streaming chunk accumulation.
|
||||
// The pricingManager is used for cost calculation in span attributes.
|
||||
func NewTracer(store *TraceStore, pricingManager *modelcatalog.ModelCatalog, logger schemas.Logger) *Tracer {
|
||||
return &Tracer{
|
||||
store: store,
|
||||
accumulator: streaming.NewAccumulator(pricingManager, logger),
|
||||
pricingManager: pricingManager,
|
||||
logger: logger,
|
||||
obsPlugins: atomic.Pointer[[]schemas.ObservabilityPlugin]{},
|
||||
}
|
||||
}
|
||||
|
||||
// SetObservabilityPlugins updates the plugins that receive completed traces.
|
||||
func (t *Tracer) SetObservabilityPlugins(obsPlugins []schemas.ObservabilityPlugin) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
t.obsPlugins.Store(&obsPlugins)
|
||||
}
|
||||
|
||||
// CreateTrace creates a new trace with optional parent ID and returns the trace ID.
|
||||
func (t *Tracer) CreateTrace(parentID string, requestID ...string) string {
|
||||
return t.store.CreateTrace(parentID, requestID...)
|
||||
}
|
||||
|
||||
// EndTrace completes a trace and returns the trace data for observation/export.
|
||||
// The returned trace should be released after use by calling ReleaseTrace.
|
||||
func (t *Tracer) EndTrace(traceID string) *schemas.Trace {
|
||||
trace := t.store.CompleteTrace(traceID)
|
||||
if trace == nil {
|
||||
return nil
|
||||
}
|
||||
// Note: Caller is responsible for releasing the trace after plugin processing
|
||||
// by calling ReleaseTrace on the store or letting GC handle it
|
||||
return trace
|
||||
}
|
||||
|
||||
// ReleaseTrace returns the trace to the pool for reuse.
|
||||
// Should be called after EndTrace when the trace data is no longer needed.
|
||||
func (t *Tracer) ReleaseTrace(trace *schemas.Trace) {
|
||||
t.store.ReleaseTrace(trace)
|
||||
}
|
||||
|
||||
// spanHandle is the Tracer's concrete schemas.SpanHandle. It carries the pair
// of identifiers required to look the span up in the store again.
type spanHandle struct {
	traceID string // trace that owns the span
	spanID  string // span inside that trace
}
|
||||
|
||||
// StartSpan creates a new span as a child of the current span in context.
|
||||
// It reads the trace ID and parent span ID from context, creates the span,
|
||||
// and returns an updated context with the new span ID.
|
||||
//
|
||||
// Parent span resolution order:
|
||||
// 1. BifrostContextKeySpanID - existing span in this service (for child spans)
|
||||
// 2. BifrostContextKeyParentSpanID - incoming parent from W3C traceparent (for root spans)
|
||||
// 3. No parent - creates a root span with no parent
|
||||
func (t *Tracer) StartSpan(ctx context.Context, name string, kind schemas.SpanKind) (context.Context, schemas.SpanHandle) {
|
||||
traceID := GetTraceID(ctx)
|
||||
if traceID == "" {
|
||||
return ctx, nil
|
||||
}
|
||||
|
||||
// Get parent span ID from context - first check for existing span in this service
|
||||
parentSpanID, _ := ctx.Value(schemas.BifrostContextKeySpanID).(string)
|
||||
|
||||
// If no existing span, check for incoming parent span ID from W3C traceparent header
|
||||
// This links the root span of this service to the upstream service's span
|
||||
if parentSpanID == "" {
|
||||
parentSpanID, _ = ctx.Value(schemas.BifrostContextKeyParentSpanID).(string)
|
||||
}
|
||||
|
||||
var span *schemas.Span
|
||||
if parentSpanID != "" {
|
||||
span = t.store.StartChildSpan(traceID, parentSpanID, name, kind)
|
||||
} else {
|
||||
span = t.store.StartSpan(traceID, name, kind)
|
||||
}
|
||||
if span == nil {
|
||||
return ctx, nil
|
||||
}
|
||||
// Update context with new span ID
|
||||
newCtx := context.WithValue(ctx, schemas.BifrostContextKeySpanID, span.SpanID)
|
||||
return newCtx, &spanHandle{traceID: traceID, spanID: span.SpanID}
|
||||
}
|
||||
|
||||
// EndSpan completes a span with the given status and message.
|
||||
func (t *Tracer) EndSpan(handle schemas.SpanHandle, status schemas.SpanStatus, statusMsg string) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil {
|
||||
return
|
||||
}
|
||||
t.store.EndSpan(h.traceID, h.spanID, status, statusMsg, nil)
|
||||
}
|
||||
|
||||
// SetAttribute sets an attribute on the span identified by the handle.
|
||||
func (t *Tracer) SetAttribute(handle schemas.SpanHandle, key string, value any) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil {
|
||||
return
|
||||
}
|
||||
trace := t.store.GetTrace(h.traceID)
|
||||
if trace == nil {
|
||||
return
|
||||
}
|
||||
span := trace.GetSpan(h.spanID)
|
||||
if span != nil {
|
||||
span.SetAttribute(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
// AddEvent adds a timestamped event to the span identified by the handle.
|
||||
func (t *Tracer) AddEvent(handle schemas.SpanHandle, name string, attrs map[string]any) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil {
|
||||
return
|
||||
}
|
||||
trace := t.store.GetTrace(h.traceID)
|
||||
if trace == nil {
|
||||
return
|
||||
}
|
||||
span := trace.GetSpan(h.spanID)
|
||||
if span != nil {
|
||||
span.AddEvent(schemas.SpanEvent{
|
||||
Name: name,
|
||||
Timestamp: time.Now(),
|
||||
Attributes: attrs,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// PopulateLLMRequestAttributes populates all LLM-specific request attributes on the span.
|
||||
func (t *Tracer) PopulateLLMRequestAttributes(handle schemas.SpanHandle, req *schemas.BifrostRequest) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil || req == nil {
|
||||
return
|
||||
}
|
||||
trace := t.store.GetTrace(h.traceID)
|
||||
if trace == nil {
|
||||
return
|
||||
}
|
||||
span := trace.GetSpan(h.spanID)
|
||||
if span == nil {
|
||||
return
|
||||
}
|
||||
|
||||
for k, v := range PopulateRequestAttributes(req) {
|
||||
span.SetAttribute(k, v)
|
||||
}
|
||||
}
|
||||
|
||||
// PopulateLLMResponseAttributes populates all LLM-specific response attributes on the span.
|
||||
func (t *Tracer) PopulateLLMResponseAttributes(ctx *schemas.BifrostContext, handle schemas.SpanHandle, resp *schemas.BifrostResponse, err *schemas.BifrostError) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil {
|
||||
return
|
||||
}
|
||||
trace := t.store.GetTrace(h.traceID)
|
||||
if trace == nil {
|
||||
return
|
||||
}
|
||||
span := trace.GetSpan(h.spanID)
|
||||
if span == nil {
|
||||
return
|
||||
}
|
||||
for k, v := range PopulateResponseAttributes(resp) {
|
||||
span.SetAttribute(k, v)
|
||||
}
|
||||
for k, v := range PopulateErrorAttributes(err) {
|
||||
span.SetAttribute(k, v)
|
||||
}
|
||||
// Populate cost attribute using pricing manager
|
||||
if t.pricingManager != nil && resp != nil {
|
||||
cost := t.pricingManager.CalculateCost(resp, modelcatalog.PricingLookupScopesFromContext(ctx, string(resp.GetExtraFields().Provider)))
|
||||
span.SetAttribute(schemas.AttrUsageCost, cost)
|
||||
}
|
||||
}
|
||||
|
||||
// StoreDeferredSpan stores a span handle for later completion (used for streaming requests).
|
||||
// The span handle is stored keyed by trace ID so it can be retrieved when the stream completes.
|
||||
func (t *Tracer) StoreDeferredSpan(traceID string, handle schemas.SpanHandle) {
|
||||
h, ok := handle.(*spanHandle)
|
||||
if !ok || h == nil {
|
||||
return
|
||||
}
|
||||
t.store.StoreDeferredSpan(traceID, h.spanID)
|
||||
}
|
||||
|
||||
// GetDeferredSpanHandle retrieves a deferred span handle by trace ID.
|
||||
// Returns nil if no deferred span exists for the given trace ID.
|
||||
func (t *Tracer) GetDeferredSpanHandle(traceID string) schemas.SpanHandle {
|
||||
info := t.store.GetDeferredSpan(traceID)
|
||||
if info == nil {
|
||||
return nil
|
||||
}
|
||||
return &spanHandle{traceID: traceID, spanID: info.SpanID}
|
||||
}
|
||||
|
||||
// ClearDeferredSpan removes the deferred span handle for a trace ID.
|
||||
// Should be called after the deferred span has been completed.
|
||||
func (t *Tracer) ClearDeferredSpan(traceID string) {
|
||||
t.store.ClearDeferredSpan(traceID)
|
||||
}
|
||||
|
||||
// GetDeferredSpanID returns the span ID for the deferred span.
|
||||
// Returns empty string if no deferred span exists.
|
||||
func (t *Tracer) GetDeferredSpanID(traceID string) string {
|
||||
info := t.store.GetDeferredSpan(traceID)
|
||||
if info == nil {
|
||||
return ""
|
||||
}
|
||||
return info.SpanID
|
||||
}
|
||||
|
||||
// AddStreamingChunk tracks TTFT and chunk count for the deferred span.
|
||||
// Chunk contents are no longer stored here; full content accumulation is handled
|
||||
// by the embedded streaming.Accumulator (via ProcessStreamingChunk) for plugins.
|
||||
func (t *Tracer) AddStreamingChunk(traceID string, response *schemas.BifrostResponse) {
|
||||
if traceID == "" || response == nil {
|
||||
return
|
||||
}
|
||||
t.store.AppendStreamingChunk(traceID, response)
|
||||
}
|
||||
|
||||
// GetAccumulatedChunks returns the accumulated response, TTFT, and chunk count for the deferred span.
|
||||
// The response is built from the streaming accumulator during the final ProcessStreamingChunk call
|
||||
// and stored on the DeferredSpanInfo. Returns nil response if no accumulated data is available
|
||||
// (e.g., when no plugin calls ProcessStreamingChunk).
|
||||
func (t *Tracer) GetAccumulatedChunks(traceID string) (*schemas.BifrostResponse, int64, int) {
|
||||
ttftNs, chunkCount := t.store.GetAccumulatedData(traceID)
|
||||
resp := t.store.GetAccumulatedResponse(traceID)
|
||||
return resp, ttftNs, chunkCount
|
||||
}
|
||||
|
||||
// CreateStreamAccumulator creates a new stream accumulator for the given trace ID.
|
||||
// This should be called at the start of a streaming request.
|
||||
func (t *Tracer) CreateStreamAccumulator(traceID string, startTime time.Time) {
|
||||
if traceID == "" || t.accumulator == nil {
|
||||
return
|
||||
}
|
||||
t.accumulator.CreateStreamAccumulator(traceID, startTime)
|
||||
}
|
||||
|
||||
// CleanupStreamAccumulator removes the stream accumulator for the given trace ID.
|
||||
// This should be called after the streaming request is complete.
|
||||
func (t *Tracer) CleanupStreamAccumulator(traceID string) {
|
||||
if traceID == "" || t.accumulator == nil {
|
||||
if t.store != nil && t.store.logger != nil {
|
||||
t.store.logger.Error("traceID or accumulator is nil in CleanupStreamAccumulator")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err := t.accumulator.CleanupStreamAccumulator(traceID); err != nil {
|
||||
if t.store != nil && t.store.logger != nil {
|
||||
t.store.logger.Error("error in CleanupStreamAccumulator: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ProcessStreamingChunk processes a streaming chunk and accumulates it.
|
||||
// Returns the accumulated result. IsFinal will be true when the stream is complete.
|
||||
// This method is used by plugins to access accumulated streaming data.
|
||||
// The ctx parameter must contain the stream end indicator for proper final chunk detection.
|
||||
func (t *Tracer) ProcessStreamingChunk(traceID string, isFinalChunk bool, result *schemas.BifrostResponse, err *schemas.BifrostError) *schemas.StreamAccumulatorResult {
|
||||
if traceID == "" || t.accumulator == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Create a new context for accumulator that sets the traceID as the accumulator lookup ID.
|
||||
accumCtx := schemas.NewBifrostContext(context.Background(), time.Time{})
|
||||
accumCtx.SetValue(schemas.BifrostContextKeyAccumulatorID, traceID)
|
||||
accumCtx.SetValue(schemas.BifrostContextKeyStreamEndIndicator, isFinalChunk)
|
||||
|
||||
processedResp, processErr := t.accumulator.ProcessStreamingResponse(accumCtx, result, err)
|
||||
if processErr != nil || processedResp == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// On final chunk, store the accumulated BifrostResponse on the deferred span
|
||||
// so that completeDeferredSpan can populate span attributes (e.g., gen_ai.output.messages)
|
||||
if isFinalChunk {
|
||||
if bifrostResp := processedResp.ToBifrostResponse(); bifrostResp != nil &&
|
||||
(bifrostResp.ChatResponse != nil ||
|
||||
bifrostResp.TextCompletionResponse != nil ||
|
||||
bifrostResp.SpeechResponse != nil ||
|
||||
bifrostResp.TranscriptionResponse != nil ||
|
||||
bifrostResp.ImageGenerationResponse != nil ||
|
||||
bifrostResp.ResponsesResponse != nil) {
|
||||
t.store.SetAccumulatedResponse(traceID, bifrostResp)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert ProcessedStreamResponse to StreamAccumulatorResult
|
||||
accResult := &schemas.StreamAccumulatorResult{
|
||||
RequestID: processedResp.RequestID,
|
||||
RequestedModel: processedResp.RequestedModel,
|
||||
ResolvedModel: processedResp.ResolvedModel,
|
||||
Provider: processedResp.Provider,
|
||||
}
|
||||
|
||||
if processedResp.Data != nil {
|
||||
accResult.Status = processedResp.Data.Status
|
||||
accResult.Latency = processedResp.Data.Latency
|
||||
accResult.TimeToFirstToken = processedResp.Data.TimeToFirstToken
|
||||
accResult.OutputMessage = processedResp.Data.OutputMessage
|
||||
accResult.OutputMessages = processedResp.Data.OutputMessages
|
||||
accResult.TokenUsage = processedResp.Data.TokenUsage
|
||||
accResult.Cost = processedResp.Data.Cost
|
||||
accResult.ErrorDetails = processedResp.Data.ErrorDetails
|
||||
accResult.AudioOutput = processedResp.Data.AudioOutput
|
||||
accResult.TranscriptionOutput = processedResp.Data.TranscriptionOutput
|
||||
accResult.ImageGenerationOutput = processedResp.Data.ImageGenerationOutput
|
||||
accResult.FinishReason = processedResp.Data.FinishReason
|
||||
accResult.RawResponse = processedResp.Data.RawResponse
|
||||
|
||||
if (accResult.Cost == nil || *accResult.Cost == 0.0) && accResult.TokenUsage != nil && accResult.TokenUsage.Cost != nil {
|
||||
accResult.Cost = &accResult.TokenUsage.Cost.TotalCost
|
||||
}
|
||||
}
|
||||
|
||||
if processedResp.RawRequest != nil {
|
||||
accResult.RawRequest = *processedResp.RawRequest
|
||||
}
|
||||
|
||||
return accResult
|
||||
}
|
||||
|
||||
// GetAccumulator returns the embedded streaming accumulator.
|
||||
// This is useful for plugins that need direct access to accumulator methods.
|
||||
func (t *Tracer) GetAccumulator() *streaming.Accumulator {
|
||||
return t.accumulator
|
||||
}
|
||||
|
||||
// AttachPluginLogs appends plugin log entries to the trace identified by traceID.
|
||||
func (t *Tracer) AttachPluginLogs(traceID string, logs []schemas.PluginLogEntry) {
|
||||
if len(logs) == 0 || traceID == "" {
|
||||
return
|
||||
}
|
||||
trace := t.store.GetTrace(traceID)
|
||||
if trace == nil {
|
||||
return
|
||||
}
|
||||
trace.AppendPluginLogs(logs)
|
||||
}
|
||||
|
||||
// Stop stops the tracer and releases its resources.
|
||||
// This stops the internal TraceStore's cleanup goroutine.
|
||||
func (t *Tracer) Stop() {
|
||||
t.flushWG.Wait()
|
||||
if t.store != nil {
|
||||
t.store.Stop()
|
||||
}
|
||||
if t.accumulator != nil {
|
||||
t.accumulator.Cleanup()
|
||||
}
|
||||
}
|
||||
|
||||
// CompleteAndFlushTrace ends a trace and forwards it to any observability
|
||||
// plugins asynchronously. Realtime transports need this explicit flush because
|
||||
// they bypass the HTTP tracing middleware that normally injects completed traces.
|
||||
func (t *Tracer) CompleteAndFlushTrace(traceID string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
if strings.TrimSpace(traceID) == "" {
|
||||
return
|
||||
}
|
||||
t.flushWG.Go(func() {
|
||||
completedTrace := t.EndTrace(strings.TrimSpace(traceID))
|
||||
if completedTrace == nil {
|
||||
return
|
||||
}
|
||||
// Defer release so the pooled trace is returned even if a plugin panics;
|
||||
// otherwise an unrecovered panic in this detached goroutine leaks the
|
||||
// trace object and takes down the whole process.
|
||||
defer t.ReleaseTrace(completedTrace)
|
||||
|
||||
var obsPlugins []schemas.ObservabilityPlugin
|
||||
if loaded := t.obsPlugins.Load(); loaded != nil {
|
||||
obsPlugins = *loaded
|
||||
}
|
||||
seen := make(map[string]struct{}, len(obsPlugins))
|
||||
for _, plugin := range obsPlugins {
|
||||
if plugin == nil {
|
||||
continue
|
||||
}
|
||||
// Isolate each plugin callback — one bad observability backend should
|
||||
// not crash the server or prevent other plugins from receiving the trace.
|
||||
func(plugin schemas.ObservabilityPlugin) {
|
||||
name := "<unknown>"
|
||||
defer func() {
|
||||
if r := recover(); r != nil && t.logger != nil {
|
||||
t.logger.Error("observability plugin %s panicked during trace injection: %v", name, r)
|
||||
}
|
||||
}()
|
||||
name = plugin.GetName()
|
||||
if _, exists := seen[name]; exists {
|
||||
return
|
||||
}
|
||||
seen[name] = struct{}{}
|
||||
if err := plugin.Inject(context.Background(), completedTrace); err != nil && t.logger != nil {
|
||||
t.logger.Warn("observability plugin %s failed to inject trace: %v", name, err)
|
||||
}
|
||||
}(plugin)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Ensure Tracer implements schemas.Tracer at compile time
|
||||
var _ schemas.Tracer = (*Tracer)(nil) // typed nil pointer: verified by the compiler, nothing is allocated
|
||||
Reference in New Issue
Block a user