// Package tracing provides distributed tracing infrastructure for Bifrost package tracing import ( "encoding/hex" "sync" "time" "github.com/google/uuid" "github.com/maximhq/bifrost/core/schemas" ) // DeferredSpanInfo stores information about a deferred span for streaming requests type DeferredSpanInfo struct { SpanID string StartTime time.Time Tracer schemas.Tracer // Reference to tracer for completing the span RequestID string // Request ID for accumulator lookup FirstChunkTime time.Time // Timestamp of first chunk (for TTFT calculation) ChunkCount int // Count of received streaming chunks (for AttrTotalChunks) AccumulatedResponse *schemas.BifrostResponse // Full accumulated response from streaming chunks mu sync.Mutex // Mutex for thread-safe chunk accumulation } // TraceStore manages traces with thread-safe access and object pooling type TraceStore struct { traces sync.Map // map[traceID]*schemas.Trace - thread-safe concurrent access deferredSpans sync.Map // map[traceID]*DeferredSpanInfo - deferred spans for streaming requests tracePool sync.Pool // Reuse Trace objects to reduce allocations spanPool sync.Pool // Reuse Span objects to reduce allocations logger schemas.Logger ttl time.Duration cleanupTicker *time.Ticker stopCleanup chan struct{} cleanupWg sync.WaitGroup stopOnce sync.Once // Ensures Stop() cleanup runs only once } // NewTraceStore creates a new TraceStore with the given TTL for cleanup func NewTraceStore(ttl time.Duration, logger schemas.Logger) *TraceStore { store := &TraceStore{ ttl: ttl, logger: logger, tracePool: sync.Pool{ New: func() any { return &schemas.Trace{ Spans: make([]*schemas.Span, 0, 16), // Pre-allocate capacity Attributes: make(map[string]any), } }, }, spanPool: sync.Pool{ New: func() any { return &schemas.Span{ Attributes: make(map[string]any), Events: make([]schemas.SpanEvent, 0, 4), // Pre-allocate capacity } }, }, stopCleanup: make(chan struct{}), } // Start background cleanup goroutine store.startCleanup() return store } // CreateTrace creates a new trace and stores it, returns trace ID only. // The inheritedTraceID parameter is the trace ID from an incoming W3C traceparent header. // If provided, this trace will use that ID to continue the distributed trace. // If empty, a new trace ID will be generated. // Note: The parent span ID (for linking to upstream spans) is handled separately // via context in StartSpan, not stored on the trace itself. func (s *TraceStore) CreateTrace(inheritedTraceID string, requestID ...string) string { trace := s.tracePool.Get().(*schemas.Trace) // Reset and initialize the trace if inheritedTraceID != "" { trace.TraceID = inheritedTraceID } else { trace.TraceID = generateTraceID() } // Note: trace.ParentID is intentionally not set here. // Parent-child relationships are between spans, not traces. // The root span's ParentID is set in StartSpan from context. trace.ParentID = "" if len(requestID) > 0 { trace.RequestID = requestID[0] } trace.StartTime = time.Now() trace.EndTime = time.Time{} trace.RootSpan = nil // Reset slices but keep capacity if trace.Spans != nil { trace.Spans = trace.Spans[:0] } else { trace.Spans = make([]*schemas.Span, 0, 16) } // Reset attributes if trace.Attributes == nil { trace.Attributes = make(map[string]any) } else { clear(trace.Attributes) } s.traces.Store(trace.TraceID, trace) return trace.TraceID } // GetTrace retrieves a trace by ID func (s *TraceStore) GetTrace(traceID string) *schemas.Trace { if val, ok := s.traces.Load(traceID); ok { return val.(*schemas.Trace) } return nil } // SetRequestID sets the request ID for the trace func (s *TraceStore) SetRequestID(traceID string, requestID string) { trace := s.GetTrace(traceID) if trace == nil { return } trace.SetRequestID(requestID) } // CompleteTrace marks the trace as complete, removes it from store, and returns it for flushing func (s *TraceStore) CompleteTrace(traceID string) *schemas.Trace { // Clear any deferred span for this trace s.deferredSpans.Delete(traceID) if val, ok := s.traces.LoadAndDelete(traceID); ok { trace := val.(*schemas.Trace) trace.EndTime = time.Now() return trace } return nil } // StoreDeferredSpan stores a span ID for later completion (used for streaming requests) func (s *TraceStore) StoreDeferredSpan(traceID, spanID string) { s.deferredSpans.Store(traceID, &DeferredSpanInfo{ SpanID: spanID, StartTime: time.Now(), }) } // GetDeferredSpan retrieves the deferred span info for a trace ID func (s *TraceStore) GetDeferredSpan(traceID string) *DeferredSpanInfo { if val, ok := s.deferredSpans.Load(traceID); ok { return val.(*DeferredSpanInfo) } return nil } // ClearDeferredSpan removes the deferred span info for a trace ID func (s *TraceStore) ClearDeferredSpan(traceID string) { s.deferredSpans.Delete(traceID) } // AppendStreamingChunk tracks TTFT and chunk count for the deferred span. // Chunks are no longer stored — the new streaming.Accumulator handles full content // accumulation for plugins (logging, maxim). This eliminates storing 1M+ BifrostResponse // objects in the old accumulator at high concurrency. func (s *TraceStore) AppendStreamingChunk(traceID string, chunk *schemas.BifrostResponse) { if chunk == nil { return } info := s.GetDeferredSpan(traceID) if info == nil { return } info.mu.Lock() defer info.mu.Unlock() // Track first chunk time for TTFT calculation if info.FirstChunkTime.IsZero() { info.FirstChunkTime = time.Now() } info.ChunkCount++ } // GetAccumulatedData returns TTFT and chunk count for a deferred span. // Chunks are no longer stored; full content is available via the streaming.Accumulator. func (s *TraceStore) GetAccumulatedData(traceID string) (ttftNs int64, chunkCount int) { info := s.GetDeferredSpan(traceID) if info == nil { return 0, 0 } info.mu.Lock() defer info.mu.Unlock() // Calculate TTFT in nanoseconds if !info.StartTime.IsZero() && !info.FirstChunkTime.IsZero() { ttftNs = info.FirstChunkTime.Sub(info.StartTime).Nanoseconds() } return ttftNs, info.ChunkCount } // SetAccumulatedResponse stores the accumulated BifrostResponse on the deferred span info. // Called during the final ProcessStreamingChunk to make the full response // available for span attribute population in completeDeferredSpan. func (s *TraceStore) SetAccumulatedResponse(traceID string, resp *schemas.BifrostResponse) { info := s.GetDeferredSpan(traceID) if info == nil { return } info.mu.Lock() defer info.mu.Unlock() if info.AccumulatedResponse != nil { return // already set; do not overwrite } info.AccumulatedResponse = resp } // GetAccumulatedResponse returns the accumulated BifrostResponse for a deferred span. // Returns nil if no accumulated response has been stored. func (s *TraceStore) GetAccumulatedResponse(traceID string) *schemas.BifrostResponse { info := s.GetDeferredSpan(traceID) if info == nil { return nil } info.mu.Lock() defer info.mu.Unlock() return info.AccumulatedResponse } // ReleaseTrace returns the trace and its spans to the pools for reuse func (s *TraceStore) ReleaseTrace(trace *schemas.Trace) { if trace == nil { return } // Return all spans to the pool for _, span := range trace.Spans { s.releaseSpan(span) } // Reset the trace trace.Reset() // Return trace to pool s.tracePool.Put(trace) } // StartSpan creates a new span and adds it to the trace func (s *TraceStore) StartSpan(traceID, name string, kind schemas.SpanKind) *schemas.Span { trace := s.GetTrace(traceID) if trace == nil { return nil } span := s.spanPool.Get().(*schemas.Span) // Reset and initialize the span span.SpanID = generateSpanID() span.TraceID = traceID span.Name = name span.Kind = kind span.StartTime = time.Now() span.EndTime = time.Time{} span.Status = schemas.SpanStatusUnset span.StatusMsg = "" // Reset slices but keep capacity if span.Events != nil { span.Events = span.Events[:0] } else { span.Events = make([]schemas.SpanEvent, 0, 4) } // Reset attributes if span.Attributes == nil { span.Attributes = make(map[string]any) } else { clear(span.Attributes) } // Set parent ID to root span if it exists, otherwise this is root if trace.RootSpan != nil { span.ParentID = trace.RootSpan.SpanID } else { span.ParentID = "" trace.RootSpan = span } // Add span to trace trace.AddSpan(span) return span } // StartChildSpan creates a new span as a child of the specified parent span func (s *TraceStore) StartChildSpan(traceID, parentSpanID, name string, kind schemas.SpanKind) *schemas.Span { trace := s.GetTrace(traceID) if trace == nil { return nil } span := s.spanPool.Get().(*schemas.Span) // Reset and initialize the span span.SpanID = generateSpanID() span.ParentID = parentSpanID span.TraceID = traceID span.Name = name span.Kind = kind span.StartTime = time.Now() span.EndTime = time.Time{} span.Status = schemas.SpanStatusUnset span.StatusMsg = "" // Reset slices but keep capacity if span.Events != nil { span.Events = span.Events[:0] } else { span.Events = make([]schemas.SpanEvent, 0, 4) } // Reset attributes if span.Attributes == nil { span.Attributes = make(map[string]any) } else { clear(span.Attributes) } // Set as root span if this is the first span in the trace. // This can happen when the span has an external parent (from W3C traceparent) // but is the first span within this service's trace. if trace.RootSpan == nil { trace.RootSpan = span } // Add span to trace trace.AddSpan(span) return span } // EndSpan marks a span as complete with the given status and attributes func (s *TraceStore) EndSpan(traceID, spanID string, status schemas.SpanStatus, statusMsg string, attrs map[string]any) { trace := s.GetTrace(traceID) if trace == nil { return } span := trace.GetSpan(spanID) if span == nil { return } span.End(status, statusMsg) // Add any final attributes for k, v := range attrs { span.SetAttribute(k, v) } } // releaseSpan returns a span to the pool func (s *TraceStore) releaseSpan(span *schemas.Span) { if span == nil { return } span.Reset() s.spanPool.Put(span) } // startCleanup starts the background cleanup goroutine func (s *TraceStore) startCleanup() { if s.ttl <= 0 { return } // Cleanup interval is TTL / 2 cleanupInterval := s.ttl / 2 if cleanupInterval < time.Minute { cleanupInterval = time.Minute } s.cleanupTicker = time.NewTicker(cleanupInterval) s.cleanupWg.Add(1) go func() { defer s.cleanupWg.Done() for { select { case <-s.cleanupTicker.C: s.cleanupOldTraces() case <-s.stopCleanup: return } } }() } // cleanupOldTraces removes traces that have exceeded the TTL func (s *TraceStore) cleanupOldTraces() { cutoff := time.Now().Add(-s.ttl) count := 0 s.traces.Range(func(key, value any) bool { trace := value.(*schemas.Trace) if trace.StartTime.Before(cutoff) { if deleted, ok := s.traces.LoadAndDelete(key); ok { s.ReleaseTrace(deleted.(*schemas.Trace)) count++ } } return true }) if count > 0 && s.logger != nil { s.logger.Debug("tracing: cleaned up %d orphaned traces", count) } } // Stop stops the cleanup goroutine and releases resources func (s *TraceStore) Stop() { s.stopOnce.Do(func() { if s.cleanupTicker != nil { s.cleanupTicker.Stop() } close(s.stopCleanup) s.cleanupWg.Wait() }) } // generateTraceID generates a W3C-compliant trace ID. // Returns 32 lowercase hex characters (128-bit UUID without hyphens). func generateTraceID() string { u := uuid.New() return hex.EncodeToString(u[:]) } // generateSpanID generates a W3C-compliant span ID. // Returns 16 lowercase hex characters (first 64 bits of a UUID). func generateSpanID() string { u := uuid.New() return hex.EncodeToString(u[:8]) }