first commit

This commit is contained in:
Beyhan Oğur
2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions

View File

@@ -0,0 +1,83 @@
// Package tracing provides distributed tracing infrastructure for Bifrost
package tracing
import (
"context"
"github.com/maximhq/bifrost/core/schemas"
)
// GetTraceID returns the trace ID stored in ctx under
// schemas.BifrostContextKeyTraceID.
//
// It returns "" when ctx is nil, when the key is absent, or when the stored
// value is not a string.
func GetTraceID(ctx context.Context) string {
	if ctx == nil {
		return ""
	}
	if id, isString := ctx.Value(schemas.BifrostContextKeyTraceID).(string); isString {
		return id
	}
	return ""
}
// GetTrace looks up the trace referenced by ctx in store.
//
// It returns nil when store is nil, when ctx carries no trace ID (see
// GetTraceID), or when the store holds no trace for that ID.
func GetTrace(ctx context.Context, store *TraceStore) *schemas.Trace {
	// A nil store would panic inside store.GetTrace; treat it as "tracing
	// unavailable", in line with how a nil ctx is already handled.
	if store == nil {
		return nil
	}
	traceID := GetTraceID(ctx)
	if traceID == "" {
		return nil
	}
	return store.GetTrace(traceID)
}
// AddSpan starts a new span with the given name and kind on the trace
// referenced by ctx, delegating to store.StartSpan.
//
// It returns nil when store is nil, when ctx carries no trace ID, or when
// store.StartSpan returns nil (the trace is not present in the store).
func AddSpan(ctx context.Context, store *TraceStore, name string, kind schemas.SpanKind) *schemas.Span {
	// Guard against a nil store so this helper stays a no-op instead of
	// panicking when tracing is not wired up.
	if store == nil {
		return nil
	}
	traceID := GetTraceID(ctx)
	if traceID == "" {
		return nil
	}
	return store.StartSpan(traceID, name, kind)
}
// AddChildSpan starts a new span on the trace referenced by ctx and records
// parentSpanID as its parent, delegating to store.StartChildSpan.
//
// It returns nil when store is nil, when ctx carries no trace ID, or when
// store.StartChildSpan returns nil (the trace is not present in the store).
func AddChildSpan(ctx context.Context, store *TraceStore, parentSpanID, name string, kind schemas.SpanKind) *schemas.Span {
	// Guard against a nil store so this helper stays a no-op instead of
	// panicking when tracing is not wired up.
	if store == nil {
		return nil
	}
	traceID := GetTraceID(ctx)
	if traceID == "" {
		return nil
	}
	return store.StartChildSpan(traceID, parentSpanID, name, kind)
}
// EndSpan completes the span identified by spanID on the trace referenced by
// ctx, delegating to store.EndSpan with the given status, status message and
// final attributes.
//
// It does nothing when store is nil or when ctx carries no trace ID.
func EndSpan(ctx context.Context, store *TraceStore, spanID string, status schemas.SpanStatus, statusMsg string, attrs map[string]any) {
	// Guard against a nil store so this helper stays a no-op instead of
	// panicking when tracing is not wired up.
	if store == nil {
		return
	}
	traceID := GetTraceID(ctx)
	if traceID == "" {
		return
	}
	store.EndSpan(traceID, spanID, status, statusMsg, attrs)
}
// SetSpanAttribute stores key=value on the span identified by spanID within
// the trace referenced by ctx. It is a no-op when the trace or the span
// cannot be found.
func SetSpanAttribute(ctx context.Context, store *TraceStore, spanID, key string, value any) {
	current := GetTrace(ctx, store)
	if current == nil {
		return
	}
	if target := current.GetSpan(spanID); target != nil {
		target.SetAttribute(key, value)
	}
}
// AddSpanEvent appends event to the span identified by spanID within the
// trace referenced by ctx. It is a no-op when the trace or the span cannot
// be found.
func AddSpanEvent(ctx context.Context, store *TraceStore, spanID string, event schemas.SpanEvent) {
	current := GetTrace(ctx, store)
	if current == nil {
		return
	}
	if target := current.GetSpan(spanID); target != nil {
		target.AddEvent(event)
	}
}

1669
framework/tracing/llmspan.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,211 @@
// Package tracing provides distributed tracing infrastructure for Bifrost
package tracing
import (
"strings"
"github.com/valyala/fasthttp"
)
// normalizeTraceID converts traceID to the W3C trace-id form: hyphens are
// removed (so UUID-formatted IDs work) and letters are lower-cased.
//
// The result must be exactly 32 hexadecimal characters; otherwise "" is
// returned.
func normalizeTraceID(traceID string) string {
	candidate := strings.ToLower(strings.ReplaceAll(traceID, "-", ""))
	if len(candidate) == 32 && isHex(candidate) {
		return candidate
	}
	return ""
}
// normalizeSpanID converts spanID to the W3C span-id form: hyphens are
// removed, letters are lower-cased, and anything beyond the first 16
// characters is dropped (e.g. a full UUID keeps its first 16 hex digits).
//
// It returns "" when fewer than 16 characters remain or when the kept
// characters are not all hexadecimal.
func normalizeSpanID(spanID string) string {
	candidate := strings.ToLower(strings.ReplaceAll(spanID, "-", ""))
	switch {
	case len(candidate) < 16:
		return ""
	case len(candidate) > 16:
		candidate = candidate[:16]
	}
	if !isHex(candidate) {
		return ""
	}
	return candidate
}
// W3C Trace Context header names, used by the Extract* and Inject* helpers
// in this file when reading and writing fasthttp request headers.
const (
// TraceParentHeader names the header holding version, trace ID, parent span ID and trace flags.
TraceParentHeader = "traceparent"
// TraceStateHeader names the header holding the optional trace state value.
TraceStateHeader = "tracestate"
)
// W3CTraceContext holds parsed W3C trace context values.
// ParseTraceparent fills TraceID, ParentID and TraceFlags; ExtractTraceContext
// additionally copies the raw tracestate header value into TraceState.
type W3CTraceContext struct {
TraceID string // 32 hex characters
ParentID string // 16 hex characters (span ID of parent)
TraceFlags string // 2 hex characters
TraceState string // Optional vendor-specific trace state (empty when the header is absent)
}
// ExtractParentID returns the trace ID (32 hex characters) carried by the
// W3C traceparent header, which is the ID to reuse when continuing the
// upstream distributed trace.
//
// Despite its name, this function returns the trace ID, not the parent span
// ID; use ExtractTraceParentSpanID for the latter.
// It returns "" when the header is missing or does not parse.
func ExtractParentID(header *fasthttp.RequestHeader) string {
	raw := header.Peek(TraceParentHeader)
	if len(raw) == 0 {
		return ""
	}
	if parsed := ParseTraceparent(string(raw)); parsed != nil {
		return parsed.TraceID
	}
	return ""
}
// ExtractTraceParentSpanID returns the parent span ID (16 hex characters)
// carried by the W3C traceparent header, i.e. the upstream span that issued
// this request. Callers should use it as the ParentID of the receiving
// service's root span to link the two.
//
// It returns "" when the header is missing or does not parse.
func ExtractTraceParentSpanID(header *fasthttp.RequestHeader) string {
	raw := header.Peek(TraceParentHeader)
	if len(raw) == 0 {
		return ""
	}
	if parsed := ParseTraceparent(string(raw)); parsed != nil {
		return parsed.ParentID
	}
	return ""
}
// ExtractTraceContext parses the traceparent header and, on success, also
// attaches the raw tracestate header value (possibly empty) to the result.
//
// It returns nil when traceparent is missing or does not parse.
func ExtractTraceContext(header *fasthttp.RequestHeader) *W3CTraceContext {
	raw := header.Peek(TraceParentHeader)
	if len(raw) == 0 {
		return nil
	}
	parsed := ParseTraceparent(string(raw))
	if parsed != nil {
		parsed.TraceState = string(header.Peek(TraceStateHeader))
	}
	return parsed
}
// ParseTraceparent parses a W3C traceparent header value.
//
// Format: version-traceid-parentid-traceflags
// Example: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
//
// It returns nil unless the value has exactly four hyphen-separated parts,
// the version is "00", the trace ID is 32 hex characters, the parent ID is
// 16 hex characters and the flags are 2 hex characters. An all-zero trace ID
// or parent ID is also rejected, as the W3C spec defines both as invalid.
// TraceState is left empty; ExtractTraceContext fills it from the headers.
func ParseTraceparent(traceparent string) *W3CTraceContext {
	// Values the W3C Trace Context spec reserves as invalid.
	const (
		zeroTraceID  = "00000000000000000000000000000000"
		zeroParentID = "0000000000000000"
	)

	parts := strings.Split(traceparent, "-")
	if len(parts) != 4 {
		return nil
	}
	version, traceID, parentID, traceFlags := parts[0], parts[1], parts[2], parts[3]

	// Validate version (only 00 is currently supported)
	if version != "00" {
		return nil
	}
	// Validate trace ID (32 hex characters, not all zeros)
	if len(traceID) != 32 || !isHex(traceID) || traceID == zeroTraceID {
		return nil
	}
	// Validate parent ID (16 hex characters, not all zeros)
	if len(parentID) != 16 || !isHex(parentID) || parentID == zeroParentID {
		return nil
	}
	// Validate trace flags (2 hex characters)
	if len(traceFlags) != 2 || !isHex(traceFlags) {
		return nil
	}
	return &W3CTraceContext{
		TraceID:    traceID,
		ParentID:   parentID,
		TraceFlags: traceFlags,
	}
}
// FormatTraceparent builds a version-00 W3C traceparent header value.
//
// traceID and spanID are passed through normalizeTraceID / normalizeSpanID
// (32 and 16 lowercase hex characters respectively); if either cannot be
// normalized the function returns "". traceFlags is lower-cased and, when it
// is not exactly 2 hex characters, replaced by "00" (not sampled).
func FormatTraceparent(traceID, spanID, traceFlags string) string {
	tid := normalizeTraceID(traceID)
	sid := normalizeSpanID(spanID)
	if tid == "" || sid == "" {
		return ""
	}

	flags := strings.ToLower(traceFlags)
	if validFlags := len(flags) == 2 && isHex(flags); !validFlags {
		flags = "00" // Default: not sampled
	}

	return strings.Join([]string{"00", tid, sid, flags}, "-")
}
// InjectTraceContext writes W3C trace context headers onto an outgoing
// request header.
//
// Nothing is written when traceID or spanID is empty, or when
// FormatTraceparent cannot produce a valid value from them. The tracestate
// header is only set when traceState is non-empty.
func InjectTraceContext(header *fasthttp.RequestHeader, traceID, spanID, traceFlags, traceState string) {
	if traceID == "" || spanID == "" {
		return
	}

	value := FormatTraceparent(traceID, spanID, traceFlags)
	if value == "" {
		// IDs could not be normalized to valid W3C format.
		return
	}
	header.Set(TraceParentHeader, value)

	if traceState == "" {
		return
	}
	header.Set(TraceStateHeader, traceState)
}
// isHex reports whether every character of s is a hexadecimal digit
// (0-9, a-f or A-F). The empty string yields true.
func isHex(s string) bool {
	for _, r := range s {
		switch {
		case r >= '0' && r <= '9':
		case r >= 'a' && r <= 'f':
		case r >= 'A' && r <= 'F':
		default:
			return false
		}
	}
	return true
}

View File

@@ -0,0 +1,356 @@
package tracing
import (
"testing"
"github.com/valyala/fasthttp"
)
// TestParseTraceparent_ValidHeader checks that well-formed version-00
// traceparent values are split into the expected trace ID, parent ID and
// flags.
func TestParseTraceparent_ValidHeader(t *testing.T) {
	type validCase struct {
		name        string
		traceparent string
		wantTraceID string
		wantParent  string
		wantFlags   string
	}

	// Inputs: the W3C spec example plus a header captured from Datadog.
	cases := []validCase{
		{
			name:        "valid traceparent from Datadog",
			traceparent: "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01",
			wantTraceID: "69538b980000000079943934f90c1d40",
			wantParent:  "aad09d1659b4c7e3",
			wantFlags:   "01",
		},
		{
			name:        "valid traceparent with sampled flag",
			traceparent: "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
			wantTraceID: "0af7651916cd43dd8448eb211c80319c",
			wantParent:  "b7ad6b7169203331",
			wantFlags:   "01",
		},
		{
			name:        "valid traceparent not sampled",
			traceparent: "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-00",
			wantTraceID: "0af7651916cd43dd8448eb211c80319c",
			wantParent:  "b7ad6b7169203331",
			wantFlags:   "00",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			parsed := ParseTraceparent(tc.traceparent)
			if parsed == nil {
				t.Fatalf("ParseTraceparent() returned nil for valid header")
			}
			if got := parsed.TraceID; got != tc.wantTraceID {
				t.Errorf("TraceID = %q, want %q", got, tc.wantTraceID)
			}
			if got := parsed.ParentID; got != tc.wantParent {
				t.Errorf("ParentID = %q, want %q", got, tc.wantParent)
			}
			if got := parsed.TraceFlags; got != tc.wantFlags {
				t.Errorf("TraceFlags = %q, want %q", got, tc.wantFlags)
			}
		})
	}
}
// TestParseTraceparent_InvalidVersion checks that any version other than
// "00" is rejected.
func TestParseTraceparent_InvalidVersion(t *testing.T) {
	cases := []struct {
		name        string
		traceparent string
	}{
		{"version 01", "01-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"},
		{"version ff", "ff-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if parsed := ParseTraceparent(tc.traceparent); parsed != nil {
				t.Errorf("ParseTraceparent() should return nil for unsupported version")
			}
		})
	}
}
// TestParseTraceparent_InvalidTraceID checks that trace IDs of the wrong
// length or containing non-hex characters are rejected.
func TestParseTraceparent_InvalidTraceID(t *testing.T) {
	cases := []struct {
		name        string
		traceparent string
	}{
		{"trace ID too short", "00-0af7651916cd43dd8448eb211c8031-b7ad6b7169203331-01"},
		{"trace ID too long", "00-0af7651916cd43dd8448eb211c80319c00-b7ad6b7169203331-01"},
		{"trace ID with invalid chars", "00-0af7651916cd43dd8448eb211c80319z-b7ad6b7169203331-01"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if parsed := ParseTraceparent(tc.traceparent); parsed != nil {
				t.Errorf("ParseTraceparent() should return nil for invalid trace ID")
			}
		})
	}
}
// TestParseTraceparent_InvalidParentID checks that parent IDs of the wrong
// length or containing non-hex characters are rejected.
func TestParseTraceparent_InvalidParentID(t *testing.T) {
	cases := []struct {
		name        string
		traceparent string
	}{
		{"parent ID too short", "00-0af7651916cd43dd8448eb211c80319c-b7ad6b71692033-01"},
		{"parent ID too long", "00-0af7651916cd43dd8448eb211c80319c-b7ad6b716920333100-01"},
		{"parent ID with invalid chars", "00-0af7651916cd43dd8448eb211c80319c-b7ad6b716920333z-01"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if parsed := ParseTraceparent(tc.traceparent); parsed != nil {
				t.Errorf("ParseTraceparent() should return nil for invalid parent ID")
			}
		})
	}
}
// TestParseTraceparent_MalformedHeader checks that values which do not split
// into exactly four hyphen-separated parts are rejected.
func TestParseTraceparent_MalformedHeader(t *testing.T) {
	cases := []struct {
		name        string
		traceparent string
	}{
		{"empty string", ""},
		{"missing parts", "00-0af7651916cd43dd8448eb211c80319c"},
		{"too many parts", "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01-extra"},
		{"wrong delimiter", "00_0af7651916cd43dd8448eb211c80319c_b7ad6b7169203331_01"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if parsed := ParseTraceparent(tc.traceparent); parsed != nil {
				t.Errorf("ParseTraceparent() should return nil for malformed header")
			}
		})
	}
}
// TestExtractParentID_ReturnsTraceID checks that ExtractParentID yields the
// trace ID portion of a valid traceparent header.
func TestExtractParentID_ReturnsTraceID(t *testing.T) {
	var header fasthttp.RequestHeader
	header.Set(TraceParentHeader, "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01")

	if got := ExtractParentID(&header); got != "69538b980000000079943934f90c1d40" {
		t.Errorf("ExtractParentID() = %q, want %q", got, "69538b980000000079943934f90c1d40")
	}
}
// TestExtractParentID_EmptyHeader checks that a request without a
// traceparent header yields an empty trace ID.
func TestExtractParentID_EmptyHeader(t *testing.T) {
	var header fasthttp.RequestHeader

	if got := ExtractParentID(&header); got != "" {
		t.Errorf("ExtractParentID() = %q, want empty string", got)
	}
}
// TestExtractTraceParentSpanID_ReturnsParentSpanID checks that the parent
// span ID portion of a valid traceparent header is returned.
func TestExtractTraceParentSpanID_ReturnsParentSpanID(t *testing.T) {
	var header fasthttp.RequestHeader
	header.Set(TraceParentHeader, "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01")

	if got := ExtractTraceParentSpanID(&header); got != "aad09d1659b4c7e3" {
		t.Errorf("ExtractTraceParentSpanID() = %q, want %q", got, "aad09d1659b4c7e3")
	}
}
// TestExtractTraceParentSpanID_EmptyHeader checks that a request without a
// traceparent header yields an empty parent span ID.
func TestExtractTraceParentSpanID_EmptyHeader(t *testing.T) {
	var header fasthttp.RequestHeader

	if got := ExtractTraceParentSpanID(&header); got != "" {
		t.Errorf("ExtractTraceParentSpanID() = %q, want empty string", got)
	}
}
// TestExtractTraceParentSpanID_InvalidHeader checks that an unparseable
// traceparent header yields an empty parent span ID.
func TestExtractTraceParentSpanID_InvalidHeader(t *testing.T) {
	var header fasthttp.RequestHeader
	header.Set(TraceParentHeader, "invalid-header")

	if got := ExtractTraceParentSpanID(&header); got != "" {
		t.Errorf("ExtractTraceParentSpanID() = %q, want empty string for invalid header", got)
	}
}
// TestFormatTraceparent_NormalizesIDs checks that FormatTraceparent
// lower-cases IDs, strips UUID hyphens from the trace ID, and falls back to
// flags "00" when the supplied flags are invalid.
func TestFormatTraceparent_NormalizesIDs(t *testing.T) {
	type formatCase struct {
		name       string
		traceID    string
		spanID     string
		traceFlags string
		want       string
	}

	cases := []formatCase{
		{
			name:       "already normalized",
			traceID:    "69538b980000000079943934f90c1d40",
			spanID:     "aad09d1659b4c7e3",
			traceFlags: "01",
			want:       "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01",
		},
		{
			name:       "uppercase to lowercase",
			traceID:    "69538B980000000079943934F90C1D40",
			spanID:     "AAD09D1659B4C7E3",
			traceFlags: "01",
			want:       "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01",
		},
		{
			name:       "UUID format trace ID",
			traceID:    "69538b98-0000-0000-7994-3934f90c1d40",
			spanID:     "aad09d1659b4c7e3",
			traceFlags: "01",
			want:       "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01",
		},
		{
			name:       "default trace flags when invalid",
			traceID:    "69538b980000000079943934f90c1d40",
			spanID:     "aad09d1659b4c7e3",
			traceFlags: "xyz",
			want:       "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-00",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := FormatTraceparent(tc.traceID, tc.spanID, tc.traceFlags); got != tc.want {
				t.Errorf("FormatTraceparent() = %q, want %q", got, tc.want)
			}
		})
	}
}
// TestFormatTraceparent_InvalidIDs checks that FormatTraceparent returns ""
// when either ID is empty or too short to normalize.
func TestFormatTraceparent_InvalidIDs(t *testing.T) {
	cases := []struct {
		name    string
		traceID string
		spanID  string
	}{
		{"empty trace ID", "", "aad09d1659b4c7e3"},
		{"empty span ID", "69538b980000000079943934f90c1d40", ""},
		{"invalid trace ID length", "69538b98", "aad09d1659b4c7e3"},
		{"invalid span ID length", "69538b980000000079943934f90c1d40", "aad09d16"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := FormatTraceparent(tc.traceID, tc.spanID, "01"); got != "" {
				t.Errorf("FormatTraceparent() = %q, want empty string for invalid IDs", got)
			}
		})
	}
}
// TestExtractTraceContext_WithTraceState checks that ExtractTraceContext
// returns the parsed traceparent fields together with the raw tracestate
// header value.
func TestExtractTraceContext_WithTraceState(t *testing.T) {
	var header fasthttp.RequestHeader
	header.Set(TraceParentHeader, "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01")
	header.Set(TraceStateHeader, "dd=p:aad09d1659b4c7e3;s:1;t.dm:-1;t.tid:69538b9800000000")

	extracted := ExtractTraceContext(&header)
	if extracted == nil {
		t.Fatal("ExtractTraceContext() returned nil")
	}

	if got := extracted.TraceID; got != "69538b980000000079943934f90c1d40" {
		t.Errorf("TraceID = %q, want %q", got, "69538b980000000079943934f90c1d40")
	}
	if got := extracted.ParentID; got != "aad09d1659b4c7e3" {
		t.Errorf("ParentID = %q, want %q", got, "aad09d1659b4c7e3")
	}
	if got := extracted.TraceState; got != "dd=p:aad09d1659b4c7e3;s:1;t.dm:-1;t.tid:69538b9800000000" {
		t.Errorf("TraceState = %q, want Datadog tracestate", got)
	}
}
// TestInjectTraceContext checks that both the traceparent and tracestate
// headers are written for valid IDs and a non-empty trace state.
func TestInjectTraceContext(t *testing.T) {
	var header fasthttp.RequestHeader
	InjectTraceContext(&header, "69538b980000000079943934f90c1d40", "aad09d1659b4c7e3", "01", "dd=s:1")

	if got := string(header.Peek(TraceParentHeader)); got != "00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01" {
		t.Errorf("traceparent = %q, want formatted header", got)
	}
	if got := string(header.Peek(TraceStateHeader)); got != "dd=s:1" {
		t.Errorf("tracestate = %q, want %q", got, "dd=s:1")
	}
}
// TestInjectTraceContext_EmptyIDs checks that no traceparent header is
// written when the trace ID is empty.
func TestInjectTraceContext_EmptyIDs(t *testing.T) {
	var header fasthttp.RequestHeader
	InjectTraceContext(&header, "", "aad09d1659b4c7e3", "01", "")

	if got := string(header.Peek(TraceParentHeader)); got != "" {
		t.Errorf("traceparent should not be set for empty trace ID")
	}
}

446
framework/tracing/store.go Normal file
View File

@@ -0,0 +1,446 @@
// Package tracing provides distributed tracing infrastructure for Bifrost
package tracing
import (
"encoding/hex"
"sync"
"time"
"github.com/google/uuid"
"github.com/maximhq/bifrost/core/schemas"
)
// DeferredSpanInfo stores information about a deferred span for streaming requests.
// TraceStore keeps one entry per trace ID. StoreDeferredSpan fills SpanID and
// StartTime; the chunk and response fields are updated later by TraceStore
// methods while holding mu.
type DeferredSpanInfo struct {
SpanID string // ID of the span whose completion is deferred
StartTime time.Time // Set to time.Now() by StoreDeferredSpan; baseline for the TTFT calculation
Tracer schemas.Tracer // Reference to tracer for completing the span
RequestID string // Request ID for accumulator lookup
FirstChunkTime time.Time // Timestamp of first chunk (for TTFT calculation)
ChunkCount int // Count of received streaming chunks (for AttrTotalChunks)
AccumulatedResponse *schemas.BifrostResponse // Full accumulated response from streaming chunks
mu sync.Mutex // Mutex for thread-safe chunk accumulation
}
// TraceStore manages traces with thread-safe access and object pooling.
// Construct it with NewTraceStore and call Stop when it is no longer needed
// so the background cleanup goroutine (started when ttl > 0) exits.
type TraceStore struct {
traces sync.Map // map[traceID]*schemas.Trace - thread-safe concurrent access
deferredSpans sync.Map // map[traceID]*DeferredSpanInfo - deferred spans for streaming requests
tracePool sync.Pool // Reuse Trace objects to reduce allocations
spanPool sync.Pool // Reuse Span objects to reduce allocations
logger schemas.Logger // Optional; nil-checked before use in cleanupOldTraces
ttl time.Duration // Age after which cleanupOldTraces drops a trace; <= 0 disables cleanup
cleanupTicker *time.Ticker // Drives periodic cleanup; nil when cleanup is disabled
stopCleanup chan struct{} // Closed by Stop to end the cleanup goroutine
cleanupWg sync.WaitGroup // Lets Stop wait for the cleanup goroutine to exit
stopOnce sync.Once // Ensures Stop() cleanup runs only once
}
// NewTraceStore builds a TraceStore whose traces are dropped by a background
// cleanup once they are older than ttl. The cleanup goroutine is only
// started when ttl is positive (see startCleanup). logger may be nil.
func NewTraceStore(ttl time.Duration, logger schemas.Logger) *TraceStore {
	// Pool constructors pre-size the slices to avoid early re-allocation.
	newTrace := func() any {
		return &schemas.Trace{
			Spans:      make([]*schemas.Span, 0, 16),
			Attributes: make(map[string]any),
		}
	}
	newSpan := func() any {
		return &schemas.Span{
			Attributes: make(map[string]any),
			Events:     make([]schemas.SpanEvent, 0, 4),
		}
	}

	s := &TraceStore{
		ttl:         ttl,
		logger:      logger,
		tracePool:   sync.Pool{New: newTrace},
		spanPool:    sync.Pool{New: newSpan},
		stopCleanup: make(chan struct{}),
	}
	s.startCleanup()
	return s
}
// CreateTrace creates a new trace, stores it, and returns its trace ID.
//
// inheritedTraceID is the trace ID from an incoming W3C traceparent header.
// When non-empty the trace reuses it to continue the distributed trace;
// otherwise a new ID is generated. An optional first requestID argument is
// recorded on the trace.
//
// The parent span ID (for linking to upstream spans) is handled on spans,
// not stored on the trace itself, so trace.ParentID is always cleared.
//
// NOTE(review): the trace is stored under its trace ID, so creating a second
// trace with the same inherited ID replaces the first entry in the store.
func (s *TraceStore) CreateTrace(inheritedTraceID string, requestID ...string) string {
	trace := s.tracePool.Get().(*schemas.Trace)

	// Reset and initialize the trace; it may be a recycled object.
	if inheritedTraceID != "" {
		trace.TraceID = inheritedTraceID
	} else {
		trace.TraceID = generateTraceID()
	}
	// Parent-child relationships are between spans, not traces.
	trace.ParentID = ""
	// Always clear RequestID first so a pooled trace can never carry over a
	// previous request's ID when the caller passes none.
	trace.RequestID = ""
	if len(requestID) > 0 {
		trace.RequestID = requestID[0]
	}
	trace.StartTime = time.Now()
	trace.EndTime = time.Time{}
	trace.RootSpan = nil

	// Reset slices but keep capacity
	if trace.Spans != nil {
		trace.Spans = trace.Spans[:0]
	} else {
		trace.Spans = make([]*schemas.Span, 0, 16)
	}
	// Reset attributes
	if trace.Attributes == nil {
		trace.Attributes = make(map[string]any)
	} else {
		clear(trace.Attributes)
	}

	s.traces.Store(trace.TraceID, trace)
	return trace.TraceID
}
// GetTrace returns the stored trace for traceID, or nil when the store has
// no entry for it.
func (s *TraceStore) GetTrace(traceID string) *schemas.Trace {
	val, found := s.traces.Load(traceID)
	if !found {
		return nil
	}
	return val.(*schemas.Trace)
}
// SetRequestID records requestID on the trace identified by traceID.
// It does nothing when the trace is not in the store.
func (s *TraceStore) SetRequestID(traceID string, requestID string) {
	if trace := s.GetTrace(traceID); trace != nil {
		trace.SetRequestID(requestID)
	}
}
// CompleteTrace removes the trace from the store, stamps its EndTime with
// the current time and hands it back to the caller for flushing. Any
// deferred span registered for the trace is discarded first.
//
// It returns nil when no trace is stored under traceID.
func (s *TraceStore) CompleteTrace(traceID string) *schemas.Trace {
	s.deferredSpans.Delete(traceID)

	val, found := s.traces.LoadAndDelete(traceID)
	if !found {
		return nil
	}
	finished := val.(*schemas.Trace)
	finished.EndTime = time.Now()
	return finished
}
// StoreDeferredSpan registers spanID as the deferred span for traceID (used
// for streaming requests), recording the current time as its start.
// Any previously stored entry for traceID is replaced.
func (s *TraceStore) StoreDeferredSpan(traceID, spanID string) {
	deferred := &DeferredSpanInfo{
		SpanID:    spanID,
		StartTime: time.Now(),
	}
	s.deferredSpans.Store(traceID, deferred)
}
// GetDeferredSpan returns the deferred span info registered for traceID, or
// nil when there is none.
func (s *TraceStore) GetDeferredSpan(traceID string) *DeferredSpanInfo {
	val, found := s.deferredSpans.Load(traceID)
	if !found {
		return nil
	}
	return val.(*DeferredSpanInfo)
}
// ClearDeferredSpan drops the deferred span info registered for traceID,
// if any.
func (s *TraceStore) ClearDeferredSpan(traceID string) {
	s.deferredSpans.Delete(traceID)
}
// AppendStreamingChunk updates the streaming statistics of the deferred span
// registered for traceID: it bumps the chunk counter and, on the first
// chunk, records the arrival time used for the TTFT calculation.
//
// The chunk itself is not retained — per the original notes, full content
// accumulation is handled by the streaming.Accumulator so that large numbers
// of BifrostResponse objects are not held here at high concurrency.
// A nil chunk or an unknown traceID is ignored.
func (s *TraceStore) AppendStreamingChunk(traceID string, chunk *schemas.BifrostResponse) {
	if chunk == nil {
		return
	}
	deferred := s.GetDeferredSpan(traceID)
	if deferred == nil {
		return
	}

	deferred.mu.Lock()
	defer deferred.mu.Unlock()

	deferred.ChunkCount++
	if deferred.FirstChunkTime.IsZero() {
		// First chunk seen: remember when it arrived.
		deferred.FirstChunkTime = time.Now()
	}
}
// GetAccumulatedData reports the time-to-first-token (in nanoseconds) and
// the number of chunks recorded for the deferred span of traceID.
//
// ttftNs is 0 when either the start time or the first-chunk time has not
// been recorded. Both results are 0 when no deferred span exists.
func (s *TraceStore) GetAccumulatedData(traceID string) (ttftNs int64, chunkCount int) {
	deferred := s.GetDeferredSpan(traceID)
	if deferred == nil {
		return 0, 0
	}

	deferred.mu.Lock()
	defer deferred.mu.Unlock()

	if deferred.StartTime.IsZero() || deferred.FirstChunkTime.IsZero() {
		return 0, deferred.ChunkCount
	}
	elapsed := deferred.FirstChunkTime.Sub(deferred.StartTime)
	return elapsed.Nanoseconds(), deferred.ChunkCount
}
// SetAccumulatedResponse attaches resp to the deferred span of traceID so
// that it can later be read back via GetAccumulatedResponse.
//
// Only the first non-nil response is kept; later calls do not overwrite it.
// Unknown trace IDs are ignored.
func (s *TraceStore) SetAccumulatedResponse(traceID string, resp *schemas.BifrostResponse) {
	deferred := s.GetDeferredSpan(traceID)
	if deferred == nil {
		return
	}

	deferred.mu.Lock()
	defer deferred.mu.Unlock()

	if deferred.AccumulatedResponse == nil {
		deferred.AccumulatedResponse = resp
	}
}
// GetAccumulatedResponse returns the response previously stored with
// SetAccumulatedResponse for traceID. It returns nil when no deferred span
// exists or no response has been stored yet.
func (s *TraceStore) GetAccumulatedResponse(traceID string) *schemas.BifrostResponse {
	deferred := s.GetDeferredSpan(traceID)
	if deferred == nil {
		return nil
	}

	deferred.mu.Lock()
	resp := deferred.AccumulatedResponse
	deferred.mu.Unlock()
	return resp
}
// ReleaseTrace recycles trace: each of its spans is reset and returned to
// the span pool, then the trace itself is reset and returned to the trace
// pool. A nil trace is ignored. The caller must not use trace afterwards.
func (s *TraceStore) ReleaseTrace(trace *schemas.Trace) {
	if trace == nil {
		return
	}
	for i := range trace.Spans {
		s.releaseSpan(trace.Spans[i])
	}
	trace.Reset()
	s.tracePool.Put(trace)
}
// StartSpan takes a span from the pool, initializes it with a fresh span ID,
// the given name and kind, and the current time as start, and appends it to
// the trace identified by traceID.
//
// The first span started on a trace becomes its root span and has an empty
// ParentID; every later span started here is parented to the root span.
// It returns nil when the trace is not in the store.
func (s *TraceStore) StartSpan(traceID, name string, kind schemas.SpanKind) *schemas.Span {
	trace := s.GetTrace(traceID)
	if trace == nil {
		return nil
	}

	span := s.spanPool.Get().(*schemas.Span)

	// Identity and timing.
	span.SpanID = generateSpanID()
	span.TraceID = traceID
	span.Name = name
	span.Kind = kind
	span.StartTime = time.Now()
	span.EndTime = time.Time{}
	span.Status = schemas.SpanStatusUnset
	span.StatusMsg = ""

	// Recycled spans keep their backing storage; only the contents are cleared.
	if span.Events == nil {
		span.Events = make([]schemas.SpanEvent, 0, 4)
	} else {
		span.Events = span.Events[:0]
	}
	if span.Attributes != nil {
		clear(span.Attributes)
	} else {
		span.Attributes = make(map[string]any)
	}

	// Parent under the existing root, or become the root.
	if root := trace.RootSpan; root == nil {
		span.ParentID = ""
		trace.RootSpan = span
	} else {
		span.ParentID = root.SpanID
	}

	trace.AddSpan(span)
	return span
}
// StartChildSpan takes a span from the pool, initializes it with a fresh
// span ID, the given name and kind, and the current time as start, sets its
// ParentID to parentSpanID, and appends it to the trace identified by
// traceID.
//
// If the trace has no root span yet, the new span also becomes the root.
// This covers a span whose parent is external (from a W3C traceparent
// header) but which is the first span of this service's trace.
// It returns nil when the trace is not in the store.
func (s *TraceStore) StartChildSpan(traceID, parentSpanID, name string, kind schemas.SpanKind) *schemas.Span {
	trace := s.GetTrace(traceID)
	if trace == nil {
		return nil
	}

	span := s.spanPool.Get().(*schemas.Span)

	// Identity, lineage and timing.
	span.SpanID = generateSpanID()
	span.ParentID = parentSpanID
	span.TraceID = traceID
	span.Name = name
	span.Kind = kind
	span.StartTime = time.Now()
	span.EndTime = time.Time{}
	span.Status = schemas.SpanStatusUnset
	span.StatusMsg = ""

	// Recycled spans keep their backing storage; only the contents are cleared.
	if span.Events == nil {
		span.Events = make([]schemas.SpanEvent, 0, 4)
	} else {
		span.Events = span.Events[:0]
	}
	if span.Attributes != nil {
		clear(span.Attributes)
	} else {
		span.Attributes = make(map[string]any)
	}

	if trace.RootSpan == nil {
		trace.RootSpan = span
	}

	trace.AddSpan(span)
	return span
}
// EndSpan ends the span identified by spanID on the trace identified by
// traceID with the given status and message, then applies each entry of
// attrs to the span as a final attribute.
//
// It does nothing when the trace or the span cannot be found.
func (s *TraceStore) EndSpan(traceID, spanID string, status schemas.SpanStatus, statusMsg string, attrs map[string]any) {
	trace := s.GetTrace(traceID)
	if trace == nil {
		return
	}
	target := trace.GetSpan(spanID)
	if target == nil {
		return
	}

	target.End(status, statusMsg)
	for key, val := range attrs {
		target.SetAttribute(key, val)
	}
}
// releaseSpan resets span and puts it back into the span pool.
// A nil span is ignored.
func (s *TraceStore) releaseSpan(span *schemas.Span) {
	if span != nil {
		span.Reset()
		s.spanPool.Put(span)
	}
}
// startCleanup launches the goroutine that periodically calls
// cleanupOldTraces. The interval is half the TTL but never less than one
// minute. Nothing is started when the TTL is zero or negative.
//
// The goroutine exits when stopCleanup is closed; cleanupWg tracks it so
// Stop can wait for it.
func (s *TraceStore) startCleanup() {
	if s.ttl <= 0 {
		return
	}

	interval := max(s.ttl/2, time.Minute)
	ticker := time.NewTicker(interval)
	s.cleanupTicker = ticker

	s.cleanupWg.Add(1)
	go func() {
		defer s.cleanupWg.Done()
		for {
			select {
			case <-s.stopCleanup:
				return
			case <-ticker.C:
				s.cleanupOldTraces()
			}
		}
	}()
}
// cleanupOldTraces removes every trace whose StartTime is older than the
// TTL, drops the deferred span registered under the same trace ID, and
// returns the trace and its spans to the pools.
//
// When at least one trace was removed and a logger is configured, the count
// is logged at debug level.
func (s *TraceStore) cleanupOldTraces() {
	cutoff := time.Now().Add(-s.ttl)
	removed := 0

	s.traces.Range(func(key, value any) bool {
		trace := value.(*schemas.Trace)
		if !trace.StartTime.Before(cutoff) {
			return true
		}
		// LoadAndDelete guards against the trace having been completed
		// concurrently between Range visiting it and this point.
		deleted, ok := s.traces.LoadAndDelete(key)
		if !ok {
			return true
		}
		// CompleteTrace normally clears the deferred span; an orphaned trace
		// never reaches it, so drop the entry here to avoid leaking it.
		s.deferredSpans.Delete(key)
		s.ReleaseTrace(deleted.(*schemas.Trace))
		removed++
		return true
	})

	if removed > 0 && s.logger != nil {
		s.logger.Debug("tracing: cleaned up %d orphaned traces", removed)
	}
}
// Stop shuts down the background cleanup: it stops the ticker (if one was
// created), closes stopCleanup to signal the goroutine, and waits for the
// goroutine to exit. Calling Stop more than once is safe; only the first
// call has any effect.
func (s *TraceStore) Stop() {
	s.stopOnce.Do(func() {
		if ticker := s.cleanupTicker; ticker != nil {
			ticker.Stop()
		}
		close(s.stopCleanup)
		s.cleanupWg.Wait()
	})
}
// generateTraceID returns a new trace ID in W3C form: the 16 bytes of a
// freshly generated UUID, hex-encoded as 32 lowercase characters without
// hyphens.
func generateTraceID() string {
	id := uuid.New()
	return hex.EncodeToString(id[:])
}
// generateSpanID returns a new span ID in W3C form: the first 8 bytes
// (64 bits) of a freshly generated UUID, hex-encoded as 16 lowercase
// characters.
func generateSpanID() string {
	id := uuid.New()
	return hex.EncodeToString(id[:8])
}

View File

@@ -0,0 +1,252 @@
package tracing
import (
"testing"
"time"
"github.com/maximhq/bifrost/core/schemas"
)
// TestCreateTrace_WithInheritedTraceID checks that a trace created from an
// incoming W3C trace ID keeps that ID and leaves trace.ParentID empty.
func TestCreateTrace_WithInheritedTraceID(t *testing.T) {
	// Trace ID as it would arrive in a W3C traceparent header.
	const inherited = "69538b980000000079943934f90c1d40"

	s := NewTraceStore(5*time.Minute, nil)
	defer s.Stop()

	got := s.CreateTrace(inherited)
	if got != inherited {
		t.Errorf("CreateTrace() returned %q, want inherited trace ID %q", got, inherited)
	}

	stored := s.GetTrace(got)
	if stored == nil {
		t.Fatal("GetTrace() returned nil")
	}
	if stored.TraceID != inherited {
		t.Errorf("trace.TraceID = %q, want %q", stored.TraceID, inherited)
	}
	// The parent span ID belongs on spans; the trace itself must not carry one.
	if stored.ParentID != "" {
		t.Errorf("trace.ParentID = %q, want empty string (parent span ID is set on spans, not traces)", stored.ParentID)
	}
}
// TestCreateTrace_GeneratesNewTraceID checks that CreateTrace("") produces a
// 32-character hex trace ID and stores a trace with an empty ParentID.
func TestCreateTrace_GeneratesNewTraceID(t *testing.T) {
	s := NewTraceStore(5*time.Minute, nil)
	defer s.Stop()

	generated := s.CreateTrace("")
	if generated == "" {
		t.Error("CreateTrace() returned empty trace ID")
	}
	// A generated ID must be 32 hexadecimal characters.
	if n := len(generated); n != 32 {
		t.Errorf("Generated trace ID length = %d, want 32", n)
	}
	if !isHex(generated) {
		t.Errorf("Generated trace ID %q is not valid hex", generated)
	}

	stored := s.GetTrace(generated)
	if stored == nil {
		t.Fatal("GetTrace() returned nil")
	}
	if stored.ParentID != "" {
		t.Errorf("trace.ParentID = %q, want empty string", stored.ParentID)
	}
}
// TestStartSpan_RootSpanHasNoParent checks that the first span started on a
// trace has no parent, carries the trace ID, and is registered as the
// trace's root span.
func TestStartSpan_RootSpanHasNoParent(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()

	traceID := store.CreateTrace("")
	span := store.StartSpan(traceID, "root-operation", schemas.SpanKindHTTPRequest)
	if span == nil {
		t.Fatal("StartSpan() returned nil")
	}

	// Root span should have no parent when there's no incoming trace context
	if span.ParentID != "" {
		t.Errorf("root span.ParentID = %q, want empty string", span.ParentID)
	}
	if span.TraceID != traceID {
		t.Errorf("span.TraceID = %q, want %q", span.TraceID, traceID)
	}

	// Verify it's set as root span. Fail cleanly (rather than panic on a nil
	// dereference) if the trace is unexpectedly missing.
	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("GetTrace() returned nil")
	}
	if trace.RootSpan != span {
		t.Error("StartSpan() did not set trace.RootSpan")
	}
}
// TestStartSpan_SecondSpanHasRootAsParent checks that a second span started
// with StartSpan is parented to the trace's root span.
func TestStartSpan_SecondSpanHasRootAsParent(t *testing.T) {
	s := NewTraceStore(5*time.Minute, nil)
	defer s.Stop()

	id := s.CreateTrace("")

	root := s.StartSpan(id, "root-operation", schemas.SpanKindHTTPRequest)
	if root == nil {
		t.Fatal("StartSpan() returned nil for root span")
	}

	second := s.StartSpan(id, "second-operation", schemas.SpanKindLLMCall)
	if second == nil {
		t.Fatal("StartSpan() returned nil for second span")
	}

	if got, want := second.ParentID, root.SpanID; got != want {
		t.Errorf("second span.ParentID = %q, want root span ID %q", got, want)
	}
}
// TestStartChildSpan_HasCorrectParent checks that StartChildSpan records the
// explicitly supplied parent span ID and the trace ID on the new span.
func TestStartChildSpan_HasCorrectParent(t *testing.T) {
	s := NewTraceStore(5*time.Minute, nil)
	defer s.Stop()

	id := s.CreateTrace("")

	root := s.StartSpan(id, "root-operation", schemas.SpanKindHTTPRequest)
	if root == nil {
		t.Fatal("StartSpan() returned nil for root span")
	}

	// Child created with an explicit parent.
	child := s.StartChildSpan(id, root.SpanID, "child-operation", schemas.SpanKindLLMCall)
	if child == nil {
		t.Fatal("StartChildSpan() returned nil")
	}

	if got, want := child.ParentID, root.SpanID; got != want {
		t.Errorf("child span.ParentID = %q, want %q", got, want)
	}
	if got := child.TraceID; got != id {
		t.Errorf("child span.TraceID = %q, want %q", got, id)
	}
}
// TestStartChildSpan_WithExternalParentSpanID simulates an incoming request that
// carries a W3C traceparent header: the trace is created with the inherited
// trace ID and the first span is started as a child of the upstream span ID.
func TestStartChildSpan_WithExternalParentSpanID(t *testing.T) {
	const (
		inheritedTraceID     = "69538b980000000079943934f90c1d40"
		externalParentSpanID = "aad09d1659b4c7e3" // span ID owned by the upstream service
	)

	ts := NewTraceStore(5*time.Minute, nil)
	defer ts.Stop()

	id := ts.CreateTrace(inheritedTraceID)

	// The service's own root span is started as a child of the external span.
	root := ts.StartChildSpan(id, externalParentSpanID, "bifrost-request", schemas.SpanKindHTTPRequest)
	if root == nil {
		t.Fatal("StartChildSpan() returned nil")
	}

	if got := root.ParentID; got != externalParentSpanID {
		t.Errorf("root span.ParentID = %q, want external parent %q", got, externalParentSpanID)
	}
	if got := root.TraceID; got != inheritedTraceID {
		t.Errorf("root span.TraceID = %q, want inherited trace ID %q", got, inheritedTraceID)
	}
}
// TestGetTrace_NotFound verifies that looking up an unknown trace ID yields nil.
func TestGetTrace_NotFound(t *testing.T) {
	ts := NewTraceStore(5*time.Minute, nil)
	defer ts.Stop()

	if got := ts.GetTrace("nonexistent-trace-id"); got != nil {
		t.Error("GetTrace() should return nil for nonexistent trace")
	}
}
// TestCompleteTrace_ReturnsAndRemoves verifies that CompleteTrace hands back the
// trace with its EndTime populated and that the trace can no longer be fetched
// from the store afterwards.
func TestCompleteTrace_ReturnsAndRemoves(t *testing.T) {
	ts := NewTraceStore(5*time.Minute, nil)
	defer ts.Stop()

	id := ts.CreateTrace("")
	ts.StartSpan(id, "operation", schemas.SpanKindHTTPRequest)

	completed := ts.CompleteTrace(id)
	if completed == nil {
		t.Fatal("CompleteTrace() returned nil")
	}

	if got := completed.TraceID; got != id {
		t.Errorf("trace.TraceID = %q, want %q", got, id)
	}
	if completed.EndTime.IsZero() {
		t.Error("trace.EndTime should be set")
	}

	// Completing a trace must also evict it from the store.
	if leftover := ts.GetTrace(id); leftover != nil {
		t.Error("Trace should be removed from store after CompleteTrace()")
	}
}
// TestEndSpan_SetsStatusAndTime verifies that EndSpan records the status, sets
// the span's EndTime, and merges the supplied attributes into the span.
func TestEndSpan_SetsStatusAndTime(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()

	traceID := store.CreateTrace("")
	span := store.StartSpan(traceID, "operation", schemas.SpanKindHTTPRequest)
	// Guard before dereferencing span.SpanID so a failed StartSpan is reported
	// as a test failure instead of a nil-pointer panic.
	if span == nil {
		t.Fatal("StartSpan() returned nil")
	}

	store.EndSpan(traceID, span.SpanID, schemas.SpanStatusOk, "success", map[string]any{
		"custom.attr": "value",
	})

	if span.Status != schemas.SpanStatusOk {
		t.Errorf("span.Status = %v, want SpanStatusOk", span.Status)
	}
	if span.EndTime.IsZero() {
		t.Error("span.EndTime should be set")
	}
	if span.Attributes["custom.attr"] != "value" {
		t.Error("EndSpan() should set custom attributes")
	}
}
// TestGenerateTraceID_Format verifies that generateTraceID yields a 32-character
// string accepted by isHex.
func TestGenerateTraceID_Format(t *testing.T) {
	traceID := generateTraceID()

	if n := len(traceID); n != 32 {
		t.Errorf("generateTraceID() length = %d, want 32", n)
	}
	if ok := isHex(traceID); !ok {
		t.Errorf("generateTraceID() = %q, not valid hex", traceID)
	}
}
// TestGenerateSpanID_Format verifies that generateSpanID yields a 16-character
// string accepted by isHex.
func TestGenerateSpanID_Format(t *testing.T) {
	spanID := generateSpanID()

	if n := len(spanID); n != 16 {
		t.Errorf("generateSpanID() length = %d, want 16", n)
	}
	if ok := isHex(spanID); !ok {
		t.Errorf("generateSpanID() = %q, not valid hex", spanID)
	}
}

440
framework/tracing/tracer.go Normal file
View File

@@ -0,0 +1,440 @@
// Package tracing provides distributed tracing infrastructure for Bifrost
package tracing
import (
"context"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/maximhq/bifrost/core/schemas"
"github.com/maximhq/bifrost/framework/modelcatalog"
"github.com/maximhq/bifrost/framework/streaming"
)
// Tracer implements schemas.Tracer using TraceStore.
// It provides the bridge between the core Tracer interface and the
// framework's TraceStore implementation.
// It also holds a streaming.Accumulator (as a named field, not an embedded
// type) for centralized streaming chunk accumulation.
type Tracer struct {
// store is the backing TraceStore that every trace/span operation delegates to.
store *TraceStore
// accumulator is created by NewTracer and used by the stream accumulator methods.
accumulator *streaming.Accumulator
// pricingManager computes the cost attribute in PopulateLLMResponseAttributes; may be nil.
pricingManager *modelcatalog.ModelCatalog
// logger reports plugin failures in CompleteAndFlushTrace; may be nil.
logger schemas.Logger
// obsPlugins is swapped atomically by SetObservabilityPlugins and read by CompleteAndFlushTrace.
obsPlugins atomic.Pointer[[]schemas.ObservabilityPlugin]
// flushWG tracks goroutines started by CompleteAndFlushTrace; Stop waits on it.
flushWG sync.WaitGroup
}
// NewTracer builds a Tracer on top of the given TraceStore.
//
// A streaming accumulator is constructed from pricingManager and logger and
// stored on the Tracer for centralized streaming chunk accumulation.
// pricingManager is also kept for cost calculation in span attributes. The
// observability plugin pointer starts at its zero value (no plugins) until
// SetObservabilityPlugins is called.
func NewTracer(store *TraceStore, pricingManager *modelcatalog.ModelCatalog, logger schemas.Logger) *Tracer {
	tracer := &Tracer{
		store:          store,
		pricingManager: pricingManager,
		logger:         logger,
	}
	tracer.accumulator = streaming.NewAccumulator(pricingManager, logger)
	return tracer
}
// SetObservabilityPlugins atomically replaces the set of plugins that receive
// completed traces. Calling it on a nil *Tracer is a no-op.
func (t *Tracer) SetObservabilityPlugins(obsPlugins []schemas.ObservabilityPlugin) {
	if t != nil {
		t.obsPlugins.Store(&obsPlugins)
	}
}
// CreateTrace creates a new trace with optional parent ID and returns the trace ID.
// It delegates directly to TraceStore.CreateTrace, forwarding the optional
// requestID values unchanged.
// NOTE(review): the tests in this package pass an inherited W3C trace ID as
// parentID and expect the same value back as the trace ID — confirm against
// TraceStore.CreateTrace.
func (t *Tracer) CreateTrace(parentID string, requestID ...string) string {
return t.store.CreateTrace(parentID, requestID...)
}
// EndTrace completes the trace identified by traceID and returns its data for
// observation/export. The result of TraceStore.CompleteTrace is passed through
// as-is, so a nil result from the store is returned as nil.
//
// The caller is responsible for releasing the returned trace after plugin
// processing, either by calling ReleaseTrace or by letting the GC handle it.
func (t *Tracer) EndTrace(traceID string) *schemas.Trace {
	return t.store.CompleteTrace(traceID)
}
// ReleaseTrace returns the trace to the pool for reuse.
// Should be called after EndTrace when the trace data is no longer needed.
// It delegates directly to TraceStore.ReleaseTrace.
func (t *Tracer) ReleaseTrace(trace *schemas.Trace) {
t.store.ReleaseTrace(trace)
}
// spanHandle is the Tracer's concrete schemas.SpanHandle. It carries only the
// identifiers needed to look a span up in the store: the ID of the owning
// trace and the ID of the span itself.
type spanHandle struct {
	traceID, spanID string
}
// StartSpan starts a span on the trace referenced by ctx and returns a derived
// context carrying the new span's ID together with a handle for the span.
//
// The parent span ID is resolved in this order:
//  1. BifrostContextKeySpanID - a span already started in this service
//  2. BifrostContextKeyParentSpanID - the incoming parent from a W3C traceparent header
//  3. neither present - the span is started through the store's StartSpan
//
// When ctx carries no trace ID, or the store fails to create the span, the
// original ctx and a nil handle are returned.
func (t *Tracer) StartSpan(ctx context.Context, name string, kind schemas.SpanKind) (context.Context, schemas.SpanHandle) {
	traceID := GetTraceID(ctx)
	if traceID == "" {
		return ctx, nil
	}

	// Prefer the span currently active in this service; fall back to the
	// upstream parent so the first local span links to the caller's span.
	parentID, _ := ctx.Value(schemas.BifrostContextKeySpanID).(string)
	if parentID == "" {
		parentID, _ = ctx.Value(schemas.BifrostContextKeyParentSpanID).(string)
	}

	var started *schemas.Span
	switch parentID {
	case "":
		started = t.store.StartSpan(traceID, name, kind)
	default:
		started = t.store.StartChildSpan(traceID, parentID, name, kind)
	}
	if started == nil {
		return ctx, nil
	}

	// Expose the new span ID so nested StartSpan calls pick it up as parent.
	handle := &spanHandle{traceID: traceID, spanID: started.SpanID}
	return context.WithValue(ctx, schemas.BifrostContextKeySpanID, started.SpanID), handle
}
// EndSpan completes the span referenced by handle with the given status and
// message. Handles that are not a non-nil *spanHandle are ignored. No extra
// attributes are passed to the store.
func (t *Tracer) EndSpan(handle schemas.SpanHandle, status schemas.SpanStatus, statusMsg string) {
	if h, ok := handle.(*spanHandle); ok && h != nil {
		t.store.EndSpan(h.traceID, h.spanID, status, statusMsg, nil)
	}
}
// SetAttribute sets key=value on the span referenced by handle. It is a no-op
// when the handle is not a non-nil *spanHandle, or when the trace or span can
// no longer be found in the store.
func (t *Tracer) SetAttribute(handle schemas.SpanHandle, key string, value any) {
	h, ok := handle.(*spanHandle)
	if !ok || h == nil {
		return
	}
	if trace := t.store.GetTrace(h.traceID); trace != nil {
		if span := trace.GetSpan(h.spanID); span != nil {
			span.SetAttribute(key, value)
		}
	}
}
// AddEvent appends an event named name, timestamped with the current time and
// carrying attrs, to the span referenced by handle. It is a no-op when the
// handle is not a non-nil *spanHandle, or when the trace or span is missing.
func (t *Tracer) AddEvent(handle schemas.SpanHandle, name string, attrs map[string]any) {
	h, ok := handle.(*spanHandle)
	if !ok || h == nil {
		return
	}

	trace := t.store.GetTrace(h.traceID)
	if trace == nil {
		return
	}

	span := trace.GetSpan(h.spanID)
	if span == nil {
		return
	}

	// The timestamp is taken only once the target span is known to exist.
	event := schemas.SpanEvent{
		Name:       name,
		Timestamp:  time.Now(),
		Attributes: attrs,
	}
	span.AddEvent(event)
}
// PopulateLLMRequestAttributes copies every attribute produced by
// PopulateRequestAttributes(req) onto the span referenced by handle. It is a
// no-op when req is nil, the handle is not a non-nil *spanHandle, or the trace
// or span cannot be found.
func (t *Tracer) PopulateLLMRequestAttributes(handle schemas.SpanHandle, req *schemas.BifrostRequest) {
	h, ok := handle.(*spanHandle)
	switch {
	case !ok, h == nil, req == nil:
		return
	}

	trace := t.store.GetTrace(h.traceID)
	if trace == nil {
		return
	}
	span := trace.GetSpan(h.spanID)
	if span == nil {
		return
	}

	attrs := PopulateRequestAttributes(req)
	for key, val := range attrs {
		span.SetAttribute(key, val)
	}
}
// PopulateLLMResponseAttributes copies the attributes derived from resp and err
// (via PopulateResponseAttributes and PopulateErrorAttributes) onto the span
// referenced by handle. When a pricing manager is configured and resp is
// non-nil, the calculated cost is additionally stored under
// schemas.AttrUsageCost. It is a no-op when the handle is not a non-nil
// *spanHandle, or when the trace or span cannot be found.
func (t *Tracer) PopulateLLMResponseAttributes(ctx *schemas.BifrostContext, handle schemas.SpanHandle, resp *schemas.BifrostResponse, err *schemas.BifrostError) {
	h, ok := handle.(*spanHandle)
	if !ok || h == nil {
		return
	}

	trace := t.store.GetTrace(h.traceID)
	if trace == nil {
		return
	}
	span := trace.GetSpan(h.spanID)
	if span == nil {
		return
	}

	for key, val := range PopulateResponseAttributes(resp) {
		span.SetAttribute(key, val)
	}
	for key, val := range PopulateErrorAttributes(err) {
		span.SetAttribute(key, val)
	}

	// Cost needs both a pricing manager and an actual response to price.
	if t.pricingManager == nil || resp == nil {
		return
	}
	provider := string(resp.GetExtraFields().Provider)
	scopes := modelcatalog.PricingLookupScopesFromContext(ctx, provider)
	span.SetAttribute(schemas.AttrUsageCost, t.pricingManager.CalculateCost(resp, scopes))
}
// StoreDeferredSpan records the span behind handle as the deferred span for
// traceID so it can be completed later (used for streaming requests). Only the
// handle's span ID is stored, keyed by the traceID argument. Handles that are
// not a non-nil *spanHandle are ignored.
func (t *Tracer) StoreDeferredSpan(traceID string, handle schemas.SpanHandle) {
	if h, ok := handle.(*spanHandle); ok && h != nil {
		t.store.StoreDeferredSpan(traceID, h.spanID)
	}
}
// GetDeferredSpanHandle rebuilds a span handle for the deferred span stored
// under traceID. It returns nil when the store has no deferred span for that
// trace ID.
func (t *Tracer) GetDeferredSpanHandle(traceID string) schemas.SpanHandle {
	if info := t.store.GetDeferredSpan(traceID); info != nil {
		return &spanHandle{traceID: traceID, spanID: info.SpanID}
	}
	// Return an untyped nil so callers comparing the interface to nil see nil.
	return nil
}
// ClearDeferredSpan removes the deferred span handle for a trace ID.
// Should be called after the deferred span has been completed.
// It delegates directly to TraceStore.ClearDeferredSpan.
func (t *Tracer) ClearDeferredSpan(traceID string) {
t.store.ClearDeferredSpan(traceID)
}
// GetDeferredSpanID reports the span ID of the deferred span stored under
// traceID, or the empty string when there is none.
func (t *Tracer) GetDeferredSpanID(traceID string) string {
	if info := t.store.GetDeferredSpan(traceID); info != nil {
		return info.SpanID
	}
	return ""
}
// AddStreamingChunk forwards a streaming chunk to the store so that TTFT and
// chunk count are tracked for the trace's deferred span. Chunk contents are no
// longer stored here; full content accumulation is handled by the Tracer's
// streaming.Accumulator (via ProcessStreamingChunk) for plugins.
// Calls with an empty traceID or a nil response are ignored.
func (t *Tracer) AddStreamingChunk(traceID string, response *schemas.BifrostResponse) {
	if traceID != "" && response != nil {
		t.store.AppendStreamingChunk(traceID, response)
	}
}
// GetAccumulatedChunks returns, for the deferred span of traceID, the
// accumulated response, the time to first token in nanoseconds, and the chunk
// count, in that order. The response is the one stored during the final
// ProcessStreamingChunk call; it is nil when no accumulated data is available
// (e.g., when no plugin calls ProcessStreamingChunk).
func (t *Tracer) GetAccumulatedChunks(traceID string) (*schemas.BifrostResponse, int64, int) {
	// Timing data is read before the response, matching the store call order.
	ttftNs, chunkCount := t.store.GetAccumulatedData(traceID)
	return t.store.GetAccumulatedResponse(traceID), ttftNs, chunkCount
}
// CreateStreamAccumulator registers a stream accumulator for traceID, starting
// at startTime. It should be called at the start of a streaming request.
// Nothing happens when traceID is empty or the Tracer has no accumulator.
func (t *Tracer) CreateStreamAccumulator(traceID string, startTime time.Time) {
	if traceID != "" && t.accumulator != nil {
		t.accumulator.CreateStreamAccumulator(traceID, startTime)
	}
}
// CleanupStreamAccumulator removes the stream accumulator registered for
// traceID. It should be called once the streaming request is complete.
//
// An empty traceID, a missing accumulator, or a failure reported by the
// accumulator is logged through the store's logger when one is available;
// nothing is returned to the caller.
func (t *Tracer) CleanupStreamAccumulator(traceID string) {
	// Logging goes through the store's logger and is skipped when it is absent.
	canLog := t.store != nil && t.store.logger != nil

	if traceID == "" || t.accumulator == nil {
		if canLog {
			t.store.logger.Error("traceID or accumulator is nil in CleanupStreamAccumulator")
		}
		return
	}

	err := t.accumulator.CleanupStreamAccumulator(traceID)
	if err != nil && canLog {
		t.store.logger.Error("error in CleanupStreamAccumulator: %v", err)
	}
}
// ProcessStreamingChunk feeds one streaming chunk (result and/or err) into the
// accumulator registered for traceID and returns the accumulated state as a
// schemas.StreamAccumulatorResult. Plugins use it to access accumulated
// streaming data.
//
// isFinalChunk marks the end of the stream; it is forwarded to the accumulator
// as the stream end indicator. On the final chunk, an accumulated response
// that carries at least one payload is stored on the deferred span so span
// attributes can be populated from it later.
//
// It returns nil when traceID is empty, the Tracer has no accumulator, or the
// accumulator reports an error or yields no processed response.
func (t *Tracer) ProcessStreamingChunk(traceID string, isFinalChunk bool, result *schemas.BifrostResponse, err *schemas.BifrostError) *schemas.StreamAccumulatorResult {
	if traceID == "" || t.accumulator == nil {
		return nil
	}

	// The accumulator is driven by a fresh context that uses the trace ID as
	// its accumulator lookup ID.
	accumCtx := schemas.NewBifrostContext(context.Background(), time.Time{})
	accumCtx.SetValue(schemas.BifrostContextKeyAccumulatorID, traceID)
	accumCtx.SetValue(schemas.BifrostContextKeyStreamEndIndicator, isFinalChunk)

	processed, procErr := t.accumulator.ProcessStreamingResponse(accumCtx, result, err)
	if procErr != nil || processed == nil {
		return nil
	}

	if isFinalChunk {
		// Keep the accumulated response only when it holds a payload.
		if final := processed.ToBifrostResponse(); final != nil {
			switch {
			case final.ChatResponse != nil,
				final.TextCompletionResponse != nil,
				final.SpeechResponse != nil,
				final.TranscriptionResponse != nil,
				final.ImageGenerationResponse != nil,
				final.ResponsesResponse != nil:
				t.store.SetAccumulatedResponse(traceID, final)
			}
		}
	}

	// Translate the processed stream response into the accumulator result.
	out := &schemas.StreamAccumulatorResult{
		RequestID:      processed.RequestID,
		RequestedModel: processed.RequestedModel,
		ResolvedModel:  processed.ResolvedModel,
		Provider:       processed.Provider,
	}

	if data := processed.Data; data != nil {
		out.Status = data.Status
		out.Latency = data.Latency
		out.TimeToFirstToken = data.TimeToFirstToken
		out.OutputMessage = data.OutputMessage
		out.OutputMessages = data.OutputMessages
		out.TokenUsage = data.TokenUsage
		out.Cost = data.Cost
		out.ErrorDetails = data.ErrorDetails
		out.AudioOutput = data.AudioOutput
		out.TranscriptionOutput = data.TranscriptionOutput
		out.ImageGenerationOutput = data.ImageGenerationOutput
		out.FinishReason = data.FinishReason
		out.RawResponse = data.RawResponse

		// Fall back to the cost carried on the token usage when the top-level
		// cost is missing or zero.
		costMissing := out.Cost == nil || *out.Cost == 0.0
		if costMissing && out.TokenUsage != nil && out.TokenUsage.Cost != nil {
			out.Cost = &out.TokenUsage.Cost.TotalCost
		}
	}

	if raw := processed.RawRequest; raw != nil {
		out.RawRequest = *raw
	}
	return out
}
// GetAccumulator returns the Tracer's streaming accumulator (the value created
// in NewTracer).
// This is useful for plugins that need direct access to accumulator methods.
func (t *Tracer) GetAccumulator() *streaming.Accumulator {
return t.accumulator
}
// AttachPluginLogs appends the given plugin log entries to the trace stored
// under traceID. It does nothing when logs is empty, traceID is empty, or the
// trace is not present in the store.
func (t *Tracer) AttachPluginLogs(traceID string, logs []schemas.PluginLogEntry) {
	if len(logs) == 0 || traceID == "" {
		return
	}
	if trace := t.store.GetTrace(traceID); trace != nil {
		trace.AppendPluginLogs(logs)
	}
}
// Stop stops the tracer and releases its resources.
// It first waits for every goroutine started by CompleteAndFlushTrace, then
// calls Stop on the TraceStore and Cleanup on the accumulator when they are set.
func (t *Tracer) Stop() {
// Let in-flight trace flushes finish before the store is torn down.
t.flushWG.Wait()
if t.store != nil {
t.store.Stop()
}
if t.accumulator != nil {
t.accumulator.Cleanup()
}
}
// CompleteAndFlushTrace ends the trace identified by traceID and hands it to
// the registered observability plugins on a background goroutine tracked by
// the Tracer's flush wait group. Realtime transports need this explicit flush
// because they bypass the HTTP tracing middleware that normally injects
// completed traces.
//
// Surrounding whitespace in traceID is ignored. The call is a no-op on a nil
// *Tracer or a blank traceID. Each plugin name receives the trace at most once;
// nil plugins are skipped, and a panic or error from one plugin is logged
// (when a logger is set) without affecting the others.
func (t *Tracer) CompleteAndFlushTrace(traceID string) {
	if t == nil {
		return
	}
	id := strings.TrimSpace(traceID)
	if id == "" {
		return
	}

	t.flushWG.Go(func() {
		completed := t.EndTrace(id)
		if completed == nil {
			return
		}
		// Release via defer so the pooled trace is returned on every exit path
		// of this goroutine.
		defer t.ReleaseTrace(completed)

		var plugins []schemas.ObservabilityPlugin
		if current := t.obsPlugins.Load(); current != nil {
			plugins = *current
		}
		delivered := make(map[string]struct{}, len(plugins))

		// deliver isolates a single plugin callback: one bad observability
		// backend must not crash the server or starve the remaining plugins.
		deliver := func(p schemas.ObservabilityPlugin) {
			name := "<unknown>"
			defer func() {
				if r := recover(); r != nil && t.logger != nil {
					t.logger.Error("observability plugin %s panicked during trace injection: %v", name, r)
				}
			}()

			name = p.GetName()
			if _, dup := delivered[name]; dup {
				return
			}
			delivered[name] = struct{}{}

			if err := p.Inject(context.Background(), completed); err != nil && t.logger != nil {
				t.logger.Warn("observability plugin %s failed to inject trace: %v", name, err)
			}
		}

		for _, p := range plugins {
			if p != nil {
				deliver(p)
			}
		}
	})
}
// Ensure Tracer implements schemas.Tracer at compile time.
// The assignment has no runtime effect; it only fails the build if *Tracer
// stops satisfying the interface.
var _ schemas.Tracer = (*Tracer)(nil)

View File

@@ -0,0 +1,439 @@
package tracing
import (
"context"
"testing"
"time"
"github.com/maximhq/bifrost/core/schemas"
)
// testRealtimeObservabilityPlugin is a test double for an observability plugin.
// Every trace passed to Inject is forwarded on the injected channel so a test
// can wait for and inspect it.
type testRealtimeObservabilityPlugin struct {
// injected receives a copy of each injected trace (or nil for a nil trace).
injected chan *schemas.Trace
}
// GetName returns the fixed name of the test plugin.
func (p *testRealtimeObservabilityPlugin) GetName() string {
	return "test-observability"
}
// Cleanup has nothing to release for the test plugin and always returns nil.
func (p *testRealtimeObservabilityPlugin) Cleanup() error {
	return nil
}
// PreLLMHook is a pass-through: it returns req unchanged with no short-circuit
// and no error.
func (p *testRealtimeObservabilityPlugin) PreLLMHook(_ *schemas.BifrostContext, req *schemas.BifrostRequest) (*schemas.BifrostRequest, *schemas.LLMPluginShortCircuit, error) {
return req, nil, nil
}
// PostLLMHook is a pass-through: it returns resp and bifrostErr unchanged and
// no hook error.
func (p *testRealtimeObservabilityPlugin) PostLLMHook(_ *schemas.BifrostContext, resp *schemas.BifrostResponse, bifrostErr *schemas.BifrostError) (*schemas.BifrostResponse, *schemas.BifrostError, error) {
return resp, bifrostErr, nil
}
// Inject publishes the received trace on the injected channel and returns nil.
// A non-nil trace is shallow-copied first so the test observes a snapshot
// taken at injection time; a nil trace is published as nil.
func (p *testRealtimeObservabilityPlugin) Inject(_ context.Context, trace *schemas.Trace) error {
	var snapshot *schemas.Trace
	if trace != nil {
		dup := *trace
		snapshot = &dup
	}
	p.injected <- snapshot
	return nil
}
// TestTracer_CompleteAndFlushTraceInjectsObservabilityPlugins verifies that
// CompleteAndFlushTrace delivers the completed trace to a registered
// observability plugin and that the trace is gone from the store afterwards.
func TestTracer_CompleteAndFlushTraceInjectsObservabilityPlugins(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	traceID := tracer.CreateTrace("")

	// Buffered so the plugin's send never blocks the flush goroutine.
	injected := make(chan *schemas.Trace, 1)
	tracer.SetObservabilityPlugins([]schemas.ObservabilityPlugin{
		&testRealtimeObservabilityPlugin{injected: injected},
	})

	tracer.CompleteAndFlushTrace(traceID)

	// The flush is asynchronous; wait for the plugin callback with a timeout.
	select {
	case got := <-injected:
		if got == nil || got.TraceID != traceID {
			t.Fatalf("injected trace = %+v, want trace %q", got, traceID)
		}
	case <-time.After(time.Second):
		t.Fatal("timed out waiting for observability inject")
	}

	if remaining := tracer.store.GetTrace(traceID); remaining != nil {
		t.Fatalf("trace %q was not released after flush", traceID)
	}
}
// TestTracer_StartSpan_RootSpanWithW3CParent is the key linking test: when an
// incoming request carries a W3C traceparent header, the first span started by
// the Tracer must point at the upstream service's span, keep the inherited
// trace ID, and publish its own span ID on the returned context.
func TestTracer_StartSpan_RootSpanWithW3CParent(t *testing.T) {
	// Simulated incoming traceparent: 00-{traceID}-{parentSpanID}-01
	const (
		inheritedTraceID     = "69538b980000000079943934f90c1d40"
		externalParentSpanID = "aad09d1659b4c7e3"
	)

	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	traceID := tracer.CreateTrace(inheritedTraceID)
	if traceID != inheritedTraceID {
		t.Errorf("CreateTrace() = %q, want inherited trace ID %q", traceID, inheritedTraceID)
	}

	// Build the context the way the middleware would: trace ID plus the
	// upstream parent span ID.
	base := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)
	base = context.WithValue(base, schemas.BifrostContextKeyParentSpanID, externalParentSpanID)

	spanCtx, handle := tracer.StartSpan(base, "bifrost-http-request", schemas.SpanKindHTTPRequest)
	if handle == nil {
		t.Fatal("StartSpan() returned nil handle")
	}

	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("Trace not found in store")
	}
	root := trace.RootSpan
	if root == nil {
		t.Fatal("Root span not set on trace")
	}

	// The critical check: the root span links to the external parent span.
	if got := root.ParentID; got != externalParentSpanID {
		t.Errorf("Root span ParentID = %q, want external parent span ID %q", got, externalParentSpanID)
	}
	if got := root.TraceID; got != inheritedTraceID {
		t.Errorf("Root span TraceID = %q, want %q", got, inheritedTraceID)
	}

	// The derived context must carry the new span ID for child span creation.
	ctxSpanID, found := spanCtx.Value(schemas.BifrostContextKeySpanID).(string)
	if !found || ctxSpanID == "" {
		t.Error("Context should have span ID after StartSpan()")
	}
	if ctxSpanID != root.SpanID {
		t.Errorf("Context span ID = %q, want %q", ctxSpanID, root.SpanID)
	}
}
// TestTracer_StartSpan_RootSpanWithoutW3CParent verifies that, when the context
// carries only a trace ID (no incoming W3C parent span ID), the root span is
// created with an empty ParentID.
func TestTracer_StartSpan_RootSpanWithoutW3CParent(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	// Create new trace (no inherited trace ID)
	traceID := tracer.CreateTrace("")

	// Set up context with only trace ID (no parent span ID)
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)

	// Create root span
	_, handle := tracer.StartSpan(ctx, "local-request", schemas.SpanKindHTTPRequest)
	if handle == nil {
		t.Fatal("StartSpan() returned nil handle")
	}

	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("Trace not found in store")
	}
	// Guard RootSpan before dereferencing it so a missing root span fails the
	// test cleanly instead of panicking.
	if trace.RootSpan == nil {
		t.Fatal("Root span not set on trace")
	}

	// Root span should have no parent
	if trace.RootSpan.ParentID != "" {
		t.Errorf("Root span ParentID = %q, want empty string (no W3C parent)", trace.RootSpan.ParentID)
	}
}
// TestTracer_StartSpan_ChildSpanLinking verifies parent/child linking across
// three levels: the root span (started with a W3C parent in context), a child
// started from the root's context, and a grandchild started from the child's
// context. Each span must name the span one level up as its parent.
func TestTracer_StartSpan_ChildSpanLinking(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	inheritedTraceID := "69538b980000000079943934f90c1d40"
	externalParentSpanID := "aad09d1659b4c7e3"

	traceID := tracer.CreateTrace(inheritedTraceID)

	// Set up context with W3C parent span ID
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)
	ctx = context.WithValue(ctx, schemas.BifrostContextKeyParentSpanID, externalParentSpanID)

	// Create root span
	rootCtx, rootHandle := tracer.StartSpan(ctx, "http-request", schemas.SpanKindHTTPRequest)
	if rootHandle == nil {
		t.Fatal("StartSpan() returned nil handle for root span")
	}

	// Create child span using the context from root span
	childCtx, childHandle := tracer.StartSpan(rootCtx, "llm-call", schemas.SpanKindLLMCall)
	if childHandle == nil {
		t.Fatal("StartSpan() returned nil handle for child span")
	}

	// Guard the lookups below so a missing trace or root span fails the test
	// cleanly instead of panicking on a nil dereference.
	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("Trace not found in store")
	}
	if trace.RootSpan == nil {
		t.Fatal("Root span not set on trace")
	}

	// findSpan returns the first span in the trace with the given name, or nil.
	findSpan := func(name string) *schemas.Span {
		for _, span := range trace.Spans {
			if span.Name == name {
				return span
			}
		}
		return nil
	}

	childSpan := findSpan("llm-call")
	if childSpan == nil {
		t.Fatal("Child span not found in trace")
	}

	// Child span should have root span as parent (not the external parent)
	if childSpan.ParentID != trace.RootSpan.SpanID {
		t.Errorf("Child span ParentID = %q, want root span ID %q", childSpan.ParentID, trace.RootSpan.SpanID)
	}

	// Create grandchild span
	_, grandchildHandle := tracer.StartSpan(childCtx, "plugin-call", schemas.SpanKindPlugin)
	if grandchildHandle == nil {
		t.Fatal("StartSpan() returned nil handle for grandchild span")
	}

	grandchildSpan := findSpan("plugin-call")
	if grandchildSpan == nil {
		t.Fatal("Grandchild span not found in trace")
	}

	// Grandchild should have child as parent
	if grandchildSpan.ParentID != childSpan.SpanID {
		t.Errorf("Grandchild span ParentID = %q, want child span ID %q", grandchildSpan.ParentID, childSpan.SpanID)
	}
}
// TestTracer_StartSpan_NoTraceID verifies that StartSpan on a context without a
// trace ID returns a nil handle and hands back the very same context.
func TestTracer_StartSpan_NoTraceID(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	// A bare background context carries no trace ID.
	base := context.Background()

	gotCtx, handle := tracer.StartSpan(base, "operation", schemas.SpanKindHTTPRequest)
	if handle != nil {
		t.Error("StartSpan() should return nil handle when no trace ID in context")
	}
	if gotCtx != base {
		t.Error("Context should be unchanged when StartSpan() fails")
	}
}
// TestTracer_EndTrace_ReturnsTraceData verifies that EndTrace returns the
// completed trace with its inherited trace ID, exactly one span, and a root
// span that still references the external parent span ID.
func TestTracer_EndTrace_ReturnsTraceData(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	inheritedTraceID := "69538b980000000079943934f90c1d40"
	externalParentSpanID := "aad09d1659b4c7e3"

	traceID := tracer.CreateTrace(inheritedTraceID)
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)
	ctx = context.WithValue(ctx, schemas.BifrostContextKeyParentSpanID, externalParentSpanID)

	_, rootHandle := tracer.StartSpan(ctx, "http-request", schemas.SpanKindHTTPRequest)
	// Without this check a failed StartSpan would only surface later as a
	// confusing span-count or nil-pointer failure.
	if rootHandle == nil {
		t.Fatal("StartSpan() returned nil handle for root span")
	}
	tracer.EndSpan(rootHandle, schemas.SpanStatusOk, "")

	trace := tracer.EndTrace(traceID)
	if trace == nil {
		t.Fatal("EndTrace() returned nil")
	}

	if trace.TraceID != inheritedTraceID {
		t.Errorf("trace.TraceID = %q, want %q", trace.TraceID, inheritedTraceID)
	}
	if len(trace.Spans) != 1 {
		t.Errorf("len(trace.Spans) = %d, want 1", len(trace.Spans))
	}

	// Guard RootSpan before dereferencing it.
	if trace.RootSpan == nil {
		t.Fatal("Root span not set on trace")
	}
	// Root span should still have external parent
	if trace.RootSpan.ParentID != externalParentSpanID {
		t.Errorf("Root span ParentID = %q, want %q", trace.RootSpan.ParentID, externalParentSpanID)
	}
}
// TestTracer_SetAttribute verifies that attributes set through the Tracer end
// up in the Attributes map of the span referenced by the handle.
func TestTracer_SetAttribute(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	traceID := tracer.CreateTrace("")
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)

	_, handle := tracer.StartSpan(ctx, "operation", schemas.SpanKindHTTPRequest)
	// SetAttribute silently ignores a nil handle, so fail early if span
	// creation did not succeed.
	if handle == nil {
		t.Fatal("StartSpan() returned nil handle")
	}

	tracer.SetAttribute(handle, "http.method", "POST")
	tracer.SetAttribute(handle, "http.status_code", 200)

	// Guard the lookups so a missing trace or root span is a test failure,
	// not a nil-pointer panic.
	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("Trace not found in store")
	}
	span := trace.RootSpan
	if span == nil {
		t.Fatal("Root span not set on trace")
	}

	if span.Attributes["http.method"] != "POST" {
		t.Errorf("span attribute http.method = %v, want POST", span.Attributes["http.method"])
	}
	if span.Attributes["http.status_code"] != 200 {
		t.Errorf("span attribute http.status_code = %v, want 200", span.Attributes["http.status_code"])
	}
}
// TestTracer_AddEvent verifies that an event added through the Tracer is
// appended to the span's Events with the given name and attributes.
func TestTracer_AddEvent(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	traceID := tracer.CreateTrace("")
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)

	_, handle := tracer.StartSpan(ctx, "operation", schemas.SpanKindHTTPRequest)
	// AddEvent silently ignores a nil handle, so fail early if span creation
	// did not succeed.
	if handle == nil {
		t.Fatal("StartSpan() returned nil handle")
	}

	tracer.AddEvent(handle, "request.received", map[string]any{
		"size": 1024,
	})

	// Guard the lookups so a missing trace or root span is a test failure,
	// not a nil-pointer panic.
	trace := store.GetTrace(traceID)
	if trace == nil {
		t.Fatal("Trace not found in store")
	}
	span := trace.RootSpan
	if span == nil {
		t.Fatal("Root span not set on trace")
	}

	if len(span.Events) != 1 {
		t.Fatalf("len(span.Events) = %d, want 1", len(span.Events))
	}
	if span.Events[0].Name != "request.received" {
		t.Errorf("event name = %q, want request.received", span.Events[0].Name)
	}
	if span.Events[0].Attributes["size"] != 1024 {
		t.Errorf("event attribute size = %v, want 1024", span.Events[0].Attributes["size"])
	}
}
// TestIntegration_FullDistributedTraceFlow tests the complete flow of receiving
// a distributed trace from an upstream service and properly linking spans:
// the HTTP span links to the upstream parent, the LLM span to the HTTP span,
// and the plugin span to the LLM span, all under the inherited trace ID.
func TestIntegration_FullDistributedTraceFlow(t *testing.T) {
	store := NewTraceStore(5*time.Minute, nil)
	defer store.Stop()
	tracer := NewTracer(store, nil, nil)
	defer tracer.Stop()

	// Simulating headers from user's actual Datadog request:
	// traceparent: 00-69538b980000000079943934f90c1d40-aad09d1659b4c7e3-01
	inheritedTraceID := "69538b980000000079943934f90c1d40"
	externalParentSpanID := "aad09d1659b4c7e3"

	// Step 1: Middleware extracts trace context and creates trace
	traceID := tracer.CreateTrace(inheritedTraceID)

	// Step 2: Middleware sets up context (simulating what TracingMiddleware does)
	ctx := context.WithValue(context.Background(), schemas.BifrostContextKeyTraceID, traceID)
	ctx = context.WithValue(ctx, schemas.BifrostContextKeyParentSpanID, externalParentSpanID)

	// Step 3: Middleware creates root span
	httpCtx, httpHandle := tracer.StartSpan(ctx, "/v1/chat/completions", schemas.SpanKindHTTPRequest)
	tracer.SetAttribute(httpHandle, "http.method", "POST")

	// Step 4: Bifrost creates LLM call span
	llmCtx, llmHandle := tracer.StartSpan(httpCtx, "openai.chat.completions", schemas.SpanKindLLMCall)
	tracer.SetAttribute(llmHandle, "llm.model", "gpt-4")
	tracer.SetAttribute(llmHandle, "llm.provider", "openai")

	// Step 5: Plugin creates its own span
	_, pluginHandle := tracer.StartSpan(llmCtx, "governance-plugin", schemas.SpanKindPlugin)
	tracer.SetAttribute(pluginHandle, "plugin.name", "governance")

	// Step 6: Complete spans (in reverse order)
	tracer.EndSpan(pluginHandle, schemas.SpanStatusOk, "")
	tracer.EndSpan(llmHandle, schemas.SpanStatusOk, "")
	tracer.EndSpan(httpHandle, schemas.SpanStatusOk, "")

	// Step 7: Complete trace. Guard the result so a missing trace fails the
	// test cleanly instead of panicking on the dereferences below.
	trace := tracer.EndTrace(traceID)
	if trace == nil {
		t.Fatal("EndTrace() returned nil")
	}

	// Verify the trace structure for Datadog
	if trace.TraceID != inheritedTraceID {
		t.Errorf("Trace ID should match inherited ID from Datadog: got %q, want %q", trace.TraceID, inheritedTraceID)
	}

	// Find spans by name
	var httpSpan, llmSpan, pluginSpan *schemas.Span
	for _, span := range trace.Spans {
		switch span.Name {
		case "/v1/chat/completions":
			httpSpan = span
		case "openai.chat.completions":
			llmSpan = span
		case "governance-plugin":
			pluginSpan = span
		}
	}

	if httpSpan == nil || llmSpan == nil || pluginSpan == nil {
		t.Fatal("Not all spans found in trace")
	}

	// Verify span hierarchy for Datadog linking:
	// External Parent (aad09d1659b4c7e3) -> HTTP Span -> LLM Span -> Plugin Span

	// HTTP span should link to Datadog's parent span
	if httpSpan.ParentID != externalParentSpanID {
		t.Errorf("HTTP span should link to Datadog parent: got ParentID %q, want %q",
			httpSpan.ParentID, externalParentSpanID)
	}

	// LLM span should be child of HTTP span
	if llmSpan.ParentID != httpSpan.SpanID {
		t.Errorf("LLM span should be child of HTTP span: got ParentID %q, want %q",
			llmSpan.ParentID, httpSpan.SpanID)
	}

	// Plugin span should be child of LLM span
	if pluginSpan.ParentID != llmSpan.SpanID {
		t.Errorf("Plugin span should be child of LLM span: got ParentID %q, want %q",
			pluginSpan.ParentID, llmSpan.SpanID)
	}

	// All spans should have the same trace ID
	if httpSpan.TraceID != inheritedTraceID || llmSpan.TraceID != inheritedTraceID || pluginSpan.TraceID != inheritedTraceID {
		t.Error("All spans should have the inherited trace ID")
	}

	t.Logf("Trace structure (for Datadog):")
	t.Logf("  Trace ID: %s", trace.TraceID)
	t.Logf("  External Parent Span: %s (from Datadog)", externalParentSpanID)
	t.Logf("    -> HTTP Span: %s (ParentID: %s)", httpSpan.SpanID, httpSpan.ParentID)
	t.Logf("      -> LLM Span: %s (ParentID: %s)", llmSpan.SpanID, llmSpan.ParentID)
	t.Logf("        -> Plugin Span: %s (ParentID: %s)", pluginSpan.SpanID, pluginSpan.ParentID)
}