471 lines
15 KiB
Go
471 lines
15 KiB
Go
// Package otel is the OpenTelemetry plugin for Bifrost.
|
|
package otel
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/bytedance/sonic"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
"github.com/maximhq/bifrost/framework/modelcatalog"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
commonpb "go.opentelemetry.io/proto/otlp/common/v1"
|
|
)
|
|
|
|
// logger is the package-level logger used by the OTEL plugin.
// It holds the zero value (nil) until Init assigns the logger it is given.
var logger schemas.Logger
|
|
|
|
const (
	// OTELResponseAttributesEnvKey names the environment variable that carries
	// OTEL resource attributes. When the variable is present, its key=value
	// pairs are applied to all spans at the resource level.
	OTELResponseAttributesEnvKey = "OTEL_RESOURCE_ATTRIBUTES"

	// PluginName is the name this plugin reports from GetName.
	PluginName = "otel"
)
|
|
|
|
// TraceType names the kind of trace the plugin is configured to produce for
// the OTEL collector.
type TraceType string

// TraceType values declared by this package.
const (
	// TraceTypeGenAIExtension is the "genai_extension" trace type.
	TraceTypeGenAIExtension TraceType = "genai_extension"
	// TraceTypeVercel is the "vercel" trace type.
	TraceTypeVercel TraceType = "vercel"
	// TraceTypeOpenInference is the "open_inference" trace type.
	TraceTypeOpenInference TraceType = "open_inference"
)
|
|
|
|
// Protocol names the transport used to reach the OTEL collector.
type Protocol string

// Protocol values declared by this package.
const (
	// ProtocolHTTP is the "http" protocol.
	ProtocolHTTP Protocol = "http"
	// ProtocolGRPC is the "grpc" protocol.
	ProtocolGRPC Protocol = "grpc"
)
|
|
|
|
// Config is the JSON-decodable configuration of the OTEL plugin.
type Config struct {
	// ServiceName is the service name used by the plugin. Init substitutes
	// "bifrost" when it is empty.
	ServiceName string `json:"service_name"`
	// CollectorURL is the collector address handed to the trace client.
	// ValidateConfig rejects an empty value.
	CollectorURL string `json:"collector_url"`
	// Headers are handed to the trace client and the metrics exporter. In Init,
	// a value of the form "env.NAME" is replaced by environment variable NAME.
	Headers map[string]string `json:"headers"`
	// TraceType is the trace type to use. ValidateConfig rejects an empty value.
	TraceType TraceType `json:"trace_type"`
	// Protocol selects the client created by Init (ProtocolHTTP or
	// ProtocolGRPC); Init fails for any other value.
	Protocol Protocol `json:"protocol"`
	// TLSCACert is handed to the client constructors.
	// NOTE(review): whether this is a file path or PEM content is not visible
	// in this file — confirm against the client implementation.
	TLSCACert string `json:"tls_ca_cert"`
	// Insecure skips TLS when true; ignored if TLSCACert is set. When decoded
	// from JSON it defaults to true if the key is omitted.
	Insecure bool `json:"insecure"`

	// Metrics push configuration

	// MetricsEnabled makes Init create a metrics exporter.
	MetricsEnabled bool `json:"metrics_enabled"`
	// MetricsEndpoint is the endpoint handed to the metrics exporter. Init
	// requires it to be non-empty when MetricsEnabled is true.
	MetricsEndpoint string `json:"metrics_endpoint"`
	// MetricsPushInterval is in seconds. Init uses 15 when the value is <= 0
	// and rejects values above 300.
	MetricsPushInterval int `json:"metrics_push_interval"`
}
|
|
|
|
// UnmarshalJSON applies field defaults that the zero-value wouldn't capture.
|
|
// Specifically, Insecure defaults to true when the key is omitted so http://
|
|
// collectors work out-of-the-box without forcing users to set it explicitly.
|
|
func (c *Config) UnmarshalJSON(data []byte) error {
|
|
type alias Config
|
|
aux := struct {
|
|
Insecure *bool `json:"insecure"`
|
|
*alias
|
|
}{
|
|
alias: (*alias)(c),
|
|
}
|
|
if err := sonic.Unmarshal(data, &aux); err != nil {
|
|
return err
|
|
}
|
|
if aux.Insecure == nil {
|
|
c.Insecure = true
|
|
} else {
|
|
c.Insecure = *aux.Insecure
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// OtelPlugin is the plugin for OpenTelemetry.
// It implements the ObservabilityPlugin interface to receive completed traces
// from the tracing middleware and forward them to an OTEL collector.
type OtelPlugin struct {
	// ctx is derived in Init from the context passed to it and is handed to the
	// metrics exporter; cancel cancels it and is invoked by Cleanup.
	ctx    context.Context
	cancel context.CancelFunc

	// Settings copied from Config by Init.
	serviceName string
	url         string
	headers     map[string]string
	traceType   TraceType
	protocol    Protocol

	// bifrostVersion is the version string supplied to Init.
	bifrostVersion string

	// attributesFromEnvironment holds the key/value pairs Init parsed from the
	// OTEL_RESOURCE_ATTRIBUTES environment variable.
	attributesFromEnvironment []*commonpb.KeyValue

	// client emits converted traces to the collector (see Inject) and is closed
	// by Cleanup.
	client OtelClient

	// pricingManager is the model catalog supplied to Init; it may be nil, in
	// which case Init warns that cost calculations are skipped.
	pricingManager *modelcatalog.ModelCatalog

	// Metrics push support
	// metricsExporter is only created by Init when Config.MetricsEnabled is
	// true; it is nil otherwise.
	metricsExporter *MetricsExporter
}
|
|
|
|
// Init function for the OTEL plugin
|
|
func Init(ctx context.Context, config *Config, _logger schemas.Logger, pricingManager *modelcatalog.ModelCatalog, bifrostVersion string) (*OtelPlugin, error) {
|
|
if config == nil {
|
|
return nil, fmt.Errorf("config is required")
|
|
}
|
|
logger = _logger
|
|
if pricingManager == nil {
|
|
logger.Warn("otel plugin requires model catalog to calculate cost, all cost calculations will be skipped.")
|
|
}
|
|
var err error
|
|
// If headers are present, and any of them start with env., we will replace the value with the environment variable
|
|
if config.Headers != nil {
|
|
for key, value := range config.Headers {
|
|
if newValue, ok := strings.CutPrefix(value, "env."); ok {
|
|
config.Headers[key] = os.Getenv(newValue)
|
|
if config.Headers[key] == "" {
|
|
logger.Warn("environment variable %s not found", newValue)
|
|
return nil, fmt.Errorf("environment variable %s not found", newValue)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if config.ServiceName == "" {
|
|
config.ServiceName = "bifrost"
|
|
}
|
|
// Loading attributes from environment
|
|
attributesFromEnvironment := make([]*commonpb.KeyValue, 0)
|
|
if attributes, ok := os.LookupEnv(OTELResponseAttributesEnvKey); ok {
|
|
// We will split the attributes by , and then split each attribute by =
|
|
for attribute := range strings.SplitSeq(attributes, ",") {
|
|
attributeParts := strings.Split(strings.TrimSpace(attribute), "=")
|
|
if len(attributeParts) == 2 {
|
|
attributesFromEnvironment = append(attributesFromEnvironment, kvStr(strings.TrimSpace(attributeParts[0]), strings.TrimSpace(attributeParts[1])))
|
|
}
|
|
}
|
|
}
|
|
// Preparing the plugin
|
|
p := &OtelPlugin{
|
|
serviceName: config.ServiceName,
|
|
url: config.CollectorURL,
|
|
traceType: config.TraceType,
|
|
headers: config.Headers,
|
|
protocol: config.Protocol,
|
|
pricingManager: pricingManager,
|
|
bifrostVersion: bifrostVersion,
|
|
attributesFromEnvironment: attributesFromEnvironment,
|
|
}
|
|
p.ctx, p.cancel = context.WithCancel(ctx)
|
|
if config.Protocol == ProtocolGRPC {
|
|
p.client, err = NewOtelClientGRPC(config.CollectorURL, config.Headers, config.TLSCACert, config.Insecure)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
if config.Protocol == ProtocolHTTP {
|
|
p.client, err = NewOtelClientHTTP(config.CollectorURL, config.Headers, config.TLSCACert, config.Insecure)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
if p.client == nil {
|
|
return nil, fmt.Errorf("otel client is not initialized. invalid protocol type")
|
|
}
|
|
|
|
// Initialize metrics exporter if enabled
|
|
if config.MetricsEnabled {
|
|
if config.MetricsEndpoint == "" {
|
|
return nil, fmt.Errorf("metrics_endpoint is required when metrics_enabled is true")
|
|
}
|
|
pushInterval := config.MetricsPushInterval
|
|
if pushInterval <= 0 {
|
|
pushInterval = 15 // default 15 seconds
|
|
} else if pushInterval > 300 {
|
|
return nil, fmt.Errorf("metrics_push_interval must be between 1 and 300 seconds, got %d", pushInterval)
|
|
}
|
|
metricsConfig := &MetricsConfig{
|
|
ServiceName: config.ServiceName,
|
|
Endpoint: config.MetricsEndpoint,
|
|
Headers: config.Headers,
|
|
Protocol: config.Protocol,
|
|
TLSCACert: config.TLSCACert,
|
|
Insecure: config.Insecure,
|
|
PushInterval: pushInterval,
|
|
}
|
|
p.metricsExporter, err = NewMetricsExporter(p.ctx, metricsConfig)
|
|
if err != nil {
|
|
// Clean up trace client if metrics exporter fails
|
|
if p.client != nil {
|
|
p.client.Close()
|
|
}
|
|
return nil, fmt.Errorf("failed to initialize metrics exporter: %w", err)
|
|
}
|
|
logger.Info("OTEL metrics push enabled, pushing to %s every %d seconds", config.MetricsEndpoint, pushInterval)
|
|
}
|
|
|
|
return p, nil
|
|
}
|
|
|
|
// GetName returns the name of the plugin, which is the PluginName constant.
func (p *OtelPlugin) GetName() string {
	return PluginName
}
|
|
|
|
// HTTPTransportPreHook is not used for this plugin: it ignores its arguments
// and always returns a nil response and a nil error.
func (p *OtelPlugin) HTTPTransportPreHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest) (*schemas.HTTPResponse, error) {
	return nil, nil
}
|
|
|
|
// HTTPTransportPostHook is not used for this plugin: it ignores its arguments
// and always returns nil.
func (p *OtelPlugin) HTTPTransportPostHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, resp *schemas.HTTPResponse) error {
	return nil
}
|
|
|
|
// HTTPTransportStreamChunkHook passes through streaming chunks unchanged: it
// returns the chunk it was given together with a nil error.
func (p *OtelPlugin) HTTPTransportStreamChunkHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, chunk *schemas.BifrostStreamChunk) (*schemas.BifrostStreamChunk, error) {
	return chunk, nil
}
|
|
|
|
// ValidateConfig function for the OTEL plugin
|
|
func (p *OtelPlugin) ValidateConfig(config any) (*Config, error) {
|
|
var otelConfig Config
|
|
// Checking if its a string, then we will JSON parse and confirm
|
|
if configStr, ok := config.(string); ok {
|
|
if err := sonic.Unmarshal([]byte(configStr), &otelConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
// Checking if its a map[string]any, then we will JSON parse and confirm
|
|
if configMap, ok := config.(map[string]any); ok {
|
|
configString, err := sonic.Marshal(configMap)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := sonic.Unmarshal([]byte(configString), &otelConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
// Checking if its a Config, then we will confirm
|
|
if config, ok := config.(*Config); ok {
|
|
otelConfig = *config
|
|
}
|
|
// Validating fields
|
|
if otelConfig.CollectorURL == "" {
|
|
return nil, fmt.Errorf("collector url is required")
|
|
}
|
|
if otelConfig.TraceType == "" {
|
|
return nil, fmt.Errorf("trace type is required")
|
|
}
|
|
if otelConfig.Protocol == "" {
|
|
return nil, fmt.Errorf("protocol is required")
|
|
}
|
|
return &otelConfig, nil
|
|
}
|
|
|
|
// PreLLMHook is a no-op - tracing is handled via the Inject method.
// The OTEL plugin receives completed traces from TracingMiddleware.
// It returns req unchanged, with no short-circuit and a nil error.
func (p *OtelPlugin) PreLLMHook(_ *schemas.BifrostContext, req *schemas.BifrostRequest) (*schemas.BifrostRequest, *schemas.LLMPluginShortCircuit, error) {
	return req, nil, nil
}
|
|
|
|
// PostLLMHook is a no-op - tracing is handled via the Inject method.
// The OTEL plugin receives completed traces from TracingMiddleware.
// It returns resp and bifrostErr unchanged, with a nil error.
func (p *OtelPlugin) PostLLMHook(_ *schemas.BifrostContext, resp *schemas.BifrostResponse, bifrostErr *schemas.BifrostError) (*schemas.BifrostResponse, *schemas.BifrostError, error) {
	return resp, bifrostErr, nil
}
|
|
|
|
// Inject receives a completed trace and sends it to the OTEL collector.
|
|
// Implements schemas.ObservabilityPlugin interface.
|
|
// This method is called asynchronously by TracingMiddleware after the response
|
|
// has been written to the client.
|
|
func (p *OtelPlugin) Inject(ctx context.Context, trace *schemas.Trace) error {
|
|
if trace == nil {
|
|
return nil
|
|
}
|
|
|
|
// Emit trace to collector if client is initialized
|
|
if p.client != nil {
|
|
// Convert schemas.Trace to OTEL ResourceSpan
|
|
resourceSpan := p.convertTraceToResourceSpan(trace)
|
|
|
|
// Emit to collector
|
|
if err := p.client.Emit(ctx, []*ResourceSpan{resourceSpan}); err != nil {
|
|
logger.Error("failed to emit trace %s: %v", trace.TraceID, err)
|
|
}
|
|
}
|
|
|
|
// Record metrics if metrics exporter is enabled
|
|
if p.metricsExporter != nil {
|
|
p.recordMetricsFromTrace(ctx, trace)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Helper functions for type-safe attribute extraction from trace spans
|
|
// getStringAttr returns attrs[key] when it holds a string. It returns "" when
// attrs is nil, the key is missing, or the value has any other type.
func getStringAttr(attrs map[string]any, key string) string {
	// Indexing a nil map yields the zero value, and a failed comma-ok type
	// assertion yields "", so no separate guards are needed.
	s, _ := attrs[key].(string)
	return s
}
|
|
|
|
// getIntAttr returns attrs[key] converted to int.
//
// Every built-in integer kind and both float kinds are accepted, so a value
// stored as, for example, int32 or uint64 is no longer silently reported as 0.
// Floats are truncated toward zero. It returns 0 when attrs is nil, the key is
// missing, or the value is not numeric.
func getIntAttr(attrs map[string]any, key string) int {
	// Indexing a nil map is safe and yields nil, which matches no case below.
	switch v := attrs[key].(type) {
	case int:
		return v
	case int8:
		return int(v)
	case int16:
		return int(v)
	case int32:
		return int(v)
	case int64:
		return int(v)
	case uint:
		return int(v)
	case uint8:
		return int(v)
	case uint16:
		return int(v)
	case uint32:
		return int(v)
	case uint64:
		return int(v)
	case float32:
		return int(v)
	case float64:
		return int(v)
	}
	return 0
}
|
|
|
|
// getFloat64Attr returns attrs[key] converted to float64.
//
// Both float kinds and every built-in integer kind are accepted, so a value
// stored as, for example, float32 or int32 is no longer silently reported as
// 0. It returns 0 when attrs is nil, the key is missing, or the value is not
// numeric.
func getFloat64Attr(attrs map[string]any, key string) float64 {
	// Indexing a nil map is safe and yields nil, which matches no case below.
	switch v := attrs[key].(type) {
	case float64:
		return v
	case float32:
		return float64(v)
	case int:
		return float64(v)
	case int8:
		return float64(v)
	case int16:
		return float64(v)
	case int32:
		return float64(v)
	case int64:
		return float64(v)
	case uint:
		return float64(v)
	case uint8:
		return float64(v)
	case uint16:
		return float64(v)
	case uint32:
		return float64(v)
	case uint64:
		return float64(v)
	}
	return 0
}
|
|
|
|
// buildSpanAttrs extracts metric dimension attrs from a single attempt span.
|
|
func buildSpanAttrs(span *schemas.Span) []attribute.KeyValue {
|
|
attrs := span.Attributes
|
|
method := getStringAttr(attrs, "request.type")
|
|
if method == "" {
|
|
method = span.Name
|
|
}
|
|
return BuildBifrostAttributes(
|
|
getStringAttr(attrs, schemas.AttrProviderName),
|
|
getStringAttr(attrs, schemas.AttrRequestModel),
|
|
method,
|
|
getStringAttr(attrs, schemas.AttrVirtualKeyID),
|
|
getStringAttr(attrs, schemas.AttrVirtualKeyName),
|
|
getStringAttr(attrs, schemas.AttrSelectedKeyID),
|
|
getStringAttr(attrs, schemas.AttrSelectedKeyName),
|
|
getIntAttr(attrs, schemas.AttrNumberOfRetries),
|
|
getIntAttr(attrs, schemas.AttrFallbackIndex),
|
|
getStringAttr(attrs, schemas.AttrTeamID),
|
|
getStringAttr(attrs, schemas.AttrTeamName),
|
|
getStringAttr(attrs, schemas.AttrCustomerID),
|
|
getStringAttr(attrs, schemas.AttrCustomerName),
|
|
)
|
|
}
|
|
|
|
// recordMetricsFromTrace extracts metrics data from a completed trace and records them
|
|
// via the OTEL metrics exporter. This is called from Inject after trace emission.
|
|
//
|
|
// Per-attempt metrics (upstream_requests, errors, success, latency) are recorded once
|
|
// per llm.call/retry span so fallback attempts and failed retries are counted with
|
|
// their own provider/model/fallback_index labels. Per-trace metrics (tokens, cost,
|
|
// TTFT) are recorded once, keyed off the final (latest) attempt span.
|
|
func (p *OtelPlugin) recordMetricsFromTrace(ctx context.Context, trace *schemas.Trace) {
|
|
if trace == nil || p.metricsExporter == nil {
|
|
return
|
|
}
|
|
|
|
var finalSpan *schemas.Span
|
|
for _, span := range trace.Spans {
|
|
if span.Kind != schemas.SpanKindLLMCall && span.Kind != schemas.SpanKindRetry {
|
|
continue
|
|
}
|
|
|
|
spanAttrs := buildSpanAttrs(span)
|
|
|
|
p.metricsExporter.RecordUpstreamRequest(ctx, spanAttrs...)
|
|
|
|
if !span.StartTime.IsZero() && !span.EndTime.IsZero() {
|
|
latencySeconds := span.EndTime.Sub(span.StartTime).Seconds()
|
|
p.metricsExporter.RecordUpstreamLatency(ctx, latencySeconds, spanAttrs...)
|
|
}
|
|
|
|
if span.Status == schemas.SpanStatusError {
|
|
p.metricsExporter.RecordErrorRequest(ctx, spanAttrs...)
|
|
} else {
|
|
p.metricsExporter.RecordSuccessRequest(ctx, spanAttrs...)
|
|
}
|
|
|
|
if finalSpan == nil || span.EndTime.After(finalSpan.EndTime) {
|
|
finalSpan = span
|
|
}
|
|
}
|
|
|
|
if finalSpan == nil {
|
|
finalSpan = trace.RootSpan
|
|
}
|
|
if finalSpan == nil {
|
|
return
|
|
}
|
|
|
|
attrs := finalSpan.Attributes
|
|
otelAttrs := buildSpanAttrs(finalSpan)
|
|
|
|
// Record token usage - try both naming conventions
|
|
inputTokens := getIntAttr(attrs, schemas.AttrPromptTokens)
|
|
if inputTokens == 0 {
|
|
inputTokens = getIntAttr(attrs, schemas.AttrInputTokens)
|
|
}
|
|
if inputTokens > 0 {
|
|
p.metricsExporter.RecordInputTokens(ctx, int64(inputTokens), otelAttrs...)
|
|
}
|
|
|
|
outputTokens := getIntAttr(attrs, schemas.AttrCompletionTokens)
|
|
if outputTokens == 0 {
|
|
outputTokens = getIntAttr(attrs, schemas.AttrOutputTokens)
|
|
}
|
|
if outputTokens > 0 {
|
|
p.metricsExporter.RecordOutputTokens(ctx, int64(outputTokens), otelAttrs...)
|
|
}
|
|
|
|
// Record cost if available
|
|
cost := getFloat64Attr(attrs, schemas.AttrUsageCost)
|
|
if cost > 0 {
|
|
p.metricsExporter.RecordCost(ctx, cost, otelAttrs...)
|
|
}
|
|
|
|
// Record streaming latency metrics if available
|
|
ttft := getFloat64Attr(attrs, schemas.AttrTimeToFirstToken)
|
|
if ttft > 0 {
|
|
// Convert from nanoseconds to seconds if needed (check the unit)
|
|
p.metricsExporter.RecordStreamFirstTokenLatency(ctx, ttft/1e9, otelAttrs...)
|
|
}
|
|
}
|
|
|
|
// Cleanup function for the OTEL plugin
|
|
func (p *OtelPlugin) Cleanup() error {
|
|
if p.cancel != nil {
|
|
p.cancel()
|
|
}
|
|
// Shutdown metrics exporter first
|
|
if p.metricsExporter != nil {
|
|
if err := p.metricsExporter.Shutdown(context.Background()); err != nil {
|
|
logger.Error("failed to shutdown metrics exporter: %v", err)
|
|
}
|
|
}
|
|
if p.client != nil {
|
|
return p.client.Close()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetMetricsExporter returns the metrics exporter for external use (e.g., by telemetry plugin).
// The result is nil when Init did not create an exporter, which is the case
// whenever Config.MetricsEnabled is false.
func (p *OtelPlugin) GetMetricsExporter() *MetricsExporter {
	return p.metricsExporter
}
|
|
|
|
// Compile-time check that *OtelPlugin implements schemas.ObservabilityPlugin:
// the assignment fails to compile if a required method is missing or has a
// different signature.
var _ schemas.ObservabilityPlugin = (*OtelPlugin)(nil)
|