first commit
This commit is contained in:
470
plugins/otel/main.go
Normal file
470
plugins/otel/main.go
Normal file
@@ -0,0 +1,470 @@
|
||||
// Package otel implements the OpenTelemetry plugin for Bifrost.
|
||||
package otel
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/bytedance/sonic"
|
||||
"github.com/maximhq/bifrost/core/schemas"
|
||||
"github.com/maximhq/bifrost/framework/modelcatalog"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
commonpb "go.opentelemetry.io/proto/otlp/common/v1"
|
||||
)
|
||||
|
||||
// logger is the logger for the OTEL plugin
|
||||
var logger schemas.Logger
|
||||
|
||||
const (
	// OTELResponseAttributesEnvKey is the name of the environment variable that
	// carries the OTEL resource attributes. When the variable is present, Init
	// parses its comma-separated key=value pairs and keeps them on the plugin
	// so they can be applied to all spans at the resource level.
	OTELResponseAttributesEnvKey = "OTEL_RESOURCE_ATTRIBUTES"

	// PluginName is the name reported by OtelPlugin.GetName.
	PluginName = "otel"
)
|
||||
|
||||
// TraceType names the kind of trace the plugin produces for the OTEL
// collector. ValidateConfig rejects a configuration whose trace type is empty.
type TraceType string

// Trace types that can be set in Config.TraceType.
const (
	// TraceTypeGenAIExtension is the "genai_extension" trace type.
	TraceTypeGenAIExtension TraceType = "genai_extension"

	// TraceTypeVercel is the "vercel" trace type.
	TraceTypeVercel TraceType = "vercel"

	// TraceTypeOpenInference is the "open_inference" trace type.
	TraceTypeOpenInference TraceType = "open_inference"
)
|
||||
|
||||
// Protocol names the transport used to reach the OTEL collector.
// Init only accepts ProtocolHTTP and ProtocolGRPC; any other value makes it
// fail with an "invalid protocol type" error.
type Protocol string

// Transports that can be set in Config.Protocol.
const (
	// ProtocolHTTP makes Init create the client with NewOtelClientHTTP.
	ProtocolHTTP Protocol = "http"

	// ProtocolGRPC makes Init create the client with NewOtelClientGRPC.
	ProtocolGRPC Protocol = "grpc"
)
|
||||
|
||||
type Config struct {
|
||||
ServiceName string `json:"service_name"`
|
||||
CollectorURL string `json:"collector_url"`
|
||||
Headers map[string]string `json:"headers"`
|
||||
TraceType TraceType `json:"trace_type"`
|
||||
Protocol Protocol `json:"protocol"`
|
||||
TLSCACert string `json:"tls_ca_cert"`
|
||||
Insecure bool `json:"insecure"` // Skip TLS when true; ignored if TLSCACert is set. Defaults to true when omitted.
|
||||
|
||||
// Metrics push configuration
|
||||
MetricsEnabled bool `json:"metrics_enabled"`
|
||||
MetricsEndpoint string `json:"metrics_endpoint"`
|
||||
MetricsPushInterval int `json:"metrics_push_interval"` // in seconds, default 15
|
||||
}
|
||||
|
||||
// UnmarshalJSON applies field defaults that the zero-value wouldn't capture.
|
||||
// Specifically, Insecure defaults to true when the key is omitted so http://
|
||||
// collectors work out-of-the-box without forcing users to set it explicitly.
|
||||
func (c *Config) UnmarshalJSON(data []byte) error {
|
||||
type alias Config
|
||||
aux := struct {
|
||||
Insecure *bool `json:"insecure"`
|
||||
*alias
|
||||
}{
|
||||
alias: (*alias)(c),
|
||||
}
|
||||
if err := sonic.Unmarshal(data, &aux); err != nil {
|
||||
return err
|
||||
}
|
||||
if aux.Insecure == nil {
|
||||
c.Insecure = true
|
||||
} else {
|
||||
c.Insecure = *aux.Insecure
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// OtelPlugin is the plugin for OpenTelemetry.
|
||||
// It implements the ObservabilityPlugin interface to receive completed traces
|
||||
// from the tracing middleware and forward them to an OTEL collector.
|
||||
type OtelPlugin struct {
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
|
||||
serviceName string
|
||||
url string
|
||||
headers map[string]string
|
||||
traceType TraceType
|
||||
protocol Protocol
|
||||
|
||||
bifrostVersion string
|
||||
|
||||
attributesFromEnvironment []*commonpb.KeyValue
|
||||
|
||||
client OtelClient
|
||||
|
||||
pricingManager *modelcatalog.ModelCatalog
|
||||
|
||||
// Metrics push support
|
||||
metricsExporter *MetricsExporter
|
||||
}
|
||||
|
||||
// Init function for the OTEL plugin
|
||||
func Init(ctx context.Context, config *Config, _logger schemas.Logger, pricingManager *modelcatalog.ModelCatalog, bifrostVersion string) (*OtelPlugin, error) {
|
||||
if config == nil {
|
||||
return nil, fmt.Errorf("config is required")
|
||||
}
|
||||
logger = _logger
|
||||
if pricingManager == nil {
|
||||
logger.Warn("otel plugin requires model catalog to calculate cost, all cost calculations will be skipped.")
|
||||
}
|
||||
var err error
|
||||
// If headers are present, and any of them start with env., we will replace the value with the environment variable
|
||||
if config.Headers != nil {
|
||||
for key, value := range config.Headers {
|
||||
if newValue, ok := strings.CutPrefix(value, "env."); ok {
|
||||
config.Headers[key] = os.Getenv(newValue)
|
||||
if config.Headers[key] == "" {
|
||||
logger.Warn("environment variable %s not found", newValue)
|
||||
return nil, fmt.Errorf("environment variable %s not found", newValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if config.ServiceName == "" {
|
||||
config.ServiceName = "bifrost"
|
||||
}
|
||||
// Loading attributes from environment
|
||||
attributesFromEnvironment := make([]*commonpb.KeyValue, 0)
|
||||
if attributes, ok := os.LookupEnv(OTELResponseAttributesEnvKey); ok {
|
||||
// We will split the attributes by , and then split each attribute by =
|
||||
for attribute := range strings.SplitSeq(attributes, ",") {
|
||||
attributeParts := strings.Split(strings.TrimSpace(attribute), "=")
|
||||
if len(attributeParts) == 2 {
|
||||
attributesFromEnvironment = append(attributesFromEnvironment, kvStr(strings.TrimSpace(attributeParts[0]), strings.TrimSpace(attributeParts[1])))
|
||||
}
|
||||
}
|
||||
}
|
||||
// Preparing the plugin
|
||||
p := &OtelPlugin{
|
||||
serviceName: config.ServiceName,
|
||||
url: config.CollectorURL,
|
||||
traceType: config.TraceType,
|
||||
headers: config.Headers,
|
||||
protocol: config.Protocol,
|
||||
pricingManager: pricingManager,
|
||||
bifrostVersion: bifrostVersion,
|
||||
attributesFromEnvironment: attributesFromEnvironment,
|
||||
}
|
||||
p.ctx, p.cancel = context.WithCancel(ctx)
|
||||
if config.Protocol == ProtocolGRPC {
|
||||
p.client, err = NewOtelClientGRPC(config.CollectorURL, config.Headers, config.TLSCACert, config.Insecure)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if config.Protocol == ProtocolHTTP {
|
||||
p.client, err = NewOtelClientHTTP(config.CollectorURL, config.Headers, config.TLSCACert, config.Insecure)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if p.client == nil {
|
||||
return nil, fmt.Errorf("otel client is not initialized. invalid protocol type")
|
||||
}
|
||||
|
||||
// Initialize metrics exporter if enabled
|
||||
if config.MetricsEnabled {
|
||||
if config.MetricsEndpoint == "" {
|
||||
return nil, fmt.Errorf("metrics_endpoint is required when metrics_enabled is true")
|
||||
}
|
||||
pushInterval := config.MetricsPushInterval
|
||||
if pushInterval <= 0 {
|
||||
pushInterval = 15 // default 15 seconds
|
||||
} else if pushInterval > 300 {
|
||||
return nil, fmt.Errorf("metrics_push_interval must be between 1 and 300 seconds, got %d", pushInterval)
|
||||
}
|
||||
metricsConfig := &MetricsConfig{
|
||||
ServiceName: config.ServiceName,
|
||||
Endpoint: config.MetricsEndpoint,
|
||||
Headers: config.Headers,
|
||||
Protocol: config.Protocol,
|
||||
TLSCACert: config.TLSCACert,
|
||||
Insecure: config.Insecure,
|
||||
PushInterval: pushInterval,
|
||||
}
|
||||
p.metricsExporter, err = NewMetricsExporter(p.ctx, metricsConfig)
|
||||
if err != nil {
|
||||
// Clean up trace client if metrics exporter fails
|
||||
if p.client != nil {
|
||||
p.client.Close()
|
||||
}
|
||||
return nil, fmt.Errorf("failed to initialize metrics exporter: %w", err)
|
||||
}
|
||||
logger.Info("OTEL metrics push enabled, pushing to %s every %d seconds", config.MetricsEndpoint, pushInterval)
|
||||
}
|
||||
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// GetName function for the OTEL plugin
|
||||
func (p *OtelPlugin) GetName() string {
|
||||
return PluginName
|
||||
}
|
||||
|
||||
// HTTPTransportPreHook is not used for this plugin
|
||||
func (p *OtelPlugin) HTTPTransportPreHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest) (*schemas.HTTPResponse, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// HTTPTransportPostHook is not used for this plugin
|
||||
func (p *OtelPlugin) HTTPTransportPostHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, resp *schemas.HTTPResponse) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// HTTPTransportStreamChunkHook passes through streaming chunks unchanged
|
||||
func (p *OtelPlugin) HTTPTransportStreamChunkHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, chunk *schemas.BifrostStreamChunk) (*schemas.BifrostStreamChunk, error) {
|
||||
return chunk, nil
|
||||
}
|
||||
|
||||
// ValidateConfig function for the OTEL plugin
|
||||
func (p *OtelPlugin) ValidateConfig(config any) (*Config, error) {
|
||||
var otelConfig Config
|
||||
// Checking if its a string, then we will JSON parse and confirm
|
||||
if configStr, ok := config.(string); ok {
|
||||
if err := sonic.Unmarshal([]byte(configStr), &otelConfig); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// Checking if its a map[string]any, then we will JSON parse and confirm
|
||||
if configMap, ok := config.(map[string]any); ok {
|
||||
configString, err := sonic.Marshal(configMap)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := sonic.Unmarshal([]byte(configString), &otelConfig); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// Checking if its a Config, then we will confirm
|
||||
if config, ok := config.(*Config); ok {
|
||||
otelConfig = *config
|
||||
}
|
||||
// Validating fields
|
||||
if otelConfig.CollectorURL == "" {
|
||||
return nil, fmt.Errorf("collector url is required")
|
||||
}
|
||||
if otelConfig.TraceType == "" {
|
||||
return nil, fmt.Errorf("trace type is required")
|
||||
}
|
||||
if otelConfig.Protocol == "" {
|
||||
return nil, fmt.Errorf("protocol is required")
|
||||
}
|
||||
return &otelConfig, nil
|
||||
}
|
||||
|
||||
// PreLLMHook is a no-op - tracing is handled via the Inject method.
|
||||
// The OTEL plugin receives completed traces from TracingMiddleware.
|
||||
func (p *OtelPlugin) PreLLMHook(_ *schemas.BifrostContext, req *schemas.BifrostRequest) (*schemas.BifrostRequest, *schemas.LLMPluginShortCircuit, error) {
|
||||
return req, nil, nil
|
||||
}
|
||||
|
||||
// PostLLMHook is a no-op - tracing is handled via the Inject method.
|
||||
// The OTEL plugin receives completed traces from TracingMiddleware.
|
||||
func (p *OtelPlugin) PostLLMHook(_ *schemas.BifrostContext, resp *schemas.BifrostResponse, bifrostErr *schemas.BifrostError) (*schemas.BifrostResponse, *schemas.BifrostError, error) {
|
||||
return resp, bifrostErr, nil
|
||||
}
|
||||
|
||||
// Inject receives a completed trace and sends it to the OTEL collector.
|
||||
// Implements schemas.ObservabilityPlugin interface.
|
||||
// This method is called asynchronously by TracingMiddleware after the response
|
||||
// has been written to the client.
|
||||
func (p *OtelPlugin) Inject(ctx context.Context, trace *schemas.Trace) error {
|
||||
if trace == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Emit trace to collector if client is initialized
|
||||
if p.client != nil {
|
||||
// Convert schemas.Trace to OTEL ResourceSpan
|
||||
resourceSpan := p.convertTraceToResourceSpan(trace)
|
||||
|
||||
// Emit to collector
|
||||
if err := p.client.Emit(ctx, []*ResourceSpan{resourceSpan}); err != nil {
|
||||
logger.Error("failed to emit trace %s: %v", trace.TraceID, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Record metrics if metrics exporter is enabled
|
||||
if p.metricsExporter != nil {
|
||||
p.recordMetricsFromTrace(ctx, trace)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Helper functions for type-safe attribute extraction from trace spans.

// getStringAttr returns attrs[key] when it holds a string. A nil map, a
// missing key, or a value of any other type yields "".
func getStringAttr(attrs map[string]any, key string) string {
	// Reading from a nil map is safe, and a failed assertion leaves text at
	// its zero value, which is exactly the fallback we want.
	text, _ := attrs[key].(string)
	return text
}
|
||||
|
||||
// getIntAttr returns attrs[key] as an int when the stored value is an int,
// an int64 or a float64 (converted with int(), which drops the fractional
// part). A nil map, a missing key, or any other value type yields 0.
func getIntAttr(attrs map[string]any, key string) int {
	raw := attrs[key] // safe on a nil map: yields a nil interface

	if n, ok := raw.(int); ok {
		return n
	}
	if n, ok := raw.(int64); ok {
		return int(n)
	}
	if f, ok := raw.(float64); ok {
		return int(f)
	}
	return 0
}
|
||||
|
||||
// getFloat64Attr returns attrs[key] as a float64 when the stored value is a
// float64, an int or an int64. A nil map, a missing key, or any other value
// type yields 0.
func getFloat64Attr(attrs map[string]any, key string) float64 {
	raw := attrs[key] // safe on a nil map: yields a nil interface

	if f, ok := raw.(float64); ok {
		return f
	}
	if n, ok := raw.(int); ok {
		return float64(n)
	}
	if n, ok := raw.(int64); ok {
		return float64(n)
	}
	return 0
}
|
||||
|
||||
// buildSpanAttrs extracts metric dimension attrs from a single attempt span.
|
||||
func buildSpanAttrs(span *schemas.Span) []attribute.KeyValue {
|
||||
attrs := span.Attributes
|
||||
method := getStringAttr(attrs, "request.type")
|
||||
if method == "" {
|
||||
method = span.Name
|
||||
}
|
||||
return BuildBifrostAttributes(
|
||||
getStringAttr(attrs, schemas.AttrProviderName),
|
||||
getStringAttr(attrs, schemas.AttrRequestModel),
|
||||
method,
|
||||
getStringAttr(attrs, schemas.AttrVirtualKeyID),
|
||||
getStringAttr(attrs, schemas.AttrVirtualKeyName),
|
||||
getStringAttr(attrs, schemas.AttrSelectedKeyID),
|
||||
getStringAttr(attrs, schemas.AttrSelectedKeyName),
|
||||
getIntAttr(attrs, schemas.AttrNumberOfRetries),
|
||||
getIntAttr(attrs, schemas.AttrFallbackIndex),
|
||||
getStringAttr(attrs, schemas.AttrTeamID),
|
||||
getStringAttr(attrs, schemas.AttrTeamName),
|
||||
getStringAttr(attrs, schemas.AttrCustomerID),
|
||||
getStringAttr(attrs, schemas.AttrCustomerName),
|
||||
)
|
||||
}
|
||||
|
||||
// recordMetricsFromTrace extracts metrics data from a completed trace and records them
|
||||
// via the OTEL metrics exporter. This is called from Inject after trace emission.
|
||||
//
|
||||
// Per-attempt metrics (upstream_requests, errors, success, latency) are recorded once
|
||||
// per llm.call/retry span so fallback attempts and failed retries are counted with
|
||||
// their own provider/model/fallback_index labels. Per-trace metrics (tokens, cost,
|
||||
// TTFT) are recorded once, keyed off the final (latest) attempt span.
|
||||
func (p *OtelPlugin) recordMetricsFromTrace(ctx context.Context, trace *schemas.Trace) {
|
||||
if trace == nil || p.metricsExporter == nil {
|
||||
return
|
||||
}
|
||||
|
||||
var finalSpan *schemas.Span
|
||||
for _, span := range trace.Spans {
|
||||
if span.Kind != schemas.SpanKindLLMCall && span.Kind != schemas.SpanKindRetry {
|
||||
continue
|
||||
}
|
||||
|
||||
spanAttrs := buildSpanAttrs(span)
|
||||
|
||||
p.metricsExporter.RecordUpstreamRequest(ctx, spanAttrs...)
|
||||
|
||||
if !span.StartTime.IsZero() && !span.EndTime.IsZero() {
|
||||
latencySeconds := span.EndTime.Sub(span.StartTime).Seconds()
|
||||
p.metricsExporter.RecordUpstreamLatency(ctx, latencySeconds, spanAttrs...)
|
||||
}
|
||||
|
||||
if span.Status == schemas.SpanStatusError {
|
||||
p.metricsExporter.RecordErrorRequest(ctx, spanAttrs...)
|
||||
} else {
|
||||
p.metricsExporter.RecordSuccessRequest(ctx, spanAttrs...)
|
||||
}
|
||||
|
||||
if finalSpan == nil || span.EndTime.After(finalSpan.EndTime) {
|
||||
finalSpan = span
|
||||
}
|
||||
}
|
||||
|
||||
if finalSpan == nil {
|
||||
finalSpan = trace.RootSpan
|
||||
}
|
||||
if finalSpan == nil {
|
||||
return
|
||||
}
|
||||
|
||||
attrs := finalSpan.Attributes
|
||||
otelAttrs := buildSpanAttrs(finalSpan)
|
||||
|
||||
// Record token usage - try both naming conventions
|
||||
inputTokens := getIntAttr(attrs, schemas.AttrPromptTokens)
|
||||
if inputTokens == 0 {
|
||||
inputTokens = getIntAttr(attrs, schemas.AttrInputTokens)
|
||||
}
|
||||
if inputTokens > 0 {
|
||||
p.metricsExporter.RecordInputTokens(ctx, int64(inputTokens), otelAttrs...)
|
||||
}
|
||||
|
||||
outputTokens := getIntAttr(attrs, schemas.AttrCompletionTokens)
|
||||
if outputTokens == 0 {
|
||||
outputTokens = getIntAttr(attrs, schemas.AttrOutputTokens)
|
||||
}
|
||||
if outputTokens > 0 {
|
||||
p.metricsExporter.RecordOutputTokens(ctx, int64(outputTokens), otelAttrs...)
|
||||
}
|
||||
|
||||
// Record cost if available
|
||||
cost := getFloat64Attr(attrs, schemas.AttrUsageCost)
|
||||
if cost > 0 {
|
||||
p.metricsExporter.RecordCost(ctx, cost, otelAttrs...)
|
||||
}
|
||||
|
||||
// Record streaming latency metrics if available
|
||||
ttft := getFloat64Attr(attrs, schemas.AttrTimeToFirstToken)
|
||||
if ttft > 0 {
|
||||
// Convert from nanoseconds to seconds if needed (check the unit)
|
||||
p.metricsExporter.RecordStreamFirstTokenLatency(ctx, ttft/1e9, otelAttrs...)
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup function for the OTEL plugin
|
||||
func (p *OtelPlugin) Cleanup() error {
|
||||
if p.cancel != nil {
|
||||
p.cancel()
|
||||
}
|
||||
// Shutdown metrics exporter first
|
||||
if p.metricsExporter != nil {
|
||||
if err := p.metricsExporter.Shutdown(context.Background()); err != nil {
|
||||
logger.Error("failed to shutdown metrics exporter: %v", err)
|
||||
}
|
||||
}
|
||||
if p.client != nil {
|
||||
return p.client.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetMetricsExporter returns the metrics exporter for external use (e.g., by telemetry plugin)
|
||||
func (p *OtelPlugin) GetMetricsExporter() *MetricsExporter {
|
||||
return p.metricsExporter
|
||||
}
|
||||
|
||||
// Compile-time check that OtelPlugin implements ObservabilityPlugin
|
||||
var _ schemas.ObservabilityPlugin = (*OtelPlugin)(nil)
|
||||
Reference in New Issue
Block a user