779 lines
21 KiB
Go
779 lines
21 KiB
Go
//go:build dev
|
|
|
|
package handlers
|
|
|
|
import (
|
|
"bytes"
|
|
"os"
|
|
"regexp"
|
|
"runtime"
|
|
"runtime/pprof"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/fasthttp/router"
|
|
"github.com/google/pprof/profile"
|
|
"github.com/maximhq/bifrost/core/schemas"
|
|
"github.com/maximhq/bifrost/transports/bifrost-http/lib"
|
|
"github.com/valyala/fasthttp"
|
|
)
|
|
|
|
const (
|
|
// Collection interval for metrics
|
|
metricsCollectionInterval = 10 * time.Second
|
|
// Number of data points to keep (5 minutes / 10 seconds = 30 points)
|
|
historySize = 30
|
|
// Top allocations to return per table (cumulative and in-use)
|
|
topAllocationsCount = 50
|
|
)
|
|
|
|
// MemoryStats represents memory statistics at a point in time
|
|
type MemoryStats struct {
|
|
Alloc uint64 `json:"alloc"`
|
|
TotalAlloc uint64 `json:"total_alloc"`
|
|
HeapInuse uint64 `json:"heap_inuse"`
|
|
HeapObjects uint64 `json:"heap_objects"`
|
|
Sys uint64 `json:"sys"`
|
|
}
|
|
|
|
// CPUStats represents CPU statistics
|
|
type CPUStats struct {
|
|
UsagePercent float64 `json:"usage_percent"`
|
|
UserTime float64 `json:"user_time"`
|
|
SystemTime float64 `json:"system_time"`
|
|
}
|
|
|
|
// RuntimeStats represents runtime statistics
|
|
type RuntimeStats struct {
|
|
NumGoroutine int `json:"num_goroutine"`
|
|
NumGC uint32 `json:"num_gc"`
|
|
GCPauseNs uint64 `json:"gc_pause_ns"`
|
|
NumCPU int `json:"num_cpu"`
|
|
GOMAXPROCS int `json:"gomaxprocs"`
|
|
}
|
|
|
|
// AllocationInfo represents a single allocation site
|
|
type AllocationInfo struct {
|
|
Function string `json:"function"`
|
|
File string `json:"file"`
|
|
Line int `json:"line"`
|
|
Bytes int64 `json:"bytes"`
|
|
Count int64 `json:"count"`
|
|
Stack []string `json:"stack"`
|
|
}
|
|
|
|
// GoroutineGroup represents a group of goroutines with the same stack trace
|
|
type GoroutineGroup struct {
|
|
Count int `json:"count"`
|
|
State string `json:"state"`
|
|
WaitReason string `json:"wait_reason,omitempty"`
|
|
WaitMinutes int `json:"wait_minutes,omitempty"` // Parsed wait time in minutes
|
|
TopFunc string `json:"top_func"`
|
|
Stack []string `json:"stack"`
|
|
Category string `json:"category"` // "background", "per-request", "unknown"
|
|
}
|
|
|
|
// GoroutineProfile represents the goroutine profile response
|
|
type GoroutineProfile struct {
|
|
Timestamp string `json:"timestamp"`
|
|
TotalGoroutines int `json:"total_goroutines"`
|
|
Groups []GoroutineGroup `json:"groups"`
|
|
Summary GoroutineSummary `json:"summary"`
|
|
RawProfile string `json:"raw_profile,omitempty"`
|
|
}
|
|
|
|
// GoroutineSummary provides a quick overview of goroutine health
|
|
type GoroutineSummary struct {
|
|
Background int `json:"background"` // Expected long-running goroutines
|
|
PerRequest int `json:"per_request"` // Goroutines that should complete with requests
|
|
LongWaiting int `json:"long_waiting"` // Goroutines waiting > 1 minute (potential leaks)
|
|
PotentiallyStuck int `json:"potentially_stuck"` // Per-request goroutines waiting > 1 minute
|
|
}
|
|
|
|
// HistoryPoint represents a single point in the metrics history
|
|
type HistoryPoint struct {
|
|
Timestamp string `json:"timestamp"`
|
|
Alloc uint64 `json:"alloc"`
|
|
HeapInuse uint64 `json:"heap_inuse"`
|
|
Goroutines int `json:"goroutines"`
|
|
GCPauseNs uint64 `json:"gc_pause_ns"`
|
|
CPUPercent float64 `json:"cpu_percent"`
|
|
}
|
|
|
|
// PprofData represents the complete pprof response
|
|
type PprofData struct {
|
|
Timestamp string `json:"timestamp"`
|
|
Memory MemoryStats `json:"memory"`
|
|
CPU CPUStats `json:"cpu"`
|
|
Runtime RuntimeStats `json:"runtime"`
|
|
TopAllocations []AllocationInfo `json:"top_allocations"`
|
|
InuseAllocations []AllocationInfo `json:"inuse_allocations"`
|
|
History []HistoryPoint `json:"history"`
|
|
}
|
|
|
|
// cpuSample holds a CPU time sample for calculating usage
|
|
type cpuSample struct {
|
|
timestamp time.Time
|
|
userTime time.Duration
|
|
systemTime time.Duration
|
|
}
|
|
|
|
// MetricsCollector collects and stores runtime metrics
|
|
type MetricsCollector struct {
|
|
mu sync.RWMutex
|
|
history []HistoryPoint
|
|
stopCh chan struct{}
|
|
started bool
|
|
lastCPUSample cpuSample
|
|
currentCPU CPUStats
|
|
}
|
|
|
|
// DevPprofHandler handles development profiling endpoints
|
|
type DevPprofHandler struct {
|
|
collector *MetricsCollector
|
|
}
|
|
|
|
// Global collector instance
|
|
var globalCollector *MetricsCollector
|
|
var collectorOnce sync.Once
|
|
|
|
// IsDevMode checks if dev mode is enabled via environment variable
|
|
func IsDevMode() bool {
|
|
return os.Getenv("BIFROST_UI_DEV") == "true"
|
|
}
|
|
|
|
// getOrCreateCollector returns the global metrics collector, creating it if needed
|
|
func getOrCreateCollector() *MetricsCollector {
|
|
collectorOnce.Do(func() {
|
|
globalCollector = &MetricsCollector{
|
|
history: make([]HistoryPoint, 0, historySize),
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
})
|
|
return globalCollector
|
|
}
|
|
|
|
// NewDevPprofHandler creates a new dev pprof handler
|
|
func NewDevPprofHandler() *DevPprofHandler {
|
|
return &DevPprofHandler{
|
|
collector: getOrCreateCollector(),
|
|
}
|
|
}
|
|
|
|
// Start begins the background metrics collection
|
|
func (c *MetricsCollector) Start() {
|
|
c.mu.Lock()
|
|
if c.started {
|
|
c.mu.Unlock()
|
|
return
|
|
}
|
|
c.stopCh = make(chan struct{})
|
|
c.started = true
|
|
c.mu.Unlock()
|
|
|
|
go c.collectLoop()
|
|
}
|
|
|
|
// Stop stops the background metrics collection
|
|
func (c *MetricsCollector) Stop() {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
if !c.started {
|
|
return
|
|
}
|
|
close(c.stopCh)
|
|
c.stopCh = nil
|
|
c.started = false
|
|
}
|
|
|
|
func (c *MetricsCollector) collectLoop() {
|
|
// Initialize CPU sample
|
|
c.lastCPUSample = getCPUSample()
|
|
|
|
// Wait a bit before first collection to get accurate CPU reading
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Collect immediately on start
|
|
c.collect()
|
|
|
|
ticker := time.NewTicker(metricsCollectionInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
c.collect()
|
|
case <-c.stopCh:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// calculateCPUUsage calculates CPU usage percentage between two samples
|
|
func calculateCPUUsage(prev, curr cpuSample, numCPU int) CPUStats {
|
|
elapsed := curr.timestamp.Sub(prev.timestamp)
|
|
if elapsed <= 0 {
|
|
return CPUStats{}
|
|
}
|
|
|
|
userDelta := curr.userTime - prev.userTime
|
|
systemDelta := curr.systemTime - prev.systemTime
|
|
totalCPUTime := userDelta + systemDelta
|
|
|
|
// Calculate percentage: (CPU time used / wall time) * 100
|
|
// Normalized by number of CPUs to get 0-100% range
|
|
cpuPercent := (float64(totalCPUTime) / float64(elapsed)) * 100.0
|
|
|
|
// Cap at 100% * numCPU (in case of measurement errors)
|
|
maxPercent := float64(numCPU) * 100.0
|
|
if cpuPercent > maxPercent {
|
|
cpuPercent = maxPercent
|
|
}
|
|
|
|
return CPUStats{
|
|
UsagePercent: cpuPercent,
|
|
UserTime: userDelta.Seconds(),
|
|
SystemTime: systemDelta.Seconds(),
|
|
}
|
|
}
|
|
|
|
func (c *MetricsCollector) collect() {
|
|
var memStats runtime.MemStats
|
|
runtime.ReadMemStats(&memStats)
|
|
|
|
// Get current CPU sample and calculate usage
|
|
currentSample := getCPUSample()
|
|
cpuStats := calculateCPUUsage(c.lastCPUSample, currentSample, runtime.NumCPU())
|
|
c.lastCPUSample = currentSample
|
|
|
|
point := HistoryPoint{
|
|
Timestamp: time.Now().Format(time.RFC3339),
|
|
Alloc: memStats.Alloc,
|
|
HeapInuse: memStats.HeapInuse,
|
|
Goroutines: runtime.NumGoroutine(),
|
|
GCPauseNs: memStats.PauseNs[(memStats.NumGC+255)%256],
|
|
CPUPercent: cpuStats.UsagePercent,
|
|
}
|
|
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
|
|
// Store current CPU stats for API response
|
|
c.currentCPU = cpuStats
|
|
|
|
// Append to history, maintaining ring buffer behavior
|
|
if len(c.history) >= historySize {
|
|
// Shift left by one and append
|
|
copy(c.history, c.history[1:])
|
|
c.history[len(c.history)-1] = point
|
|
} else {
|
|
c.history = append(c.history, point)
|
|
}
|
|
}
|
|
|
|
func (c *MetricsCollector) getHistory() []HistoryPoint {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
|
|
// Return a copy to avoid race conditions
|
|
result := make([]HistoryPoint, len(c.history))
|
|
copy(result, c.history)
|
|
return result
|
|
}
|
|
|
|
func (c *MetricsCollector) getCPUStats() CPUStats {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
return c.currentCPU
|
|
}
|
|
|
|
// getAllocations analyzes the heap profile and returns two allocation lists
|
|
// aggregated by full call stack:
|
|
// - cumulative: alloc_space / alloc_objects (total since process start)
|
|
// - inuse: inuse_space / inuse_objects (currently live on the heap)
|
|
//
|
|
// Both are produced from a single pprof.WriteHeapProfile call.
|
|
func getAllocations() (cumulative, inuse []AllocationInfo) {
|
|
var buf bytes.Buffer
|
|
if err := pprof.WriteHeapProfile(&buf); err != nil {
|
|
return nil, nil
|
|
}
|
|
|
|
p, err := profile.Parse(&buf)
|
|
if err != nil {
|
|
return nil, nil
|
|
}
|
|
|
|
allocObjectsIdx, allocSpaceIdx := -1, -1
|
|
inuseObjectsIdx, inuseSpaceIdx := -1, -1
|
|
for i, st := range p.SampleType {
|
|
switch st.Type {
|
|
case "alloc_objects":
|
|
allocObjectsIdx = i
|
|
case "alloc_space":
|
|
allocSpaceIdx = i
|
|
case "inuse_objects":
|
|
inuseObjectsIdx = i
|
|
case "inuse_space":
|
|
inuseSpaceIdx = i
|
|
}
|
|
}
|
|
|
|
allocMap := make(map[string]*AllocationInfo)
|
|
inuseMap := make(map[string]*AllocationInfo)
|
|
|
|
for _, sample := range p.Sample {
|
|
if len(sample.Location) == 0 {
|
|
continue
|
|
}
|
|
|
|
topLoc := sample.Location[0]
|
|
if len(topLoc.Line) == 0 {
|
|
continue
|
|
}
|
|
topLine := topLoc.Line[0]
|
|
topFn := topLine.Function
|
|
if topFn == nil {
|
|
continue
|
|
}
|
|
|
|
// Filter only the top frame — filtering inner frames would drop real
|
|
// user allocations that merely pass through runtime/profiler code.
|
|
if isProfilerFunction(topFn.Name, topFn.Filename) {
|
|
continue
|
|
}
|
|
|
|
// Build full stack in goroutine-dump format: alternating "funcName" and
|
|
// "\tfile:line" entries, top-down. Matches GoroutineGroup.Stack so the
|
|
// UI can render both with the same code path.
|
|
stack := make([]string, 0, len(sample.Location)*2)
|
|
for _, loc := range sample.Location {
|
|
if len(loc.Line) == 0 {
|
|
continue
|
|
}
|
|
frame := loc.Line[0]
|
|
if frame.Function == nil {
|
|
continue
|
|
}
|
|
stack = append(stack, frame.Function.Name)
|
|
stack = append(stack, "\t"+frame.Function.Filename+":"+strconv.FormatInt(frame.Line, 10))
|
|
}
|
|
if len(stack) == 0 {
|
|
continue
|
|
}
|
|
key := strings.Join(stack, "\n")
|
|
|
|
if allocSpaceIdx >= 0 && allocObjectsIdx >= 0 {
|
|
b := sample.Value[allocSpaceIdx]
|
|
c := sample.Value[allocObjectsIdx]
|
|
if existing, ok := allocMap[key]; ok {
|
|
existing.Bytes += b
|
|
existing.Count += c
|
|
} else {
|
|
allocMap[key] = &AllocationInfo{
|
|
Function: topFn.Name,
|
|
File: topFn.Filename,
|
|
Line: int(topLine.Line),
|
|
Bytes: b,
|
|
Count: c,
|
|
Stack: stack,
|
|
}
|
|
}
|
|
}
|
|
|
|
if inuseSpaceIdx >= 0 && inuseObjectsIdx >= 0 {
|
|
b := sample.Value[inuseSpaceIdx]
|
|
c := sample.Value[inuseObjectsIdx]
|
|
// Most samples have inuse=0 (already freed) — skip them so the live
|
|
// table isn't padded with noise.
|
|
if b == 0 && c == 0 {
|
|
continue
|
|
}
|
|
if existing, ok := inuseMap[key]; ok {
|
|
existing.Bytes += b
|
|
existing.Count += c
|
|
} else {
|
|
inuseMap[key] = &AllocationInfo{
|
|
Function: topFn.Name,
|
|
File: topFn.Filename,
|
|
Line: int(topLine.Line),
|
|
Bytes: b,
|
|
Count: c,
|
|
Stack: stack,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return flattenAndTopN(allocMap), flattenAndTopN(inuseMap)
|
|
}
|
|
|
|
// flattenAndTopN sorts an allocation map by bytes desc and caps it.
|
|
func flattenAndTopN(m map[string]*AllocationInfo) []AllocationInfo {
|
|
out := make([]AllocationInfo, 0, len(m))
|
|
for _, a := range m {
|
|
out = append(out, *a)
|
|
}
|
|
sort.Slice(out, func(i, j int) bool { return out[i].Bytes > out[j].Bytes })
|
|
if len(out) > topAllocationsCount {
|
|
out = out[:topAllocationsCount]
|
|
}
|
|
return out
|
|
}
|
|
|
|
// RegisterRoutes registers the dev pprof routes
|
|
func (h *DevPprofHandler) RegisterRoutes(r *router.Router, middlewares ...schemas.BifrostHTTPMiddleware) {
|
|
// Start the collector when routes are registered
|
|
h.collector.Start()
|
|
|
|
r.GET("/api/dev/pprof", lib.ChainMiddlewares(h.getPprof, middlewares...))
|
|
r.GET("/api/dev/pprof/goroutines", lib.ChainMiddlewares(h.getGoroutines, middlewares...))
|
|
}
|
|
|
|
// getPprof handles GET /api/dev/pprof
|
|
func (h *DevPprofHandler) getPprof(ctx *fasthttp.RequestCtx) {
|
|
var memStats runtime.MemStats
|
|
runtime.ReadMemStats(&memStats)
|
|
|
|
data := PprofData{
|
|
Timestamp: time.Now().Format(time.RFC3339),
|
|
Memory: MemoryStats{
|
|
Alloc: memStats.Alloc,
|
|
TotalAlloc: memStats.TotalAlloc,
|
|
HeapInuse: memStats.HeapInuse,
|
|
HeapObjects: memStats.HeapObjects,
|
|
Sys: memStats.Sys,
|
|
},
|
|
CPU: h.collector.getCPUStats(),
|
|
Runtime: RuntimeStats{
|
|
NumGoroutine: runtime.NumGoroutine(),
|
|
NumGC: memStats.NumGC,
|
|
GCPauseNs: memStats.PauseNs[(memStats.NumGC+255)%256],
|
|
NumCPU: runtime.NumCPU(),
|
|
GOMAXPROCS: runtime.GOMAXPROCS(0),
|
|
},
|
|
History: h.collector.getHistory(),
|
|
}
|
|
data.TopAllocations, data.InuseAllocations = getAllocations()
|
|
|
|
SendJSON(ctx, data)
|
|
}
|
|
|
|
// getGoroutines handles GET /api/dev/pprof/goroutines
|
|
// Returns goroutine stack traces grouped by stack signature
|
|
func (h *DevPprofHandler) getGoroutines(ctx *fasthttp.RequestCtx) {
|
|
// Check if raw output is requested
|
|
includeRaw := string(ctx.QueryArgs().Peek("raw")) == "true"
|
|
|
|
// Get goroutine profile
|
|
var buf bytes.Buffer
|
|
if err := pprof.Lookup("goroutine").WriteTo(&buf, 2); err != nil {
|
|
ctx.SetStatusCode(fasthttp.StatusInternalServerError)
|
|
SendJSON(ctx, map[string]string{"error": "failed to get goroutine profile"})
|
|
return
|
|
}
|
|
|
|
rawProfile := buf.String()
|
|
allGroups := parseGoroutineProfile(rawProfile)
|
|
|
|
// Filter out profiler goroutines and calculate summary
|
|
groups := make([]GoroutineGroup, 0, len(allGroups))
|
|
summary := GoroutineSummary{}
|
|
profilerGoroutineCount := 0
|
|
|
|
for i := range allGroups {
|
|
categorizeGoroutine(&allGroups[i])
|
|
|
|
// Skip profiler's own goroutines
|
|
if isProfilerGoroutine(&allGroups[i]) {
|
|
profilerGoroutineCount += allGroups[i].Count
|
|
continue
|
|
}
|
|
|
|
groups = append(groups, allGroups[i])
|
|
|
|
switch allGroups[i].Category {
|
|
case "background":
|
|
summary.Background += allGroups[i].Count
|
|
case "per-request":
|
|
summary.PerRequest += allGroups[i].Count
|
|
}
|
|
|
|
if allGroups[i].WaitMinutes >= 1 {
|
|
summary.LongWaiting += allGroups[i].Count
|
|
if allGroups[i].Category == "per-request" {
|
|
summary.PotentiallyStuck += allGroups[i].Count
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort: potentially stuck first, then by wait time, then by count
|
|
sort.Slice(groups, func(i, j int) bool {
|
|
// Potentially stuck (per-request + long wait) first
|
|
iStuck := groups[i].Category == "per-request" && groups[i].WaitMinutes >= 1
|
|
jStuck := groups[j].Category == "per-request" && groups[j].WaitMinutes >= 1
|
|
if iStuck != jStuck {
|
|
return iStuck
|
|
}
|
|
// Then by wait time
|
|
if groups[i].WaitMinutes != groups[j].WaitMinutes {
|
|
return groups[i].WaitMinutes > groups[j].WaitMinutes
|
|
}
|
|
// Then by count
|
|
return groups[i].Count > groups[j].Count
|
|
})
|
|
|
|
// Calculate app goroutines (total minus profiler goroutines)
|
|
// Calculate total goroutines from profile snapshot
|
|
totalFromProfile := 0
|
|
for _, g := range groups {
|
|
totalFromProfile += g.Count
|
|
}
|
|
|
|
response := GoroutineProfile{
|
|
Timestamp: time.Now().Format(time.RFC3339),
|
|
TotalGoroutines: totalFromProfile,
|
|
Groups: groups,
|
|
Summary: summary,
|
|
}
|
|
|
|
if includeRaw {
|
|
response.RawProfile = rawProfile
|
|
}
|
|
|
|
SendJSON(ctx, response)
|
|
}
|
|
|
|
// categorizeGoroutine determines if a goroutine is a background worker or per-request
|
|
func categorizeGoroutine(g *GoroutineGroup) {
|
|
// Parse wait time from wait reason (e.g., "5 minutes" -> 5)
|
|
g.WaitMinutes = parseWaitMinutes(g.WaitReason)
|
|
|
|
stackStr := strings.Join(g.Stack, " ")
|
|
|
|
// Background goroutines - expected to run forever
|
|
backgroundPatterns := []string{
|
|
"requestWorker", // Provider queue workers
|
|
"collectLoop", // Metrics collector
|
|
"cleanupWorker", // Various cleanup workers
|
|
"startAccumulatorMapCleanup", // Stream accumulator cleanup
|
|
"cleanupOldTraces", // Trace store cleanup
|
|
"startCleanup", // Generic cleanup
|
|
"monitorLoop", // Health monitor
|
|
"StartHeartbeat", // WebSocket heartbeat
|
|
"time.Sleep", // Ticker-based workers
|
|
"runtime.gopark", // Runtime parking (often tickers)
|
|
"sync.(*Cond).Wait", // Condition variable waits
|
|
"net/http.(*persistConn)", // HTTP connection pool
|
|
"internal/poll.runtime_pollWait", // Network polling
|
|
}
|
|
|
|
for _, pattern := range backgroundPatterns {
|
|
if strings.Contains(stackStr, pattern) {
|
|
g.Category = "background"
|
|
return
|
|
}
|
|
}
|
|
|
|
// Per-request goroutines - should complete when request ends
|
|
perRequestPatterns := []string{
|
|
"PreLLMHook",
|
|
"PostLLMHook",
|
|
"PreMCPHook",
|
|
"PostMCPHook",
|
|
"HTTPTransportPreHook",
|
|
"HTTPTransportPostHook",
|
|
"CompleteAndFlushTrace",
|
|
"ProcessAndSend",
|
|
"handleProvider",
|
|
"Inject", // Observability plugin inject
|
|
"insertInitialLogEntry", // Logging
|
|
"updateLogEntry", // Logging
|
|
"retryOnNotFound",
|
|
"BroadcastLogUpdate",
|
|
}
|
|
|
|
for _, pattern := range perRequestPatterns {
|
|
if strings.Contains(stackStr, pattern) {
|
|
g.Category = "per-request"
|
|
return
|
|
}
|
|
}
|
|
|
|
g.Category = "unknown"
|
|
}
|
|
|
|
// parseWaitMinutes extracts wait time in minutes from wait reason string
|
|
func parseWaitMinutes(waitReason string) int {
|
|
if waitReason == "" {
|
|
return 0
|
|
}
|
|
|
|
// Match patterns like "5 minutes", "1 minute", "30 seconds", "2 hours"
|
|
minuteRegex := regexp.MustCompile(`(\d+)\s*minute`)
|
|
if matches := minuteRegex.FindStringSubmatch(waitReason); len(matches) >= 2 {
|
|
if mins, err := strconv.Atoi(matches[1]); err == nil {
|
|
return mins
|
|
}
|
|
}
|
|
|
|
hourRegex := regexp.MustCompile(`(\d+)\s*hour`)
|
|
if matches := hourRegex.FindStringSubmatch(waitReason); len(matches) >= 2 {
|
|
if hours, err := strconv.Atoi(matches[1]); err == nil {
|
|
return hours * 60
|
|
}
|
|
}
|
|
|
|
secondRegex := regexp.MustCompile(`(\d+)\s*second`)
|
|
if matches := secondRegex.FindStringSubmatch(waitReason); len(matches) >= 2 {
|
|
if secs, err := strconv.Atoi(matches[1]); err == nil {
|
|
return secs / 60 // Convert to minutes, will be 0 for < 60 seconds
|
|
}
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
// parseGoroutineProfile parses the text output of pprof goroutine profile
|
|
// and groups goroutines by their stack trace
|
|
func parseGoroutineProfile(profile string) []GoroutineGroup {
|
|
// Regex to match goroutine header: "goroutine N [state, wait reason]:"
|
|
// Examples:
|
|
// goroutine 1 [running]:
|
|
// goroutine 42 [select, 5 minutes]:
|
|
// goroutine 100 [chan receive]:
|
|
headerRegex := regexp.MustCompile(`goroutine \d+ \[([^\]]+)\]:`)
|
|
|
|
// Split by "goroutine " to get individual goroutine blocks
|
|
blocks := strings.Split(profile, "goroutine ")
|
|
|
|
// Map to group goroutines by stack signature
|
|
groupMap := make(map[string]*GoroutineGroup)
|
|
|
|
for _, block := range blocks {
|
|
block = strings.TrimSpace(block)
|
|
if block == "" {
|
|
continue
|
|
}
|
|
|
|
// Re-add "goroutine " prefix for regex matching
|
|
fullBlock := "goroutine " + block
|
|
|
|
// Extract state from header
|
|
matches := headerRegex.FindStringSubmatch(fullBlock)
|
|
if len(matches) < 2 {
|
|
continue
|
|
}
|
|
|
|
stateInfo := matches[1]
|
|
state := stateInfo
|
|
waitReason := ""
|
|
|
|
// Parse state and wait reason (e.g., "select, 5 minutes" -> state="select", waitReason="5 minutes")
|
|
if idx := strings.Index(stateInfo, ","); idx != -1 {
|
|
state = strings.TrimSpace(stateInfo[:idx])
|
|
waitReason = strings.TrimSpace(stateInfo[idx+1:])
|
|
}
|
|
|
|
// Get stack trace (everything after the header line)
|
|
lines := strings.Split(block, "\n")
|
|
if len(lines) < 2 {
|
|
continue
|
|
}
|
|
|
|
// Extract stack frames (skip the header line which is lines[0])
|
|
var stackLines []string
|
|
var topFunc string
|
|
for i := 1; i < len(lines); i++ {
|
|
line := strings.TrimSpace(lines[i])
|
|
if line == "" {
|
|
continue
|
|
}
|
|
stackLines = append(stackLines, line)
|
|
|
|
// First function line (not a file:line) is the top function
|
|
if topFunc == "" && !strings.HasPrefix(line, "/") && !strings.Contains(line, ".go:") {
|
|
topFunc = line
|
|
}
|
|
}
|
|
|
|
if len(stackLines) == 0 {
|
|
continue
|
|
}
|
|
|
|
// Create a signature from the stack (top 10 frames for grouping)
|
|
maxFrames := 10
|
|
if len(stackLines) < maxFrames {
|
|
maxFrames = len(stackLines)
|
|
}
|
|
signature := state + "|" + strings.Join(stackLines[:maxFrames], "|")
|
|
|
|
// Group by signature
|
|
if existing, ok := groupMap[signature]; ok {
|
|
existing.Count++
|
|
} else {
|
|
groupMap[signature] = &GoroutineGroup{
|
|
Count: 1,
|
|
State: state,
|
|
WaitReason: waitReason,
|
|
TopFunc: topFunc,
|
|
Stack: stackLines,
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert map to slice
|
|
groups := make([]GoroutineGroup, 0, len(groupMap))
|
|
for _, group := range groupMap {
|
|
groups = append(groups, *group)
|
|
}
|
|
|
|
return groups
|
|
}
|
|
|
|
// profilerPatterns contains patterns to identify profiler-related code
|
|
var profilerPatterns = []string{
|
|
"devpprof",
|
|
"pprof.WriteHeapProfile",
|
|
"pprof.Lookup",
|
|
"profile.Parse",
|
|
"MetricsCollector",
|
|
"collectLoop",
|
|
"getAllocations",
|
|
"flattenAndTopN",
|
|
"parseGoroutineProfile",
|
|
"getGoroutines",
|
|
"getCPUSample",
|
|
}
|
|
|
|
// isProfilerFunction checks if a function belongs to the profiler itself
|
|
func isProfilerFunction(funcName, fileName string) bool {
|
|
for _, pattern := range profilerPatterns {
|
|
if strings.Contains(funcName, pattern) || strings.Contains(fileName, pattern) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isProfilerGoroutine checks if a goroutine belongs to the profiler
|
|
func isProfilerGoroutine(g *GoroutineGroup) bool {
|
|
stackStr := strings.Join(g.Stack, " ")
|
|
for _, pattern := range profilerPatterns {
|
|
if strings.Contains(stackStr, pattern) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Cleanup stops the metrics collector
|
|
func (h *DevPprofHandler) Cleanup() {
|
|
if h.collector != nil {
|
|
h.collector.Stop()
|
|
}
|
|
}
|