first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/.github/workflows/scripts/load-test.sh
+++ b/.github/workflows/scripts/load-test.sh
@@ -0,0 +1,850 @@
+#!/bin/bash
+
+# Load Test Script for Bifrost
+# Runs a load test against bifrost-http with a mocker provider
+# Usage: ./load-test.sh
+#
+# This script:
+# 1. Builds bifrost-http and mocker locally
+# 2. Creates a config.json with mocker provider (OpenAI-style)
+# 3. Starts mocker with OVERHEAD_MOCKER_LATENCY_MS latency and bifrost-http
+# 4. Runs a calibration (Vegeta -> Mocker direct) to measure Vegeta+network baseline
+# 5. Runs the overhead test (Vegeta -> Bifrost -> Mocker) to measure total
+# 6. Subtracts calibration from test to isolate Bifrost proxy overhead
+#    (includes local network hop, JSON parsing/unparsing, plugins, and mocker jitter)
+# 7. Restarts mocker (STRESS_MOCKER_LATENCY_MS latency) and bifrost-http for a sustained concurrency stress test
+# 8. Asserts overhead < tiered thresholds (per percentile) and stress test has 100% success rate
+
+set -e
+
+# Configuration
+# All paths are derived from the location of this script so it can be invoked
+# from any working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+BIFROST_HTTP_DIR="${REPO_ROOT}/transports/bifrost-http"
+TRANSPORTS_DIR="${REPO_ROOT}/transports"
+WORK_DIR="${SCRIPT_DIR}"
+MOCKER_DIR="${REPO_ROOT}/../bifrost-benchmarking/mocker"
+
+BIFROST_PORT=8080
+MOCKER_PORT=8000
+RATE=1000                       # requests per second sent by Vegeta
+MAX_WORKERS=12000               # upper bound passed to vegeta -max-workers
+OVERHEAD_DURATION=30            # overhead measurement duration (seconds)
+STRESS_DURATION=30              # stress test duration (seconds)
+OVERHEAD_MOCKER_LATENCY_MS=1000  # 1 second latency for overhead measurement
+STRESS_MOCKER_LATENCY_MS=1000    # 1 second latency for stress test
+# Tiered overhead thresholds (µs) — these cover the full proxy cost:
+# local network hop, JSON parsing/unparsing, plugins, and mocker jitter.
+# At ${RATE} RPS × ${OVERHEAD_MOCKER_LATENCY_MS}ms latency ≈ 1000 concurrent requests.
+MAX_OVERHEAD_MEAN_US=5000       # mean overhead threshold (5ms)
+MAX_OVERHEAD_P50_US=5000        # p50 overhead threshold (5ms)
+MAX_OVERHEAD_P90_US=10000       # p90 overhead threshold (10ms)
+MAX_OVERHEAD_P95_US=20000       # p95 overhead threshold (20ms)
+MAX_OVERHEAD_P99_US=100000      # p99 overhead threshold (100ms)
+
+# Results storage for summary table
+RESULTS_FILE="${WORK_DIR}/load-test-results.md"
+RESULTS_JSON="${WORK_DIR}/load-test-results.json"
+
+# PIDs of the background services. Initialised empty so the EXIT trap can
+# test them safely even when the script aborts before a service is started.
+BIFROST_PID=""
+MOCKER_PID=""
+
+# Process stats monitoring
+STATS_PID=""
+STATS_FILE="${WORK_DIR}/bifrost-stats.csv"
+
+# Summary of the most recent stats-monitor run (filled by stop_stats_monitor)
+STATS_CPU_AVG=""
+STATS_CPU_PEAK=""
+STATS_RSS_AVG=""
+STATS_RSS_PEAK=""
+
+# Overhead-phase process stats (saved before bifrost restart)
+OVERHEAD_STATS_CPU_AVG=""
+OVERHEAD_STATS_CPU_PEAK=""
+OVERHEAD_STATS_RSS_AVG=""
+OVERHEAD_STATS_RSS_PEAK=""
+
+# Calibration results per bucket (Vegeta -> Mocker direct), in nanoseconds
+CAL_MIN_NS=0
+CAL_MEAN_NS=0
+CAL_50_NS=0
+CAL_90_NS=0
+CAL_95_NS=0
+CAL_99_NS=0
+CAL_MAX_NS=0
+
+# Colors for output (ANSI escape sequences, interpreted by the log_* helpers)
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+log_info() {
+  echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+  echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+log_warn() {
+  echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+  echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Cleanup function to kill background processes
+cleanup() {
+  log_info "Cleaning up..."
+  if [ -n "$STATS_PID" ] && kill -0 "$STATS_PID" 2>/dev/null; then
+    kill "$STATS_PID" 2>/dev/null || true
+    wait "$STATS_PID" 2>/dev/null || true
+  fi
+  if [ -n "$BIFROST_PID" ] && kill -0 "$BIFROST_PID" 2>/dev/null; then
+    kill "$BIFROST_PID" 2>/dev/null || true
+    wait "$BIFROST_PID" 2>/dev/null || true
+  fi
+  if [ -n "$MOCKER_PID" ] && kill -0 "$MOCKER_PID" 2>/dev/null; then
+    kill "$MOCKER_PID" 2>/dev/null || true
+    wait "$MOCKER_PID" 2>/dev/null || true
+  fi
+  # Clean up temporary files (keep results files for artifact upload)
+  rm -f "${WORK_DIR}/config.json" "${WORK_DIR}/logs.db" "${WORK_DIR}/attack.bin" "${WORK_DIR}/calibration.bin" "${WORK_DIR}/stress.bin" "${WORK_DIR}/bifrost.log" "${WORK_DIR}/vegeta-target.json" "${WORK_DIR}/vegeta-target-calibration.json" "${WORK_DIR}/vegeta-target-stress.json" "${WORK_DIR}/vegeta-report.json" "${WORK_DIR}/bifrost-stats.csv" 2>/dev/null || true
+  log_info "Cleanup complete"
+}
+
+trap cleanup EXIT
+
+# Check for required tools
+# Verifies that the external commands this script invokes unconditionally are
+# available on PATH, and exits with status 1 on the first one that is missing.
+# jq/python3 and awk are probed where they are used, so they are not checked here.
+check_dependencies() {
+  log_info "Checking dependencies..."
+
+  if ! command -v go &> /dev/null; then
+    log_error "Go is not installed. Please install Go 1.24.3 or later."
+    exit 1
+  fi
+
+  if ! command -v git &> /dev/null; then
+    log_error "Git is not installed. Please install Git."
+    exit 1
+  fi
+
+  # curl (mocker readiness probe), bc (latency arithmetic) and base64 (Vegeta
+  # target bodies) are needed later; fail fast here rather than mid-run.
+  local tool
+  for tool in curl bc base64; do
+    if ! command -v "$tool" &> /dev/null; then
+      log_error "${tool} is not installed. Please install ${tool}."
+      exit 1
+    fi
+  done
+
+  log_success "All dependencies found"
+}
+
+# Kill any process listening on a specific port (not processes with connections to it)
+# Arguments: $1 = TCP port number
+kill_port() {
+  local port=$1
+  local listeners
+  # A failing lsof simply yields an empty PID list; it must not abort the script
+  listeners=$(lsof -ti "TCP:${port}" -sTCP:LISTEN 2>/dev/null) || true
+
+  if [ -z "$listeners" ]; then
+    return 0
+  fi
+
+  log_warn "Killing existing process(es) listening on port ${port}: ${listeners}"
+  echo "$listeners" | xargs kill -9 2>/dev/null || true
+  # Short pause after the kill before the caller continues
+  sleep 1
+}
+
+# Kill processes on required ports before starting
+cleanup_ports() {
+  log_info "Checking for processes on required ports..."
+  kill_port ${MOCKER_PORT}
+  kill_port ${BIFROST_PORT}
+}
+
+# Make sure the `vegeta` command is available, installing it with `go install`
+# when it is not already on PATH. Exits 1 if it is still missing afterwards.
+install_vegeta() {
+  if command -v vegeta &> /dev/null; then
+    log_success "Vegeta already installed"
+    return 0
+  fi
+
+  log_info "Installing Vegeta load testing tool..."
+  go install github.com/tsenart/vegeta/v12@latest
+  # Make the freshly installed binary visible to the rest of the script
+  export PATH="$PATH:$(go env GOPATH)/bin"
+
+  command -v vegeta &> /dev/null || {
+    log_error "Failed to install Vegeta"
+    exit 1
+  }
+  log_success "Vegeta installed"
+}
+
+# Build bifrost-http if binary doesn't exist
+# Output: ${REPO_ROOT}/tmp/bifrost-http (built from TRANSPORTS_DIR).
+# Side effect: on a build, the working directory ends up at WORK_DIR.
+# Exits 1 when the build fails.
+build_bifrost_http() {
+  local bifrost_bin="${REPO_ROOT}/tmp/bifrost-http"
+
+  if [ -f "${bifrost_bin}" ]; then
+    log_success "bifrost-http binary already exists at ${bifrost_bin}"
+    return 0
+  fi
+
+  log_info "Building bifrost-http..."
+  # Make sure the output directory exists before handing the path to go build
+  mkdir -p "${REPO_ROOT}/tmp"
+  cd "${TRANSPORTS_DIR}"
+
+  # The output path is quoted so a repository path containing spaces works
+  if go build -o "${bifrost_bin}" .; then
+    log_success "bifrost-http built successfully"
+  else
+    log_error "Failed to build bifrost-http"
+    exit 1
+  fi
+
+  cd "${WORK_DIR}"
+}
+
+# Clone and setup mocker from bifrost-benchmarking
+# The checkout lives next to the repository, at
+# ${REPO_ROOT}/../bifrost-benchmarking, which is the parent of MOCKER_DIR.
+# An existing checkout is updated on a best-effort basis; otherwise a shallow
+# clone is created at that same location.
+# Side effect: the working directory ends up at WORK_DIR.
+setup_mocker() {
+  local bench_dir="${REPO_ROOT}/../bifrost-benchmarking"
+
+  if [ -d "${bench_dir}" ]; then
+    log_info "Updating bifrost-benchmarking repository..."
+    cd "${bench_dir}"
+    # Best effort: a failed pull should not abort the run, but say so
+    git pull --quiet || log_warn "git pull failed; using existing bifrost-benchmarking checkout"
+    cd "${WORK_DIR}"
+  else
+    log_info "Cloning bifrost-benchmarking repository..."
+    cd "${WORK_DIR}"
+    # Clone to the location MOCKER_DIR and the check above refer to, not into
+    # the current directory
+    git clone --depth 1 https://github.com/maximhq/bifrost-benchmarking.git "${bench_dir}"
+  fi
+
+  log_success "Mocker setup complete"
+}
+
+# Compile the mocker from MOCKER_DIR into ${REPO_ROOT}/tmp/mocker unless that
+# binary is already present (a prebuilt binary avoids go run overhead).
+# Exits 1 when the build fails.
+build_mocker() {
+  local mocker_bin="${REPO_ROOT}/tmp/mocker"
+
+  if [ -f "${mocker_bin}" ]; then
+    log_success "mocker binary already exists at ${mocker_bin}"
+    return 0
+  fi
+
+  log_info "Building mocker..."
+  cd "${MOCKER_DIR}"
+
+  if ! go build -o "${mocker_bin}" .; then
+    log_error "Failed to build mocker"
+    exit 1
+  fi
+  log_success "mocker built successfully"
+
+  cd "${WORK_DIR}"
+}
+
+# Create config.json for bifrost with mocker provider
+# Writes ${WORK_DIR}/config.json, registering an OpenAI-style provider whose
+# base_url points at the local mocker on MOCKER_PORT.
+create_config() {
+  log_info "Creating config.json..."
+
+  # The heredoc is unquoted so ${MOCKER_PORT} expands; the literal "$schema"
+  # key is therefore escaped.
+  cat > "${WORK_DIR}/config.json" << EOF
+{
+  "\$schema": "https://www.getbifrost.ai/schema",
+  "client": {
+    "enable_logging": false,
+    "initial_pool_size": 20000,
+    "drop_excess_requests": false,
+    "allow_direct_keys": false
+  },
+  "config_store": {
+    "enabled": false
+  },
+  "logs_store": {
+    "enabled": false
+  },
+  "providers": {
+    "openai": {
+      "keys": [
+        {
+          "name": "mocker-key",
+          "value": "Bearer mocker-key",
+          "weight": 1
+        }
+      ],
+      "network_config": {
+        "base_url": "http://localhost:${MOCKER_PORT}",
+        "default_request_timeout_in_seconds": 30
+      },
+      "concurrency_and_buffer_size": {
+        "concurrency": 20000,
+        "buffer_size": 40000
+      },
+      "custom_provider_config": {
+        "base_provider_type": "openai",
+        "allowed_requests": {
+          "list_models": false,
+          "chat_completion": true,
+          "chat_completion_stream": true
+        }
+      }
+    }
+  }
+}
+EOF
+
+  log_success "config.json created"
+}
+
+# Start mocker with specified latency
+# Arguments: $1 = latency in ms
+start_mocker() {
+  local latency_ms=${1:-0}
+  log_info "Starting mocker server on port ${MOCKER_PORT} with ${latency_ms}ms latency..."
+
+  "${REPO_ROOT}/tmp/mocker" -port ${MOCKER_PORT} -host 0.0.0.0 -latency ${latency_ms} &
+  MOCKER_PID=$!
+
+  # Wait for mocker to be ready
+  local max_attempts=30
+  local attempt=0
+  while ! curl -s "http://localhost:${MOCKER_PORT}/v1/chat/completions" -X POST \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer mocker-key" \
+    -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"test"}]}' > /dev/null 2>&1; do
+    sleep 1
+    attempt=$((attempt + 1))
+    if [ $attempt -ge $max_attempts ]; then
+      log_error "Mocker failed to start within ${max_attempts} seconds"
+      exit 1
+    fi
+  done
+
+  log_success "Mocker server started (PID: ${MOCKER_PID})"
+}
+
+# Terminate the mocker started by start_mocker, if it is still running, and
+# clear MOCKER_PID. Does nothing when no live mocker process is recorded.
+stop_mocker() {
+  if [ -z "$MOCKER_PID" ] || ! kill -0 "$MOCKER_PID" 2>/dev/null; then
+    return 0
+  fi
+
+  log_info "Stopping mocker (PID: ${MOCKER_PID})..."
+  kill "$MOCKER_PID" 2>/dev/null || true
+  wait "$MOCKER_PID" 2>/dev/null || true
+  MOCKER_PID=""
+  # Short pause after shutdown before the caller continues
+  sleep 1
+}
+
+# Terminate the bifrost-http process started by start_bifrost, if it is still
+# running, and clear BIFROST_PID. Does nothing when no live process is recorded.
+stop_bifrost() {
+  if [ -z "$BIFROST_PID" ] || ! kill -0 "$BIFROST_PID" 2>/dev/null; then
+    return 0
+  fi
+
+  log_info "Stopping bifrost (PID: ${BIFROST_PID})..."
+  kill "$BIFROST_PID" 2>/dev/null || true
+  wait "$BIFROST_PID" 2>/dev/null || true
+  BIFROST_PID=""
+  # Short pause after shutdown before the caller continues
+  sleep 1
+}
+
+# Start background process stats collection for bifrost
+# Samples CPU% and RSS every second, writes to CSV
+start_stats_monitor() {
+  if [ -z "$BIFROST_PID" ] || ! kill -0 "$BIFROST_PID" 2>/dev/null; then
+    log_warn "Cannot start stats monitor: bifrost not running"
+    return
+  fi
+
+  echo "timestamp,cpu_pct,rss_mb" > "${STATS_FILE}"
+
+  (
+    while kill -0 "$BIFROST_PID" 2>/dev/null; do
+      # ps -o %cpu= -o rss= works on both macOS and Linux
+      stats=$(ps -p "$BIFROST_PID" -o %cpu=,rss= 2>/dev/null)
+      if [ -n "$stats" ]; then
+        cpu=$(echo "$stats" | awk '{print $1}')
+        rss_kb=$(echo "$stats" | awk '{print $2}')
+        rss_mb=$(echo "scale=1; ${rss_kb} / 1024" | bc)
+        echo "$(date +%s),${cpu},${rss_mb}" >> "${STATS_FILE}"
+      fi
+      sleep 1
+    done
+  ) &
+  STATS_PID=$!
+  log_info "Stats monitor started (PID: ${STATS_PID})"
+}
+
+# Stop stats monitor and print summary
+stop_stats_monitor() {
+  if [ -n "$STATS_PID" ] && kill -0 "$STATS_PID" 2>/dev/null; then
+    kill "$STATS_PID" 2>/dev/null || true
+    wait "$STATS_PID" 2>/dev/null || true
+    STATS_PID=""
+  fi
+
+  if [ ! -f "${STATS_FILE}" ] || [ $(wc -l < "${STATS_FILE}") -le 1 ]; then
+    log_warn "No process stats collected"
+    return
+  fi
+
+  # Compute peak and average CPU/RSS from CSV (skip header)
+  if command -v awk &> /dev/null; then
+    local stats_summary=$(awk -F',' 'NR>1 {
+      cpu_sum+=$2; rss_sum+=$3; n++;
+      if($2>cpu_max) cpu_max=$2;
+      if($3>rss_max) rss_max=$3;
+    } END {
+      if(n>0) printf "%.1f,%.1f,%.1f,%.1f,%d", cpu_sum/n, cpu_max, rss_sum/n, rss_max, n
+    }' "${STATS_FILE}")
+
+    STATS_CPU_AVG=$(echo "$stats_summary" | cut -d',' -f1)
+    STATS_CPU_PEAK=$(echo "$stats_summary" | cut -d',' -f2)
+    STATS_RSS_AVG=$(echo "$stats_summary" | cut -d',' -f3)
+    STATS_RSS_PEAK=$(echo "$stats_summary" | cut -d',' -f4)
+    local samples=$(echo "$stats_summary" | cut -d',' -f5)
+
+    echo ""
+    log_success "Bifrost process stats (single instance, ${samples} samples):"
+    log_info "  CPU:  avg=${STATS_CPU_AVG}%, peak=${STATS_CPU_PEAK}%"
+    log_info "  RSS:  avg=${STATS_RSS_AVG}MB, peak=${STATS_RSS_PEAK}MB"
+  fi
+}
+
+# Start bifrost-http server
+start_bifrost() {
+  log_info "Starting bifrost-http on port ${BIFROST_PORT}..."
+
+  cd "${WORK_DIR}"
+  local bifrost_log="${WORK_DIR}/bifrost.log"
+  "${REPO_ROOT}/tmp/bifrost-http" -app-dir "${WORK_DIR}" -port "${BIFROST_PORT}" -host "0.0.0.0" -log-level "info" > "${bifrost_log}" 2>&1 &
+  BIFROST_PID=$!
+
+  # Wait for bifrost to be fully ready (look for "successfully started bifrost" message)
+  local max_attempts=60
+  local attempt=0
+  while ! grep -q "successfully started bifrost" "${bifrost_log}" 2>/dev/null; do
+    sleep 1
+    attempt=$((attempt + 1))
+    if [ $attempt -ge $max_attempts ]; then
+      log_error "Bifrost failed to start within ${max_attempts} seconds"
+      log_error "Bifrost log output:"
+      cat "${bifrost_log}" 2>/dev/null || true
+      exit 1
+    fi
+    # Check if process is still running
+    if ! kill -0 "$BIFROST_PID" 2>/dev/null; then
+      log_error "Bifrost process died unexpectedly"
+      log_error "Bifrost log output:"
+      cat "${bifrost_log}" 2>/dev/null || true
+      exit 1
+    fi
+  done
+
+  log_success "Bifrost-http started (PID: ${BIFROST_PID})"
+}
+
+# Extract latencies from a vegeta binary results file
+# Arguments: $1 = path to .bin file
+# Sets: EXTRACTED_MIN_NS, EXTRACTED_MEAN_NS, EXTRACTED_50_NS, etc.
+extract_latencies() {
+  local bin_file=$1
+  local json_report_file="${WORK_DIR}/vegeta-report.json"
+  vegeta report -type=json < "${bin_file}" > "${json_report_file}"
+
+  if command -v jq &> /dev/null; then
+    EXTRACTED_MIN_NS=$(jq '.latencies.min // 0' "${json_report_file}")
+    EXTRACTED_MEAN_NS=$(jq '.latencies.mean // 0' "${json_report_file}")
+    EXTRACTED_50_NS=$(jq '.latencies["50th"] // 0' "${json_report_file}")
+    EXTRACTED_90_NS=$(jq '.latencies["90th"] // 0' "${json_report_file}")
+    EXTRACTED_95_NS=$(jq '.latencies["95th"] // 0' "${json_report_file}")
+    EXTRACTED_99_NS=$(jq '.latencies["99th"] // 0' "${json_report_file}")
+    EXTRACTED_MAX_NS=$(jq '.latencies.max // 0' "${json_report_file}")
+    EXTRACTED_SUCCESS=$(jq '.success // 0' "${json_report_file}")
+    EXTRACTED_RATE=$(jq '.rate // 0' "${json_report_file}")
+    EXTRACTED_THROUGHPUT=$(jq '.throughput // 0' "${json_report_file}")
+  elif command -v python3 &> /dev/null; then
+    EXTRACTED_MIN_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('min', 0))")
+    EXTRACTED_MEAN_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('mean', 0))")
+    EXTRACTED_50_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('50th', 0))")
+    EXTRACTED_90_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('90th', 0))")
+    EXTRACTED_95_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('95th', 0))")
+    EXTRACTED_99_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('99th', 0))")
+    EXTRACTED_MAX_NS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('latencies', {}).get('max', 0))")
+    EXTRACTED_SUCCESS=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('success', 0))")
+    EXTRACTED_RATE=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('rate', 0))")
+    EXTRACTED_THROUGHPUT=$(python3 -c "import json; d=json.load(open('${json_report_file}')); print(d.get('throughput', 0))")
+  else
+    log_error "Neither jq nor python3 found. Cannot parse JSON results."
+    return 1
+  fi
+
+  rm -f "${json_report_file}"
+}
+
+# ============================================================
+# Phase 1: Overhead measurement (mocker at ${OVERHEAD_MOCKER_LATENCY_MS}ms)
+# ============================================================
+
+# Calibration: Vegeta -> Mocker direct (with latency)
+# Measures: Vegeta HTTP client + localhost network round-trip + mocker response generation
+# Writes the Vegeta target file and calibration.bin into WORK_DIR.
+# Sets: CAL_MIN_NS, CAL_MEAN_NS, CAL_50_NS, CAL_90_NS, CAL_95_NS, CAL_99_NS,
+#       CAL_MAX_NS (nanoseconds, copied from the EXTRACTED_* values)
+run_calibration() {
+  echo ""
+  echo "╔═══════════════════════════════════════════════════════════╗"
+  echo "║    Calibration: Vegeta -> Mocker (${OVERHEAD_MOCKER_LATENCY_MS}ms, direct)        ║"
+  echo "╚═══════════════════════════════════════════════════════════╝"
+  echo ""
+  log_info "Measuring Vegeta + network baseline (mocker at ${OVERHEAD_MOCKER_LATENCY_MS}ms latency)"
+  log_info "Duration: ${OVERHEAD_DURATION}s at ${RATE} RPS, ~$(( RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000 )) concurrent"
+  echo ""
+
+  local target_file="${WORK_DIR}/vegeta-target-calibration.json"
+  local payload='{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello, how are you?"}]}'
+
+  # Keep the encoded body on one line: GNU base64 wraps its output at 76
+  # columns by default, and a line break inside the JSON string would make the
+  # target invalid.
+  local body_b64
+  body_b64=$(printf '%s' "${payload}" | base64 | tr -d '\n')
+
+  cat > "${target_file}" << EOF
+{"method": "POST", "url": "http://localhost:${MOCKER_PORT}/v1/chat/completions", "header": {"Content-Type": ["application/json"], "Authorization": ["Bearer mocker-key"]}, "body": "${body_b64}"}
+EOF
+
+  # Timeout = mocker latency (whole seconds) + 5s; the initial worker count
+  # equals the expected number of in-flight requests (rate x latency)
+  vegeta attack \
+    -format=json \
+    -targets="${target_file}" \
+    -rate="${RATE}" \
+    -duration="${OVERHEAD_DURATION}s" \
+    -timeout="$((OVERHEAD_MOCKER_LATENCY_MS / 1000 + 5))s" \
+    -workers=$((RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000)) \
+    -max-workers="${MAX_WORKERS}" > "${WORK_DIR}/calibration.bin"
+
+  echo ""
+  log_info "Calibration complete. Results:"
+  vegeta report < "${WORK_DIR}/calibration.bin"
+
+  extract_latencies "${WORK_DIR}/calibration.bin"
+
+  log_info "Actual RPS: $(printf "%.0f" $EXTRACTED_RATE) (configured: ${RATE})"
+
+  CAL_MIN_NS=$EXTRACTED_MIN_NS
+  CAL_MEAN_NS=$EXTRACTED_MEAN_NS
+  CAL_50_NS=$EXTRACTED_50_NS
+  CAL_90_NS=$EXTRACTED_90_NS
+  CAL_95_NS=$EXTRACTED_95_NS
+  CAL_99_NS=$EXTRACTED_99_NS
+  CAL_MAX_NS=$EXTRACTED_MAX_NS
+
+  echo ""
+  log_success "Calibration baseline (per bucket):"
+  log_info "  Min:  $(echo "scale=2; $CAL_MIN_NS / 1000" | bc)µs"
+  log_info "  Mean: $(echo "scale=2; $CAL_MEAN_NS / 1000" | bc)µs"
+  log_info "  P50:  $(echo "scale=2; $CAL_50_NS / 1000" | bc)µs"
+  log_info "  P90:  $(echo "scale=2; $CAL_90_NS / 1000" | bc)µs"
+  log_info "  P95:  $(echo "scale=2; $CAL_95_NS / 1000" | bc)µs"
+  log_info "  P99:  $(echo "scale=2; $CAL_99_NS / 1000" | bc)µs"
+  log_info "  Max:  $(echo "scale=2; $CAL_MAX_NS / 1000" | bc)µs"
+}
+
+# Overhead test: Vegeta -> Bifrost -> Mocker (with latency)
+# Same duration/rate as calibration so percentile distributions are comparable
+# Reads:   CAL_*_NS calibration baseline and MAX_OVERHEAD_*_US thresholds
+# Writes:  attack.bin and the Vegeta target file in WORK_DIR; creates
+#          RESULTS_FILE (markdown) and RESULTS_JSON
+# Exits 1 when no latencies could be extracted or when any of the
+# mean/p50/p90/p95/p99 overheads exceeds its threshold.
+run_overhead_test() {
+  echo ""
+  echo "╔═══════════════════════════════════════════════════════════╗"
+  echo "║  Overhead Test: Vegeta -> Bifrost -> Mocker (${OVERHEAD_MOCKER_LATENCY_MS}ms)     ║"
+  echo "╚═══════════════════════════════════════════════════════════╝"
+  echo ""
+  log_info "Measuring Bifrost overhead (single instance, mocker at ${OVERHEAD_MOCKER_LATENCY_MS}ms latency)"
+  log_info "Duration: ${OVERHEAD_DURATION}s at ${RATE} RPS, ~$(( RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000 )) concurrent requests through Bifrost"
+  log_info "Overhead consists of: vegetta overhead and mocker timeout jitter"
+  echo ""
+
+  local target_file="${WORK_DIR}/vegeta-target.json"
+  local payload='{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Hello, how are you?"}]}'
+
+  # Keep the encoded body on one line: GNU base64 wraps its output at 76
+  # columns by default, and a line break inside the JSON string would make the
+  # target invalid.
+  local body_b64
+  body_b64=$(printf '%s' "${payload}" | base64 | tr -d '\n')
+
+  cat > "${target_file}" << EOF
+{"method": "POST", "url": "http://localhost:${BIFROST_PORT}/v1/chat/completions", "header": {"Content-Type": ["application/json"]}, "body": "${body_b64}"}
+EOF
+
+  vegeta attack \
+    -format=json \
+    -targets="${target_file}" \
+    -rate="${RATE}" \
+    -duration="${OVERHEAD_DURATION}s" \
+    -timeout="$((OVERHEAD_MOCKER_LATENCY_MS / 1000 + 5))s" \
+    -workers=$((RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000)) \
+    -max-workers="${MAX_WORKERS}" > "${WORK_DIR}/attack.bin"
+
+  echo ""
+  log_info "Overhead test complete. Results:"
+  vegeta report < "${WORK_DIR}/attack.bin"
+
+  echo ""
+  log_info "Latency histogram:"
+  # Quoted so the shell never treats [..] as a glob pattern
+  vegeta report -type='hist[0,100us,500us,1ms,5ms,10ms,50ms,100ms]' < "${WORK_DIR}/attack.bin" || log_warn "Histogram generation failed"
+
+  # Extract and compute overhead
+  extract_latencies "${WORK_DIR}/attack.bin"
+
+  log_info "  Raw latencies (ns): min=$EXTRACTED_MIN_NS, mean=$EXTRACTED_MEAN_NS, p50=$EXTRACTED_50_NS, p99=$EXTRACTED_99_NS, max=$EXTRACTED_MAX_NS"
+  log_info "  Success rate: $EXTRACTED_SUCCESS"
+  log_info "  Actual RPS: $(printf "%.0f" $EXTRACTED_RATE) (configured: ${RATE})"
+
+  if [ -z "$EXTRACTED_MIN_NS" ] || [ "$EXTRACTED_MIN_NS" = "0" ] || [ "$EXTRACTED_MIN_NS" = "null" ]; then
+    log_error "Failed to extract latency values from vegeta report"
+    exit 1
+  fi
+
+  # Subtract calibration per bucket: overhead = through_bifrost - direct_to_mocker
+  local us_min=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_MIN_NS - $CAL_MIN_NS) / 1000" | bc))
+  local us_mean=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_MEAN_NS - $CAL_MEAN_NS) / 1000" | bc))
+  local us_50=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_50_NS - $CAL_50_NS) / 1000" | bc))
+  local us_90=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_90_NS - $CAL_90_NS) / 1000" | bc))
+  local us_95=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_95_NS - $CAL_95_NS) / 1000" | bc))
+  local us_99=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_99_NS - $CAL_99_NS) / 1000" | bc))
+  local us_max=$(printf "%.2f" $(echo "scale=4; ($EXTRACTED_MAX_NS - $CAL_MAX_NS) / 1000" | bc))
+
+  # Vegeta's success field is a ratio; convert it to a percentage
+  local success_pct=$(printf "%.2f" $(echo "scale=4; $EXTRACTED_SUCCESS * 100" | bc))
+
+  echo ""
+  log_success "Bifrost overhead (per bucket):"
+  log_info "  Min:  ${us_min}µs"
+  log_info "  Mean: ${us_mean}µs"
+  log_info "  P50:  ${us_50}µs"
+  log_info "  P90:  ${us_90}µs"
+  log_info "  P95:  ${us_95}µs"
+  log_info "  P99:  ${us_99}µs"
+  log_info "  Max:  ${us_max}µs"
+
+  local actual_rps=$(printf "%.0f" $EXTRACTED_RATE)
+
+  # Write results
+  cat > "${RESULTS_FILE}" << EOF
+# Bifrost Load Test Results (single instance, ${actual_rps} RPS)
+
+## Bifrost Processing Overhead
+
+| Metric | Actual RPS | Duration | Concurrent | Success Rate | Min | Mean | P50 | P90 | P95 | P99 | Max |
+|--------|-----------|----------|------------|--------------|-----|------|-----|-----|-----|-----|-----|
+| Overhead | ${actual_rps} | ${OVERHEAD_DURATION}s | ~$((RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000)) | ${success_pct}% | ${us_min}µs | ${us_mean}µs | ${us_50}µs | ${us_90}µs | ${us_95}µs | ${us_99}µs | ${us_max}µs |
+EOF
+
+  echo '{"overhead": {"configured_rate": '"${RATE}"', "actual_rate": '"${actual_rps}"', "duration": '"${OVERHEAD_DURATION}"', "concurrent": '$((RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000))', "success_rate": '"${success_pct}"', "latency_us": {"min": '"${us_min}"', "mean": '"${us_mean}"', "p50": '"${us_50}"', "p90": '"${us_90}"', "p95": '"${us_95}"', "p99": '"${us_99}"', "max": '"${us_max}"'}}, "timestamp": "'"$(date -u +"%Y-%m-%dT%H:%M:%SZ")"'"}' > "${RESULTS_JSON}"
+
+  # Check tiered thresholds (skip Min/Max — single-point extremes are too noisy)
+  local failed=0
+  local labels=("Mean" "P50" "P90" "P95" "P99")
+  local real_values=($EXTRACTED_MEAN_NS $EXTRACTED_50_NS $EXTRACTED_90_NS $EXTRACTED_95_NS $EXTRACTED_99_NS)
+  local cal_values=($CAL_MEAN_NS $CAL_50_NS $CAL_90_NS $CAL_95_NS $CAL_99_NS)
+  local thresholds=($MAX_OVERHEAD_MEAN_US $MAX_OVERHEAD_P50_US $MAX_OVERHEAD_P90_US $MAX_OVERHEAD_P95_US $MAX_OVERHEAD_P99_US)
+  local extras=()
+
+  for i in "${!real_values[@]}"; do
+    # Integer microseconds (shell arithmetic truncates) are enough for the comparison
+    local overhead_us=$(( (real_values[i] - cal_values[i]) / 1000 ))
+    if [ "$overhead_us" -gt "${thresholds[i]}" ]; then
+      # Remember "label:overhead:threshold" for the failure table below
+      extras+=("${labels[i]}:${overhead_us}:${thresholds[i]}")
+      failed=1
+    fi
+  done
+
+  if [ "$failed" -eq 1 ]; then
+    echo ""
+    log_error "FAILED: Bifrost overhead exceeded tiered thresholds"
+    log_error "Overhead  consists of: vegetta overhead and mocker timeout jitter. In real-world the P99 overhead will be approximately 100 microseconds."
+    echo ""
+    echo -e "${RED}| Bucket | Overhead (µs) | Threshold (µs) |${NC}"
+    echo -e "${RED}|--------|---------------|----------------|${NC}"
+    for entry in "${extras[@]}"; do
+      IFS=: read -r bucket overhead threshold <<< "$entry"
+      echo -e "${RED}| ${bucket} | ${overhead}µs | ${threshold}µs |${NC}"
+    done
+    echo ""
+    stop_stats_monitor
+    exit 1
+  fi
+
+  log_success "All overhead buckets within tiered thresholds (mean<${MAX_OVERHEAD_MEAN_US}µs, p50<${MAX_OVERHEAD_P50_US}µs, p90<${MAX_OVERHEAD_P90_US}µs, p95<${MAX_OVERHEAD_P95_US}µs, p99<${MAX_OVERHEAD_P99_US}µs)"
+}
+
+# ============================================================
+# Phase 2: Stress test (mocker at ${STRESS_MOCKER_LATENCY_MS}ms)
+# ============================================================
+
+# Run one sustained-concurrency attack through Bifrost and require a 100%
+# success rate.
+# Arguments: $1 = label (e.g. "Stress #1", "Stress #2"); defaults to "Stress"
+# Writes:    stress.bin and the Vegeta target file in WORK_DIR; appends a
+#            results table to RESULTS_FILE
+# Exits 1 when the success rate is not exactly 100.00%.
+run_stress_test() {
+  local label="${1:-Stress}"
+  local bin_file="${WORK_DIR}/stress.bin"
+
+  echo ""
+  echo "╔═══════════════════════════════════════════════════════════╗"
+  echo "║    ${label}: ${RATE} RPS with ${STRESS_MOCKER_LATENCY_MS}ms mocker latency          ║"
+  echo "╚═══════════════════════════════════════════════════════════╝"
+  echo ""
+  log_info "Testing single Bifrost instance under sustained concurrency"
+  log_info "Duration: ${STRESS_DURATION}s at ${RATE} RPS (${STRESS_MOCKER_LATENCY_MS}ms mocker latency)"
+  # Concurrency/buffer figures match the values written by create_config
+  log_info "Expected concurrent requests: ~$(( RATE * STRESS_MOCKER_LATENCY_MS / 1000 )) (provider concurrency: 20,000, buffer: 40,000)"
+  echo ""
+
+  local target_file="${WORK_DIR}/vegeta-target-stress.json"
+  local payload='{"model":"openai/gpt-4o-mini","messages":[{"role":"user","content":"Hello, how are you?"}]}'
+
+  # Keep the encoded body on one line: GNU base64 wraps its output at 76
+  # columns by default, and a line break inside the JSON string would make the
+  # target invalid.
+  local body_b64
+  body_b64=$(printf '%s' "${payload}" | base64 | tr -d '\n')
+
+  cat > "${target_file}" << EOF
+{"method": "POST", "url": "http://localhost:${BIFROST_PORT}/v1/chat/completions", "header": {"Content-Type": ["application/json"]}, "body": "${body_b64}"}
+EOF
+
+  vegeta attack \
+    -format=json \
+    -targets="${target_file}" \
+    -rate="${RATE}" \
+    -duration="${STRESS_DURATION}s" \
+    -timeout="30s" \
+    -workers=$((RATE * STRESS_MOCKER_LATENCY_MS / 1000)) \
+    -max-workers="${MAX_WORKERS}" > "${bin_file}"
+
+  echo ""
+  log_info "${label} complete. Results:"
+  vegeta report < "${bin_file}"
+
+  echo ""
+  log_info "Latency histogram:"
+  # Quoted so the shell never treats [..] as a glob pattern
+  vegeta report -type='hist[0,1ms,5ms,10ms,50ms,100ms,500ms,1s,5s,10s,15s]' < "${bin_file}" || log_warn "Histogram generation failed"
+
+  # Check success rate
+  extract_latencies "${bin_file}"
+
+  # Vegeta's success field is a ratio; convert it to a percentage
+  local success_pct=$(printf "%.2f" $(echo "scale=4; $EXTRACTED_SUCCESS * 100" | bc))
+
+  log_info "Actual RPS: $(printf "%.0f" $EXTRACTED_RATE) (configured: ${RATE})"
+
+  local stress_actual_rps=$(printf "%.0f" $EXTRACTED_RATE)
+
+  # Append stress test results to results file
+  cat >> "${RESULTS_FILE}" << EOF
+
+## ${label} (${STRESS_MOCKER_LATENCY_MS}ms mocker latency)
+
+| Metric | Actual RPS | Duration | Concurrent | Success Rate | Min | Mean | P50 | P90 | P95 | P99 | Max |
+|--------|-----------|----------|------------|--------------|-----|------|-----|-----|-----|-----|-----|
+| ${label} | ${stress_actual_rps} | ${STRESS_DURATION}s | ~$((RATE * STRESS_MOCKER_LATENCY_MS / 1000)) | ${success_pct}% | $(echo "scale=2; $EXTRACTED_MIN_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_MEAN_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_50_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_90_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_95_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_99_NS / 1000000" | bc)ms | $(echo "scale=2; $EXTRACTED_MAX_NS / 1000000" | bc)ms |
+EOF
+
+  if [ "$success_pct" != "100.00" ]; then
+    echo ""
+    log_error "FAILED: ${label} success rate is ${success_pct}% (expected 100%)"
+    exit 1
+  fi
+
+  log_success "${label} passed: ${success_pct}% success rate"
+}
+
+# ============================================================
+# Finalize
+# ============================================================
+
+finalize_results() {
+  # Append process stats if available
+  local has_overhead_stats=false
+  local has_stress_stats=false
+
+  if [ -n "$OVERHEAD_STATS_CPU_PEAK" ]; then
+    has_overhead_stats=true
+  fi
+  if [ -n "$STATS_CPU_PEAK" ]; then
+    has_stress_stats=true
+  fi
+
+  if [ "$has_overhead_stats" = true ] || [ "$has_stress_stats" = true ]; then
+    cat >> "${RESULTS_FILE}" << 'EOF'
+
+## Bifrost Process Stats (single instance)
+
+| Phase | CPU Avg | CPU Peak | RSS Avg | RSS Peak |
+|-------|---------|----------|---------|----------|
+EOF
+
+    if [ "$has_overhead_stats" = true ]; then
+      echo "| Overhead | ${OVERHEAD_STATS_CPU_AVG}% | ${OVERHEAD_STATS_CPU_PEAK}% | ${OVERHEAD_STATS_RSS_AVG}MB | ${OVERHEAD_STATS_RSS_PEAK}MB |" >> "${RESULTS_FILE}"
+    fi
+    if [ "$has_stress_stats" = true ]; then
+      echo "| Stress | ${STATS_CPU_AVG}% | ${STATS_CPU_PEAK}% | ${STATS_RSS_AVG}MB | ${STATS_RSS_PEAK}MB |" >> "${RESULTS_FILE}"
+    fi
+  fi
+
+  cat >> "${RESULTS_FILE}" << EOF
+
+## Method
+
+- **Single instance**: All tests run against one bifrost-http process at ${RATE} RPS
+- **Overhead measurement**: Mocker at ${OVERHEAD_MOCKER_LATENCY_MS}ms latency, calibration (Vegeta->Mocker) subtracted from test (Vegeta->Bifrost->Mocker)
+- **Stress test**: Mocker at ${STRESS_MOCKER_LATENCY_MS}ms latency, verifies 100% success under sustained concurrency
+
+## Notes
+
+- Overhead values are in microseconds (µs), stress test values in milliseconds (ms)
+- Overhead ignores the mocker jitter, local network request queuing. In real-world the P99 overhead will be approximately 100 microseconds.
+- Tiered overhead thresholds: mean<${MAX_OVERHEAD_MEAN_US}µs, p50<${MAX_OVERHEAD_P50_US}µs, p90<${MAX_OVERHEAD_P90_US}µs, p95<${MAX_OVERHEAD_P95_US}µs, p99<${MAX_OVERHEAD_P99_US}µs
+- P50/P90/P95/P99 represent percentile latencies
+
+---
+*Generated by Bifrost Load Test Script*
+EOF
+
+  # Update JSON with stress results and process stats
+  local tmp_json=$(mktemp)
+  if command -v jq &> /dev/null; then
+    jq --arg sr "$(printf "%.2f" $(echo "scale=4; $EXTRACTED_SUCCESS * 100" | bc))" \
+       --arg cpu_avg "${STATS_CPU_AVG:-0}" --arg cpu_peak "${STATS_CPU_PEAK:-0}" \
+       --arg rss_avg "${STATS_RSS_AVG:-0}" --arg rss_peak "${STATS_RSS_PEAK:-0}" \
+       --arg oh_cpu_avg "${OVERHEAD_STATS_CPU_AVG:-0}" --arg oh_cpu_peak "${OVERHEAD_STATS_CPU_PEAK:-0}" \
+       --arg oh_rss_avg "${OVERHEAD_STATS_RSS_AVG:-0}" --arg oh_rss_peak "${OVERHEAD_STATS_RSS_PEAK:-0}" \
+       '.stress = {"rate": '"${RATE}"', "duration": '"${STRESS_DURATION}"', "mocker_latency_ms": '"${STRESS_MOCKER_LATENCY_MS}"', "success_rate": ($sr | tonumber)} | .process_stats = {"overhead": {"cpu_avg_pct": ($oh_cpu_avg | tonumber), "cpu_peak_pct": ($oh_cpu_peak | tonumber), "rss_avg_mb": ($oh_rss_avg | tonumber), "rss_peak_mb": ($oh_rss_peak | tonumber)}, "stress": {"cpu_avg_pct": ($cpu_avg | tonumber), "cpu_peak_pct": ($cpu_peak | tonumber), "rss_avg_mb": ($rss_avg | tonumber), "rss_peak_mb": ($rss_peak | tonumber)}}' \
+       "${RESULTS_JSON}" > "${tmp_json}"
+    mv "${tmp_json}" "${RESULTS_JSON}"
+  fi
+
+  log_success "Results saved to:"
+  log_info "  - Markdown: ${RESULTS_FILE}"
+  log_info "  - JSON: ${RESULTS_JSON}"
+}
+
+# Main execution
+main() {
+  echo ""
+  echo "╔═══════════════════════════════════════════════════════════╗"
+  echo "║       Bifrost Load Test (single instance, ${RATE} RPS)        ║"
+  echo "╚═══════════════════════════════════════════════════════════╝"
+  echo ""
+
+  log_info "Configuration: single bifrost-http instance, ${RATE} RPS"
+  log_info "Provider concurrency: 15,000 (buffer: 20,000)"
+  log_info "Overhead thresholds: mean<${MAX_OVERHEAD_MEAN_US}µs, p50<${MAX_OVERHEAD_P50_US}µs, p90<${MAX_OVERHEAD_P90_US}µs, p95<${MAX_OVERHEAD_P95_US}µs, p99<${MAX_OVERHEAD_P99_US}µs"
+  log_info "Phase 1: Overhead measurement — ${OVERHEAD_MOCKER_LATENCY_MS}ms mocker, ${OVERHEAD_DURATION}s, ~$(( RATE * OVERHEAD_MOCKER_LATENCY_MS / 1000 )) concurrent requests"
+  log_info "Phase 2: Stress test — ${STRESS_MOCKER_LATENCY_MS}ms mocker, ${STRESS_DURATION}s, ~$(( RATE * STRESS_MOCKER_LATENCY_MS / 1000 )) concurrent requests"
+
+  check_dependencies
+  install_vegeta
+  build_bifrost_http
+  setup_mocker
+  build_mocker
+  create_config
+  cleanup_ports
+
+  # ── Phase 1: Overhead measurement with ${OVERHEAD_MOCKER_LATENCY_MS}ms mocker ──
+  start_mocker ${OVERHEAD_MOCKER_LATENCY_MS}
+  start_bifrost
+  start_stats_monitor
+
+  run_calibration
+  run_overhead_test
+
+  # ── Collect process stats from overhead phase ──
+  stop_stats_monitor
+  OVERHEAD_STATS_CPU_AVG="${STATS_CPU_AVG}"
+  OVERHEAD_STATS_CPU_PEAK="${STATS_CPU_PEAK}"
+  OVERHEAD_STATS_RSS_AVG="${STATS_RSS_AVG}"
+  OVERHEAD_STATS_RSS_PEAK="${STATS_RSS_PEAK}"
+
+  # ── Phase 2: Stress test with high-latency mocker ──
+  # Restart both mocker and bifrost to ensure a clean fasthttp connection pool.
+  # Without restarting bifrost, stale TCP connections from the overhead phase
+  # (which used a different mocker process) cause immediate 400s on POST requests
+  # because fasthttp does not retry non-idempotent methods on broken connections.
+  stop_mocker
+  stop_bifrost
+  start_mocker ${STRESS_MOCKER_LATENCY_MS}
+  start_bifrost
+  start_stats_monitor
+
+  run_stress_test "Stress #1"
+
+  echo ""
+  log_info "Waiting 30s before second stress test (idle period)..."
+  sleep 30
+
+  run_stress_test "Stress #2"
+
+  # ── Collect process stats from stress phase ──
+  stop_stats_monitor
+
+  # ── Finalize ──
+  finalize_results
+
+  cleanup_ports
+  echo ""
+
+  # Print final summary
+  echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗"
+  echo "║                                                         FINAL RESULTS SUMMARY                                                                                    ║"
+  echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝"
+  echo ""
+  cat "${RESULTS_FILE}"
+  echo ""
+  log_success "All tests passed!"
+}
+
+main "$@"