Files
bifrost/tests/load_test_parameter_ordering.sh
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

764 lines
26 KiB
Bash
Executable File

#!/bin/bash
# Load test: detect JSON key ordering mutations in Bifrost's request proxying.
# Sends randomized payloads with different schema shapes and compares the input
# key order against what Bifrost actually sent to the provider (via
# extra_fields.raw_request with json.RawMessage preservation).
#
# Validates:
# - Tool parameter key ordering at every nesting level (properties, $defs, nested schemas)
# - tool_choice serialization (key ordering, no extra zero-value fields like "custom"/"allowed_tools")
# - Multiple tool schemas, deeply nested objects, adversarial property orderings
#
# Each request randomly picks from 8 different payload shapes to maximize coverage.
#
# This catches both:
# - Consistent mutations (struct field order overriding client order) — 100% rate
# - Sporadic mutations (sync.Pool reuse, concurrency bugs) — variable rate
#
# Prerequisites:
# - Bifrost running with send_back_raw_request: true on the openai provider
# - OpenAI provider pointed at a mock server (any 200 response works)
#
# Usage: ./tests/load_test_parameter_ordering.sh [rps] [duration]
# rps - requests per second (default: 20)
# duration - how many seconds to run (default: 10)
# Endpoint every request is POSTed to.
BIFROST_URL="http://localhost:8080/litellm/v1/chat/completions"

# Positional arguments with their defaults: requests per second, and how many
# seconds to keep launching requests.
RPS="${1:-20}"
DURATION="${2:-10}"

# Both values feed arithmetic expansion and loop bounds below, so reject
# anything that is not a positive decimal integer. Leading zeros are refused
# as well because $(( )) would interpret them as octal.
for _arg in "$RPS" "$DURATION"; do
  if ! [[ "$_arg" =~ ^[1-9][0-9]*$ ]]; then
    echo "Usage: $0 [rps] [duration]  (both must be positive integers)" >&2
    exit 64
  fi
done
unset _arg

NUM_REQUESTS=$((RPS * DURATION))

# ANSI colour escapes; they are only interpreted when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Scratch directory for payloads, responses and logs. Abort if it cannot be
# created: with an empty TMPDIR every later "$TMPDIR/..." path would resolve
# to the filesystem root.
TMPDIR=$(mktemp -d) || {
  echo "ERROR: mktemp -d failed; cannot create a scratch directory" >&2
  exit 73
}
# ${TMPDIR:?} makes the cleanup refuse to run rm -rf with an empty path.
trap 'rm -rf -- "${TMPDIR:?}"' EXIT
# ---------------------------------------------------------------------------
# Payload 1: Standard — non-alpha properties, $defs after required, function tool_choice
# Four properties in non-alphabetical order whose inner keys
# (type/description/title) also appear in differing orders, "$defs" as the
# last key of "parameters", and an object-form tool_choice naming the function.
# The heredoc delimiter is quoted ('EOF') so "$defs" and "$ref" are written
# literally instead of being expanded by the shell.
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_1.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"temperature": 0,
"tools": [
{
"type": "function",
"function": {
"name": "structured_response",
"description": "Generate a structured response",
"parameters": {
"type": "object",
"properties": {
"reasoning": {
"type": "string",
"description": "Step by step reasoning",
"title": "Reasoning"
},
"summary": {
"type": "string",
"description": "The final summary",
"title": "Summary"
},
"tags": {
"description": "Relevant tags",
"items": {"$ref": "#/$defs/Tag"},
"title": "Tags",
"type": "array"
},
"confidence": {
"description": "Confidence score",
"title": "Confidence",
"type": "number"
}
},
"required": ["reasoning", "summary", "tags", "confidence"],
"$defs": {
"Tag": {
"type": "object",
"description": "A tag",
"required": ["label"],
"properties": {
"label": {"description": "The tag label", "title": "Label", "type": "string"},
"score": {"description": "Relevance score", "title": "Score", "type": "number"}
},
"title": "Tag"
}
}
}
}
}
],
"tool_choice": {
"type": "function",
"function": {"name": "structured_response"}
}
}
EOF
# ---------------------------------------------------------------------------
# Payload 2: Reverse-alpha properties, $defs at TOP, string tool_choice "auto"
# Property names z_ y_ x_ w_ would get reordered to w_ x_ y_ z_ if sorted.
# "$defs" is the first key of "parameters" here, and inside ZItem "z_name"
# precedes "a_value", so alphabetical sorting is visible at both levels.
# Quoted 'EOF' keeps "$defs"/"$ref" literal (no shell expansion).
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_2.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"tools": [
{
"type": "function",
"function": {
"name": "reverse_alpha_tool",
"parameters": {
"$defs": {
"ZItem": {
"type": "object",
"properties": {
"z_name": {"type": "string"},
"a_value": {"type": "number"}
},
"required": ["z_name"]
}
},
"type": "object",
"properties": {
"z_output": {"type": "string", "description": "Last alphabetically, first in schema"},
"y_reasoning": {"type": "string", "description": "Second to last"},
"x_items": {
"type": "array",
"items": {"$ref": "#/$defs/ZItem"},
"description": "Third to last"
},
"w_confidence": {"type": "number", "description": "Fourth to last"}
},
"required": ["z_output", "y_reasoning"]
}
}
}
],
"tool_choice": "auto"
}
EOF
# ---------------------------------------------------------------------------
# Payload 3: Multiple tools, deeply nested objects, string tool_choice "required"
# Two tools in one request. The first nests objects three levels deep
# (parameters -> output -> metadata); the second carries a nested "filters"
# object. Property order is non-alphabetical at every level.
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_3.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"tools": [
{
"type": "function",
"function": {
"name": "deep_nested_tool",
"description": "Tool with 3-level nesting",
"parameters": {
"type": "object",
"properties": {
"output": {
"type": "object",
"description": "Nested output",
"properties": {
"verdict": {"type": "string"},
"metadata": {
"type": "object",
"properties": {
"timestamp": {"type": "string"},
"source": {"type": "string"},
"confidence": {"type": "number"},
"author": {"type": "string"}
}
},
"score": {"type": "number"}
}
},
"chain_of_thought": {"type": "string"},
"answer": {"type": "string"}
},
"required": ["output", "answer"]
}
}
},
{
"type": "function",
"function": {
"name": "secondary_tool",
"description": "A second tool to verify multi-tool ordering",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string", "description": "Search query"},
"max_results": {"type": "integer", "description": "Limit"},
"filters": {
"type": "object",
"properties": {
"date_range": {"type": "string"},
"category": {"type": "string"},
"active_only": {"type": "boolean"}
}
}
},
"required": ["query"]
}
}
}
],
"tool_choice": "required"
}
EOF
# ---------------------------------------------------------------------------
# Payload 4: Many properties in adversarial order (zigzag), no $defs, tool_choice "none"
# Names deliberately interleave early/late alphabet letters
# (zebra, apple, yarn, banana, ...), ten properties in total. The request
# also sets "temperature" and "max_tokens" ahead of "tools".
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_4.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"temperature": 0.7,
"max_tokens": 500,
"tools": [
{
"type": "function",
"function": {
"name": "zigzag_tool",
"description": "Properties in zigzag alphabetical order",
"parameters": {
"type": "object",
"properties": {
"zebra": {"type": "string"},
"apple": {"type": "string"},
"yarn": {"type": "number"},
"banana": {"type": "boolean"},
"xenon": {"type": "string"},
"cherry": {"type": "integer"},
"walnut": {"type": "string"},
"date": {"type": "array", "items": {"type": "string"}},
"violet": {"type": "number"},
"elderberry": {"type": "string"}
},
"required": ["zebra", "apple", "yarn"]
}
}
}
],
"tool_choice": "none"
}
EOF
# ---------------------------------------------------------------------------
# Payload 5: $defs with multiple definitions, additionalProperties, nested $ref
# "$defs" (Citation, Metadata) is the first key of "parameters" and
# "additionalProperties" the last. "citations" references a definition through
# "items", while "metadata" is a bare "$ref". tool_choice is the object form.
# Quoted 'EOF' keeps "$defs"/"$ref" literal (no shell expansion).
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_5.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"tools": [
{
"type": "function",
"function": {
"name": "AnswerResponseModel",
"description": "Realistic pydantic-generated schema",
"parameters": {
"$defs": {
"Citation": {
"type": "object",
"properties": {
"url": {"type": "string", "description": "Source URL"},
"text": {"type": "string", "description": "Cited text"},
"page_number": {"type": "integer", "description": "Page"}
},
"required": ["url", "text"]
},
"Metadata": {
"type": "object",
"properties": {
"model_version": {"type": "string"},
"latency_ms": {"type": "number"},
"token_count": {"type": "integer"}
},
"required": ["model_version"]
}
},
"type": "object",
"properties": {
"answer": {"type": "string", "description": "The answer"},
"chain_of_thought": {"type": "string", "description": "Reasoning steps"},
"citations": {
"type": "array",
"items": {"$ref": "#/$defs/Citation"},
"description": "Supporting citations"
},
"is_unanswered": {"type": "boolean", "description": "Whether answerable"},
"metadata": {"$ref": "#/$defs/Metadata"}
},
"required": ["answer", "is_unanswered"],
"additionalProperties": false
}
}
}
],
"tool_choice": {
"type": "function",
"function": {"name": "AnswerResponseModel"}
}
}
EOF
# ---------------------------------------------------------------------------
# Payload 6: Minimal single-property tool, no tool_choice — tests baseline passthrough
# Also uses top-level keys in non-standard order (tools before messages).
# Because "tool_choice" is absent from the input, the analyzer reports a
# mutation if one shows up in the forwarded request.
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_6.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"tools": [
{
"type": "function",
"function": {
"name": "simple_extractor",
"parameters": {
"type": "object",
"properties": {
"result": {"type": "string", "description": "Extracted result"}
},
"required": ["result"]
}
}
}
],
"messages": [{"role": "user", "content": "test"}],
"temperature": 0
}
EOF
# ---------------------------------------------------------------------------
# Payload 7: EXACT reproduction of reported bug — Issue 1 + 2 + 3 combined
# tool_choice: {type, function} with AnswerResponseModel (Issue 1)
# properties: answer, chain_of_thought, citations, is_unanswered (Issue 2)
# $defs with Citation at TOP of parameters object (Issue 3)
# Note that "type": "object" is the LAST key of "parameters" in this payload.
# Quoted 'EOF' keeps "$defs"/"$ref" literal (no shell expansion).
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_7.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"tools": [
{
"type": "function",
"function": {
"name": "AnswerResponseModel",
"parameters": {
"$defs": {
"Citation": {
"type": "object",
"properties": {
"url": {"type": "string"},
"text": {"type": "string"}
},
"required": ["url", "text"]
}
},
"properties": {
"answer": {"type": "string", "description": "The answer"},
"chain_of_thought": {"type": "string", "description": "Reasoning"},
"citations": {
"type": "array",
"items": {"$ref": "#/$defs/Citation"}
},
"is_unanswered": {"type": "boolean"}
},
"required": ["answer", "is_unanswered"],
"type": "object"
}
}
}
],
"tool_choice": {
"type": "function",
"function": {
"name": "AnswerResponseModel"
}
}
}
EOF
# ---------------------------------------------------------------------------
# Payload 8: 26 properties in exact reverse alphabetical order (zulu ... alpha)
# to maximize reorder detection.
# tool_choice is the plain string "auto" in this payload; it does not cycle.
# The other string forms are covered elsewhere ("required" in payload 3,
# "none" in payload 4). The analyzer checks that a string tool_choice is
# forwarded as the same string and not expanded to a struct.
# ---------------------------------------------------------------------------
cat > "$TMPDIR/payload_8.json" << 'EOF'
{
"model": "openai/gpt-4.1",
"messages": [{"role": "user", "content": "test"}],
"tools": [
{
"type": "function",
"function": {
"name": "reverse_order_check",
"parameters": {
"type": "object",
"properties": {
"zulu": {"type": "string"},
"yankee": {"type": "string"},
"x_ray": {"type": "string"},
"whiskey": {"type": "number"},
"victor": {"type": "boolean"},
"uniform": {"type": "string"},
"tango": {"type": "integer"},
"sierra": {"type": "string"},
"romeo": {"type": "number"},
"quebec": {"type": "string"},
"papa": {"type": "boolean"},
"oscar": {"type": "string"},
"november": {"type": "string"},
"mike": {"type": "number"},
"lima": {"type": "string"},
"kilo": {"type": "boolean"},
"juliet": {"type": "string"},
"india": {"type": "integer"},
"hotel": {"type": "string"},
"golf": {"type": "number"},
"foxtrot": {"type": "string"},
"echo_field": {"type": "boolean"},
"delta": {"type": "string"},
"charlie": {"type": "number"},
"bravo": {"type": "string"},
"alpha": {"type": "string"}
},
"required": ["zulu", "alpha"]
}
}
}
],
"tool_choice": "auto"
}
EOF
# Number of payload_N.json fixtures written above; send_and_check picks a
# random N in 1..NUM_PAYLOADS, so this must match the files that exist.
NUM_PAYLOADS=8
# ---------------------------------------------------------------------------
# Python analyzer — compares input vs output key ordering at every nesting level
#
# Invocation: python3 analyze.py <response.json> <request-index> <payload.json>
# It prints exactly one status line on stdout, which the tally loop parses:
#   OK:<idx> | MUTATED:<idx> | NO_RAW_REQUEST:<idx> | PARSE_ERROR:<idx>:<msg>
# and one " DETAIL:..." line per mutation on stderr.
# The heredoc delimiter is quoted ('PYEOF') so the shell does not expand "$defs"
# or any other "$..." inside the Python source.
# ---------------------------------------------------------------------------
cat > "$TMPDIR/analyze.py" << 'PYEOF'
import json, sys, os
from collections import OrderedDict


def extract_key_orders(obj, path=""):
    """Recursively extract key orders from all nested dicts.

    Returns a dict of {path: [keys]} for every object in the tree. Dicts that
    sit directly inside a list are visited under "<path>[<index>]"; lists
    nested directly inside lists are not descended into.
    """
    if not isinstance(obj, dict):
        return {}
    result = {path: list(obj.keys())}
    for key, val in obj.items():
        child_path = f"{path}.{key}" if path else key
        if isinstance(val, dict):
            result.update(extract_key_orders(val, child_path))
        elif isinstance(val, list):
            for i, item in enumerate(val):
                if isinstance(item, dict):
                    result.update(extract_key_orders(item, f"{child_path}[{i}]"))
    return result


def get_all_tool_parameters(payload):
    """Extract tool function parameters from ALL tools in a chat completion payload.

    Returns a list of (tool_name, parameters_dict) tuples, in payload order.
    Tools without a "parameters" entry are skipped.
    """
    tools = payload.get("tools", [])
    result = []
    for tool in tools:
        func = tool.get("function", {})
        name = func.get("name", "unknown")
        params = func.get("parameters")
        if params is not None:
            result.append((name, params))
    return result


def check_tool_choice(input_payload, raw_request):
    """Check tool_choice serialization: type, key ordering and no extra fields.

    Returns a list of (description, input, output) mutation tuples.

    Catches Issue 1 from the bug report:
    - Zero-value fields injected: "custom":{"name":""}, "allowed_tools":{"mode":"","tools":null}
    - Key reordering: "type" moving from first to last position
    - String tool_choice ("auto"/"none"/"required") being expanded to struct
    and additionally an object tool_choice coming back as a non-object.
    """
    mutations = []
    input_tc = input_payload.get("tool_choice")
    output_tc = raw_request.get("tool_choice")

    if input_tc is None and output_tc is None:
        return mutations
    if input_tc is None:
        mutations.append(("tool_choice (injected)", None, output_tc))
        return mutations
    if output_tc is None:
        mutations.append(("tool_choice (dropped)", input_tc, None))
        return mutations

    # Issue 1: string tool_choice must stay as string, not become struct
    if isinstance(input_tc, str):
        if isinstance(output_tc, dict):
            mutations.append(("tool_choice (string->struct)", input_tc, list(output_tc.keys())))
        elif output_tc != input_tc:
            mutations.append(("tool_choice (string)", input_tc, output_tc))
        return mutations

    # The reverse direction: an object must not be collapsed into a string or
    # any other non-object value. Without this branch such a change would
    # fall through every check below and be reported as OK.
    if isinstance(input_tc, dict) and not isinstance(output_tc, dict):
        mutations.append(("tool_choice (struct->non-struct)", list(input_tc.keys()), output_tc))
        return mutations

    if isinstance(input_tc, dict) and isinstance(output_tc, dict):
        input_keys = list(input_tc.keys())
        output_keys = list(output_tc.keys())

        # Issue 1.2: Check for zero-value fields from unused union variants.
        # These are the exact fields reported in the bug:
        zero_value_fields = {"custom", "allowed_tools"}
        injected = zero_value_fields & (set(output_keys) - set(input_keys))
        if injected:
            mutations.append(("tool_choice (zero-value fields injected)", sorted(injected), [
                f'{k}={json.dumps(output_tc[k])}' for k in sorted(injected)
            ]))

        # Any other extra fields
        other_extra = set(output_keys) - set(input_keys) - zero_value_fields
        if other_extra:
            mutations.append(("tool_choice (unexpected extra fields)", [], list(other_extra)))

        # Issue 1.2: Check key ordering — "type" should stay first, not move to end
        if input_keys != output_keys:
            mutations.append(("tool_choice (key order)", input_keys, output_keys))

        # Recursively check nested key orders (e.g. function object)
        input_tc_orders = extract_key_orders(input_tc, "tool_choice")
        output_tc_orders = extract_key_orders(output_tc, "tool_choice")
        for path, inp_keys in input_tc_orders.items():
            if path == "tool_choice":
                # Top level was compared just above; skip it so the same
                # mismatch is not listed twice.
                continue
            out_keys = output_tc_orders.get(path)
            if out_keys is not None and inp_keys != out_keys:
                mutations.append((path, inp_keys, out_keys))

    return mutations


def check_defs_position(input_params, output_params, tool_idx):
    """Check that $defs stays in its original position within the parameters object.

    Catches Issue 3 from the bug report:
    - $defs at top of parameters moves to bottom after round-trip
    The same check is applied to a "definitions" key when both sides have one.
    Returns a list of (description, input, output) mutation tuples.
    """
    mutations = []
    input_keys = list(input_params.keys())
    output_keys = list(output_params.keys())
    for defs_key in ("$defs", "definitions"):
        if defs_key in input_keys and defs_key in output_keys:
            input_pos = input_keys.index(defs_key)
            output_pos = output_keys.index(defs_key)
            if input_pos != output_pos:
                mutations.append((
                    f"tools[{tool_idx}].parameters ({defs_key} position)",
                    f"{defs_key} at index {input_pos} in {input_keys}",
                    f"{defs_key} at index {output_pos} in {output_keys}"
                ))
    return mutations


# Analyze response
resp_file = sys.argv[1]
idx = sys.argv[2]
payload_file = sys.argv[3]

try:
    # Load input payload (the known-good key order)
    with open(payload_file) as f:
        input_payload = json.load(f, object_pairs_hook=OrderedDict)
    input_tool_params = get_all_tool_parameters(input_payload)
    if not input_tool_params:
        print(f"PARSE_ERROR:{idx}:no tool parameters in input payload")
        sys.exit(0)

    with open(resp_file) as f:
        resp = json.load(f, object_pairs_hook=OrderedDict)
    # "or {}" also covers an explicit `"extra_fields": null` in the response.
    raw_request = (resp.get("extra_fields") or {}).get("raw_request")
    if raw_request is None:
        print(f"NO_RAW_REQUEST:{idx}")
        sys.exit(0)

    output_tool_params = get_all_tool_parameters(raw_request)
    if not output_tool_params:
        print(f"PARSE_ERROR:{idx}:no tool parameters in raw_request")
        sys.exit(0)

    mutations = []

    # Compare each tool's parameter key ordering (Issue 2: properties reordering)
    for i, (inp_name, inp_params) in enumerate(input_tool_params):
        if i >= len(output_tool_params):
            mutations.append((f"tool[{i}] missing", inp_name, "MISSING"))
            continue
        out_name, out_params = output_tool_params[i]
        if out_name != inp_name:
            # A different tool sits at this index (reordered or renamed), so
            # comparing the two schemas key by key would be meaningless.
            mutations.append((f"tools[{i}] (name)", inp_name, out_name))
            continue

        input_orders = extract_key_orders(inp_params, f"tools[{i}].parameters")
        output_orders = extract_key_orders(out_params, f"tools[{i}].parameters")
        for path, input_keys in input_orders.items():
            output_keys = output_orders.get(path)
            if output_keys is None:
                # The object is absent from the output; the parent's key list
                # differs in that case and is reported at the parent path.
                continue
            if input_keys != output_keys:
                mutations.append((path, input_keys, output_keys))

        # Issue 3: $defs position within parameters object
        mutations.extend(check_defs_position(inp_params, out_params, i))

    # Issue 1: tool_choice serialization (zero-value fields, key ordering)
    mutations.extend(check_tool_choice(input_payload, raw_request))

    if not mutations:
        print(f"OK:{idx}")
    else:
        payload_num = os.path.basename(payload_file).replace("payload_", "").replace(".json", "")
        print(f"MUTATED:{idx}")
        for path, inp, out in mutations:
            label = path if path else "parameters"
            print(f" DETAIL:{idx}:P{payload_num}:{label}: input={inp} -> output={out}", file=sys.stderr)
except Exception as e:
    # sys.exit() raises SystemExit, which is not an Exception subclass, so the
    # early exits above are not swallowed here.
    print(f"PARSE_ERROR:{idx}:{e}")
PYEOF
# Startup banner: run parameters followed by a legend of the payload variants.
# Only the title line carries colour escapes, so it alone goes through
# printf '%b'; the rest is emitted verbatim from one here-document
# (\$ keeps the literal "$defs" from being expanded).
printf '%b\n' "${CYAN}JSON Serialization Fidelity — Input vs Output Validator${NC}"
cat << BANNER
==========================================================
Target: $BIFROST_URL
RPS: $RPS
Duration: ${DURATION}s
Total: $NUM_REQUESTS requests
Payloads: $NUM_PAYLOADS variants (randomly selected per request)

Validates that Bifrost preserves the client's original JSON
key ordering (tool parameters + tool_choice) and doesn't
inject extra zero-value fields.

Payload variants:
 P1: Standard — non-alpha properties, \$defs after required, function tool_choice
 P2: Reverse-alpha properties, \$defs at TOP, tool_choice "auto"
 P3: Multiple tools, 3-level nested objects, tool_choice "required"
 P4: 10 zigzag-ordered properties, no \$defs, tool_choice "none"
 P5: Multiple \$defs, additionalProperties, pydantic-style schema
 P6: Minimal single-property tool, no tool_choice, non-standard top-level order
 P7: EXACT bug report reproduction — all 3 issues in one payload
 P8: 26 reverse-alpha NATO properties — maximum reorder detection
==========================================================

BANNER
#######################################
# POST one randomly chosen payload to Bifrost and analyze the response.
# Globals:   NUM_PAYLOADS, TMPDIR, BIFROST_URL (read)
# Arguments: $1 - request index, used in the response file name and in the
#                 status line
# Outputs:   one status line on stdout: "HTTP_ERROR:<idx>:<code>" when the
#            HTTP status is not 200, otherwise whatever analyze.py prints
#######################################
send_and_check() {
  local request_id=$1
  local variant=$(( RANDOM % NUM_PAYLOADS + 1 ))
  local body_path="$TMPDIR/payload_${variant}.json"
  local response_path="$TMPDIR/resp_${request_id}.json"

  # -w prints only the status code on stdout; the body goes to the -o file.
  local -a curl_args=(
    -s -o "$response_path" -w "%{http_code}"
    -X POST "$BIFROST_URL"
    -H "Content-Type: application/json"
    -d @"$body_path"
    --max-time 30
  )

  local status
  status=$(curl "${curl_args[@]}" 2>/dev/null)

  if [[ "$status" == "200" ]]; then
    python3 "$TMPDIR/analyze.py" "$response_path" "$request_id" "$body_path"
  else
    echo "HTTP_ERROR:${request_id}:${status}"
  fi
}
# Make the worker function and the settings it reads visible to child shells.
export -f send_and_check
export BIFROST_URL TMPDIR NUM_PAYLOADS

# Pacing: launch RPS background requests, pause one second, and repeat for
# DURATION rounds. Status lines are appended to results.txt and per-mutation
# details (stderr) to details.log.
idx=0
for ((sec = 1; sec <= DURATION; sec++)); do
  for ((launched = 0; launched < RPS; launched++)); do
    idx=$((idx + 1))
    send_and_check "$idx" >> "$TMPDIR/results.txt" 2>> "$TMPDIR/details.log" &
  done
  echo -e " Second $sec/$DURATION — launched $RPS requests"
  sleep 1
done

# Barrier: block until every background request has finished.
wait
# Tally the status lines written by the workers. Each line has the form
# "<STATUS>:<idx>[:<extra>]"; one counter per status, plus a console line for
# every non-OK result. A missing results file simply yields zero counts.
results=$(cat "$TMPDIR/results.txt" 2>/dev/null)
OK=0
MUTATED=0
HTTP_ERRORS=0
PARSE_ERRORS=0
NO_RAW=0
while IFS= read -r line; do
  # Split on ':' once: field 2 is the request index, field 3 the HTTP code.
  IFS=: read -r _status field2 field3 _rest <<< "$line"
  case "$line" in
    OK:*)
      OK=$((OK + 1))
      ;;
    MUTATED:*)
      MUTATED=$((MUTATED + 1))
      idx=$field2
      printf '%b\n' "${RED} [MUTATED] Request #${idx}${NC}"
      ;;
    HTTP_ERROR:*)
      HTTP_ERRORS=$((HTTP_ERRORS + 1))
      idx=$field2
      code=$field3
      printf '%b\n' "${YELLOW} [HTTP ${code}] Request #${idx}${NC}"
      ;;
    NO_RAW_REQUEST:*)
      NO_RAW=$((NO_RAW + 1))
      printf '%b\n' "${YELLOW} [NO RAW REQUEST] Request #${field2} - is send_back_raw_request enabled?${NC}"
      ;;
    PARSE_ERROR:*)
      PARSE_ERRORS=$((PARSE_ERRORS + 1))
      printf '%b\n' "${YELLOW} [PARSE ERROR] ${line}${NC}"
      ;;
  esac
done <<< "$results"
# Summary and exit status.
#   0 - at least one request was verified and none was mutated
#   1 - at least one mutation was detected
#   2 - responses carried no raw_request (provider config issue)
#   3 - nothing could be verified at all (only HTTP/parse errors, or no
#       results were recorded), so a "pass" would be vacuous
# TOTAL counts requests that produced a status line; it can be lower than
# NUM_REQUESTS when a worker wrote nothing to results.txt.
TOTAL=$((OK + MUTATED + HTTP_ERRORS + PARSE_ERRORS + NO_RAW))
echo ""
echo "=========================================================="
echo -e "${CYAN}Results (${TOTAL}/${NUM_REQUESTS} completed):${NC}"
echo -e " ${GREEN}OK (order preserved): $OK${NC}"
echo -e " ${RED}MUTATED (reordered): $MUTATED${NC}"
echo -e " ${YELLOW}HTTP errors: $HTTP_ERRORS${NC}"
echo -e " ${YELLOW}No raw request: $NO_RAW${NC}"
echo -e " ${YELLOW}Parse errors: $PARSE_ERRORS${NC}"
echo "=========================================================="
if [ "$MUTATED" -gt 0 ]; then
  # MUTATED > 0 implies TOTAL > 0, so the division cannot be by zero.
  RATE=$(python3 -c "print(f'{$MUTATED/$TOTAL*100:.1f}')" 2>/dev/null || echo "?")
  echo ""
  echo -e "${RED}MUTATION RATE: ${RATE}% ($MUTATED / $TOTAL)${NC}"
  echo ""
  echo -e "${CYAN}Key order mutations (input vs output):${NC}"
  # Cap the detail dump so a fully mutated run does not flood the terminal.
  head -n 50 "$TMPDIR/details.log" 2>/dev/null
  exit 1
elif [ "$NO_RAW" -gt 0 ]; then
  echo ""
  echo -e "${YELLOW}WARNING: No raw_request in responses. Enable send_back_raw_request in provider config.${NC}"
  exit 2
elif [ "$OK" -eq 0 ]; then
  # No request was actually checked; reporting success here would hide an
  # unreachable server or a broken analyzer.
  echo ""
  echo -e "${RED}ERROR: No request could be verified ($HTTP_ERRORS HTTP errors, $PARSE_ERRORS parse errors, $TOTAL/$NUM_REQUESTS results recorded). Is Bifrost reachable at $BIFROST_URL?${NC}"
  exit 3
else
  if [ "$OK" -lt "$NUM_REQUESTS" ]; then
    # Partial coverage keeps the passing exit status but is made visible.
    echo ""
    echo -e "${YELLOW}WARNING: only $OK of $NUM_REQUESTS requests were verified; the rest failed with HTTP/parse errors or produced no result.${NC}"
  fi
  echo ""
  echo -e "${GREEN}All $OK requests preserved the original JSON key ordering across all $NUM_PAYLOADS payload variants.${NC}"
  exit 0
fi