3783 lines
154 KiB
Python
3783 lines
154 KiB
Python
"""
|
|
Anthropic Integration Tests - Cross-Provider Support
|
|
|
|
CROSS-PROVIDER TESTING:
|
|
This test suite uses the Anthropic SDK to test against multiple AI providers through Bifrost.
|
|
Tests automatically run against all available providers with proper capability filtering.
|
|
|
|
Note: Tests automatically skip for providers that don't support specific capabilities.
|
|
Example: thinking tests run only for Anthropic; speech/transcription tests are skipped for all providers when using the Anthropic SDK.
|
|
|
|
Tests all core scenarios using Anthropic SDK directly:
|
|
1. Simple chat
|
|
2. Multi turn conversation
|
|
3. Tool calls
|
|
4. Multiple tool calls
|
|
5. End2End tool calling
|
|
6. Automatic function calling
|
|
7. Image (url)
|
|
8. Image (base64)
|
|
9. Multiple images
|
|
10. Complete end2end test with conversation history, tool calls, tool results and images
|
|
11. Integration specific tests
|
|
12. Error handling
|
|
13. Streaming
|
|
14. List models
|
|
15. Extended thinking (non-streaming)
|
|
16. Extended thinking (streaming)
|
|
17. Files API - file upload (Cross-Provider)
|
|
18. Files API - file list (Cross-Provider)
|
|
19. Files API - file retrieve (Cross-Provider)
|
|
20. Files API - file delete (Cross-Provider)
|
|
21. Files API - file content (Cross-Provider)
|
|
22. Batch API - batch create with inline requests (Cross-Provider)
|
|
23. Batch API - batch list
|
|
24. Batch API - batch retrieve
|
|
25. Batch API - batch cancel
|
|
26. Batch API - batch results
|
|
27. Batch API - end-to-end workflow
|
|
28. Prompt caching - system message checkpoint
|
|
29. Prompt caching - messages checkpoint
|
|
30. Prompt caching - tools checkpoint
|
|
31. Count tokens (Cross-Provider)
|
|
32. Passthrough messages (non-streaming)
|
|
33. Passthrough messages (streaming)
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
import pytest
|
|
from anthropic import Anthropic
|
|
|
|
from .utils.common import (
|
|
# Anthropic-specific test data
|
|
ANTHROPIC_THINKING_PROMPT,
|
|
ANTHROPIC_THINKING_STREAMING_PROMPT,
|
|
BASE64_IMAGE,
|
|
CALCULATOR_TOOL,
|
|
COMPARISON_KEYWORDS,
|
|
IMAGE_URL,
|
|
FILE_DATA_BASE64,
|
|
INPUT_TOKENS_LONG_TEXT,
|
|
INPUT_TOKENS_SIMPLE_TEXT,
|
|
INPUT_TOKENS_WITH_SYSTEM,
|
|
INVALID_ROLE_MESSAGES,
|
|
LOCATION_KEYWORDS,
|
|
MULTI_TURN_MESSAGES,
|
|
MULTIPLE_TOOL_CALL_MESSAGES,
|
|
PROMPT_CACHING_LARGE_CONTEXT,
|
|
PROMPT_CACHING_TOOLS,
|
|
SIMPLE_CHAT_MESSAGES,
|
|
SINGLE_TOOL_CALL_MESSAGES,
|
|
STREAMING_CHAT_MESSAGES,
|
|
STREAMING_TOOL_CALL_MESSAGES,
|
|
WEATHER_KEYWORDS,
|
|
WEATHER_TOOL,
|
|
Config,
|
|
assert_has_tool_calls,
|
|
assert_valid_batch_inline_response,
|
|
assert_valid_chat_response,
|
|
assert_valid_image_response,
|
|
assert_valid_input_tokens_response,
|
|
collect_streaming_content,
|
|
# Files API utilities
|
|
create_batch_inline_requests,
|
|
create_batch_jsonl_content,
|
|
extract_tool_calls,
|
|
get_api_key,
|
|
mock_tool_response,
|
|
# Citation utilities
|
|
CITATION_TEXT_DOCUMENT,
|
|
CITATION_MULTI_DOCUMENT_SET,
|
|
assert_valid_anthropic_citation,
|
|
collect_anthropic_streaming_citations,
|
|
create_anthropic_document,
|
|
)
|
|
from .utils.config_loader import get_config, get_model
|
|
from .utils.parametrize import (
|
|
format_provider_model,
|
|
get_cross_provider_params_for_scenario,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def anthropic_client():
    """Build the Anthropic SDK client used by the tests.

    The API key comes from ``get_api_key("anthropic")`` and the base URL from
    ``get_integration_url("anthropic")``. Timeout and retry count are read
    from the API config (defaults: 120 and 3). An ``anthropic-version``
    default header is added only when the integration settings carry a
    truthy ``version``.
    """
    from .utils.config_loader import get_config, get_integration_url

    key = get_api_key("anthropic")
    url = get_integration_url("anthropic")

    # Pull the remaining client options from the shared test configuration.
    cfg = get_config()
    settings = cfg.get_integration_settings("anthropic")
    api_settings = cfg.get_api_config()

    kwargs = dict(
        api_key=key,
        base_url=url,
        timeout=api_settings.get("timeout", 120),
        max_retries=api_settings.get("max_retries", 3),
    )

    # Pin the API version header only when one is configured.
    if settings.get("version"):
        kwargs["default_headers"] = {"anthropic-version": settings["version"]}

    return Anthropic(**kwargs)
|
|
|
|
|
|
@pytest.fixture
def test_config():
    """Provide a newly constructed ``Config`` object to the requesting test."""
    config = Config()
    return config
|
|
|
|
|
|
def get_provider_anthropic_client(provider, passthrough: bool = False):
    """Build an Anthropic client that sends an ``x-model-provider`` header.

    Args:
        provider: Value placed in the ``x-model-provider`` default header.
        passthrough: When true, the base URL is looked up for the
            ``anthropic_passthrough`` integration instead of ``anthropic``.

    Returns:
        An ``Anthropic`` client configured with the API key, base URL,
        timeout (default 300) and default headers.
    """
    from .utils.config_loader import get_config, get_integration_url

    key = get_api_key("anthropic")

    if passthrough:
        integration_name = "anthropic_passthrough"
    else:
        integration_name = "anthropic"
    url = get_integration_url(integration_name)

    cfg = get_config()
    api_settings = cfg.get_api_config()
    # The version header always comes from the plain "anthropic" settings,
    # regardless of the passthrough flag.
    settings = cfg.get_integration_settings("anthropic")

    headers = {"x-model-provider": provider}
    if settings.get("version"):
        headers["anthropic-version"] = settings["version"]

    return Anthropic(
        api_key=key,
        base_url=url,
        timeout=api_settings.get("timeout", 300),
        default_headers=headers,
    )
|
|
|
|
|
|
def convert_to_anthropic_messages(
    messages: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Convert common-format chat messages to Anthropic message dicts.

    * Messages with role ``"system"`` are dropped; Anthropic takes the system
      prompt separately.
    * Non-list content is passed through unchanged.
    * List content is converted item by item: ``text`` items become text
      blocks and ``image_url`` items become image blocks. Items of any other
      type are omitted.

    For ``data:image...`` URLs the MIME type is extracted from the data-URL
    header, so ``data:image/png;base64,AAAA`` yields ``media_type`` of
    ``image/png`` and ``data`` of ``AAAA``. Any other URL is forwarded as a
    ``url`` image source.

    Args:
        messages: Messages with ``role`` and ``content`` keys.

    Returns:
        A new list of Anthropic-format message dicts.
    """
    anthropic_messages = []

    for msg in messages:
        if msg["role"] == "system":
            continue  # System messages handled separately in Anthropic

        if not isinstance(msg.get("content"), list):
            anthropic_messages.append({"role": msg["role"], "content": msg["content"]})
            continue

        # Multi-part content (text and/or images)
        content = []
        for item in msg["content"]:
            if item["type"] == "text":
                content.append({"type": "text", "text": item["text"]})
            elif item["type"] == "image_url":
                url = item["image_url"]["url"]
                if url.startswith("data:image"):
                    # Data URL: "data:<mime>[;params],<payload>". Only the
                    # bare MIME type belongs in media_type, not the "data:"
                    # scheme or the ";base64" parameter.
                    header, data = url.split(",", 1)
                    media_type = header[len("data:"):].split(";", 1)[0]
                    source = {
                        "type": "base64",
                        "media_type": media_type,
                        "data": data,
                    }
                else:
                    # URL image - send URL directly to Anthropic
                    source = {"type": "url", "url": url}
                content.append({"type": "image", "source": source})

        anthropic_messages.append({"role": msg["role"], "content": content})

    return anthropic_messages
|
|
|
|
|
|
def convert_to_anthropic_tools(tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Map common-format tool definitions to Anthropic tool dicts.

    Each input tool's ``name`` and ``description`` are copied as-is and its
    ``parameters`` value is exposed under ``input_schema``.
    """
    return [
        {
            "name": spec["name"],
            "description": spec["description"],
            "input_schema": spec["parameters"],
        }
        for spec in tools
    ]
|
|
|
|
|
|
class TestAnthropicIntegration:
|
|
"""Test suite for Anthropic integration with cross-provider support"""
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("simple_chat")
)
def test_01_simple_chat(self, anthropic_client, test_config, provider, model):
    """Test Case 1: Simple chat interaction - runs across all available providers"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = convert_to_anthropic_messages(SIMPLE_CHAT_MESSAGES)

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=100
    )

    # The first content block must be non-empty text.
    assert_valid_chat_response(response)
    assert len(response.content) > 0
    assert response.content[0].type == "text"
    assert len(response.content[0].text) > 0
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("multi_turn_conversation")
)
def test_02_multi_turn_conversation(self, anthropic_client, test_config, provider, model):
    """Test Case 2: Multi-turn conversation - runs across all available providers"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = convert_to_anthropic_messages(MULTI_TURN_MESSAGES)

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=150
    )

    assert_valid_chat_response(response)
    content = response.content[0].text.lower()
    # Should mention population or numbers since we asked about Paris population
    assert any(word in content for word in ["population", "million", "people", "inhabitants"])
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("tool_calls"))
def test_03_single_tool_call(self, anthropic_client, test_config, provider, model):
    """Test Case 3: Single tool call - auto-skips providers without tool support"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = convert_to_anthropic_messages(SINGLE_TOOL_CALL_MESSAGES)
    tools = convert_to_anthropic_tools([WEATHER_TOOL])

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=tools,
        max_tokens=100,
    )

    # Exactly one call to the weather tool, carrying a location argument.
    assert_has_tool_calls(response, expected_count=1)
    tool_calls = extract_tool_calls(response)
    assert tool_calls[0]["name"] == "get_weather"
    assert "location" in tool_calls[0]["arguments"]
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("multiple_tool_calls")
)
def test_04_multiple_tool_calls(self, anthropic_client, test_config, provider, model):
    """Test Case 4: Multiple tool calls in one response - auto-skips providers without multiple tool support"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = convert_to_anthropic_messages(MULTIPLE_TOOL_CALL_MESSAGES)
    tools = convert_to_anthropic_tools([WEATHER_TOOL, CALCULATOR_TOOL])

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=tools,
        max_tokens=200,
    )

    # Providers might be more conservative with multiple tool calls
    # Let's check if it made at least one tool call and prefer multiple if possible
    assert_has_tool_calls(response)  # At least 1 tool call
    tool_calls = extract_anthropic_tool_calls(response)
    tool_names = [tc["name"] for tc in tool_calls]

    # Should make relevant tool calls - either weather, calculate, or both
    expected_tools = ["get_weather", "calculate"]
    made_relevant_calls = any(name in expected_tools for name in tool_names)
    assert made_relevant_calls, f"Expected tool calls from {expected_tools}, got {tool_names}"
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("end2end_tool_calling")
)
def test_05_end2end_tool_calling(self, anthropic_client, test_config, provider, model):
    """Test Case 5: Complete tool calling flow with responses"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = [{"role": "user", "content": "What's the weather in Boston in fahrenheit?"}]
    tools = convert_to_anthropic_tools([WEATHER_TOOL])
    logger = logging.getLogger("05AnthropicEnd2EndToolCalling")
    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=tools,
        max_tokens=500,
    )

    assert_has_tool_calls(response, expected_count=1)

    # Add assistant's response to conversation
    # Serialize content blocks to dicts for cross-provider compatibility
    messages.append(
        {"role": "assistant", "content": serialize_anthropic_content(response.content)}
    )

    # Add tool response
    tool_calls = extract_anthropic_tool_calls(response)
    tool_response = mock_tool_response(tool_calls[0]["name"], tool_calls[0]["arguments"])

    # Find the tool use block to get its ID
    tool_use_id = None
    for block in response.content:
        if block.type == "tool_use":
            tool_use_id = block.id
            break

    messages.append(
        {
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": tool_response,
                }
            ],
        }
    )

    # Lazy %-formatting: the message is only rendered if INFO is enabled.
    logger.info("Messages: %s", messages)

    # Get final response
    final_response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=150
    )

    # Anthropic might return empty content if tool result is sufficient
    assert final_response is not None
    if len(final_response.content) > 0:
        assert_valid_chat_response(final_response)
        content = final_response.content[0].text.lower()
        weather_location_keywords = WEATHER_KEYWORDS + LOCATION_KEYWORDS
        assert any(word in content for word in weather_location_keywords)
    else:
        # If no content, that's ok - tool result was sufficient
        print("Model returned empty content - tool result was sufficient")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("automatic_function_calling")
)
def test_06_automatic_function_calling(self, anthropic_client, test_config, provider, model):
    """Test Case 6: Automatic function calling"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    messages = [{"role": "user", "content": "Calculate 25 * 4 for me"}]
    tools = convert_to_anthropic_tools([CALCULATOR_TOOL])

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=tools,
        max_tokens=100,
    )

    # Should automatically choose to use the calculator
    assert_has_tool_calls(response, expected_count=1)
    tool_calls = extract_tool_calls(response)
    assert tool_calls[0]["name"] == "calculate"
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("image_url"))
def test_07_image_url(self, anthropic_client, test_config, provider, model):
    """Test Case 7: Image analysis from URL"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # One user turn: a text question plus an image referenced by URL.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What do you see in this image?"},
                {
                    "type": "image",
                    "source": {
                        "type": "url",
                        "url": IMAGE_URL,
                    },
                },
            ],
        }
    ]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=200
    )

    assert_valid_image_response(response)
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("image_base64")
)
def test_08_image_base64(self, anthropic_client, test_config, provider, model):
    """Test Case 8: Image analysis from base64 - runs for all providers with base64 image support"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # One user turn: a text prompt plus an inline base64 image sent as PNG.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": BASE64_IMAGE,
                    },
                },
            ],
        }
    ]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=200
    )

    assert_valid_image_response(response)
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("multiple_images")
)
def test_09_multiple_images(self, anthropic_client, test_config, provider, model):
    """Test Case 9: Multiple image analysis"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # One user turn with two images: one by URL, one inline base64.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images"},
                {
                    "type": "image",
                    "source": {
                        "type": "url",
                        "url": IMAGE_URL,
                    },
                },
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": BASE64_IMAGE,
                    },
                },
            ],
        }
    ]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=300
    )

    assert_valid_image_response(response)
    content = response.content[0].text.lower()
    # Should mention comparison or differences
    assert any(
        word in content for word in COMPARISON_KEYWORDS
    ), f"Response should contain comparison keywords. Got content: {content}"
|
|
|
|
def test_10_complex_end2end(self, anthropic_client, test_config):
    """Test Case 10: Complex end-to-end with conversation, images, and tools

    Sends a short conversation whose last user turn contains text and an
    image, with the weather tool available. If the model issues tool calls,
    a mocked tool result is appended for each and a final response is
    requested.
    """
    messages = [
        {"role": "user", "content": "Hello! I need help with some tasks."},
        {
            "role": "assistant",
            "content": "Hello! I'd be happy to help you with your tasks. What do you need assistance with?",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "First, can you tell me what's in this image and then get the weather for the location shown?",
                },
                {
                    "type": "image",
                    "source": {
                        "type": "url",
                        "url": IMAGE_URL,
                    },
                },
            ],
        },
    ]

    tools = convert_to_anthropic_tools([WEATHER_TOOL])

    response1 = anthropic_client.messages.create(
        model=get_model("anthropic", "chat"),
        messages=messages,
        tools=tools,
        max_tokens=300,
    )

    # Should either describe image or call weather tool (or both)
    assert len(response1.content) > 0

    # Add response to conversation
    # Serialize content blocks to dicts for cross-provider compatibility
    messages.append(
        {"role": "assistant", "content": serialize_anthropic_content(response1.content)}
    )

    # If there were tool calls, handle them
    tool_calls = extract_anthropic_tool_calls(response1)
    if tool_calls:
        # IDs already paired with a tool call. Without this, two calls to the
        # same tool would both be answered with the first block's ID.
        used_ids = set()

        for tool_call in tool_calls:
            tool_response = mock_tool_response(tool_call["name"], tool_call["arguments"])

            # Find the first not-yet-used tool_use block with a matching name
            tool_use_id = None
            for block in response1.content:
                if (
                    block.type == "tool_use"
                    and block.name == tool_call["name"]
                    and block.id not in used_ids
                ):
                    tool_use_id = block.id
                    used_ids.add(block.id)
                    break

            messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": tool_use_id,
                            "content": tool_response,
                        }
                    ],
                }
            )

        # Get final response after tool calls
        final_response = anthropic_client.messages.create(
            model=get_model("anthropic", "chat"), messages=messages, max_tokens=200
        )

        # Anthropic might return empty content if tool result is sufficient
        # This is valid behavior - just check that we got a response
        assert final_response is not None
        if final_response.content and len(final_response.content) > 0:
            # If there is content, validate it
            assert_valid_chat_response(final_response)
        else:
            # If no content, that's ok too - tool result was sufficient
            print("Model returned empty content - tool result was sufficient")
|
|
|
|
def test_11_integration_specific_features(self, anthropic_client, test_config):
    """Test Case 11: Anthropic-specific features

    Exercises three request options in turn: the ``system`` prompt, the
    ``temperature`` parameter, and ``tool_choice={"type": "any"}``.
    """
    chat_model = get_model("anthropic", "chat")

    # --- 1. System prompt constrains the reply length ---
    system_reply = anthropic_client.messages.create(
        model=chat_model,
        system="You are a helpful assistant that always responds in exactly 5 words.",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
        max_tokens=50,
    )
    assert_valid_chat_response(system_reply)
    # Roughly five words is good enough; allow a little slack either way.
    word_count = len(system_reply.content[0].text.split())
    assert 3 <= word_count <= 7, f"Expected ~5 words, got {word_count}"

    # --- 2. Temperature parameter is accepted ---
    creative_reply = anthropic_client.messages.create(
        model=get_model("anthropic", "chat"),
        messages=[{"role": "user", "content": "Tell me a creative story in one sentence."}],
        temperature=0.9,
        max_tokens=100,
    )
    assert_valid_chat_response(creative_reply)

    # --- 3. tool_choice "any" forces a tool call ---
    available_tools = convert_to_anthropic_tools([CALCULATOR_TOOL, WEATHER_TOOL])
    forced_tool_reply = anthropic_client.messages.create(
        model=get_model("anthropic", "chat"),
        messages=[{"role": "user", "content": "What's 15 + 27?"}],
        tools=available_tools,
        tool_choice={"type": "any"},  # Force tool use
        max_tokens=100,
    )
    assert_has_tool_calls(forced_tool_reply)
    calls = extract_anthropic_tool_calls(forced_tool_reply)
    # A math question should be routed to the calculator tool.
    assert calls[0]["name"] == "calculate"
|
|
|
|
def test_12_error_handling_invalid_roles(self, anthropic_client, test_config):
    """Test Case 12: Error handling for invalid roles"""
    # Bifrost normalises invalid roles internally, so the request is
    # expected to succeed rather than raise.
    request_kwargs = {
        "model": get_model("anthropic", "chat"),
        "messages": INVALID_ROLE_MESSAGES,
        "max_tokens": 100,
    }
    reply = anthropic_client.messages.create(**request_kwargs)

    # A successful reply is non-None and carries at least one content block.
    assert reply is not None
    assert hasattr(reply, "content")
    assert len(reply.content) > 0
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("streaming"))
def test_13_streaming(self, anthropic_client, test_config, provider, model):
    """Test Case 13: Streaming chat completion - auto-skips providers without streaming support"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # Test basic streaming
    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=STREAMING_CHAT_MESSAGES,
        max_tokens=1000,
        stream=True,
    )

    content, chunk_count, tool_calls_detected = collect_streaming_content(
        stream, "anthropic", timeout=300
    )

    # Validate streaming results
    assert chunk_count > 0, "Should receive at least one chunk"
    assert len(content) > 10, "Should receive substantial content"
    assert not tool_calls_detected, "Basic streaming shouldn't have tool calls"

    # Test streaming with tool calls (only if provider supports tools)
    config = get_config()
    if config.provider_supports_scenario(provider, "tool_calls"):
        # Get the tools-capable model for this provider
        tools_model = config.get_provider_model(provider, "tools")
        if tools_model:
            stream_with_tools = anthropic_client.messages.create(
                model=format_provider_model(provider, tools_model),
                messages=STREAMING_TOOL_CALL_MESSAGES,
                max_tokens=1000,
                tools=convert_to_anthropic_tools([WEATHER_TOOL]),
                stream=True,
            )

            content_tools, chunk_count_tools, tool_calls_detected_tools = (
                collect_streaming_content(stream_with_tools, "anthropic", timeout=300)
            )

            # Validate tool streaming results
            assert chunk_count_tools > 0, "Should receive at least one chunk with tools"
            # Distinct message so a failure here is not mistaken for a
            # missing-chunk failure.
            assert tool_calls_detected_tools, "Should detect tool calls when streaming with tools"
|
|
|
|
def test_14_list_models(self, anthropic_client, test_config):
    """Test Case 14: List models with pagination parameters

    Checks the ``limit``, ``after_id`` and ``before_id`` parameters of
    ``models.list`` and the pagination fields on the response.
    """
    # --- limit ---
    first_page = anthropic_client.models.list(limit=5)
    assert first_page.data is not None
    assert len(first_page.data) <= 5  # fewer is fine when few models exist
    assert hasattr(first_page, "first_id"), "Response should have first_id"
    assert hasattr(first_page, "last_id"), "Response should have last_id"
    assert hasattr(first_page, "has_more"), "Response should have has_more"

    # --- after_id: only meaningful when another page exists ---
    if first_page.has_more and first_page.last_id:
        following_page = anthropic_client.models.list(limit=3, after_id=first_page.last_id)
        assert following_page.data is not None
        assert len(following_page.data) <= 3
        # The next page must not start with the same model as the first one.
        if len(first_page.data) > 0 and len(following_page.data) > 0:
            assert first_page.data[0].id != following_page.data[0].id

    # --- before_id: needs a first_id to be worth attempting ---
    if not first_page.first_id:
        return

    # Fetch a wider page so there is a cursor to page backwards from.
    wide_page = anthropic_client.models.list(limit=10)
    if len(wide_page.data) > 5 and wide_page.last_id:
        preceding_page = anthropic_client.models.list(
            limit=2, before_id=wide_page.last_id
        )
        assert preceding_page.data is not None
        assert len(preceding_page.data) <= 2
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("thinking"))
def test_15_extended_thinking(self, anthropic_client, test_config, provider, model):
    """Test Case 15: Extended thinking/reasoning (non-streaming)"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # Convert to Anthropic message format
    messages = convert_to_anthropic_messages(ANTHROPIC_THINKING_PROMPT)

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),  # Specific thinking-capable model
        max_tokens=4000,  # Reduced to prevent token limit errors for smaller context window models
        thinking={
            "type": "enabled",
            "budget_tokens": 2500,  # Reduced to prevent token limit errors
        },
        extra_body={"reasoning_summary": "detailed"},
        messages=messages,
    )

    # Validate response structure
    assert response is not None, "Response should not be None"
    assert hasattr(response, "content"), "Response should have content"
    assert len(response.content) > 0, "Content should not be empty"

    # Split the response into thinking text and regular text
    has_thinking = False
    thinking_content = ""
    regular_content = ""

    for block in response.content:
        if block.type == "thinking":
            has_thinking = True
            # The thinking content is directly in block.thinking attribute
            if block.thinking:
                thinking_content += str(block.thinking)
                print(f"Found thinking block with {len(str(block.thinking))} chars")
        elif block.type == "text":
            if block.text:
                regular_content += str(block.text)

    # Should have thinking content
    assert has_thinking, (
        f"Response should contain thinking blocks. "
        f"Got {len(response.content)} blocks: "
        f"{[block.type if hasattr(block, 'type') else 'unknown' for block in response.content]}"
    )
    assert len(thinking_content) > 0, "Thinking content should not be empty"

    # Validate thinking content quality - should show reasoning
    thinking_lower = thinking_content.lower()
    reasoning_keywords = [
        "batch",
        "oven",
        "cookie",
        "minute",
        "calculate",
        "total",
        "time",
        "divide",
        "multiply",
        "step",
    ]

    keyword_matches = sum(1 for keyword in reasoning_keywords if keyword in thinking_lower)
    assert keyword_matches >= 2, (
        f"Thinking should contain reasoning about the problem. "
        f"Found {keyword_matches} keywords. Content: {thinking_content[:200]}..."
    )

    # Should also have regular text response
    assert len(regular_content) > 0, "Should have regular response text"

    print(f"✓ Thinking content ({len(thinking_content)} chars): {thinking_content[:150]}...")
    print(f"✓ Response content: {regular_content[:100]}...")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("thinking"))
def test_16_extended_thinking_streaming(self, anthropic_client, test_config, provider, model):
    """Test Case 16: Extended thinking/reasoning (streaming)"""
    # Placeholder parameters mean no provider is configured for this scenario.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # Convert to Anthropic message format
    messages = convert_to_anthropic_messages(ANTHROPIC_THINKING_STREAMING_PROMPT)

    # Stream with thinking enabled - use thinking-capable model
    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        max_tokens=3000,
        thinking={
            "type": "enabled",
            "budget_tokens": 2000,  # Reduced to prevent token limit errors
        },
        messages=messages,
        stream=True,
        extra_body={"reasoning_summary": "detailed"},
    )

    # Collect streaming content
    thinking_parts = []
    text_parts = []
    chunk_count = 0
    has_thinking_delta = False
    has_thinking_block_start = False

    for event in stream:
        chunk_count += 1

        event_type = event.type

        if event_type == "content_block_start":
            # A content block of type "thinking" marks the start of reasoning
            block = event.content_block
            if block and block.type == "thinking":
                has_thinking_block_start = True
                print("Thinking block started")

        elif event_type == "content_block_delta":
            delta = event.delta
            if delta and delta.type == "thinking_delta":
                has_thinking_delta = True
                if delta.thinking:
                    thinking_parts.append(str(delta.thinking))
            elif delta and delta.type == "text_delta":
                if delta.text:
                    text_parts.append(str(delta.text))

        # Safety check: stop consuming a stream that never terminates
        if chunk_count > 5000:
            break

    # Combine collected content
    complete_thinking = "".join(thinking_parts)
    complete_text = "".join(text_parts)

    # Validate results
    assert chunk_count > 0, "Should receive at least one chunk"
    assert has_thinking_delta or has_thinking_block_start, (
        f"Should detect thinking in streaming. "
        f"has_thinking_delta={has_thinking_delta}, has_thinking_block_start={has_thinking_block_start}"
    )
    assert len(complete_thinking) > 10, (
        f"Should receive substantial thinking content, got {len(complete_thinking)} chars. "
        f"Thinking parts: {len(thinking_parts)}"
    )

    # Validate thinking content
    thinking_lower = complete_thinking.lower()
    math_keywords = [
        "paid",
        "split",
        "equal",
        "owe",
        "alice",
        "bob",
        "carol",
        "total",
        "divide",
        "step",
    ]

    keyword_matches = sum(1 for keyword in math_keywords if keyword in thinking_lower)
    assert keyword_matches >= 2, (
        f"Thinking should reason about splitting the bill. "
        f"Found {keyword_matches} keywords. Content: {complete_thinking[:200]}..."
    )

    # Should have regular response text too
    assert len(complete_text) > 0, "Should have regular response text"

    print(f"✓ Streamed thinking ({len(thinking_parts)} chunks): {complete_thinking[:150]}...")
    print(f"✓ Streamed response ({len(text_parts)} chunks): {complete_text[:100]}...")
|
|
|
|
# =========================================================================
|
|
# FILES API TEST CASES (Cross-Provider)
|
|
# =========================================================================
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("file_upload")
)
def test_17_file_upload(self, anthropic_client, test_config, provider, model):
    """Test Case 17: Upload a file via Files API

    Uses cross-provider parametrization to test file upload across providers
    that support the Files API (Anthropic, OpenAI, Gemini).

    The uploaded file is deleted in a ``finally`` block so that a failed
    assertion does not leave the file behind. Cleanup failures are reported
    but do not fail the test.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for file_upload scenario")

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    try:
        # Upload the file using beta API
        if provider == "openai":
            # OpenAI gets a JSONL payload built by create_batch_jsonl_content
            jsonl_content = create_batch_jsonl_content(
                model=get_model("openai", "chat"), num_requests=1
            )
            response = client.beta.files.upload(
                file=("test_upload.jsonl", jsonl_content, "application/jsonl"),
            )
        else:
            text_content = b"This is a test file for Files API integration testing."
            response = client.beta.files.upload(
                file=("test_upload.txt", text_content, "text/plain"),
            )
    except Exception as e:
        # Files API might not be available or require specific permissions
        error_str = str(e).lower()
        if "beta" in error_str or "not found" in error_str or "not supported" in error_str:
            pytest.skip(f"Files API not available for provider {provider}: {e}")
        raise

    try:
        # Validate response
        assert response is not None, "File response should not be None"
        assert hasattr(response, "id"), "File response should have 'id' attribute"
        assert response.id is not None, "File ID should not be None"
        assert len(response.id) > 0, "File ID should not be empty"

        print(f"Success: Uploaded file with ID: {response.id} for provider {provider}")
    finally:
        # Clean up - delete the file even when an assertion above failed
        file_id = getattr(response, "id", None)
        if file_id:
            try:
                client.beta.files.delete(file_id)
                print(f"Cleanup: Deleted file {file_id}")
            except Exception as e:
                # Best-effort cleanup: report but do not mask the test result
                print(f"Warning: Failed to clean up file: {e}")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("file_list"))
def test_18_file_list(self, anthropic_client, test_config, provider, model):
    """Test Case 18: List files from Files API

    Uploads one file through the provider-specific client, lists files, and
    checks that the uploaded file's ID appears in the listing. The uploaded
    file is deleted afterwards regardless of the outcome.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for file_list scenario")

    # Provider-specific client
    client = get_provider_anthropic_client(provider)

    try:
        # Make sure at least one file exists before listing
        if provider == "openai":
            payload = create_batch_jsonl_content(
                model=get_model("openai", "chat"), num_requests=1
            )
            file_spec = ("test_list.jsonl", payload, "application/jsonl")
        else:
            file_spec = ("test_list.txt", b"Test file for listing", "text/plain")
        uploaded = client.beta.files.upload(file=file_spec)

        try:
            listing = client.beta.files.list()

            # Shape checks on the listing
            assert listing is not None, "File list response should not be None"
            assert hasattr(listing, "data"), "File list response should have 'data' attribute"
            assert isinstance(listing.data, list), "Data should be a list"

            # The file uploaded above must be part of the listing
            assert any(
                entry.id == uploaded.id for entry in listing.data
            ), f"Uploaded file {uploaded.id} should be in file list"

            print(f"Success: Listed {len(listing.data)} files for provider {provider}")
        finally:
            # Best-effort removal of the uploaded file
            try:
                client.beta.files.delete(uploaded.id)
            except Exception as cleanup_error:
                print(f"Warning: Failed to clean up file: {cleanup_error}")

    except Exception as exc:
        message = str(exc).lower()
        if any(marker in message for marker in ("beta", "not found", "not supported")):
            pytest.skip(f"Files API not available for provider {provider}: {exc}")
        raise
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("file_delete")
)
def test_20_file_delete(self, anthropic_client, test_config, provider, model):
    """Test Case 20: Delete a file from Files API

    Uploads a file through the provider-specific client, deletes it, and then
    expects a subsequent retrieve of the same ID to raise.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for file_delete scenario")

    # Provider-specific client
    client = get_provider_anthropic_client(provider)

    try:
        # Upload the file that will be deleted
        if provider == "openai":
            payload = create_batch_jsonl_content(
                model=get_model("openai", "chat"), num_requests=1
            )
            file_spec = ("test_delete.jsonl", payload, "application/jsonl")
        else:
            file_spec = ("test_delete.txt", b"Test file for deletion", "text/plain")
        uploaded = client.beta.files.upload(file=file_spec)

        delete_result = client.beta.files.delete(uploaded.id)

        # Providers may return different formats, so only require a non-None result
        assert delete_result is not None, "Delete response should not be None"

        print(f"Success: Deleted file {uploaded.id} (provider: {provider})")

        # A deleted file must no longer be retrievable
        with pytest.raises(Exception):
            client.beta.files.retrieve(uploaded.id)

    except Exception as exc:
        message = str(exc).lower()
        if any(marker in message for marker in ("beta", "not found", "not supported")):
            pytest.skip(f"Files API not available for provider {provider}: {exc}")
        raise
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("file_content")
)
def test_21_file_content(self, anthropic_client, test_config, provider, model):
    """Test Case 21: Download file content from Files API

    Uses cross-provider parametrization to test file content download.
    Note: Some providers have restrictions on downloading uploaded files:
    - Anthropic: Only files created by code execution tool can be downloaded
    - Gemini: Doesn't support direct file download (excluded via config)

    A download call that fails with an error mentioning "download",
    "not allowed" or "forbidden" is tolerated. The content assertions run
    outside that handler so a genuine content mismatch always fails the test.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for file_content scenario")

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    try:
        # First upload a file
        if provider == "openai":
            original_content = create_batch_jsonl_content(
                model=get_model("openai", "chat"), num_requests=1
            )
            uploaded_file = client.beta.files.upload(
                file=("test_content.jsonl", original_content, "application/jsonl"),
            )
        else:
            original_content = b"Test file content for download"
            uploaded_file = client.beta.files.upload(
                file=("test_content.txt", original_content, "text/plain"),
            )

        try:
            try:
                # Try to download file content
                # This may fail for some providers (e.g., Anthropic uploaded files)
                response = client.beta.files.download(uploaded_file.id)
            except Exception as download_error:
                # Some providers don't allow downloading uploaded files
                error_str = str(download_error).lower()
                if (
                    "download" in error_str
                    or "not allowed" in error_str
                    or "forbidden" in error_str
                ):
                    print(
                        f"Expected for {provider}: Cannot download uploaded files - {download_error}"
                    )
                else:
                    raise
            else:
                # Download succeeded. These assertions live in the `else` clause
                # on purpose: the mismatch message contains the word "download",
                # so inside the `try` it would be mistaken for an expected
                # provider restriction and silently swallowed.
                assert response is not None, "File content should not be None"

                # Compare downloaded content with original
                downloaded_content = response.text()
                original_str = (
                    original_content
                    if isinstance(original_content, str)
                    else original_content.decode("utf-8")
                )

                assert downloaded_content == original_str, (
                    f"Downloaded content should match original. "
                    f"Expected: {original_str[:100]}..., Got: {downloaded_content[:100]}..."
                )

                print(
                    f"Success: Downloaded and verified file content ({len(downloaded_content)} bytes) for provider {provider}"
                )

        finally:
            # Clean up
            try:
                client.beta.files.delete(uploaded_file.id)
            except Exception as e:
                print(f"Warning: Failed to clean up file: {e}")

    except Exception as e:
        error_str = str(e).lower()
        if "beta" in error_str or "not found" in error_str or "not supported" in error_str:
            pytest.skip(f"Files API not available for provider {provider}: {e}")
        raise
|
|
|
|
# =========================================================================
|
|
# BATCH API TEST CASES (Cross-Provider)
|
|
# =========================================================================
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("batch_inline")
)
def test_22_batch_create_inline(self, anthropic_client, test_config, provider, model):
    """Test Case 22: Create a batch job with inline requests

    Builds two inline requests, submits them through the provider-specific
    client, and validates the created batch. The batch is cancelled
    afterwards on a best-effort basis.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_inline scenario")

    # Provider-specific client
    client = get_provider_anthropic_client(provider)

    # Inline requests to submit
    inline_requests = create_batch_inline_requests(
        model=model, num_requests=2, provider=provider, sdk="anthropic"
    )

    created_batch = None
    try:
        created_batch = client.beta.messages.batches.create(requests=inline_requests)

        print(
            f"Success: Created batch with ID: {created_batch.id}, status: {created_batch.processing_status} for provider {provider}"
        )

        # Structural validation of the batch response
        assert_valid_batch_inline_response(created_batch, provider="anthropic")
    finally:
        # Cancel the batch when one was created; failure here is informational only
        if created_batch:
            try:
                client.beta.messages.batches.cancel(created_batch.id)
            except Exception as cancel_error:
                print(f"Info: Could not cancel batch (may already be processed): {cancel_error}")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("batch_list"))
def test_23_batch_list(self, anthropic_client, test_config, provider, model):
    """Test Case 23: List batch jobs

    Lists up to 10 batches through the provider-specific client and checks
    that the response exposes a ``data`` list. Skipped for bedrock.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_list scenario")

    if provider == "bedrock":
        pytest.skip(
            "Bedrock can't create batches with file input. Hence skipping batch_list scenario"
        )

    # Provider-specific client
    client = get_provider_anthropic_client(provider)

    listing = client.beta.messages.batches.list(limit=10)

    # Shape checks on the listing
    assert listing is not None, "Batch list response should not be None"
    assert hasattr(listing, "data"), "Batch list response should have 'data' attribute"
    assert isinstance(listing.data, list), "Data should be a list"

    print(f"Success: Listed {len(listing.data)} batches for provider {provider}")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("batch_retrieve")
)
def test_24_batch_retrieve(self, anthropic_client, test_config, provider, model):
    """Test Case 24: Retrieve batch status by ID

    Creates a batch using inline requests, then retrieves it and checks that
    the retrieved batch carries the same ID. The batch is cancelled afterwards
    on a best-effort basis; a failed cancel is reported, not raised.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_retrieve scenario")

    if provider == "bedrock":
        pytest.skip(
            "Bedrock can't create batches with file input. Hence skipping batch_retrieve scenario"
        )

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    # Stays None until creation succeeds, so cleanup knows whether to cancel
    batch_id = None

    try:
        # Create batch for testing retrieval
        batch_requests = create_batch_inline_requests(
            model=model, num_requests=1, provider=provider, sdk="anthropic"
        )
        batch = client.beta.messages.batches.create(requests=batch_requests)
        batch_id = batch.id

        # Retrieve batch
        retrieved_batch = client.beta.messages.batches.retrieve(batch_id)

        # Validate response
        assert retrieved_batch is not None, "Retrieved batch should not be None"
        assert (
            retrieved_batch.id == batch_id
        ), f"Batch ID should match: expected {batch_id}, got {retrieved_batch.id}"

        print(
            f"Success: Retrieved batch {batch_id}, status: {retrieved_batch.processing_status} for provider {provider}"
        )

    finally:
        # Clean up - report (rather than silently drop) a failed cancel
        if batch_id:
            try:
                client.beta.messages.batches.cancel(batch_id)
            except Exception as e:
                print(f"Info: Could not cancel batch (may already be processed): {e}")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("batch_cancel")
)
def test_25_batch_cancel(self, anthropic_client, test_config, provider, model):
    """Test Case 25: Cancel a batch job

    Creates a batch using inline requests, then cancels it.

    Only a failure of the cancel call itself is tolerated (the batch may have
    finished processing before the cancel arrives). Batch-creation errors and
    failed assertions on the cancel response propagate and fail the test.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_cancel scenario")

    if provider == "bedrock":
        pytest.skip(
            "Bedrock can't create batches with file input. Hence skipping batch_cancel scenario"
        )

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    # Create batch for testing cancellation. sdk="anthropic" matches every
    # other create_batch_inline_requests call in this suite.
    batch_requests = create_batch_inline_requests(
        model=model, num_requests=1, provider=provider, sdk="anthropic"
    )
    batch = client.beta.messages.batches.create(requests=batch_requests)
    batch_id = batch.id

    # Cancel batch
    try:
        cancelled_batch = client.beta.messages.batches.cancel(batch_id)
    except Exception as e:
        # Batch might already be processed
        print(f"Info: Batch cancel may have failed due to batch state: {e}")
        return

    # Validate response
    assert cancelled_batch is not None, "Cancelled batch should not be None"
    assert cancelled_batch.id == batch_id, "Batch ID should match"
    # Anthropic uses different status values
    assert cancelled_batch.processing_status in [
        "canceling",
        "ended",
    ], f"Status should be 'canceling' or 'ended', got {cancelled_batch.processing_status}"

    print(
        f"Success: Cancelled batch {batch_id}, status: {cancelled_batch.processing_status} for provider {provider}"
    )
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("batch_cancel")
)
def test_26_batch_results(self, anthropic_client, test_config, provider, model):
    """Test Case 26: Retrieve batch results

    Note: This test creates a batch and attempts to retrieve results.
    Results are only available after the batch has completed processing,
    so a failure to fetch results is reported but does not fail the test.

    The batch is created through the provider-specific client, like the other
    batch tests, and is cancelled in a ``finally`` block on a best-effort basis.
    """
    # NOTE(review): parametrized with the "batch_cancel" scenario key; confirm
    # whether a dedicated "batch_results" scenario exists in the config.
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_cancel scenario")

    if provider == "bedrock":
        pytest.skip(
            "Bedrock can't create batches with file input. Hence skipping test_26_batch_results scenario"
        )

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    # Stays None until creation succeeds, so cleanup knows whether to cancel
    batch_id = None

    try:
        # Create batch with simple requests
        batch_requests = create_batch_inline_requests(
            model=model, num_requests=1, provider=provider, sdk="anthropic"
        )

        batch = client.beta.messages.batches.create(requests=batch_requests)
        batch_id = batch.id

        print(f"Created batch {batch_id} with status: {batch.processing_status}")

        # Try to get results - might fail if batch not yet complete
        try:
            results = client.beta.messages.batches.results(batch_id)

            # Collect results if available
            result_count = 0
            for result_count, result in enumerate(results, start=1):
                print(f" Result {result_count}: custom_id={result.custom_id}")

            print(f"Success: Retrieved {result_count} results for batch {batch_id}")

        except Exception as results_error:
            # Results might not be ready yet
            error_str = str(results_error).lower()
            if (
                "not ready" in error_str
                or "in_progress" in error_str
                or "processing" in error_str
            ):
                print("Info: Batch results not yet available (batch still processing)")
            else:
                print(f"Info: Could not retrieve results: {results_error}")

    except Exception as e:
        error_str = str(e).lower()
        if "beta" in error_str or "not found" in error_str:
            pytest.skip(f"Anthropic Batch API not available: {e}")
        raise

    finally:
        # Clean up - runs on failure and skip as well; a failed cancel is reported
        if batch_id:
            try:
                client.beta.messages.batches.cancel(batch_id)
            except Exception as e:
                print(f"Info: Could not cancel batch (may already be processed): {e}")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("batch_inline")
)
def test_27_batch_e2e(self, anthropic_client, test_config, provider, model):
    """Test Case 27: End-to-end batch workflow

    Complete workflow: create batch -> poll status -> verify in list.
    Uses cross-provider parametrization.

    Polling is bounded (5 polls, 2 seconds apart) and stops early once the
    batch reports ``processing_status == "ended"``. The test does not require
    the batch to finish; it only requires the batch to appear in the listing.
    The batch is cancelled afterwards on a best-effort basis.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for batch_inline scenario")

    if provider == "bedrock":
        pytest.skip(
            "Bedrock can't create batches with file input. Hence skipping test_27_batch_e2e scenario"
        )

    # Get provider-specific client
    client = get_provider_anthropic_client(provider)

    # Step 1: Create batch with inline requests
    print(f"Step 1: Creating batch for provider {provider}...")
    batch_requests = create_batch_inline_requests(
        model=model, num_requests=2, provider=provider, sdk="anthropic"
    )

    batch = client.beta.messages.batches.create(requests=batch_requests)
    batch_id = batch.id

    assert batch_id is not None, "Batch ID should not be None"
    print(f" Created batch: {batch_id}, status: {batch.processing_status}")

    try:
        # Step 2: Poll batch status (with timeout)
        print("Step 2: Polling batch status...")
        max_polls = 5
        poll_interval = 2  # seconds

        for i in range(max_polls):
            retrieved_batch = client.beta.messages.batches.retrieve(batch_id)
            print(f" Poll {i+1}: status = {retrieved_batch.processing_status}")

            if retrieved_batch.processing_status == "ended":
                print(f" Batch reached terminal state: {retrieved_batch.processing_status}")
                break

            counts = getattr(retrieved_batch, "request_counts", None)
            if counts:
                print(
                    f" Request counts - processing: {counts.processing}, succeeded: {counts.succeeded}, errored: {counts.errored}"
                )

            # Sleeping after the final poll would only delay the test
            if i < max_polls - 1:
                time.sleep(poll_interval)

        # Step 3: Verify batch is in the list
        print("Step 3: Verifying batch in list...")
        batch_list = client.beta.messages.batches.list(limit=20)
        batch_ids = [b.id for b in batch_list.data]
        assert batch_id in batch_ids, f"Batch {batch_id} should be in the batch list"
        print(f" Verified batch {batch_id} is in list")

        print(f"Success: E2E completed for batch {batch_id} (provider: {provider})")

    finally:
        # Clean up
        try:
            client.beta.messages.batches.cancel(batch_id)
            print(f"Cleanup: Cancelled batch {batch_id}")
        except Exception as e:
            print(f"Cleanup info: Could not cancel batch: {e}")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("prompt_caching")
)
def test_28_prompt_caching_system(self, anthropic_client, provider, model):
    """Test Case 28: Prompt caching with system message checkpoint

    Sends two requests that share the same system blocks, the second of which
    carries an ephemeral ``cache_control`` marker. The first request is
    validated as a cache write, the second as a cache read, and the two token
    counts must differ by less than 100.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for prompt_caching scenario")

    print(f"\n=== Testing System Message Caching for provider {provider} ===")
    print("First request: Creating cache with system message checkpoint...")

    intro_block = {
        "type": "text",
        "text": "You are an AI assistant tasked with analyzing legal documents.",
    }
    cached_block = {
        "type": "text",
        "text": PROMPT_CACHING_LARGE_CONTEXT,
        "cache_control": {"type": "ephemeral"},
    }
    system_messages = [intro_block, cached_block]

    def ask(question):
        # Every call reuses the identical system blocks (including cache_control)
        return anthropic_client.messages.create(
            model=format_provider_model(provider, model),
            system=system_messages,
            messages=[{"role": "user", "content": question}],
            max_tokens=1024,
        )

    # Request 1 - expected to create the cache entry
    first = ask("What are the key elements of contract formation?")
    assert_valid_chat_response(first)
    assert hasattr(first, "usage"), "Response should have usage information"
    cache_write_tokens = validate_cache_write(first.usage, "First request")

    # Request 2 - same system blocks, expected to hit the cache
    print("\nSecond request: Hitting cache with same system checkpoint...")
    second = ask("What is the purpose of a force majeure clause?")
    assert_valid_chat_response(second)
    cache_read_tokens = validate_cache_read(second.usage, "Second request")

    # Tokens read from cache should roughly equal tokens written to it
    token_gap = abs(cache_write_tokens - cache_read_tokens)
    assert (
        token_gap < 100
    ), f"Cache read tokens ({cache_read_tokens}) should be close to cache creation tokens ({cache_write_tokens})"

    print(
        f"✓ System caching validated - Cache created: {cache_write_tokens} tokens, "
        f"Cache read: {cache_read_tokens} tokens"
    )
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("prompt_caching")
)
def test_29_prompt_caching_messages(self, anthropic_client, provider, model):
    """Test Case 29: Prompt caching with messages checkpoint

    Sends two user messages that share the same leading content blocks (the
    large context block carries an ephemeral ``cache_control`` marker) and
    differ only in the trailing question. The first request is validated as a
    cache write, the second as a cache read, and the two token counts must
    differ by less than 100.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for prompt_caching scenario")

    print(f"\n=== Testing Messages Caching for provider {provider} ===")
    print("First request: Creating cache with messages checkpoint...")

    def ask(question):
        # Shared prefix + cached context block, followed by the varying question
        content_blocks = [
            {"type": "text", "text": "Here is a large legal document to analyze:"},
            {
                "type": "text",
                "text": PROMPT_CACHING_LARGE_CONTEXT,
                "cache_control": {"type": "ephemeral"},
            },
            {"type": "text", "text": question},
        ]
        return anthropic_client.messages.create(
            model=format_provider_model(provider, model),
            messages=[{"role": "user", "content": content_blocks}],
            max_tokens=1024,
        )

    # Request 1 - cache control lives inside the user message
    first = ask("What are the main indemnification principles?")
    assert_valid_chat_response(first)
    assert hasattr(first, "usage"), "Response should have usage information"
    cache_write_tokens = validate_cache_write(first.usage, "First request")

    # Request 2 - identical cached content, different question
    print("\nSecond request: Hitting cache with same messages checkpoint...")
    second = ask("Summarize the dispute resolution methods.")
    assert_valid_chat_response(second)
    cache_read_tokens = validate_cache_read(second.usage, "Second request")

    # Tokens read from cache should roughly equal tokens written to it
    token_gap = abs(cache_write_tokens - cache_read_tokens)
    assert (
        token_gap < 100
    ), f"Cache read tokens ({cache_read_tokens}) should be close to cache creation tokens ({cache_write_tokens})"

    print(
        f"✓ Messages caching validated - Cache created: {cache_write_tokens} tokens, "
        f"Cache read: {cache_read_tokens} tokens"
    )
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("prompt_caching")
)
def test_30_prompt_caching_tools(self, anthropic_client, provider, model):
    """Test Case 30: Prompt caching with tools checkpoint (12 tools)

    Marks the last converted tool with an ephemeral ``cache_control`` entry
    and sends two requests with the same tool list. The first is validated as
    a cache write and the second as a cache read.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for prompt_caching scenario")

    print(f"\n=== Testing Tools Caching for provider {provider} ===")
    print("First request: Creating cache with tools checkpoint...")

    # Anthropic-format tools; the checkpoint goes on the final tool
    tools = convert_to_anthropic_tools(PROMPT_CACHING_TOOLS)
    last_tool = tools[-1]
    last_tool["cache_control"] = {"type": "ephemeral"}

    def ask(prompt):
        # Both requests share the exact same tool list
        return anthropic_client.messages.create(
            model=format_provider_model(provider, model),
            tools=tools,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1024,
        )

    # Request 1 - expected to write the tools into the cache
    first = ask("What's the weather in Boston?")
    assert hasattr(first, "usage"), "Response should have usage information"
    cache_write_tokens = validate_cache_write(first.usage, "First request")

    # Request 2 - same tools, expected to read from the cache
    print("\nSecond request: Hitting cache with same tools checkpoint...")
    second = ask("Calculate 42 * 17")
    cache_read_tokens = validate_cache_read(second.usage, "Second request")

    print(
        f"✓ Tools caching validated - Cache created: {cache_write_tokens} tokens, "
        f"Cache read: {cache_read_tokens} tokens"
    )
|
|
|
|
# =========================================================================
|
|
# INPUT TOKENS / TOKEN COUNTING TEST CASES
|
|
# =========================================================================
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("count_tokens")
)
def test_31a_input_tokens_simple_text(self, anthropic_client, test_config, provider, model):
    """Test Case 31a: Input tokens count with simple text

    Counts tokens for a single user message and expects the total to fall
    in the inclusive range 3-20.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    target_model = format_provider_model(provider, model)
    user_messages = [{"role": "user", "content": INPUT_TOKENS_SIMPLE_TEXT}]
    token_response = anthropic_client.beta.messages.count_tokens(
        model=target_model,
        messages=user_messages,
    )

    # Structural validation
    assert_valid_input_tokens_response(token_response, "anthropic")

    # Short text should land in a small, bounded token range
    token_total = token_response.input_tokens
    assert 3 <= token_total <= 20, f"Simple text should have 3-20 tokens, got {token_total}"
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("count_tokens")
)
def test_31b_input_tokens_with_system_message(
    self, anthropic_client, test_config, provider, model
):
    """Test Case 31b: Input tokens count with system message

    Converts ``INPUT_TOKENS_WITH_SYSTEM`` to Anthropic messages, pulls out the
    first entry with role "system", and counts tokens. The ``system`` argument
    is sent only when such an entry exists, so the request never carries an
    explicit ``system=None``.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    # Convert to Anthropic format
    messages = convert_to_anthropic_messages(INPUT_TOKENS_WITH_SYSTEM)

    # Extract system message if present (first match wins)
    system_message = next(
        (
            msg.get("content")
            for msg in INPUT_TOKENS_WITH_SYSTEM
            if msg.get("role") == "system"
        ),
        None,
    )

    request_kwargs = {
        "model": format_provider_model(provider, model),
        "messages": messages,
    }
    # Omit `system` entirely rather than passing None when nothing was found
    if system_message is not None:
        request_kwargs["system"] = system_message

    response = anthropic_client.beta.messages.count_tokens(**request_kwargs)

    # Validate response structure
    assert_valid_input_tokens_response(response, "anthropic")

    # Sanity lower bound on the counted tokens
    assert (
        response.input_tokens > 2
    ), f"With system message should have >2 tokens, got {response.input_tokens}"
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("count_tokens")
)
def test_31c_input_tokens_long_text(self, anthropic_client, test_config, provider, model):
    """Test Case 31c: Input tokens count with long text

    Counts tokens for a single long user message and expects more than 100.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    target_model = format_provider_model(provider, model)
    user_messages = [{"role": "user", "content": INPUT_TOKENS_LONG_TEXT}]
    token_response = anthropic_client.beta.messages.count_tokens(
        model=target_model,
        messages=user_messages,
    )

    # Structural validation
    assert_valid_input_tokens_response(token_response, "anthropic")

    # A long prompt must produce a clearly larger count
    token_total = token_response.input_tokens
    assert token_total > 100, f"Long text should have >100 tokens, got {token_total}"
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("file_input"))
def test_31_document_pdf_input(self, anthropic_client, test_config, provider, model):
    """Test Case 31: PDF document input

    Sends a base64 PDF document block alongside a summarization prompt and
    expects the first content block to be text mentioning "hello" or "world".
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for document_input scenario")

    prompt_block = {
        "type": "text",
        "text": "What is the main content of this PDF document? Summarize it.",
    }
    pdf_block = {
        "type": "document",
        "title": "testing",
        "source": {
            "type": "base64",
            "media_type": "application/pdf",
            "data": FILE_DATA_BASE64,
        },
    }
    messages = [{"role": "user", "content": [prompt_block, pdf_block]}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=500
    )

    assert_valid_chat_response(response)
    assert len(response.content) > 0
    first_block = response.content[0]
    assert first_block.type == "text"
    content = first_block.text.lower()

    # The answer should echo the PDF's "hello world" text
    mentions_document = "hello" in content or "world" in content
    assert mentions_document, f"Response should reference document content. Got: {content}"
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("file_input_text")
)
def test_32_document_text_input(self, anthropic_client, test_config, provider, model):
    """Test Case 32: Text document input

    Sends a plain-text document block alongside a question about its key
    features and expects the first content block to be text containing at
    least one of the document keywords.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for document_input scenario")

    # Plain text document content, assembled line by line
    text_content = (
        "This is a test text document for document input testing.\n"
        "\n"
        "It contains multiple paragraphs to ensure the model can properly process text documents.\n"
        "\n"
        "Key features of this document:\n"
        "1. Multiple lines and structure\n"
        "2. Clear formatting\n"
        "3. Numbered list\n"
        "\n"
        "This document is used to verify that the AI can read and understand text document inputs."
    )

    prompt_block = {
        "type": "text",
        "text": "What are the key features mentioned in this document?",
    }
    text_document_block = {
        "type": "document",
        "title": "testing",
        "source": {
            "type": "text",
            "media_type": "text/plain",
            "data": text_content,
        },
    }
    messages = [{"role": "user", "content": [prompt_block, text_document_block]}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=500
    )

    assert_valid_chat_response(response)
    assert len(response.content) > 0
    first_block = response.content[0]
    assert first_block.type == "text"
    content = first_block.text.lower()

    # The answer should touch on at least one of the document's features
    document_keywords = ["feature", "line", "format", "list", "document"]
    matched = [word for word in document_keywords if word in content]
    assert matched, f"Response should reference document features. Got: {content}"
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("citations"))
def test_33_citations_pdf(self, anthropic_client, test_config, provider, model):
    """Test Case 33: PDF document with page_location citations

    Sends a PDF document block with a request to cite sources, then runs the
    shared citation validator over every citation attached to the response
    blocks. Fails if no block carries citations.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for citations scenario")

    print(f"\n=== Testing PDF Citations (page_location) for provider {provider} ===")

    # Build the PDF document block with the shared helper
    document = create_anthropic_document(
        content=FILE_DATA_BASE64, doc_type="pdf", title="Test PDF Document"
    )
    prompt_block = {
        "type": "text",
        "text": "What does this PDF document say? Please cite your sources.",
    }
    messages = [{"role": "user", "content": [prompt_block, document]}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=500
    )

    # Basic response shape
    assert_valid_chat_response(response)
    assert len(response.content) > 0

    # Walk every block and validate each citation it carries
    has_citations = False
    citation_count = 0
    for block in response.content:
        if not (hasattr(block, "citations") and block.citations):
            continue
        has_citations = True
        for citation in block.citations:
            citation_count += 1
            assert_valid_anthropic_citation(
                citation, expected_type="page_location", document_index=0
            )
            page_span = f"{citation.start_page_number}-{citation.end_page_number}"
            print(
                f"✓ Citation {citation_count}: pages {page_span}, "
                f"text: '{citation.cited_text[:50]}...'"
            )

    assert has_citations, "Response should contain citations for PDF document"
    print(f"✓ PDF citations test passed - Found {citation_count} citations")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("citations"))
def test_34_citations_text(self, anthropic_client, test_config, provider, model):
    """Test Case 34: Plain text document with char_location citations

    Sends a text document block with a question that asks for cited sources,
    then validates every citation on the response blocks as ``char_location``
    against document index 0. Fails if no block carries citations.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for citations scenario")

    print(f"\n=== Testing Text Citations (char_location) for provider {provider} ===")

    # Build the text document block with the shared helper
    document = create_anthropic_document(
        content=CITATION_TEXT_DOCUMENT, doc_type="text", title="Theory of Relativity Overview"
    )
    prompt_block = {
        "type": "text",
        "text": "When was General Relativity published and what does it deal with? Please cite your sources.",
    }
    messages = [{"role": "user", "content": [prompt_block, document]}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=500
    )

    # Basic response shape
    assert_valid_chat_response(response)
    assert len(response.content) > 0

    # Walk every block and validate each citation it carries
    has_citations = False
    citation_count = 0
    for block in response.content:
        if not (hasattr(block, "citations") and block.citations):
            continue
        has_citations = True
        for citation in block.citations:
            citation_count += 1
            assert_valid_anthropic_citation(
                citation, expected_type="char_location", document_index=0
            )
            char_span = f"{citation.start_char_index}-{citation.end_char_index}"
            print(
                f"✓ Citation {citation_count}: chars {char_span}, "
                f"text: '{citation.cited_text[:50]}...'"
            )

    assert has_citations, "Response should contain citations for text document"
    print(f"✓ Text citations test passed - Found {citation_count} citations")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("citations"))
def test_35_citations_multi_document(self, anthropic_client, test_config, provider, model):
    """Test Case 35: Multiple documents with citations (document_index validation)

    Builds one text document block per entry of ``CITATION_MULTI_DOCUMENT_SET``,
    asks the model to summarise and cite each, and validates every returned
    citation as ``char_location``. Each citation's ``document_index`` must
    refer to one of the documents that were actually sent. A per-document
    citation tally is printed at the end.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for citations scenario")

    print(f"\n=== Testing Multi-Document Citations for provider {provider} ===")

    # One document block per fixture entry
    documents = [
        create_anthropic_document(
            content=doc_info["content"], doc_type="text", title=doc_info["title"]
        )
        for doc_info in CITATION_MULTI_DOCUMENT_SET
    ]

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Summarize what each document says. Please cite your sources from each document.",
                },
                *documents,
            ],
        }
    ]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model), messages=messages, max_tokens=600
    )

    # Validate basic response
    assert_valid_chat_response(response)
    assert len(response.content) > 0

    # One counter per document sent, so the tally follows the fixture's size
    # instead of assuming exactly two documents.
    citations_by_doc = {doc_idx: 0 for doc_idx in range(len(documents))}
    total_citations = 0

    for block in response.content:
        for citation in getattr(block, "citations", None) or []:
            total_citations += 1
            # Fall back to the first document when the field is absent
            doc_idx = getattr(citation, "document_index", 0)

            # The index must point at a document that was part of the request
            assert doc_idx in citations_by_doc, (
                f"Citation document_index {doc_idx!r} is outside the "
                f"{len(documents)} documents sent"
            )

            assert_valid_anthropic_citation(
                citation, expected_type="char_location", document_index=doc_idx
            )
            citations_by_doc[doc_idx] += 1

            doc_title = getattr(citation, "document_title", "Unknown")
            print(
                f"✓ Citation from doc[{doc_idx}] ({doc_title}): "
                f"chars {citation.start_char_index}-{citation.end_char_index}, "
                f"text: '{citation.cited_text[:40]}...'"
            )

    assert total_citations > 0, "Response should contain citations"

    # Report statistics
    print("\n✓ Multi-document citations test passed:")
    print(f" - Total citations: {total_citations}")
    for doc_idx, count in citations_by_doc.items():
        doc_title = CITATION_MULTI_DOCUMENT_SET[doc_idx]["title"]
        print(f" - Document {doc_idx} ({doc_title}): {count} citations")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("citations"))
def test_36_citations_streaming(self, anthropic_client, test_config, provider, model):
    """Test Case 36: Text citations with streaming (citations_delta)

    Streams a response for a text document and delegates collection of text,
    citations and chunk count to ``collect_anthropic_streaming_citations``.
    Requires at least one chunk, non-empty text and at least one citation,
    and validates each citation as ``char_location`` for document index 0.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for citations scenario")

    print(f"\n=== Testing Streaming Citations (char_location) for provider {provider} ===")

    # Build the text document block with the shared helper
    document = create_anthropic_document(
        content=CITATION_TEXT_DOCUMENT, doc_type="text", title="Machine Learning Introduction"
    )
    prompt_block = {
        "type": "text",
        "text": "Explain the key concepts from this document. Please cite your sources.",
    }
    messages = [{"role": "user", "content": [prompt_block, document]}]

    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        max_tokens=500,
        stream=True,
    )

    # The helper drains the stream and returns what it gathered
    complete_text, citations, chunk_count = collect_anthropic_streaming_citations(stream)

    assert chunk_count > 0, "Should receive at least one chunk"
    assert len(complete_text) > 0, "Should receive text content"
    assert len(citations) > 0, "Should collect at least one citation from stream"

    # Every streamed citation goes through the shared validator
    for position, citation in enumerate(citations, start=1):
        assert_valid_anthropic_citation(
            citation, expected_type="char_location", document_index=0
        )
        char_span = f"{citation.start_char_index}-{citation.end_char_index}"
        print(
            f"✓ Citation {position}: chars {char_span}, "
            f"text: '{citation.cited_text[:50]}...'"
        )

    print(
        f"✓ Streaming citations test passed - {len(citations)} citations in {chunk_count} chunks"
    )
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("citations"))
def test_37_citations_streaming_pdf(self, anthropic_client, test_config, provider, model):
    """Test Case 37: PDF citations with streaming (page_location + citations_delta)

    Streams a response for a PDF document and delegates collection of text,
    citations and chunk count to ``collect_anthropic_streaming_citations``.
    Requires at least one chunk, non-empty text and at least one citation,
    and validates each citation as ``page_location`` for document index 0.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for citations scenario")

    print(f"\n=== Testing Streaming PDF Citations (page_location) for provider {provider} ===")

    # Build the PDF document block with the shared helper
    document = create_anthropic_document(
        content=FILE_DATA_BASE64, doc_type="pdf", title="Test PDF Document"
    )
    prompt_block = {"type": "text", "text": "What does this PDF say? Please cite your sources."}
    messages = [{"role": "user", "content": [prompt_block, document]}]

    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        max_tokens=500,
        stream=True,
    )

    # The helper drains the stream and returns what it gathered
    complete_text, citations, chunk_count = collect_anthropic_streaming_citations(stream)

    assert chunk_count > 0, "Should receive at least one chunk"
    assert len(complete_text) > 0, "Should receive text content"
    assert len(citations) > 0, "Should collect at least one citation from stream"

    # PDF sources are expected to yield page_location citations
    for position, citation in enumerate(citations, start=1):
        assert_valid_anthropic_citation(
            citation, expected_type="page_location", document_index=0
        )
        page_span = f"{citation.start_page_number}-{citation.end_page_number}"
        print(
            f"✓ Citation {position}: pages {page_span}, "
            f"text: '{citation.cited_text[:50]}...'"
        )

    print(
        f"✓ Streaming PDF citations test passed - {len(citations)} citations in {chunk_count} chunks"
    )
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_38_web_search_non_streaming(self, anthropic_client, test_config, provider, model):
    """Test Case 38: Web search tool (non-streaming)

    Issues a request with the web search server tool enabled and requires the
    response to contain a ``server_tool_use`` block named ``web_search`` with a
    query, plus a ``web_search_tool_result`` block. Citations on text blocks
    are optional; when present, the first three are checked for the expected
    attributes.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search (Non-Streaming) for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 5}
    messages = [{"role": "user", "content": "What is a positive news story from today?"}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    # Validate basic response
    assert response is not None, "Response should not be None"
    assert hasattr(response, "content"), "Response should have content"
    assert len(response.content) > 0, "Content should not be empty"

    has_web_search = False
    has_search_results = False
    search_query = None

    for block in response.content:
        block_type = getattr(block, "type", None)

        if block_type == "server_tool_use" and getattr(block, "name", None) == "web_search":
            has_web_search = True
            tool_input = getattr(block, "input", None)
            if tool_input is not None and "query" in tool_input:
                search_query = tool_input["query"]
                print(f"✓ Found web search with query: {search_query}")

        elif block_type == "web_search_tool_result":
            has_search_results = True
            results = getattr(block, "content", None)
            if isinstance(results, (list, tuple)):
                if results:
                    print(f"✓ Found {len(results)} search results")
                    # Log first few results
                    for i, result in enumerate(results[:3], start=1):
                        if hasattr(result, "url") and hasattr(result, "title"):
                            print(f" Result {i}: {result.title}")
            elif results:
                # The result content can be a single error object instead of a
                # list of results; report it rather than calling len() on it.
                error_code = getattr(results, "error_code", None)
                print(f"⚠ Web search returned an error result: {error_code}")

        elif block_type == "text":
            block_citations = getattr(block, "citations", None)
            if block_citations:
                print(f"✓ Found {len(block_citations)} citations in response")
                # Validate citation structure on a small sample
                for citation in block_citations[:3]:
                    assert hasattr(citation, "type"), "Citation should have type"
                    assert hasattr(citation, "url"), "Citation should have URL"
                    assert hasattr(citation, "title"), "Citation should have title"
                    assert hasattr(citation, "cited_text"), "Citation should have cited_text"
                    print(f" Citation: {citation.title}")

    # Validate that web search was performed
    assert has_web_search, "Response should contain web_search tool use"
    assert has_search_results, "Response should contain web search results"
    assert search_query is not None, "Web search should have a query"

    print("✓ Web search (non-streaming) test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_39_web_search_streaming(self, anthropic_client, test_config, provider, model):
    """Test Case 39: Web search tool (streaming)

    Streams a request with the web search server tool (including an
    approximate user location) and inspects the events: ``content_block_start``
    for the tool use and its result block, ``content_block_delta`` for text
    and citation deltas. Requires a tool use, a result block with at least
    one collected result, and non-empty streamed text.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search (Streaming) for provider {provider} ===")

    # Create web search tool with user location
    web_search_tool = {
        "type": "web_search_20250305",
        "name": "web_search",
        "max_uses": 5,
        "user_location": {
            "type": "approximate",
            "city": "New York",
            "region": "New York",
            "country": "US",
            "timezone": "America/New_York",
        },
    }

    messages = [{"role": "user", "content": "what was a positive news story from today??"}]

    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
        stream=True,
    )

    # Collect streaming events
    text_parts = []
    search_results = []
    citations = []
    chunk_count = 0
    has_server_tool_use = False
    has_search_tool_result = False

    for event in stream:
        chunk_count += 1
        event_type = getattr(event, "type", None)

        if event_type == "content_block_start":
            block = getattr(event, "content_block", None)
            block_type = getattr(block, "type", None)

            if block_type == "server_tool_use":
                if getattr(block, "name", None) == "web_search":
                    has_server_tool_use = True
                    block_id = getattr(block, "id", "unknown")
                    print(f"✓ Web search tool use started (block id: {block_id})")

            elif block_type == "web_search_tool_result":
                has_search_tool_result = True
                results = getattr(block, "content", None)
                # The content can be an error object instead of a list of
                # results; only a list is counted and collected.
                if isinstance(results, (list, tuple)) and results:
                    print(f"✓ Received {len(results)} search results")
                    search_results.extend(
                        {"url": result.url, "title": result.title}
                        for result in results
                        if hasattr(result, "url") and hasattr(result, "title")
                    )

        elif event_type == "content_block_delta":
            delta = getattr(event, "delta", None)
            delta_type = getattr(delta, "type", None)

            if delta_type == "text_delta":
                if hasattr(delta, "text"):
                    text_parts.append(delta.text)

            elif delta_type == "citations_delta":
                if hasattr(delta, "citation"):
                    citation = delta.citation
                    citations.append(citation)
                    if hasattr(citation, "title"):
                        print(f" Received citation: {citation.title}")

        # Safety check against a runaway stream
        if chunk_count > 5000:
            break

    # Combine collected content
    complete_text = "".join(text_parts)

    # Validate results
    assert chunk_count > 0, "Should receive at least one chunk"
    assert has_server_tool_use, "Should detect web search tool use in streaming"
    assert has_search_tool_result, "Should receive search results in streaming"
    assert len(search_results) > 0, "Should collect search results from stream"
    assert len(complete_text) > 0, "Should receive text content in the streamed response"

    print("✓ Streaming validation:")
    print(f" - Chunks received: {chunk_count}")
    print(f" - Search results: {len(search_results)}")
    print(f" - Citations: {len(citations)}")
    print(f" - Text length: {len(complete_text)} characters")
    print(f" - First 150 chars: {complete_text[:150]}...")

    # Log a few search results
    if search_results:
        print("✓ Search results:")
        for i, result in enumerate(search_results[:3], start=1):
            print(f" {i}. {result['title']}")

    print("✓ Web search (streaming) test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_40_web_search_allowed_domains(self, anthropic_client, test_config, provider, model):
    """Test Case 40: Web search with allowed_domains filter

    Runs a web search restricted by ``allowed_domains`` and, when any results
    with both ``url`` and ``title`` come back, passes them to
    ``validate_domain_filter``. A response without such results still passes.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search with Allowed Domains for provider {provider} ===")

    # Web search tool limited to two reference sites
    web_search_tool = {
        "type": "web_search_20250305",
        "name": "web_search",
        "allowed_domains": ["en.wikipedia.org", "britannica.com"],
        "max_uses": 5,
    }
    user_turn = {
        "role": "user",
        "content": "Who was Albert Einstein? Please search for this information.",
    }
    messages = [user_turn]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    # Basic response shape
    assert response is not None, "Response should not be None"
    assert hasattr(response, "content"), "Response should have content"
    assert len(response.content) > 0, "Content should not be empty"

    # Gather every result entry from the web search result blocks
    search_results = []
    for block in response.content:
        if not (hasattr(block, "type") and block.type == "web_search_tool_result"):
            continue
        if not (hasattr(block, "content") and block.content):
            continue
        for result in block.content:
            if hasattr(result, "url") and hasattr(result, "title"):
                search_results.append(result)
                print(f"✓ Found result: {result.title} - {result.url}")

    # Validate domain filtering
    from .utils.common import validate_domain_filter

    if search_results:
        validate_domain_filter(search_results, allowed=["wikipedia.org", "britannica.com"])
        print(f"✓ All {len(search_results)} results respect allowed_domains filter")

    print("✓ Web search with allowed_domains test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_41_web_search_blocked_domains(self, anthropic_client, test_config, provider, model):
    """Test Case 41: Web search with blocked_domains filter

    Runs a web search with ``blocked_domains`` set and, when any results with
    a ``url`` come back, passes them to ``validate_domain_filter``. Skipped
    for the ``openai`` provider.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    # skip for openai
    if provider == "openai":
        pytest.skip("OpenAI does not support blocked_domains filter")

    print(f"\n=== Testing Web Search with Blocked Domains for provider {provider} ===")

    # Web search tool that excludes a few social sites
    web_search_tool = {
        "type": "web_search_20250305",
        "name": "web_search",
        "blocked_domains": ["reddit.com", "twitter.com", "x.com"],
        "max_uses": 5,
    }
    user_turn = {
        "role": "user",
        "content": "What are recent developments in artificial intelligence?",
    }
    messages = [user_turn]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    # Basic response shape
    assert response is not None, "Response should not be None"
    assert hasattr(response, "content"), "Response should have content"

    # Gather every result entry from the web search result blocks
    search_results = []
    for block in response.content:
        if not (hasattr(block, "type") and block.type == "web_search_tool_result"):
            continue
        if not (hasattr(block, "content") and block.content):
            continue
        for result in block.content:
            if hasattr(result, "url"):
                search_results.append(result)
                print(f"✓ Found result: {result.url}")

    # Validate domain filtering
    from .utils.common import validate_domain_filter

    if search_results:
        validate_domain_filter(search_results, blocked=["reddit.com", "twitter.com", "x.com"])
        print(f"✓ All {len(search_results)} results respect blocked_domains filter")

    print("✓ Web search with blocked_domains test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_42_web_search_multi_turn(self, anthropic_client, test_config, provider, model):
    """Test Case 42: Web search in multi-turn conversation

    Makes one web-search-enabled request, replays the serialized assistant
    content back as conversation history with a follow-up question, and
    requires the second response to contain a non-empty text block.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search Multi-Turn Conversation for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 5}
    target_model = format_provider_model(provider, model)

    # Turn one: opening question
    messages = [{"role": "user", "content": "What is quantum computing?"}]
    response1 = anthropic_client.messages.create(
        model=target_model,
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    assert response1 is not None, "First response should not be None"
    print("✓ First turn completed")

    # Turn two: feed the assistant's reply back, then ask the follow-up
    assistant_turn = {
        "role": "assistant",
        "content": serialize_anthropic_content(response1.content),
    }
    follow_up_turn = {"role": "user", "content": "How is it different from classical computing?"}
    messages.extend([assistant_turn, follow_up_turn])

    response2 = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    assert response2 is not None, "Second response should not be None"
    assert hasattr(response2, "content"), "Second response should have content"
    assert len(response2.content) > 0, "Second response content should not be empty"

    # The follow-up must produce at least one non-empty text block
    has_text_response = False
    for block in response2.content:
        is_text_block = hasattr(block, "type") and block.type == "text"
        if is_text_block and hasattr(block, "text") and len(block.text) > 0:
            has_text_response = True
            print(f"✓ Second turn response (first 150 chars): {block.text[:150]}...")

    assert has_text_response, "Second turn should have text response"
    print("✓ Multi-turn web search conversation test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_43_web_search_citation_validation(
    self, anthropic_client, test_config, provider, model
):
    """Test Case 43: Validate web search citation structure

    Collects citations from every text block of a web-search-enabled response
    and runs ``assert_valid_web_search_citation`` on the first three. A
    response with no citations is reported but does not fail the test.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search Citation Validation for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 5}
    messages = [{"role": "user", "content": "What is the capital of France?"}]

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
    )

    # Flatten the citations of all text blocks into one list
    citations_found = [
        citation
        for block in response.content
        if hasattr(block, "type") and block.type == "text"
        if hasattr(block, "citations") and block.citations
        for citation in block.citations
    ]

    # Validate citation structure
    from .utils.common import assert_valid_web_search_citation

    if not citations_found:
        print("⚠ No citations found (may be acceptable)")
    else:
        print(f"✓ Found {len(citations_found)} citations")
        for number, citation in enumerate(citations_found[:3], start=1):
            assert_valid_web_search_citation(citation, sdk_type="anthropic")
            print(f" Citation {number}: {citation.title}")
            print(f" URL: {citation.url}")
            excerpt = citation.cited_text[:50] if citation.cited_text else "N/A"
            print(f" Cited text (first 50 chars): {excerpt}...")
        print("✓ All citations have valid structure")

    print("✓ Citation validation test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_44_web_search_streaming_event_order(
    self, anthropic_client, test_config, provider, model
):
    """Test Case 44: Validate web search streaming event sequence

    Streams a web-search-enabled request, records the ``type`` of every event,
    and checks that the four structural event types are present and appear in
    a consistent relative order: ``message_start`` before the first
    ``content_block_start``, which precedes the first ``content_block_stop``,
    with ``message_stop`` after them.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search Streaming Event Order for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 3}
    messages = [{"role": "user", "content": "What is the Eiffel Tower?"}]

    stream = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=2048,
        stream=True,
    )

    # Track event sequence
    event_sequence = []
    for event in stream:
        if not hasattr(event, "type"):
            continue
        event_type = event.type
        event_sequence.append(event_type)

        # Log key events
        if event_type == "content_block_start":
            if hasattr(event, "content_block"):
                block_type = getattr(event.content_block, "type", "unknown")
                print(f"✓ Event: content_block_start ({block_type})")
        elif event_type == "content_block_stop":
            print("✓ Event: content_block_stop")
        elif event_type == "content_block_delta":
            if getattr(getattr(event, "delta", None), "type", None) == "input_json_delta":
                print("✓ Event: content_block_delta (input_json_delta)")

    # Validate expected event types are present
    assert "message_start" in event_sequence, "Should have message_start event"
    assert "content_block_start" in event_sequence, "Should have content_block_start events"
    assert "content_block_stop" in event_sequence, "Should have content_block_stop events"
    assert "message_stop" in event_sequence, "Should have message_stop event"

    # Validate relative order. Other event types (e.g. deltas) may be
    # interleaved, so only the positions of the structural events are compared.
    message_start_at = event_sequence.index("message_start")
    block_start_at = event_sequence.index("content_block_start")
    block_stop_at = event_sequence.index("content_block_stop")
    message_stop_at = len(event_sequence) - 1 - event_sequence[::-1].index("message_stop")
    assert message_start_at < block_start_at < block_stop_at < message_stop_at, (
        "Events out of order: expected message_start < content_block_start < "
        f"content_block_stop < message_stop, got positions {message_start_at}, "
        f"{block_start_at}, {block_stop_at}, {message_stop_at}"
    )

    print(f"✓ Received {len(event_sequence)} total events")
    print("✓ Event sequence validation passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_45_web_search_with_prompt_caching(
    self, anthropic_client, test_config, provider, model
):
    """Test Case 45: Web search with prompt caching

    Makes a web-search-enabled request, then a follow-up whose user turn
    carries an ephemeral ``cache_control`` marker, and logs the cache token
    counters reported in each response's ``usage``. Only non-``None``
    responses are asserted; cache hits are reported, not required.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search with Prompt Caching for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 3}

    # First request (no cache_control marker yet)
    messages = [{"role": "user", "content": "What is the current population of Tokyo?"}]

    response1 = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=1500,
    )

    assert response1 is not None, "First response should not be None"

    # Report any cache write. The counter can be present but None, so it is
    # normalised to 0 before use.
    if hasattr(response1, "usage"):
        cache_write_tokens = getattr(response1.usage, "cache_creation_input_tokens", 0) or 0
        print(f"✓ First request - cache_creation_input_tokens: {cache_write_tokens}")

    # Replay the assistant reply as history
    messages.append(
        {"role": "assistant", "content": serialize_anthropic_content(response1.content)}
    )

    # Follow-up user turn carrying the cache_control marker
    messages.append(
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What about its GDP?",
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        }
    )

    response2 = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=messages,
        tools=[web_search_tool],
        max_tokens=1500,
    )

    assert response2 is not None, "Second response should not be None"

    # Report any cache read; `or 0` avoids comparing None with an int below
    if hasattr(response2, "usage"):
        cache_read_tokens = getattr(response2.usage, "cache_read_input_tokens", 0) or 0
        print(f"✓ Second request - cache_read_input_tokens: {cache_read_tokens}")

        if cache_read_tokens > 0:
            print(f"✓ Successfully read {cache_read_tokens} tokens from cache")

    print("✓ Prompt caching test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_47_web_search_error_handling(self, anthropic_client, test_config, provider, model):
    """Test Case 47: Web search error code handling

    Sends a long, repetitive prompt with the web search tool enabled. An
    exception raised by the request itself is tolerated and logged. When a
    response is returned, its basic structure is asserted and any error code
    found on a ``web_search_tool_result`` block is reported.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search Error Handling for provider {provider} ===")

    web_search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 5}

    # Try with an extremely long query that might trigger query_too_long error
    very_long_query = "What is " + ("the meaning of life and the universe " * 50)
    messages = [
        {"role": "user", "content": very_long_query[:1000]}  # Limit to reasonable length
    ]

    # Only the request sits inside the try, so the assertions below can no
    # longer be swallowed by the broad handler.
    try:
        response = anthropic_client.messages.create(
            model=format_provider_model(provider, model),
            messages=messages,
            tools=[web_search_tool],
            max_tokens=2048,
        )
    except Exception as e:
        # Deliberately broad: some error scenarios surface as exceptions
        print(f"✓ Exception caught (expected for error scenarios): {type(e).__name__}")
    else:
        # Check response structure
        assert response is not None, "Response should not be None"
        assert hasattr(response, "content"), "Response should have content"

        # Look for any error structures in the response
        has_error = False
        for block in response.content:
            if getattr(block, "type", None) != "web_search_tool_result":
                continue
            result_content = getattr(block, "content", None)
            # Accept both a raw dict and an object exposing `error_code`
            if isinstance(result_content, dict):
                error_code = result_content.get("error_code")
            else:
                error_code = getattr(result_content, "error_code", None)
            if error_code is not None:
                has_error = True
                print(f"✓ Found error code: {error_code}")

        if not has_error:
            print("✓ Request handled successfully (no errors triggered)")

    print("✓ Error handling test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_48_web_search_no_results_graceful(
    self, anthropic_client, test_config, provider, model
):
    """Test Case 48: Web search with query that may return no results.

    Issues a nonsensical query and checks that the response still contains
    both a ``web_search`` server tool invocation and a text block.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search No Results Handling for provider {provider} ===")

    search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 3}

    # A very specific / nonsensical query that is unlikely to match anything
    prompt = "Find information about xyzabc123nonexistent456topic789"

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=[{"role": "user", "content": prompt}],
        tools=[search_tool],
        max_tokens=2048,
    )

    # The request must be handled gracefully
    assert response is not None, "Response should not be None"
    assert hasattr(response, "content"), "Response should have content"
    assert len(response.content) > 0, "Content should not be empty"

    # Scan the blocks for a search attempt and for answer text
    search_seen = False
    text_seen = False

    for item in response.content:
        if not hasattr(item, "type"):
            continue

        kind = item.type
        if kind == "server_tool_use" and hasattr(item, "name") and item.name == "web_search":
            search_seen = True
            print("✓ Web search was attempted")
        elif kind == "text" and hasattr(item, "text"):
            text_seen = True
            print(f"✓ Response text present (first 100 chars): {item.text[:100]}...")

    assert search_seen, "Should attempt web search"
    assert text_seen, "Should provide text response even with no/few results"

    print("✓ No results graceful handling test passed!")
|
|
|
|
@pytest.mark.parametrize("provider,model", get_cross_provider_params_for_scenario("web_search"))
def test_49_web_search_sources_validation(self, anthropic_client, test_config, provider, model):
    """Test Case 49: Comprehensive web search sources validation.

    Gathers every ``web_search_result`` entry found inside
    ``web_search_tool_result`` blocks and, when any exist, validates them
    with ``assert_web_search_sources_valid`` and prints the first three.
    """
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for web_search scenario")

    print(f"\n=== Testing Web Search Sources Validation for provider {provider} ===")

    search_tool = {"type": "web_search_20250305", "name": "web_search", "max_uses": 5}
    question = "What are the main programming languages used for web development?"

    response = anthropic_client.messages.create(
        model=format_provider_model(provider, model),
        messages=[{"role": "user", "content": question}],
        tools=[search_tool],
        max_tokens=2048,
    )

    # Flatten the result entries of every web search result block
    found = [
        entry
        for blk in response.content
        if hasattr(blk, "type") and blk.type == "web_search_tool_result"
        if hasattr(blk, "content") and blk.content
        for entry in blk.content
        if hasattr(entry, "type") and entry.type == "web_search_result"
    ]

    # Validation helper lives in the shared test utilities
    from .utils.common import assert_web_search_sources_valid

    if not found:
        print("⚠ No search sources found (may indicate no search was performed)")
    else:
        assert_web_search_sources_valid(found)
        print(f"✓ Found and validated {len(found)} search sources")

        # Show details for the first few sources only
        for number, src in enumerate(found[:3], start=1):
            print(f" Source {number}:")
            print(f" URL: {src.url}")
            print(f" Title: {getattr(src, 'title', 'N/A')}")
            if hasattr(src, "page_age"):
                print(f" Page age: {src.page_age}")
            if hasattr(src, "encrypted_content"):
                print(" Encrypted content: Present")

    print("✓ Sources validation test passed!")
|
|
|
|
# =========================================================================
|
|
# Async Inference Tests
|
|
# =========================================================================
|
|
|
|
@pytest.mark.parametrize(
    "provider,model", get_cross_provider_params_for_scenario("simple_chat")
)
def test_50_async_messages(self, anthropic_client, test_config, provider, model):
    """Test Case 50: Async messages - submit and poll.

    Submits the request with the ``x-bf-async`` header.  If the reply already
    carries content it is validated immediately; otherwise the same request
    is re-sent with ``x-bf-async-id`` every two seconds (up to 30 times)
    until content shows up.
    """
    _ = test_config
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for this scenario")

    def _check_first_block(message):
        # The first content block must be non-empty text
        assert message.content[0].type == "text"
        assert len(message.content[0].text) > 0
        print(f" Result: {message.content[0].text[:80]}...")

    print(f"\n=== Testing Async Messages for provider {provider} ===")
    chat_messages = convert_to_anthropic_messages(SIMPLE_CHAT_MESSAGES)

    request_params = dict(
        model=format_provider_model(provider, model),
        messages=chat_messages,
        max_tokens=100,
    )

    # Submit the async job
    initial = anthropic_client.messages.create(
        **request_params,
        extra_headers={"x-bf-async": "true"},
    )

    assert initial.id is not None, "Async response should have an ID"
    print(f" Async job ID: {initial.id}")

    # Content already present means the job finished synchronously
    if initial.content and len(initial.content) > 0:
        print(" Status: completed (sync)")
        _check_first_block(initial)
        return

    print(" Status: processing")

    # Poll until the job reports content
    max_polls = 30
    for attempt in range(1, max_polls + 1):
        time.sleep(2)
        print(f" Polling attempt {attempt}/{max_polls}...")

        poll = anthropic_client.messages.create(
            **request_params,
            extra_headers={"x-bf-async-id": initial.id},
        )

        if not (poll.content and len(poll.content) > 0):
            continue

        print(" Status: completed")
        _check_first_block(poll)
        print("✓ Async messages test passed!")
        return

    pytest.fail(f"Async job did not complete after {max_polls} polls")
|
|
|
|
# =========================================================================
|
|
# Passthrough Tests
|
|
# =========================================================================
|
|
|
|
@pytest.mark.parametrize(
    "provider,model",
    get_cross_provider_params_for_scenario("simple_chat", include_providers=["anthropic"]),
)
def test_51_passthrough_messages(self, test_config, provider, model):
    """Test Case 51: Passthrough messages (non-streaming) - sends request directly to Anthropic API.

    Uses a client built with ``passthrough=True`` and the bare model name,
    then checks that the first content block is non-empty text.
    """
    _ = test_config
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for passthrough scenario")

    print(f"\n=== Testing Passthrough Messages (non-streaming) for provider {provider} ===")

    passthrough_client = get_provider_anthropic_client(provider, passthrough=True)
    chat_messages = convert_to_anthropic_messages(SIMPLE_CHAT_MESSAGES)

    response = passthrough_client.messages.create(
        model=model,
        messages=chat_messages,
        max_tokens=100,
    )

    assert_valid_chat_response(response)
    assert len(response.content) > 0
    assert response.content[0].type == "text"
    assert len(response.content[0].text) > 0

    print(f" Response: {response.content[0].text[:80]}...")
    print("✓ Passthrough messages test passed!")
|
|
|
|
@pytest.mark.parametrize(
    "provider,model",
    get_cross_provider_params_for_scenario("simple_chat", include_providers=["anthropic"]),
)
def test_52_passthrough_messages_streaming(self, test_config, provider, model):
    """Test Case 52: Passthrough messages (streaming) - streams response directly from Anthropic API.

    Streams a reply through a ``passthrough=True`` client and checks that
    chunks and text were received and that no tool calls were detected.
    """
    _ = test_config
    if provider == "_no_providers_" or model == "_no_model_":
        pytest.skip("No providers configured for passthrough scenario")

    print(f"\n=== Testing Passthrough Messages (streaming) for provider {provider} ===")

    passthrough_client = get_provider_anthropic_client(provider, passthrough=True)
    chat_messages = convert_to_anthropic_messages(STREAMING_CHAT_MESSAGES)

    event_stream = passthrough_client.messages.create(
        model=model,
        messages=chat_messages,
        max_tokens=200,
        stream=True,
    )

    collected = collect_streaming_content(event_stream, "anthropic", timeout=300)
    content, chunk_count, tool_calls_detected = collected

    assert chunk_count > 0, "Should receive at least one chunk"
    assert len(content) > 0, "Should receive non-empty streamed content"
    assert not tool_calls_detected, "Basic passthrough streaming should not have tool calls"

    print(f" Received {chunk_count} chunks, total content length: {len(content)}")
    print("✓ Passthrough streaming test passed!")
|
|
|
|
|
|
# Additional helper functions specific to Anthropic
|
|
def serialize_anthropic_content(content_blocks: List[Any]) -> List[Dict[str, Any]]:
    """Convert Anthropic content blocks into plain dictionaries.

    Args:
        content_blocks: Blocks to serialize.  Items exposing a ``type``
            attribute are converted; anything else is passed through as is.

    Returns:
        One entry per input block, in the same order.
    """

    def _as_dict(item: Any) -> Any:
        # Items without a ``type`` attribute (e.g. dicts) are kept untouched
        if not hasattr(item, "type"):
            return item

        kind = item.type
        if kind == "tool_use":
            return {"type": "tool_use", "id": item.id, "name": item.name, "input": item.input}
        if kind == "text":
            return {"type": "text", "text": item.text}

        # Any other block type: prefer model_dump(), fall back to dict()
        if hasattr(item, "model_dump"):
            return item.model_dump()
        return dict(item)

    return [_as_dict(item) for item in content_blocks]
|
|
|
|
|
|
def extract_anthropic_tool_calls(response: Any) -> List[Dict[str, Any]]:
    """Collect the tool calls contained in an Anthropic response.

    Args:
        response: Object expected to expose a ``content`` sequence.

    Returns:
        A list of ``{"id", "name", "arguments"}`` dicts, one per ``tool_use``
        block that has ``name`` and ``input``.  Empty when ``response`` has
        no (or empty) ``content``.  A block missing an attribute during
        extraction is reported with a printed warning and skipped.
    """
    log = logging.getLogger("AnthropicToolCallsExtractor")
    extracted: List[Dict[str, Any]] = []

    # Nothing to do unless the response carries content blocks
    if not (hasattr(response, "content") and response.content):
        return extracted

    for item in response.content:
        if not (hasattr(item, "type") and item.type == "tool_use"):
            continue
        if not (hasattr(item, "name") and hasattr(item, "input")):
            continue

        try:
            log.debug(f"Extracting tool call: {item}")
            extracted.append({"id": item.id, "name": item.name, "arguments": item.input})
        except AttributeError as err:
            print(f"Warning: Failed to extract tool call from content: {err}")

    return extracted
|
|
|
|
|
|
def validate_cache_write(usage: Any, operation: str) -> int:
    """Validate a cache write operation and return the tokens written.

    Args:
        usage: Usage object exposing ``input_tokens`` and the cache counters.
        operation: Label used in the printed summary and assertion messages.

    Returns:
        The value of ``usage.cache_creation_input_tokens``.

    Raises:
        AssertionError: If the counter is missing, ``None`` or not positive.
    """
    print(
        f"{operation} usage - input_tokens: {usage.input_tokens}, "
        f"cache_creation_input_tokens: {getattr(usage, 'cache_creation_input_tokens', 0)}, "
        f"cache_read_input_tokens: {getattr(usage, 'cache_read_input_tokens', 0)}"
    )

    assert hasattr(
        usage, "cache_creation_input_tokens"
    ), f"{operation} should have cache_creation_input_tokens"

    # The attribute may be present but None; treat that as zero so the check
    # below fails with a clear assertion instead of a TypeError on `None > 0`.
    cache_write_tokens = getattr(usage, "cache_creation_input_tokens", 0) or 0
    assert (
        cache_write_tokens > 0
    ), f"{operation} should create cache (got {cache_write_tokens} tokens)"

    return cache_write_tokens
|
|
|
|
|
|
def validate_cache_read(usage: Any, operation: str) -> int:
    """Validate a cache read operation and return the tokens read.

    Args:
        usage: Usage object exposing ``input_tokens`` and the cache counters.
        operation: Label used in the printed summary and assertion messages.

    Returns:
        The value of ``usage.cache_read_input_tokens``.

    Raises:
        AssertionError: If the counter is missing, ``None`` or not positive.
    """
    print(
        f"{operation} usage - input_tokens: {usage.input_tokens}, "
        f"cache_creation_input_tokens: {getattr(usage, 'cache_creation_input_tokens', 0)}, "
        f"cache_read_input_tokens: {getattr(usage, 'cache_read_input_tokens', 0)}"
    )

    assert hasattr(
        usage, "cache_read_input_tokens"
    ), f"{operation} should have cache_read_input_tokens"

    # The attribute may be present but None; treat that as zero so the check
    # below fails with a clear assertion instead of a TypeError on `None > 0`.
    cache_read_tokens = getattr(usage, "cache_read_input_tokens", 0) or 0
    assert (
        cache_read_tokens > 0
    ), f"{operation} should read from cache (got {cache_read_tokens} tokens)"

    return cache_read_tokens
|
|
|
|
|
|
# ============================================================================
|
|
# COMPACTION TESTS
|
|
# ============================================================================
|
|
|
|
|
|
class TestAnthropicCompaction:
|
|
"""Test suite for Anthropic compaction feature (context management)
|
|
|
|
Tests the server-side context compaction feature that automatically
|
|
summarizes older context when approaching context window limits.
|
|
Requires Claude Opus 4.6 and the compact-2026-01-12 beta header.
|
|
"""
|
|
|
|
@pytest.fixture
def compaction_client(self):
    """Build an Anthropic client that sends the compaction beta header.

    The API key, base URL, timeout (default 300) and optional
    ``anthropic-version`` header are read from the test configuration.
    """
    from .utils.config_loader import get_config, get_integration_url

    api_key = get_api_key("anthropic")
    base_url = get_integration_url("anthropic")

    config = get_config()
    api_config = config.get_api_config()
    integration_settings = config.get_integration_settings("anthropic")

    # The beta header is always sent; the version header only when configured
    headers = {"anthropic-beta": "compact-2026-01-12"}
    configured_version = integration_settings.get("version")
    if configured_version:
        headers["anthropic-version"] = configured_version

    client_kwargs = {
        "api_key": api_key,
        "base_url": base_url,
        "timeout": api_config.get("timeout", 300),
        "default_headers": headers,
    }
    return Anthropic(**client_kwargs)
|
|
|
|
def _generate_large_context(self, token_count_estimate: int) -> str:
|
|
"""Generate large text context to trigger compaction"""
|
|
# Approximately 4 chars per token
|
|
chars_needed = token_count_estimate * 4
|
|
base_text = "This is a sample document about software architecture and design patterns. "
|
|
repeat_count = chars_needed // len(base_text) + 1
|
|
return (base_text * repeat_count)[:chars_needed]
|
|
|
|
def _create_large_messages(self, total_tokens: int = 80000) -> List[Dict[str, Any]]:
|
|
"""Create messages with enough content to trigger compaction
|
|
|
|
Args:
|
|
total_tokens: Estimated token count (must be > 50000 to trigger compaction)
|
|
Default is 80000 to ensure we exceed 50k after actual tokenization
|
|
"""
|
|
messages = []
|
|
large_text = self._generate_large_context(total_tokens)
|
|
|
|
# Split into multiple turns to simulate a conversation
|
|
chunk_size = len(large_text) // 10
|
|
for i in range(10):
|
|
chunk = large_text[i * chunk_size : (i + 1) * chunk_size]
|
|
messages.append({"role": "user", "content": f"Document part {i+1}: {chunk}"})
|
|
messages.append({"role": "assistant", "content": f"I've received document part {i+1}."})
|
|
|
|
# Add final query
|
|
messages.append(
|
|
{"role": "user", "content": "Please provide a brief summary of the document."}
|
|
)
|
|
|
|
return messages
|
|
|
|
def test_32_compaction_basic(self, compaction_client):
    """Test Case 32: Basic compaction functionality

    Sends a large conversation with compaction enabled.  When a compaction
    block comes back, its summary and the accompanying text blocks are
    checked; otherwise the response is only validated as a normal chat
    response.  Usage information is required in both cases.
    """
    print("\n=== Testing Basic Compaction ===")

    # Minimum trigger is 50k tokens; 80k estimated tokens leaves headroom
    # for the difference between the estimate and real tokenization.
    messages = self._create_large_messages(80000)
    print(f"Created {len(messages)} messages for compaction test")

    compaction_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
    }
    response = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [compaction_edit]},
    )

    # Structure checks
    assert hasattr(response, "content"), "Response should have content"
    assert len(response.content) > 0, "Response should have at least one content block"

    def _blocks_of(kind):
        # Content blocks whose ``type`` equals ``kind``
        return [b for b in response.content if hasattr(b, "type") and b.type == kind]

    summaries = _blocks_of("compaction")

    if not summaries:
        print("⚠ Compaction not triggered (threshold may not have been reached)")
        # Still validate it's a valid response
        assert_valid_chat_response(response)
    else:
        print(f"✓ Compaction triggered! Found {len(summaries)} compaction block(s)")
        summary_block = summaries[0]

        # The compaction block must carry a non-empty summary
        assert hasattr(summary_block, "content"), "Compaction block should have content"
        assert len(summary_block.content) > 0, "Compaction summary should not be empty"
        print(f" Compaction summary length: {len(summary_block.content)} chars")
        print(f" Summary preview: {summary_block.content[:200]}...")

        # Text must follow the compaction block
        answers = _blocks_of("text")
        assert len(answers) > 0, "Response should have text content after compaction"
        print(f"✓ Response also contains {len(answers)} text block(s)")

    # Usage information is expected regardless of compaction
    assert hasattr(response, "usage"), "Response should have usage information"
    print(f" Input tokens: {response.usage.input_tokens}")
    print(f" Output tokens: {response.usage.output_tokens}")
|
|
|
|
def test_33_compaction_usage_tracking(self, compaction_client):
    """Test Case 33: Compaction usage tracking with iterations

    When the usage object exposes ``iterations``, every iteration must have
    ``type``, ``input_tokens`` and ``output_tokens``; ``compaction`` and
    ``message`` iterations must report positive counts; and the top-level
    totals must be within 10 tokens of the sum over non-compaction
    iterations.
    """
    print("\n=== Testing Compaction Usage Tracking ===")

    messages = self._create_large_messages(80000)

    compaction_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
    }
    response = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [compaction_edit]},
    )

    # Usage must be present
    assert hasattr(response, "usage"), "Response should have usage information"
    usage = response.usage

    print("Top-level usage:")
    print(f" input_tokens: {usage.input_tokens}")
    print(f" output_tokens: {usage.output_tokens}")

    # The iterations array is looked up as an attribute first, then as a key
    if hasattr(usage, "iterations"):
        iterations = usage.iterations
    elif isinstance(usage, dict) and "iterations" in usage:
        iterations = usage["iterations"]
    else:
        iterations = None

    if not iterations:
        print("⚠ No iterations found (compaction may not have triggered)")
        return

    print(f"\n✓ Found {len(iterations)} iteration(s)")

    required_fields = ("type", "input_tokens", "output_tokens")
    summed_input = 0
    summed_output = 0

    for position, entry in enumerate(iterations, start=1):
        # Iterations may be plain dicts or attribute-style objects
        as_mapping = isinstance(entry, dict)
        for field in required_fields:
            present = field in entry if as_mapping else hasattr(entry, field)
            assert present, f"Iteration should have {field}"

        if as_mapping:
            iter_type, iter_input, iter_output = (entry[field] for field in required_fields)
        else:
            iter_type, iter_input, iter_output = (
                getattr(entry, field) for field in required_fields
            )

        print(f"\n Iteration {position}:")
        print(f" type: {iter_type}")
        print(f" input_tokens: {iter_input}")
        print(f" output_tokens: {iter_output}")

        if iter_type == "compaction":
            assert iter_input > 0, "Compaction should consume input tokens"
            assert iter_output > 0, "Compaction should produce summary tokens"
            print(" ✓ Compaction iteration validated")
            # Compaction iterations are excluded from the comparison totals
            continue

        if iter_type == "message":
            assert iter_input > 0, "Message should have input tokens"
            assert iter_output > 0, "Message should have output tokens"
            print(" ✓ Message iteration validated")

        summed_input += iter_input
        summed_output += iter_output

    # Top-level tokens should equal the sum of non-compaction iterations
    print("\nValidating top-level vs iterations:")
    print(f" Top-level input: {usage.input_tokens}, Non-compaction sum: {summed_input}")
    print(f" Top-level output: {usage.output_tokens}, Non-compaction sum: {summed_output}")

    # A small variance is tolerated
    input_gap = abs(usage.input_tokens - summed_input)
    assert input_gap < 10, "Top-level input tokens should match non-compaction sum"
    output_gap = abs(usage.output_tokens - summed_output)
    assert output_gap < 10, "Top-level output tokens should match non-compaction sum"

    print("✓ Usage tracking validation passed")
|
|
|
|
def test_34_compaction_streaming(self, compaction_client):
    """Test Case 34: Compaction with streaming responses

    Consumes the stream event by event, accumulating compaction and text
    deltas, then checks the accumulated content and the final message.
    """
    print("\n=== Testing Compaction with Streaming ===")

    messages = self._create_large_messages(80000)

    compaction_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
    }
    stream = compaction_client.beta.messages.stream(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [compaction_edit]},
    )

    saw_compaction_block = False
    summary_text = ""
    answer_text = ""
    summary_deltas = 0
    answer_deltas = 0

    print("Processing stream events...")

    with stream as s:
        for event in s:
            kind = event.type

            if kind == "content_block_start":
                if not hasattr(event, "content_block"):
                    continue
                block_kind = event.content_block.type
                if block_kind == "compaction":
                    saw_compaction_block = True
                    print(" ✓ Compaction block started")
                elif block_kind == "text":
                    print(" ✓ Text block started")

            elif kind == "content_block_delta":
                if not hasattr(event, "delta"):
                    continue
                delta_kind = event.delta.type
                if delta_kind == "compaction_delta":
                    # The compaction summary arrives through compaction deltas
                    summary_text += event.delta.content
                    summary_deltas += 1
                    print(f" ✓ Compaction delta received ({len(event.delta.content)} chars)")
                elif delta_kind == "text_delta":
                    # Text arrives incrementally
                    answer_text += event.delta.text
                    answer_deltas += 1

            elif kind == "content_block_stop":
                print(f" ✓ Content block stopped (index: {event.index})")

        # Fetch the assembled message
        final_message = s.get_final_message()

    # Check what was streamed
    if not saw_compaction_block:
        print("⚠ Compaction not triggered during streaming")
    else:
        print("\n✓ Compaction triggered during streaming")
        assert len(summary_text) > 0, "Compaction content should not be empty"
        print(f" Compaction summary: {len(summary_text)} chars, {summary_deltas} delta(s)")
        print(f" Compaction preview: {summary_text[:200]}...")

        # At least one compaction delta must have been seen
        assert summary_deltas >= 1, "Should have at least one compaction delta"

    # Text must have been received
    assert len(answer_text) > 0, "Should receive text content"
    print(f" Text content: {len(answer_text)} chars, {answer_deltas} delta(s)")

    # Final message structure
    assert hasattr(final_message, "content"), "Final message should have content"
    assert len(final_message.content) > 0, "Final message should have content blocks"
    assert hasattr(final_message, "usage"), "Final message should have usage"

    print("✓ Streaming compaction test passed")
|
|
|
|
def test_35_compaction_pause_after(self, compaction_client):
    """Test Case 35: Compaction with pause_after_compaction

    If the first response stops with ``stop_reason == "compaction"``, it must
    contain a compaction block; the response is then appended to the
    conversation and a follow-up request must return text without pausing
    again.  Otherwise the first response is only validated as a normal chat
    response.
    """
    print("\n=== Testing Compaction with Pause After ===")

    messages = self._create_large_messages(80000)

    # First request asks the API to pause after compaction
    pausing_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
        "pause_after_compaction": True,
    }
    response1 = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [pausing_edit]},
    )

    paused = hasattr(response1, "stop_reason") and response1.stop_reason == "compaction"

    if not paused:
        print("⚠ Compaction pause not triggered")
        reported_reason = response1.stop_reason if hasattr(response1, "stop_reason") else "N/A"
        print(f" stop_reason: {reported_reason}")
        # Still validate it's a valid response
        assert_valid_chat_response(response1)
        return

    print("✓ Compaction pause triggered!")
    print(f" stop_reason: {response1.stop_reason}")

    # The paused response must contain content
    assert hasattr(response1, "content"), "Response should have content"
    assert len(response1.content) > 0, "Response should have at least one content block"

    # ... including a compaction block
    summaries = [
        blk for blk in response1.content if hasattr(blk, "type") and blk.type == "compaction"
    ]
    assert len(summaries) > 0, "Response should contain compaction block"
    print(f" Compaction summary length: {len(summaries[0].content)} chars")

    # Feed the paused response back in for the continuation
    messages.append({"role": "assistant", "content": response1.content})

    # Continue the request (preserved messages could be added here)
    print("\nContinuing after compaction pause...")
    response2 = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [{"type": "compact_20260112"}]},
    )

    # The continuation must be a normal, non-paused response
    assert_valid_chat_response(response2)
    assert response2.stop_reason != "compaction", "Continuation should not pause again"

    # ... and must contain text
    answers = [blk for blk in response2.content if hasattr(blk, "type") and blk.type == "text"]
    assert len(answers) > 0, "Continuation should have text content"
    print(f"✓ Continuation successful with {len(answers)} text block(s)")
|
|
|
|
def test_36_compaction_custom_instructions(self, compaction_client):
    """Test Case 36: Compaction with custom summarization instructions

    Passes an ``instructions`` string with the compaction edit.  When a
    compaction block is returned its summary must be longer than 50
    characters; otherwise the response is validated as a normal chat
    response.
    """
    print("\n=== Testing Compaction with Custom Instructions ===")

    messages = self._create_large_messages(80000)

    custom_instructions = "".join(
        [
            "Create a highly detailed technical summary that preserves all ",
            "specific technical terms, code snippets, and architectural decisions. ",
            "Include section headers for clarity.",
        ]
    )

    instructed_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
        "instructions": custom_instructions,
    }
    response = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [instructed_edit]},
    )

    # Basic structure
    assert hasattr(response, "content"), "Response should have content"

    # Look for compaction blocks
    summaries = [
        blk for blk in response.content if hasattr(blk, "type") and blk.type == "compaction"
    ]

    if not summaries:
        print("⚠ Compaction not triggered (threshold may not have been reached)")
        assert_valid_chat_response(response)
        return

    print("✓ Compaction with custom instructions triggered")
    summary = summaries[0].content
    print(f" Summary length: {len(summary)} chars")
    print(f" Summary preview: {summary[:300]}...")

    # The summary is expected to be substantial
    assert len(summary) > 50, "Custom summary should be substantial"
    print("✓ Custom instructions applied successfully")
|
|
|
|
def test_37_compaction_continuation(self, compaction_client):
    """Test Case 37: Compaction block continuation across multiple requests

    If the first response contains a compaction block, the whole response is
    appended to the conversation together with a follow-up question, and the
    second response must be a valid chat response.  Without a compaction
    block the continuation step is skipped.
    """
    print("\n=== Testing Compaction Continuation ===")

    # Initial conversation, large enough to trigger compaction
    messages = self._create_large_messages(80000)

    compaction_edit = {
        "type": "compact_20260112",
        "trigger": {"type": "input_tokens", "value": 50000},  # Minimum allowed threshold
    }
    response1 = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [compaction_edit]},
    )

    # Did compaction happen?
    summaries = [
        blk for blk in response1.content if hasattr(blk, "type") and blk.type == "compaction"
    ]

    if not summaries:
        print("⚠ Initial compaction not triggered, skipping continuation test")
        return

    print("✓ Initial compaction created")

    # Send back the entire response (compaction block included) plus a follow-up
    follow_up = {
        "role": "user",
        "content": "Based on what we discussed, what are the three main points?",
    }
    messages.append({"role": "assistant", "content": response1.content})
    messages.append(follow_up)

    print("\nSending continuation request with compaction block...")

    # Second request carries the compaction block
    response2 = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={"edits": [{"type": "compact_20260112"}]},
    )

    # The continuation must work
    assert_valid_chat_response(response2)
    print("✓ Continuation with compaction block successful")

    # Report usage of the continuation request
    if hasattr(response2, "usage"):
        continuation_usage = response2.usage
        print(f" Continuation input tokens: {continuation_usage.input_tokens}")
        print(f" Continuation output tokens: {continuation_usage.output_tokens}")

        # NOTE(review): the token counts are only printed, not compared
        # against the original request.
        print("✓ Context successfully compacted and reused")
|
|
|
|
def test_38_compaction_multiple_iterations(self, compaction_client):
    """Test Case 38: Multiple compaction iterations in single conversation

    Grows a conversation over several turns and counts how many responses
    contain a ``compaction`` content block.  The count is only reported via
    ``print``; the test does not fail when no compaction is triggered.

    Args:
        compaction_client: Fixture whose ``beta.messages.create`` is called
            with a ``context_management`` compaction edit.
    """
    print("\n=== Testing Multiple Compaction Iterations ===")

    # Start with large enough context to potentially trigger compaction
    messages = self._create_large_messages(80000)

    compaction_count = 0
    max_iterations = 3

    for iteration in range(max_iterations):
        print(f"\nIteration {iteration + 1}:")

        # Add more context to grow beyond threshold
        messages.append(
            {
                "role": "user",
                "content": f"Additional context for iteration {iteration + 1}: "
                + self._generate_large_context(20000),
            }
        )

        response = compaction_client.beta.messages.create(
            model="claude-opus-4-6",
            messages=messages,
            max_tokens=512,
            context_management={
                "edits": [
                    {
                        "type": "compact_20260112",
                        "trigger": {
                            "type": "input_tokens",
                            "value": 50000,  # Minimum allowed threshold
                        },
                    }
                ]
            },
        )

        # Check for compaction
        compaction_blocks = [
            b for b in response.content if hasattr(b, "type") and b.type == "compaction"
        ]

        if compaction_blocks:
            compaction_count += 1
            print(f" ✓ Compaction {compaction_count} triggered")
            # The summary may be missing (None) if the block carries no
            # content; report a zero length instead of crashing on len(None).
            summary = compaction_blocks[0].content
            summary_length = len(summary) if summary is not None else 0
            print(f" Summary length: {summary_length} chars")

        # Append response to continue conversation
        messages.append({"role": "assistant", "content": response.content})

        # Validate response
        assert_valid_chat_response(response)

    print("\n✓ Multiple iteration test completed")
    print(f" Total compactions triggered: {compaction_count}")

    if compaction_count > 0:
        print("✓ At least one compaction occurred across iterations")
    else:
        print("⚠ No compactions triggered (threshold may need adjustment)")
|
|
|
|
def test_39_compaction_with_prompt_caching(self, compaction_client):
    """Test Case 39: Compaction combined with prompt caching

    Sends a large conversation with a compaction edit.  If the response
    contains a ``compaction`` block, the block is re-sent in a follow-up
    request with a ``cache_control`` breakpoint attached and the response
    is validated.  Cache-hit information is only reported via ``print``;
    a cache miss does not fail the test.  If compaction is not triggered
    the follow-up is not sent and the test passes after printing a note.

    Args:
        compaction_client: Fixture whose ``beta.messages.create`` is called
            with a ``context_management`` compaction edit.
    """
    print("\n=== Testing Compaction with Prompt Caching ===")

    messages = self._create_large_messages(80000)

    # First request - create compaction
    response1 = compaction_client.beta.messages.create(
        model="claude-opus-4-6",
        messages=messages,
        max_tokens=1024,
        context_management={
            "edits": [
                {
                    "type": "compact_20260112",
                    "trigger": {
                        "type": "input_tokens",
                        "value": 50000,  # Minimum allowed threshold
                    },
                }
            ]
        },
    )

    compaction_blocks = [
        b for b in response1.content if hasattr(b, "type") and b.type == "compaction"
    ]

    if compaction_blocks:
        print("✓ Compaction created in first request")

        # Rebuild the assistant content as plain dicts so cache_control can
        # be attached to the compaction block; only compaction and text
        # blocks are carried over.
        modified_content = []
        for block in response1.content:
            if hasattr(block, "type") and block.type == "compaction":
                # Add cache control to compaction block
                modified_content.append(
                    {
                        "type": "compaction",
                        "content": block.content,
                        "cache_control": {"type": "ephemeral"},
                    }
                )
            elif hasattr(block, "type") and block.type == "text":
                modified_content.append({"type": "text", "text": block.text})

        # Create new messages with cached compaction block
        cached_messages = [{"role": "assistant", "content": modified_content}]
        cached_messages.append(
            {"role": "user", "content": "What were the main topics discussed?"}
        )

        print("\nSending request with cached compaction block...")

        # Second request should hit cache
        response2 = compaction_client.beta.messages.create(
            model="claude-opus-4-6",
            messages=cached_messages,
            max_tokens=512,
            context_management={"edits": [{"type": "compact_20260112"}]},
        )

        # Validate response
        assert_valid_chat_response(response2)

        # Check for cache hit in usage
        if hasattr(response2, "usage"):
            print(f" Input tokens: {response2.usage.input_tokens}")
            # The attribute can exist but hold None; treat that the same as
            # "not reported" so the diagnostic comparison below cannot raise
            # TypeError and fail the test.
            cache_read = getattr(response2.usage, "cache_read_input_tokens", None)
            if cache_read is not None:
                print(f" Cache read tokens: {cache_read}")
                if cache_read > 0:
                    print("✓ Cache hit detected on compaction block!")
                else:
                    print(" Note: Cache may not have hit (timing/TTL)")
            else:
                print(" Note: No cache_read_input_tokens in usage")

        print("✓ Compaction with caching test completed")
    else:
        print("⚠ Compaction not triggered, skipping caching test")
|
|
|
|
def test_40_compaction_edge_cases(self, compaction_client):
    """Test Case 40: Compaction edge cases and error handling

    Exercises three scenarios against ``compaction_client``:

    1. A tiny conversation with an explicit 50000-token trigger; the
       response must be valid and must contain no ``compaction`` block.
    2. A compaction edit with no ``trigger`` key; the response must be valid.
    3. A large prompt combined with a tool definition and an explicit
       trigger; the response must be valid.
    """

    def threshold_compaction():
        # Fresh payload per request: compaction edit with an explicit trigger.
        return {
            "edits": [
                {
                    "type": "compact_20260112",
                    "trigger": {
                        "type": "input_tokens",
                        "value": 50000,  # Minimum allowed threshold
                    },
                }
            ]
        }

    create = compaction_client.beta.messages.create

    print("\n=== Testing Compaction Edge Cases ===")

    # Scenario 1: minimal context, far below the trigger threshold
    print("\n1. Testing with minimal context:")
    tiny_conversation = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"},
    ]

    tiny_reply = create(
        model="claude-opus-4-6",
        messages=tiny_conversation,
        max_tokens=100,
        context_management=threshold_compaction(),
    )

    # The request must succeed and must not produce a compaction block
    assert_valid_chat_response(tiny_reply)
    saw_compaction = any(
        getattr(part, "type", None) == "compaction" for part in tiny_reply.content
    )
    assert not saw_compaction, "Small context should not trigger compaction"
    print(" ✓ Small context handled correctly (no compaction)")

    # Scenario 2: trigger omitted, so the service-side default applies
    # (150k tokens per the original author's note; not checked here)
    print("\n2. Testing with default trigger value:")
    default_conversation = [{"role": "user", "content": "Tell me about AI."}]

    default_reply = create(
        model="claude-opus-4-6",
        messages=default_conversation,
        max_tokens=100,
        context_management={"edits": [{"type": "compact_20260112"}]},
    )

    assert_valid_chat_response(default_reply)
    print(" ✓ Default trigger value accepted")

    # Scenario 3: compaction requested alongside tool definitions
    print("\n3. Testing compaction with tool use:")
    weather_prompt = self._generate_large_context(80000) + " What's the weather?"
    tool_conversation = [{"role": "user", "content": weather_prompt}]

    anthropic_tools = convert_to_anthropic_tools([WEATHER_TOOL])

    tool_reply = create(
        model="claude-opus-4-6",
        messages=tool_conversation,
        tools=anthropic_tools,
        max_tokens=512,
        context_management=threshold_compaction(),
    )

    assert_valid_chat_response(tool_reply)
    print(" ✓ Compaction works with tool use")

    print("\n✓ All edge cases handled correctly")
|