first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/docs/openapi/schemas/inference/async.yaml
+++ b/docs/openapi/schemas/inference/async.yaml
@@ -0,0 +1,43 @@
+# Async Job schemas
+
+AsyncJobStatus:  # string enum; referenced by AsyncJobResponse.status
+  type: string
+  description: The status of an async job
+  enum:
+    - pending
+    - processing
+    - completed
+    - failed
+
+AsyncJobResponse:  # envelope for an async job; only id, status and created_at are mandatory
+  type: object
+  description: Response returned when creating or polling an async job
+  required:
+    - id
+    - status
+    - created_at
+  properties:
+    id:
+      type: string
+      description: Unique identifier for the async job
+    status:
+      $ref: '#/AsyncJobStatus'
+    expires_at:
+      type: string
+      format: date-time
+      description: When the job result expires and will be cleaned up
+    created_at:
+      type: string
+      format: date-time
+      description: When the job was created
+    completed_at:
+      type: string
+      format: date-time
+      description: When the job completed (successfully or with failure)
+    status_code:
+      type: integer
+      description: HTTP status code of the completed operation
+    result:  # deliberately untyped: no `type` key, so any JSON value validates
+      description: The result of the completed operation (shape depends on the request type)
+    error:  # error shape is defined in common.yaml
+      $ref: './common.yaml#/BifrostError'
--- a/docs/openapi/schemas/inference/batch.yaml
+++ b/docs/openapi/schemas/inference/batch.yaml
@@ -0,0 +1,309 @@
+# Batch API schemas
+
+BatchStatus:  # string enum; referenced by the batch create/retrieve/cancel responses
+  type: string
+  enum:
+    - validating
+    - failed
+    - in_progress
+    - finalizing
+    - completed
+    - expired
+    - cancelling
+    - canceled  # NOTE(review): one "l" here, but timestamp fields are `cancelled_at` - confirm wire value
+    - ended
+
+BatchEndpoint:  # endpoint paths accepted by BatchCreateRequest.endpoint
+  type: string
+  enum:
+    - /v1/chat/completions
+    - /v1/embeddings
+    - /v1/completions
+    - /v1/responses
+    - /v1/messages
+
+BatchCreateRequest:  # body for creating a batch; only `model` is required by the schema
+  type: object
+  required:
+    - model
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    input_file_id:
+      type: string
+      description: ID of an uploaded file containing the batch requests (OpenAI-style)
+    requests:
+      type: array
+      items:
+        $ref: '#/BatchRequestItem'
+      description: Inline list of batch requests (Anthropic-style)
+    endpoint:
+      $ref: '#/BatchEndpoint'
+    completion_window:
+      type: string
+      description: Time frame within which the batch should be processed (e.g., "24h")
+    metadata:  # string-to-string map
+      type: object
+      additionalProperties:
+        type: string
+
+BatchRequestItem:  # element type of BatchCreateRequest.requests; only custom_id is required
+  type: object
+  required:
+    - custom_id
+  properties:
+    custom_id:
+      type: string
+    method:
+      type: string
+    url:
+      type: string
+    body:  # free-form object, no properties declared
+      type: object
+    params:  # free-form object, no properties declared
+      type: object
+
+BatchCreateResponse:  # batch object; no `required` list, so every field is optional
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    endpoint:  # plain string here, not the BatchEndpoint enum used by the request
+      type: string
+    input_file_id:
+      type: string
+    completion_window:
+      type: string
+    status:
+      $ref: '#/BatchStatus'
+    request_counts:
+      $ref: '#/BatchRequestCounts'
+    metadata:
+      type: object
+      additionalProperties:
+        type: string
+    created_at:
+      type: integer
+      format: int64
+    expires_at:
+      type: integer
+      format: int64
+    output_file_id:
+      type: string
+    error_file_id:
+      type: string
+    processing_status:
+      type: string
+    results_url:
+      type: string
+    operation_name:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+BatchRequestCounts:  # integer counters; referenced as request_counts by the batch responses
+  type: object
+  properties:
+    total:
+      type: integer
+    completed:
+      type: integer
+    failed:
+      type: integer
+    succeeded:
+      type: integer
+    expired:
+      type: integer
+    canceled:
+      type: integer
+    pending:
+      type: integer
+
+BatchListResponse:  # list of batches plus paging fields
+  type: object
+  properties:
+    object:
+      type: string
+    data:  # each entry is a full BatchRetrieveResponse
+      type: array
+      items:
+        $ref: '#/BatchRetrieveResponse'
+    first_id:
+      type: string
+    last_id:
+      type: string
+    has_more:
+      type: boolean
+    next_cursor:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+BatchRetrieveResponse:  # full batch record; also the item type of BatchListResponse.data
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    endpoint:
+      type: string
+    input_file_id:
+      type: string
+    completion_window:
+      type: string
+    status:
+      $ref: '#/BatchStatus'
+    request_counts:
+      $ref: '#/BatchRequestCounts'
+    metadata:
+      type: object
+      additionalProperties:
+        type: string
+    created_at:
+      type: integer
+      format: int64
+    expires_at:
+      type: integer
+      format: int64
+    in_progress_at:
+      type: integer
+      format: int64
+    finalizing_at:
+      type: integer
+      format: int64
+    completed_at:
+      type: integer
+      format: int64
+    failed_at:
+      type: integer
+      format: int64
+    expired_at:
+      type: integer
+      format: int64
+    cancelling_at:
+      type: integer
+      format: int64
+    cancelled_at:  # double "l", unlike the `canceled` value in BatchStatus
+      type: integer
+      format: int64
+    output_file_id:
+      type: string
+    error_file_id:
+      type: string
+    errors:
+      $ref: '#/BatchErrors'
+    processing_status:
+      type: string
+    results_url:
+      type: string
+    archived_at:
+      type: integer
+      format: int64
+    operation_name:
+      type: string
+    done:
+      type: boolean
+    progress:
+      type: integer
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+BatchErrors:  # list wrapper around BatchError; referenced by BatchRetrieveResponse.errors
+  type: object
+  properties:
+    object:
+      type: string
+    data:
+      type: array
+      items:
+        $ref: '#/BatchError'
+
+BatchError:  # single entry of BatchErrors.data
+  type: object
+  properties:
+    code:
+      type: string
+    message:
+      type: string
+    param:
+      type: string
+    line:
+      type: integer
+
+BatchCancelResponse:  # id/status/request_counts plus the two cancellation timestamps
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    status:
+      $ref: '#/BatchStatus'
+    request_counts:
+      $ref: '#/BatchRequestCounts'
+    cancelling_at:
+      type: integer
+      format: int64
+    cancelled_at:
+      type: integer
+      format: int64
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+BatchResultsResponse:  # results for one batch_id, with has_more/next_cursor paging fields
+  type: object
+  properties:
+    batch_id:
+      type: string
+    results:
+      type: array
+      items:
+        $ref: '#/BatchResultItem'
+    has_more:
+      type: boolean
+    next_cursor:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+BatchResultItem:  # element type of BatchResultsResponse.results; no field is required
+  type: object
+  properties:
+    custom_id:
+      type: string
+    response:
+      $ref: '#/BatchResultResponse'
+    result:
+      $ref: '#/BatchResultData'
+    error:
+      $ref: '#/BatchResultError'
+
+BatchResultResponse:  # referenced by BatchResultItem.response
+  type: object
+  properties:
+    status_code:
+      type: integer
+    request_id:
+      type: string
+    body:  # free-form object, no properties declared
+      type: object
+
+BatchResultData:  # referenced by BatchResultItem.result
+  type: object
+  properties:
+    type:
+      type: string
+    message:  # free-form object, no properties declared
+      type: object
+
+BatchResultError:  # referenced by BatchResultItem.error
+  type: object
+  properties:
+    code:
+      type: string
+    message:
+      type: string
--- a/docs/openapi/schemas/inference/chat.yaml
+++ b/docs/openapi/schemas/inference/chat.yaml
@@ -0,0 +1,673 @@
+# Chat Completions API schemas
+
+ChatCompletionRequest:  # chat completion request body; only model and messages are required
+  type: object
+  required:
+    - model
+    - messages
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format (e.g., openai/gpt-4)
+      example: openai/gpt-4
+    messages:
+      type: array
+      items:
+        $ref: '#/ChatMessage'
+      description: List of messages in the conversation
+    fallbacks:  # plain strings, not the Fallback object from common.yaml
+      type: array
+      items:
+        type: string
+      description: Fallback models in provider/model format
+    stream:
+      type: boolean
+      description: Whether to stream the response
+    frequency_penalty:
+      type: number
+      minimum: -2.0
+      maximum: 2.0
+    logit_bias:  # map whose values are numbers
+      type: object
+      additionalProperties:
+        type: number
+    logprobs:
+      type: boolean
+    max_completion_tokens:
+      type: integer
+    metadata:  # map whose values may be of any type
+      type: object
+      additionalProperties: true
+    modalities:
+      type: array
+      items:
+        type: string
+    parallel_tool_calls:
+      type: boolean
+    presence_penalty:
+      type: number
+      minimum: -2.0
+      maximum: 2.0
+    prompt_cache_key:
+      type: string
+    reasoning:
+      $ref: '#/ChatReasoning'
+    response_format:  # free-form object, no properties declared
+      type: object
+      description: Format for the response
+    safety_identifier:
+      type: string
+    service_tier:
+      type: string
+    stream_options:
+      $ref: '#/ChatStreamOptions'
+    store:
+      type: boolean
+    temperature:
+      type: number
+      minimum: 0
+      maximum: 2
+    tool_choice:
+      $ref: '#/ChatToolChoice'
+    tools:
+      type: array
+      items:
+        $ref: '#/ChatTool'
+    seed:
+      type: integer
+      description: Deterministic sampling seed
+    top_p:
+      type: number
+      minimum: 0
+      maximum: 1
+      description: Nucleus sampling parameter
+    top_logprobs:
+      type: integer
+      minimum: 0
+      maximum: 20
+      description: Number of most likely tokens to return at each position
+    stop:  # a single string or a list of strings; the "up to 4" limit is not enforced by the schema
+      oneOf:
+        - type: string
+        - type: array
+          items:
+            type: string
+      description: Up to 4 sequences where the API will stop generating tokens
+    prediction:
+      $ref: '#/ChatPrediction'
+    prompt_cache_retention:
+      type: string
+      enum: [in-memory, 24h]
+      description: Prompt cache retention policy
+    web_search_options:
+      $ref: '#/ChatWebSearchOptions'
+    truncation:
+      type: string
+    user:
+      type: string
+    verbosity:
+      type: string
+      enum: [low, medium, high]
+
+ChatMessage:  # one message; used by ChatCompletionRequest.messages and BifrostResponseChoice.message
+  type: object
+  required:
+    - role
+  properties:
+    role:
+      $ref: '#/ChatMessageRole'
+    name:
+      type: string
+    content:
+      $ref: '#/ChatMessageContent'
+    tool_call_id:
+      type: string
+      description: For tool messages
+    refusal:
+      type: string
+    audio:
+      $ref: '#/ChatAudioMessageAudio'
+    reasoning:
+      type: string
+    reasoning_details:
+      type: array
+      items:
+        $ref: '#/ChatReasoningDetails'
+    annotations:
+      type: array
+      items:
+        $ref: '#/ChatAssistantMessageAnnotation'
+    tool_calls:
+      type: array
+      items:
+        $ref: '#/ChatAssistantMessageToolCall'
+
+ChatMessageRole:  # allowed values for ChatMessage.role
+  type: string
+  enum:
+    - assistant
+    - user
+    - system
+    - tool
+    - developer
+
+ChatMessageContent:  # referenced by ChatMessage.content
+  oneOf:
+    - type: string
+    - type: array
+      items:
+        $ref: '#/ChatContentBlock'
+  description: Message content - can be a string or array of content blocks
+
+ChatContentBlock:  # element of array-form ChatMessageContent; `type` is the only required field
+  type: object
+  required:
+    - type
+  properties:
+    type:  # each value matches the name of a sibling payload property below
+      type: string
+      enum: [text, image_url, input_audio, file, refusal]
+    text:
+      type: string
+    refusal:
+      type: string
+    image_url:
+      $ref: '#/ChatInputImage'
+    input_audio:
+      $ref: '#/ChatInputAudio'
+    file:
+      $ref: '#/ChatInputFile'
+    cache_control:
+      $ref: './common.yaml#/CacheControl'
+
+ChatInputImage:  # referenced by ChatContentBlock.image_url; url is required
+  type: object
+  required:
+    - url
+  properties:
+    url:
+      type: string
+    detail:
+      type: string
+      enum: [low, high, auto]
+
+ChatInputAudio:  # referenced by ChatContentBlock.input_audio; data is required
+  type: object
+  required:
+    - data
+  properties:
+    data:
+      type: string
+    format:
+      type: string
+
+ChatInputFile:  # referenced by ChatContentBlock.file; no field is required
+  type: object
+  properties:
+    file_data:
+      type: string
+    file_id:
+      type: string
+    filename:
+      type: string
+    file_type:
+      type: string
+
+ChatReasoning:  # referenced by ChatCompletionRequest.reasoning
+  type: object
+  properties:
+    effort:
+      type: string
+      description: Reasoning effort level
+      enum: [none, minimal, low, medium, high, xhigh]
+    max_tokens:
+      type: integer
+
+ChatStreamOptions:  # referenced by ChatCompletionRequest.stream_options
+  type: object
+  properties:
+    include_obfuscation:
+      type: boolean
+    include_usage:
+      type: boolean
+
+ChatToolChoice:  # either one of the mode strings below or a ChatToolChoiceStruct object
+  oneOf:
+    - type: string
+      enum: [none, auto, required]
+    - $ref: '#/ChatToolChoiceStruct'
+
+ChatToolChoiceStruct:  # object form of ChatToolChoice; `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:  # NOTE(review): `custom` is listed but no `custom` property is declared here - confirm
+      type: string
+      enum: [none, any, required, function, allowed_tools, custom]
+    function:
+      $ref: '#/ChatToolChoiceFunction'
+    allowed_tools:
+      $ref: '#/ChatToolChoiceAllowedTools'
+
+ChatToolChoiceFunction:  # names a function; used by ChatToolChoiceStruct and ChatToolChoiceAllowedToolsTool
+  type: object
+  required:
+    - name
+  properties:
+    name:
+      type: string
+
+ChatToolChoiceAllowedTools:  # referenced by ChatToolChoiceStruct.allowed_tools
+  type: object
+  properties:
+    mode:
+      type: string
+      enum: [auto, required]
+    tools:
+      type: array
+      items:
+        $ref: '#/ChatToolChoiceAllowedToolsTool'
+
+ChatToolChoiceAllowedToolsTool:  # element of ChatToolChoiceAllowedTools.tools; `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+    function:
+      $ref: '#/ChatToolChoiceFunction'
+
+ChatTool:  # element of ChatCompletionRequest.tools; `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:  # each value matches the name of a sibling payload property below
+      type: string
+      enum: [function, custom]
+    function:
+      $ref: '#/ChatToolFunction'
+    custom:
+      $ref: '#/ChatToolCustom'
+    cache_control:
+      $ref: './common.yaml#/CacheControl'
+
+ChatToolFunction:  # referenced by ChatTool.function; name is required
+  type: object
+  required:
+    - name
+  properties:
+    name:
+      type: string
+    description:
+      type: string
+    parameters:
+      $ref: '#/ToolFunctionParameters'
+    strict:
+      type: boolean
+
+ToolFunctionParameters:  # referenced by ChatToolFunction.parameters
+  type: object
+  properties:  # keys below (type, required, properties, enum, ...) are field names, not schema keywords
+    type:
+      type: string
+    description:
+      type: string
+    required:
+      type: array
+      items:
+        type: string
+    properties:
+      type: object
+      additionalProperties: true
+    enum:
+      type: array
+      items:
+        type: string
+    additionalProperties:
+      type: boolean
+
+ChatToolCustom:  # referenced by ChatTool.custom
+  type: object
+  properties:
+    format:
+      $ref: '#/ChatToolCustomFormat'
+
+ChatToolCustomFormat:  # referenced by ChatToolCustom.format; `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+    grammar:
+      $ref: '#/ChatToolCustomGrammarFormat'
+
+ChatToolCustomGrammarFormat:  # referenced by ChatToolCustomFormat.grammar; both fields are required
+  type: object
+  required:
+    - definition
+    - syntax
+  properties:
+    definition:
+      type: string
+    syntax:
+      type: string
+      enum: [lark, regex]
+
+ChatReasoningDetails:  # element of reasoning_details in ChatMessage and ChatStreamResponseChoiceDelta
+  type: object
+  properties:
+    id:
+      type: string
+    index:
+      type: integer
+    type:
+      type: string
+      enum: [reasoning.summary, reasoning.encrypted, reasoning.text]
+    summary:
+      type: string
+    text:
+      type: string
+    signature:
+      type: string
+    data:
+      type: string
+
+ChatAssistantMessageAnnotation:  # element of ChatMessage.annotations
+  type: object
+  properties:
+    type:
+      type: string
+    url_citation:
+      $ref: '#/ChatAssistantMessageAnnotationCitation'
+
+ChatAssistantMessageAnnotationCitation:  # referenced by ChatAssistantMessageAnnotation.url_citation
+  type: object
+  properties:
+    start_index:
+      type: integer
+    end_index:
+      type: integer
+    title:
+      type: string
+    url:
+      type: string
+    sources:  # free-form object, no properties declared
+      type: object
+    type:
+      type: string
+
+ChatAssistantMessageToolCall:  # element of tool_calls in ChatMessage and the stream delta; function is required
+  type: object
+  required:
+    - function
+  properties:
+    index:
+      type: integer
+    type:
+      type: string
+    id:
+      type: string
+    function:
+      $ref: '#/ChatAssistantMessageToolCallFunction'
+
+ChatAssistantMessageToolCallFunction:  # referenced by ChatAssistantMessageToolCall.function
+  type: object
+  properties:
+    name:
+      type: string
+    arguments:  # typed as a string, not an object
+      type: string
+
+ChatAudioMessageAudio:  # referenced by `audio` in ChatMessage and ChatStreamResponseChoiceDelta
+  type: object
+  properties:
+    id:
+      type: string
+    data:
+      type: string
+    expires_at:
+      type: integer
+    transcript:
+      type: string
+
+ChatCompletionResponse:  # chat completion response object; no field is required
+  type: object
+  properties:
+    id:
+      type: string
+    choices:
+      type: array
+      items:
+        $ref: '#/BifrostResponseChoice'
+    created:
+      type: integer
+    model:
+      type: string
+    object:
+      type: string
+    service_tier:
+      type: string
+    system_fingerprint:
+      type: string
+    usage:  # usage shape is defined in usage.yaml
+      $ref: './usage.yaml#/BifrostLLMUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+    search_results:
+      type: array
+      items:
+        $ref: '#/PerplexitySearchResult'
+    videos:
+      type: array
+      items:
+        $ref: '#/VideoResult'
+    citations:
+      type: array
+      items:
+        type: string
+
+BifrostResponseChoice:  # element of `choices` in ChatCompletionResponse and ChatCompletionStreamResponse
+  type: object
+  properties:
+    index:
+      type: integer
+    finish_reason:
+      type: string
+    log_probs:
+      $ref: '#/BifrostLogProbs'
+    text:
+      type: string
+      description: For text completions
+    message:
+      description: For non-streaming chat completions
+      allOf: [{$ref: '#/ChatMessage'}]  # wrapped so the description is not a $ref sibling
+    delta:
+      description: For streaming chat completions
+      allOf: [{$ref: '#/ChatStreamResponseChoiceDelta'}]  # same wrapping as `message`
+
+BifrostLogProbs:  # referenced by BifrostResponseChoice.log_probs
+  type: object
+  properties:
+    content:
+      type: array
+      items:
+        $ref: '#/ContentLogProb'
+    refusal:
+      type: array
+      items:
+        $ref: '#/LogProb'
+    text_offset:
+      type: array
+      items:
+        type: integer
+    token_logprobs:
+      type: array
+      items:
+        type: number
+    tokens:
+      type: array
+      items:
+        type: string
+    top_logprobs:  # list of maps whose values are numbers
+      type: array
+      items:
+        type: object
+        additionalProperties:
+          type: number
+
+ContentLogProb:  # element of BifrostLogProbs.content; LogProb fields plus top_logprobs
+  type: object
+  properties:
+    bytes:
+      type: array
+      items:
+        type: integer
+    logprob:
+      type: number
+    token:
+      type: string
+    top_logprobs:
+      type: array
+      items:
+        $ref: '#/LogProb'
+
+LogProb:  # element of BifrostLogProbs.refusal and ContentLogProb.top_logprobs
+  type: object
+  properties:
+    bytes:
+      type: array
+      items:
+        type: integer
+    logprob:
+      type: number
+    token:
+      type: string
+
+ChatStreamResponseChoiceDelta:  # referenced by BifrostResponseChoice.delta
+  type: object
+  properties:
+    role:  # free-form string here, whereas ChatMessage.role uses the ChatMessageRole enum
+      type: string
+    content:  # string only, whereas ChatMessage.content also allows an array of blocks
+      type: string
+    refusal:
+      type: string
+    audio:
+      $ref: '#/ChatAudioMessageAudio'
+    reasoning:
+      type: string
+    reasoning_details:
+      type: array
+      items:
+        $ref: '#/ChatReasoningDetails'
+    tool_calls:
+      type: array
+      items:
+        $ref: '#/ChatAssistantMessageToolCall'
+
+ChatCompletionStreamResponse:  # streamed counterpart of ChatCompletionResponse with fewer fields
+  type: object
+  description: Streaming chat completion response (SSE format)
+  properties:
+    id:
+      type: string
+    choices:  # same choice schema as the non-streaming response
+      type: array
+      items:
+        $ref: '#/BifrostResponseChoice'
+    created:
+      type: integer
+    model:
+      type: string
+    object:
+      type: string
+    usage:
+      $ref: './usage.yaml#/BifrostLLMUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+PerplexitySearchResult:  # element of ChatCompletionResponse.search_results; all fields are strings
+  type: object
+  description: Search result from Perplexity AI search
+  properties:
+    title:
+      type: string
+    url:
+      type: string
+    date:
+      type: string
+    last_updated:
+      type: string
+    snippet:
+      type: string
+    source:
+      type: string
+
+VideoResult:  # element of ChatCompletionResponse.videos
+  type: object
+  properties:
+    url:
+      type: string
+    thumbnail_url:
+      type: string
+    thumbnail_width:
+      type: integer
+    thumbnail_height:
+      type: integer
+    duration:
+      type: number
+
+ChatPrediction:  # referenced by ChatCompletionRequest.prediction
+  type: object
+  description: Predicted output content for the model to reference (OpenAI only). Can reduce latency.
+  properties:
+    type:  # documented as constant but not constrained by an enum
+      type: string
+      description: Always "content"
+    content:
+      description: Predicted content (string or array of content parts)
+      oneOf:
+        - type: string
+        - type: array
+          items:  # content parts are free-form objects
+            type: object
+            additionalProperties: true
+
+ChatWebSearchOptions:  # referenced by ChatCompletionRequest.web_search_options
+  type: object
+  description: Web search options for chat completions (OpenAI only)
+  properties:
+    search_context_size:
+      type: string
+      enum: [low, medium, high]
+      description: Amount of search context to include
+    user_location:
+      $ref: '#/ChatWebSearchOptionsUserLocation'
+
+ChatWebSearchOptionsUserLocation:  # referenced by ChatWebSearchOptions.user_location
+  type: object
+  properties:
+    type:
+      type: string
+      description: Location type (e.g., "approximate")
+    approximate:
+      $ref: '#/ChatWebSearchOptionsUserLocationApproximate'
+
+ChatWebSearchOptionsUserLocationApproximate:  # referenced by ChatWebSearchOptionsUserLocation.approximate
+  type: object
+  properties:
+    city:
+      type: string
+    country:
+      type: string
+      description: Two-letter ISO country code (e.g., "US")
+    region:
+      type: string
+      description: Region or state (e.g., "California")
+    timezone:
+      type: string
+      description: IANA timezone (e.g., "America/Los_Angeles")
--- a/docs/openapi/schemas/inference/common.yaml
+++ b/docs/openapi/schemas/inference/common.yaml
@@ -0,0 +1,149 @@
+# Common schemas used across the API
+
+ModelProvider:  # string enum; used as `provider` in Fallback, the extra-fields schemas and ContainerCreateRequest
+  type: string
+  description: AI model provider identifier
+  enum:
+    - openai
+    - azure
+    - anthropic
+    - bedrock
+    - cohere
+    - vertex
+    - vllm
+    - mistral
+    - ollama
+    - groq
+    - sgl
+    - parasail
+    - perplexity
+    - replicate
+    - cerebras
+    - gemini
+    - openrouter
+    - elevenlabs
+    - huggingface
+    - nebius
+    - xai
+    - runway
+    - fireworks
+
+Fallback:  # object form (provider + model); both fields are required
+  type: object
+  description: Fallback model configuration
+  required:
+    - provider
+    - model
+  properties:
+    provider:
+      $ref: '#/ModelProvider'
+    model:
+      type: string
+      description: Model name
+
+BifrostError:  # error envelope; referenced by AsyncJobResponse.error in async.yaml
+  type: object
+  description: Error response from Bifrost
+  properties:
+    event_id:
+      type: string
+    type:
+      type: string
+    is_bifrost_error:
+      type: boolean
+    status_code:
+      type: integer
+    error:  # nested detail object
+      $ref: '#/ErrorField'
+    extra_fields:
+      $ref: '#/BifrostErrorExtraFields'
+
+ErrorField:  # referenced by BifrostError.error; all fields are optional strings
+  type: object
+  properties:
+    type:
+      type: string
+    code:
+      type: string
+    message:
+      type: string
+    param:
+      type: string
+    event_id:
+      type: string
+
+BifrostErrorExtraFields:  # referenced by BifrostError.extra_fields
+  type: object
+  properties:
+    provider:
+      $ref: '#/ModelProvider'
+    model_requested:
+      type: string
+    request_type:
+      type: string
+
+BifrostResponseExtraFields:  # referenced as `extra_fields` by the batch, chat, container and count-tokens responses
+  type: object
+  description: Additional fields included in responses
+  properties:
+    request_type:
+      type: string
+      description: Type of request that was made
+    provider:
+      $ref: '#/ModelProvider'
+    model_requested:
+      type: string
+      description: The model that was requested
+    model_deployment:
+      type: string
+      description: The actual model deployment used
+    latency:
+      type: integer
+      format: int64
+      description: Request latency in milliseconds
+    chunk_index:
+      type: integer
+      description: Index of the chunk for streaming responses
+    raw_request:  # free-form object, no properties declared
+      type: object
+      description: Raw request if enabled
+    raw_response:  # free-form object, no properties declared
+      type: object
+      description: Raw response if enabled
+    cache_debug:
+      $ref: '#/BifrostCacheDebug'
+
+BifrostCacheDebug:  # referenced by BifrostResponseExtraFields.cache_debug
+  type: object
+  properties:
+    cache_hit:
+      type: boolean
+    cache_id:
+      type: string
+    hit_type:
+      type: string
+    requested_provider:  # plain string, not the ModelProvider enum
+      type: string
+    requested_model:
+      type: string
+    provider_used:  # plain string, not the ModelProvider enum
+      type: string
+    model_used:
+      type: string
+    input_tokens:
+      type: integer
+    threshold:
+      type: number
+    similarity:
+      type: number
+
+CacheControl:  # referenced by cache_control in chat.yaml (ChatContentBlock, ChatTool)
+  type: object
+  description: Cache control settings for content blocks
+  properties:
+    type:  # single allowed value
+      type: string
+      enum: [ephemeral]
+    ttl:
+      type: string
+      description: Time to live (e.g., "1m", "1h")
--- a/docs/openapi/schemas/inference/containers.yaml
+++ b/docs/openapi/schemas/inference/containers.yaml
@@ -0,0 +1,344 @@
+# Containers API schemas
+
+ContainerStatus:  # string enum with a single value; referenced by the container object/response schemas
+  type: string
+  enum:
+    - running
+  description: The status of a container
+
+ContainerExpiresAfter:  # referenced as expires_after by the container request and response schemas
+  type: object
+  description: Expiration configuration for a container
+  properties:
+    anchor:
+      type: string
+      description: The anchor point for expiration (e.g., "last_active_at")
+    minutes:
+      type: integer
+      description: Number of minutes after anchor point
+
+ContainerObject:  # element of ContainerListResponse.data; carries no extra_fields
+  type: object
+  description: A container object
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the container
+    object:
+      type: string
+      description: The object type (always "container")
+    name:
+      type: string
+      description: The name of the container
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the container was created
+    status:
+      $ref: '#/ContainerStatus'
+    expires_after:
+      $ref: '#/ContainerExpiresAfter'
+    last_active_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of last activity
+    memory_limit:
+      type: string
+      description: Memory limit for the container (e.g., "1g", "4g")
+    metadata:  # string-to-string map
+      type: object
+      additionalProperties:
+        type: string
+      description: User-provided metadata
+
+ContainerCreateRequest:  # container creation body; provider and name are required
+  type: object
+  required:
+    - provider
+    - name
+  properties:
+    provider:
+      $ref: './common.yaml#/ModelProvider'
+    name:
+      type: string
+      description: Name of the container
+    expires_after:
+      $ref: '#/ContainerExpiresAfter'
+    file_ids:
+      type: array
+      items:
+        type: string
+      description: IDs of existing files to copy into this container
+    memory_limit:
+      type: string
+      description: Memory limit for the container (e.g., "1g", "4g")
+    metadata:  # string-to-string map
+      type: object
+      additionalProperties:
+        type: string
+      description: User-provided metadata
+
+ContainerCreateResponse:  # same properties as ContainerObject plus extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the created container
+    object:
+      type: string
+      description: The object type (always "container")
+    name:
+      type: string
+      description: The name of the container
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the container was created
+    status:
+      $ref: '#/ContainerStatus'
+    expires_after:
+      $ref: '#/ContainerExpiresAfter'
+    last_active_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of last activity
+    memory_limit:
+      type: string
+      description: Memory limit for the container
+    metadata:
+      type: object
+      additionalProperties:
+        type: string
+      description: User-provided metadata
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerListResponse:  # list of ContainerObject entries plus paging fields
+  type: object
+  properties:
+    object:
+      type: string
+      description: The object type (always "list")
+    data:
+      type: array
+      items:
+        $ref: '#/ContainerObject'
+      description: List of container objects
+    first_id:
+      type: string
+      description: ID of the first container in the list
+    last_id:
+      type: string
+      description: ID of the last container in the list
+    has_more:
+      type: boolean
+      description: Whether there are more containers to fetch
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerRetrieveResponse:  # same properties as ContainerObject plus extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the container
+    object:
+      type: string
+      description: The object type (always "container")
+    name:
+      type: string
+      description: The name of the container
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the container was created
+    status:
+      $ref: '#/ContainerStatus'
+    expires_after:
+      $ref: '#/ContainerExpiresAfter'
+    last_active_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of last activity
+    memory_limit:
+      type: string
+      description: Memory limit for the container
+    metadata:
+      type: object
+      additionalProperties:
+        type: string
+      description: User-provided metadata
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerDeleteResponse:  # deletion acknowledgement: id, object, deleted flag, extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+      description: The ID of the deleted container
+    object:
+      type: string
+      description: The object type (always "container.deleted")
+    deleted:
+      type: boolean
+      description: Whether the container was successfully deleted
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+# =============================================================================
+# CONTAINER FILES SCHEMAS
+# =============================================================================
+
+ContainerFileObject:  # element of ContainerFileListResponse.data; carries no extra_fields
+  type: object
+  description: A file object within a container
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the file
+    object:
+      type: string
+      description: The object type (always "container.file")
+    container_id:
+      type: string
+      description: The ID of the container this file belongs to
+    path:
+      type: string
+      description: The path of the file within the container
+    bytes:
+      type: integer
+      format: int64
+      description: The size of the file in bytes
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the file was created
+    source:
+      type: string
+      description: The source of the file (e.g., "user_upload", "copied")
+
+ContainerFileCreateMultipartRequest:  # NOTE(review): no `required` list, unlike the JSON variant (file_id) - confirm `file` is optional
+  type: object
+  description: Request to create a file in a container via multipart upload
+  properties:
+    file:
+      type: string
+      format: binary
+      description: The file content to upload
+    file_path:
+      type: string
+      description: Optional path for the file within the container
+
+ContainerFileCreateJsonRequest:  # JSON variant of file creation; file_id is required
+  type: object
+  description: Request to create a file in a container by referencing an existing file
+  required:
+    - file_id
+  properties:
+    file_id:
+      type: string
+      description: The ID of an existing file to copy into the container
+    file_path:
+      type: string
+      description: Optional path for the file within the container
+
+ContainerFileCreateResponse:  # same properties as ContainerFileObject plus extra_fields
+  type: object
+  description: Response from creating a file in a container
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the created file
+    object:
+      type: string
+      description: The object type (always "container.file")
+    container_id:
+      type: string
+      description: The ID of the container this file belongs to
+    path:
+      type: string
+      description: The path of the file within the container
+    bytes:
+      type: integer
+      format: int64
+      description: The size of the file in bytes
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the file was created
+    source:
+      type: string
+      description: The source of the file
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerFileListResponse:  # list of ContainerFileObject entries plus paging fields
+  type: object
+  description: Response containing a list of files in a container
+  properties:
+    object:
+      type: string
+      description: The object type (always "list")
+    data:
+      type: array
+      items:
+        $ref: '#/ContainerFileObject'
+      description: List of file objects
+    first_id:
+      type: string
+      description: ID of the first file in the list
+    last_id:
+      type: string
+      description: ID of the last file in the list
+    has_more:
+      type: boolean
+      description: Whether there are more files to fetch
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerFileRetrieveResponse:  # same properties as ContainerFileObject plus extra_fields
+  type: object
+  description: Response from retrieving a file from a container
+  properties:
+    id:
+      type: string
+      description: The unique identifier for the file
+    object:
+      type: string
+      description: The object type (always "container.file")
+    container_id:
+      type: string
+      description: The ID of the container this file belongs to
+    path:
+      type: string
+      description: The path of the file within the container
+    bytes:
+      type: integer
+      format: int64
+      description: The size of the file in bytes
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp of when the file was created
+    source:
+      type: string
+      description: The source of the file
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ContainerFileDeleteResponse:  # deletion acknowledgement: id, object, deleted flag, extra_fields
+  type: object
+  description: Response from deleting a file from a container
+  properties:
+    id:
+      type: string
+      description: The ID of the deleted file
+    object:
+      type: string
+      description: The object type (always "container.file.deleted")
+    deleted:
+      type: boolean
+      description: Whether the file was successfully deleted
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
--- a/docs/openapi/schemas/inference/count-tokens.yaml
+++ b/docs/openapi/schemas/inference/count-tokens.yaml
@@ -0,0 +1,53 @@
+# Count Tokens API schemas
+
+CountTokensRequest:  # Request body for token counting; reuses message and tool shapes from responses.yaml
+  type: object
+  required:  # only these two fields are mandatory; everything else below is optional
+    - model
+    - messages
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    messages:
+      type: array
+      items:
+        $ref: './responses.yaml#/ResponsesMessage'
+    fallbacks:
+      type: array
+      items:
+        type: string
+    tools:
+      type: array
+      items:
+        $ref: './responses.yaml#/ResponsesTool'
+    instructions:
+      type: string
+    text:
+      type: string  # NOTE(review): ResponsesRequest.text in responses.yaml is a ResponsesTextConfig object; confirm a plain string is intended here
+
+CountTokensResponse:  # Token-count result; no `required` list, so every property is optional
+  type: object
+  properties:
+    object:
+      type: string
+    model:
+      type: string
+    input_tokens:
+      type: integer
+    input_tokens_details:
+      $ref: './responses.yaml#/ResponsesResponseInputTokens'  # shape is defined in responses.yaml
+    tokens:  # array of integers
+      type: array
+      items:
+        type: integer
+    token_strings:  # array of strings; relationship to `tokens` is not stated in this schema
+      type: array
+      items:
+        type: string
+    output_tokens:
+      type: integer
+    total_tokens:
+      type: integer
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
--- a/docs/openapi/schemas/inference/embeddings.yaml
+++ b/docs/openapi/schemas/inference/embeddings.yaml
@@ -0,0 +1,76 @@
+# Embeddings API schemas
+
+EmbeddingRequest:  # Request body for embeddings; `model` and `input` are the only required fields
+  type: object
+  required:
+    - model
+    - input
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    input:
+      $ref: '#/EmbeddingInput'  # string or token/text arrays, see EmbeddingInput in this file
+    fallbacks:
+      type: array
+      items:
+        type: string  # plain strings here, whereas images.yaml uses common.yaml#/Fallback items
+    encoding_format:
+      type: string
+      enum: [float, base64]
+    dimensions:
+      type: integer  # no minimum is declared
+
+EmbeddingInput:  # Accepts one string, a list of strings, a list of integers, or a list of integer lists
+  oneOf:  # NOTE(review): an empty array `[]` satisfies all three array branches, so oneOf rejects it; confirm that is intended
+    - type: string
+    - type: array
+      items:
+        type: string
+    - type: array
+      items:
+        type: integer
+    - type: array
+      items:
+        type: array
+        items:
+          type: integer
+  description: Input for embedding - text or token arrays
+
+EmbeddingResponse:  # Embeddings result envelope; no `required` list, so every property is optional
+  type: object
+  properties:
+    data:
+      type: array
+      items:
+        $ref: '#/EmbeddingData'  # one entry per returned embedding, see EmbeddingData in this file
+    model:
+      type: string
+    object:
+      type: string
+    usage:
+      $ref: './usage.yaml#/BifrostLLMUsage'  # usage shape is defined in usage.yaml
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+EmbeddingData:  # One element of EmbeddingResponse.data; all properties optional
+  type: object
+  properties:
+    index:
+      type: integer
+    object:
+      type: string
+    embedding:
+      $ref: '#/EmbeddingStruct'  # string, number array, or nested number arrays
+
+EmbeddingStruct:  # Embedding payload: a string, a list of numbers, or a list of number lists
+  oneOf:  # NOTE(review): an empty array `[]` satisfies both array branches, so oneOf rejects it; confirm that is intended
+    - type: string  # presumably the base64 form selected via encoding_format; verify against the server
+    - type: array
+      items:
+        type: number
+    - type: array
+      items:
+        type: array
+        items:
+          type: number
--- a/docs/openapi/schemas/inference/files.yaml
+++ b/docs/openapi/schemas/inference/files.yaml
@@ -0,0 +1,188 @@
+# Files API schemas
+
+S3StorageConfig:  # S3 backend settings; none of the three fields is marked required
+  type: object
+  description: AWS S3 storage configuration
+  properties:
+    bucket:
+      type: string
+      description: S3 bucket name
+    region:
+      type: string
+      description: AWS region
+    prefix:
+      type: string
+      description: Path prefix for stored files
+
+GCSStorageConfig:  # GCS backend settings; none of the three fields is marked required
+  type: object
+  description: Google Cloud Storage configuration
+  properties:
+    bucket:
+      type: string
+      description: GCS bucket name
+    project:
+      type: string
+      description: GCP project ID
+    prefix:
+      type: string
+      description: Path prefix for stored files
+
+FileStorageConfig:  # Wrapper with optional `s3` and `gcs` members; the schema does not make them mutually exclusive
+  type: object
+  description: Storage configuration for cloud storage backends
+  properties:
+    s3:
+      $ref: '#/S3StorageConfig'
+    gcs:
+      $ref: '#/GCSStorageConfig'
+
+FilePurpose:  # Closed set of purpose values, referenced by the upload/list/retrieve schemas in this file
+  type: string
+  enum:
+    - batch
+    - assistants
+    - fine-tune  # the only hyphenated value; the others use underscores or single words
+    - vision
+    - batch_output
+    - user_data
+    - responses
+    - evals
+
+FileStatus:  # Closed set of file status values, referenced by the file response schemas in this file
+  type: string
+  enum:
+    - uploaded
+    - processed
+    - processing
+    - error
+    - deleted
+
+FileUploadRequest:  # Upload body; `file` and `purpose` are required, `provider` is optional
+  type: object
+  required:
+    - file
+    - purpose
+  properties:
+    file:
+      type: string
+      format: binary  # raw file content, as used for multipart uploads
+    purpose:
+      $ref: '#/FilePurpose'
+    provider:
+      $ref: './common.yaml#/ModelProvider'  # provider enum is defined in common.yaml
+
+FileUploadResponse:  # Same property set as FileRetrieveResponse in this file; all properties optional
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    bytes:
+      type: integer
+      format: int64
+    created_at:
+      type: integer
+      format: int64  # presumably a Unix timestamp like the container schemas; not stated here
+    filename:
+      type: string
+    purpose:
+      $ref: '#/FilePurpose'
+    status:
+      $ref: '#/FileStatus'
+    status_details:
+      type: string
+    expires_at:
+      type: integer
+      format: int64
+    storage_backend:
+      type: string
+    storage_uri:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+FileListResponse:  # List envelope of FileObject entries; all properties optional
+  type: object
+  properties:
+    object:
+      type: string
+    data:
+      type: array
+      items:
+        $ref: '#/FileObject'  # defined further down in this file
+    has_more:
+      type: boolean
+    after:
+      type: string  # presumably a pagination cursor; its semantics are not described in this schema
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+FileObject:  # List item; like FileRetrieveResponse but without storage_backend, storage_uri and extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    bytes:
+      type: integer
+      format: int64
+    created_at:
+      type: integer
+      format: int64
+    filename:
+      type: string
+    purpose:
+      $ref: '#/FilePurpose'
+    status:
+      $ref: '#/FileStatus'
+    status_details:
+      type: string
+    expires_at:
+      type: integer
+      format: int64
+
+FileRetrieveResponse:  # Single-file metadata; FileObject fields plus storage_backend, storage_uri and extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    bytes:
+      type: integer
+      format: int64
+    created_at:
+      type: integer
+      format: int64
+    filename:
+      type: string
+    purpose:
+      $ref: '#/FilePurpose'
+    status:
+      $ref: '#/FileStatus'
+    status_details:
+      type: string
+    expires_at:
+      type: integer
+      format: int64
+    storage_backend:
+      type: string  # free-form string; not tied to FileStorageConfig by this schema
+    storage_uri:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+FileDeleteResponse:  # Deletion acknowledgement for a file; all properties optional
+  type: object
+  properties:
+    id:
+      type: string
+    object:
+      type: string
+    deleted:
+      type: boolean
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
--- a/docs/openapi/schemas/inference/images.yaml
+++ b/docs/openapi/schemas/inference/images.yaml
@@ -0,0 +1,514 @@
+# Image Generation Schemas
+
+ImageGenerationRequest:  # JSON request for image generation; `model` and `prompt` are required
+  allOf:  # single-member allOf: the whole schema is the one inline object below
+    - type: object
+      required:
+        - model
+        - prompt
+      properties:
+        model:
+          type: string
+          description: Model identifier in format `provider/model`
+        prompt:
+          type: string
+          description: Text prompt to generate image
+        n:
+          type: integer
+          minimum: 1
+          maximum: 10
+          description: Number of images to generate
+        size:
+          type: string
+          enum:  # same eight values as ImageGenerationResponse.size and ImageVariationRequest.size
+            - "256x256"
+            - "512x512"
+            - "1024x1024"
+            - "1792x1024"
+            - "1024x1792"
+            - "1536x1024"
+            - "1024x1536"
+            - "auto"
+          description: Size of the generated image
+        quality:
+          type: string
+          enum:
+            - "auto"
+            - "high"
+            - "medium"
+            - "low"
+            - "hd"
+            - "standard"
+          description: Quality of the generated image
+        style:
+          type: string
+          enum:
+            - "natural"
+            - "vivid"
+          description: Style of the generated image
+        response_format:
+          type: string
+          enum:
+            - "url"
+            - "b64_json"
+          default: "url"
+          description: |
+            Format of the response.
+        background:
+          type: string
+          enum:
+            - "transparent"
+            - "opaque"
+            - "auto"
+          description: Background type for the image
+        moderation:
+          type: string
+          enum:
+            - "low"
+            - "auto"
+          description: Content moderation level
+        partial_images:
+          type: integer
+          minimum: 0
+          maximum: 3
+          description: Number of partial images to generate
+        output_compression:
+          type: integer
+          minimum: 0
+          maximum: 100
+          description: Compression level (0-100%)
+        output_format:
+          type: string
+          enum:
+            - "png"
+            - "webp"
+            - "jpeg"
+          description: Output image format
+        user:
+          type: string
+          description: User identifier for tracking
+        seed:
+          type: integer
+          description: Seed for reproducible image generation
+        negative_prompt:
+          type: string
+          description: Negative prompt to guide what to avoid in generation
+        num_inference_steps:
+          type: integer  # no minimum is declared
+          description: Number of inference steps for generation
+        stream:
+          type: boolean
+          default: false
+          description: |
+            Whether to stream the response. When true, images are sent as SSE.
+            When streaming, providers may return base64 chunks (`b64_json`) and/or URLs (`url`) depending on provider and configuration.
+        fallbacks:
+          type: array
+          items:
+            $ref: './common.yaml#/Fallback'  # structured fallback entries defined in common.yaml
+          description: Fallback models to try if primary model fails
+
+ImageGenerationResponse:  # Non-streaming image result; no `required` list, so every property is optional
+  type: object
+  properties:
+    id:
+      type: string
+      description: Unique identifier for the generation request
+    created:
+      type: integer
+      format: int64
+      description: Unix timestamp when the image was created
+    model:
+      type: string
+      description: Model used for generation
+    data:
+      type: array
+      items:
+        $ref: '#/ImageData'
+      description: Array of generated images
+    background:
+      type: string  # unconstrained here, although the request restricts it to an enum
+      description: Background type for the image
+    output_format:
+      type: string
+      enum:
+        - "png"
+        - "webp"
+        - "jpeg"
+      description: Output image format
+    quality:
+      type: string  # unconstrained here, although the request restricts it to an enum
+      description: Quality of the generated image
+    size:
+      type: string
+      enum:
+        - "256x256"
+        - "512x512"
+        - "1024x1024"
+        - "1792x1024"
+        - "1024x1792"
+        - "1536x1024"
+        - "1024x1536"
+        - "auto"
+      description: Size of the generated image
+    usage:
+      $ref: '#/ImageUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+ImageData:  # One generated image; `url` and `b64_json` are both optional and not mutually exclusive in the schema
+  type: object
+  properties:
+    url:
+      type: string
+      format: uri
+      description: URL of the generated image
+    b64_json:
+      type: string
+      description: Base64-encoded image data
+    revised_prompt:
+      type: string
+      description: Revised prompt used for generation
+    index:
+      type: integer
+      description: Index of this image
+
+ImageGenerationResponseParameters:  # Four unconstrained string fields mirroring request options; no enums applied here
+  type: object
+  properties:
+    background:
+      type: string
+    output_format:
+      type: string
+    quality:
+      type: string
+    size:
+      type: string
+
+ImageUsage:  # Token accounting for image calls; input and output details share the ImageTokenDetails shape
+  type: object
+  properties:
+    input_tokens:
+      type: integer
+      description: Number of input tokens
+    input_tokens_details:
+      $ref: '#/ImageTokenDetails'
+    total_tokens:
+      type: integer
+      description: Total tokens used
+    output_tokens:
+      type: integer
+      description: Number of output tokens
+    output_tokens_details:
+      $ref: '#/ImageTokenDetails'
+
+ImageTokenDetails:  # Token breakdown by modality (image and text); both fields optional
+  type: object
+  properties:
+    image_tokens:
+      type: integer
+      description: Tokens used for images
+    text_tokens:
+      type: integer
+      description: Tokens used for text
+
+ImageGenerationStreamResponse:  # One SSE chunk of a streamed generation; every property is optional
+  type: object
+  description: |
+    Streaming response chunk for image generation.
+    Sent via Server-Sent Events (SSE).
+    Providers may return either b64_json (base64-encoded image data) or url (public URL to the image).
+  properties:
+    id:
+      type: string
+      description: Request identifier
+    type:
+      type: string
+      enum:  # event discriminator: partial chunk, final chunk, or error
+        - "image_generation.partial_image"
+        - "image_generation.completed"
+        - "error"
+      description: Type of stream event
+    partial_image_index:
+      type: integer
+      description: Index of the partial image chunk
+    sequence_number:
+      type: integer
+      description: Sequence number for event ordering within the stream
+    b64_json:
+      type: string
+      description: |
+        Base64-encoded chunk of image data.
+        Optional; either b64_json or url may be present.
+    url:
+      type: string
+      format: uri
+      description: |
+        Optional public URL to the generated image chunk.
+        Used by HuggingFace and other providers that return image URLs instead of base64 data.
+    created_at:
+      type: integer
+      format: int64
+      description: Timestamp when chunk was created
+    size:
+      type: string
+      enum:
+        - "256x256"
+        - "512x512"
+        - "1024x1024"
+        - "1792x1024"
+        - "1024x1792"
+        - "1536x1024"
+        - "1024x1536"
+        - "auto"
+      description: Size of the generated image
+    quality:
+      type: string
+      description: Quality setting used
+    background:
+      type: string
+      description: Background type used
+    output_format:
+      type: string
+      enum:
+        - "png"
+        - "webp"
+        - "jpeg"
+      description: Output format used
+    revised_prompt:
+      type: string
+      description: Revised prompt
+    usage:
+      $ref: '#/ImageUsage'  # NOTE(review): OpenAPI 3.0 ignores siblings of $ref, so the description below only applies under 3.1; confirm target version
+      description: Token usage
+    error:
+      $ref: './common.yaml#/BifrostError'  # NOTE(review): same $ref-sibling caveat as `usage` above
+      description: Error information if generation failed
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+# Image Edit Schemas (multipart/form-data)
+
+ImageEditRequest:  # Multipart form body for image edits; only `model` and `image` are schema-required
+  type: object
+  required:  # `prompt` is not listed: its description makes it conditional on `type`
+    - model
+    - image
+  properties:
+    model:
+      type: string
+      description: Model identifier in format `provider/model`
+    prompt:
+      type: string
+      description: |
+        Text prompt describing the edit. Required unless `type` is `background_removal`.
+    image:
+      type: string  # models a single binary part, although the description mentions `image[]` for multiple files
+      format: binary
+      description: |
+        Image file to edit. Use field name `image` for a single file or `image[]` for multiple files.
+    mask:
+      type: string
+      format: binary
+      description: Optional mask image for inpainting (transparent areas indicate regions to edit)
+    type:
+      type: string
+      enum:
+        - "inpainting"
+        - "outpainting"
+        - "background_removal"
+      description: Type of edit operation
+    n:
+      type: integer
+      minimum: 1
+      maximum: 10
+      description: Number of images to generate
+    size:
+      type: string
+      enum:  # NOTE(review): omits "1792x1024" and "1024x1792", which ImageGenerationRequest and ImageVariationRequest accept; confirm
+        - "256x256"
+        - "512x512"
+        - "1024x1024"
+        - "1536x1024"
+        - "1024x1536"
+        - "auto"
+      description: Size of the output image
+    response_format:
+      type: string
+      enum:
+        - "url"
+        - "b64_json"
+      default: "url"
+      description: Format of the response
+    stream:
+      type: boolean
+      default: false
+      description: When true, stream the response via Server-Sent Events
+    background:
+      type: string
+      enum:
+        - "transparent"
+        - "opaque"
+        - "auto"
+      description: Background type for the image
+    input_fidelity:
+      type: string
+      enum:
+        - "low"
+        - "high"
+      description: How closely to follow the original image
+    partial_images:
+      type: integer
+      minimum: 0
+      maximum: 3
+      description: Number of partial images to generate when streaming
+    quality:
+      type: string
+      enum:  # NOTE(review): no "hd" value, unlike ImageGenerationRequest.quality; confirm
+        - "auto"
+        - "high"
+        - "medium"
+        - "low"
+        - "standard"
+      description: Quality of the output image
+    output_format:
+      type: string
+      enum:
+        - "png"
+        - "webp"
+        - "jpeg"
+      description: Output image format
+    num_inference_steps:
+      type: integer
+      description: Number of inference steps
+    seed:
+      type: integer
+      description: Seed for reproducible editing
+    output_compression:
+      type: integer
+      minimum: 0
+      maximum: 100
+      description: Compression level (0-100%)
+    negative_prompt:
+      type: string
+      description: What to avoid in the edit
+    user:
+      type: string
+      description: User identifier for tracking
+    fallbacks:
+      type: array
+      items:
+        $ref: './common.yaml#/Fallback'
+      description: Fallback models to try if primary model fails
+
+# Image Variation Schemas (multipart/form-data)
+
+ImageVariationRequest:  # Multipart form body for image variations; `model` and `image` are required, no prompt field
+  type: object
+  required:
+    - model
+    - image
+  properties:
+    model:
+      type: string
+      description: Model identifier in format `provider/model`
+    image:
+      type: string  # models a single binary part; the description covers the `image[]` form
+      format: binary
+      description: |
+        Image file to create variations of. Use field name `image` for a single file or `image[]` for multiple (first image is used).
+    n:
+      type: integer
+      minimum: 1
+      maximum: 10
+      description: Number of variations to generate
+    size:
+      type: string
+      enum:  # same eight values as ImageGenerationRequest.size
+        - "256x256"
+        - "512x512"
+        - "1024x1024"
+        - "1792x1024"
+        - "1024x1792"
+        - "1536x1024"
+        - "1024x1536"
+        - "auto"
+      description: Size of the output images
+    response_format:
+      type: string
+      enum:
+        - "url"
+        - "b64_json"
+      default: "url"
+      description: Format of the response
+    user:
+      type: string
+      description: User identifier for tracking
+    fallbacks:
+      type: array
+      items:
+        $ref: './common.yaml#/Fallback'
+      description: Fallback models to try if primary model fails
+
+# Image Edit Streaming (SSE)
+
+ImageEditStreamResponse:  # One SSE chunk of a streamed edit; same field set as ImageGenerationStreamResponse
+  type: object
+  description: |
+    Streaming response chunk for image edit.
+    Sent via Server-Sent Events (SSE) when `stream=true`.
+  properties:
+    id:
+      type: string
+      description: Request identifier
+    type:
+      type: string
+      enum:  # event discriminator: partial chunk, final chunk, or error
+        - "image_edit.partial_image"
+        - "image_edit.completed"
+        - "error"
+      description: Type of stream event
+    partial_image_index:
+      type: integer
+      description: Index of the partial image chunk
+    sequence_number:
+      type: integer
+      description: Sequence number for event ordering within the stream
+    b64_json:
+      type: string
+      description: Base64-encoded chunk of image data; optional
+    url:
+      type: string
+      format: uri
+      description: Optional public URL to the image chunk
+    created_at:
+      type: integer
+      format: int64
+      description: Timestamp when chunk was created
+    size:
+      type: string  # unconstrained here, whereas ImageGenerationStreamResponse.size uses an enum
+      description: Size of the image
+    quality:
+      type: string
+      description: Quality setting used
+    background:
+      type: string
+      description: Background type used
+    output_format:
+      type: string
+      enum:
+        - "png"
+        - "webp"
+        - "jpeg"
+      description: Output format used
+    revised_prompt:
+      type: string
+      description: Revised prompt
+    usage:
+      $ref: '#/ImageUsage'  # NOTE(review): OpenAPI 3.0 ignores siblings of $ref, so the description below only applies under 3.1; confirm target version
+      description: Token usage
+    error:
+      $ref: './common.yaml#/BifrostError'  # NOTE(review): same $ref-sibling caveat as `usage` above
+      description: Error information if edit failed
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
--- a/docs/openapi/schemas/inference/models.yaml
+++ b/docs/openapi/schemas/inference/models.yaml
@@ -0,0 +1,125 @@
+# Models API schemas
+
+ListModelsResponse:  # Model listing envelope; all properties optional
+  type: object
+  properties:
+    data:
+      type: array
+      items:
+        $ref: '#/Model'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+    next_page_token:
+      type: string  # presumably a pagination token for the next page; semantics not described here
+
+Model:  # One model entry in ListModelsResponse.data; no `required` list, so every property is optional
+  type: object
+  properties:
+    id:
+      type: string
+      description: Model ID in provider/model format
+    canonical_slug:
+      type: string
+    name:
+      type: string
+    deployment:
+      type: string
+    created:
+      type: integer
+      format: int64
+    context_length:
+      type: integer
+    max_input_tokens:
+      type: integer
+    max_output_tokens:
+      type: integer
+    architecture:
+      $ref: '#/Architecture'
+    pricing:
+      $ref: '#/Pricing'
+    top_provider:
+      $ref: '#/TopProvider'
+    per_request_limits:
+      $ref: '#/PerRequestLimits'
+    supported_parameters:
+      type: array
+      items:
+        type: string
+    default_parameters:
+      $ref: '#/DefaultParameters'
+    hugging_face_id:
+      type: string
+    description:  # a property named `description`, not the schema-level description keyword
+      type: string
+    owned_by:
+      type: string
+    supported_methods:
+      type: array
+      items:
+        type: string
+
+Architecture:  # Modality and tokenizer metadata for a Model; all properties optional
+  type: object
+  properties:
+    modality:
+      type: string
+    tokenizer:
+      type: string
+    instruct_type:
+      type: string
+    input_modalities:  # free-form strings; no enum constrains the modality names
+      type: array
+      items:
+        type: string
+    output_modalities:
+      type: array
+      items:
+        type: string
+
+Pricing:  # Price fields for a Model; every value is typed as a string, not a number
+  type: object
+  properties:
+    prompt:
+      type: string
+    completion:
+      type: string
+    request:
+      type: string
+    image:
+      type: string
+    web_search:
+      type: string
+    internal_reasoning:
+      type: string
+    input_cache_read:
+      type: string
+    input_cache_write:
+      type: string
+
+TopProvider:  # Provider-level limits and moderation flag for a Model; all properties optional
+  type: object
+  properties:
+    is_moderated:
+      type: boolean
+    context_length:
+      type: integer
+    max_completion_tokens:
+      type: integer
+
+PerRequestLimits:  # Per-request token limits for a Model; both fields optional integers
+  type: object
+  properties:
+    prompt_tokens:
+      type: integer
+    completion_tokens:
+      type: integer
+
+DefaultParameters:  # Default sampling parameters for a Model; numeric, with no ranges declared
+  type: object
+  properties:
+    temperature:
+      type: number
+    top_p:
+      type: number
+    frequency_penalty:
+      type: number
--- a/docs/openapi/schemas/inference/rerank.yaml
+++ b/docs/openapi/schemas/inference/rerank.yaml
@@ -0,0 +1,98 @@
+# Rerank API schemas
+
+RerankRequest:  # Rerank request body; `model`, `query` and `documents` are required
+  type: object
+  required:
+    - model
+    - query
+    - documents
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+      example: cohere/rerank-v3.5
+    query:
+      type: string
+      minLength: 1  # rejects the empty string
+      description: Query used to score and reorder documents
+    documents:
+      type: array
+      description: Documents to rerank
+      minItems: 1  # at least one document must be supplied
+      items:
+        $ref: '#/RerankDocument'
+    fallbacks:
+      type: array
+      items:
+        type: string
+      description: Fallback models in provider/model format
+    top_n:
+      type: integer
+      minimum: 1
+      description: Maximum number of ranked results to return
+    max_tokens_per_doc:
+      type: integer
+      minimum: 1
+      description: Maximum tokens to consider per document (provider-dependent)
+    priority:
+      type: integer  # no bounds declared, unlike top_n and max_tokens_per_doc
+      description: Request priority hint (provider-dependent)
+    return_documents:
+      type: boolean
+      description: Whether to include document content in each result
+
+RerankDocument:  # A document to rerank; used in RerankRequest.documents and RerankResult.document
+  type: object
+  required:
+    - text
+  properties:
+    text:
+      type: string
+      minLength: 1
+      description: Document text content
+    id:
+      type: string
+      minLength: 1  # when present, the id must be non-empty
+      description: Optional document identifier
+    meta:
+      type: object
+      description: Optional document metadata
+      additionalProperties: true  # arbitrary keys and values are allowed
+
+RerankResponse:  # Rerank result envelope; `results` and `model` are required
+  type: object
+  required:
+    - results
+    - model
+  properties:
+    id:
+      type: string
+      description: Unique identifier for the rerank response
+    results:
+      type: array
+      description: Ranked results ordered by relevance score descending
+      items:
+        $ref: '#/RerankResult'
+    model:
+      type: string
+      description: Model used to perform reranking
+    usage:
+      $ref: './usage.yaml#/BifrostLLMUsage'  # same usage shape that EmbeddingResponse references
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+RerankResult:  # One ranked entry; `index` and `relevance_score` are required, `document` is optional
+  type: object
+  required:
+    - index
+    - relevance_score
+  properties:
+    index:
+      type: integer
+      minimum: 0  # zero-based position; negative values are rejected
+      description: Index into the original documents array
+    relevance_score:
+      type: number  # no range is declared for the score
+      description: Relevance score for this document
+    document:
+      $ref: '#/RerankDocument'
--- a/docs/openapi/schemas/inference/responses.yaml
+++ b/docs/openapi/schemas/inference/responses.yaml
@@ -0,0 +1,716 @@
+# Responses API schemas
+
+ResponsesRequest:  # Responses API request body; only `model` and `input` are required
+  type: object
+  required:
+    - model
+    - input
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    input:
+      $ref: '#/ResponsesRequestInput'  # string or array of ResponsesMessage
+    fallbacks:
+      type: array
+      items:
+        type: string
+    stream:
+      type: boolean
+    background:
+      type: boolean
+    conversation:
+      type: string
+    include:
+      type: array
+      items:
+        type: string
+    instructions:
+      type: string
+    max_output_tokens:
+      type: integer
+    max_tool_calls:
+      type: integer
+    metadata:
+      type: object
+      additionalProperties: true  # arbitrary keys and values are allowed
+    parallel_tool_calls:
+      type: boolean
+    previous_response_id:
+      type: string
+    prompt_cache_key:
+      type: string
+    reasoning:
+      $ref: '#/ResponsesParametersReasoning'
+    safety_identifier:
+      type: string
+    service_tier:
+      type: string
+    stream_options:
+      $ref: '#/ResponsesStreamOptions'
+    store:
+      type: boolean
+    temperature:
+      type: number
+    text:
+      $ref: '#/ResponsesTextConfig'  # an object with format/verbosity, not a plain string
+    top_logprobs:
+      type: integer
+    top_p:
+      type: number
+    tool_choice:
+      $ref: '#/ResponsesToolChoice'
+    tools:
+      type: array
+      items:
+        $ref: '#/ResponsesTool'
+    truncation:
+      type: string  # free-form string; no enum lists the allowed values
+
+ResponsesRequestInput:  # Either a single string or an array of ResponsesMessage items
+  oneOf:  # the two branches have different JSON types, so at most one can match
+    - type: string
+    - type: array
+      items:
+        $ref: '#/ResponsesMessage'
+  description: Input - can be a string or array of messages
+
+ResponsesMessage:  # One flat object for every item kind in ResponsesMessageType; all properties optional
+  type: object
+  properties:
+    id:
+      type: string
+    type:
+      $ref: '#/ResponsesMessageType'  # discriminates the item kind; not marked required
+    status:
+      type: string
+      enum: [in_progress, completed, incomplete, interpreting, failed]
+    role:
+      type: string
+      enum: [assistant, user, system, developer]
+    content:
+      $ref: '#/ResponsesMessageContent'
+    call_id:
+      type: string
+    name:
+      type: string
+    arguments:
+      type: string  # a string, not an object; presumably JSON-encoded, verify against the server
+    output:
+      type: object
+    action:
+      type: object
+    error:
+      type: string
+    queries:
+      type: array
+      items:
+        type: string
+    results:
+      type: array
+      items:
+        type: object
+    summary:
+      type: array
+      items:
+        $ref: '#/ResponsesReasoningSummary'
+    encrypted_content:
+      type: string
+
+ResponsesMessageType:  # Closed set of item kinds carried in ResponsesMessage.type
+  type: string
+  enum:
+    - message
+    - file_search_call
+    - computer_call
+    - computer_call_output
+    - web_search_call
+    - web_fetch_call
+    - function_call
+    - function_call_output
+    - code_interpreter_call
+    - local_shell_call
+    - local_shell_call_output
+    - mcp_call
+    - custom_tool_call
+    - custom_tool_call_output
+    - image_generation_call
+    - mcp_list_tools
+    - mcp_approval_request
+    - mcp_approval_responses  # NOTE(review): plural, unlike the singular mcp_approval_request above; confirm it matches the server's value
+    - reasoning
+    - item_reference
+    - refusal
+
+ResponsesMessageContent:  # Message content: a plain string or an array of typed content blocks
+  oneOf:  # the two branches have different JSON types, so at most one can match
+    - type: string
+    - type: array
+      items:
+        $ref: '#/ResponsesMessageContentBlock'
+
+ResponsesMessageContentBlock:  # One flat object for all block kinds; only `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+      enum: [input_text, input_image, input_file, input_audio, output_text, refusal, reasoning_text]
+    file_id:
+      type: string
+    text:
+      type: string
+    signature:
+      type: string
+    image_url:
+      type: string  # a plain string here, not an object
+    detail:
+      type: string
+    file_data:
+      type: string
+    file_url:
+      type: string
+    filename:
+      type: string
+    file_type:
+      type: string
+    input_audio:
+      $ref: '#/ResponsesInputMessageContentBlockAudio'
+    annotations:
+      type: array
+      items:
+        $ref: '#/ResponsesOutputMessageContentTextAnnotation'
+    logprobs:
+      type: array
+      items:
+        $ref: '#/ResponsesOutputMessageContentTextLogProb'
+    refusal:
+      type: string
+    cache_control:
+      $ref: './common.yaml#/CacheControl'  # defined in common.yaml
+
+ResponsesInputMessageContentBlockAudio:  # Audio payload of an input_audio block; both fields are required
+  type: object
+  required:
+    - format
+    - data
+  properties:
+    format:
+      type: string
+      enum: [mp3, wav]
+    data:
+      type: string  # encoding is not stated in this schema; presumably base64, verify
+
+ResponsesOutputMessageContentTextAnnotation:  # One flat object for all four annotation kinds; all properties optional
+  type: object
+  properties:
+    type:
+      type: string  # not marked required, unlike `type` on the content block schema
+      enum: [file_citation, url_citation, container_file_citation, file_path]
+    index:
+      type: integer
+    file_id:
+      type: string
+    text:
+      type: string
+    start_index:
+      type: integer
+    end_index:
+      type: integer
+    filename:
+      type: string
+    title:
+      type: string
+    url:
+      type: string
+    container_id:
+      type: string
+
+ResponsesOutputMessageContentTextLogProb:  # Log-probability record for one output token; all properties optional
+  type: object
+  properties:
+    bytes:  # array of integers
+      type: array
+      items:
+        type: integer
+    logprob:
+      type: number
+    token:
+      type: string
+    top_logprobs:
+      type: array
+      items:
+        $ref: './chat.yaml#/LogProb'  # reuses the chat schema's LogProb from chat.yaml
+
+ResponsesParametersReasoning:  # Reasoning options referenced by ResponsesRequest.reasoning; all properties optional
+  type: object
+  properties:
+    effort:
+      type: string
+      enum: [none, minimal, low, medium, high, xhigh]
+    generate_summary:
+      type: string
+      deprecated: true  # flagged deprecated; `summary` below is the non-deprecated field
+    summary:
+      type: string
+      enum: [auto, concise, detailed]
+    max_tokens:
+      type: integer
+
+ResponsesStreamOptions:  # Streaming options referenced by ResponsesRequest.stream_options
+  type: object
+  properties:
+    include_obfuscation:  # the only option defined; optional boolean
+      type: boolean
+
+ResponsesTextConfig:  # Text output settings referenced by ResponsesRequest.text; both properties optional
+  type: object
+  properties:
+    format:
+      $ref: '#/ResponsesTextConfigFormat'
+    verbosity:
+      type: string
+      enum: [low, medium, high]
+
+ResponsesTextConfigFormat:  # Output format selector; only `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+      enum: [text, json_schema, json_object]
+    name:
+      type: string
+    schema:
+      type: object  # open object; presumably the JSON Schema used with type json_schema, verify
+    strict:
+      type: boolean
+
+ResponsesToolChoice:  # Tool selection: a mode string or a structured ResponsesToolChoiceStruct
+  oneOf:  # string versus object, so at most one branch can match
+    - type: string
+      enum: [none, auto, required]
+    - $ref: '#/ResponsesToolChoiceStruct'
+
+ResponsesToolChoiceStruct:  # Object form of tool_choice; only `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+      enum:  # includes none/auto/required, which the string form of ResponsesToolChoice also accepts
+        - none
+        - auto
+        - any
+        - required
+        - function
+        - allowed_tools
+        - file_search
+        - web_search_preview
+        - computer_use_preview
+        - code_interpreter
+        - image_generation
+        - mcp
+        - custom
+    mode:
+      type: string
+    name:
+      type: string
+    server_label:
+      type: string
+    tools:
+      type: array
+      items:
+        $ref: '#/ResponsesToolChoiceAllowedToolDef'  # presumably used with type allowed_tools, verify
+
+ResponsesToolChoiceAllowedToolDef:  # One entry of ResponsesToolChoiceStruct.tools; only `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+      enum: [function, mcp, image_generation]
+    name:
+      type: string
+    server_label:
+      type: string
+
+ResponsesTool:  # One flat schema for every tool kind; only `type` is required
+  type: object
+  required:
+    - type
+  properties:
+    type:
+      type: string
+      enum:  # tool kind; this schema does not tie the fields below to a particular kind
+        - function
+        - file_search
+        - computer_use_preview
+        - web_search
+        - web_fetch
+        - mcp
+        - code_interpreter
+        - image_generation
+        - local_shell
+        - custom
+        - web_search_preview
+        - memory
+        - tool_search
+    name:
+      type: string
+    description:
+      type: string
+    cache_control:
+      $ref: './common.yaml#/CacheControl'
+    parameters:
+      $ref: './chat.yaml#/ToolFunctionParameters'  # defined in chat.yaml
+    strict:
+      type: boolean
+    vector_store_ids:  # NOTE(review): this and the next three keys look like file_search options — confirm
+      type: array
+      items:
+        type: string
+    filters:
+      type: object
+    max_num_results:
+      type: integer
+    ranking_options:
+      type: object
+    display_height:  # NOTE(review): display_*/environment/enable_zoom look like computer-use options — confirm
+      type: integer
+    display_width:
+      type: integer
+    environment:
+      type: string
+    enable_zoom:
+      type: boolean
+    search_context_size:  # NOTE(review): likely web-search options together with user_location — confirm
+      type: string
+    user_location:
+      type: object
+    server_label:  # NOTE(review): server_label through server_description look like MCP options — confirm
+      type: string
+    server_url:
+      type: string
+    allowed_tools:
+      type: object
+    authorization:
+      type: string
+    connector_id:
+      type: string
+    headers:
+      type: object
+      additionalProperties:
+        type: string  # arbitrary keys, string values only
+    require_approval:
+      type: object
+    server_description:
+      type: string
+    container:
+      type: object
+    background:  # NOTE(review): background through size look like image_generation options — confirm
+      type: string
+    input_fidelity:
+      type: string
+    input_image_mask:
+      type: object
+    moderation:
+      type: string
+    output_compression:
+      type: integer
+    output_format:
+      type: string
+    partial_images:
+      type: integer
+    quality:
+      type: string
+    size:
+      type: string
+    format:
+      type: object
+
+ResponsesReasoningSummary:  # Summary entry; both `type` and `text` are required
+  type: object
+  required:
+    - type
+    - text
+  properties:
+    type:
+      type: string
+      enum: [summary_text]  # single allowed value, so it acts as a constant tag
+    text:
+      type: string
+
+ResponsesResponse:  # Responses API result object; no property is marked required
+  type: object
+  properties:
+    id:
+      type: string
+    background:
+      type: boolean
+    conversation:
+      type: object  # shape is not constrained here
+    created_at:
+      type: integer  # presumably a Unix timestamp — confirm
+    error:
+      $ref: '#/ResponsesResponseError'
+    include:
+      type: array
+      items:
+        type: string
+    incomplete_details:
+      $ref: '#/ResponsesResponseIncompleteDetails'
+    instructions:
+      type: object  # shape is not constrained here
+    max_output_tokens:
+      type: integer
+    max_tool_calls:
+      type: integer
+    metadata:
+      type: object
+    model:
+      type: string
+    output:
+      type: array
+      items:
+        $ref: '#/ResponsesMessage'  # same item schema as ResponsesStreamResponse.item
+    parallel_tool_calls:
+      type: boolean
+    previous_response_id:
+      type: string
+    prompt:
+      type: object
+    prompt_cache_key:
+      type: string
+    reasoning:
+      $ref: '#/ResponsesParametersReasoning'
+    safety_identifier:
+      type: string
+    service_tier:
+      type: string
+    status:
+      type: string
+      enum: [completed, failed, in_progress, canceled, queued, incomplete]
+    stop_reason:
+      type: string
+    store:
+      type: boolean
+    temperature:
+      type: number
+    text:
+      $ref: '#/ResponsesTextConfig'
+    top_logprobs:
+      type: integer
+    top_p:
+      type: number
+    tool_choice:
+      $ref: '#/ResponsesToolChoice'  # string or object form
+    tools:
+      type: array
+      items:
+        $ref: '#/ResponsesTool'
+    truncation:
+      type: string
+    usage:
+      $ref: '#/ResponsesResponseUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+    search_results:
+      type: array
+      items:
+        $ref: './chat.yaml#/PerplexitySearchResult'  # defined in chat.yaml
+    videos:
+      type: array
+      items:
+        $ref: './chat.yaml#/VideoResult'  # defined in chat.yaml
+    citations:
+      type: array
+      items:
+        type: string
+
+ResponsesResponseError:  # Error payload of ResponsesResponse.error; both keys are required
+  type: object
+  required:
+    - code
+    - message
+  properties:
+    code:
+      type: string  # free-form string; no enum of codes is declared here
+    message:
+      type: string
+
+ResponsesResponseIncompleteDetails:  # Payload of ResponsesResponse.incomplete_details
+  type: object
+  required:
+    - reason
+  properties:
+    reason:
+      type: string  # free-form string; allowed values are not enumerated here
+
+ResponsesResponseUsage:  # Token usage for a response; referenced by ResponsesResponse.usage
+  type: object
+  properties:
+    input_tokens:
+      type: integer  # per the detail schema, cache read/write tokens are already included here
+    input_tokens_details:
+      $ref: '#/ResponsesResponseInputTokens'
+    output_tokens:
+      type: integer
+    output_tokens_details:
+      $ref: '#/ResponsesResponseOutputTokens'
+    total_tokens:
+      type: integer
+    cost:
+      $ref: './usage.yaml#/BifrostCost'  # defined in usage.yaml
+
+ResponsesResponseInputTokens:  # Breakdown of ResponsesResponseUsage.input_tokens
+  type: object
+  properties:  # all keys optional
+    text_tokens:
+      type: integer
+    audio_tokens:
+      type: integer
+    image_tokens:
+      type: integer
+    cached_read_tokens:
+      type: integer
+      description: >-
+        Tokens served from the prompt cache (cache hit), billed at the reduced
+        cache-read rate. Already included in the parent input_tokens total.
+    cached_write_tokens:
+      type: integer
+      description: >-
+        Tokens written to the prompt cache on this request, billed at the
+        cache-creation rate. Already included in the parent input_tokens total.
+        Populated for providers that separately report cache write tokens
+        (Anthropic, Bedrock).
+
+ResponsesResponseOutputTokens:  # Breakdown of ResponsesResponseUsage.output_tokens
+  type: object
+  properties:  # all keys optional
+    text_tokens:
+      type: integer
+    accepted_prediction_tokens:
+      type: integer
+    audio_tokens:
+      type: integer
+    reasoning_tokens:
+      type: integer
+    rejected_prediction_tokens:
+      type: integer
+    citation_tokens:
+      type: integer
+    num_search_queries:
+      type: integer  # a query count rather than a token count, going by the name — confirm
+
+ResponsesStreamResponse:  # Single flat envelope used for every stream event type
+  type: object
+  description: Streaming responses API response (SSE format)
+  properties:  # nothing is required; which keys accompany which `type` is not encoded here
+    type:
+      $ref: '#/ResponsesStreamResponseType'  # event name enum
+    sequence_number:
+      type: integer
+    response:
+      $ref: '#/ResponsesResponse'  # full response object
+    output_index:
+      type: integer
+    item:
+      $ref: '#/ResponsesMessage'
+    content_index:
+      type: integer
+    item_id:
+      type: string
+    part:
+      $ref: '#/ResponsesMessageContentBlock'
+    delta:
+      type: string
+    signature:
+      type: string
+    logprobs:
+      type: array
+      items:
+        $ref: '#/ResponsesOutputMessageContentTextLogProb'
+    text:
+      type: string
+    refusal:
+      type: string
+    arguments:
+      type: string
+    partial_image_b64:
+      type: string  # presumably base64 image data, going by the name — confirm
+    partial_image_index:
+      type: integer
+    annotation:
+      $ref: '#/ResponsesOutputMessageContentTextAnnotation'
+    annotation_index:
+      type: integer
+    code:
+      type: string  # NOTE(review): code/message/param look like the payload of the `error` event — confirm
+    message:
+      type: string
+    param:
+      type: string
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+ResponsesStreamResponseType:  # Event names allowed in ResponsesStreamResponse.type
+  type: string
+  enum:  # every value is `response.`-prefixed except the final `error`
+    - response.ping
+    - response.created
+    - response.in_progress
+    - response.completed
+    - response.failed
+    - response.incomplete
+    - response.output_item.added
+    - response.output_item.done
+    - response.content_part.added
+    - response.content_part.done
+    - response.output_text.delta
+    - response.output_text.done
+    - response.refusal.delta
+    - response.refusal.done
+    - response.function_call_arguments.delta
+    - response.function_call_arguments.done
+    - response.file_search_call.in_progress
+    - response.file_search_call.searching
+    - response.file_search_call.results.added
+    - response.file_search_call.results.completed
+    - response.web_search_call.searching
+    - response.web_search_call.results.added
+    - response.web_search_call.results.completed
+    - response.web_fetch_call.in_progress
+    - response.web_fetch_call.fetching
+    - response.web_fetch_call.completed
+    - response.reasoning_summary_part.added
+    - response.reasoning_summary_part.done
+    - response.reasoning_summary_text.delta
+    - response.reasoning_summary_text.done
+    - response.image_generation_call.completed
+    - response.image_generation_call.generating
+    - response.image_generation_call.in_progress
+    - response.image_generation_call.partial_image
+    - response.mcp_call_arguments.delta
+    - response.mcp_call_arguments.done
+    - response.mcp_call.completed
+    - response.mcp_call.failed
+    - response.mcp_call.in_progress
+    - response.mcp_list_tools.completed
+    - response.mcp_list_tools.failed
+    - response.mcp_list_tools.in_progress
+    - response.code_interpreter_call.in_progress
+    - response.code_interpreter_call.interpreting
+    - response.code_interpreter_call.completed
+    - response.code_interpreter_call_code.delta
+    - response.code_interpreter_call_code.done
+    - response.output_text.annotation.added
+    - response.output_text.annotation.done
+    - response.queued
+    - response.custom_tool_call_input.delta
+    - response.custom_tool_call_input.done
+    - error
--- a/docs/openapi/schemas/inference/speech.yaml
+++ b/docs/openapi/schemas/inference/speech.yaml
@@ -0,0 +1,132 @@
+# Speech API schemas
+
+SpeechRequest:  # Text-to-speech request body; model, input and voice are mandatory
+  type: object
+  required:
+    - model
+    - input
+    - voice
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    input:
+      type: string
+      description: Text to convert to speech
+    fallbacks:
+      type: array
+      items:
+        type: string  # presumably fallback model identifiers — confirm
+    stream_format:
+      type: string
+      enum: [sse]  # the only accepted value
+      description: Set to "sse" to enable streaming
+    voice:
+      $ref: '#/SpeechVoiceInput'  # a single string or a list of VoiceConfig entries
+    instructions:
+      type: string
+    response_format:
+      type: string
+      enum: [mp3, opus, aac, flac, wav, pcm]
+    speed:
+      type: number
+      minimum: 0.25  # bounds are inclusive: 0.25 <= speed <= 4.0
+      maximum: 4.0
+    language_code:
+      type: string
+    pronunciation_dictionary_locators:
+      type: array
+      items:
+        $ref: '#/SpeechPronunciationDictionaryLocator'
+    enable_logging:
+      type: boolean
+    optimize_streaming_latency:
+      type: boolean
+    with_timestamps:
+      type: boolean  # presumably what populates the alignment fields in SpeechResponse — confirm
+
+SpeechVoiceInput:  # Voice selector for SpeechRequest.voice: one of two shapes
+  oneOf:
+    - type: string  # a single voice given as a plain string
+    - type: array  # or a list of speaker-to-voice mappings
+      items:
+        $ref: '#/VoiceConfig'
+
+VoiceConfig:  # One speaker/voice pair; item type of the array form of SpeechVoiceInput
+  type: object
+  required:  # both keys must be present
+    - speaker
+    - voice
+  properties:
+    speaker:
+      type: string
+    voice:
+      type: string
+
+SpeechPronunciationDictionaryLocator:  # Item of SpeechRequest.pronunciation_dictionary_locators
+  type: object
+  required:
+    - pronunciation_dictionary_id
+  properties:
+    pronunciation_dictionary_id:
+      type: string
+    version_id:
+      type: string  # optional
+
+SpeechResponse:  # Speech synthesis result; no property is marked required
+  type: object
+  properties:
+    audio:
+      type: string
+      format: byte  # OpenAPI `byte` means a base64-encoded string, not raw octets
+      description: Audio data (base64-encoded)
+    usage:
+      $ref: '#/SpeechUsage'
+    alignment:
+      $ref: '#/SpeechAlignment'  # the same schema is reused for normalized_alignment
+    normalized_alignment:
+      $ref: '#/SpeechAlignment'
+    audio_base64:
+      type: string  # NOTE(review): relationship to `audio` is not documented here — confirm
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+SpeechUsage:  # Token counts used by SpeechResponse and SpeechStreamResponse
+  type: object
+  properties:  # all keys optional
+    input_tokens:
+      type: integer
+    output_tokens:
+      type: integer
+    total_tokens:
+      type: integer
+
+SpeechAlignment:  # Character timing data; three arrays, presumably index-aligned — confirm
+  type: object
+  properties:
+    char_start_times_ms:
+      type: array
+      items:
+        type: number  # milliseconds, going by the `_ms` suffix
+    char_end_times_ms:
+      type: array
+      items:
+        type: number  # milliseconds, going by the `_ms` suffix
+    characters:
+      type: array
+      items:
+        type: string
+
+SpeechStreamResponse:  # One streamed speech event
+  type: object
+  properties:
+    type:
+      type: string
+      enum: [speech.audio.delta, speech.audio.done]  # the two event names
+    audio:
+      type: string
+      format: byte  # OpenAPI `byte` means a base64-encoded string
+    usage:
+      $ref: '#/SpeechUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
--- a/docs/openapi/schemas/inference/text.yaml
+++ b/docs/openapi/schemas/inference/text.yaml
@@ -0,0 +1,98 @@
+# Text Completions API schemas
+
+TextCompletionRequest:  # Text completion request body; model and prompt are mandatory
+  type: object
+  required:
+    - model
+    - prompt
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    prompt:
+      $ref: '#/TextCompletionInput'  # a string or an array of strings
+    fallbacks:
+      type: array
+      items:
+        type: string  # presumably fallback model identifiers — confirm
+    stream:
+      type: boolean
+    best_of:
+      type: integer
+    echo:
+      type: boolean
+    frequency_penalty:
+      type: number
+    logit_bias:
+      type: object
+      additionalProperties:
+        type: number  # arbitrary keys, numeric values
+    logprobs:
+      type: integer
+    max_tokens:
+      type: integer
+    n:
+      type: integer
+    presence_penalty:
+      type: number
+    seed:
+      type: integer
+    stop:
+      type: array  # only the array form is declared; a bare string is not accepted by this schema
+      items:
+        type: string
+    suffix:
+      type: string
+    temperature:
+      type: number
+    top_p:
+      type: number
+    user:
+      type: string
+
+TextCompletionInput:  # Type of TextCompletionRequest.prompt
+  oneOf:
+    - type: string  # single prompt
+    - type: array  # several prompts
+      items:
+        type: string
+  description: Prompt input - can be a string or array of strings
+
+TextCompletionResponse:  # Non-streaming text completion result; no property is marked required
+  type: object
+  properties:
+    id:
+      type: string
+    choices:
+      type: array
+      items:
+        $ref: './chat.yaml#/BifrostResponseChoice'  # choice schema defined in chat.yaml
+    model:
+      type: string
+    object:
+      type: string
+    system_fingerprint:
+      type: string  # not present on TextCompletionStreamResponse
+    usage:
+      $ref: './usage.yaml#/BifrostLLMUsage'  # defined in usage.yaml
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+TextCompletionStreamResponse:  # Same keys as TextCompletionResponse minus system_fingerprint
+  type: object
+  description: Streaming text completion response
+  properties:
+    id:
+      type: string
+    choices:
+      type: array
+      items:
+        $ref: './chat.yaml#/BifrostResponseChoice'  # choice schema defined in chat.yaml
+    model:
+      type: string
+    object:
+      type: string
+    usage:
+      $ref: './usage.yaml#/BifrostLLMUsage'  # defined in usage.yaml
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
--- a/docs/openapi/schemas/inference/transcription.yaml
+++ b/docs/openapi/schemas/inference/transcription.yaml
@@ -0,0 +1,150 @@
+# Transcription API schemas
+
+TranscriptionRequest:  # Audio transcription request; model and file are mandatory
+  type: object
+  required:
+    - model
+    - file
+  properties:
+    model:
+      type: string
+      description: Model in provider/model format
+    file:
+      type: string
+      format: binary  # OpenAPI `binary` means raw octets; presumably sent as a multipart upload — confirm
+      description: Audio file to transcribe
+    fallbacks:
+      type: array
+      items:
+        type: string  # presumably fallback model identifiers — confirm
+    stream:
+      type: boolean
+    language:
+      type: string
+    prompt:
+      type: string
+    response_format:
+      type: string
+      enum: [json, text, srt, verbose_json, vtt]
+    file_format:
+      type: string  # free-form; accepted values are not enumerated here
+
+TranscriptionResponse:  # Transcription result; no property is marked required
+  type: object
+  properties:
+    duration:
+      type: number  # unit not stated here; presumably seconds — confirm
+    language:
+      type: string
+    logprobs:
+      type: array
+      items:
+        $ref: '#/TranscriptionLogProb'
+    segments:
+      type: array
+      items:
+        $ref: '#/TranscriptionSegment'
+    task:
+      type: string
+    text:
+      type: string
+    usage:
+      $ref: '#/TranscriptionUsage'
+    words:
+      type: array
+      items:
+        $ref: '#/TranscriptionWord'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+TranscriptionLogProb:  # Per-token log probability; used by the response and stream schemas
+  type: object
+  properties:  # all keys optional
+    token:
+      type: string
+    logprob:
+      type: number
+    bytes:
+      type: array
+      items:
+        type: integer  # presumably the token's byte values — confirm
+
+TranscriptionSegment:  # Item of TranscriptionResponse.segments
+  type: object
+  properties:  # all keys optional
+    id:
+      type: integer
+    seek:
+      type: integer
+    start:
+      type: number  # unit not stated here; presumably seconds — confirm
+    end:
+      type: number
+    text:
+      type: string
+    tokens:
+      type: array
+      items:
+        type: integer  # presumably token ids — confirm
+    temperature:
+      type: number
+    avg_logprob:
+      type: number
+    compression_ratio:
+      type: number
+    no_speech_prob:
+      type: number
+
+TranscriptionWord:  # Item of TranscriptionResponse.words
+  type: object
+  properties:  # all keys optional
+    word:
+      type: string
+    start:
+      type: number  # unit not stated here; presumably seconds — confirm
+    end:
+      type: number
+
+TranscriptionUsage:  # Usage record for a transcription; `type` picks the accounting mode
+  type: object
+  properties:
+    type:
+      type: string
+      enum: [tokens, duration]  # presumably token fields go with `tokens` and `seconds` with `duration` — confirm
+    input_tokens:
+      type: integer
+    input_token_details:
+      $ref: '#/TranscriptionUsageInputTokenDetails'
+    output_tokens:
+      type: integer
+    total_tokens:
+      type: integer
+    seconds:
+      type: integer  # whole numbers only; fractional durations are not representable
+
+TranscriptionUsageInputTokenDetails:  # Payload of TranscriptionUsage.input_token_details
+  type: object
+  properties:  # both keys optional
+    text_tokens:
+      type: integer
+    audio_tokens:
+      type: integer
+
+TranscriptionStreamResponse:  # One streamed transcription event
+  type: object
+  properties:
+    type:
+      type: string
+      enum: [transcript.text.delta, transcript.text.done]  # the two event names
+    delta:
+      type: string  # presumably set on delta events — confirm
+    logprobs:
+      type: array
+      items:
+        $ref: '#/TranscriptionLogProb'
+    text:
+      type: string  # presumably set on done events — confirm
+    usage:
+      $ref: '#/TranscriptionUsage'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
--- a/docs/openapi/schemas/inference/usage.yaml
+++ b/docs/openapi/schemas/inference/usage.yaml
@@ -0,0 +1,89 @@
+# Usage and cost-related schemas
+
+BifrostLLMUsage:  # Shared usage block; referenced from the text completion schemas
+  type: object
+  description: Token usage information
+  properties:  # all keys optional
+    prompt_tokens:
+      type: integer
+      description: >-
+        Total input tokens including any prompt-cache tokens (read + write).
+        Subtract prompt_tokens_details.cached_read_tokens and
+        prompt_tokens_details.cached_write_tokens to get the non-cached portion.
+    prompt_tokens_details:
+      $ref: '#/ChatPromptTokensDetails'
+    completion_tokens:
+      type: integer
+      description: Number of output/completion tokens generated.
+    completion_tokens_details:
+      $ref: '#/ChatCompletionTokensDetails'
+    total_tokens:
+      type: integer
+    cost:
+      $ref: '#/BifrostCost'  # same-file schema
+
+ChatPromptTokensDetails:  # Breakdown of BifrostLLMUsage.prompt_tokens
+  type: object
+  properties:  # all keys optional
+    text_tokens:
+      type: integer
+    audio_tokens:
+      type: integer
+    image_tokens:
+      type: integer
+    cached_read_tokens:
+      type: integer
+      description: >-
+        Tokens served from the prompt cache (cache hit). These tokens are already
+        included in prompt_tokens and are billed at the reduced cache-read rate.
+        Populated for all providers that support prompt caching (Anthropic, Bedrock,
+        OpenAI, Gemini, xAI, etc.).
+    cached_write_tokens:
+      type: integer
+      description: >-
+        Tokens written to the prompt cache on this request (cache creation / write).
+        These tokens are already included in prompt_tokens and are billed at the
+        cache-creation rate. Populated for providers that separately report cache
+        write tokens (Anthropic, Bedrock).
+
+ChatCompletionTokensDetails:  # Breakdown of BifrostLLMUsage.completion_tokens
+  type: object
+  properties:  # all keys optional
+    text_tokens:
+      type: integer
+    accepted_prediction_tokens:
+      type: integer
+    audio_tokens:
+      type: integer
+    citation_tokens:
+      type: integer
+    num_search_queries:
+      type: integer  # a query count rather than a token count, going by the name — confirm
+    reasoning_tokens:
+      type: integer
+    image_tokens:
+      type: integer
+    rejected_prediction_tokens:
+      type: integer
+
+BifrostCost:  # Referenced by BifrostLLMUsage.cost and ResponsesResponseUsage.cost
+  type: object
+  description: Cost breakdown for the request
+  properties:  # all keys optional; currency/unit is not stated in this schema
+    input_tokens_cost:
+      type: number
+    output_tokens_cost:
+      type: number
+    reasoning_tokens_cost:
+      type: number
+      description: Cost for reasoning/thinking tokens (reasoning models)
+    citation_tokens_cost:
+      type: number
+      description: Cost for citation tokens
+    search_queries_cost:
+      type: number
+      description: Cost for web search queries
+    request_cost:
+      type: number
+    total_cost:
+      type: number  # presumably the sum of the components above — confirm
--- a/docs/openapi/schemas/inference/videos.yaml
+++ b/docs/openapi/schemas/inference/videos.yaml
@@ -0,0 +1,254 @@
+# Video Generation schemas
+
+VideoGenerationRequest:  # Video generation request body; model and prompt are mandatory
+  type: object
+  required:
+    - model
+    - prompt
+  properties:
+    model:
+      type: string
+      description: Model identifier in format `provider/model`
+    prompt:
+      type: string
+      description: Text prompt describing the video to generate
+    input_reference:
+      type: string
+      description: Optional reference image for image-to-video. OpenAI and Gemini require a base64 data URL (e.g., `data:image/png;base64,...`). Runway and Replicate accept both data URLs and plain URLs.
+    seconds:
+      type: string  # a string, not a number (see description)
+      description: Duration of the video in seconds as a string (e.g., "4")
+    size:
+      type: string
+      description: Resolution of the generated video (e.g., `1280x720`, `720x1280`, `1920x1080`)
+    negative_prompt:
+      type: string
+      description: Text describing what to avoid in the generated video
+    seed:
+      type: integer
+      description: Seed for reproducible generation
+    video_uri:
+      type: string
+      description: Source video URI for video-to-video generation (provider-specific, e.g. GCS URI)
+    audio:
+      type: boolean
+      description: Enable audio generation in the video (supported by select providers/models)
+    fallbacks:
+      type: array
+      items:
+        $ref: './common.yaml#/Fallback'  # NOTE(review): speech/text/transcription requests use plain string items here — confirm intended
+      description: Fallback models to try if primary model fails
+
+VideoGenerationResponse:  # Video job state plus outputs; no property is marked required
+  type: object
+  properties:
+    id:
+      type: string
+      description: Provider-native job ID. To use in path parameters (retrieve/delete/download), combine as `{id}:{provider}` (e.g., `task_abc123:runway`)
+    object:
+      type: string
+      enum:  # single value, so effectively a constant
+        - video
+      description: Object type, always "video"
+    model:
+      type: string
+      description: Model used for video generation
+    status:
+      $ref: '#/VideoStatus'  # queued / in_progress / completed / failed
+    progress:
+      type: number
+      format: float
+      minimum: 0  # inclusive bounds: 0 <= progress <= 100
+      maximum: 100
+      description: Approximate completion percentage (0-100)
+    prompt:
+      type: string
+      description: Prompt used to generate the video
+    remixed_from_video_id:
+      type: string
+      description: Source video ID if this is a remix
+    seconds:
+      type: string  # a string, not a number (see description)
+      description: Duration of the generated video in seconds as a string (e.g., "4")
+    size:
+      $ref: '#/VideoSize'
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when the job was created
+    completed_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when the job completed
+    expires_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when downloadable assets expire
+    videos:
+      type: array
+      description: Generated video outputs (only present when status is "completed")
+      items:
+        type: object  # inline item schema; it has no named definition
+        properties:
+          type:
+            type: string
+            enum:  # selects which of `url` / `base64` carries the payload
+              - url
+              - base64
+            description: Output format of this video
+          url:
+            type: string
+            format: uri
+            description: URL to the generated video (present when type is "url")
+          base64:
+            type: string
+            description: Base64-encoded video content (present when type is "base64")
+          content_type:
+            type: string
+            description: MIME type of the video (e.g., "video/mp4")
+    error:
+      $ref: '#/VideoError'
+    content_filter:
+      $ref: '#/VideoContentFilter'
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+VideoRemixRequest:  # Remix request body; the source video is not part of this schema
+  type: object
+  required:
+    - prompt
+  properties:
+    prompt:
+      type: string  # the only field, and it is mandatory
+      description: Text prompt describing how to remix the video
+
+VideoListResponse:  # One page of video jobs with cursor-style bookkeeping fields
+  type: object
+  properties:
+    object:
+      type: string
+      enum:  # single value, so effectively a constant
+        - list
+      description: Object type, always "list"
+    data:
+      type: array
+      items:
+        $ref: '#/VideoObject'  # list entries use VideoObject, not VideoGenerationResponse
+      description: Array of video generation jobs
+    first_id:
+      type: string
+      description: ID of the first item in the list
+    last_id:
+      type: string
+      description: ID of the last item in the list
+    has_more:
+      type: boolean
+      description: Whether there are more results available
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+VideoObject:  # VideoGenerationResponse minus videos, content_filter and extra_fields
+  type: object
+  properties:
+    id:
+      type: string
+      description: Provider-native video ID. To use in path parameters (retrieve/delete/download), combine as `{id}:{provider}` (e.g., `task_abc123:runway`)
+    object:
+      type: string
+      enum:  # single value, so effectively a constant
+        - video
+      description: Object type, always "video"
+    model:
+      type: string
+      description: Model used for generation
+    status:
+      $ref: '#/VideoStatus'  # queued / in_progress / completed / failed
+    progress:
+      type: number
+      format: float
+      minimum: 0  # inclusive bounds: 0 <= progress <= 100
+      maximum: 100
+      description: Approximate completion percentage (0-100)
+    prompt:
+      type: string
+      description: Prompt used to generate the video
+    remixed_from_video_id:
+      type: string
+      description: Source video ID if this is a remix
+    seconds:
+      type: string  # a string, not a number (see description)
+      description: Duration of the video in seconds as a string (e.g., "4")
+    size:
+      $ref: '#/VideoSize'
+    created_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when the job was created
+    completed_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when the job completed
+    expires_at:
+      type: integer
+      format: int64
+      description: Unix timestamp (seconds) when downloadable assets expire
+    error:
+      $ref: '#/VideoError'
+
+VideoDeleteResponse:  # Acknowledgement returned for a video deletion
+  type: object
+  properties:
+    id:
+      type: string
+      description: ID of the deleted video
+    object:
+      type: string
+      enum:  # single value, so effectively a constant
+        - video.deleted
+      description: Object type, always "video.deleted"
+    deleted:
+      type: boolean
+      description: Whether the video was successfully deleted
+    extra_fields:
+      $ref: './common.yaml#/BifrostResponseExtraFields'  # defined in common.yaml
+
+VideoStatus:  # Lifecycle states used by VideoGenerationResponse and VideoObject
+  type: string
+  enum:
+    - queued
+    - in_progress
+    - completed
+    - failed
+  description: |-
+    Current lifecycle status of the video generation job:
+    - `queued`: Job is waiting to be processed
+    - `in_progress`: Video is currently being generated
+    - `completed`: Video generation completed successfully
+    - `failed`: Video generation failed
+
+VideoSize:  # Used for the `size` field of VideoGenerationResponse and VideoObject
+  type: string  # free-form; no pattern or enum constrains the value
+  description: Resolution of the generated video (e.g., "1920x1080")
+
+VideoError:  # Error payload on video jobs; unlike ResponsesResponseError, nothing is required
+  type: object
+  properties:
+    code:
+      type: string  # free-form; no enum of codes is declared here
+      description: Error code
+    message:
+      type: string
+      description: Human-readable error message
+
+VideoContentFilter:  # Referenced only by VideoGenerationResponse.content_filter in this file
+  type: object
+  description: Information about content that was filtered due to safety policies
+  properties:  # both keys optional
+    filtered_count:
+      type: integer
+      description: Number of items filtered
+    reasons:
+      type: array
+      items:
+        type: string
+      description: Human-readable reasons for filtering