# Speech API schemas

# Request body schema for a text-to-speech call.
# `model`, `input` and `voice` must be present; every other property is optional.
SpeechRequest:
  type: object
  required:
    - model
    - input
    - voice
  properties:
    model:
      type: string
      description: Model in provider/model format
    input:
      type: string
      description: Text to convert to speech
    # NOTE(review): presumably alternative models to try, in the same
    # provider/model format as `model` — the schema only says "list of strings".
    fallbacks:
      type: array
      items:
        type: string
    # The only accepted value is "sse"; omit the property for a non-streamed reply.
    stream_format:
      type: string
      enum:
        - sse
      description: 'Set to "sse" to enable streaming'
    # Either a single string or a list of speaker/voice pairs (see SpeechVoiceInput).
    voice:
      $ref: '#/SpeechVoiceInput'
    instructions:
      type: string
    # Closed set of accepted output audio formats.
    response_format:
      type: string
      enum:
        - mp3
        - opus
        - aac
        - flac
        - wav
        - pcm
    # Bounds are inclusive (plain JSON Schema minimum/maximum).
    speed:
      type: number
      minimum: 0.25
      maximum: 4.0
    # NOTE(review): the remaining options carry no description here; they look
    # provider-specific — confirm their semantics against the handler.
    language_code:
      type: string
    pronunciation_dictionary_locators:
      type: array
      items:
        $ref: '#/SpeechPronunciationDictionaryLocator'
    enable_logging:
      type: boolean
    optimize_streaming_latency:
      type: boolean
    with_timestamps:
      type: boolean

# Voice selection: exactly one of
#   - a single string, or
#   - an array of VoiceConfig entries (one speaker/voice pair each).
# NOTE(review): presumably the string form is a voice name/ID and the array
# form is for multi-speaker output — confirm.
SpeechVoiceInput:
  oneOf:
    - type: string
    - type: array
      items:
        $ref: '#/VoiceConfig'

# Pairs a `speaker` with a `voice`; both are required strings.
# Used as the item type of the array form of SpeechVoiceInput.
VoiceConfig:
  type: object
  required:
    - speaker
    - voice
  properties:
    speaker:
      type: string
    voice:
      type: string

# Reference to a pronunciation dictionary.
# `pronunciation_dictionary_id` is required; `version_id` is optional.
SpeechPronunciationDictionaryLocator:
  type: object
  required:
    - pronunciation_dictionary_id
  properties:
    pronunciation_dictionary_id:
      type: string
    version_id:
      type: string

# Response object for a speech request. No property is marked required.
SpeechResponse:
  type: object
  properties:
    # `format: byte` is OpenAPI's base64-encoded string format (raw octets would
    # be `format: binary`, which cannot be embedded in a JSON object), so the
    # description states the encoding explicitly.
    audio:
      type: string
      format: byte
      description: Audio data (base64-encoded bytes)
    usage:
      $ref: '#/SpeechUsage'
    # `alignment` and `normalized_alignment` share the SpeechAlignment shape.
    # NOTE(review): presumably only populated when timestamps are requested — confirm.
    alignment:
      $ref: '#/SpeechAlignment'
    normalized_alignment:
      $ref: '#/SpeechAlignment'
    # NOTE(review): plain string with no format; presumably a base64 copy of the
    # audio returned alongside alignment data — confirm.
    audio_base64:
      type: string
    extra_fields:
      $ref: './common.yaml#/BifrostResponseExtraFields'

# Token accounting attached to speech responses. All three counters are
# optional integers.
# NOTE(review): presumably total_tokens = input_tokens + output_tokens; the
# schema does not enforce this.
SpeechUsage:
  type: object
  properties:
    input_tokens:
      type: integer
    output_tokens:
      type: integer
    total_tokens:
      type: integer

# Character-level timing data: a list of characters plus numeric start and end
# times for them.
# NOTE(review): field names suggest milliseconds and three parallel arrays
# indexed per character; the schema does not enforce equal lengths — confirm.
SpeechAlignment:
  type: object
  properties:
    char_start_times_ms:
      type: array
      items:
        type: number
    char_end_times_ms:
      type: array
      items:
        type: number
    characters:
      type: array
      items:
        type: string

# One event of a streamed speech response. No property is marked required.
SpeechStreamResponse:
  type: object
  properties:
    # Event discriminator; only these two values are allowed.
    # NOTE(review): presumably `delta` events carry an audio chunk and the
    # `done` event closes the stream (with usage) — confirm.
    type:
      type: string
      enum:
        - speech.audio.delta
        - speech.audio.done
    # `format: byte` means the string is base64-encoded.
    audio:
      type: string
      format: byte
    usage:
      $ref: '#/SpeechUsage'
    extra_fields:
      $ref: './common.yaml#/BifrostResponseExtraFields'