# Speech API schemas

# Request body for a speech (text-to-speech) call.
# `model`, `input` and `voice` are mandatory; every other property is optional.
SpeechRequest:
  type: object
  required:
    - model
    - input
    - voice
  properties:
    model:
      type: string
      description: Model in provider/model format
    input:
      type: string
      description: Text to convert to speech
    # Array of strings.
    # NOTE(review): presumably alternate models to try - confirm semantics.
    fallbacks:
      type: array
      items:
        type: string
    # The only accepted value is "sse".
    stream_format:
      type: string
      enum:
        - sse
      description: 'Set to "sse" to enable streaming'
    # Either a plain string or a list of VoiceConfig objects
    # (see SpeechVoiceInput).
    voice:
      $ref: "#/SpeechVoiceInput"
    instructions:
      type: string
    # Restricted to the six format names listed below.
    response_format:
      type: string
      enum:
        - mp3
        - opus
        - aac
        - flac
        - wav
        - pcm
    # Number bounded to the inclusive range 0.25 - 4.0.
    speed:
      type: number
      minimum: 0.25
      maximum: 4.0
    language_code:
      type: string
    pronunciation_dictionary_locators:
      type: array
      items:
        $ref: "#/SpeechPronunciationDictionaryLocator"
    enable_logging:
      type: boolean
    optimize_streaming_latency:
      type: boolean
    with_timestamps:
      type: boolean
|
|
|
|
# Voice selector: exactly one of the two shapes below must match
# (a single string, or an array of VoiceConfig objects).
SpeechVoiceInput:
  oneOf:
    - type: string
    - type: array
      items:
        $ref: "#/VoiceConfig"
|
|
|
|
# One speaker/voice pairing; both fields are required strings.
# NOTE(review): presumably used for multi-speaker synthesis - confirm.
VoiceConfig:
  type: object
  required:
    - speaker
    - voice
  properties:
    speaker:
      type: string
    voice:
      type: string
|
|
|
|
# Reference to a pronunciation dictionary.
# Only the dictionary id is required; `version_id` is optional.
SpeechPronunciationDictionaryLocator:
  type: object
  required:
    - pronunciation_dictionary_id
  properties:
    pronunciation_dictionary_id:
      type: string
    version_id:
      type: string
|
|
|
|
# Response object for a speech call.
# No `required` list is declared, so every property is optional.
SpeechResponse:
  type: object
  properties:
    # NOTE(review): in OpenAPI, `format: byte` denotes base64-encoded
    # content, while the description below says "binary" - confirm intent.
    audio:
      type: string
      format: byte
      description: Audio data (binary)
    usage:
      $ref: "#/SpeechUsage"
    # `alignment` and `normalized_alignment` share the SpeechAlignment shape.
    alignment:
      $ref: "#/SpeechAlignment"
    normalized_alignment:
      $ref: "#/SpeechAlignment"
    audio_base64:
      type: string
    # Schema lives in the sibling file common.yaml.
    extra_fields:
      $ref: "./common.yaml#/BifrostResponseExtraFields"
|
|
|
|
# Token accounting attached to speech responses.
# All three counters are optional integers.
SpeechUsage:
  type: object
  properties:
    input_tokens:
      type: integer
    output_tokens:
      type: integer
    total_tokens:
      type: integer
|
|
|
|
# Timing data: two numeric arrays plus one string array.
# NOTE(review): names suggest per-character start/end times in
# milliseconds, index-aligned with `characters` - confirm.
SpeechAlignment:
  type: object
  properties:
    char_start_times_ms:
      type: array
      items:
        type: number
    char_end_times_ms:
      type: array
      items:
        type: number
    characters:
      type: array
      items:
        type: string
|
|
|
|
# A single event of a streamed speech response.
# No `required` list is declared, so every property is optional.
SpeechStreamResponse:
  type: object
  properties:
    # Event discriminator (a property literally named `type`);
    # one of the two event names listed below.
    type:
      type: string
      enum:
        - speech.audio.delta
        - speech.audio.done
    # `format: byte` is OpenAPI's base64-encoded string format.
    audio:
      type: string
      format: byte
    usage:
      $ref: "#/SpeechUsage"
    # Schema lives in the sibling file common.yaml.
    extra_fields:
      $ref: "./common.yaml#/BifrostResponseExtraFields"
|