first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/docs/openapi/schemas/inference/speech.yaml
+++ b/docs/openapi/schemas/inference/speech.yaml
@@ -0,0 +1,132 @@
+# Speech API schemas
+
+SpeechRequest:  # Text-to-speech request; see `required` at the bottom.
+  type: object
+  properties:
+    model:
+      description: Model in provider/model format
+      type: string
+    input:
+      description: Text to convert to speech
+      type: string
+    fallbacks:  # NOTE(review): presumably alternate models to try - confirm
+      type: array
+      items:
+        type: string
+    stream_format:  # the only accepted value is `sse`
+      description: Set to "sse" to enable streaming
+      type: string
+      enum: [sse]
+    voice:  # shape is defined by SpeechVoiceInput in this file
+      $ref: '#/SpeechVoiceInput'
+    instructions:  # free-form string; no constraints declared here
+      type: string
+    response_format:  # one of six audio format names
+      type: string
+      enum: [mp3, opus, aac, flac, wav, pcm]
+    speed:  # number bounded to 0.25..4.0 (both bounds inclusive)
+      type: number
+      minimum: 0.25
+      maximum: 4.0
+    language_code:  # string; format is not constrained by this schema
+      type: string
+    pronunciation_dictionary_locators:  # list of locator objects
+      type: array
+      items:
+        $ref: '#/SpeechPronunciationDictionaryLocator'
+    enable_logging:
+      type: boolean
+    optimize_streaming_latency:
+      type: boolean
+    with_timestamps:  # NOTE(review): may drive alignment output - confirm
+      type: boolean
+  required:
+    - model
+    - input
+    - voice
+
+SpeechVoiceInput:  # Voice selector; must match exactly one shape below.
+  oneOf:
+    - type: string  # shape 1: a single string
+    - type: array  # shape 2: a list of VoiceConfig objects
+      items:
+        $ref: '#/VoiceConfig'
+
+VoiceConfig:  # Pairs a speaker with a voice; both strings are mandatory.
+  type: object
+  properties:
+    speaker:
+      type: string
+    voice:
+      type: string
+  required:
+    - speaker
+    - voice
+
+SpeechPronunciationDictionaryLocator:  # dictionary id + optional version
+  type: object
+  properties:
+    pronunciation_dictionary_id:
+      type: string
+    version_id:  # optional: not listed under `required`
+      type: string
+  required:
+    - pronunciation_dictionary_id
+
+SpeechResponse:  # Speech result object; every property is optional.
+  type: object
+  properties:
+    audio:
+      type: string
+      format: byte  # OpenAPI `byte` = base64 text; raw octets are `binary`
+      description: Audio data (base64-encoded)
+    usage:
+      $ref: '#/SpeechUsage'
+    alignment:
+      $ref: '#/SpeechAlignment'
+    normalized_alignment:  # same schema as `alignment`
+      $ref: '#/SpeechAlignment'
+    audio_base64:  # plain string; no `format` declared for this field
+      type: string
+    extra_fields:  # schema lives in the sibling file common.yaml
+      $ref: './common.yaml#/BifrostResponseExtraFields'
+
+SpeechUsage:  # Integer token counters referenced by the speech responses.
+  type: object
+  properties:
+    input_tokens:
+      type: integer
+    output_tokens:
+      type: integer
+    total_tokens:  # NOTE(review): presumably input + output - confirm
+      type: integer
+
+SpeechAlignment:  # Character timing data expressed as three arrays.
+  type: object
+  properties:
+    char_start_times_ms:  # numbers; milliseconds per the `_ms` suffix
+      type: array
+      items:
+        type: number
+    char_end_times_ms:  # numbers; milliseconds per the `_ms` suffix
+      type: array
+      items:
+        type: number
+    characters:  # NOTE(review): presumably index-aligned with times - confirm
+      type: array
+      items:
+        type: string
+
+SpeechStreamResponse:  # Object carried by a streamed speech response.
+  type: object
+  properties:
+    type:  # event kind: one of the two names in the enum below
+      type: string
+      enum: [speech.audio.delta, speech.audio.done]
+    audio:
+      type: string
+      format: byte  # OpenAPI `byte` = base64-encoded string
+    usage:  # NOTE(review): possibly only set on the done event - confirm
+      $ref: '#/SpeechUsage'
+    extra_fields:  # schema lives in the sibling file common.yaml
+      $ref: './common.yaml#/BifrostResponseExtraFields'