Files
bifrost/core/providers/utils/audio.go
Beyhan Oğur 880f412e2c first commit
2026-04-26 21:52:23 +03:00

137 lines
4.4 KiB
Go

// Package utils provides common utility functions used across different provider implementations.
// This file contains audio-related utility functions for format conversion.
package utils
import (
	"bytes"
	"encoding/binary"
	"fmt"
)
// PCMConfig describes the layout of a raw PCM audio stream.
type PCMConfig struct {
	// SampleRate is the number of samples per second, in Hz (e.g., 24000).
	SampleRate int
	// NumChannels is the number of audio channels (1 = mono, 2 = stereo).
	NumChannels int
	// BitsPerSample is the width of a single sample, in bits (e.g., 16).
	BitsPerSample int
}
// DefaultGeminiPCMConfig returns the PCM layout used for Gemini TTS audio:
// signed 16-bit little-endian samples (s16le), 24000 Hz, one channel (mono).
func DefaultGeminiPCMConfig() PCMConfig {
	var cfg PCMConfig
	cfg.SampleRate = 24000
	cfg.NumChannels = 1
	cfg.BitsPerSample = 16
	return cfg
}
// ConvertPCMToWAV wraps raw PCM audio data in a canonical 44-byte WAV (RIFF/WAVE) header.
//
// pcmData is copied unchanged into the "data" chunk; no resampling or sample
// conversion is performed, so it must already be little-endian PCM laid out as
// described by config (s16le for 16-bit).
//
// An error is returned, and no data, when config or pcmData cannot be
// represented in the header:
//   - SampleRate is not positive or does not fit in 32 bits
//   - NumChannels is not positive or does not fit in 16 bits
//   - BitsPerSample is not a positive multiple of 8 or does not fit in 16 bits
//   - the derived block align or byte rate overflows its header field
//   - pcmData is too large for the 32-bit RIFF size fields
func ConvertPCMToWAV(pcmData []byte, config PCMConfig) ([]byte, error) {
	const (
		headerSize  = 44 // RIFF descriptor (12) + fmt chunk (24) + data chunk header (8)
		riffPayload = 36 // header bytes counted by the RIFF size field (headerSize - 8)
		maxUint16   = 1<<16 - 1
		maxUint32   = 1<<32 - 1
	)

	// The header stores these values in fixed-width unsigned fields; reject anything
	// that would be silently truncated by the conversions below.
	if config.SampleRate <= 0 || int64(config.SampleRate) > maxUint32 {
		return nil, fmt.Errorf("pcm to wav: sample rate %d out of range", config.SampleRate)
	}
	if config.NumChannels <= 0 || config.NumChannels > maxUint16 {
		return nil, fmt.Errorf("pcm to wav: channel count %d out of range", config.NumChannels)
	}
	if config.BitsPerSample <= 0 || config.BitsPerSample > maxUint16 || config.BitsPerSample%8 != 0 {
		return nil, fmt.Errorf("pcm to wav: bits per sample %d must be a positive multiple of 8", config.BitsPerSample)
	}

	// Computed in 64 bits so the range checks themselves cannot overflow.
	blockAlign := int64(config.NumChannels) * int64(config.BitsPerSample/8)
	if blockAlign > maxUint16 {
		return nil, fmt.Errorf("pcm to wav: block align %d does not fit in 16 bits", blockAlign)
	}
	byteRate := int64(config.SampleRate) * blockAlign
	if byteRate > maxUint32 {
		return nil, fmt.Errorf("pcm to wav: byte rate %d does not fit in 32 bits", byteRate)
	}
	if uint64(len(pcmData)) > maxUint32-riffPayload {
		return nil, fmt.Errorf("pcm to wav: %d bytes of audio exceed the 32-bit WAV size limit", len(pcmData))
	}
	dataSize := uint32(len(pcmData))

	// One allocation sized for header plus payload; the header is filled in place.
	wav := make([]byte, headerSize, headerSize+len(pcmData))
	le := binary.LittleEndian

	// RIFF header
	copy(wav[0:4], "RIFF")
	le.PutUint32(wav[4:8], riffPayload+dataSize) // bytes following this field
	copy(wav[8:12], "WAVE")

	// fmt subchunk
	copy(wav[12:16], "fmt ")
	le.PutUint32(wav[16:20], 16)                           // Subchunk1Size (16 for PCM)
	le.PutUint16(wav[20:22], 1)                            // AudioFormat (1 = PCM)
	le.PutUint16(wav[22:24], uint16(config.NumChannels))   // NumChannels
	le.PutUint32(wav[24:28], uint32(config.SampleRate))    // SampleRate
	le.PutUint32(wav[28:32], uint32(byteRate))             // ByteRate
	le.PutUint16(wav[32:34], uint16(blockAlign))           // BlockAlign
	le.PutUint16(wav[34:36], uint16(config.BitsPerSample)) // BitsPerSample

	// data subchunk
	copy(wav[36:40], "data")
	le.PutUint32(wav[40:44], dataSize)

	return append(wav, pcmData...), nil
}
// Leading byte signatures used to recognize audio formats.
var (
	// Container formats: a four-byte marker at offset 0 plus a form type at offset 8.
	riff = []byte("RIFF")
	wave = []byte("WAVE")
	form = []byte("FORM")
	aiff = []byte("AIFF")
	aifc = []byte("AIFC")

	// Formats identified by a marker at offset 0 alone.
	id3  = []byte("ID3")
	adif = []byte("ADIF")
	flac = []byte("fLaC")
	oggs = []byte("OggS")
)
// DetectAudioMimeType sniffs the leading bytes of audioData and reports the matching audio MIME type.
//
// Signatures are tried in this order:
//   - RIFF container with a WAVE form type -> "audio/wav"
//   - ID3 tag                              -> "audio/mp3"
//   - ADIF header or ADTS sync word        -> "audio/aac"
//   - FORM container with AIFF or AIFC     -> "audio/aiff"
//   - fLaC marker                          -> "audio/flac"
//   - OggS marker                          -> "audio/ogg"
//   - MPEG frame sync                      -> "audio/mp3"
//
// Inputs shorter than four bytes, and anything unrecognized, are reported as "audio/mp3".
func DetectAudioMimeType(audioData []byte) string {
	if len(audioData) < 4 {
		return "audio/mp3"
	}

	// RIFF and FORM containers carry a four-byte form type at offset 8. It stays
	// nil for shorter inputs, which can then never equal a four-byte signature.
	var formType []byte
	if len(audioData) >= 12 {
		formType = audioData[8:12]
	}
	first, second := audioData[0], audioData[1]

	switch {
	case bytes.HasPrefix(audioData, riff) && bytes.Equal(formType, wave):
		return "audio/wav"
	case bytes.HasPrefix(audioData, id3):
		return "audio/mp3"
	case bytes.HasPrefix(audioData, adif):
		return "audio/aac"
	case first == 0xFF && (second&0xF6) == 0xF0:
		// ADTS sync (0xFFF); tested ahead of the MPEG frame sync so AAC is not reported as MP3.
		return "audio/aac"
	case bytes.HasPrefix(audioData, form) &&
		(bytes.Equal(formType, aiff) || bytes.Equal(formType, aifc)):
		// AIFF and AIFC are both reported as audio/aiff.
		return "audio/aiff"
	case bytes.HasPrefix(audioData, flac):
		return "audio/flac"
	case bytes.HasPrefix(audioData, oggs):
		return "audio/ogg"
	case first == 0xFF && (second == 0xFB || second == 0xF3 || second == 0xF2 || second == 0xFA):
		// MPEG frame sync, common variants.
		return "audio/mp3"
	}

	// Fallback within the supported set.
	return "audio/mp3"
}
// AudioFilenameFromBytes picks a filename whose extension matches the MIME type
// that DetectAudioMimeType reports for audioData.
// Any MIME type without a dedicated filename yields "audio.mp3".
func AudioFilenameFromBytes(audioData []byte) string {
	switch DetectAudioMimeType(audioData) {
	case "audio/wav":
		return "audio.wav"
	case "audio/aac":
		return "audio.aac"
	case "audio/aiff":
		return "audio.aiff"
	case "audio/flac":
		return "audio.flac"
	case "audio/ogg":
		return "audio.ogg"
	default:
		return "audio.mp3"
	}
}