first commit
This commit is contained in:
136
core/providers/utils/audio.go
Normal file
136
core/providers/utils/audio.go
Normal file
@@ -0,0 +1,136 @@
|
||||
// Package utils provides common utility functions used across different provider implementations.
|
||||
// This file contains audio-related utility functions for format conversion.
|
||||
package utils
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
// PCMConfig holds the configuration for PCM audio data
|
||||
type PCMConfig struct {
|
||||
SampleRate int // Sample rate in Hz (e.g., 24000)
|
||||
NumChannels int // Number of audio channels (1 = mono, 2 = stereo)
|
||||
BitsPerSample int // Bits per sample (e.g., 16)
|
||||
}
|
||||
|
||||
// DefaultGeminiPCMConfig returns the default PCM configuration for Gemini TTS
|
||||
// Gemini TTS returns audio in PCM format with the following specs:
|
||||
// - Format: signed 16-bit little-endian (s16le)
|
||||
// - Sample rate: 24000 Hz
|
||||
// - Channels: 1 (mono)
|
||||
func DefaultGeminiPCMConfig() PCMConfig {
|
||||
return PCMConfig{
|
||||
SampleRate: 24000,
|
||||
NumChannels: 1,
|
||||
BitsPerSample: 16,
|
||||
}
|
||||
}
|
||||
|
||||
// ConvertPCMToWAV converts raw PCM audio data to WAV format
|
||||
// The PCM data is expected to be in signed little-endian format (s16le for 16-bit)
|
||||
func ConvertPCMToWAV(pcmData []byte, config PCMConfig) ([]byte, error) {
|
||||
byteRate := config.SampleRate * config.NumChannels * config.BitsPerSample / 8
|
||||
blockAlign := config.NumChannels * config.BitsPerSample / 8
|
||||
|
||||
dataSize := uint32(len(pcmData))
|
||||
fileSize := 36 + dataSize // 36 bytes for header + data
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
// RIFF header
|
||||
buf.WriteString("RIFF")
|
||||
binary.Write(&buf, binary.LittleEndian, fileSize)
|
||||
buf.WriteString("WAVE")
|
||||
|
||||
// fmt subchunk
|
||||
buf.WriteString("fmt ")
|
||||
binary.Write(&buf, binary.LittleEndian, uint32(16)) // Subchunk1Size (16 for PCM)
|
||||
binary.Write(&buf, binary.LittleEndian, uint16(1)) // AudioFormat (1 = PCM)
|
||||
binary.Write(&buf, binary.LittleEndian, uint16(config.NumChannels)) // NumChannels
|
||||
binary.Write(&buf, binary.LittleEndian, uint32(config.SampleRate)) // SampleRate
|
||||
binary.Write(&buf, binary.LittleEndian, uint32(byteRate)) // ByteRate
|
||||
binary.Write(&buf, binary.LittleEndian, uint16(blockAlign)) // BlockAlign
|
||||
binary.Write(&buf, binary.LittleEndian, uint16(config.BitsPerSample)) // BitsPerSample
|
||||
|
||||
// data subchunk
|
||||
buf.WriteString("data")
|
||||
binary.Write(&buf, binary.LittleEndian, dataSize)
|
||||
buf.Write(pcmData)
|
||||
|
||||
return buf.Bytes(), nil
|
||||
}
|
||||
|
||||
var (
|
||||
riff = []byte("RIFF")
|
||||
wave = []byte("WAVE")
|
||||
id3 = []byte("ID3")
|
||||
form = []byte("FORM")
|
||||
aiff = []byte("AIFF")
|
||||
aifc = []byte("AIFC")
|
||||
flac = []byte("fLaC")
|
||||
oggs = []byte("OggS")
|
||||
adif = []byte("ADIF")
|
||||
)
|
||||
|
||||
// DetectAudioMimeType attempts to detect the MIME type from audio file headers.
|
||||
// Supports detection of: WAV, MP3, AIFF, AAC, OGG Vorbis, and FLAC formats.
|
||||
func DetectAudioMimeType(audioData []byte) string {
|
||||
if len(audioData) < 4 {
|
||||
return "audio/mp3"
|
||||
}
|
||||
// WAV (RIFF/WAVE)
|
||||
if len(audioData) >= 12 &&
|
||||
bytes.Equal(audioData[:4], riff) &&
|
||||
bytes.Equal(audioData[8:12], wave) {
|
||||
return "audio/wav"
|
||||
}
|
||||
// MP3: ID3v2 tag (keep this check for MP3)
|
||||
if len(audioData) >= 3 && bytes.Equal(audioData[:3], id3) {
|
||||
return "audio/mp3"
|
||||
}
|
||||
// AAC: ADIF or ADTS (0xFFF sync) - check before MP3 frame sync to avoid misclassification
|
||||
if bytes.HasPrefix(audioData, adif) {
|
||||
return "audio/aac"
|
||||
}
|
||||
if len(audioData) >= 2 && audioData[0] == 0xFF && (audioData[1]&0xF6) == 0xF0 {
|
||||
return "audio/aac"
|
||||
}
|
||||
// AIFF / AIFC (map both to audio/aiff)
|
||||
if len(audioData) >= 12 && bytes.Equal(audioData[:4], form) &&
|
||||
(bytes.Equal(audioData[8:12], aiff) || bytes.Equal(audioData[8:12], aifc)) {
|
||||
return "audio/aiff"
|
||||
}
|
||||
// FLAC
|
||||
if bytes.HasPrefix(audioData, flac) {
|
||||
return "audio/flac"
|
||||
}
|
||||
// OGG container
|
||||
if bytes.HasPrefix(audioData, oggs) {
|
||||
return "audio/ogg"
|
||||
}
|
||||
// MP3: MPEG frame sync (cover common variants) - check after AAC to avoid misclassification
|
||||
if len(audioData) >= 2 && audioData[0] == 0xFF &&
|
||||
(audioData[1] == 0xFB || audioData[1] == 0xF3 || audioData[1] == 0xF2 || audioData[1] == 0xFA) {
|
||||
return "audio/mp3"
|
||||
}
|
||||
// Fallback within supported set
|
||||
return "audio/mp3"
|
||||
}
|
||||
|
||||
// AudioFilenameFromBytes returns a filename with the correct extension for the given audio data.
|
||||
// Falls back to "audio.mp3" if the format cannot be detected.
|
||||
func AudioFilenameFromBytes(audioData []byte) string {
|
||||
mimeToExt := map[string]string{
|
||||
"audio/wav": "audio.wav",
|
||||
"audio/aac": "audio.aac",
|
||||
"audio/aiff": "audio.aiff",
|
||||
"audio/flac": "audio.flac",
|
||||
"audio/ogg": "audio.ogg",
|
||||
}
|
||||
mime := DetectAudioMimeType(audioData)
|
||||
if ext, ok := mimeToExt[mime]; ok {
|
||||
return ext
|
||||
}
|
||||
return "audio.mp3"
|
||||
}
|
||||
Reference in New Issue
Block a user