refactor: replace ffmpeg with pyav

gpustack · Nov 26, 2024 · 1010f29 · 1010f29
1 parent cf0ed24
commit 1010f29
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 81 deletions.
diff --git a/vox_box/backends/tts/bark.py b/vox_box/backends/tts/bark.py
@@ -8,7 +8,7 @@
 from transformers import AutoProcessor, BarkModel
 from scipy.io.wavfile import write as write_wav
 
-from vox_box.utils.ffmpeg import convert
+from vox_box.utils.audio import convert
 from vox_box.utils.log import log_method
 from vox_box.utils.model import create_model_dict
 
@@ -81,12 +81,8 @@ def speech(
             wav_file_path = temp_file.name
             write_wav(wav_file_path, rate=sample_rate, data=audio_array)
 
-            with tempfile.NamedTemporaryFile(
-                suffix=f".{reponse_format}", delete=False
-            ) as output_temp_file:
-                output_file_path = output_temp_file.name
-                convert(wav_file_path, reponse_format, output_file_path, speed)
-                return output_file_path
+            output_file_path = convert(wav_file_path, reponse_format, speed)
+            return output_file_path
 
     def _get_voices(self) -> List[str]:
         voices = []

diff --git a/vox_box/backends/tts/cosyvoice.py b/vox_box/backends/tts/cosyvoice.py
@@ -8,7 +8,7 @@
 from vox_box.backends.tts.base import TTSBackend
 from vox_box.utils.log import log_method
 from vox_box.config.config import BackendEnum, Config, TaskTypeEnum
-from vox_box.utils.ffmpeg import convert
+from vox_box.utils.audio import convert
 from vox_box.utils.model import create_model_dict
 
 paths_to_insert = [
@@ -83,11 +83,7 @@ def speech(
                     )
                     wf.writeframes(tts_audio)
 
-            with tempfile.NamedTemporaryFile(
-                suffix=f".{reponse_format}", delete=False
-            ) as output_temp_file:
-                output_file_path = output_temp_file.name
-                convert(wav_file_path, reponse_format, output_file_path, speed)
+                output_file_path = convert(wav_file_path, reponse_format, speed)
                 return output_file_path
 
     def _get_required_resource(self) -> Dict:

diff --git a/vox_box/server/routers.py b/vox_box/server/routers.py
@@ -136,6 +136,8 @@ def get_media_type(response_format) -> str:
         media_type = "audio/x-flac"
     elif response_format == "wav":
         media_type = "audio/wav"
+    elif response_format == "pcm":
+        media_type = "audio/pcm"
     else:
         raise Exception(
             f"Invalid response_format: '{response_format}'", param="response_format"

diff --git a/vox_box/utils/audio.py b/vox_box/utils/audio.py
@@ -0,0 +1,94 @@
+import tempfile
+import av
+
+
+# Maps a requested response format to the codec name handed to
+# ``add_stream(codec_name=...)`` in ``convert_to_format``.
+# The "pcm" entry is not consulted by ``convert``: that format is routed to
+# ``convert_to_pcm``, which writes raw samples without using an encoder.
+response_format_to_encoder_decoder_map = {
+    "mp3": "libmp3lame",
+    "opus": "libopus",
+    "aac": "aac",
+    "flac": "flac",
+    "wav": "pcm_s16le",
+    "pcm": "pcm_s16le",
+}
+
+# Maps a requested response format to the suffix of the temporary output file
+# created in ``convert``. Note that "opus" uses ".ogg" rather than ".opus".
+# NOTE(review): presumably the suffix also selects the container when the
+# output path is opened for writing - confirm against the PyAV docs.
+response_format_to_suffix_map = {
+    "mp3": ".mp3",
+    "opus": ".ogg",
+    "aac": ".aac",
+    "flac": ".flac",
+    "wav": ".wav",
+    "pcm": ".pcm",
+}
+
+
+def convert(
+    input_file_path: str,
+    response_format: str,
+    speed: float = 1,
+) -> str:
+    """Convert an audio file to ``response_format`` and return the new path.
+
+    The result is written to a fresh temporary file created with
+    ``delete=False``, so it is not removed automatically; the caller owns the
+    returned file.
+
+    Args:
+        input_file_path: Path of the audio file to read.
+        response_format: A key of ``response_format_to_suffix_map``.
+        speed: Passed through to the conversion helpers, which multiply the
+            input sample rate by it to obtain the output rate.
+
+    Returns:
+        Path of the temporary file holding the converted audio.
+
+    Raises:
+        ValueError: If ``response_format`` is not a supported format.
+    """
+    import os
+
+    suffix = response_format_to_suffix_map.get(response_format)
+    if suffix is None:
+        # Fail early instead of creating a "...None" file and erroring later.
+        raise ValueError(f"Unsupported response_format: '{response_format}'")
+
+    # Only reserve a unique path here. The handle is closed before the helpers
+    # reopen the path for writing, so two writers never hold the same file.
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as output_temp_file:
+        output_file_path = output_temp_file.name
+
+    try:
+        # The context manager closes the input container even on failure.
+        with av.open(input_file_path) as input_container:
+            input_stream = input_container.streams.audio[0]
+
+            if response_format == "pcm":
+                convert_to_pcm(input_stream, output_file_path, speed)
+            else:
+                convert_to_format(
+                    input_stream, output_file_path, response_format, speed
+                )
+    except Exception:
+        # Do not leave a partial output file behind; cleanup is best-effort so
+        # it never masks the original error.
+        try:
+            os.remove(output_file_path)
+        except OSError:
+            pass
+        raise
+
+    return output_file_path
+
+
+def convert_to_pcm(input_stream, output_file_path: str, speed: float):
+    """Decode ``input_stream`` and write it to ``output_file_path`` as raw PCM.
+
+    The output has no container or header: it is only the bytes of the
+    resampled frames, in 16-bit "s16" format, with the input's channel layout
+    and a sample rate of ``int(input_stream.rate * speed)``.
+
+    NOTE(review): raw PCM stores no sample rate, so a consumer that assumes
+    the input rate will not observe the intended ``speed`` - confirm that
+    scaling the resampler rate is the desired behavior.
+
+    Args:
+        input_stream: Audio stream of an opened PyAV input container.
+        output_file_path: Path of the file to overwrite with raw PCM bytes.
+        speed: Factor applied to the input sample rate.
+    """
+    # Bare PCM data must not have any container structure, so the samples are
+    # written with a plain binary file instead of a PyAV output container.
+    with open(output_file_path, "wb") as output_file:
+        resampler = av.AudioResampler(
+            format="s16",  # 16-bit PCM
+            layout=input_stream.layout,
+            rate=int(input_stream.rate * speed),
+        )
+
+        def write_frames(frames):
+            for resampled_frame in frames:
+                # NOTE(review): a pass-through resampler appears to hand the
+                # flush marker (None) straight back - skip it defensively.
+                if resampled_frame is None:
+                    continue
+                # "s16" is a packed format, so the array holds the samples of
+                # all channels interleaved; its bytes are the raw PCM data.
+                output_file.write(resampled_frame.to_ndarray().tobytes())
+
+        for frame in input_stream.container.decode(input_stream):
+            frame.pts = None  # Reset PTS to avoid issues with frame timing
+            write_frames(resampler.resample(frame))
+
+        # Flush the resampler so samples still buffered inside it are not lost.
+        write_frames(resampler.resample(None))
+
+
+def convert_to_format(
+    input_stream, output_file_path: str, response_format: str, speed: float
+):
+    """Re-encode ``input_stream`` into ``output_file_path``.
+
+    The codec is looked up in ``response_format_to_encoder_decoder_map``. The
+    output stream keeps the input's channel count and uses a sample rate of
+    ``int(input_stream.rate * speed)``.
+
+    Args:
+        input_stream: Audio stream of an opened PyAV input container.
+        output_file_path: Path of the file to write.
+        response_format: A key of ``response_format_to_encoder_decoder_map``.
+        speed: Factor applied to the input sample rate.
+
+    Raises:
+        ValueError: If ``response_format`` has no known codec.
+    """
+    codec_name = response_format_to_encoder_decoder_map.get(response_format)
+    if codec_name is None:
+        # Fail with a clear message instead of passing None on to PyAV.
+        raise ValueError(f"Unsupported response_format: '{response_format}'")
+
+    # The context manager closes the output container even if encoding fails.
+    with av.open(output_file_path, mode="w") as output_container:
+        output_stream = output_container.add_stream(
+            codec_name=codec_name,
+            rate=int(input_stream.rate * speed),
+            channels=input_stream.channels,
+        )
+
+        resampler = av.AudioResampler(
+            format=output_stream.format,
+            layout=output_stream.layout,
+            rate=output_stream.rate,
+        )
+
+        def encode_frames(frames):
+            for resampled_frame in frames:
+                # NOTE(review): a pass-through resampler appears to hand the
+                # flush marker (None) straight back; encoding None here would
+                # flush the encoder too early, so skip it.
+                if resampled_frame is None:
+                    continue
+                for packet in output_stream.encode(resampled_frame):
+                    output_container.mux(packet)
+
+        for frame in input_stream.container.decode(input_stream):
+            # Reset PTS to avoid issues with frame timing
+            frame.pts = None
+            encode_frames(resampler.resample(frame))
+
+        # Flush the resampler so samples still buffered inside it are encoded.
+        encode_frames(resampler.resample(None))
+
+        # Flush encoder
+        for packet in output_stream.encode():
+            output_container.mux(packet)
diff --git a/vox_box/utils/ffmpeg.py b/vox_box/utils/ffmpeg.py