Skip to content

Commit

Permalink
refactor: replace ffmpeg with pyav
Browse files Browse the repository at this point in the history
  • Loading branch information
aiwantaozi committed Nov 26, 2024
1 parent cf0ed24 commit 1010f29
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 81 deletions.
10 changes: 3 additions & 7 deletions vox_box/backends/tts/bark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from transformers import AutoProcessor, BarkModel
from scipy.io.wavfile import write as write_wav

from vox_box.utils.ffmpeg import convert
from vox_box.utils.audio import convert
from vox_box.utils.log import log_method
from vox_box.utils.model import create_model_dict

Expand Down Expand Up @@ -81,12 +81,8 @@ def speech(
wav_file_path = temp_file.name
write_wav(wav_file_path, rate=sample_rate, data=audio_array)

with tempfile.NamedTemporaryFile(
suffix=f".{reponse_format}", delete=False
) as output_temp_file:
output_file_path = output_temp_file.name
convert(wav_file_path, reponse_format, output_file_path, speed)
return output_file_path
output_file_path = convert(wav_file_path, reponse_format, speed)
return output_file_path

def _get_voices(self) -> List[str]:
voices = []
Expand Down
8 changes: 2 additions & 6 deletions vox_box/backends/tts/cosyvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from vox_box.backends.tts.base import TTSBackend
from vox_box.utils.log import log_method
from vox_box.config.config import BackendEnum, Config, TaskTypeEnum
from vox_box.utils.ffmpeg import convert
from vox_box.utils.audio import convert
from vox_box.utils.model import create_model_dict

paths_to_insert = [
Expand Down Expand Up @@ -83,11 +83,7 @@ def speech(
)
wf.writeframes(tts_audio)

with tempfile.NamedTemporaryFile(
suffix=f".{reponse_format}", delete=False
) as output_temp_file:
output_file_path = output_temp_file.name
convert(wav_file_path, reponse_format, output_file_path, speed)
output_file_path = convert(wav_file_path, reponse_format, speed)
return output_file_path

def _get_required_resource(self) -> Dict:
Expand Down
2 changes: 2 additions & 0 deletions vox_box/server/routers.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ def get_media_type(response_format) -> str:
media_type = "audio/x-flac"
elif response_format == "wav":
media_type = "audio/wav"
elif response_format == "pcm":
media_type = "audio/pcm"
else:
raise Exception(
f"Invalid response_format: '{response_format}'", param="response_format"
Expand Down
94 changes: 94 additions & 0 deletions vox_box/utils/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import tempfile

import av


# Codec name handed to ``add_stream`` for each supported response format.
# "wav" and "pcm" share the same 16-bit little-endian PCM codec name.
response_format_to_encoder_decoder_map = dict(
    mp3="libmp3lame",
    opus="libopus",
    aac="aac",
    flac="flac",
    wav="pcm_s16le",
    pcm="pcm_s16le",
)

# File suffix (including the leading dot) of the temporary output file created
# for each supported response format. Note that "opus" maps to ".ogg".
response_format_to_suffix_map = dict(
    mp3=".mp3",
    opus=".ogg",
    aac=".aac",
    flac=".flac",
    wav=".wav",
    pcm=".pcm",
)


def convert(
    input_file_path: str,
    response_format: str,
    speed: float = 1,
) -> str:
    """Convert an audio file to ``response_format`` and return the new path.

    The result is written to a fresh temporary file that is NOT deleted
    automatically; the caller owns the returned path.

    Args:
        input_file_path: Path of the audio file to read.
        response_format: One of the keys of ``response_format_to_suffix_map``.
        speed: Positive factor by which the output sample rate is scaled.

    Returns:
        Path of the temporary file holding the converted audio.

    Raises:
        ValueError: If ``response_format`` is not supported or ``speed`` is
            not a positive number.

    NOTE(review): ``speed`` is applied by scaling the target sample rate of
    the resampler/encoder. That looks like it changes the sample rate rather
    than the tempo of the audio - confirm the audible result is as intended.
    """
    suffix = response_format_to_suffix_map.get(response_format)
    if suffix is None:
        # Fail early instead of creating a "...None" file and erroring later.
        raise ValueError(f"Unsupported response_format: '{response_format}'")
    if speed <= 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    # Only the unique path is needed; close our own handle right away so the
    # path can be reopened for writing (an open NamedTemporaryFile cannot be
    # reopened by name on Windows).
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as output_temp_file:
        output_file_path = output_temp_file.name

    try:
        # The context manager closes the input container even when the
        # conversion raises.
        with av.open(input_file_path) as input_container:
            input_stream = input_container.streams.audio[0]

            if response_format == "pcm":
                convert_to_pcm(input_stream, output_file_path, speed)
            else:
                convert_to_format(
                    input_stream, output_file_path, response_format, speed
                )
    except Exception:
        # Do not leave a partially written temp file behind on failure.
        try:
            os.remove(output_file_path)
        except OSError:
            pass
        raise

    return output_file_path


def convert_to_pcm(input_stream, output_file_path: str, speed: float):
    """Decode ``input_stream`` and write it as raw 16-bit PCM.

    Bare PCM data must not have any container structure, so the samples are
    written straight to a binary file instead of going through a muxer.

    Args:
        input_stream: Audio stream to decode; its ``container``, ``layout``
            and ``rate`` attributes are used.
        output_file_path: Path of the file to (over)write with raw samples.
        speed: Factor by which the target sample rate is scaled.
    """
    resampler = av.AudioResampler(
        format="s16",  # 16-bit PCM
        layout=input_stream.layout,
        rate=int(input_stream.rate * speed),
    )

    with open(output_file_path, "wb") as output_file:

        def write_frames(frames):
            for resampled_frame in frames:
                # Flushing may hand back a ``None`` placeholder when the
                # resampler had nothing to convert; there is nothing to write.
                if resampled_frame is None:
                    continue
                # "s16" is a packed (interleaved) sample format, so the raw
                # bytes of the frame's array are the PCM payload to emit.
                output_file.write(resampled_frame.to_ndarray().tobytes())

        for frame in input_stream.container.decode(input_stream):
            frame.pts = None  # Reset PTS to avoid issues with frame timing
            write_frames(resampler.resample(frame))

        # Flush the resampler so samples still buffered internally at the end
        # of the stream are not silently dropped.
        write_frames(resampler.resample(None))


def convert_to_format(
    input_stream, output_file_path: str, response_format: str, speed: float
):
    """Re-encode ``input_stream`` into a container file of ``response_format``.

    Args:
        input_stream: Audio stream to decode; its ``container``, ``rate`` and
            ``channels`` attributes are used.
        output_file_path: Path of the output file; the container type is
            chosen by PyAV when opening this path for writing.
        response_format: Key of ``response_format_to_encoder_decoder_map``
            selecting the encoder.
        speed: Factor by which the output sample rate is scaled.

    Raises:
        ValueError: If no encoder is registered for ``response_format``.
    """
    codec_name = response_format_to_encoder_decoder_map.get(response_format)
    if codec_name is None:
        raise ValueError(f"Unsupported response_format: '{response_format}'")

    # The context manager closes the output container even if encoding or
    # muxing fails part-way through.
    with av.open(output_file_path, mode="w") as output_container:
        output_stream = output_container.add_stream(
            codec_name=codec_name,
            rate=int(input_stream.rate * speed),
            channels=input_stream.channels,
        )

        # Convert decoded frames to exactly what the encoder stream expects.
        resampler = av.AudioResampler(
            format=output_stream.format,
            layout=output_stream.layout,
            rate=output_stream.rate,
        )

        def encode_and_mux(frames):
            for resampled_frame in frames:
                # Flushing the resampler may hand back a ``None`` placeholder;
                # passing it on would flush the encoder prematurely.
                if resampled_frame is None:
                    continue
                for packet in output_stream.encode(resampled_frame):
                    output_container.mux(packet)

        for frame in input_stream.container.decode(input_stream):
            # Reset PTS to avoid issues with frame timing
            frame.pts = None
            encode_and_mux(resampler.resample(frame))

        # Flush the resampler so samples still buffered at the end of the
        # stream reach the encoder.
        encode_and_mux(resampler.resample(None))

        # Flush encoder
        for packet in output_stream.encode():
            output_container.mux(packet)
68 changes: 0 additions & 68 deletions vox_box/utils/ffmpeg.py

This file was deleted.

0 comments on commit 1010f29

Please sign in to comment.