Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use xav instead of ffmpeg #403

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 21 additions & 43 deletions lib/bumblebee/audio/speech_to_text_whisper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do
{:ok, [Nx.backend_transfer(input, Nx.BinaryBackend)]}

{:file, path} when is_binary(path) ->
ffmpeg_read_as_pcm(path, sampling_rate)
from_file(path, sampling_rate)

other ->
cond do
Expand Down Expand Up @@ -164,49 +164,27 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do
end
end

defp ffmpeg_read_as_pcm(path, sampling_rate) do
channels = 1
defp from_file(path, sampling_rate) do
# This chunk can be of arbitrary size, the serving accumulates
# and overlaps chunks internally as needed.

format =
case System.endianness() do
:little -> "f32le"
:big -> "f32be"
end

cond do
System.find_executable("ffmpeg") == nil ->
{:error, "ffmpeg not found in PATH"}

not File.exists?(path) ->
{:error, "no file found at #{path}"}

true ->
# This chunk can be of arbitrary size, the serving accumulates
# and overlaps chunks internally as needed. We read the file
# as stream to reduce memory usage
chunk_size = 30

stream =
Stream.iterate(0, fn offset -> offset + chunk_size end)
|> Stream.transform({}, fn offset, acc ->
System.cmd(
"ffmpeg",
~w[-ss #{offset} -t #{chunk_size} -i #{path} -ac #{channels} -ar #{sampling_rate} -f #{format} -hide_banner -loglevel quiet pipe:1]
)
|> case do
{<<>>, 0} ->
{:halt, acc}

{data, 0} ->
chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend)
{[chunk], acc}

{_, 1} ->
raise "ffmpeg failed to decode the given file"
end
end)

{:ok, stream}
if File.exists?(path) do
stream =
path
|> Xav.Reader.stream!(
read: :audio,
out_format: :f32,
out_channels: 1,
out_sample_rate: sampling_rate
)
|> Stream.map(fn frame -> Xav.Frame.to_nx(frame) end)
kevinschweikert marked this conversation as resolved.
Show resolved Hide resolved
|> Stream.chunk_every(1000)
|> Stream.map(&Nx.Batch.concatenate/1)
|> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function just needs to return a stream of chunks, so we don't need to do this concatenation.

Suggested change
|> Stream.chunk_every(1000)
|> Stream.map(&Nx.Batch.concatenate/1)
|> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end)

Do you know what determines the length of each chunk, could that be configurable perhaps?

Copy link
Author

@kevinschweikert kevinschweikert Oct 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not completely sure, but I'm guessing that a chunk/frame in a video context is the audio duration of one frame. When I remove the Stream.chunk_every/2 pipeline, I get a massive performance degradation and the processing does not finish in a reasonable time. I imagine that it's far more efficient to read a chunk of frames first, before sending it to the Whisper serving.
I just reused the implementation of the Elixir WebRTC team from https://github.com/elixir-webrtc/xav/blob/master/README.md

{:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})

serving =
  Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
    defn_options: [compiler: EXLA]
  )

# Read a couple of frames.
# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for default sampling rate.
frames =
    Xav.Reader.stream!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000)
    |> Stream.take(200)
    |> Enum.map(fn frame -> Xav.Reader.to_nx(frame) end)

batch = Nx.Batch.concatenate(frames)
batch = Nx.Defn.jit_apply(&Function.identity/1, [batch])
Nx.Serving.run(serving, batch) 

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. The serving transforms the stream to accumulate smaller chunks, but there is a place where we need to append to a list and that may be the reason why it's inefficient with tiny chunks.

However, either way, I think it's wasteful to convert every frame to a tensor just to concatenate later. With the current ffmpeg code we get a single binary for the whole 30s and create a tensor from that. So ideally we want to replicate this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this suffice?

path
|> Xav.Reader.stream!(
  read: :audio,
  out_format: :f32,
  out_channels: 1,
  out_sample_rate: sampling_rate
)
|> Stream.chunk_every(1000)
|> Stream.map(fn frames ->
  [frame | _] = frames
  binary = Enum.reduce(frames, <<>>, fn frame, acc -> acc <> frame.data end)

  Nx.with_default_backend(Nx.BinaryBackend, fn -> Nx.from_binary(binary, frame.format) end)
end)

The chunk size of 1000 is currently arbitrary because we don't know the frame_size of the codec being used. This could be added to xav, I think.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correction: the information is already there. Xav.Frame has a samples field, from which the number of frames per 30s chunk could be calculated as

round(sampling_rate / frame.samples * 30)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handling the frame binaries directly is a good call. I think we can transform the stream so that we accumulate the binary, instead of waiting for 1000 binaries and joining them. I was thinking we could determine the number of samples from the binary size, but frame.samples is even more convenient.

So it would be something like this:

chunk_samples = sampling_rate * 30

path
|> Xav.Reader.stream!(
  read: :audio,
  out_format: :f32,
  out_channels: 1,
  out_sample_rate: sampling_rate
)
|> Stream.transform(
  fn -> {<<>>, 0} end,
  fn frame, {buffer, samples} ->
    buffer = buffer <> frame.data
    samples = samples + frame.samples

    if samples >= chunk_samples do
      chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
      {[chunk], {<<>>, 0}}
    else
      {[], {buffer, samples}}
    end
  end,
  fn {buffer, _samples} ->
    chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
    {[chunk], {<<>>, 0}}
  end,
  fn _ -> :ok end
)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow! That is a great solution! Thanks!


{:ok, stream}
else
{:error, "no file found at #{path}"}
end
end

Expand Down
5 changes: 3 additions & 2 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ defmodule Bumblebee.MixProject do
{:axon, "~> 0.7.0"},
# {:axon, github: "elixir-nx/axon", override: true},
{:tokenizers, "~> 0.4"},
{:nx, "~> 0.9.0"},
{:nx, "~> 0.9.0", override: true},
{:exla, ">= 0.0.0", only: [:dev, :test]},
{:torchx, ">= 0.0.0", only: [:dev, :test]},
# {:nx, github: "elixir-nx/nx", sparse: "nx", override: true},
Expand All @@ -49,7 +49,8 @@ defmodule Bumblebee.MixProject do
{:stb_image, "~> 0.6.0", only: :test},
{:bypass, "~> 2.1", only: :test},
{:ex_doc, "~> 0.28", only: :dev, runtime: false},
{:nx_signal, "~> 0.2.0"}
{:nx_signal, "~> 0.2.0"},
{:xav, "~> 0.6.0"}
]
end

Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@
"torchx": {:hex, :torchx, "0.9.0", "936cbd32233f89d73700c39b7ef56f94b3f3541db03c90f8ddf6b3fe73260e28", [:mix], [{:nx, "~> 0.9.0", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "4e057d6b93fc91191957230b2c61c408861b888abdf6a900baf0db4125405505"},
"unpickler": {:hex, :unpickler, "0.1.0", "c2262c0819e6985b761e7107546cef96a485f401816be5304a65fdd200d5bd6a", [:mix], [], "hexpm", "e2b3f61e62406187ac52afead8a63bfb4e49394028993f3c4c42712743cab79e"},
"unzip": {:hex, :unzip, "0.12.0", "beed92238724732418b41eba77dcb7f51e235b707406c05b1732a3052d1c0f36", [:mix], [], "hexpm", "95655b72db368e5a84951f0bed586ac053b55ee3815fd96062fce10ce4fc998d"},
"xav": {:hex, :xav, "0.6.0", "38835d735fc3d620e41c84fe29cd7db0381436b54c9ef209ba9112255a091fc4", [:make, :mix], [{:elixir_make, "~> 0.7", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "ad988df9d44c0ca3ccc4053ea0f1f1702ca14d4b926047b224deb527f0866edf"},
"xla": {:hex, :xla, "0.8.0", "fef314d085dd3ee16a0816c095239938f80769150e15db16dfaa435553d7cb16", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "739c61c8d93b97e12ba0369d10e76130224c208f1a76ad293e3581f056833e57"},
}