Feature: add speecht5_tts

geminixiang · Sep 9, 2023 · fd77bef · fd77bef
1 parent 4af0867
commit fd77bef
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 *.jpg
+*.wav
 *.whl
 *.pyc
 .coverage
@@ -7,4 +8,4 @@ requirements.txt
 **/__pycache__/
 .vscode/
 .ruff_cache/
-.pytest_cache/
+.pytest_cache/
diff --git a/README.md b/README.md
@@ -19,6 +19,7 @@
 - [x] Support model
   - [x] [yolos-tiny](https://huggingface.co/hustvl/yolos-tiny)
   - [x] [blip_image_captioning_large](https://huggingface.co/Salesforce/blip-image-captioning-large)
+  - [x] [speecht5_tts](https://huggingface.co/microsoft/speecht5_tts)
 - [x] Dockerize: [GPU](https://github.com/geminixiang/huggingface-service/pull/1)
 
 ## Quick Start

diff --git a/src/huggingface/__init__.py b/src/huggingface/__init__.py
@@ -1,3 +1,4 @@
 from .utils.storages import tmp_file  # noqa: F401
 from .blip_image_captioning_large import blip_image_captioning_large  # noqa: F401
-from .yolos_tiny import yolos_tiny  # noqa: F401
+from .yolos_tiny import yolos_tiny  # noqa: F401
+from .speecht5_tts import speecht5_tts  # noqa: F401
diff --git a/src/huggingface/speecht5_tts.py b/src/huggingface/speecht5_tts.py
@@ -0,0 +1,23 @@
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import soundfile as sf
+import uuid
+
+# Load the SpeechT5 processor, text-to-speech model and HiFi-GAN vocoder once,
+# at import time; speecht5_tts() below reuses these module-level instances on
+# every call.
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+def speecht5_tts(text: str) -> str:
+    inputs = processor(text=text, return_tensors="pt")
+    outputs = f"{uuid.uuid4()}.mp3"
+
+    # load xvector containing speaker's voice characteristics from a dataset
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+    sf.write(f"outputs/{outputs}", speech.numpy(), samplerate=16000)
+
+    return f"http://localhost:8000/audio/{outputs}"
diff --git a/src/main.py b/src/main.py
@@ -4,7 +4,7 @@
 
 from config import settings
 from models import ObjectDetectionResult
-from huggingface import blip_image_captioning_large, yolos_tiny
+from huggingface import blip_image_captioning_large, yolos_tiny, speecht5_tts
 
 app = FastAPI()
 
@@ -37,7 +37,20 @@ def detect(url: str) -> ObjectDetectionResult:
 def img2txt(url: str) -> str:
     return blip_image_captioning_large(url)
 
+@app.get(
+    "/tts",
+    name="Detect objects in an image from a URL",
+    tags=["HuggingFace"],
+    response_model=str,
+)
+def tts(text: str) -> str:
+    return speecht5_tts(text)
 
 @app.get("/img/{file_name}", name="Get modified image", tags=["general"])
 def get_image(file_name: str):
     return FileResponse(f"outputs/{file_name}", media_type="image/jpeg")
+
+
+@app.get("/audio/{file_name}", name="Get audio", tags=["general"])
+def get_image(file_name: str):
+    return FileResponse(f"outputs/{file_name}", media_type="audio/mpeg")