Skip to content

Commit

Permalink
Feature: add speecht5_tts
Browse files Browse the repository at this point in the history
  • Loading branch information
geminixiang committed Sep 9, 2023
1 parent 4af0867 commit fd77bef
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
*.jpg
*.wav
*.whl
*.pyc
.coverage
Expand All @@ -7,4 +8,4 @@ requirements.txt
**/__pycache__/
.vscode/
.ruff_cache/
.pytest_cache/
.pytest_cache/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- [x] Support model
- [x] [yolos-tiny](https://huggingface.co/hustvl/yolos-tiny)
- [x] [blip_image_captioning_large](https://huggingface.co/Salesforce/blip-image-captioning-large)
- [x] [speecht5_tts](https://huggingface.co/microsoft/speecht5_tts)
- [x] Dockerize: [GPU](https://github.com/geminixiang/huggingface-service/pull/1)

## Quick Start
Expand Down
3 changes: 2 additions & 1 deletion src/huggingface/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .utils.storages import tmp_file # noqa: F401
from .blip_image_captioning_large import blip_image_captioning_large # noqa: F401
from .yolos_tiny import yolos_tiny # noqa: F401
from .yolos_tiny import yolos_tiny # noqa: F401
from .speecht5_tts import speecht5_tts # noqa: F401
23 changes: 23 additions & 0 deletions src/huggingface/speecht5_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
import uuid

import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the SpeechT5 processor, acoustic model and HiFi-GAN vocoder once at
# import time so that every call to speecht5_tts() reuses the same instances.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Cache for the speaker embedding; filled on the first synthesis request.
_speaker_embeddings = None


def _get_speaker_embeddings():
    """Return the fixed speaker x-vector, loading the dataset only once."""
    global _speaker_embeddings
    if _speaker_embeddings is None:
        # load xvector containing speaker's voice characteristics from a dataset
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        _speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return _speaker_embeddings


def speecht5_tts(text: str) -> str:
    """Synthesize *text* to speech and return the URL of the audio file.

    The audio is written to ``outputs/<uuid4>.mp3`` (relative to the current
    working directory) with a sample rate of 16000.

    Args:
        text: The text to convert to speech.

    Returns:
        A URL of the form ``http://localhost:8000/audio/<uuid4>.mp3``.
    """
    inputs = processor(text=text, return_tensors="pt")
    outputs = f"{uuid.uuid4()}.mp3"

    # The speaker embedding never changes between requests, so it is fetched
    # from the lazy cache instead of reloading the dataset on every call.
    speech = model.generate_speech(
        inputs["input_ids"], _get_speaker_embeddings(), vocoder=vocoder
    )

    # sf.write does not create missing parent directories.
    os.makedirs("outputs", exist_ok=True)
    # NOTE(review): soundfile picks the container format from the ".mp3"
    # suffix; confirm the installed libsndfile build supports MP3 output.
    sf.write(f"outputs/{outputs}", speech.numpy(), samplerate=16000)

    # NOTE(review): host and port are hard-coded; confirm this matches the
    # address the service is actually exposed on.
    return f"http://localhost:8000/audio/{outputs}"
15 changes: 14 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from config import settings
from models import ObjectDetectionResult
from huggingface import blip_image_captioning_large, yolos_tiny
from huggingface import blip_image_captioning_large, yolos_tiny, speecht5_tts

app = FastAPI()

Expand Down Expand Up @@ -37,7 +37,20 @@ def detect(url: str) -> ObjectDetectionResult:
def img2txt(url: str) -> str:
    # Delegate to the BLIP captioning helper and hand its result back as-is.
    caption = blip_image_captioning_large(url)
    return caption

# Text-to-speech endpoint: forwards the `text` query parameter to
# speecht5_tts() and returns the string it produces.
@app.get(
    "/tts",
    # The previous label was copied from the object-detection route and
    # described the wrong operation.
    name="Convert text to speech",
    tags=["HuggingFace"],
    response_model=str,
)
def tts(text: str) -> str:
    return speecht5_tts(text)

# Serve a file from the outputs/ directory with the image/jpeg media type.
@app.get("/img/{file_name}", name="Get modified image", tags=["general"])
def get_image(file_name: str):
    image_path = f"outputs/{file_name}"
    response = FileResponse(image_path, media_type="image/jpeg")
    return response


# Serve a file from the outputs/ directory with the audio/mpeg media type.
@app.get("/audio/{file_name}", name="Get audio", tags=["general"])
def get_audio(file_name: str):
    # Given its own name so it does not rebind the module-level `get_image`
    # handler used for the /img route; the route path and name are unchanged.
    return FileResponse(f"outputs/{file_name}", media_type="audio/mpeg")

0 comments on commit fd77bef

Please sign in to comment.