From fd77bef1aa30a040cce691a50405efdf4a9a6370 Mon Sep 17 00:00:00 2001
From: Ying Xiang
Date: Sat, 9 Sep 2023 21:42:11 +0800
Subject: [PATCH] Feature: add speecht5_tts

---
 .gitignore                      |  3 ++-
 README.md                       |  1 +
 src/huggingface/__init__.py     |  3 ++-
 src/huggingface/speecht5_tts.py | 23 +++++++++++++++++++++++
 src/main.py                     | 15 ++++++++++++++-
 5 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 src/huggingface/speecht5_tts.py

diff --git a/.gitignore b/.gitignore
index beb98a4..7290403 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.jpg
+*.wav
 *.whl
 *.pyc
 .coverage
@@ -7,4 +8,4 @@ requirements.txt
 **/__pycache__/
 .vscode/
 .ruff_cache/
-.pytest_cache/
\ No newline at end of file
+.pytest_cache/
diff --git a/README.md b/README.md
index 6b81ea6..c5ef618 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@
 - [x] Support model
   - [x] [yolos-tiny](https://huggingface.co/hustvl/yolos-tiny)
   - [x] [blip_image_captioning_large](https://huggingface.co/Salesforce/blip-image-captioning-large)
+  - [x] [speecht5_tts](https://huggingface.co/microsoft/speecht5_tts)
 - [x] Dockerize: [GPU](https://github.com/geminixiang/huggingface-service/pull/1)
 
 ## Quick Start
diff --git a/src/huggingface/__init__.py b/src/huggingface/__init__.py
index af4b9c8..ff2cf1b 100644
--- a/src/huggingface/__init__.py
+++ b/src/huggingface/__init__.py
@@ -1,3 +1,4 @@
 from .utils.storages import tmp_file  # noqa: F401
 from .blip_image_captioning_large import blip_image_captioning_large  # noqa: F401
-from .yolos_tiny import yolos_tiny  # noqa: F401
\ No newline at end of file
+from .yolos_tiny import yolos_tiny  # noqa: F401
+from .speecht5_tts import speecht5_tts  # noqa: F401
\ No newline at end of file
diff --git a/src/huggingface/speecht5_tts.py b/src/huggingface/speecht5_tts.py
new file mode 100644
index 0000000..c6cc7cd
--- /dev/null
+++ b/src/huggingface/speecht5_tts.py
@@ -0,0 +1,23 @@
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import soundfile as sf
+import uuid
+
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+def speecht5_tts(text: str) -> str:
+    inputs = processor(text=text, return_tensors="pt")
+    outputs = f"{uuid.uuid4()}.wav"
+
+    # load xvector containing speaker's voice characteristics from a dataset
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+    sf.write(f"outputs/{outputs}", speech.numpy(), samplerate=16000)
+
+    return f"http://localhost:8000/audio/{outputs}"
diff --git a/src/main.py b/src/main.py
index 80ddf96..c37cd60 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,7 +4,7 @@
 from config import settings
 from models import ObjectDetectionResult
 
-from huggingface import blip_image_captioning_large, yolos_tiny
+from huggingface import blip_image_captioning_large, yolos_tiny, speecht5_tts
 
 app = FastAPI()
 
@@ -37,7 +37,20 @@ def detect(url: str) -> ObjectDetectionResult:
 def img2txt(url: str) -> str:
     return blip_image_captioning_large(url)
 
+@app.get(
+    "/tts",
+    name="Synthesize speech from text",
+    tags=["HuggingFace"],
+    response_model=str,
+)
+def tts(text: str) -> str:
+    return speecht5_tts(text)
 
 @app.get("/img/{file_name}", name="Get modified image", tags=["general"])
 def get_image(file_name: str):
     return FileResponse(f"outputs/{file_name}", media_type="image/jpeg")
+
+
+@app.get("/audio/{file_name}", name="Get audio", tags=["general"])
+def get_audio(file_name: str):
+    return FileResponse(f"outputs/{file_name}", media_type="audio/wav")