From 5c5d37708f82073eef338e2cffcd00ab872a377d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 25 Mar 2024 22:54:04 +0000 Subject: [PATCH 01/16] Changes for building TorchServe on linux aarch64 --- requirements/developer.txt | 2 +- requirements/torch_linux_aarch64.txt | 6 ++++++ ts_scripts/install_dependencies.py | 11 ++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 requirements/torch_linux_aarch64.txt diff --git a/requirements/developer.txt b/requirements/developer.txt index d1ae1fd39d..57d3d13ee7 100644 --- a/requirements/developer.txt +++ b/requirements/developer.txt @@ -14,7 +14,7 @@ pre-commit==3.3.2 twine==4.0.2 mypy==1.3.0 torchpippy==0.1.1 -intel_extension_for_pytorch==2.2.0; sys_platform != 'win32' and sys_platform != 'darwin' +intel_extension_for_pytorch==2.2.0; sys_platform != 'win32' and sys_platform != 'darwin' and platform_machine != 'aarch64' onnxruntime==1.17.1 googleapis-common-protos onnx==1.14.1 diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt new file mode 100644 index 0000000000..5aff2cf43c --- /dev/null +++ b/requirements/torch_linux_aarch64.txt @@ -0,0 +1,6 @@ +#pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cpu +-r torch_common.txt +torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' +torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' +torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index f047de2a2b..f6c208bf5b 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -118,9 +118,14 @@ def install_torch_packages(self, cuda_version): f"{sys.executable} -m pip install -U -r {torch_neuronx_requirements_file}" ) else: - os.system( - f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}.txt" - ) + if platform.machine() == "aarch64": + os.system( + f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}_{platform.machine()}.txt" + ) + else: + os.system( + f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}.txt" + ) def install_python_packages(self, cuda_version, requirements_file_path, nightly): check = "where" if platform.system() == "Windows" else "which" From 1da104fcd65188c00ec993e5bc55258cc12869fd Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 25 Mar 2024 22:56:00 +0000 Subject: [PATCH 02/16] Changes for building TorchServe on linux aarch64 --- requirements/torch_linux_aarch64.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt index 5aff2cf43c..315dd108be 100644 --- a/requirements/torch_linux_aarch64.txt +++ b/requirements/torch_linux_aarch64.txt @@ -3,4 +3,5 @@ -r torch_common.txt torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' +#torchtext==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' From 5ab0b43f46b224aa5348169a5072f04780862898 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 3 Apr 2024 19:32:10 +0000 Subject: [PATCH 03/16] Added an example for linux aarch64 --- .../SpeechT5/README.md | 48 +++++++++++++ 
.../SpeechT5/download_model.py | 17 +++++ .../SpeechT5/model-config.yaml | 7 ++ .../SpeechT5/sample_input.txt | 1 + .../SpeechT5/setup.sh | 6 ++ .../SpeechT5/text_to_speech_handler.py | 68 +++++++++++++++++++ .../{ => WaveGlow}/README.md | 0 .../{ => WaveGlow}/create_mar.sh | 0 .../{ => WaveGlow}/requirements.txt | 0 .../{ => WaveGlow}/sample_text.txt | 0 .../{ => WaveGlow}/waveglow_handler.py | 0 .../{ => WaveGlow}/waveglow_model.py | 0 12 files changed, 147 insertions(+) create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/README.md create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/download_model.py create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/setup.sh create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py rename examples/text_to_speech_synthesizer/{ => WaveGlow}/README.md (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/create_mar.sh (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/requirements.txt (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/sample_text.txt (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/waveglow_handler.py (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/waveglow_model.py (100%) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md new file mode 100644 index 0000000000..e92442a54c --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -0,0 +1,48 @@ +# Text to Speech synthesis with SpeechT5 + +This is an example showing text to speech synthesis using SpeechT5 model. 
+ +While running this model on `linux-aarch64`, you can enable these optimizations + +``` +export DNNL_DEFAULT_FPMATH_MODE=BF16 +export LRU_CACHE_CAPACITY=1024 +``` +More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) + + +## Pre-requesites +``` +chmod +x setup.sh +./setup.sh +``` + +## Download model + +This saves the model artifacts to `model_artifacts` directory +``` +huggingface-cli login +python download_model.py +``` + +## Create model archiver + +``` +mkdir model_store + +torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store + +mv model_artifacts model_store/SpeechT5-TTS/ +``` + +## Start TorchServe + +``` +torchserve --start --ncs --model-store model_store --models SpeechT5-TTS +``` + +## Send Inference request + +``` +curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav +``` \ No newline at end of file diff --git a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py new file mode 100644 index 0000000000..a50654596e --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py @@ -0,0 +1,17 @@ +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +from datasets import load_dataset +import torch +import soundfile as sf +from datasets import load_dataset + +processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") +model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") +vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") + +embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") + +model.save_pretrained(save_directory="model_artifacts/model") +processor.save_pretrained(save_directory="model_artifacts/processor") +vocoder.save_pretrained(save_directory="model_artifacts/vocoder") +embeddings_dataset.save_to_disk("model_artifacts/speaker_embeddings") +print("Save model artifacts to directory model_artifacts") diff --git a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml new file mode 100644 index 0000000000..aefc704276 --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml @@ -0,0 +1,7 @@ +minWorkers: 1 +maxWorkers: 1 +handler: + model: "./model" + vocoder: "./vocoder" + processor: "./processor" + speaker_embeddings: "./speaker_embeddings" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt b/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt new file mode 100644 index 0000000000..e60d898198 --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt @@ -0,0 +1 @@ +"I love San Francisco" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/setup.sh b/examples/text_to_speech_synthesizer/SpeechT5/setup.sh new file mode 100644 index 0000000000..895c08b49e --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/setup.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Needed for soundfile +sudo apt install libsndfile1 -y + +pip install --upgrade transformers sentencepiece datasets[audio] soundfile diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py new file mode 100644 index 0000000000..074ce21043 --- /dev/null +++ 
b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -0,0 +1,68 @@ + +import logging +import os +import torch +import uuid +from ts.torch_handler.base_handler import BaseHandler + +import soundfile as sf +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +from datasets import load_from_disk + +logger = logging.getLogger(__name__) + + +class SpeechT5_TTS(BaseHandler): + def __init__(self): + self.model = None + self.processor = None + self.vocoder = None + self.speaker_embeddings = None + + def initialize(self, ctx): + + properties = ctx.system_properties + model_dir = properties.get("model_dir") + + processor = ctx.model_yaml_config["handler"]["processor"] + model = ctx.model_yaml_config["handler"]["model"] + vocoder = ctx.model_yaml_config["handler"]["vocoder"] + embeddings_dataset = ctx.model_yaml_config["handler"]["speaker_embeddings"] + + self.processor = SpeechT5Processor.from_pretrained(processor) + self.model = SpeechT5ForTextToSpeech.from_pretrained(model) + self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder) + + # load xvector containing speaker's voice characteristics from a dataset + embeddings_dataset = load_from_disk(embeddings_dataset) + self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + + def preprocess(self, requests): + + assert ( + len(requests) == 1 + ), "This is currently supported with batch_size=1" + req_data = requests[0] + + input_data = req_data.get("data") or req_data.get("body") + + if isinstance(input_data, (bytes, bytearray)): + input_data = input_data.decode("utf-8") + + inputs = self.processor(text=input_data, return_tensors="pt") + + return inputs + + def inference(self, inputs): + + output = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder) + return output + + + def postprocess(self, inference_output): + path = "/tmp/{}.wav".format(uuid.uuid4().hex) + sf.write(path, inference_output.numpy(), samplerate=16000) + with open(path, "rb") as output: + data = output.read() + os.remove(path) + return [data] \ No newline at end of file diff --git a/examples/text_to_speech_synthesizer/README.md b/examples/text_to_speech_synthesizer/WaveGlow/README.md similarity index 100% rename from examples/text_to_speech_synthesizer/README.md rename to examples/text_to_speech_synthesizer/WaveGlow/README.md diff --git a/examples/text_to_speech_synthesizer/create_mar.sh b/examples/text_to_speech_synthesizer/WaveGlow/create_mar.sh similarity index 100% rename from examples/text_to_speech_synthesizer/create_mar.sh rename to examples/text_to_speech_synthesizer/WaveGlow/create_mar.sh diff --git a/examples/text_to_speech_synthesizer/requirements.txt b/examples/text_to_speech_synthesizer/WaveGlow/requirements.txt similarity index 100% rename from examples/text_to_speech_synthesizer/requirements.txt rename to examples/text_to_speech_synthesizer/WaveGlow/requirements.txt diff --git a/examples/text_to_speech_synthesizer/sample_text.txt b/examples/text_to_speech_synthesizer/WaveGlow/sample_text.txt similarity index 100% rename from examples/text_to_speech_synthesizer/sample_text.txt rename to examples/text_to_speech_synthesizer/WaveGlow/sample_text.txt diff --git a/examples/text_to_speech_synthesizer/waveglow_handler.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py similarity index 100% rename from examples/text_to_speech_synthesizer/waveglow_handler.py rename to examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py 
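The SpeechT5 example added above serves inference through TorchServe's predictions API, and its README demonstrates the call with `curl`. The same request can be made from Python; the snippet below is a minimal sketch, assuming TorchServe is running locally on the default inference port (8080) with the model registered as `SpeechT5-TTS`, that `sample_input.txt` is in the working directory, and that the third-party `requests` package is installed.

```python
# Minimal client mirroring the README's curl example (an illustrative sketch,
# not part of the patch). Assumes a local TorchServe instance on port 8080
# with the SpeechT5-TTS model registered.
import requests

# Send the raw text body, exactly as `curl -T sample_input.txt` does.
with open("sample_input.txt", "rb") as f:
    text = f.read()

resp = requests.post(
    "http://127.0.0.1:8080/predictions/SpeechT5-TTS",
    data=text,
    timeout=300,
)
resp.raise_for_status()

# The handler's postprocess step returns raw 16 kHz WAV bytes,
# so the response body can be written straight to disk.
with open("speech.wav", "wb") as out:
    out.write(resp.content)
print("Wrote speech.wav")
```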
diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py similarity index 100% rename from examples/text_to_speech_synthesizer/waveglow_model.py rename to examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py From 441eb5e0acd96771aad0ec6426e30cc7c0c99922 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:41:51 +0000 Subject: [PATCH 04/16] Doc update for linux aarch64 --- docs/linux_aarch64.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/linux_aarch64.md diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md new file mode 100644 index 0000000000..ee6f3f7f5b --- /dev/null +++ b/docs/linux_aarch64.md @@ -0,0 +1,26 @@ +# TorchServe on linux aarch64 + +TorchServe has been tested to be working on linux aarch64. Tested this on Amazon Graviton 3 instance(m7g.4x.large) + +## Installation + +Currently installation from PyPi or installing from source works + +``` +python ts_scripts/install_dependencies.py +pip install torchserve torch-model-archiver torch-workflow-archiver +``` + +## Optimizations + +You can also enable this optimizations for Graviton 3 to get an improved performance. More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) +``` +export DNNL_DEFAULT_FPMATH_MODE=BF16 +export LRU_CACHE_CAPACITY=1024 +``` + +## Example + +This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 + + From 92ad55a48d8ab511608240201d1cda8e3b26a67b Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:41:53 +0000 Subject: [PATCH 05/16] Doc update for linux aarch64 --- docs/linux_aarch64.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index ee6f3f7f5b..7c37cc8bab 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,6 +1,6 @@ # TorchServe on linux aarch64 -TorchServe has been tested to be working on linux aarch64. Tested this on Amazon Graviton 3 instance(m7g.4x.large) +TorchServe has been tested to be working on linux aarch64 for some of the examples. Regression tests have not been tested. Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation From aa0a9c5a6a6240eb209f7fca7a8450918fc3f226 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:43:39 +0000 Subject: [PATCH 06/16] Doc update for linux aarch64 --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index e92442a54c..8e955cf390 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -1,6 +1,6 @@ # Text to Speech synthesis with SpeechT5 -This is an example showing text to speech synthesis using SpeechT5 model. +This is an example showing text to speech synthesis using SpeechT5 model. 
This has been verified to work on (linux-aarch64) Graviton 3 instance While running this model on `linux-aarch64`, you can enable these optimizations From 9a07909eefae1e7ddf54169f1a705eb7a990cdae Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:46:14 +0000 Subject: [PATCH 07/16] removed torchtext for aarch64 --- requirements/torch_linux_aarch64.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt index 315dd108be..5aff2cf43c 100644 --- a/requirements/torch_linux_aarch64.txt +++ b/requirements/torch_linux_aarch64.txt @@ -3,5 +3,4 @@ -r torch_common.txt torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' -#torchtext==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' From e7f31a489dabdc0cc160a4fc11aeaa2eaf2adf75 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:29:25 +0000 Subject: [PATCH 08/16] lint failure --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 6 +++--- ts_scripts/spellcheck_conf/wordlist.txt | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index 8e955cf390..b991ee218e 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -1,4 +1,4 @@ -# Text to Speech synthesis with SpeechT5 +# Text to Speech synthesis with SpeechT5 This is an example showing text to speech synthesis using SpeechT5 model. This has been verified to work on (linux-aarch64) Graviton 3 instance @@ -11,7 +11,7 @@ export LRU_CACHE_CAPACITY=1024 More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) -## Pre-requesites +## Pre-requisites ``` chmod +x setup.sh ./setup.sh @@ -45,4 +45,4 @@ torchserve --start --ncs --model-store model_store --models SpeechT5-TTS ``` curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav -``` \ No newline at end of file +``` diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 8df3e2852e..48f4e81112 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1216,3 +1216,7 @@ libomp rpath venv TorchInductor +Graviton +aarch +linux +SpeechT From 458be7068b3acc4bcb3ce686d39405d4da098fa2 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:30:44 +0000 Subject: [PATCH 09/16] lint failure --- .../SpeechT5/download_model.py | 5 +--- .../SpeechT5/text_to_speech_handler.py | 28 +++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py index a50654596e..66d1494e0c 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py @@ -1,8 +1,5 @@ -from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan -from datasets import load_dataset -import torch -import soundfile as sf from datasets import load_dataset +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor processor = 
SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py index 074ce21043..ca9f30ab26 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -1,13 +1,13 @@ - import logging import os -import torch import uuid -from ts.torch_handler.base_handler import BaseHandler import soundfile as sf -from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +import torch from datasets import load_from_disk +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor + +from ts.torch_handler.base_handler import BaseHandler logger = logging.getLogger(__name__) @@ -18,9 +18,8 @@ def __init__(self): self.processor = None self.vocoder = None self.speaker_embeddings = None - - def initialize(self, ctx): + def initialize(self, ctx): properties = ctx.system_properties model_dir = properties.get("model_dir") @@ -35,13 +34,12 @@ def initialize(self, ctx): # load xvector containing speaker's voice characteristics from a dataset embeddings_dataset = load_from_disk(embeddings_dataset) - self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + self.speaker_embeddings = torch.tensor( + embeddings_dataset[7306]["xvector"] + ).unsqueeze(0) def preprocess(self, requests): - - assert ( - len(requests) == 1 - ), "This is currently supported with batch_size=1" + assert len(requests) == 1, "This is currently supported with batch_size=1" req_data = requests[0] input_data = req_data.get("data") or req_data.get("body") @@ -54,15 +52,15 @@ def preprocess(self, requests): return inputs def inference(self, inputs): - - output = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder) + output = self.model.generate_speech( + inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder + ) return output - def postprocess(self, inference_output): path = "/tmp/{}.wav".format(uuid.uuid4().hex) sf.write(path, inference_output.numpy(), samplerate=16000) with open(path, "rb") as output: data = output.read() os.remove(path) - return [data] \ No newline at end of file + return [data] From 68706be9129d6785c06e9d5fa8759be629df19cb Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:32:29 +0000 Subject: [PATCH 10/16] Build conda binaries --- binaries/conda/build_packages.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/binaries/conda/build_packages.py b/binaries/conda/build_packages.py index 4fd8a5d82b..00b9e9c13b 100644 --- a/binaries/conda/build_packages.py +++ b/binaries/conda/build_packages.py @@ -22,7 +22,13 @@ PACKAGES = ["torchserve", "torch-model-archiver", "torch-workflow-archiver"] # conda convert supported platforms https://docs.conda.io/projects/conda-build/en/stable/resources/commands/conda-convert.html -PLATFORMS = ["linux-64", "osx-64", "win-64", "osx-arm64"] # Add a new platform here +PLATFORMS = [ + "linux-64", + "osx-64", + "win-64", + "osx-arm64", + "linux-aarch64", +] # Add a new platform here if os.name == "nt": # Assumes miniconda is installed in windows From 1a3b2fb565cca943f61a7bafcaa11d2eacb9ff04 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:35:19 +0000 Subject: [PATCH 11/16] Build conda binaries 
--- .../WaveGlow/waveglow_model.py | 96 +++++++++++-------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py index c799709a87..31b9992383 100644 --- a/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py +++ b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py @@ -25,8 +25,8 @@ # # ***************************************************************************** import torch -from torch.autograd import Variable import torch.nn.functional as F +from torch.autograd import Variable @torch.jit.script @@ -48,11 +48,12 @@ class Invertible1x1Conv(torch.nn.Module): def __init__(self, c): super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, - bias=False) + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) # Sample a random orthonormal matrix to initialize weights - W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] # Ensure determinant is 1.0 not -1.0 if torch.det(W) < 0: @@ -67,18 +68,25 @@ def forward(self, z, reverse=False): W = self.conv.weight.squeeze() if reverse: - if not hasattr(self, 'W_inverse'): + if not hasattr(self, "W_inverse"): # Reverse computation W_inverse = W.float().inverse() W_inverse = Variable(W_inverse[..., None]) - if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor': + if ( + z.type() == "torch.cuda.HalfTensor" + or z.type() == "torch.HalfTensor" + ): W_inverse = W_inverse.half() self.W_inverse = W_inverse z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) return z else: # Forward computation - log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze() + log_det_W = ( + batch_size + * n_of_groups + * torch.logdet(W.unsqueeze(0).float()).squeeze() + ) z = self.conv(z) return z, log_det_W @@ -90,11 +98,12 @@ class WN(torch.nn.Module): also no dilation size reset. 
The dilation only doubles on each layer """ - def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, - kernel_size): + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): super(WN, self).__init__() - assert(kernel_size % 2 == 1) - assert(n_channels % 2 == 0) + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 self.n_layers = n_layers self.n_channels = n_channels self.in_layers = torch.nn.ModuleList() @@ -102,7 +111,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, self.cond_layers = torch.nn.ModuleList() start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name='weight') + start = torch.nn.utils.weight_norm(start, name="weight") self.start = start # Initializing last layer to 0 makes the affine coupling layers @@ -113,15 +122,20 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, self.end = end for i in range(n_layers): - dilation = 2 ** i + dilation = 2**i padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") self.cond_layers.append(cond_layer) # last one is not necessary @@ -130,8 +144,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, else: res_skip_channels = n_channels res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm( - res_skip_layer, name='weight') + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) def forward(self, forward_input): @@ -142,12 +155,13 @@ def forward(self, forward_input): acts = fused_add_tanh_sigmoid_multiply( self.in_layers[i](audio), self.cond_layers[i](spect), - torch.IntTensor([self.n_channels])) + torch.IntTensor([self.n_channels]), + ) res_skip_acts = self.res_skip_layers[i](acts) if i < self.n_layers - 1: - audio = res_skip_acts[:, :self.n_channels, :] + audio - skip_acts = res_skip_acts[:, self.n_channels:, :] + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] else: skip_acts = res_skip_acts @@ -159,14 +173,15 @@ def forward(self, forward_input): class WaveGlow(torch.nn.Module): - def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, - n_early_size, WN_config): + def __init__( + self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config + ): super(WaveGlow, self).__init__() - self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, - n_mel_channels, - 1024, stride=256) - assert(n_group % 2 == 0) + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 self.n_flows = n_flows self.n_group = n_group self.n_early_every = n_early_every @@ -196,9 +211,9 @@ def forward(self, forward_input): # Upsample spectrogram to size of audio spect = self.upsample(spect) - assert(spect.size(2) >= audio.size(1)) + assert 
spect.size(2) >= audio.size(1) if spect.size(2) > audio.size(1): - spect = spect[:, :, :audio.size(1)] + spect = spect[:, :, : audio.size(1)] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) @@ -211,8 +226,8 @@ def forward(self, forward_input): for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:, :self.n_early_size, :]) - audio = audio[:, self.n_early_size:, :] + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) @@ -233,7 +248,6 @@ def forward(self, forward_input): return torch.cat(output_audio, 1), log_s_list, log_det_W_list def infer(self, spect, sigma=1.0): - spect = self.upsample(spect) # trim conv artifacts. maybe pad spec to kernel multiple time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] @@ -243,9 +257,9 @@ def infer(self, spect, sigma=1.0): spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) spect = spect.permute(0, 2, 1) - audio = torch.randn(spect.size(0), - self.n_remaining_channels, - spect.size(2), device=spect.device).to(spect.dtype) + audio = torch.randn( + spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device + ).to(spect.dtype) audio = torch.autograd.Variable(sigma * audio) @@ -263,16 +277,14 @@ def infer(self, spect, sigma=1.0): audio = self.convinv[k](audio, reverse=True) if k % self.n_early_every == 0 and k > 0: - z = torch.randn(spect.size(0), self.n_early_size, spect.size( - 2), device=spect.device).to(spect.dtype) + z = torch.randn( + spect.size(0), self.n_early_size, spect.size(2), device=spect.device + ).to(spect.dtype) audio = torch.cat((sigma * z, audio), 1) - audio = audio.permute( - 0, 2, 1).contiguous().view( - audio.size(0), -1).data + audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data return audio - @staticmethod def remove_weightnorm(model): waveglow = model From c8a6871bc2b09483a375befe5ca446d07daa8521 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:38:22 +0000 Subject: [PATCH 12/16] resolving merge conflicts --- .../waveglow_model.py | 304 ++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100644 examples/text_to_speech_synthesizer/waveglow_model.py diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/waveglow_model.py new file mode 100644 index 0000000000..31b9992383 --- /dev/null +++ b/examples/text_to_speech_synthesizer/waveglow_model.py @@ -0,0 +1,304 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** +import torch +import torch.nn.functional as F +from torch.autograd import Variable + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) + + # Sample a random orthonormal matrix to initialize weights + W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1 * W[:, 0] + W = W.view(c, c, 1) + self.conv.weight.data = W + + def forward(self, z, reverse=False): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if reverse: + if not hasattr(self, "W_inverse"): + # Reverse computation + W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) + if ( + z.type() == "torch.cuda.HalfTensor" + or z.type() == "torch.HalfTensor" + ): + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + # Forward computation + log_det_W = ( + batch_size + * n_of_groups + * torch.logdet(W.unsqueeze(0).float()).squeeze() + ) + z = self.conv(z) + return z, log_det_W + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer + """ + + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name="weight") + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels]), + ) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__( + self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config + ): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group / 2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size / 2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert spect.size(2) >= audio.size(1) + if spect.size(2) > audio.size(1): + spect = spect[:, :, : audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] + + audio, log_det_W = self.convinv[k](audio) + 
log_det_W_list.append(log_det_W) + + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s) * audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1], 1) + + output_audio.append(audio) + return torch.cat(output_audio, 1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = torch.randn( + spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device + ).to(spect.dtype) + + audio = torch.autograd.Variable(sigma * audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + z = torch.randn( + spect.size(0), self.n_early_size, spect.size(2), device=spect.device + ).to(spect.dtype) + audio = torch.cat((sigma * z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data + return audio + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + return new_conv_list From 8e9e482082fed07a034234fb0a999f9dddbf1975 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:38:51 +0000 Subject: [PATCH 13/16] resolving merge conflicts --- .../waveglow_model.py | 304 ------------------ 1 file changed, 304 deletions(-) delete mode 100644 examples/text_to_speech_synthesizer/waveglow_model.py diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/waveglow_model.py deleted file mode 100644 index 31b9992383..0000000000 --- a/examples/text_to_speech_synthesizer/waveglow_model.py +++ /dev/null @@ -1,304 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of the NVIDIA CORPORATION nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# ***************************************************************************** -import torch -import torch.nn.functional as F -from torch.autograd import Variable - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -class Invertible1x1Conv(torch.nn.Module): - """ - The layer outputs both the convolution, and the log determinant - of its weight matrix. If reverse=True it does convolution with - inverse - """ - - def __init__(self, c): - super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d( - c, c, kernel_size=1, stride=1, padding=0, bias=False - ) - - # Sample a random orthonormal matrix to initialize weights - W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] - - # Ensure determinant is 1.0 not -1.0 - if torch.det(W) < 0: - W[:, 0] = -1 * W[:, 0] - W = W.view(c, c, 1) - self.conv.weight.data = W - - def forward(self, z, reverse=False): - # shape - batch_size, group_size, n_of_groups = z.size() - - W = self.conv.weight.squeeze() - - if reverse: - if not hasattr(self, "W_inverse"): - # Reverse computation - W_inverse = W.float().inverse() - W_inverse = Variable(W_inverse[..., None]) - if ( - z.type() == "torch.cuda.HalfTensor" - or z.type() == "torch.HalfTensor" - ): - W_inverse = W_inverse.half() - self.W_inverse = W_inverse - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) - return z - else: - # Forward computation - log_det_W = ( - batch_size - * n_of_groups - * torch.logdet(W.unsqueeze(0).float()).squeeze() - ) - z = self.conv(z) - return z, log_det_W - - -class WN(torch.nn.Module): - """ - This is the WaveNet like layer for the affine coupling. The primary - difference from WaveNet is the convolutions need not be causal. There is - also no dilation size reset. 
The dilation only doubles on each layer - """ - - def __init__( - self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - assert n_channels % 2 == 0 - self.n_layers = n_layers - self.n_channels = n_channels - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.cond_layers = torch.nn.ModuleList() - - start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name="weight") - self.start = start - - # Initializing last layer to 0 makes the affine coupling layers - # do nothing at first. This helps with training stability - end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) - end.weight.data.zero_() - end.bias.data.zero_() - self.end = end - - for i in range(n_layers): - dilation = 2**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - n_channels, - 2 * n_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - self.cond_layers.append(cond_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * n_channels - else: - res_skip_channels = n_channels - res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, forward_input): - audio, spect = forward_input - audio = self.start(audio) - - for i in range(self.n_layers): - acts = fused_add_tanh_sigmoid_multiply( - self.in_layers[i](audio), - self.cond_layers[i](spect), - torch.IntTensor([self.n_channels]), - ) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - audio = res_skip_acts[:, : self.n_channels, :] + audio - skip_acts = res_skip_acts[:, self.n_channels :, :] - else: - skip_acts = res_skip_acts - - if i == 0: - output = skip_acts - else: - output = skip_acts + output - return self.end(output) - - -class WaveGlow(torch.nn.Module): - def __init__( - self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config - ): - super(WaveGlow, self).__init__() - - self.upsample = torch.nn.ConvTranspose1d( - n_mel_channels, n_mel_channels, 1024, stride=256 - ) - assert n_group % 2 == 0 - self.n_flows = n_flows - self.n_group = n_group - self.n_early_every = n_early_every - self.n_early_size = n_early_size - self.WN = torch.nn.ModuleList() - self.convinv = torch.nn.ModuleList() - - n_half = int(n_group / 2) - - # Set up layers with the right sizes based on how many dimensions - # have been output already - n_remaining_channels = n_group - for k in range(n_flows): - if k % self.n_early_every == 0 and k > 0: - n_half = n_half - int(self.n_early_size / 2) - n_remaining_channels = n_remaining_channels - self.n_early_size - self.convinv.append(Invertible1x1Conv(n_remaining_channels)) - self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) - self.n_remaining_channels = n_remaining_channels - - def forward(self, forward_input): - """ - forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames - forward_input[1] = audio: batch x time - """ - spect, audio = forward_input - - # Upsample spectrogram to size of audio - spect = self.upsample(spect) - assert spect.size(2) >= 
audio.size(1) - if spect.size(2) > audio.size(1): - spect = spect[:, :, : audio.size(1)] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) - spect = spect.permute(0, 2, 1) - - audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) - output_audio = [] - log_s_list = [] - log_det_W_list = [] - - for k in range(self.n_flows): - if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:, : self.n_early_size, :]) - audio = audio[:, self.n_early_size :, :] - - audio, log_det_W = self.convinv[k](audio) - log_det_W_list.append(log_det_W) - - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - log_s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = torch.exp(log_s) * audio_1 + b - log_s_list.append(log_s) - - audio = torch.cat([audio_0, audio_1], 1) - - output_audio.append(audio) - return torch.cat(output_audio, 1), log_s_list, log_det_W_list - - def infer(self, spect, sigma=1.0): - spect = self.upsample(spect) - # trim conv artifacts. maybe pad spec to kernel multiple - time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] - spect = spect[:, :, :-time_cutoff] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) - spect = spect.permute(0, 2, 1) - - audio = torch.randn( - spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device - ).to(spect.dtype) - - audio = torch.autograd.Variable(sigma * audio) - - for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = (audio_1 - b) / torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) - - audio = self.convinv[k](audio, reverse=True) - - if k % self.n_early_every == 0 and k > 0: - z = torch.randn( - spect.size(0), self.n_early_size, spect.size(2), device=spect.device - ).to(spect.dtype) - audio = torch.cat((sigma * z, audio), 1) - - audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data - return audio - - @staticmethod - def remove_weightnorm(model): - waveglow = model - for WN in waveglow.WN: - WN.start = torch.nn.utils.remove_weight_norm(WN.start) - WN.in_layers = remove(WN.in_layers) - WN.cond_layers = remove(WN.cond_layers) - WN.res_skip_layers = remove(WN.res_skip_layers) - return waveglow - - -def remove(conv_list): - new_conv_list = torch.nn.ModuleList() - for old_conv in conv_list: - old_conv = torch.nn.utils.remove_weight_norm(old_conv) - new_conv_list.append(old_conv) - return new_conv_list From 95da450c8e8760e1d77a7078a034f7292789d221 Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 5 Apr 2024 00:23:57 +0000 Subject: [PATCH 14/16] update documentation --- docs/linux_aarch64.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index 7c37cc8bab..0b911faadf 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,6 +1,9 @@ -# TorchServe on linux aarch64 +# TorchServe on linux aarch64 - Experimental -TorchServe has been tested to be working on linux aarch64 for some of the examples. Regression tests have not been tested. 
Tested this on Amazon Graviton 3 instance(m7g.4x.large) +TorchServe has been tested to be working on linux aarch64 for some of the examples. +- CI is not implemented yet. +- Regression tests have not been run. +- Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation @@ -22,5 +25,3 @@ export LRU_CACHE_CAPACITY=1024 ## Example This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 - - From dc1accd33bb307821b0dd2c662d8286a85d8372e Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 18 Apr 2024 20:38:20 +0000 Subject: [PATCH 15/16] review comments --- docs/linux_aarch64.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index 0b911faadf..5e13410c83 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,8 +1,6 @@ # TorchServe on linux aarch64 - Experimental TorchServe has been tested to be working on linux aarch64 for some of the examples. -- CI is not implemented yet. -- Regression tests have not been run. - Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation @@ -25,3 +23,7 @@ export LRU_CACHE_CAPACITY=1024 ## Example This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 + +## To Dos +- CI +- Regression tests From 4a27ed9ff7a7f0e7dfdcce8788b4b137a66530cf Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 3 May 2024 17:45:12 +0000 Subject: [PATCH 16/16] Updated based on review comments --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 6 ++++-- .../SpeechT5/model-config.yaml | 9 +++++---- .../SpeechT5/text_to_speech_handler.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index b991ee218e..e2182faf7f 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -30,9 +30,9 @@ python download_model.py ``` mkdir model_store -torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store +torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store -f -mv model_artifacts model_store/SpeechT5-TTS/ +mv model_artifacts/* model_store/SpeechT5-TTS/ ``` ## Start TorchServe @@ -46,3 +46,5 @@ torchserve --start --ncs --model-store model_store --models SpeechT5-TTS ``` curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav ``` + +This generates an audio file `speech.wav` corresponding to the text in `sample_input.txt` diff --git a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml index aefc704276..feaf7026b3 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml +++ b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml @@ -1,7 +1,8 @@ minWorkers: 1 maxWorkers: 1 handler: - model: "./model" - vocoder: "./vocoder" - processor: "./processor" - speaker_embeddings: "./speaker_embeddings" + model: "model" + vocoder: "vocoder" + processor: "processor" + 
speaker_embeddings: "speaker_embeddings" + output_dir: "/tmp" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py index ca9f30ab26..65fbbf1509 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -18,6 +18,7 @@ def __init__(self): self.processor = None self.vocoder = None self.speaker_embeddings = None + self.output_dir = "/tmp" def initialize(self, ctx): properties = ctx.system_properties @@ -27,6 +28,7 @@ def initialize(self, ctx): model = ctx.model_yaml_config["handler"]["model"] vocoder = ctx.model_yaml_config["handler"]["vocoder"] embeddings_dataset = ctx.model_yaml_config["handler"]["speaker_embeddings"] + self.output_dir = ctx.model_yaml_config["handler"]["output_dir"] self.processor = SpeechT5Processor.from_pretrained(processor) self.model = SpeechT5ForTextToSpeech.from_pretrained(model) @@ -58,7 +60,7 @@ def inference(self, inputs): return output def postprocess(self, inference_output): - path = "/tmp/{}.wav".format(uuid.uuid4().hex) + path = self.output_dir + "/{}.wav".format(uuid.uuid4().hex) sf.write(path, inference_output.numpy(), samplerate=16000) with open(path, "rb") as output: data = output.read()