From 5c5d37708f82073eef338e2cffcd00ab872a377d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 25 Mar 2024 22:54:04 +0000 Subject: [PATCH 01/16] Changes for building TorchServe on linux aarch64 --- requirements/developer.txt | 2 +- requirements/torch_linux_aarch64.txt | 6 ++++++ ts_scripts/install_dependencies.py | 11 ++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 requirements/torch_linux_aarch64.txt diff --git a/requirements/developer.txt b/requirements/developer.txt index d1ae1fd39d..57d3d13ee7 100644 --- a/requirements/developer.txt +++ b/requirements/developer.txt @@ -14,7 +14,7 @@ pre-commit==3.3.2 twine==4.0.2 mypy==1.3.0 torchpippy==0.1.1 -intel_extension_for_pytorch==2.2.0; sys_platform != 'win32' and sys_platform != 'darwin' +intel_extension_for_pytorch==2.2.0; sys_platform != 'win32' and sys_platform != 'darwin' and platform_machine != 'aarch64' onnxruntime==1.17.1 googleapis-common-protos onnx==1.14.1 diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt new file mode 100644 index 0000000000..5aff2cf43c --- /dev/null +++ b/requirements/torch_linux_aarch64.txt @@ -0,0 +1,6 @@ +#pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cpu +-r torch_common.txt +torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' +torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' +torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index f047de2a2b..f6c208bf5b 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -118,9 +118,14 @@ def install_torch_packages(self, cuda_version): f"{sys.executable} -m pip install -U -r {torch_neuronx_requirements_file}" ) else: - os.system( - f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}.txt" - ) + if platform.machine() == "aarch64": + os.system( + f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}_{platform.machine()}.txt" + ) + else: + os.system( + f"{sys.executable} -m pip install -U -r requirements/torch_{platform.system().lower()}.txt" + ) def install_python_packages(self, cuda_version, requirements_file_path, nightly): check = "where" if platform.system() == "Windows" else "which" From 1da104fcd65188c00ec993e5bc55258cc12869fd Mon Sep 17 00:00:00 2001 From: agunapal Date: Mon, 25 Mar 2024 22:56:00 +0000 Subject: [PATCH 02/16] Changes for building TorchServe on linux aarch64 --- requirements/torch_linux_aarch64.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt index 5aff2cf43c..315dd108be 100644 --- a/requirements/torch_linux_aarch64.txt +++ b/requirements/torch_linux_aarch64.txt @@ -3,4 +3,5 @@ -r torch_common.txt torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' +#torchtext==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' From 5ab0b43f46b224aa5348169a5072f04780862898 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 3 Apr 2024 19:32:10 +0000 Subject: [PATCH 03/16] Added an example for linux aarch64 --- .../SpeechT5/README.md | 48 +++++++++++++ 
.../SpeechT5/download_model.py | 17 +++++ .../SpeechT5/model-config.yaml | 7 ++ .../SpeechT5/sample_input.txt | 1 + .../SpeechT5/setup.sh | 6 ++ .../SpeechT5/text_to_speech_handler.py | 68 +++++++++++++++++++ .../{ => WaveGlow}/README.md | 0 .../{ => WaveGlow}/create_mar.sh | 0 .../{ => WaveGlow}/requirements.txt | 0 .../{ => WaveGlow}/sample_text.txt | 0 .../{ => WaveGlow}/waveglow_handler.py | 0 .../{ => WaveGlow}/waveglow_model.py | 0 12 files changed, 147 insertions(+) create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/README.md create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/download_model.py create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/setup.sh create mode 100644 examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py rename examples/text_to_speech_synthesizer/{ => WaveGlow}/README.md (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/create_mar.sh (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/requirements.txt (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/sample_text.txt (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/waveglow_handler.py (100%) rename examples/text_to_speech_synthesizer/{ => WaveGlow}/waveglow_model.py (100%) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md new file mode 100644 index 0000000000..e92442a54c --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -0,0 +1,48 @@ +# Text to Speech synthesis with SpeechT5 + +This is an example showing text to speech synthesis using SpeechT5 model. 
+ +While running this model on `linux-aarch64`, you can enable these optimizations + +``` +export DNNL_DEFAULT_FPMATH_MODE=BF16 +export LRU_CACHE_CAPACITY=1024 +``` +More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) + + +## Pre-requesites +``` +chmod +x setup.sh +./setup.sh +``` + +## Download model + +This saves the model artifacts to `model_artifacts` directory +``` +huggingface-cli login +python download_model.py +``` + +## Create model archiver + +``` +mkdir model_store + +torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store + +mv model_artifacts model_store/SpeechT5-TTS/ +``` + +## Start TorchServe + +``` +torchserve --start --ncs --model-store model_store --models SpeechT5-TTS +``` + +## Send Inference request + +``` +curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav +``` \ No newline at end of file diff --git a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py new file mode 100644 index 0000000000..a50654596e --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py @@ -0,0 +1,17 @@ +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +from datasets import load_dataset +import torch +import soundfile as sf +from datasets import load_dataset + +processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") +model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") +vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") + +embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") + +model.save_pretrained(save_directory="model_artifacts/model") +processor.save_pretrained(save_directory="model_artifacts/processor") +vocoder.save_pretrained(save_directory="model_artifacts/vocoder") +embeddings_dataset.save_to_disk("model_artifacts/speaker_embeddings") +print("Save model artifacts to directory model_artifacts") diff --git a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml new file mode 100644 index 0000000000..aefc704276 --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml @@ -0,0 +1,7 @@ +minWorkers: 1 +maxWorkers: 1 +handler: + model: "./model" + vocoder: "./vocoder" + processor: "./processor" + speaker_embeddings: "./speaker_embeddings" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt b/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt new file mode 100644 index 0000000000..e60d898198 --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/sample_input.txt @@ -0,0 +1 @@ +"I love San Francisco" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/setup.sh b/examples/text_to_speech_synthesizer/SpeechT5/setup.sh new file mode 100644 index 0000000000..895c08b49e --- /dev/null +++ b/examples/text_to_speech_synthesizer/SpeechT5/setup.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Needed for soundfile +sudo apt install libsndfile1 -y + +pip install --upgrade transformers sentencepiece datasets[audio] soundfile diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py new file mode 100644 index 0000000000..074ce21043 --- /dev/null +++ 
b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -0,0 +1,68 @@ + +import logging +import os +import torch +import uuid +from ts.torch_handler.base_handler import BaseHandler + +import soundfile as sf +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +from datasets import load_from_disk + +logger = logging.getLogger(__name__) + + +class SpeechT5_TTS(BaseHandler): + def __init__(self): + self.model = None + self.processor = None + self.vocoder = None + self.speaker_embeddings = None + + def initialize(self, ctx): + + properties = ctx.system_properties + model_dir = properties.get("model_dir") + + processor = ctx.model_yaml_config["handler"]["processor"] + model = ctx.model_yaml_config["handler"]["model"] + vocoder = ctx.model_yaml_config["handler"]["vocoder"] + embeddings_dataset = ctx.model_yaml_config["handler"]["speaker_embeddings"] + + self.processor = SpeechT5Processor.from_pretrained(processor) + self.model = SpeechT5ForTextToSpeech.from_pretrained(model) + self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder) + + # load xvector containing speaker's voice characteristics from a dataset + embeddings_dataset = load_from_disk(embeddings_dataset) + self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + + def preprocess(self, requests): + + assert ( + len(requests) == 1 + ), "This is currently supported with batch_size=1" + req_data = requests[0] + + input_data = req_data.get("data") or req_data.get("body") + + if isinstance(input_data, (bytes, bytearray)): + input_data = input_data.decode("utf-8") + + inputs = self.processor(text=input_data, return_tensors="pt") + + return inputs + + def inference(self, inputs): + + output = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder) + return output + + + def postprocess(self, inference_output): + path = "/tmp/{}.wav".format(uuid.uuid4().hex) + sf.write(path, inference_output.numpy(), samplerate=16000) + with open(path, "rb") as output: + data = output.read() + os.remove(path) + return [data] \ No newline at end of file diff --git a/examples/text_to_speech_synthesizer/README.md b/examples/text_to_speech_synthesizer/WaveGlow/README.md similarity index 100% rename from examples/text_to_speech_synthesizer/README.md rename to examples/text_to_speech_synthesizer/WaveGlow/README.md diff --git a/examples/text_to_speech_synthesizer/create_mar.sh b/examples/text_to_speech_synthesizer/WaveGlow/create_mar.sh similarity index 100% rename from examples/text_to_speech_synthesizer/create_mar.sh rename to examples/text_to_speech_synthesizer/WaveGlow/create_mar.sh diff --git a/examples/text_to_speech_synthesizer/requirements.txt b/examples/text_to_speech_synthesizer/WaveGlow/requirements.txt similarity index 100% rename from examples/text_to_speech_synthesizer/requirements.txt rename to examples/text_to_speech_synthesizer/WaveGlow/requirements.txt diff --git a/examples/text_to_speech_synthesizer/sample_text.txt b/examples/text_to_speech_synthesizer/WaveGlow/sample_text.txt similarity index 100% rename from examples/text_to_speech_synthesizer/sample_text.txt rename to examples/text_to_speech_synthesizer/WaveGlow/sample_text.txt diff --git a/examples/text_to_speech_synthesizer/waveglow_handler.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py similarity index 100% rename from examples/text_to_speech_synthesizer/waveglow_handler.py rename to examples/text_to_speech_synthesizer/WaveGlow/waveglow_handler.py 
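The SpeechT5 example added above serves inference through TorchServe's predictions API, and its README demonstrates the call with `curl`. The same request can be made from Python; the snippet below is a minimal sketch, assuming TorchServe is running locally on the default inference port (8080) with the model registered as `SpeechT5-TTS`, that `sample_input.txt` is in the working directory, and that the third-party `requests` package is installed.

```python
# Minimal client mirroring the README's curl example (an illustrative sketch,
# not part of the patch). Assumes a local TorchServe instance on port 8080
# with the SpeechT5-TTS model registered.
import requests

# Send the raw text body, exactly as `curl -T sample_input.txt` does.
with open("sample_input.txt", "rb") as f:
    text = f.read()

resp = requests.post(
    "http://127.0.0.1:8080/predictions/SpeechT5-TTS",
    data=text,
    timeout=300,
)
resp.raise_for_status()

# The handler's postprocess step returns raw 16 kHz WAV bytes,
# so the response body can be written straight to disk.
with open("speech.wav", "wb") as out:
    out.write(resp.content)
print("Wrote speech.wav")
```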
diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py similarity index 100% rename from examples/text_to_speech_synthesizer/waveglow_model.py rename to examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py From 441eb5e0acd96771aad0ec6426e30cc7c0c99922 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:41:51 +0000 Subject: [PATCH 04/16] Doc update for linux aarch64 --- docs/linux_aarch64.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/linux_aarch64.md diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md new file mode 100644 index 0000000000..ee6f3f7f5b --- /dev/null +++ b/docs/linux_aarch64.md @@ -0,0 +1,26 @@ +# TorchServe on linux aarch64 + +TorchServe has been tested to be working on linux aarch64. Tested this on Amazon Graviton 3 instance(m7g.4x.large) + +## Installation + +Currently installation from PyPi or installing from source works + +``` +python ts_scripts/install_dependencies.py +pip install torchserve torch-model-archiver torch-workflow-archiver +``` + +## Optimizations + +You can also enable this optimizations for Graviton 3 to get an improved performance. More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) +``` +export DNNL_DEFAULT_FPMATH_MODE=BF16 +export LRU_CACHE_CAPACITY=1024 +``` + +## Example + +This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 + + From 92ad55a48d8ab511608240201d1cda8e3b26a67b Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:41:53 +0000 Subject: [PATCH 05/16] Doc update for linux aarch64 --- docs/linux_aarch64.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index ee6f3f7f5b..7c37cc8bab 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,6 +1,6 @@ # TorchServe on linux aarch64 -TorchServe has been tested to be working on linux aarch64. Tested this on Amazon Graviton 3 instance(m7g.4x.large) +TorchServe has been tested to be working on linux aarch64 for some of the examples. Regression tests have not been tested. Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation From aa0a9c5a6a6240eb209f7fca7a8450918fc3f226 Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:43:39 +0000 Subject: [PATCH 06/16] Doc update for linux aarch64 --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index e92442a54c..8e955cf390 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -1,6 +1,6 @@ # Text to Speech synthesis with SpeechT5 -This is an example showing text to speech synthesis using SpeechT5 model. +This is an example showing text to speech synthesis using SpeechT5 model. 
This has been verified to work on (linux-aarch64) Graviton 3 instance While running this model on `linux-aarch64`, you can enable these optimizations From 9a07909eefae1e7ddf54169f1a705eb7a990cdae Mon Sep 17 00:00:00 2001 From: agunapal Date: Wed, 3 Apr 2024 19:46:14 +0000 Subject: [PATCH 07/16] removed torchtext for aarch64 --- requirements/torch_linux_aarch64.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/torch_linux_aarch64.txt b/requirements/torch_linux_aarch64.txt index 315dd108be..5aff2cf43c 100644 --- a/requirements/torch_linux_aarch64.txt +++ b/requirements/torch_linux_aarch64.txt @@ -3,5 +3,4 @@ -r torch_common.txt torch==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchvision==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' -#torchtext==0.17.1; sys_platform == 'linux' and platform_machine == 'aarch64' torchaudio==2.2.1; sys_platform == 'linux' and platform_machine == 'aarch64' From e7f31a489dabdc0cc160a4fc11aeaa2eaf2adf75 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:29:25 +0000 Subject: [PATCH 08/16] lint failure --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 6 +++--- ts_scripts/spellcheck_conf/wordlist.txt | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index 8e955cf390..b991ee218e 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -1,4 +1,4 @@ -# Text to Speech synthesis with SpeechT5 +# Text to Speech synthesis with SpeechT5 This is an example showing text to speech synthesis using SpeechT5 model. This has been verified to work on (linux-aarch64) Graviton 3 instance @@ -11,7 +11,7 @@ export LRU_CACHE_CAPACITY=1024 More details can be found in this [blog](https://pytorch.org/blog/optimized-pytorch-w-graviton/) -## Pre-requesites +## Pre-requisites ``` chmod +x setup.sh ./setup.sh @@ -45,4 +45,4 @@ torchserve --start --ncs --model-store model_store --models SpeechT5-TTS ``` curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav -``` \ No newline at end of file +``` diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 8df3e2852e..48f4e81112 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1216,3 +1216,7 @@ libomp rpath venv TorchInductor +Graviton +aarch +linux +SpeechT From 458be7068b3acc4bcb3ce686d39405d4da098fa2 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:30:44 +0000 Subject: [PATCH 09/16] lint failure --- .../SpeechT5/download_model.py | 5 +--- .../SpeechT5/text_to_speech_handler.py | 28 +++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py index a50654596e..66d1494e0c 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/download_model.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/download_model.py @@ -1,8 +1,5 @@ -from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan -from datasets import load_dataset -import torch -import soundfile as sf from datasets import load_dataset +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor processor = 
SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py index 074ce21043..ca9f30ab26 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -1,13 +1,13 @@ - import logging import os -import torch import uuid -from ts.torch_handler.base_handler import BaseHandler import soundfile as sf -from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan +import torch from datasets import load_from_disk +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor + +from ts.torch_handler.base_handler import BaseHandler logger = logging.getLogger(__name__) @@ -18,9 +18,8 @@ def __init__(self): self.processor = None self.vocoder = None self.speaker_embeddings = None - - def initialize(self, ctx): + def initialize(self, ctx): properties = ctx.system_properties model_dir = properties.get("model_dir") @@ -35,13 +34,12 @@ def initialize(self, ctx): # load xvector containing speaker's voice characteristics from a dataset embeddings_dataset = load_from_disk(embeddings_dataset) - self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) + self.speaker_embeddings = torch.tensor( + embeddings_dataset[7306]["xvector"] + ).unsqueeze(0) def preprocess(self, requests): - - assert ( - len(requests) == 1 - ), "This is currently supported with batch_size=1" + assert len(requests) == 1, "This is currently supported with batch_size=1" req_data = requests[0] input_data = req_data.get("data") or req_data.get("body") @@ -54,15 +52,15 @@ def preprocess(self, requests): return inputs def inference(self, inputs): - - output = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder) + output = self.model.generate_speech( + inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder + ) return output - def postprocess(self, inference_output): path = "/tmp/{}.wav".format(uuid.uuid4().hex) sf.write(path, inference_output.numpy(), samplerate=16000) with open(path, "rb") as output: data = output.read() os.remove(path) - return [data] \ No newline at end of file + return [data] From 68706be9129d6785c06e9d5fa8759be629df19cb Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:32:29 +0000 Subject: [PATCH 10/16] Build conda binaries --- binaries/conda/build_packages.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/binaries/conda/build_packages.py b/binaries/conda/build_packages.py index 4fd8a5d82b..00b9e9c13b 100644 --- a/binaries/conda/build_packages.py +++ b/binaries/conda/build_packages.py @@ -22,7 +22,13 @@ PACKAGES = ["torchserve", "torch-model-archiver", "torch-workflow-archiver"] # conda convert supported platforms https://docs.conda.io/projects/conda-build/en/stable/resources/commands/conda-convert.html -PLATFORMS = ["linux-64", "osx-64", "win-64", "osx-arm64"] # Add a new platform here +PLATFORMS = [ + "linux-64", + "osx-64", + "win-64", + "osx-arm64", + "linux-aarch64", +] # Add a new platform here if os.name == "nt": # Assumes miniconda is installed in windows From 1a3b2fb565cca943f61a7bafcaa11d2eacb9ff04 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:35:19 +0000 Subject: [PATCH 11/16] Build conda binaries 
--- .../WaveGlow/waveglow_model.py | 96 +++++++++++-------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py index c799709a87..31b9992383 100644 --- a/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py +++ b/examples/text_to_speech_synthesizer/WaveGlow/waveglow_model.py @@ -25,8 +25,8 @@ # # ***************************************************************************** import torch -from torch.autograd import Variable import torch.nn.functional as F +from torch.autograd import Variable @torch.jit.script @@ -48,11 +48,12 @@ class Invertible1x1Conv(torch.nn.Module): def __init__(self, c): super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, - bias=False) + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) # Sample a random orthonormal matrix to initialize weights - W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] # Ensure determinant is 1.0 not -1.0 if torch.det(W) < 0: @@ -67,18 +68,25 @@ def forward(self, z, reverse=False): W = self.conv.weight.squeeze() if reverse: - if not hasattr(self, 'W_inverse'): + if not hasattr(self, "W_inverse"): # Reverse computation W_inverse = W.float().inverse() W_inverse = Variable(W_inverse[..., None]) - if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor': + if ( + z.type() == "torch.cuda.HalfTensor" + or z.type() == "torch.HalfTensor" + ): W_inverse = W_inverse.half() self.W_inverse = W_inverse z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) return z else: # Forward computation - log_det_W = batch_size * n_of_groups * torch.logdet(W.unsqueeze(0).float()).squeeze() + log_det_W = ( + batch_size + * n_of_groups + * torch.logdet(W.unsqueeze(0).float()).squeeze() + ) z = self.conv(z) return z, log_det_W @@ -90,11 +98,12 @@ class WN(torch.nn.Module): also no dilation size reset. 
The dilation only doubles on each layer """ - def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, - kernel_size): + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): super(WN, self).__init__() - assert(kernel_size % 2 == 1) - assert(n_channels % 2 == 0) + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 self.n_layers = n_layers self.n_channels = n_channels self.in_layers = torch.nn.ModuleList() @@ -102,7 +111,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, self.cond_layers = torch.nn.ModuleList() start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name='weight') + start = torch.nn.utils.weight_norm(start, name="weight") self.start = start # Initializing last layer to 0 makes the affine coupling layers @@ -113,15 +122,20 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, self.end = end for i in range(n_layers): - dilation = 2 ** i + dilation = 2**i padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(n_channels, 2 * n_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") self.cond_layers.append(cond_layer) # last one is not necessary @@ -130,8 +144,7 @@ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, else: res_skip_channels = n_channels res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm( - res_skip_layer, name='weight') + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) def forward(self, forward_input): @@ -142,12 +155,13 @@ def forward(self, forward_input): acts = fused_add_tanh_sigmoid_multiply( self.in_layers[i](audio), self.cond_layers[i](spect), - torch.IntTensor([self.n_channels])) + torch.IntTensor([self.n_channels]), + ) res_skip_acts = self.res_skip_layers[i](acts) if i < self.n_layers - 1: - audio = res_skip_acts[:, :self.n_channels, :] + audio - skip_acts = res_skip_acts[:, self.n_channels:, :] + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] else: skip_acts = res_skip_acts @@ -159,14 +173,15 @@ def forward(self, forward_input): class WaveGlow(torch.nn.Module): - def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, - n_early_size, WN_config): + def __init__( + self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config + ): super(WaveGlow, self).__init__() - self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, - n_mel_channels, - 1024, stride=256) - assert(n_group % 2 == 0) + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 self.n_flows = n_flows self.n_group = n_group self.n_early_every = n_early_every @@ -196,9 +211,9 @@ def forward(self, forward_input): # Upsample spectrogram to size of audio spect = self.upsample(spect) - assert(spect.size(2) >= audio.size(1)) + assert 
spect.size(2) >= audio.size(1) if spect.size(2) > audio.size(1): - spect = spect[:, :, :audio.size(1)] + spect = spect[:, :, : audio.size(1)] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) @@ -211,8 +226,8 @@ def forward(self, forward_input): for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:, :self.n_early_size, :]) - audio = audio[:, self.n_early_size:, :] + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) @@ -233,7 +248,6 @@ def forward(self, forward_input): return torch.cat(output_audio, 1), log_s_list, log_det_W_list def infer(self, spect, sigma=1.0): - spect = self.upsample(spect) # trim conv artifacts. maybe pad spec to kernel multiple time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] @@ -243,9 +257,9 @@ def infer(self, spect, sigma=1.0): spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) spect = spect.permute(0, 2, 1) - audio = torch.randn(spect.size(0), - self.n_remaining_channels, - spect.size(2), device=spect.device).to(spect.dtype) + audio = torch.randn( + spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device + ).to(spect.dtype) audio = torch.autograd.Variable(sigma * audio) @@ -263,16 +277,14 @@ def infer(self, spect, sigma=1.0): audio = self.convinv[k](audio, reverse=True) if k % self.n_early_every == 0 and k > 0: - z = torch.randn(spect.size(0), self.n_early_size, spect.size( - 2), device=spect.device).to(spect.dtype) + z = torch.randn( + spect.size(0), self.n_early_size, spect.size(2), device=spect.device + ).to(spect.dtype) audio = torch.cat((sigma * z, audio), 1) - audio = audio.permute( - 0, 2, 1).contiguous().view( - audio.size(0), -1).data + audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data return audio - @staticmethod def remove_weightnorm(model): waveglow = model From c8a6871bc2b09483a375befe5ca446d07daa8521 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:38:22 +0000 Subject: [PATCH 12/16] resolving merge conflicts --- .../waveglow_model.py | 304 ++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100644 examples/text_to_speech_synthesizer/waveglow_model.py diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/waveglow_model.py new file mode 100644 index 0000000000..31b9992383 --- /dev/null +++ b/examples/text_to_speech_synthesizer/waveglow_model.py @@ -0,0 +1,304 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ***************************************************************************** +import torch +import torch.nn.functional as F +from torch.autograd import Variable + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) + + # Sample a random orthonormal matrix to initialize weights + W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1 * W[:, 0] + W = W.view(c, c, 1) + self.conv.weight.data = W + + def forward(self, z, reverse=False): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if reverse: + if not hasattr(self, "W_inverse"): + # Reverse computation + W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) + if ( + z.type() == "torch.cuda.HalfTensor" + or z.type() == "torch.HalfTensor" + ): + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + # Forward computation + log_det_W = ( + batch_size + * n_of_groups + * torch.logdet(W.unsqueeze(0).float()).squeeze() + ) + z = self.conv(z) + return z, log_det_W + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer + """ + + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name="weight") + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels]), + ) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__( + self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config + ): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group / 2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size / 2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert spect.size(2) >= audio.size(1) + if spect.size(2) > audio.size(1): + spect = spect[:, :, : audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] + + audio, log_det_W = self.convinv[k](audio) + 
log_det_W_list.append(log_det_W) + + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s) * audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1], 1) + + output_audio.append(audio) + return torch.cat(output_audio, 1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) + spect = spect.permute(0, 2, 1) + + audio = torch.randn( + spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device + ).to(spect.dtype) + + audio = torch.autograd.Variable(sigma * audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + z = torch.randn( + spect.size(0), self.n_early_size, spect.size(2), device=spect.device + ).to(spect.dtype) + audio = torch.cat((sigma * z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data + return audio + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + return new_conv_list From 8e9e482082fed07a034234fb0a999f9dddbf1975 Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 4 Apr 2024 21:38:51 +0000 Subject: [PATCH 13/16] resolving merge conflicts --- .../waveglow_model.py | 304 ------------------ 1 file changed, 304 deletions(-) delete mode 100644 examples/text_to_speech_synthesizer/waveglow_model.py diff --git a/examples/text_to_speech_synthesizer/waveglow_model.py b/examples/text_to_speech_synthesizer/waveglow_model.py deleted file mode 100644 index 31b9992383..0000000000 --- a/examples/text_to_speech_synthesizer/waveglow_model.py +++ /dev/null @@ -1,304 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of the NVIDIA CORPORATION nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# ***************************************************************************** -import torch -import torch.nn.functional as F -from torch.autograd import Variable - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -class Invertible1x1Conv(torch.nn.Module): - """ - The layer outputs both the convolution, and the log determinant - of its weight matrix. If reverse=True it does convolution with - inverse - """ - - def __init__(self, c): - super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d( - c, c, kernel_size=1, stride=1, padding=0, bias=False - ) - - # Sample a random orthonormal matrix to initialize weights - W = torch.linalg.qr(torch.FloatTensor(c, c).normal_())[0] - - # Ensure determinant is 1.0 not -1.0 - if torch.det(W) < 0: - W[:, 0] = -1 * W[:, 0] - W = W.view(c, c, 1) - self.conv.weight.data = W - - def forward(self, z, reverse=False): - # shape - batch_size, group_size, n_of_groups = z.size() - - W = self.conv.weight.squeeze() - - if reverse: - if not hasattr(self, "W_inverse"): - # Reverse computation - W_inverse = W.float().inverse() - W_inverse = Variable(W_inverse[..., None]) - if ( - z.type() == "torch.cuda.HalfTensor" - or z.type() == "torch.HalfTensor" - ): - W_inverse = W_inverse.half() - self.W_inverse = W_inverse - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) - return z - else: - # Forward computation - log_det_W = ( - batch_size - * n_of_groups - * torch.logdet(W.unsqueeze(0).float()).squeeze() - ) - z = self.conv(z) - return z, log_det_W - - -class WN(torch.nn.Module): - """ - This is the WaveNet like layer for the affine coupling. The primary - difference from WaveNet is the convolutions need not be causal. There is - also no dilation size reset. 
The dilation only doubles on each layer - """ - - def __init__( - self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - assert n_channels % 2 == 0 - self.n_layers = n_layers - self.n_channels = n_channels - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.cond_layers = torch.nn.ModuleList() - - start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name="weight") - self.start = start - - # Initializing last layer to 0 makes the affine coupling layers - # do nothing at first. This helps with training stability - end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) - end.weight.data.zero_() - end.bias.data.zero_() - self.end = end - - for i in range(n_layers): - dilation = 2**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - n_channels, - 2 * n_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - self.cond_layers.append(cond_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * n_channels - else: - res_skip_channels = n_channels - res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, forward_input): - audio, spect = forward_input - audio = self.start(audio) - - for i in range(self.n_layers): - acts = fused_add_tanh_sigmoid_multiply( - self.in_layers[i](audio), - self.cond_layers[i](spect), - torch.IntTensor([self.n_channels]), - ) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - audio = res_skip_acts[:, : self.n_channels, :] + audio - skip_acts = res_skip_acts[:, self.n_channels :, :] - else: - skip_acts = res_skip_acts - - if i == 0: - output = skip_acts - else: - output = skip_acts + output - return self.end(output) - - -class WaveGlow(torch.nn.Module): - def __init__( - self, n_mel_channels, n_flows, n_group, n_early_every, n_early_size, WN_config - ): - super(WaveGlow, self).__init__() - - self.upsample = torch.nn.ConvTranspose1d( - n_mel_channels, n_mel_channels, 1024, stride=256 - ) - assert n_group % 2 == 0 - self.n_flows = n_flows - self.n_group = n_group - self.n_early_every = n_early_every - self.n_early_size = n_early_size - self.WN = torch.nn.ModuleList() - self.convinv = torch.nn.ModuleList() - - n_half = int(n_group / 2) - - # Set up layers with the right sizes based on how many dimensions - # have been output already - n_remaining_channels = n_group - for k in range(n_flows): - if k % self.n_early_every == 0 and k > 0: - n_half = n_half - int(self.n_early_size / 2) - n_remaining_channels = n_remaining_channels - self.n_early_size - self.convinv.append(Invertible1x1Conv(n_remaining_channels)) - self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) - self.n_remaining_channels = n_remaining_channels - - def forward(self, forward_input): - """ - forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames - forward_input[1] = audio: batch x time - """ - spect, audio = forward_input - - # Upsample spectrogram to size of audio - spect = self.upsample(spect) - assert spect.size(2) >= 
audio.size(1) - if spect.size(2) > audio.size(1): - spect = spect[:, :, : audio.size(1)] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) - spect = spect.permute(0, 2, 1) - - audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) - output_audio = [] - log_s_list = [] - log_det_W_list = [] - - for k in range(self.n_flows): - if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:, : self.n_early_size, :]) - audio = audio[:, self.n_early_size :, :] - - audio, log_det_W = self.convinv[k](audio) - log_det_W_list.append(log_det_W) - - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - log_s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = torch.exp(log_s) * audio_1 + b - log_s_list.append(log_s) - - audio = torch.cat([audio_0, audio_1], 1) - - output_audio.append(audio) - return torch.cat(output_audio, 1), log_s_list, log_det_W_list - - def infer(self, spect, sigma=1.0): - spect = self.upsample(spect) - # trim conv artifacts. maybe pad spec to kernel multiple - time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] - spect = spect[:, :, :-time_cutoff] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1) - spect = spect.permute(0, 2, 1) - - audio = torch.randn( - spect.size(0), self.n_remaining_channels, spect.size(2), device=spect.device - ).to(spect.dtype) - - audio = torch.autograd.Variable(sigma * audio) - - for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = (audio_1 - b) / torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) - - audio = self.convinv[k](audio, reverse=True) - - if k % self.n_early_every == 0 and k > 0: - z = torch.randn( - spect.size(0), self.n_early_size, spect.size(2), device=spect.device - ).to(spect.dtype) - audio = torch.cat((sigma * z, audio), 1) - - audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data - return audio - - @staticmethod - def remove_weightnorm(model): - waveglow = model - for WN in waveglow.WN: - WN.start = torch.nn.utils.remove_weight_norm(WN.start) - WN.in_layers = remove(WN.in_layers) - WN.cond_layers = remove(WN.cond_layers) - WN.res_skip_layers = remove(WN.res_skip_layers) - return waveglow - - -def remove(conv_list): - new_conv_list = torch.nn.ModuleList() - for old_conv in conv_list: - old_conv = torch.nn.utils.remove_weight_norm(old_conv) - new_conv_list.append(old_conv) - return new_conv_list From 95da450c8e8760e1d77a7078a034f7292789d221 Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 5 Apr 2024 00:23:57 +0000 Subject: [PATCH 14/16] update documentation --- docs/linux_aarch64.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index 7c37cc8bab..0b911faadf 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,6 +1,9 @@ -# TorchServe on linux aarch64 +# TorchServe on linux aarch64 - Experimental -TorchServe has been tested to be working on linux aarch64 for some of the examples. Regression tests have not been tested. 
Tested this on Amazon Graviton 3 instance(m7g.4x.large) +TorchServe has been tested to be working on linux aarch64 for some of the examples. +- CI is not implemented yet. +- Regression tests have not been run. +- Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation @@ -22,5 +25,3 @@ export LRU_CACHE_CAPACITY=1024 ## Example This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 - - From dc1accd33bb307821b0dd2c662d8286a85d8372e Mon Sep 17 00:00:00 2001 From: agunapal Date: Thu, 18 Apr 2024 20:38:20 +0000 Subject: [PATCH 15/16] review comments --- docs/linux_aarch64.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/linux_aarch64.md b/docs/linux_aarch64.md index 0b911faadf..5e13410c83 100644 --- a/docs/linux_aarch64.md +++ b/docs/linux_aarch64.md @@ -1,8 +1,6 @@ # TorchServe on linux aarch64 - Experimental TorchServe has been tested to be working on linux aarch64 for some of the examples. -- CI is not implemented yet. -- Regression tests have not been run. - Tested this on Amazon Graviton 3 instance(m7g.4x.large) ## Installation @@ -25,3 +23,7 @@ export LRU_CACHE_CAPACITY=1024 ## Example This [example](https://github.com/pytorch/serve/tree/master/examples/text_to_speech_synthesizer/SpeechT5) on Text to Speech synthesis was verified to be working on Graviton 3 + +## To Dos +- CI +- Regression tests From 4a27ed9ff7a7f0e7dfdcce8788b4b137a66530cf Mon Sep 17 00:00:00 2001 From: agunapal Date: Fri, 3 May 2024 17:45:12 +0000 Subject: [PATCH 16/16] Updated based on review comments --- examples/text_to_speech_synthesizer/SpeechT5/README.md | 6 ++++-- .../SpeechT5/model-config.yaml | 9 +++++---- .../SpeechT5/text_to_speech_handler.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/text_to_speech_synthesizer/SpeechT5/README.md b/examples/text_to_speech_synthesizer/SpeechT5/README.md index b991ee218e..e2182faf7f 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/README.md +++ b/examples/text_to_speech_synthesizer/SpeechT5/README.md @@ -30,9 +30,9 @@ python download_model.py ``` mkdir model_store -torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store +torch-model-archiver --model-name SpeechT5-TTS --version 1.0 --handler text_to_speech_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store -f -mv model_artifacts model_store/SpeechT5-TTS/ +mv model_artifacts/* model_store/SpeechT5-TTS/ ``` ## Start TorchServe @@ -46,3 +46,5 @@ torchserve --start --ncs --model-store model_store --models SpeechT5-TTS ``` curl http://127.0.0.1:8080/predictions/SpeechT5-TTS -T sample_input.txt -o speech.wav ``` + +This generates an audio file `speech.wav` corresponding to the text in `sample_input.txt` diff --git a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml index aefc704276..feaf7026b3 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml +++ b/examples/text_to_speech_synthesizer/SpeechT5/model-config.yaml @@ -1,7 +1,8 @@ minWorkers: 1 maxWorkers: 1 handler: - model: "./model" - vocoder: "./vocoder" - processor: "./processor" - speaker_embeddings: "./speaker_embeddings" + model: "model" + vocoder: "vocoder" + processor: "processor" + 
speaker_embeddings: "speaker_embeddings" + output_dir: "/tmp" diff --git a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py index ca9f30ab26..65fbbf1509 100644 --- a/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py +++ b/examples/text_to_speech_synthesizer/SpeechT5/text_to_speech_handler.py @@ -18,6 +18,7 @@ def __init__(self): self.processor = None self.vocoder = None self.speaker_embeddings = None + self.output_dir = "/tmp" def initialize(self, ctx): properties = ctx.system_properties @@ -27,6 +28,7 @@ def initialize(self, ctx): model = ctx.model_yaml_config["handler"]["model"] vocoder = ctx.model_yaml_config["handler"]["vocoder"] embeddings_dataset = ctx.model_yaml_config["handler"]["speaker_embeddings"] + self.output_dir = ctx.model_yaml_config["handler"]["output_dir"] self.processor = SpeechT5Processor.from_pretrained(processor) self.model = SpeechT5ForTextToSpeech.from_pretrained(model) @@ -58,7 +60,7 @@ def inference(self, inputs): return output def postprocess(self, inference_output): - path = "/tmp/{}.wav".format(uuid.uuid4().hex) + path = self.output_dir + "/{}.wav".format(uuid.uuid4().hex) sf.write(path, inference_output.numpy(), samplerate=16000) with open(path, "rb") as output: data = output.read()