Merge pull request #94 from idiap/v0.24.2
v0.24.2
eginhard authored Oct 4, 2024
2 parents 439fb45 + 282b2da commit 3e1e2b8
Showing 79 changed files with 527 additions and 659 deletions.
50 changes: 14 additions & 36 deletions .github/workflows/pypi-release.yml
@@ -8,6 +8,7 @@ defaults:
bash
jobs:
build-sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -23,37 +24,31 @@ jobs:
with:
python-version: 3.9
- run: |
python -m pip install -U pip setuptools wheel build
python -m pip install -U pip setuptools build
- run: |
python -m build
- run: |
pip install dist/*.tar.gz
- uses: actions/upload-artifact@v4
with:
name: sdist
name: build-sdist
path: dist/*.tar.gz
build-wheels:
runs-on: ubuntu-latest
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install build requirements
run: |
python -m pip install -U pip setuptools wheel build numpy cython
- name: Setup and install manylinux1_x86_64 wheel
run: |
python setup.py bdist_wheel --plat-name=manylinux1_x86_64
python -m pip install dist/*-manylinux*.whl
- name: Build wheels
uses: pypa/[email protected]
- uses: actions/upload-artifact@v4
with:
name: wheel-${{ matrix.python-version }}
path: dist/*-manylinux*.whl
name: build-wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl
publish-artifacts:
name: Publish to PyPI
runs-on: ubuntu-latest
needs: [build-sdist, build-wheels]
environment:
@@ -62,28 +57,11 @@ jobs:
permissions:
id-token: write
steps:
- run: |
mkdir dist
- uses: actions/download-artifact@v4
with:
name: "sdist"
path: "dist/"
- uses: actions/download-artifact@v4
with:
name: "wheel-3.9"
path: "dist/"
- uses: actions/download-artifact@v4
with:
name: "wheel-3.10"
path: "dist/"
- uses: actions/download-artifact@v4
with:
name: "wheel-3.11"
path: "dist/"
- uses: actions/download-artifact@v4
with:
name: "wheel-3.12"
path: "dist/"
path: dist
pattern: build-*
merge-multiple: true
- run: |
ls -lh dist/
- name: Publish package distributions to PyPI
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
@@ -45,13 +45,17 @@ jobs:
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m uv pip install --system "coqui-tts[dev,server,languages] @ ."
python3 setup.py egg_info
resolution=highest
if [ "${{ matrix.python-version }}" == "3.9" ]; then
resolution=lowest-direct
fi
python3 -m uv pip install --resolution=$resolution --system "coqui-tts[dev,server,languages] @ ."
- name: Unit tests
run: make ${{ matrix.subset }}
- name: Upload coverage data
uses: actions/upload-artifact@v4
with:
include-hidden-files: true
name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
path: .coverage.*
if-no-files-found: ignore
11 changes: 8 additions & 3 deletions README.md
@@ -4,10 +4,10 @@
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/dev/models/bark.html)
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/dev/models/tortoise.html)
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/tortoise.html)

<div align="center">
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
@@ -55,6 +55,10 @@ Please use our dedicated channels for questions and discussion. Help is much mor
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials

The [issues](https://github.com/coqui-ai/TTS/issues) and
[discussions](https://github.com/coqui-ai/TTS/discussions) in the original
repository are also still a useful source of information.


## 🔗 Links and Resources
| Type | Links |
@@ -143,6 +147,7 @@ If you plan to code or train models, clone 🐸TTS and install it locally.

```bash
git clone https://github.com/idiap/coqui-ai-TTS
cd coqui-ai-TTS
pip install -e .
```

1 change: 0 additions & 1 deletion TTS/.models.json
@@ -48,7 +48,6 @@
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
],
"default_vocoder": null,
6 changes: 3 additions & 3 deletions TTS/bin/compute_attention_masks.py
@@ -8,14 +8,14 @@
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from trainer.io import load_checkpoint

from TTS.config import load_config
from TTS.tts.datasets.TTSDataset import TTSDataset
from TTS.tts.models import setup_model
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
from TTS.utils.io import load_checkpoint

if __name__ == "__main__":
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
@@ -35,7 +35,7 @@
--data_path /root/LJSpeech-1.1/
--batch_size 32
--dataset ljspeech
--use_cuda True
--use_cuda
""",
formatter_class=RawTextHelpFormatter,
)
@@ -62,7 +62,7 @@
help="Dataset metafile inclusing file paths with transcripts.",
)
parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")

parser.add_argument(
"--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
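
For context, and not part of this commit: the `type=bool` arguments removed above are a well-known argparse pitfall, because `bool()` applied to any non-empty string (including `"False"`) returns `True`. A minimal standalone sketch of the problem:

```python
import argparse

parser = argparse.ArgumentParser()
# The pattern being removed: `type=bool` calls bool() on the raw string,
# and every non-empty string (even "False" or "0") is truthy.
parser.add_argument("--use_cuda", type=bool, default=False)

print(parser.parse_args(["--use_cuda", "False"]).use_cuda)  # True -- cannot be disabled
print(parser.parse_args(["--use_cuda", "0"]).use_cuda)      # True
print(parser.parse_args([]).use_cuda)                       # False (only via the default)
```
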
2 changes: 1 addition & 1 deletion TTS/bin/compute_embeddings.py
@@ -150,7 +150,7 @@ def compute_embeddings(
default=False,
action="store_true",
)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
parser.add_argument(
"--formatter_name",
4 changes: 2 additions & 2 deletions TTS/bin/eval_encoder.py
@@ -75,8 +75,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
type=str,
help="Path to dataset config file.",
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)

args = parser.parse_args()

2 changes: 1 addition & 1 deletion TTS/bin/extract_tts_spectrograms.py
@@ -282,7 +282,7 @@ def main(args): # pylint: disable=redefined-outer-name
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
args = parser.parse_args()

c = load_config(args.config_path)
10 changes: 5 additions & 5 deletions TTS/bin/remove_silence_using_vad.py
@@ -80,7 +80,7 @@ def preprocess_audios():
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())

parser = argparse.ArgumentParser(
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
)
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -95,20 +95,20 @@ def preprocess_audios():
parser.add_argument(
"-t",
"--trim_just_beginning_and_end",
type=bool,
action=argparse.BooleanOptionalAction,
default=True,
help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
)
parser.add_argument(
"-c",
"--use_cuda",
type=bool,
action=argparse.BooleanOptionalAction,
default=False,
help="If True use cuda",
)
parser.add_argument(
"--use_onnx",
type=bool,
action=argparse.BooleanOptionalAction,
default=False,
help="If True use onnx",
)
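
Again as context rather than part of the diff: a minimal sketch, assuming Python 3.9+, of how the `argparse.BooleanOptionalAction` flags introduced above behave. Each long option gains a paired `--no-...` form, so a flag that defaults to `True` can still be switched off:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--trim_just_beginning_and_end",
    action=argparse.BooleanOptionalAction,
    default=True,  # enabled unless --no-trim_just_beginning_and_end is passed
    help="Trim only the leading and trailing non-speech parts.",
)
parser.add_argument(
    "--use_cuda",
    action=argparse.BooleanOptionalAction,
    default=False,
    help="If True use cuda",
)

args = parser.parse_args(["--no-trim_just_beginning_and_end", "--use_cuda"])
print(args.trim_just_beginning_and_end)  # False
print(args.use_cuda)                     # True
```
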
57 changes: 19 additions & 38 deletions TTS/bin/synthesize.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Command line interface."""

import argparse
import contextlib
@@ -136,30 +137,16 @@
"""


def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
if v.lower() in ("no", "false", "f", "n", "0"):
return False
raise argparse.ArgumentTypeError("Boolean value expected.")


def main():
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())

def parse_args() -> argparse.Namespace:
"""Parse arguments."""
parser = argparse.ArgumentParser(
description=description.replace(" ```\n", ""),
formatter_class=RawTextHelpFormatter,
)

parser.add_argument(
"--list_models",
type=str2bool,
nargs="?",
const=True,
default=False,
action="store_true",
help="list available pre-trained TTS and vocoder models.",
)

@@ -207,7 +194,7 @@ def main():
default="tts_output.wav",
help="Output wav file path.",
)
parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
parser.add_argument(
"--vocoder_path",
@@ -226,10 +213,7 @@ def main():
parser.add_argument(
"--pipe_out",
help="stdout the generated TTS wav file for shell pipe.",
type=str2bool,
nargs="?",
const=True,
default=False,
action="store_true",
)

# args for multi-speaker synthesis
@@ -261,25 +245,18 @@ def main():
parser.add_argument(
"--list_speaker_idxs",
help="List available speaker ids for the defined multi-speaker model.",
type=str2bool,
nargs="?",
const=True,
default=False,
action="store_true",
)
parser.add_argument(
"--list_language_idxs",
help="List available language ids for the defined multi-lingual model.",
type=str2bool,
nargs="?",
const=True,
default=False,
action="store_true",
)
# aux args
parser.add_argument(
"--save_spectogram",
type=bool,
help="If true save raw spectogram for further (vocoder) processing in out_path.",
default=False,
action="store_true",
help="Save raw spectogram for further (vocoder) processing in out_path.",
)
parser.add_argument(
"--reference_wav",
@@ -295,8 +272,8 @@
)
parser.add_argument(
"--progress_bar",
type=str2bool,
help="If true shows a progress bar for the model download. Defaults to True",
action=argparse.BooleanOptionalAction,
help="Show a progress bar for the model download.",
default=True,
)

@@ -337,19 +314,23 @@ def main():
]
if not any(check_args):
parser.parse_args(["-h"])
return args


def main():
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
args = parse_args()

pipe_out = sys.stdout if args.pipe_out else None

with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
# Late-import to make things load faster
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()

tts_path = None
tts_config_path = None
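
For context, not part of the commit: a minimal sketch of the pattern `synthesize.py` moves to here — the custom `str2bool` converter replaced by plain `store_true` flags, `--progress_bar` handled via `BooleanOptionalAction`, and argument parsing factored into a `parse_args()` helper that `main()` calls:

```python
import argparse


def parse_args() -> argparse.Namespace:
    """Parse arguments (sketch of the refactored CLI parsing)."""
    parser = argparse.ArgumentParser(description="Command line interface sketch.")
    # Bare flag: present -> True, absent -> False; no string-to-bool conversion needed.
    parser.add_argument(
        "--list_models",
        action="store_true",
        help="list available pre-trained TTS and vocoder models.",
    )
    # Defaults to True; can be disabled with --no-progress_bar (Python 3.9+).
    parser.add_argument(
        "--progress_bar",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Show a progress bar for the model download.",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    print(f"list_models={args.list_models}, progress_bar={args.progress_bar}")


if __name__ == "__main__":
    main()
```
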