diff --git a/.flake8 b/.flake8 deleted file mode 100644 index d9ad0b40..00000000 --- a/.flake8 +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -ignore = E203, E266, E501, W503, F403, F401 -max-line-length = 79 -max-complexity = 18 -select = B,C,E,F,W,T4,B9 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..39663c55 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,5 @@ +02babfb698a8dfbeb87f5be6ad21172eba82bc05 +ee480fbd24c2d0b1730f5ae4a6be6c6bc842eb94 +1862060ef717c05080c9b47497dc79328563b072 +3416098be96c2e8efee5c5ce1e935711575d2e47 +13435428e87005f168db210019759bf7578ec06f diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml index 10759abf..f272dfa5 100644 --- a/.github/workflows/pythonpublish.yml +++ b/.github/workflows/pythonpublish.yml @@ -9,21 +9,21 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: "3.x" - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel twine + pip install build twine - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python setup.py sdist bdist_wheel + python -m build --sdist --wheel twine upload dist/* - name: Determine tag id: determine_tag diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..4b4b61f7 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,77 @@ +name: Run Tests +on: + - push + - pull_request +jobs: + test: + runs-on: ubuntu-latest + # #no-ci in the commit log flags commits we don't want CI-validated + if: ${{ !contains(github.event.head_commit.message, '#no-ci') }} + steps: + - uses: actions/checkout@v3 + + - uses: FedericoCarboni/setup-ffmpeg@v2 + id: setup-ffmpeg + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.7" + cache: "pip" + + - name: Install Python dependencies + run: | + # Keep pip up to date + python -m pip install --upgrade pip + # Some dependencies are built using wheel + pip install wheel + # Install all Python dependencies in just one pip call, including Studio itself + pip install -r requirements.txt \ + -r requirements.dev.txt \ + -r requirements.ci.txt \ + -e . + + - name: Run tests + run: | + gunicorn readalongs.app:app --bind 0.0.0.0:5000 --daemon + cd test && coverage run run.py prod && coverage xml + + - name: Nitpicking + run: | + # coding style: we want black compliance + find . 
-name \*.py | xargs black --check + # Legal check: make sure we don't have or introduce GPL dependencies + if pip-licenses | grep -v 'Artistic License' | grep -v LGPL | grep GNU; then echo 'Please avoid introducing *GPL dependencies'; false; fi + + - uses: codecov/codecov-action@v3 + with: + directory: ./test + token: ${{ secrets.CODECOV_TOKEN }} # optional but apparently makes upload more reliable + fail_ci_if_error: false # too many upload errors to keep "true" + + test-on-windows: + runs-on: windows-latest + if: ${{ !contains(github.event.head_commit.message, '#no-ci') }} + steps: + - uses: actions/checkout@v3 + + - uses: FedericoCarboni/setup-ffmpeg@v2 + id: setup-ffmpeg + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.7" + cache: "pip" + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install wheel + pip install -r requirements.txt ` + -r requirements.dev.txt ` + -r requirements.ci.txt ` + -e . + + - name: Run tests on Windows + run: cd test && python run.py prod diff --git a/.gitlint b/.gitlint new file mode 100644 index 00000000..f5e52106 --- /dev/null +++ b/.gitlint @@ -0,0 +1,9 @@ +[general] +# Enable conventional commit linting +contrib=contrib-title-conventional-commits + +# Ignore any data sent to gitlint via stdin (helpful on Windows) +ignore-stdin=true + +# We don't require a body, just a title, even though a body is also a good idea +ignore=body-is-missing diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10697319..e91dcd6c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,8 @@ repos: - id: flake8 - repo: local # Using local repos because these won't work for me from remote repo -EJ + # They're also more convenient because we install them via requirements.dev.txt + # and they are then available on the command line as well as in pre-commit. hooks: - id: isort name: isort @@ -25,7 +27,9 @@ repos: language: system types: [python] stages: [commit] -- repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.782' # Use the sha / tag you want to point at - hooks: - id: mypy + name: mypy + entry: mypy + language: system + types: [python] + stages: [commit] diff --git a/.pylintrc b/.pylintrc index 22854677..4d44b940 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,7 +1,14 @@ [MASTER] # A lot of test cases depend on etree, let's allow pylint to load it extension-pkg-allow-list=lxml.etree -# We use isort for sorting our imports, so nevermind what pylint thinks -disable=wrong-import-order + +disable= + # We use isort for sorting our imports, so nevermind what pylint thinks + wrong-import-order, + # I find the "unnecessary" else makes code more readable + no-else-return, + # We use single letter e for exception, f for file handles + invalid-name + # Add . to the PYTHONPATH so pylint knows test cases can import basic_test_case init-hook="import sys; sys.path.append('.')" diff --git a/.readthedocs.yml b/.readthedocs.yml index c926addc..ecbaa4bf 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,13 +1,18 @@ version: 2 build: - os: ubuntu-20.04 - tools: - python: "3.7" + os: ubuntu-20.04 + tools: + python: "3.7" + jobs: + post_install: + - echo "Installing Studio itself in its current state" + - which pip python + - pip install -e . 
sphinx: - configuration: docs/conf.py + configuration: docs/conf.py python: - install: - - requirements: docs/requirements.txt + install: + - requirements: docs/requirements.txt diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..91da82e6 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,46 @@ +cff-version: 1.2.0 +message: >- + If you use this software in a project of yours and write about it, please + cite our SIGUL 2022 paper using the following citation data. +title: ReadAlongs Studio +url: https://github.com/ReadAlongs/Studio +preferred-citation: + type: conference-paper + title: >- + ReadAlong Studio: Practical Zero-Shot Text-Speech Alignment for Indigenous + Language Audiobooks + authors: + - given-names: Patrick + family-names: Littell + email: Patrick.Littell@nrc-cnrc.gc.ca + affiliation: National Research Council Canada + - given-names: Eric + family-names: Joanis + email: Eric.Joanis@nrc-cnrc.gc.ca + affiliation: National Research Council Canada + - given-names: Aidan + family-names: Pine + email: Aidan.Pine@nrc-cnrc.gc.ca + affiliation: National Research Council Canada + - given-names: Marc + family-names: Tessier + email: Marc.Tessier@nrc-cnrc.gc.ca + affiliation: National Research Council Canada + - given-names: David + family-names: Huggins-Daines + email: dhdaines@gmail.com + affiliation: Independent Researcher + - given-names: Delasie + family-names: Torkornoo + email: delasie.torkornoo@carleton.ca + affiliation: Carleton University + collection-title: Proceedings of SIGUL2022 @LREC2022 + start: 23 + end: 32 + year: 2022 + month: 6 + publisher: + name: European Language Resources Association (ELRA) + location: + name: Marseille + url: http://www.lrec-conf.org/proceedings/lrec2022/workshops/SIGUL/pdf/2022.sigul-1.4.pdf diff --git a/Contributing.md b/Contributing.md index bb2fde66..7013855b 100644 --- a/Contributing.md +++ b/Contributing.md @@ -16,15 +16,16 @@ commits. Run these commands in each of your sandboxes to enable our pre-commit hooks and commitlint: ```sh +pip install -r requirements.dev.txt pre-commit install -npm install +gitlint install-hook ``` ## Pre-commit hooks The ReadAlong Studio team has agreed to systematically use a number of pre-commit hooks to normalize formatting of code. You need to install and enable pre-commit to have these used -when you do your own commits. +automatically when you do your own commits. Pre-commit hooks enabled: - check-yaml validates YAML files @@ -60,11 +61,11 @@ don't forget to do so when you clone a new sandbox! ## commitlint The team has also agreed to use commitlint-style commit messages. Install and enable -[commitlint](https://github.com/conventional-changelog/commitlint) to have your commits -validated systematically. +The team has also agreed to use [Conventional Commits](https://www.conventionalcommits.org/). +Install and enable [gitlint](https://jorisroovers.com/gitlint/) to have your +commit messages scanned automatically. -Commitlint commits look like this: +Conventional commits look like this: type(optional-scope): subject (i.e., short description) @@ -107,32 +108,14 @@ These rules are inspired by these commit formatting guides: ### Enabling commitlint -We run commitlint on each commit message that you write by enabling the commit-msg hook in -Git. It is run via [husky](https://www.npmjs.com/package/husky), which is a JS Git hook -manager, and you need Node to run it. 
- -If you don't already use Node, this is a bit more work to install that the pre-commit -hooks above, but please take a moment to do this: +You can run commitlint on each commit message that you write by enabling the +commit-msg hook in Git. -- If you don't already use Node or nvm, or if you don't have admin access to the system - version of node, install nvm in your ~/.nvm folder: -```sh -wget -qO- https://raw.githubusercontent.com/nvm-sh/nvm/v0.35.3/install.sh | bash -``` -This will add a few lines to your `.bashrc` file, which you'll need to execute now, -possibly by starting a new shell. - -- Install Node: -```sh -nvm install node -``` +Run these commands in your ReadAlongs/Studio sandbox to install and enable the commit-msg hook: -- In your ReadAlong/Studio sandbox, install the husky commit-msg hook using npm, the node - package manager you just installed using nvm. The file `package.json` in Studio is what - tells npm to install husky as a pre-commit hook, and also what tells husky to invoke - commitlint on your commit messages. ```sh -npm install +pip install -r requirements.dev.txt +gitlint install-hook ``` - Now, next time you make a change and commit it, your commit log will be checked: diff --git a/Dockerfile b/Dockerfile index b850a467..8b97eb14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,40 +3,46 @@ FROM debian:bullseye-slim ENV APPHOME /opt/readalong-studio ENV PORT 5000 -# Install system dependencies -# - swig: required by pocketsphinx -# - libpulse-dev: required by pocketsphinx -# - portaudio19-dev: required by pocketsphinx RUN apt-get update && apt-get install -y \ +# Lean, optimized installation of system dependencies +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --yes \ python3 \ python3-pip \ git \ - swig \ - libpulse-dev \ - portaudio19-dev \ ffmpeg \ - vim-nox + vim-nox \ + less \ + && apt-get clean \ + && apt-get autoremove \ + && rm -fr /var/lib/apt/lists/* # Install 3rd party dependencies in their own layer, for faster rebuilds when we # change ReadAlong-Studio source code -RUN python3 -m pip install gevent -ADD requirements.txt $APPHOME/requirements.txt -RUN python3 -m pip install -r $APPHOME/requirements.txt -# RUN python3 -m pip install gunicorn # If you want to run production server +ADD requirements.* $APPHOME/ +RUN python3 -m pip install --upgrade pip \ + && python3 -m pip install -r $APPHOME/requirements.txt \ + && python3 -m pip install gevent # We don't want Docker to cache the installation of g2p or Studio, so place them # after COPY . $APPHOME, which almost invariably invalidates the cache. COPY . $APPHOME WORKDIR $APPHOME # Get and install the latest g2p -RUN git clone https://github.com/roedoejet/g2p.git -RUN cd g2p && python3 -m pip install -e . +RUN git clone https://github.com/roedoejet/g2p.git \ + && cd g2p \ + && python3 -m pip install -e . + # Install ReadAlong-Studio itself RUN python3 -m pip install -e . 
-# Run the default gui (on localhost:5000) +# Run the default gui (on localhost:5000, make sure you use -p 5000:5000 when +# you docker run the image) CMD python3 ./run.py # For a production server, comment out the default gui CMD above, and run the # gui using gunicorn instead: -# CMD gunicorn -k gevent -w 1 readalongs.app:app --bind 0.0.0.0:5000 +# CMD gunicorn -k gevent -w 1 readalongs.app:app --bind 0.0.0.0:$PORT + +# For the web API, use this CMD instead; it is the same as on our Heroku deployment, +# except that it binds to port 5000 +# CMD gunicorn -w 4 -k uvicorn.workers.UvicornWorker readalongs.web_api:web_api_app --bind 0.0.0.0:$PORT diff --git a/LICENSE b/LICENSE index bee9b87b..162daf13 100644 --- a/LICENSE +++ b/LICENSE @@ -2,6 +2,7 @@ MIT License Copyright (c) 2019 David Huggins-Daines Copyright (c) 2019-2021 National Research Council Canada +Acoustic model in readalongs/static/model/cmusphinx-en-us-5.2 Copyright (c) 2015 Alpha Cephei Inc. licensed under the FreeBSD License; see README in that directory. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Procfile b/Procfile new file mode 100644 index 00000000..f7b08354 --- /dev/null +++ b/Procfile @@ -0,0 +1,2 @@ +# Command for launching the web API server for ReadAlongs-Studio on Heroku +web: gunicorn -w 4 -k uvicorn.workers.UvicornWorker readalongs.web_api:web_api_app diff --git a/README.md b/README.md index e240bb3f..173ee2c3 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # ReadAlong-Studio [![codecov](https://codecov.io/gh/ReadAlongs/Studio/branch/master/graph/badge.svg)](https://codecov.io/gh/ReadAlongs/Studio) -[![Build Status](https://travis-ci.com/ReadAlongs/Studio.svg?branch=master)](https://travis-ci.com/github/ReadAlongs/Studio) +[![Build Status](https://github.com/readalongs/Studio/actions/workflows/tests.yml/badge.svg?branch=master)](https://github.com/ReadAlongs/Studio/actions) [![PyPI package](https://img.shields.io/pypi/v/readalongs.svg)](https://pypi.org/project/readalongs/) [![GitHub license](https://img.shields.io/github/license/ReadAlongs/Studio)](https://github.com/ReadAlongs/Studio/blob/master/LICENSE) [![standard-readme compliant](https://img.shields.io/badge/readme%20style-standard-brightgreen.svg?style=flat-square)](https://github.com/ReadAlongs/Studio) @@ -22,7 +22,8 @@ This library is an end-to-end audio/text aligner. It is meant to be used togethe - [Validation](#Verifying-your-installation) - [Usage](#usage) - [CLI](#cli) - - [Studio](#Studio-web-application) + - [Web API](#web-api) + - [Studio](#studio-web-application) - [Docker](#docker) - [Maintainers](#maintainers) - [Contributing](#contributing) @@ -154,9 +155,25 @@ Basic alignment is done with the following command. `readalongs align TEXTFILE WAVFILE OUTPUTNAME` +### Web API + +This section lists only the most basic endpoints. + +For more information about how the web API works, consult the interactive [API Documentation](https://readalong-studio.herokuapp.com/api/v1/docs). + +For information on spinning up your own dev Web API server locally, have a look at [web\_api.py](readalongs/web_api.py). 
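+Here is a minimal Python sketch of how a client might call the two endpoints
+described below. It is illustrative only: the payload field names used for
+`/assemble` are assumptions, so consult the interactive API Documentation
+linked above for the authoritative request and response schemas.
+
+```python
+import requests
+
+API_BASE = "https://readalong-studio.herokuapp.com/api/v1"
+
+# List the languages supported by the aligner (GET /langs)
+response = requests.get(f"{API_BASE}/langs", timeout=10)
+response.raise_for_status()
+print(response.json())
+
+# Ask the API to assemble alignment data for a plain-text story (POST /assemble).
+# "text" and "text_languages" are assumed field names for this sketch.
+payload = {"text": "Bonjour le monde", "text_languages": ["fra", "und"]}
+response = requests.post(f"{API_BASE}/assemble", json=payload, timeout=30)
+response.raise_for_status()
+print(sorted(response.json().keys()))
+```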
+ +#### /langs + +To query a list of available languages in the ReadAlong Studio API, send a GET request to https://readalong-studio.herokuapp.com/api/v1/langs + +#### /assemble + +This endpoint is a remote procedure call that assembles the data needed to build a readalong using the JavaScript-based [SoundSwallower decoder](https://github.com/ReadAlongs/SoundSwallower). It accepts POST requests with either plaintext or XML input. Please see the [documentation](https://readalong-studio.herokuapp.com/api/v1/docs) for more information. + ### Studio web application -ReadAlong-Studio has a web interface for creating interactive audiobooks. The web app can be served by first installing ReadAlong-Studio and then running `readalongs run`. A web app will then be available on port 5000. +ReadAlong-Studio has a web interface for creating interactive audiobooks. The web app can be served by first installing ReadAlong-Studio and then running `python3 run.py`. A web app will then be available on port 5000. ### Docker @@ -202,7 +219,17 @@ Feel free to dive in! [Open an issue](https://github.com/ReadAlongs/Studio/issue This repo follows the [Contributor Covenant](http://contributor-covenant.org/version/1/3/0/) Code of Conduct. -Have a look at [Contributing.md](Contributing.md) for help getting started. +You can install our standard Git hooks by running these commands in your sandbox: + +```sh +pip install -r requirements.dev.txt +pre-commit install +gitlint install-hook +``` + +Have a look at [Contributing.md](Contributing.md) for the full details on the +Conventional Commit messages we prefer, our code formatting conventions, and +our Git hooks. ### Contributors @@ -218,6 +245,24 @@ Here is a partial list: Project web page: [ReadAlong Studio: Application for Indigenous audiobooks and videos project](https://nrc.canada.ca/en/research-development/research-collaboration/programs/readalong-studio-application-indigenous-audiobooks-videos-project) +### Citation + +If you use this software in a project of yours and write about it, please cite +us using the following: + +``` +@inproceedings{Littell_ReadAlong_Studio_Practical_2022, + author = {Littell, Patrick and Joanis, Eric and Pine, Aidan and Tessier, Marc and Huggins-Daines, David and Torkornoo, Delasie}, + booktitle = {Proceedings of SIGUL2022 @LREC2022}, + title = {{ReadAlong Studio: Practical Zero-Shot Text-Speech Alignment for Indigenous Language Audiobooks}}, + year = {2022}, + month = {6}, + pages = {23--32}, + publisher = {European Language Resources Association (ELRA)}, + url = {http://www.lrec-conf.org/proceedings/lrec2022/workshops/SIGUL/pdf/2022.sigul-1.4.pdf} +} +``` + ## License -[MIT](LICENSE) © 2019-2021 David Huggins-Daines and National Research Council Canada +[MIT](LICENSE) © 2019-2022 David Huggins-Daines and National Research Council Canada diff --git a/docs/README.md b/docs/README.md index 873aa2c4..75d6e6e9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,28 +5,35 @@ To contribute to the ReadAlongs Studio documentation, edit the `.rst` files in this folder. -## Build the documentation locally +## Build and view the documentation locally -To build the documention for local inspection, run one of these commands, -which will build the documentation in `./_build/html/` or -`./_build/singlehtml/`: +To build the documentation and review your own changes locally: - make html # multi-page HTML site - make singlehtml # single-page HTML document +1. 
Install the required build software, Sphinx: -## View the documentation locally + pip install -r requirements.txt -To view the documentation, run an HTTP server in the directory where the build -is found, e.g., +2. Install Studio itself - cd _build/html - python3 -m http.server + (cd .. && pip install -e .) -and navigate to http://127.0.0.1:8000 to view the results (or whatever port -your local web server displays). +3. Run one of these commands, which will build the documentation in `./_build/html/` + or `./_build/singlehtml/`: + + make html # multi-page HTML site + make singlehtml # single-page HTML document + +4. View the documentation by running an HTTP server in the directory where the + build is found, e.g., + + cd _build/html + python3 -m http.server + + and navigating to http://127.0.0.1:8000 (or whatever port your local web + server displays). ## Publish the changes Once your changes are pushed to GitHub and merged into `master` via a Pull -Request, the documentation will be automatically built and published to +Request, the documentation will automatically get built and published to https://readalong-studio.readthedocs.io/en/latest/ diff --git a/docs/advanced-use.rst b/docs/advanced-use.rst index e60450c0..2f6e3e23 100644 --- a/docs/advanced-use.rst +++ b/docs/advanced-use.rst @@ -1,7 +1,24 @@ .. _advanced-use: -Data pre-processing and troubleshooting -======================================= +Advanced topics +=============== + +.. _adding-a-lang: + +Adding a new language to g2p +---------------------------- + +If you want to align an audio book in a language that is not yet supported by +the g2p library, you will have to write your own g2p mapping for that language. + +References: + - The `g2p library <https://github.com/roedoejet/g2p>`__ and its + `documentation `__. + - The `7-part blog post on creating g2p mappings `__ on the `Mother Tongues Blog `__. + +Once you have created a g2p mapping for your language, please consider +`contributing it to the project `__ +so others can also benefit from your work! Pre-processing your data ------------------------ @@ -77,86 +94,3 @@ pre-processing. num2words 123456789 one hundred and twenty-three million, four hundred and fifty-six thousand, seven hundred and eighty-nine - -Troubleshooting --------------- - -Here are three types of common errors you may encounter when trying to -run ReadAlongs, and ways to debug them. - -Phones missing in the acoustic model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You may get an error that looks like this:|image1| - -The general structure of your error would look like -``Phone [character] is missing in the acoustic model; word [index] ignored`` -This error is most likely caused not by a bug in your ReadAlong input -files, but by an error in one of your g2p mappings. The error message is -saying that there is a character in your ReadAlong text that is not -being properly converted to English-arpabet (eng-arpabet), which is the -language ReadAlong uses to map text to sound. Thus, ReadAlong cannot -match your text to a corresponding sound (phone) in your audio file -because it cannot understand what sound the text is meant to represent. -Follow these steps to debug the issue **in g2p**. - -1. Identify which characters in each line of the error message are - **not** being converted to eng-arpabet. These will either be: - - a. characters that are not in caps (for example ``g`` in the string - ``gUW`` in the error message shown above.) - b. 
a character not traditionally used in English (for example é or Ŧ, - or ``ʰ`` in the error message shown above.) You can confirm you - have isolated the right characters by ensuring every other - character in your error message appears as an **output** in the - `eng-ipa-to-arpabet - mapping `__. - These are the problematic characters we need to debug in the error - message shown above: ``g`` and ``ʰ``. - -2. Once you have isolated the characters that are not being converted to - eng-arpabet, you are ready to begin debugging the issue. Start at - step 3 below for each problematic character. - -3. Our next step is to identify which mapping is converting the - problematic characters incorrectly. Most of the time, the issue will - be in either the first or the second of the following mappings: - - i. *xyz-ipa* (where xyz is the ISO language code for your mapping) - ii. *xyz-equiv* (if you have one) - iii. *xyz-ipa_to_eng-ipa* (this mapping must be generated - automatically in g2p. Refer //here_in_the_guide to see how to do - this.) - iv. `eng-ipa-to-arpabet - mapping `__ - (The issue is rarely found here, but it doesn’t hurt to check.) - -4. Find a word in your text that uses the problematic character. For the - sake of example, let us assume the character I am debugging is ``g``, - that appears in the word "dog", in language "xyz". - -5. Make sure you are in the g2p repository and run the word through - ``g2p convert`` to confirm you have isolated the correct characters - to debug: ``g2p convert dog xyz eng-arpabet``. Best practice is to - copy+paste the word directly from your text instead of retyping it. - Make sure to use the ISO code for your language in place of "xyz". - *If the word converts cleanly into eng-arpabet characters, your issue - does not lie in your mapping. //Refer to other potential RA issues* - -6. From the result of the command run in 5, note the characters that do - **not** appear as **inputs** in the `eng-ipa-to-arpabet - mapping `__. - These are the characters that have not been converted into characters - that eng-ipa-to-arpabet can read. These should be the same characters - you identified in step 2. - -7. Run ``g2p convert dog xyz xyz-ipa``. Ensure the result is what you - expect. If not, your error may arise from a problem in this mapping. - refer_to_g2p_troubleshooting. If the result is what you expect, - continue to the next step. - -8. Note the result from running the command in 7. Check that the - characters [TODO-fix this text] (appear/being mapped by generated -- - use debugger or just look at mapping) - -.. |image1| image:: https://i.imgur.com/vKPhTud.png diff --git a/docs/cli-guide.rst b/docs/cli-guide.rst index 6477aa37..956b6e12 100644 --- a/docs/cli-guide.rst +++ b/docs/cli-guide.rst @@ -6,10 +6,10 @@ Command line interface (CLI) user guide This page contains guidelines on using the ReadAlongs CLI. See also :ref:`cli-ref` for the full CLI reference. -The ReadAlongs CLI has two main commands: ``readalongs prepare`` and +The ReadAlongs CLI has two main commands: ``readalongs make-xml`` and ``readalongs align``. -- If your data is a plain text file, you can run ``prepare`` to turn it into +- If your data is a plain text file, you can run ``make-xml`` to turn it into XML, which you can then align with ``align``. 
Doing this in two steps allows you to modify the XML file before aligning it (e.g., to mark that some text is in a different language, to flag some do-not-align text, or to drop anchors @@ -22,7 +22,7 @@ The ReadAlongs CLI has two main commands: ``readalongs prepare`` and Two additional commands are sometimes useful: ``readalongs tokenize`` and ``readalongs g2p``. -- ``tokenize`` takes the output of ``prepare`` and tokenizes it, wrapping each +- ``tokenize`` takes the output of ``make-xml`` and tokenizes it, wrapping each word in the text in a ``<w>`` element. - ``g2p`` takes the output of ``tokenize`` and maps each word to its @@ -33,12 +33,12 @@ Two additional commands are sometimes useful: ``readalongs tokenize`` and The result of ``tokenize`` or ``g2p`` can be fixed manually if necessary and then used as input to ``align``. -Getting from TXT to XML with readalongs prepare +Getting from TXT to XML with readalongs make-xml ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Run :ref:`cli-prepare` to prepare an XML file for ``align`` from a TXT file. +Run :ref:`cli-make-xml` to make the XML file for ``align`` from a TXT file. -``readalongs prepare [options] [story.txt] [story.xml]`` +``readalongs make-xml [options] [story.txt] [story.xml]`` ``[story.txt]``: path to the plain text input file (TXT) @@ -60,19 +60,19 @@ breaks are marked by two blank lines. | | and will be aligning repeatedly) | +-----------------------------------+-----------------------------------------------+ | ``-h, --help`` | Displays CLI guide for | -| | ``prepare`` | +| | ``make-xml`` | +-----------------------------------+-----------------------------------------------+ The ``-l, --language`` argument requires a language’s 3 character `ISO code `__ as an argument. -The languages supported by RAS can be listed by running ``readalongs prepare -h`` -and they can also be found in the :ref:`cli-prepare` reference. +The languages supported by RAS can be listed by running ``readalongs make-xml -h`` +and they can also be found in the :ref:`cli-make-xml` reference. So, a full command for a story in Algonquin, with an implicit g2p fallback to Undetermined, would be something like: -``readalongs prepare -l alq Studio/story.txt Studio/story.xml`` +``readalongs make-xml -l alq Studio/story.txt Studio/story.xml`` The generated XML will be parsed into sentences. At this stage you can edit the XML to make any modifications, such as adding ``do-not-align`` as an attribute of any element in the xml (word, sentence, paragraph, or page), e.g., ``<w do-not-align="true">dog</w>``. -If you have already run ``readalongs prepare``, there will be +If you have already run ``readalongs make-xml``, there will be documentation for DNA text in comments at the beginning of the generated xml file. @@ -155,7 +155,7 @@ created, as ``output_base*`` | | configuration file (in JSON | | | format) | +-----------------------------------+-----------------------------------------------+ -| ``--g2p-verbose`` | Display verbose g2p error messages | +| ``--debug-g2p`` | Display verbose g2p debugging messages | +-----------------------------------+-----------------------------------------------+ | ``-s, --save-temps`` | Save intermediate stages of | | | processing and temporary files | @@ -184,13 +184,35 @@ A full command could be something like: - With other extensions, the beginning of the file is examined to automatically determine if it's XML or plain text. 
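+The exact detection logic lives inside ``readalongs align``, but an
+illustrative sketch of this kind of XML-vs-text sniffing (an assumption,
+not the actual implementation) could look like this:
+
+.. code-block:: python
+
+    def looks_like_xml(path: str, sniff_bytes: int = 64) -> bool:
+        """Guess whether a file is XML from its first non-blank byte."""
+        with open(path, "rb") as f:
+            head = f.read(sniff_bytes)
+        if head.startswith(b"\xef\xbb\xbf"):  # skip a UTF-8 BOM if present
+            head = head[3:]
+        return head.lstrip().startswith(b"<")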
-The config.json file -~~~~~~~~~~~~~~~~~~~~ +Supported languages +~~~~~~~~~~~~~~~~~~~ -Some additional parameters can be specified via a config file: create a JSON -file called ``config.json``, possibly in the same folder as your other ReadAlong -input files for convenience. The config file currently accepts two components: -adding images to your ReadAlongs, and DNA audio (see :ref:`dna`). +The ``readalongs langs`` command can be used to list all supported languages. + +Here is that list at the time of compiling this documentation: + +.. command-output:: readalongs langs + +See :ref:`adding-a-lang` for references on adding new languages to that list. + + +Adding titles, images and do-not-align segments via the config.json file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some additional parameters can be specified via a config file: create +a JSON file called ``config.json``, possibly in the same folder as +your other ReadAlong input files for convenience. The config file +currently accepts a few components: adding titles and headers, adding +images to your ReadAlongs, and DNA audio (see :ref:`dna`). + +To add a title and headers to the output HTML, you can use the keys +`"title"`, `"header"`, and `"subheader"`, for example:: + + { + "title": "My awesome read-along", + "header": "A story in my language", + "subheader": "Read by me" + } To add images, indicate the page number as the key, and the name of the image file as the value, as an entry in the ``"images"`` dictionary. @@ -279,7 +301,7 @@ falling back to ``eng`` and then ``und`` (see below) when needed. .. code-block:: bash - readalongs prepare -l fra,eng myfile.txt myfile.xml + readalongs make-xml -l fra,eng myfile.txt myfile.xml readalongs align -l fra,eng myfile.txt myfile.wav output-dir The "Undetermined" language code: und @@ -296,7 +318,7 @@ most text with a few foreign words without any manual intervention. Since we recommend systematically using ``und`` at the end of the cascade, it is now added by default after the languages specified with the ``-l`` -switch to both ``readalongs align`` and ``readalongs prepare``. Note that +switch to both ``readalongs align`` and ``readalongs make-xml``. Note that adding other languages after ``und`` will have no effect, since the Undetermined mapping will map any string to valid ARPABET. @@ -311,7 +333,7 @@ The warning messages issued by ``readalongs g2p`` and ``readalongs align`` indicate which words are causing g2p problems and what fallbacks were tried. It can be worth inspecting the input text to fix any encoding or spelling errors highlighted by these warnings. More detailed messages can be -produced by adding the ``--g2p-verbose`` switch, to obtain a lot more +produced by adding the ``--debug-g2p`` switch, to obtain a lot more information about g2p'ing words in each language in which g2p was unsuccessfully attempted. @@ -325,7 +347,7 @@ The following series of commands: :: - readalongs prepare -l l1,l2 file.txt file.xml + readalongs make-xml -l l1,l2 file.txt file.xml readalongs tokenize file.xml file.tokenized.xml readalongs g2p file.tokenized.xml file.g2p.xml readalongs align file.g2p.xml file.wav output @@ -354,7 +376,7 @@ Anchor syntax ^^^^^^^^^^^^^ Anchors are inserted in the XML file (the output of -``readalongs prepare``, ``readalongs tokenize`` or ``readalongs g2p``) +``readalongs make-xml``, ``readalongs tokenize`` or ``readalongs g2p``) using the following syntax: ``<anchor time="2.5s"/>`` or ``<anchor time="2500ms"/>``. The time can be specified in seconds (this is the default) or milliseconds. 
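+If a readalong needs many anchors, it can be convenient to insert them
+programmatically. The helper below is a hypothetical example using lxml
+(the file names are placeholders); only the ``anchor`` element and its
+``time`` attribute come from the syntax described above:
+
+.. code-block:: python
+
+    from lxml import etree
+
+    def add_anchor_before(word_element, time_str: str) -> None:
+        """Insert an <anchor time="..."/> element just before a word element."""
+        anchor = etree.Element("anchor", time=time_str)
+        word_element.addprevious(anchor)
+
+    tree = etree.parse("story.tokenized.xml")
+    words = tree.findall(".//w")
+    add_anchor_before(words[10], "2.5s")  # time found by listening to the audio
+    tree.write("story.anchored.xml", encoding="utf-8", xml_declaration=True)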
diff --git a/docs/cli-ref.rst b/docs/cli-ref.rst index cb89f728..34afa0df 100644 --- a/docs/cli-ref.rst +++ b/docs/cli-ref.rst @@ -6,13 +6,14 @@ Command line interface (CLI) reference This page contains the full reference documentation for each command in the CLI. See also :ref:`cli-guide` for guidelines on using the CLI. -The ReadAlongs CLI has four key commands: +The ReadAlongs CLI has five key commands: - :ref:`cli-align`: full alignment pipeline, from plain text or XML to a viewable readalong -- :ref:`cli-prepare`: convert a plain text file into XML, for align -- :ref:`cli-tokenize`: tokenize a prepared XML file +- :ref:`cli-make-xml`: convert a plain text file into XML, for align +- :ref:`cli-tokenize`: tokenize an XML file - :ref:`cli-g2p`: g2p a tokenized XML file +- :ref:`cli-langs`: list supported languages Each command can be run with ``-h`` or ``--help`` to display its usage manual, e.g., ``readalongs -h``, ``readalongs align --help``. @@ -21,9 +22,9 @@ e.g., ``readalongs -h``, ``readalongs align --help``. .. click:: readalongs.cli:align :prog: readalongs align -.. _cli-prepare: -.. click:: readalongs.cli:prepare - :prog: readalongs prepare +.. _cli-make-xml: +.. click:: readalongs.cli:make_xml + :prog: readalongs make-xml .. _cli-tokenize: .. click:: readalongs.cli:tokenize :prog: readalongs tokenize @@ -32,3 +33,7 @@ e.g., ``readalongs -h``, ``readalongs align --help``. .. _cli-g2p: .. click:: readalongs.cli:g2p :prog: readalongs g2p + +.. _cli-langs: +.. click:: readalongs.cli:langs + :prog: readalongs langs diff --git a/docs/conf.py b/docs/conf.py index 3a58f054..af4cd1ef 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ # -- Project information ----------------------------------------------------- project = "ReadAlongs-Studio" -copyright = "2019-2021 David Huggins-Daines and National Research Council Canada" +copyright = "2019-2022 David Huggins-Daines and National Research Council Canada" author = "David Huggins-Daines, Eric Joanis, Patrick Littell, Aidan Pine" # The short X.Y version @@ -45,6 +45,7 @@ "sphinx.ext.todo", "sphinx.ext.coverage", "sphinx_click.ext", + "sphinxcontrib.programoutput", ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/index.rst b/docs/index.rst index 3abf9e44..a99a1d92 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,7 +1,9 @@ -Welcome to ReadAlong-Studio's documentation! -============================================ +Welcome to ReadAlong-Studio's documentation +=========================================== -.. note:: ReadAlong-Studio is UNDER CONSTRUCTION and should not be expected to be fully documented or even work as expected! Check back soon for more information. +Audiobook alignment for Indigenous languages + +This site provides the full user documentation for ReadAlongs-Studio. .. toctree:: :maxdepth: 2 @@ -13,6 +15,7 @@ Welcome to ReadAlong-Studio's documentation! cli-ref outputs advanced-use + troubleshooting Indices and tables diff --git a/docs/outputs.rst b/docs/outputs.rst index 0d236f7e..e3098d9f 100644 --- a/docs/outputs.rst +++ b/docs/outputs.rst @@ -35,8 +35,8 @@ Below is an example of a minimal implementation in a basic standalone html page. 
- - + + diff --git a/docs/requirements.txt b/docs/requirements.txt index 8109c985..a85ae383 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ Sphinx guzzle_sphinx_theme sphinx-click +sphinxcontrib-programoutput -r ../requirements.txt diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst new file mode 100644 index 00000000..6f395fb3 --- /dev/null +++ b/docs/troubleshooting.rst @@ -0,0 +1,102 @@ +.. _troubleshooting: + +.. note:: This troubleshooting guide is under construction. + +Troubleshooting +=============== + +Here are three types of common errors you may encounter when trying to +run ReadAlongs, and ways to debug them. + +Phones missing in the acoustic model +------------------------------------ + +.. note:: Troubleshooting item under construction + +You may get an error that looks like this: |image1| + +The general structure of your error would look like +``Phone [character] is missing in the acoustic model; word [index] ignored`` +This error is most likely caused not by a bug in your ReadAlong input +files, but by an error in one of your g2p mappings. The error message is +saying that there is a character in your ReadAlong text that is not +being properly converted to English-arpabet (eng-arpabet), which is the +language ReadAlong uses to map text to sound. Thus, ReadAlong cannot +match your text to a corresponding sound (phone) in your audio file +because it cannot understand what sound the text is meant to represent. +Follow these steps to debug the issue **in g2p**. + +1. Identify which characters in each line of the error message are + **not** being converted to eng-arpabet. These will either be: + + a. characters that are not in caps (for example ``g`` in the string + ``gUW`` in the error message shown above.) + b. a character not traditionally used in English (for example é or Ŧ, + or ``ʰ`` in the error message shown above.) You can confirm you + have isolated the right characters by ensuring every other + character in your error message appears as an **output** in the + `eng-ipa-to-arpabet + mapping `__. + These are the problematic characters we need to debug in the error + message shown above: ``g`` and ``ʰ``. + +2. Once you have isolated the characters that are not being converted to + eng-arpabet, you are ready to begin debugging the issue. Start at + step 3 below for each problematic character. + +3. Our next step is to identify which mapping is converting the + problematic characters incorrectly. Most of the time, the issue will + be in either the first or the second of the following mappings: + + i. *xyz-ipa* (where xyz is the ISO language code for your mapping) + ii. *xyz-equiv* (if you have one) + iii. *xyz-ipa_to_eng-ipa* (this mapping must be generated + automatically in g2p. Refer //here_in_the_guide to see how to do + this.) + iv. `eng-ipa-to-arpabet + mapping `__ + (The issue is rarely found here, but it doesn’t hurt to check.) + +4. Find a word in your text that uses the problematic character. For the + sake of example, let us assume the character I am debugging is ``g``, + that appears in the word "dog", in language "xyz". + +5. Make sure you are in the g2p repository and run the word through + ``g2p convert`` to confirm you have isolated the correct characters + to debug: ``g2p convert dog xyz eng-arpabet``. Best practice is to + copy+paste the word directly from your text instead of retyping it. + Make sure to use the ISO code for your language in place of "xyz". 
+ *If the word converts cleanly into eng-arpabet characters, your issue + does not lie in your mapping. //Refer to other potential RA issues* + +6. From the result of the command run in 5, note the characters that do + **not** appear as **inputs** in the `eng-ipa-to-arpabet + mapping `__. + These are the characters that have not been converted into characters + that eng-ipa-to-arpabet can read. These should be the same characters + you identified in step 2. + +7. Run ``g2p convert dog xyz xyz-ipa``. Ensure the result is what you + expect. If not, your error may arise from a problem in this mapping. + refer_to_g2p_troubleshooting. If the result is what you expect, + continue to the next step. + +8. Note the result from running the command in 7. Check that the + characters [TODO-fix this text] (appear/being mapped by generated -- + use debugger or just look at mapping) + +.. |image1| image:: https://i.imgur.com/vKPhTud.png + +Type 2 +------ + +.. note:: TODO + +Common error type 2... + +Type 3 +------ + +.. note:: TODO + +Common error type 3... diff --git a/misc-utils/README.md b/misc-utils/README.md index ae01f8f9..27c01792 100644 --- a/misc-utils/README.md +++ b/misc-utils/README.md @@ -19,7 +19,7 @@ categories under "Sonority Hierarchy" to support other languages. Must be called manually after readalongs tokenize and before readalongs align or readalongs g2p: - readalongs prepare -l my_lang file.txt file.xml + readalongs make-xml -l my_lang file.txt file.xml readalongs tokenize file.xml file-tok.xml ./syll_parse.py file-tok.xml file-tok-syll.xml diff --git a/misc-utils/non-caching-server-3.7.py b/misc-utils/non-caching-server-3.7.py index e9d47cf7..a1859494 100755 --- a/misc-utils/non-caching-server-3.7.py +++ b/misc-utils/non-caching-server-3.7.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # This script is copied and modified from # https://github.com/python/cpython/blob/3.7/Lib/http/server.py @@ -93,7 +93,7 @@ # refresh will not fetch manually updated pages. # # Running this script in a root web site folder is equivalent to running -# python3 -m http.server +# python -m http.server # in that folder, except that pages won't get cached. # # - Eric Joanis, 2021: diff --git a/misc-utils/non-caching-server-3.9.py b/misc-utils/non-caching-server-3.9.py index 07b92e16..3a740259 100755 --- a/misc-utils/non-caching-server-3.9.py +++ b/misc-utils/non-caching-server-3.9.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # This script is copied and modified from # https://github.com/python/cpython/blob/3.9/Lib/http/server.py @@ -93,7 +93,7 @@ # refresh will not fetch manually updated pages. # # Running this script in a root web site folder is equivalent to running -# python3 -m http.server +# python -m http.server # in that folder, except that pages won't get cached. 
# # - Eric Joanis, 2021: diff --git a/misc-utils/syll_parse.py b/misc-utils/syll_parse.py index db3a5060..2a6579c6 100755 --- a/misc-utils/syll_parse.py +++ b/misc-utils/syll_parse.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # Original Copyright and License from https://github.com/alexestes/SonoriPy: # diff --git a/package.json b/package.json deleted file mode 100644 index b153b672..00000000 --- a/package.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "name": "readalongs", - "version": "1.0.0", - "description": "", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "husky": { - "hooks": { - "commit-msg": "commitlint -E HUSKY_GIT_PARAMS" - } - }, - "commitlint": { - "extends": [ - "@commitlint/config-conventional" - ] - }, - "repository": { - "type": "git", - "url": "git+https://github.com/ReadAlongs/Studio.git" - }, - "keywords": [], - "author": "", - "license": "", - "bugs": { - "url": "https://github.com/ReadAlongs/Studio/issues" - }, - "homepage": "https://github.com/ReadAlongs/Studio#readme", - "dependencies": { - "@commitlint/config-conventional": "^8.3.4", - "commitlint": "^8.3.5", - "husky": "^4.2.3" - } -} diff --git a/readalongs/_version.py b/readalongs/_version.py index 6db21f19..42a9f433 100644 --- a/readalongs/_version.py +++ b/readalongs/_version.py @@ -1 +1 @@ -__version__ = "0.2.20211122" +__version__ = "0.2.20220705" diff --git a/readalongs/align.py b/readalongs/align.py index 9b93ce0d..c9125f3d 100644 --- a/readalongs/align.py +++ b/readalongs/align.py @@ -4,13 +4,13 @@ import io import os import shutil +import sys from collections import defaultdict from dataclasses import dataclass from datetime import timedelta -from typing import Dict, List, Union +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union import chevron -import regex as re import soundswallower from lxml import etree from pydub import AudioSegment @@ -42,7 +42,16 @@ from readalongs.text.make_package import create_web_component_html from readalongs.text.make_smil import make_smil from readalongs.text.tokenize_xml import tokenize_xml -from readalongs.text.util import parse_time, save_minimal_index_html, save_txt, save_xml +from readalongs.text.util import ( + get_word_text, + parse_time, + save_minimal_index_html, + save_txt, + save_xml, +) + +MODEL_DIR = os.path.join(os.path.dirname(__file__), "static", "model") +DEFAULT_ACOUSTIC_MODEL = "cmusphinx-en-us-5.2" @dataclass @@ -62,11 +71,13 @@ class WordSequence: words: List -def get_sequences(xml, xml_filename, unit="w", anchor="anchor") -> List[WordSequence]: +def get_sequences( + xml, xml_filename="memory", unit="w", anchor="anchor" +) -> List[WordSequence]: """Return the list of anchor-separated word sequences in xml Args: - xml (etree): xml structure in which to search for words and anchors + xml (etree.ElementTree): xml structure in which to search for words and anchors xml_filename (str): filename, used for error messages only unit (str): element tag of the word units anchor (str): element tag of the anchors @@ -154,37 +165,27 @@ def split_silences(words: List[dict], final_end, excluded_segments: List[dict]) _ = words.pop() -def align_audio( # noqa: C901 - xml_path, - audio_path, - unit="w", - bare=False, - config=None, - save_temps=None, - verbose_g2p_warnings=False, -): - """Align an XML input file to an audio file. 
+def parse_and_make_xml( + xml_path: str, + config: dict, + save_temps: Optional[str] = None, + verbose_g2p_warnings: Optional[bool] = False, + output_orthography: str = "eng-arpabet", +) -> etree.ElementTree: + """Parse XML input and run tokenization and G2P. Args: xml_path (str): Path to XML input file in TEI-like format - audio_path (str): Path to audio input. Must be in a format supported by ffmpeg - unit (str): Optional; Element to create alignments for, by default 'w' - bare (boolean): Optional; - If False, split silence into adjoining tokens (default) - If True, keep the bare tokens without adjoining silences. - config (object): Optional; ReadAlong-Studio configuration to use + config (dict): Optional; ReadAlong-Studio configuration to use save_temps (str): Optional; Save temporary files, by default None verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings iff True Returns: - Dict[str, List]: TODO + lxml.etree.ElementTree: Parsed and prepared XML Raises: - TODO - """ - results: Dict[str, List] = {"words": [], "audio": None} - + RuntimeError: If XML failed to parse""" # First do G2P try: xml = etree.parse(xml_path).getroot() @@ -192,220 +193,295 @@ def align_audio( # noqa: C901 raise RuntimeError( "Error parsing XML input file %s: %s." % (xml_path, e) ) from e - if config and "images" in config: + if "images" in config: xml = add_images(xml, config) - if config and "xml" in config: + if "xml" in config: xml = add_supplementary_xml(xml, config) xml = tokenize_xml(xml) - if save_temps: + if save_temps is not None: save_xml(save_temps + ".tokenized.xml", xml) - results["tokenized"] = xml = add_ids(xml) - if save_temps: + xml = add_ids(xml) + if save_temps is not None: save_xml(save_temps + ".ids.xml", xml) - xml, valid = convert_xml(xml, verbose_warnings=verbose_g2p_warnings) - if save_temps: + xml, valid = convert_xml( + xml, + verbose_warnings=verbose_g2p_warnings, + output_orthography=output_orthography, + ) + if save_temps is not None: save_xml(save_temps + ".g2p.xml", xml) if not valid: raise RuntimeError( "Some words could not be g2p'd correctly. Aborting. " - "Run with --g2p-verbose for more detailed g2p error logs." + "Run with --debug-g2p for more detailed g2p error logs." ) + return xml - # Prepare the SoundsSwallower (formerly PocketSphinx) configuration - cfg = soundswallower.Decoder.default_config() - model_path = soundswallower.get_model_path() - cfg.set_boolean("-remove_noise", False) - cfg.set_boolean("-remove_silence", False) - cfg.set_string("-hmm", os.path.join(model_path, "en-us")) - # cfg.set_string('-samprate', "no no") - cfg.set_float("-beam", 1e-100) - cfg.set_float("-wbeam", 1e-80) - # Read the audio file - audio = read_audio_from_file(audio_path) - audio = audio.set_channels(1).set_sample_width(2) - audio_length_in_ms = len(audio.raw_data) - # Downsampling is (probably) not necessary - cfg.set_float("-samprate", audio.frame_rate) +def create_asr_config( + config: dict, + audio: AudioSegment, + save_temps: Optional[str] = None, + debug_aligner: Optional[bool] = False, + alignment_mode: str = "auto", +) -> soundswallower.Config: + """Create the base SoundSwallower (formerly PocketSphinx) configuration. 
- # Process audio, silencing or removing any DNA segments - dna_segments = [] - removed_segments = [] - if config and "do-not-align" in config: - # Sort un-alignable segments and join overlapping ones - dna_segments = sort_and_join_dna_segments(config["do-not-align"]["segments"]) - method = config["do-not-align"].get("method", "remove") - # Determine do-not-align method - if method == "mute": - dna_method = mute_section - elif method == "remove": - dna_method = remove_section - else: - LOGGER.error("Unknown do-not-align method declared") - # Process audio and save temporary files - if method in ("mute", "remove"): - processed_audio = audio - # Process the DNA segments in reverse order so we don't have to correct - # for previously processed ones when using the "remove" method. - for seg in reversed(dna_segments): - processed_audio = dna_method( - processed_audio, int(seg["begin"]), int(seg["end"]) - ) - if save_temps: - _, ext = os.path.splitext(audio_path) - try: - processed_audio.export( - save_temps + "_processed" + ext, format=ext[1:] - ) - except CouldntEncodeError: - try: - os.remove(save_temps + "_processed" + ext) - except BaseException: - pass - LOGGER.warning( - f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'" - ) - processed_audio.export(save_temps + "_processed" + ".wav") - removed_segments = dna_segments - audio_data = processed_audio + Args: + config (dict): ReadAlong-Studio configuration to use. + audio (AudioSegment): Audio input from which to take parameters. + save_temps (str): Optional; Prefix for saving temporary files, by default None. + debug_aligner (boolean): Optional; Output debugging info from the aligner. + alignment_mode (str): Optional; controls the decoder beam width + + Returns: + soundswallower.Config: Basic configuration.""" + asr_config = soundswallower.Config() + acoustic_model = config.get( + "acoustic_model", os.path.join(MODEL_DIR, DEFAULT_ACOUSTIC_MODEL) + ) + asr_config["hmm"] = acoustic_model + if alignment_mode == "strict": + asr_config["beam"] = 1e-100 + asr_config["pbeam"] = 1e-100 + asr_config["wbeam"] = 1e-80 + elif alignment_mode == "moderate": + asr_config["beam"] = 1e-200 + asr_config["pbeam"] = 1e-200 + asr_config["wbeam"] = 1e-160 + elif alignment_mode == "loose": + asr_config["beam"] = 0 + asr_config["pbeam"] = 0 + asr_config["wbeam"] = 0 else: - audio_data = audio + assert False, "invalid alignment_mode value" - # Initialize the SoundSwallower decoder with the sample rate from the audio - frame_points = int(cfg.get_float("-samprate") * cfg.get_float("-wlen")) + if debug_aligner: + # With --debug-aligner, we display the SoundSwallower logs on + # screen and set them to maximum strength + asr_config["loglevel"] = "DEBUG" + else: + # Otherwise, we enable logging and direct it to a file if + # saving temporary files + if save_temps is not None and (sys.platform not in ("win32", "cygwin")): + # With --save-temps, we save the SoundSwallower logs to a file. 
+ # This is buggy on Windows, so we don't do it on Windows variants + # (NOTE: should be fixed in SoundSwallower 0.3 though) + ss_log = save_temps + ".soundswallower.log" + asr_config["logfn"] = ss_log + asr_config["loglevel"] = "INFO" + # And otherwise the default is fine (only error messages are printed) + + # Set sampling rate based on audio (FIXME: this may cause problems + # later on if it is too low) + asr_config["samprate"] = audio.frame_rate + # Set the minimum FFT size (no longer necessary since + # SoundSwallower 0.2, but we keep this here for compatibility with + # old versions in case we need to debug things) + frame_points = int(asr_config["samprate"] * asr_config["wlen"]) fft_size = 1 while fft_size < frame_points: fft_size = fft_size << 1 - cfg.set_int("-nfft", fft_size) - frame_size = 1.0 / cfg.get_int("-frate") + asr_config["nfft"] = fft_size - # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100), - # while the audio segments manipulated using pydub are sliced and accessed in - # millisecond intervals. For audio segments, the ms slice assumption is hard-coded - # all over, while frames_to_time() is used to convert segment boundaries returned by - # soundswallower, which are indexes in frames, into durations in seconds. - def frames_to_time(frames): - return frames * frame_size + # Disable VAD + asr_config["remove_noise"] = False - # Extract the list of sequences of words in the XML - word_sequences = get_sequences(xml, xml_path, unit=unit) - end = 0 - for i, word_sequence in enumerate(word_sequences): + return asr_config - i_suffix = "" if i == 0 else "." + str(i + 1) - # Generate dictionary and FSG for the current sequence of words - dict_data = make_dict(word_sequence.words, xml_path, unit=unit) - if save_temps: - dict_file = io.open(save_temps + ".dict" + i_suffix, "wb") - else: - dict_file = PortableNamedTemporaryFile( - prefix="readalongs_dict_", delete=False - ) - dict_file.write(dict_data.encode("utf-8")) - dict_file.close() +def read_noisedict(asr_config: soundswallower.Config) -> Set[str]: + """Read the list of noise words from the acoustic model. - fsg_data = make_fsg(word_sequence.words, xml_path) - if save_temps: - fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb") - else: - fsg_file = PortableNamedTemporaryFile( - prefix="readalongs_fsg_", delete=False - ) - fsg_file.write(fsg_data.encode("utf-8")) - fsg_file.close() + Args: + asr_config (soundswallower.Config): ASR configuration. + Returns: + Set[str]: Set of noise words from noisedict, or a default set + if it could not be found. 
+ """ + try: + noisewords = set() + acoustic_model = asr_config["hmm"] + with open( + os.path.join(acoustic_model, "noisedict"), "rt", encoding="utf-8" + ) as dictfh: + for line in dictfh: + if line.startswith("##") or line.startswith(";;"): + continue + noisewords.add(line.strip().split()[0]) + except FileNotFoundError: + LOGGER.warning("Could not find noisedict, using defaults") + noisewords = {"", "[NOISE]"} - # Extract the part of the audio corresponding to this word sequence - audio_segment = extract_section( - audio_data, word_sequence.start, word_sequence.end - ) - if save_temps and audio_segment is not audio_data: - write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix) - - # Configure soundswallower for this sequence's dict and fsg - cfg.set_string("-dict", dict_file.name) - cfg.set_string("-fsg", fsg_file.name) - ps = soundswallower.Decoder(cfg) - # Align this word sequence - ps.start_utt() - ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True) - ps.end_utt() - - if not ps.seg(): - raise RuntimeError( - "Alignment produced no segments, " - "please examine dictionary and input audio and text." - ) + return noisewords - # List of removed segments for the sequence we are currently processing - curr_removed_segments = dna_union( - word_sequence.start, word_sequence.end, audio_length_in_ms, removed_segments - ) - prev_segment_count = len(results["words"]) - for seg in ps.seg(): - if seg.word in ("", "[NOISE]"): - continue - start = frames_to_time(seg.start_frame) - end = frames_to_time(seg.end_frame + 1) - # change to ms - start_ms = start * 1000 - end_ms = end * 1000 - if curr_removed_segments: - start_ms += calculate_adjustment(start_ms, curr_removed_segments) - end_ms += calculate_adjustment(end_ms, curr_removed_segments) - start_ms, end_ms = correct_adjustments( - start_ms, end_ms, curr_removed_segments - ) - # change back to seconds to write to smil - start = start_ms / 1000 - end = end_ms / 1000 - results["words"].append({"id": seg.word, "start": start, "end": end}) - LOGGER.info("Segment: %s (%.3f : %.3f)", seg.word, start, end) - aligned_segment_count = len(results["words"]) - prev_segment_count - if aligned_segment_count != len(word_sequence.words): - LOGGER.warning( - f"Word sequence {i+1} had {len(word_sequence.words)} tokens " - f"but produced {aligned_segment_count} segments. " - "Check that the anchors are well positioned or " - "that the audio corresponds to the text." - ) - final_end = end +def process_dna( + dna_config: Dict[str, Any], + audio: AudioSegment, + audio_path: Optional[str] = None, + save_temps: Optional[str] = None, +) -> Tuple[AudioSegment, List[dict], List[dict]]: + """Apply do-not-align processing to audio. - if len(results["words"]) == 0: - raise RuntimeError( - "Alignment produced only noise or silence segments, " - "please verify that the text is an actual transcript of the audio." - ) - if len(results["words"]) != len(results["tokenized"].xpath("//" + unit)): - LOGGER.warning( - "Alignment produced a different number of segments and tokens than " - "were in the input. Sequences between some anchors probably did not " - "align successfully. Look for more anchors-related warnings above in the log." - ) + Args: + dna_config (dict): Do-not-align configuration, containing at least "segments" and "method". + audio (AudioSegment): Original audio segment. + audio_path (str): Optional; Path from which audio was loaded (needed for save_temps). + save_temps (str): Optional; Prefix for saving temporary files, by default None. 
-    if not bare:
-        # Take all the boundaries (anchors) around segments and add them as DNA
-        # segments for the purpose of splitting silences
-        dna_for_silence_splitting = copy.deepcopy(dna_segments)
-        last_end = None
-        for seq in word_sequences:
-            if last_end or seq.start:
-                dna_for_silence_splitting.append(
-                    {"begin": (last_end or seq.start), "end": (seq.start or last_end)}
+    Returns:
+        Tuple[AudioSegment, List[dict], List[dict]]: Processed audio
+        segment, list of segments marked do-not-align, list of segments
+        actually removed.
+    """
+    # Sort un-alignable segments and join overlapping ones
+    dna_segments = sort_and_join_dna_segments(dna_config["segments"])
+    method = dna_config.get("method", "remove")
+    # Determine do-not-align method
+    if method == "mute":
+        dna_method = mute_section
+    elif method == "remove":
+        dna_method = remove_section
+    else:
+        LOGGER.error("Unknown do-not-align method declared")
+    # Process audio and save temporary files
+    if method in ("mute", "remove"):
+        processed_audio = audio
+        # Process the DNA segments in reverse order so we don't have to correct
+        # for previously processed ones when using the "remove" method.
+        for dna_seg in reversed(dna_segments):
+            processed_audio = dna_method(
+                processed_audio, int(dna_seg["begin"]), int(dna_seg["end"])
+            )
+        if save_temps is not None:
+            assert audio_path is not None
+            _, ext = os.path.splitext(audio_path)
+            try:
+                processed_audio.export(save_temps + "_processed" + ext, format=ext[1:])
+            except CouldntEncodeError:
+                try:
+                    os.remove(save_temps + "_processed" + ext)
+                except BaseException:  # Ignore Windows file removal failures
+                    pass
+                LOGGER.warning(
+                    f"Couldn't find encoder for '{ext[1:]}', defaulting to 'wav'"
+                )
-            last_end = seq.end
-        if last_end:
-            dna_for_silence_splitting.append({"begin": last_end, "end": last_end})
-        dna_for_silence_splitting = sort_and_join_dna_segments(
-            dna_for_silence_splitting
-        )
+                processed_audio.export(save_temps + "_processed" + ".wav")
+    removed_segments = dna_segments
+    return processed_audio, dna_segments, removed_segments
+
+
+def align_sequence(
+    audio_data: AudioSegment,
+    word_sequence: WordSequence,
+    asr_config: soundswallower.Config,
+    xml_path: str,
+    i: int,
+    unit: Optional[str] = "w",
+    save_temps: Optional[str] = None,
+) -> Iterable[soundswallower.Seg]:
+    """Run alignment for a word sequence.
-
-        split_silences(results["words"], final_end, dna_for_silence_splitting)
+
+    Args:
+        audio_data (AudioSegment): Full input audio.
+        word_sequence (WordSequence): Sequence of units to align.
+        asr_config (soundswallower.Config): Aligner configuration.
+        unit (str): Name of unit we are aligning.
+        xml_path (str): Path to input XML file.
+        i (int): Index of this sequence in the full file.
+
+        save_temps (str): Optional; Prefix for saving temporary files,
+        or None to not save them.
+
+    Returns:
+        Iterable[soundswallower.Seg]: Word (or other unit) alignments.
+
+    Raises:
+        RuntimeError: If alignment fails (TODO: figure out why).
+    """
+    i_suffix = "" if i == 0 else "."
+ str(i + 1) + + # Generate dictionary and FSG for the current sequence of words + dict_data = make_dict(word_sequence.words, xml_path, unit=unit) + if save_temps is not None: + dict_file = io.open(save_temps + ".dict" + i_suffix, "wb") + else: + dict_file = PortableNamedTemporaryFile(prefix="readalongs_dict_", delete=True) + dict_file.write(dict_data.encode("utf-8")) + dict_file.close() + + fsg_data = make_fsg(word_sequence.words, xml_path) + if save_temps is not None: + fsg_file = io.open(save_temps + ".fsg" + i_suffix, "wb") + else: + fsg_file = PortableNamedTemporaryFile(prefix="readalongs_fsg_", delete=True) + fsg_file.write(fsg_data.encode("utf-8")) + fsg_file.close() + + # Extract the part of the audio corresponding to this word sequence + audio_segment = extract_section(audio_data, word_sequence.start, word_sequence.end) + if save_temps is not None and audio_segment is not audio_data: + write_audio_to_file(audio_segment, save_temps + ".wav" + i_suffix) + + # Configure soundswallower for this sequence's dict and fsg + asr_config["dict"] = dict_file.name + asr_config["fsg"] = fsg_file.name + + ps = soundswallower.Decoder(asr_config) + # Align this word sequence + ps.start_utt() + ps.process_raw(audio_segment.raw_data, no_search=False, full_utt=True) + ps.end_utt() + + return ps.seg + + +def process_segmentation( + segmentation: Iterable[soundswallower.Seg], + curr_removed_segments: List[dict], + noisewords: Set[str], + frame_size: float, + debug_aligner: Optional[bool] = False, +) -> List[Dict[str, Any]]: + """Correct output alignments based on do-not-align segments.""" + aligned_words: List[Dict[str, Any]] = [] + for word_seg in segmentation: + if word_seg.text in noisewords: + continue + start = word_seg.start + end = word_seg.start + word_seg.duration + # round to milliseconds to avoid imprecisions + start_ms = round(start * 1000) + end_ms = round(end * 1000) + # possibly adjust for removed sections + if curr_removed_segments: + start_ms += calculate_adjustment(start_ms, curr_removed_segments) + end_ms += calculate_adjustment(end_ms, curr_removed_segments) + start_ms, end_ms = correct_adjustments( + start_ms, end_ms, curr_removed_segments + ) + # change back to seconds + start = start_ms / 1000 + end = end_ms / 1000 + if aligned_words: + assert start >= aligned_words[-1]["end"] + aligned_words.append({"id": word_seg.text, "start": start, "end": end}) + if debug_aligner: + LOGGER.info("Segment: %s (%.3f : %.3f)", word_seg.text, start, end) + return aligned_words + + +def insert_silence( + results: Dict[str, Any], + audio: AudioSegment, + xml_path: Optional[str] = "XML Input", +): + """Insert the required silences in the audio stream.""" words_dict = { x["id"]: {"start": x["start"], "end": x["end"]} for x in results["words"] } - silence_offsets = defaultdict(int) + silence_offsets: defaultdict = defaultdict(int) silence = 0 if results["tokenized"].xpath("//silence"): endpoint = 0 @@ -446,11 +522,329 @@ def frames_to_time(frames): word["start"] += silence_offsets[word["id"]] word["end"] += silence_offsets[word["id"]] results["audio"] = audio + + +def align_audio( + xml_path: str, + audio_path: str, + *, # force the remaining arguments to be passed by name + unit: Optional[str] = "w", + bare: Optional[bool] = False, + config: Optional[dict] = None, + save_temps: Optional[str] = None, + verbose_g2p_warnings: Optional[bool] = False, + debug_aligner: Optional[bool] = False, + output_orthography: str = "eng-arpabet", + alignment_mode: str = "auto", +): + """Align an XML input file to an 
audio file. + + Args: + xml_path (str): Path to XML input file in TEI-like format + audio_path (str): Path to audio input. Must be in a format supported by ffmpeg + unit (str): Optional; Element to create alignments for, by default 'w' + bare (boolean): Optional; + If False, split silence into adjoining tokens (default) + If True, keep the bare tokens without adjoining silences. + config (dict): Optional; ReadAlong-Studio configuration to use + save_temps (str): Optional; Prefix for saving temporary files, or None if + temporary files are not to be saved. + verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings + iff True + debug_aligner (boolean): Optional, output debugging info from the aligner. + alignment_mode (str): Optional, controls the decoder beam width + + Returns: + Dict[str, Any]: TODO + + Raises: + TODO + """ + results: Dict[str, Any] = {"words": [], "audio": None} + if config is None: + config = {} + + xml = parse_and_make_xml( + xml_path=xml_path, + config=config, + verbose_g2p_warnings=verbose_g2p_warnings, + save_temps=save_temps, + output_orthography=output_orthography, + ) + results["tokenized"] = xml + + # Read the audio file + audio = read_audio_from_file(audio_path) + audio = audio.set_channels(1).set_sample_width(2) + audio_length_in_ms = len(audio.raw_data) + + # Expand the list of alignment modes to try + if alignment_mode == "auto": + align_modes = ["strict", "moderate", "loose"] + else: + align_modes = [alignment_mode] + + # Create the ASR configuration for each alignment mode needed + asr_configs = [ + create_asr_config(config, audio, save_temps, debug_aligner, align_mode) + for align_mode in align_modes + ] + asr_config = asr_configs[0] # Default/first ASR Config + + # Process audio, silencing or removing any DNA segments + if "do-not-align" in config: + audio_data, dna_segments, removed_segments = process_dna( + dna_config=config["do-not-align"], + audio=audio, + audio_path=audio_path, + save_temps=save_temps, + ) + else: + audio_data = audio + dna_segments = [] + removed_segments = [] + + # Note: the frames are typically 0.01s long (i.e., the frame rate is typically 100), + # while the audio segments manipulated using pydub are sliced and accessed in + # millisecond intervals. For audio segments, the ms slice assumption is hard-coded + # all over, while frame_size is used to convert segment boundaries returned by + # soundswallower, which are indexes in frames, into durations in seconds. 
+ frame_size = 1.0 / asr_config["frate"] + + # Get list of words to ignore in aligner output + noisewords = read_noisedict(asr_config) + + # Extract the list of sequences of words in the XML + word_sequences = get_sequences(xml, xml_path, unit=unit) + final_end = 0.0 + for i, word_sequence in enumerate(word_sequences): + for j, cur_asr_config in enumerate(asr_configs): + # Run the aligner on this sequence + segmentation = align_sequence( + audio_data=audio_data, + word_sequence=word_sequence, + asr_config=cur_asr_config, + xml_path=xml_path, + i=i, + unit=unit, + save_temps=save_temps, + ) + + # List of removed segments for the sequence we are currently processing + curr_removed_segments = dna_union( + word_sequence.start, + word_sequence.end, + audio_length_in_ms, + removed_segments, + ) + # Process raw segmentation, adjusting alignments for DNA + aligned_words = process_segmentation( + segmentation=segmentation, + curr_removed_segments=curr_removed_segments, + noisewords=noisewords, + frame_size=frame_size, + debug_aligner=debug_aligner, + ) + + if len(aligned_words) != len(word_sequence.words): + LOGGER.warning(f"Align mode {align_modes[j]} failed for sequence {i}.") + else: + LOGGER.info(f"Align mode {align_modes[j]} succeeded for sequence {i}.") + break + + results["words"].extend(aligned_words) + if aligned_words: + final_end = aligned_words[-1]["end"] + if len(aligned_words) != len(word_sequence.words): + LOGGER.warning( + f"Word sequence {i+1} had {len(word_sequence.words)} tokens " + f"but produced {len(aligned_words)} segments. " + "Check that the anchors are well positioned or " + "that the audio corresponds to the text." + ) + + aligned_segment_count = len(results["words"]) + token_count = len(results["tokenized"].xpath(f"//{unit}")) + LOGGER.info(f"Number of words found: {token_count}") + LOGGER.info(f"Number of aligned segments: {aligned_segment_count}") + + if aligned_segment_count == 0: + raise RuntimeError( + "Alignment produced only noise or silence segments, " + "please verify that the text is an actual transcript of the audio." + ) + if aligned_segment_count != token_count: + LOGGER.warning( + "Alignment produced a different number of segments and tokens than " + "were in the input. Sequences between some anchors probably did not " + "align successfully. Look for more anchors-related warnings above in the log." 
+        )
+
+    # Split silences if requested
+    if not bare:
+        # Take all the boundaries (anchors) around segments and add them as DNA
+        # segments for the purpose of splitting silences
+        dna_for_silence_splitting = copy.deepcopy(dna_segments)
+        last_end = None
+        for seq in word_sequences:
+            if last_end or seq.start:
+                dna_for_silence_splitting.append(
+                    {"begin": (last_end or seq.start), "end": (seq.start or last_end)}
+                )
+            last_end = seq.end
+        if last_end:
+            dna_for_silence_splitting.append({"begin": last_end, "end": last_end})
+        dna_for_silence_splitting = sort_and_join_dna_segments(
+            dna_for_silence_splitting
+        )
+        split_silences(results["words"], final_end, dna_for_silence_splitting)
+
+    # Insert silences if requested
+    insert_silence(
+        results=results,
+        audio=audio,
+        xml_path=xml_path,
+    )
     return results
 
 
-def save_readalong(  # noqa C901
-    # noqa C901 - ignore the complexity of this function
+def get_audio_duration(audiofile: str) -> float:
+    """Return the duration of audiofile in seconds"""
+    audio = read_audio_from_file(audiofile)
+    return audio.frame_count() / audio.frame_rate
+
+
+def save_label_files(
+    words: List[dict],
+    tokenized_xml: etree.ElementTree,
+    duration: float,
+    output_base: str,
+    output_formats: Iterable[str],
+):
+    """Save label (TextGrid and/or EAF) files.
+
+    Args:
+        words: list of words with "id", "start" and "end"
+        tokenized_xml: tokenized or g2p'd parsed XML object
+        duration: length of the audio in seconds
+        output_base (str): Base path for output files
+        output_formats (Iterable[str]): List of output formats
+
+    Raises:
+        IndexError: words and tokenized_xml have inconsistent IDs
+        Exception: TODO, not sure what else this can raise
+    """
+    words_with_text, sentences = get_word_texts_and_sentences(words, tokenized_xml)
+    textgrid = create_text_grid(words_with_text, sentences, duration)
+
+    if "textgrid" in output_formats:
+        textgrid.to_file(output_base + ".TextGrid")
+
+    if "eaf" in output_formats:
+        textgrid.to_eaf().to_file(output_base + ".eaf")
+
+
+def save_subtitles(
+    words: List[dict],
+    tokenized_xml: etree.ElementTree,
+    output_base: str,
+    output_formats: Iterable[str],
+):
+    """Save subtitle (SRT and/or VTT) files.
+
+    Args:
+        words: list of words with "id", "start" and "end"
+        tokenized_xml: tokenized or g2p'd parsed XML object
+        output_base (str): Base path for output files
+        output_formats (Iterable[str]): List of output formats
+
+    Raises:
+        IndexError: words and tokenized_xml have inconsistent IDs
+        Exception: TODO, not sure what else this can raise
+    """
+    words_with_text, sentences = get_word_texts_and_sentences(words, tokenized_xml)
+    cc_sentences = write_to_subtitles(sentences)
+    cc_words = write_to_subtitles(words_with_text)
+
+    if "srt" in output_formats:
+        cc_sentences.save_as_srt(output_base + "_sentences.srt")
+        cc_words.save_as_srt(output_base + "_words.srt")
+
+    if "vtt" in output_formats:
+        cc_words.save(output_base + "_words.vtt")
+        cc_sentences.save(output_base + "_sentences.vtt")
+
+
+def save_audio(
+    audiofile: str, output_base: str, audiosegment: Optional[AudioSegment] = None
+) -> str:
+    """Save audio file.
+
+    Args:
+        audiofile (str): Path to input audio
+        output_base (str): Base path for output files
+        audiosegment (AudioSegment): Optional; trimmed/muted audio
+    Returns:
+        str: Path to output audio file.
+ """ + _, audio_ext = os.path.splitext(audiofile) + audio_path = output_base + audio_ext + audio_format = audio_ext[1:] + if audiosegment is not None: + if audio_format in ["m4a", "aac"]: + audio_format = "ipod" + try: + audiosegment.export(audio_path, format=audio_format) + except CouldntEncodeError: + LOGGER.warning( + f"The audio file at {audio_path} could \ + not be exported in the {audio_format} format. \ + Please ensure your installation of ffmpeg has \ + the necessary codecs." + ) + audio_path = output_base + ".wav" + audiosegment.export(audio_path, format="wav") + else: + shutil.copy(audiofile, audio_path) + return audio_path + + +def save_images(config: Dict[str, Any], output_dir: str): + """Save image files specified in config. + + Args: + config (dict): ReadAlong-Studio configuration + output_dir (str): Output directory + Raises: + FileExistsError: If output directory already exists + """ + assets_dir = os.path.join(output_dir, "assets") + try: + os.mkdir(assets_dir) + except FileExistsError: + if not os.path.isdir(assets_dir): + raise + for _, image in config["images"].items(): + if image[0:4] == "http": + LOGGER.warning( + f"Please make sure {image} is accessible to clients using your read-along." + ) + else: + try: + shutil.copy(image, assets_dir) + except Exception as e: + LOGGER.warning( + f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})" + ) + if os.path.basename(image) != image: + LOGGER.warning( + f"Read-along images were tested with absolute urls (starting with http(s):// " + f"and filenames without a path. {image} might not work as specified." + ) + + +def save_readalong( # this * forces all arguments to be passed by name, because I don't want any # code to depend on their order in the future *, @@ -482,6 +876,9 @@ def save_readalong( # noqa C901 Raises: [TODO] """ + if config is None: + config = {} + # Round all times to three digits, anything more is excess precision # poluting the output files, and usually due to float rounding errors anyway. 
for w in align_results["words"]: @@ -491,31 +888,23 @@ def save_readalong( # noqa C901 output_base = os.path.join(output_dir, output_basename) # Create textgrid object if outputting to TextGrid or eaf - if "TextGrid" in output_formats or "eaf" in output_formats: - audio = read_audio_from_file(audiofile) - duration = audio.frame_count() / audio.frame_rate - words, sentences = return_words_and_sentences(align_results) - textgrid = write_to_text_grid(words, sentences, duration) - - if "TextGrid" in output_formats: - textgrid.to_file(output_base + ".TextGrid") - - if "eaf" in output_formats: - textgrid.to_eaf().to_file(output_base + ".eaf") + if "textgrid" in output_formats or "eaf" in output_formats: + save_label_files( + words=align_results["words"], + tokenized_xml=align_results["tokenized"], + duration=get_audio_duration(audiofile), + output_base=output_base, + output_formats=output_formats, + ) # Create webvtt object if outputting to vtt or srt if "srt" in output_formats or "vtt" in output_formats: - words, sentences = return_words_and_sentences(align_results) - cc_sentences = write_to_subtitles(sentences) - cc_words = write_to_subtitles(words) - - if "srt" in output_formats: - cc_sentences.save_as_srt(output_base + "_sentences.srt") - cc_words.save_as_srt(output_base + "_words.srt") - - if "vtt" in output_formats: - cc_words.save(output_base + "_words.vtt") - cc_sentences.save(output_base + "_sentences.vtt") + save_subtitles( + words=align_results["words"], + tokenized_xml=align_results["tokenized"], + output_base=output_base, + output_formats=output_formats, + ) tokenized_xml_path = output_base + ".xml" save_xml(tokenized_xml_path, align_results["tokenized"]) @@ -525,38 +914,30 @@ def save_readalong( # noqa C901 tokenized_xhtml_path = output_base + ".xhtml" save_xml(tokenized_xhtml_path, align_results["tokenized"]) - _, audio_ext = os.path.splitext(audiofile) - audio_path = output_base + audio_ext - audio_format = audio_ext[1:] - if audiosegment: - if audio_format in ["m4a", "aac"]: - audio_format = "ipod" - try: - audiosegment.export(audio_path, format=audio_format) - except CouldntEncodeError: - LOGGER.warning( - f"The audio file at {audio_path} could \ - not be exported in the {audio_format} format. \ - Please ensure your installation of ffmpeg has \ - the necessary codecs." 
- ) - audio_path = output_base + ".wav" - audiosegment.export(audio_path, format="wav") - else: - shutil.copy(audiofile, audio_path) + audio_path = save_audio( + audiofile=audiofile, output_base=output_base, audiosegment=audiosegment + ) smil_path = output_base + ".smil" smil = make_smil( os.path.basename(tokenized_xml_path), os.path.basename(audio_path), - align_results, + align_results["words"], ) save_txt(smil_path, smil) if "html" in output_formats: html_out_path = output_base + ".html" - html_out = create_web_component_html(tokenized_xml_path, smil_path, audio_path) - with open(html_out_path, "w") as f: + html_out = create_web_component_html( + tokenized_xml_path, + smil_path, + audio_path, + config.get("title", "Title goes here"), + config.get("header", "Header goes here"), + config.get("subheader", ""), + config.get("theme", "light"), + ) + with open(html_out_path, "w", encoding="utf-8") as f: f.write(html_out) save_minimal_index_html( @@ -564,103 +945,86 @@ def save_readalong( # noqa C901 os.path.basename(tokenized_xml_path), os.path.basename(smil_path), os.path.basename(audio_path), + config.get("title", "Title goes here"), + config.get("header", "Header goes here"), + config.get("subheader", ""), + config.get("theme", "light"), ) # Copy the image files to the output's asset directory, if any are found - if config and "images" in config: - assets_dir = os.path.join(output_dir, "assets") - try: - os.mkdir(assets_dir) - except FileExistsError: - if not os.path.isdir(assets_dir): - raise - for _, image in config["images"].items(): - if image[0:4] == "http": - LOGGER.warning( - f"Please make sure {image} is accessible to clients using your read-along." - ) - else: - try: - shutil.copy(image, assets_dir) - except Exception as e: - LOGGER.warning( - f"Please copy {image} to {assets_dir} before deploying your read-along. ({e})" - ) - if os.path.basename(image) != image: - LOGGER.warning( - f"Read-along images were tested with absolute urls (starting with http(s):// " - f"and filenames without a path. {image} might not work as specified." 
-                    )
+    if "images" in config:
+        save_images(config=config, output_dir=output_dir)
 
 
-def return_word_from_id(xml: etree, el_id: str) -> str:
-    """Given an XML document, return the innertext at id
+def get_word_element(xml: etree.ElementTree, el_id: str) -> etree.ElementTree:
+    """Get the xml etree for a given word by its id"""
+    return xml.xpath(f'//w[@id="{el_id}"]')[0]
 
-    Args:
-        xml (etree): XML document
-        el_id (str): ID
 
-    Returns:
-        str: Innertext of element with el_id in xml
-    """
-    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
+def get_ancestor_sent_el(word_el: etree.ElementTree) -> Union[None, etree.ElementTree]:
+    """Get the ancestor <s> node for word_el, or None"""
+    while word_el is not None and word_el.tag != "s":
+        word_el = word_el.getparent()
+    return word_el
 
 
-def return_words_and_sentences(results):
-    """Parse xml into word and sentence 'tier' data
+def get_word_texts_and_sentences(
+    words: List[dict], tokenized_xml: etree.ElementTree
+) -> Tuple[List[dict], List[List[dict]]]:
+    """Parse xml into word and sentence 'tier' data with full textual words
 
     Args:
-        results([TODO type]): [TODO description]
+        words: list of words with "id", "start" and "end"
+        tokenized_xml: tokenized or g2p'd parsed XML object
 
     Returns:
-        [TODO type]: [TODO description]
+        list of words, list of sentences (as a list of lists of words)
+        The returned words are dicts containing:
+            "text": the actual textual word from the XML (not the ID)
+            "start": start time
+            "end": end time
     """
-    result_id_pattern = re.compile(
-        r"""
-        t(?P<table>\d*)            # Table
-        b(?P<body>\d*)             # Body
-        d(?P<div>
\d*)             # Div ( Break )
-        p(?P<par>\d*)              # Paragraph
-        s(?P<sent>\d+)             # Sentence
-        w(?P<word>\d+)             # Word
-        """,
-        re.VERBOSE,
-    )
-
-    all_els = results["words"]
-    xml = results["tokenized"]
     sentences = []
-    words = []
-    all_words = []
-    current_sent = 0
-    for el in all_els:
-        parsed = re.search(result_id_pattern, el["id"])
-        sent_i = parsed.group("sent")
-        if int(sent_i) is not current_sent:
-            sentences.append(words)
-            words = []
-            current_sent += 1
-        word = {
-            "text": return_word_from_id(xml, el["id"]),
-            "start": el["start"],
-            "end": el["end"],
+    sent_words: List[Dict[str, Any]] = []
+    all_words: List[Dict[str, Any]] = []
+    prev_sent_el = None
+    for word in words:
+        # The sentence is considered the set of words under the same <s> element.
+        # A word that's not under any <s> element is bad input, but we consider
+        # it a sentence by itself for software robustness.
+        word_el = get_word_element(tokenized_xml, word["id"])
+        sent_el = get_ancestor_sent_el(word_el)
+        if prev_sent_el is None or sent_el is not prev_sent_el:
+            if sent_words:
+                sentences.append(sent_words)
+            sent_words = []
+            prev_sent_el = sent_el
+        word_with_text = {
+            "text": get_word_text(word_el),
+            "start": word["start"],
+            "end": word["end"],
         }
-        words.append(word)
-        all_words.append(word)
-    sentences.append(words)
+        if all_words:
+            assert word_with_text["start"] >= all_words[-1]["end"]
+        sent_words.append(word_with_text)
+        all_words.append(word_with_text)
+    if sent_words:
+        sentences.append(sent_words)
     return all_words, sentences
 
 
-def write_to_text_grid(words: List[dict], sentences: List[dict], duration: float):
-    """Write results to Praat TextGrid. Because we are using pympi, we can also export to Elan EAF.
+def create_text_grid(
+    words: List[dict], sentences: List[List[dict]], duration: float
+) -> TextGrid:
+    """Create Praat TextGrid from results. Because we are using pympi, we can also export to Elan EAF.
 
     Args:
-        words (List[dict]): List of word times containing start, end, and value keys
-        sentences (List[dict]): List of sentence times containing start, end, and value keys
+        words (List[dict]): List of words containing "text", "start", "end"
+        sentences (List[dict]): List of sentences (as a list of lists of word dicts)
         duration (float): duration of entire audio
 
     Returns:
-        TextGrid: Praat TextGrid with word and sentence alignments
+        TextGrid: Praat TextGrid object with word and sentence alignments
     """
     text_grid = TextGrid(xmax=duration)
     sentence_tier = text_grid.add_tier(name="Sentence")
@@ -757,9 +1121,6 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
 
 TEI_TEMPLATE = """
-
 {{#pages}}
@@ -779,13 +1140,56 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
 """
 
 
+def create_tei_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
+    """Create input xml in TEI standard.
+    Uses the line sequence to infer paragraph and sentence structure from plain text:
+    Assumes a double blank line marks a page break, and a single blank line
+    marks a paragraph break.
+    Creates the XML using chevron.
+
+    Args:
+        lines: lines from the input plain text, e.g., f.readlines() on file handle f
+        text_languages: non-empty list of languages for g2p conversion
+
+    Returns:
+        str: Formatted XML, ready to print
+    """
+    assert text_languages, "The text_languages list may not be empty."
+    kwargs = {
+        "main_lang": text_languages[0],
+        "fallback_langs": ",".join(text_languages[1:]),
+    }
+    pages: List[dict] = []
+    paragraphs: List[dict] = []
+    sentences: List[str] = []
+    for line in lines:
+        stripped_line = line.strip()
+        if stripped_line == "":
+            if not sentences:
+                # consider this a page break (unless at the beginning)
+                pages.append({"paragraphs": paragraphs})
+                paragraphs = []
+            else:
+                # add sentences and begin new paragraph
+                paragraphs.append({"sentences": sentences})
+                sentences = []
+        else:
+            # Add text to sentence
+            sentences.append(stripped_line)
+    # Add the last paragraph/sentence
+    if sentences:
+        paragraphs.append({"sentences": sentences})
+    if paragraphs:
+        pages.append({"paragraphs": paragraphs})
+    return chevron.render(TEI_TEMPLATE, {**kwargs, **{"pages": pages}})
+
+
 def create_input_tei(**kwargs):
     """Create input xml in TEI standard.
     Uses readlines to infer paragraph and sentence structure from plain text.
-    TODO: Check if path, if it's just plain text, then render that instead of reading from the file
     Assumes a double blank line marks a page break, and a single blank line
     marks a paragraph break.
-    Outputs to uft-8 XML using pymustache.
+    Outputs to utf-8 XML using chevron.
 
     Args:
         **kwargs: dict containing these arguments:
@@ -806,7 +1210,7 @@ def create_input_tei(**kwargs):
     try:
         if kwargs.get("input_file_name", False):
             filename = kwargs["input_file_name"]
-            with io.open(kwargs["input_file_name"], encoding="utf8") as f:
+            with io.open(kwargs["input_file_name"], encoding="utf-8-sig") as f:
                 text = f.readlines()
         elif kwargs.get("input_file_handle", False):
             filename = kwargs["input_file_handle"].name
@@ -822,14 +1226,11 @@ def create_input_tei(**kwargs):
 
     text_langs = kwargs.get("text_languages", None)
     assert text_langs and isinstance(text_langs, (list, tuple)), "need text_languages"
-    kwargs["main_lang"] = text_langs[0]
-    kwargs["fallback_langs"] = ",".join(text_langs[1:])
-
-    save_temps = kwargs.get("save_temps", False)
+    save_temps = kwargs.get("save_temps", None)
     if kwargs.get("output_file", False):
         filename = kwargs.get("output_file")
         outfile = io.open(filename, "wb")
-    elif save_temps:
+    elif save_temps is not None:
         filename = save_temps + ".input.xml"
         outfile = io.open(filename, "wb")
     else:
@@ -837,28 +1238,7 @@ def create_input_tei(**kwargs):
             prefix="readalongs_xml_", suffix=".xml", delete=True
         )
         filename = outfile.name
-    pages = []
-    paragraphs = []
-    sentences = []
-    for line in text:
-        if line == "\n":
-            if not sentences:
-                # consider this a page break (unless at the beginning)
-                pages.append({"paragraphs": paragraphs})
-                paragraphs = []
-            else:
-                # add sentences and begin new paragraph
-                paragraphs.append({"sentences": sentences})
-                sentences = []
-        else:
-            # Add text to sentence
-            sentences.append(line.strip())
-    # Add the last paragraph/sentence
-    if sentences:
-        paragraphs.append({"sentences": sentences})
-    if paragraphs:
-        pages.append({"paragraphs": paragraphs})
-    xml = chevron.render(TEI_TEMPLATE, {**kwargs, **{"pages": pages}})
+    xml = create_tei_from_text(text, text_langs)
     outfile.write(xml.encode("utf-8"))
     outfile.flush()
     outfile.close()
diff --git a/readalongs/api.py b/readalongs/api.py
new file mode 100644
index 00000000..c77454de
--- /dev/null
+++ b/readalongs/api.py
@@ -0,0 +1,159 @@
+"""
+api.py: API for calling readalongs CLI commands programmatically
+
+In this API, functions take the same arguments as on the readalongs
+command-line interface. The mapping between CLI options and API options is
+that the first long variant of an option described in "readalongs -h" is
+the API option name, with hyphens replaced by underscores.
+
+Example from readalongs align -h:
+    option in CLI                       option in API
+    ================================    =================================
+    -l, --language, --languages TEXT    language=["l1", "l2"]
+    -f, --force-overwrite               force_overwrite=True
+    -c, --config PATH                   config=os.path.join("some", "path", "config.json")
+                                        OR config=pathlib.Path("/some/path/config.json")
+
+As shown above, file names can be constructed using os.path.join() or a Path
+class like pathlib.Path. Warning: don't just use "/some/path/config.json"
+because that is not portable across platforms.
+
+Options that can be specified multiple times on the CLI should be provided as a
+list to the API methods.
+
+All API functions return the following tuple: (status, exception, log)
+ - status: 0 for OK, non-0 for Error
+ - exception: any exception caught, one of:
+   - click.BadParameter: when there is an error with the combination of parameters given
+   - click.UsageError: when the alignment task requested cannot be completed
+   - other exceptions: something else unexpected went wrong. Please report this as
+                       a bug at https://github.com/ReadAlongs/Studio/issues if
+                       you come across such an exception and you believe the
+                       problem is not in your own code.
+ - log: any logging messages issued during execution
+"""
+
+import io
+import logging
+from typing import Optional, Tuple
+
+import click
+
+from readalongs import cli
+from readalongs.log import LOGGER
+from readalongs.util import JoinerCallbackForClick, get_langs_deferred
+
+
+def align(
+    textfile, audiofile, output_base, language=(), output_formats=(), **kwargs
+) -> Tuple[int, Optional[Exception], str]:
+    """Run the "readalongs align" command from within a Python script.
+
+    Args:
+        textfile (str | Path): input text file (XML or plain text)
+        audiofile (str | Path): input audio file (format supported by ffmpeg)
+        output_base (str | Path): basename for output files
+        language (List[str]): Specify only if textfile is plain text;
+            list of languages for g2p and g2p cascade
+        save_temps (bool): Optional; whether to save temporary files
+
+    Run "readalongs align -h" or consult
+    https://readalong-studio.readthedocs.io/en/latest/cli-ref.html#readalongs-align
+    for the full list of arguments and their meaning.
+ + Returns: (status, exception, log_text) + """ + + logging_stream = io.StringIO() + logging_handler = logging.StreamHandler(logging_stream) + try: + # Capture the logs + LOGGER.addHandler(logging_handler) + + align_args = {param.name: param.default for param in cli.align.params} + if language: + language = JoinerCallbackForClick(get_langs_deferred())( + value_groups=language + ) + if output_formats: + output_formats = JoinerCallbackForClick( + cli.SUPPORTED_OUTPUT_FORMATS, drop_case=True + )(value_groups=output_formats) + + align_args.update( + textfile=textfile, + audiofile=audiofile, + output_base=output_base, + language=language, + output_formats=output_formats, + **kwargs + ) + + cli.align.callback(**align_args) # type: ignore + + return (0, None, logging_stream.getvalue()) + except Exception as e: + return (1, e, logging_stream.getvalue()) + finally: + # Remove the log-capturing handler + LOGGER.removeHandler(logging_handler) + + +def make_xml( + plaintextfile, xmlfile, language, **kwargs +) -> Tuple[int, Optional[Exception], str]: + """Run the "readalongs make-xml" command from within a Python script. + + Args: + plaintextfile (str | Path): input plain text file + xmlfile (str | Path): output XML file + language (List[str]): list of languages for g2p and g2p cascade + + Run "readalongs make-xml -h" or consult + https://readalong-studio.readthedocs.io/en/latest/cli-ref.html#readalongs-make-xml + for the full list of arguments and their meaning. + + Returns: (status, exception, log_text) + """ + # plaintextfile is not a file object if passed from click + plaintextfile = ( + plaintextfile.name + if isinstance(plaintextfile, click.utils.LazyFile) + else plaintextfile + ) + logging_stream = io.StringIO() + logging_handler = logging.StreamHandler(logging_stream) + try: + # Capture the logs + LOGGER.addHandler(logging_handler) + + make_xml_args = {param.name: param.default for param in cli.make_xml.params} + try: + with open(plaintextfile, "r", encoding="utf-8-sig") as plaintextfile_handle: + make_xml_args.update( + plaintextfile=plaintextfile_handle, + xmlfile=xmlfile, + language=JoinerCallbackForClick(get_langs_deferred())( + value_groups=language + ), + **kwargs + ) + cli.make_xml.callback(**make_xml_args) # type: ignore + except OSError as e: + # e.g.: FileNotFoundError or PermissionError on open(plaintextfile) above + raise click.UsageError(str(e)) from e + + return (0, None, logging_stream.getvalue()) + except Exception as e: + return (1, e, logging_stream.getvalue()) + finally: + # Remove the log-capturing handler + LOGGER.removeHandler(logging_handler) + + +def prepare(*args, **kwargs): + """Deprecated, use make_xml instead""" + LOGGER.warning( + "readalongs.api.prepare() is deprecated. Please use make_xml() instead." + ) + return make_xml(*args, **kwargs) diff --git a/readalongs/app.py b/readalongs/app.py index 8c3f5604..87945ad0 100644 --- a/readalongs/app.py +++ b/readalongs/app.py @@ -15,4 +15,4 @@ Session(app) socketio = SocketIO(app, manage_session=False) -import readalongs.views # noqa: E402 +import readalongs.views # noqa: E402 F401 diff --git a/readalongs/audio_utils.py b/readalongs/audio_utils.py index da0160f2..7ba85f7c 100644 --- a/readalongs/audio_utils.py +++ b/readalongs/audio_utils.py @@ -4,7 +4,7 @@ in millisecond slices and lets us manipulate them as if they were simple lists. 
""" -from typing import List, Optional, Tuple, Union +from typing import Union from pydub import AudioSegment @@ -12,8 +12,7 @@ def join_section(audio: AudioSegment, audio_to_insert: AudioSegment, start: int): - """ Given two AudioSegments, insert the second into the first at start (ms) - """ + """Given two AudioSegments, insert the second into the first at start (ms)""" try: return audio[:start] + audio_to_insert + audio[start:] except IndexError: @@ -25,8 +24,7 @@ def join_section(audio: AudioSegment, audio_to_insert: AudioSegment, start: int) def remove_section(audio: AudioSegment, start: int, end: int) -> AudioSegment: - """ Given an AudioSement, remove the section between start (ms) and end (ms) - """ + """Given an AudioSement, remove the section between start (ms) and end (ms)""" try: return audio[:start] + audio[end:] except IndexError: @@ -38,7 +36,7 @@ def remove_section(audio: AudioSegment, start: int, end: int) -> AudioSegment: def mute_section(audio: AudioSegment, start: int, end: int) -> AudioSegment: - """ Given an AudioSegment, reduce the gain between a given interval by 120db. + """Given an AudioSegment, reduce the gain between a given interval by 120db. Effectively, make it silent. Args: @@ -62,7 +60,7 @@ def mute_section(audio: AudioSegment, start: int, end: int) -> AudioSegment: def extract_section( audio: AudioSegment, start: Union[None, int], end: Union[None, int] ) -> AudioSegment: - """ Given an AudioSegment, extract and keep only the [start, end) interval + """Given an AudioSegment, extract and keep only the [start, end) interval Args: audio (AudioSegment): audio segment to extract a section from @@ -89,7 +87,7 @@ def extract_section( def write_audio_to_file(audio: AudioSegment, path: str) -> None: - """ Write AudioSegment to file + """Write AudioSegment to file Args: audio (AudioSegment): audio segment to write @@ -105,7 +103,7 @@ def write_audio_to_file(audio: AudioSegment, path: str) -> None: def read_audio_from_file(path: str) -> AudioSegment: - """ Read in AudioSegment from file + """Read in AudioSegment from file Args: path (str): Path to audiofile diff --git a/readalongs/cli.py b/readalongs/cli.py index 7b95de36..4b9af328 100644 --- a/readalongs/cli.py +++ b/readalongs/cli.py @@ -4,8 +4,8 @@ CLI commands implemented in this file: - align : main command to align text and audio - - prepare : prepare XML input for align from plain text - - tokenize: tokenize the prepared file + - make-xml : make XML input for align from plain text + - tokenize: tokenize the XML file - g2p : apply g2p to the tokenized file - langs : list languages supported by align """ @@ -26,7 +26,12 @@ from readalongs.text.convert_xml import convert_xml from readalongs.text.tokenize_xml import tokenize_xml from readalongs.text.util import save_xml, write_xml -from readalongs.util import JoinerCallback, getLangs, getLangsDeferred +from readalongs.util import ( + JoinerCallbackForClick, + get_langs, + get_langs_deferred, + get_obsolete_callback_for_click, +) SUPPORTED_OUTPUT_FORMATS = { "eaf": "ELAN file", @@ -84,7 +89,7 @@ def cli(): although other output formats like subtitles or Praat TextGrids are available. You can use this command line tool in two ways. The "end-to-end" method with the - "align" command, or using a sequence of steps with "prepare", "tokenize", and "g2p" + "align" command, or using a sequence of steps with "make-xml", "tokenize", and "g2p" to get more control over the process. 
## End-to-End @@ -102,16 +107,16 @@ def cli(): Using ReadAlongs this way, you must use the following commands in sequence. \b - prepare + make-xml ======= If you have plain text and you want to mark up some of the XML, you can - use this command to "prepare" your plain text into the XML structure + use this command to turn your plain text into the XML structure used by readalongs. \b tokenize ======== - Use this command to tokenize the output of the previous "readalongs prepare" command. + Use this command to tokenize the output of the previous "readalongs make-xml" command. \b g2p @@ -127,7 +132,7 @@ def cli(): """ -@cli.command( # noqa: C901 +@cli.command( # type: ignore # noqa: C901 # some versions of flake8 need this here context_settings=CONTEXT_SETTINGS, short_help="Force align a text and a sound file." ) @click.argument("textfile", type=click.Path(exists=True, readable=True)) @@ -149,7 +154,7 @@ def cli(): "-o", "--output-formats", multiple=True, - callback=JoinerCallback(SUPPORTED_OUTPUT_FORMATS), + callback=JoinerCallbackForClick(SUPPORTED_OUTPUT_FORMATS, drop_case=True), help=( "Comma- or colon-separated list of additional output file formats to export to. " "The text is always exported as XML and alignments as SMIL, but " @@ -157,7 +162,6 @@ def cli(): + SUPPORTED_OUTPUT_FORMATS_DESC ), ) -@click.option("-d", "--debug", is_flag=True, help="Add debugging messages to logger") @click.option( "-f", "--force-overwrite", is_flag=True, help="Force overwrite output files" ) @@ -167,7 +171,11 @@ def cli(): hidden=True, is_flag=True, default=None, - help="OBSOLETE; the input format is now guessedb by extension or contents", + help="OBSOLETE; the input format is now guessed by extension or contents", + callback=get_obsolete_callback_for_click( + ".txt files are now read as plain text, .xml as XML, and other files based on\n" + "whether they start with List[dict]: - """ Give a list of DNA segments, sort them and join any overlapping ones """ + """Give a list of DNA segments, sort them and join any overlapping ones""" results: List[dict] = [] for seg in sorted(do_not_align_segments, key=lambda x: x["begin"]): if results and results[-1]["end"] >= seg["begin"]: @@ -24,9 +24,9 @@ def sort_and_join_dna_segments(do_not_align_segments: List[dict]) -> List[dict]: def correct_adjustments( start: int, end: int, do_not_align_segments: List[dict] ) -> Tuple[int, int]: - """ Given the start and end of a segment (in ms) and a list of do-not-align segments, - If one of the do-not-align segments occurs inside one of the start-end range, - align the start or end with the do-not-align segment, whichever requires minimal change + """Given the start and end of a segment (in ms) and a list of do-not-align segments, + If one of the do-not-align segments occurs inside one of the start-end range, + align the start or end with the do-not-align segment, whichever requires minimal change """ for seg in do_not_align_segments: if start < seg["begin"] and end > seg["end"]: @@ -38,7 +38,7 @@ def correct_adjustments( def calculate_adjustment(timestamp: int, do_not_align_segments: List[dict]) -> int: - """ Given a time (in ms) and a list of do-not-align segments, + """Given a time (in ms) and a list of do-not-align segments, return the sum (ms) of the lengths of the do-not-align segments that start before the timestamp @@ -58,7 +58,7 @@ def calculate_adjustment(timestamp: int, do_not_align_segments: List[dict]) -> i def segment_intersection(segments1: List[dict], segments2: List[dict]) -> List[dict]: - """ Return the 
intersection of two lists of segments + """Return the intersection of two lists of segments Precondition: segments1 and segments2 contain sorted, non-overlapping ranges @@ -89,9 +89,9 @@ def segment_intersection(segments1: List[dict], segments2: List[dict]) -> List[d def dna_union( - start, end, audio_length: int, do_not_align_segments: List[dict], + start, end, audio_length: int, do_not_align_segments: List[dict] ) -> List[dict]: - """ Return the DNA list to include [start,end] and exclude do_not_align_segments + """Return the DNA list to include [start,end] and exclude do_not_align_segments Given time range [start, end] to keep, and a list of do-not-align-segments to exclude, calculate the equivalent do-not-align-segment list to keeping only diff --git a/readalongs/epub/create_epub.py b/readalongs/epub/create_epub.py index 70607dba..4c21f236 100644 --- a/readalongs/epub/create_epub.py +++ b/readalongs/epub/create_epub.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ###################################################################### diff --git a/readalongs/lang/ckt/ckt_to_ipa.backup.json b/readalongs/lang/ckt/ckt_to_ipa.backup.json deleted file mode 100644 index 313a9e2d..00000000 --- a/readalongs/lang/ckt/ckt_to_ipa.backup.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "type": "mapping", - "authors": ["Vasilisa Andrianets", "Patrick Littell"], - "created": "2019-07-02", - "last_modified": "2019-07-02", - "in_metadata": { - "display_name": "Chukchi", - "display": true, - "lang": "ckt", - "format": "custom", - "delimiter": "", - "case_insensitive": true - }, - "out_metadata": { - "lang": "ckt-ipa", - "format": "ipa", - "delimiter": "" - }, - "map": [ - {"in":"ʼ", "out": "ʔ"}, - {"in":"а", "out": "a"}, - {"in":"в", "out": "w"}, - {"in":"г", "out": "ɣ"}, - {"in":"е", "out": "e"}, - {"in":"и", "out": "i"}, - {"in":"й", "out": "j"}, - {"in":"к", "out": "k"}, - {"in":"м", "out": "m"}, - {"in":"н", "out": "n"}, - {"in":"о", "out": "o"}, - {"in":"п", "out": "p"}, - {"in":"р", "out": "ɾ"}, - {"in":"с", "out": "s"}, - {"in":"ч", "out": "s"}, - {"in":"т", "out": "t"}, - {"in":"у", "out": "u"}, - {"in":"ъ", "out": "ʔ"}, - {"in":"ы", "out": "ə"}, - {"in":"ь", "out": "ʔ"}, - {"in":"э", "out": "e"}, - {"in":"ю", "out": "u"}, - {"in":"я", "out": "a"}, - {"in":"ё", "out": "o"}, - {"in":"ӄ", "out": "q"}, - {"in":"ӈ", "out": "ŋ"}, - {"in":"ԓ", "out": "ɬ"} - ] -} diff --git a/readalongs/lang/eng/eng_ipa_to_arpabet.backup.json b/readalongs/lang/eng/eng_ipa_to_arpabet.backup.json deleted file mode 100644 index 6086c7d9..00000000 --- a/readalongs/lang/eng/eng_ipa_to_arpabet.backup.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "type": "mapping", - "authors": ["Patrick Littell"], - "created": "2019-02-13", - "last_modified": "2019-02-15", - "in_metadata": { - "lang": "eng-ipa", - "format": "ipa", - "delimiter": "" - }, - "out_metadata": { - "lang": "eng-arpabet", - "format": "arpabet", - "delimiter": " " - }, - "map": [ - { "in": "ɑ", "out": "AA" }, - { "in": "ɑ̃", "out": "AA N" }, - { "in": "æ", "out": "AE" }, - { "in": "æ̃", "out": "AE N" }, - { "in": "ʌ", "out": "AH" }, - { "in": "ʌ̃", "out": "AH N" }, - { "in": "ɔ", "out": "AO" }, - { "in": "ɔ̃", "out": "AO N" }, - { "in": "aʊ", "out": "AW" }, - { "in": "ə", "out": "AH" }, - { "in": "aɪ", "out": "AY" }, - { "in": "ɛ", "out": "EH" }, - { "in": "ɛ̃", "out": "EH N" }, - { "in": "ɜ˞", "out": "ER" }, - { "in": "eɪ", "out": "EY" }, - { "in": "eː", "out": "EY" }, - { "in": "ej", "out": "EY" }, - { "in": "ẽ", "out": "EY N" }, - { 
"in": "ẽː", "out": "EY N" }, - { "in": "ɪ", "out": "IH" }, - { "in": "ɪ̃", "out": "IH N" }, - { "in": "ɨ", "out": "IX" }, - { "in": "i", "out": "IY" }, - { "in": "ĩ", "out": "IY N" }, - { "in": "oʊ", "out": "OW" }, - { "in": "ow", "out": "OW" }, - { "in": "oː", "out": "OW" }, - { "in": "õ", "out": "OW N" }, - { "in": "õː", "out": "OW N" }, - { "in": "ɔɪ", "out": "OY" }, - { "in": "ʊ", "out": "UH" }, - { "in": "ʊ̃", "out": "UH N" }, - { "in": "u", "out": "UW" }, - { "in": "ũ", "out": "UW N" }, - { "in": "b", "out": "B" }, - { "in": "tʃ", "out": "CH" }, - { "in": "t͡ʃ", "out": "CH" }, - { "in": "d", "out": "D" }, - { "in": "ð", "out": "DH" }, - { "in": "ɾ", "out": "D" }, - { "in": "l̩", "out": "EL" }, - { "in": "m̩", "out": "EM" }, - { "in": "n̩", "out": "EN" }, - { "in": "f", "out": "F" }, - { "in": "ɡ", "out": "G" }, - { "in": "h", "out": "HH" }, - { "in": "dʒ", "out": "JH" }, - { "in": "k", "out": "K" }, - { "in": "l", "out": "L" }, - { "in": "m", "out": "M" }, - { "in": "n", "out": "N" }, - { "in": "ŋ", "out": "NG" }, - { "in": "ɾ̃", "out": "NX" }, - { "in": "p", "out": "P" }, - { "in": "ʔ", "out": "HH" }, - { "in": "ɹ", "out": "R" }, - { "in": "s", "out": "S" }, - { "in": "ʃ", "out": "SH" }, - { "in": "t", "out": "T" }, - { "in": "θ", "out": "TH" }, - { "in": "v", "out": "V" }, - { "in": "w", "out": "W" }, - { "in": "ʍ", "out": "WH" }, - { "in": "j", "out": "Y" }, - { "in": "z", "out": "Z" }, - { "in": "ʒ", "out": "ZH" } - ] -} diff --git a/readalongs/log.py b/readalongs/log.py index 96e1a59a..b342c276 100644 --- a/readalongs/log.py +++ b/readalongs/log.py @@ -1,24 +1,16 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -####################################################################### -# -# log.py -# -# Setup a logger that has colours! -# -####################################################################### +""" +log.py: Setup a logger that has colours! +""" import logging import coloredlogs -FIELD_STYLES = dict(levelname=dict(color="green", bold=coloredlogs.CAN_USE_BOLD_FONT),) +FIELD_STYLES = dict(levelname=dict(color="green", bold=coloredlogs.CAN_USE_BOLD_FONT)) def setup_logger(name): - """ Create logger and configure with cool colors! - """ + """Create logger and configure with cool colors!""" logging.basicConfig(level=logging.INFO) logger = logging.getLogger(name) diff --git a/readalongs/portable_tempfile.py b/readalongs/portable_tempfile.py index e643e17e..45c26ec4 100644 --- a/readalongs/portable_tempfile.py +++ b/readalongs/portable_tempfile.py @@ -11,7 +11,7 @@ class _PortableNamedTemporaryFileWrapper: - """ Wrapper object around the real NamedTemporaryFile that forwards calls as needed + """Wrapper object around the real NamedTemporaryFile that forwards calls as needed The difference with NamedTemporaryFile is that we cleanup on exit and del, rather than on close. @@ -54,7 +54,7 @@ def cleanup(self): def PortableNamedTemporaryFile( mode="w+b", suffix="", prefix=template, dir=None, delete=True ): - """ Portable named temporary file that works on Windows, Linux and Mac. + """Portable named temporary file that works on Windows, Linux and Mac. This class wraps tempfile.NamedTemporaryFile() with a portable behaviour that works on Windows, Linux and Mac as we need it to. 
diff --git a/readalongs/run.py b/readalongs/run.py index f57a7885..12edbf5b 100644 --- a/readalongs/run.py +++ b/readalongs/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ##################################################################################### @@ -22,8 +22,7 @@ def run(): - """ Run app using SocketIO - """ + """Run app using SocketIO""" socketio.run(app) diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/README b/readalongs/static/model/cmusphinx-en-us-5.2/README new file mode 100644 index 00000000..53ee8b32 --- /dev/null +++ b/readalongs/static/model/cmusphinx-en-us-5.2/README @@ -0,0 +1,34 @@ +/* ==================================================================== + * Copyright (c) 2015 Alpha Cephei Inc. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND. + * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,. + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC. + * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT. + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,. + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY. + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT. + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE. + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ==================================================================== + * + */ + +This directory contains generic US english acoustic model trained with +latest sphinxtrain. 
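For context, here is a rough sketch of how an aligner can point SoundSwallower at a bundled acoustic model directory like the one added below. This is a hedged illustration, not code from the patch: the dict-style Config access and the "hmm" and "samprate" keys mirror what align.py does elsewhere in this diff, but constructing soundswallower.Config() directly and the 16000 sample rate are assumptions:

    # Illustrative sketch; not part of this patch.
    import os
    import soundswallower

    model_dir = os.path.join("readalongs", "static", "model", "cmusphinx-en-us-5.2")
    asr_config = soundswallower.Config()
    asr_config["hmm"] = model_dir   # acoustic model dir: means, variances, mdef, ...
    asr_config["samprate"] = 16000  # assumed; must match the audio fed to the decoder
    decoder = soundswallower.Decoder(asr_config)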
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/feat_params.json b/readalongs/static/model/cmusphinx-en-us-5.2/feat_params.json
new file mode 100644
index 00000000..85120078
--- /dev/null
+++ b/readalongs/static/model/cmusphinx-en-us-5.2/feat_params.json
@@ -0,0 +1,11 @@
+{
+    "lowerf": 130,
+    "upperf": 6800,
+    "nfilt": 25,
+    "transform": "dct",
+    "lifter": 22,
+    "feat": "1s_c_d_dd",
+    "cmn": "current",
+    "varnorm": false,
+    "cmninit": "40,3,-1"
+}
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/feature_transform b/readalongs/static/model/cmusphinx-en-us-5.2/feature_transform
new file mode 100644
index 00000000..78b4f937
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/feature_transform differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/mdef b/readalongs/static/model/cmusphinx-en-us-5.2/mdef
new file mode 100644
index 00000000..ed57c16d
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/mdef differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/mdef.ci b/readalongs/static/model/cmusphinx-en-us-5.2/mdef.ci
new file mode 100644
index 00000000..7bc2bc75
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/mdef.ci differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/means b/readalongs/static/model/cmusphinx-en-us-5.2/means
new file mode 100644
index 00000000..82a41221
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/means differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/mixture_weights b/readalongs/static/model/cmusphinx-en-us-5.2/mixture_weights
new file mode 100644
index 00000000..04a06a75
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/mixture_weights differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/noisedict.txt b/readalongs/static/model/cmusphinx-en-us-5.2/noisedict.txt
new file mode 100644
index 00000000..00e4c908
--- /dev/null
+++ b/readalongs/static/model/cmusphinx-en-us-5.2/noisedict.txt
@@ -0,0 +1,9 @@
+<s> SIL
+</s> SIL
+<sil> SIL
+[BREATH] +BREATH+
+[COUGH] +COUGH+
+[NOISE] +NOISE+
+[SMACK] +SMACK+
+[UH] +UH+
+[UM] +UM+
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/transition_matrices b/readalongs/static/model/cmusphinx-en-us-5.2/transition_matrices
new file mode 100644
index 00000000..806ff991
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/transition_matrices differ
diff --git a/readalongs/static/model/cmusphinx-en-us-5.2/variances b/readalongs/static/model/cmusphinx-en-us-5.2/variances
new file mode 100644
index 00000000..4c6ffbb2
Binary files /dev/null and b/readalongs/static/model/cmusphinx-en-us-5.2/variances differ
diff --git a/readalongs/templates/base.html b/readalongs/templates/base.html
index e22c84f6..123fee22 100644
--- a/readalongs/templates/base.html
+++ b/readalongs/templates/base.html
@@ -33,8 +33,8 @@
 
-
-
+
+
 
diff --git a/readalongs/templates/export.html b/readalongs/templates/export.html
index ce538f57..8dc96d30 100644
--- a/readalongs/templates/export.html
+++ b/readalongs/templates/export.html
@@ -10,8 +10,8 @@

Code

Here's a snippet of code to embed in your site!

- <script type="module" src='https://unpkg.com/@roedoejet/readalong/dist/read-along/read-along.esm.js'></script> - <script nomodule src='https://unpkg.com/@roedoejet/readalong/dist/read-along/read-along.js'></script> + <script type="module" src='https://unpkg.com/@roedoejet/readalong@^0.1.6/dist/read-along/read-along.esm.js'></script> + <script nomodule src='https://unpkg.com/@roedoejet/readalong@^0.1.6/dist/read-along/read-along.js'></script> <read-along text="aligned.xml" alignment="aligned.smil" audio="aligned{{data.audio_ext}}"></read-along>
@@ -31,6 +31,11 @@

ReadAlong

Log

Here's a log for debugging.

{{data.log}}

+ {% if 'log_lines' in data %} + {% for line in data.log_lines %} +

{{line}}

+ {% endfor %} + {% endif %} {% endif %} diff --git a/readalongs/text/add_elements_to_xml.py b/readalongs/text/add_elements_to_xml.py index 2a0cc10f..589345f1 100644 --- a/readalongs/text/add_elements_to_xml.py +++ b/readalongs/text/add_elements_to_xml.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ################################################### # # add_elements_to_xml.py diff --git a/readalongs/text/add_ids_to_xml.py b/readalongs/text/add_ids_to_xml.py index a2a81de9..ce0148f2 100644 --- a/readalongs/text/add_ids_to_xml.py +++ b/readalongs/text/add_ids_to_xml.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ################################################### # # add_ids_to_xml.py @@ -16,15 +13,12 @@ # ################################################### -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse from collections import defaultdict from copy import deepcopy from lxml import etree -from readalongs.text.util import is_do_not_align, load_xml, save_xml +from readalongs.text.util import is_do_not_align TAG_TO_ID = { "text": "t", @@ -42,7 +36,7 @@ def add_ids_aux(element: etree, ids: defaultdict, parent_id: str = "") -> defaultdict: - """ Add ids to xml element + """Add ids to xml element Args: element (etree): Element to add ids to @@ -119,19 +113,3 @@ def add_ids(xml: etree) -> etree: continue ids = add_ids_aux(child, ids) return xml - - -def go(input_filename: str, output_filename: str) -> None: - xml = load_xml(input_filename) - xml = add_ids(xml) - save_xml(output_filename, xml) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert XML to another orthography while preserving tags" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output", type=str, help="Output XML") - args = parser.parse_args() - go(args.input, args.output) diff --git a/readalongs/text/convert_xml.py b/readalongs/text/convert_xml.py index 0672e353..c34c4179 100644 --- a/readalongs/text/convert_xml.py +++ b/readalongs/text/convert_xml.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - ########################################################################### # # convert_xml.py @@ -35,7 +33,6 @@ # TODO: Document functions ############################################################################ -import argparse import copy import os import re @@ -43,22 +40,39 @@ from readalongs.log import LOGGER from readalongs.text.lexicon_g2p import getLexiconG2P from readalongs.text.lexicon_g2p_mappings import __file__ as LEXICON_PATH -from readalongs.text.util import ( - get_attrib_recursive, - get_lang_attrib, - load_xml, - save_xml, -) -from readalongs.util import getLangs +from readalongs.text.util import get_attrib_recursive, get_word_text, iterate_over_text + + +def get_same_language_units(element): + """Find all the text in element, grouped by units of the same language + + Returns: list of (lang, text) pairs + """ + same_language_units = [] + current_sublang, current_subword = None, None + for sublang, subword in iterate_over_text(element): + sublang = sublang.strip() if sublang else "" + if current_subword and sublang == current_sublang: + current_subword += subword + else: + if current_subword: + same_language_units.append((current_sublang, current_subword)) + current_sublang, current_subword = sublang, subword + if current_subword: + same_language_units.append((current_sublang, current_subword)) + return same_language_units def convert_words( # noqa: 
C901 - xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False, + xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False ): """Helper for convert_xml(), with the same Args and Return values, except xml is modified in place returned itself, instead of making a copy. """ + if output_orthography != "eng-arpabet": + LOGGER.info(f"output_orthography={output_orthography}") + # Defer expensive import of g2p to do them only if and when they are needed from g2p.mappings.langs.utils import is_arpabet @@ -94,7 +108,10 @@ def convert_word(word: str, lang: str): # Note: adding eng_ prefix to vars that are used in both blocks to make mypy # happy. Since the two sides of the if and in the same scope, it complains about # type checking otherwise. - assert output_orthography == "eng-arpabet" + if "eng-arpabet" not in output_orthography: + raise ValueError( + f'Cannot g2p "eng" to output orthography "{output_orthography}".' + ) eng_converter = getLexiconG2P( os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json") ) @@ -112,16 +129,16 @@ def convert_word(word: str, lang: str): converter = make_g2p(lang, output_orthography) except InvalidLanguageCode as e: raise ValueError( - f'Could not g2p "{word}" as "{lang}": invalid language code. ' - f"Use one of {getLangs()[0]}" + f'Could not g2p "{word}" from "{lang}" to "{output_orthography}": {e} ' + f'\nRun "readalongs langs" to list languages supported by ReadAlongs Studio.' ) from e except NoPath as e: raise ValueError( - f'Count not g2p "{word}" as "{lang}": no path to "{output_orthography}". ' - f"Use one of {getLangs()[0]}" + f'Could not g2p "{word}": no path from "{lang}" to "{output_orthography}".' + f'\nRun "readalongs langs" to list languages supported by ReadAlongs Studio.' ) from e tg = converter(word) - text = tg.output_string.strip() + text = tg.output_string valid = converter.check(tg, shallow=True) if not valid and verbose_warnings: converter.check(tg, shallow=False, display_warnings=verbose_warnings) @@ -134,55 +151,66 @@ def convert_word(word: str, lang: str): arpabet = word.attrib["ARPABET"] if not is_arpabet(arpabet): LOGGER.warning( - f'Pre-g2p\'d text "{word.text}" has invalid ARPABET conversion "{arpabet}"' + f'Pre-g2p\'d text "{get_word_text(word)}" has invalid ARPABET conversion "{arpabet}"' ) all_g2p_valid = False continue # only convert text within words - if not word.text: + same_language_units = get_same_language_units(word) + if not same_language_units: continue - g2p_lang = get_lang_attrib(word) or "und" # default: Undetermined - g2p_fallbacks = get_attrib_recursive(word, "fallback-langs") - text_to_g2p = word.text - try: - g2p_text, valid = convert_word(text_to_g2p, g2p_lang.strip()) - if not valid: - # This is where we apply the g2p cascade - for lang in re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []: - LOGGER.warning( - f'Could not g2p "{text_to_g2p}" as {g2p_lang}. ' - f"Trying fallback: {lang}." - ) - g2p_lang = lang.strip() - g2p_text, valid = convert_word(text_to_g2p, g2p_lang) - if valid: - word.attrib["effective-g2p-lang"] = g2p_lang - break - else: - all_g2p_valid = False - LOGGER.warning( - f'No valid g2p conversion found for "{text_to_g2p}". ' - f"Check its orthography and language code, " - f"or pick suitable g2p fallback languages." - ) - - # Save the g2p_text from the last conversion attemps, even when - # it's not valid, so it's in the g2p output if the user wants to - # inspect it manually. 
- word.attrib["ARPABET"] = g2p_text - - except ValueError as e: - LOGGER.warning( - f'Could not g2p "{text_to_g2p}" due to an incorrect ' - f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}' - ) - all_g2p_valid = False + all_arpabet = "" + for lang, text in same_language_units: + g2p_lang = lang or "und" # default: Undetermined + g2p_fallbacks = get_attrib_recursive(word, "fallback-langs") + text_to_g2p = text.strip() + try: + g2p_text, valid = convert_word(text_to_g2p, g2p_lang) + if not valid: + # This is where we apply the g2p cascade + for lang in ( + re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else [] + ): + LOGGER.warning( + f'Could not g2p "{text_to_g2p}" as {g2p_lang}. ' + f"Trying fallback: {lang}." + ) + g2p_lang = lang.strip() + g2p_text, valid = convert_word(text_to_g2p, g2p_lang) + if valid: + word.attrib["effective-g2p-lang"] = g2p_lang + break + else: + all_g2p_valid = False + LOGGER.warning( + f'No valid g2p conversion found for "{text_to_g2p}". ' + f"Check its orthography and language code, " + f"or pick suitable g2p fallback languages." + ) + + # Save the g2p_text from the last conversion attemps, even when + # it's not valid, so it's in the g2p output if the user wants to + # inspect it manually. + + all_arpabet = all_arpabet + " " + g2p_text.strip() + + except ValueError as e: + LOGGER.warning( + f'Could not g2p "{text_to_g2p}" due to an incorrect ' + f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}' + ) + all_g2p_valid = False + + if not verbose_warnings: + break + + word.attrib["ARPABET"] = all_arpabet.strip() return xml, all_g2p_valid def convert_xml( - xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False, + xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False ): """Convert all the words in XML though g2p, putting the results in attribute ARPABET @@ -203,33 +231,3 @@ def convert_xml( xml_copy, word_unit, output_orthography, verbose_warnings ) return xml_copy, valid - - -def go( - input_filename, output_filename, word_unit="w", output_orthography="eng-arpabet" -): - xml = load_xml(input_filename) - converted_xml = convert_xml(xml, word_unit, output_orthography) - save_xml(output_filename, converted_xml) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert XML to another orthography while preserving tags" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output", type=str, help="Output XML") - parser.add_argument( - "--word_unit", - type=str, - default="w", - help="XML element that " 'represents a word (default: "w")', - ) - parser.add_argument( - "--out_orth", - type=str, - default="eng-arpabet", - help='Output orthography (default: "eng-arpabet")', - ) - args = parser.parse_args() - go(args.input, args.output, args.word_unit, args.out_orth) diff --git a/readalongs/text/end_to_end.py b/readalongs/text/end_to_end.py index b3d3d2bd..5229eef2 100644 --- a/readalongs/text/end_to_end.py +++ b/readalongs/text/end_to_end.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ######################################################################### diff --git a/readalongs/text/lexicon_g2p.py b/readalongs/text/lexicon_g2p.py index b8fdf6b0..caeb0072 100644 --- a/readalongs/text/lexicon_g2p.py +++ b/readalongs/text/lexicon_g2p.py @@ -1,7 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - ####################################################################### # # 
lexicon_g2p.py @@ -25,8 +21,6 @@ # TODO: Move this to the G2P library ###################################################################### -from __future__ import division, print_function, unicode_literals - import os from collections import defaultdict from unicodedata import normalize diff --git a/readalongs/text/make_dict.py b/readalongs/text/make_dict.py index a4df04d9..f2eff571 100644 --- a/readalongs/text/make_dict.py +++ b/readalongs/text/make_dict.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ################################################## # # make_dict.py @@ -12,15 +9,9 @@ # ################################################## - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse - import chevron from readalongs.log import LOGGER -from readalongs.text.util import load_xml, save_txt DICT_TEMPLATE = """{{#items}} {{id}}\t{{pronunciation}} @@ -28,8 +19,7 @@ """ -def make_dict(word_elements, input_filename, unit="m"): - data = {"items": []} +def generate_dict_entries(word_elements, input_filename, unit): nwords = 0 for e in word_elements: if "id" not in e.attrib: @@ -40,29 +30,25 @@ def make_dict(word_elements, input_filename, unit="m"): if not text: continue nwords += 1 - data["items"].append({"id": e.attrib["id"], "pronunciation": text}) + yield e.attrib["id"], text if nwords == 0: raise RuntimeError("No words in dictionary!") - return chevron.render(DICT_TEMPLATE, data) -def go(input_filename, output_filename, unit): - xml = load_xml(input_filename) - dct = make_dict(xml.xpath(".//" + unit), input_filename, unit) - save_txt(output_filename, dct) +def make_dict_object(word_elements, input_filename="'in-memory'", unit="m"): + return { + word_id: text + for word_id, text in generate_dict_entries(word_elements, input_filename, unit) + } -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Make a pronunciation dictionary from a G2P'd XML file" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output", type=str, help="Output .dict file") - parser.add_argument( - "--unit", - type=str, - default="m", - help="XML tag of the unit of analysis " '(e.g. "w" for word, "m" for morpheme)', - ) - args = parser.parse_args() - go(args.input, args.output, args.unit) +def make_dict(word_elements, input_filename="'in-memory'", unit="m"): + data = { + "items": [ + {"id": word_id, "pronunciation": text} + for word_id, text in generate_dict_entries( + word_elements, input_filename, unit + ) + ] + } + return chevron.render(DICT_TEMPLATE, data) diff --git a/readalongs/text/make_fsg.py b/readalongs/text/make_fsg.py index e629c3e9..6a1ad4d6 100644 --- a/readalongs/text/make_fsg.py +++ b/readalongs/text/make_fsg.py @@ -1,28 +1,19 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -################################################## -# -# make_fsg.py -# -# This module takes a text file, marked up with -# units (e.g. w for word, m for morpheme) and ids -# and converted to IPA, and outputs a FSG -# file for processing by PocketSphinx. -# -################################################## +""" +make_fsg.py generates an FSG or a JSGF for a marked-up text file. +This module takes a text file, marked up with units (e.g.
w for word, m for +morpheme) and ids and converted to IPA, and outputs an FSG or a JSGF +file for processing by PocketSphinx, SoundSwallower or SoundSwallower.js +""" -from __future__ import absolute_import, division, print_function, unicode_literals -import argparse +import datetime import os import chevron from slugify import slugify from readalongs.log import LOGGER -from readalongs.text.util import load_xml, save_txt FSG_TEMPLATE = """FSG_BEGIN {{name}} NUM_STATES {{num_states}} @@ -36,55 +27,78 @@ """ -def make_fsg(word_elements, filename): - name = slugify(os.path.splitext(os.path.basename(filename))[0]) - data = { - "name": name, # If name includes special characters, pocketsphinx throws a RuntimeError: new_Decoder returned -1 - "states": [], - "num_states": 0, - } +def get_ids(word_elements: list): + """Extract the sequence of ids from word_elements with both an id and + an ARPABET pronunciation. + + Words with empty ARPABET are skipped because soundswallower and + pocketsphinx will error out if we give them words with an empty pronunciation + key. In general, what *would* it mean to align sounds to an empty sequence + of phonemes, after all? + + Yields: + text_ids + """ for e in word_elements: if "id" not in e.attrib: # don't put in elements with no id continue - if not e.text or not e.text.strip(): - LOGGER.warning("No text in node %s", e.attrib["id"]) + if not e.attrib.get("ARPABET", "").strip(): + LOGGER.warning("Skipping node %s with no ARPABET", e.attrib["id"]) continue - text = e.text.strip() - # if not e.text.strip(): # don't put in elements with no text - # continue - data["states"].append( - { - "id": e.attrib["id"] if text else "", - "current": data["num_states"], - "next": data["num_states"] + 1, - } - ) - data["num_states"] += 1 - - data["final_state"] = data["num_states"] - data["num_states"] += 1 + yield e.attrib["id"] + + +def make_fsg(word_elements: list, filename: str = "'in-memory'") -> str: + """Generate an FSG for the given word elements + + Returns: the text contents of the FSG file for processing by PocketSphinx + """ + + states = [ + {"id": text_id, "current": i, "next": i + 1} + for i, text_id in enumerate(get_ids(word_elements)) + ] + + data = { + # If name includes special characters, pocketsphinx throws a RuntimeError: + # new_Decoder returned -1, so pass it through slugify() first + "name": slugify(os.path.splitext(os.path.basename(filename))[0]), + "states": states, + "final_state": len(states), + "num_states": len(states) + 1, + } return chevron.render(FSG_TEMPLATE, data) -def go(input_filename, output_filename, unit): - xml = load_xml(input_filename) - fsg = make_fsg(xml.xpath(".//" + unit), input_filename, unit) - save_txt(output_filename, fsg) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Make an FSG grammar from an XML file with IDs" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output_fsg", type=str, help="Output .fsg file") - parser.add_argument( - "--unit", - type=str, - default="m", - help="XML tag of the unit of analysis " '(e.g. "w" for word, "m" for morpheme)', - ) - args = parser.parse_args() - go(args.input, args.output_fsg, args.unit) +JSGF_TEMPLATE = """#JSGF 1.0 UTF-8; +grammar {{name}}; + +/** + * Auto-generated JSGF grammar for the document {{name}}.
+ * + * @author Automatically generated by make_jsgf + * @version 1.0 + * @since {{date}} + */ + +public <{{name}}> = {{#words}} {{id}} {{/words}} ; +""" + + +def make_jsgf(word_elements: list, filename: str = "'in-memory'") -> str: + """Generate a JSGF for the given word elements + + JSGF = Java Speech Grammar Format + + Returns: + the text contents of the JSGF file for processing by SoundSwallower.js + """ + data = { + "name": os.path.splitext(os.path.basename(filename))[0], + "date": datetime.datetime.today().strftime("%Y-%m-%d"), + "words": [{"id": text_id} for text_id in get_ids(word_elements)], + } + + return chevron.render(JSGF_TEMPLATE, data) diff --git a/readalongs/text/make_jsgf.py b/readalongs/text/make_jsgf.py deleted file mode 100644 index 538cffab..00000000 --- a/readalongs/text/make_jsgf.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -################################################## -# -# make_fsg.py -# -# This module takes a text file, marked up with -# units (e.g. w for word, m for morpheme) and ids -# and converted to IPA, and outputs a FSG -# file for processing by PocketSphinx. -# -# TODO: AP: This docstring seems to have been copied from make_fsg -# and doesn't appear to be used. Do we need this file? -################################################## - - -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import datetime -import os - -import chevron - -from readalongs.text.util import load_xml, save_txt - -JSGF_TEMPLATE = """#JSGF 1.0 UTF-8; -grammar {{name}}; - -/** - * Auto-generated JSGF grammar for the document {{name}}. - * - * @author Automatically generated by make_jsgf.py - * @version 1.0 - * @since {{date}} - */ - -public <{{name}}> = {{#words}} {{id}} {{/words}} ; -""" - - -def make_jsgf(xml, filename, unit="m"): - data = { - "name": os.path.splitext(os.path.basename(filename))[0], - "date": datetime.datetime.today().strftime("%Y-%m-%d"), - "words": [], - } - - for e in xml.xpath(".//" + unit): - if "id" not in e.attrib: # don't put in elements with no id - continue - text = e.text.strip() - if text == "": # don't put in elements with no text - continue - id = e.attrib["id"] - data["words"].append({"id": id}) - - return chevron.render(JSGF_TEMPLATE, data) - - -def go(input_filename, output_filename, unit): - xml = load_xml(input_filename) - jsgf = make_jsgf(xml, input_filename, unit) - save_txt(output_filename, jsgf) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Make an JSGF grammar from an XML file with IDs" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output_jsgf", type=str, help="Output .jsgf file") - parser.add_argument( - "--unit", - type=str, - default="m", - help="XML tag of the unit of analysis " '(e.g.
"w" for word, "m" for morpheme)', - ) - args = parser.parse_args() - go(args.input, args.output_fsg, args.unit) diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py index cb6b7be8..75691eea 100644 --- a/readalongs/text/make_package.py +++ b/readalongs/text/make_package.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ################################################### # # make_package.py @@ -24,8 +21,8 @@ from readalongs.log import LOGGER -JS_BUNDLE_URL = "https://unpkg.com/@roedoejet/readalong/dist/bundle.js" -FONTS_BUNDLE_URL = "https://unpkg.com/@roedoejet/readalong/dist/fonts.b64.css" +JS_BUNDLE_URL = "https://unpkg.com/@roedoejet/readalong@^0.1.6/dist/bundle.js" +FONTS_BUNDLE_URL = "https://unpkg.com/@roedoejet/readalong@^0.1.6/dist/fonts.b64.css" BASIC_HTML = """ @@ -60,28 +57,34 @@ def encode_from_path(path: str) -> str: with open(path, "rb") as f: path_bytes = f.read() - if path.endswith("xml"): + if str(path).endswith("xml"): root = etree.fromstring(path_bytes) for img in root.xpath("//graphic"): url = img.get("url") - res = requests.get(url) if url.startswith("http") else None + if url.startswith("http"): + try: + request_result = requests.get(url) + except requests.exceptions.RequestException: + request_result = None + else: + request_result = None mime = guess_type(url) if os.path.exists(url): with open(url, "rb") as f: img_bytes = f.read() img_b64 = str(b64encode(img_bytes), encoding="utf8") - elif res and res.status_code == 200: - img_b64 = str(b64encode(res.content), encoding="utf8") + elif request_result and request_result.status_code == 200: + img_b64 = str(b64encode(request_result.content), encoding="utf8") else: - LOGGER.warn( - f"The image declared at {url} could not be found. Please check that it exists." + LOGGER.warning( + f"The image declared at {url} could not be found. Please check that it exists or that the URL is valid." ) continue img.attrib["url"] = f"data:{mime[0]};base64,{img_b64}" path_bytes = etree.tostring(root) b64 = str(b64encode(path_bytes), encoding="utf8") mime = guess_type(path) - if path.endswith( + if str(path).endswith( ".m4a" ): # hack to get around guess_type choosing the wrong mime type for .m4a files # TODO: Check other popular audio formats, .wav, .mp3, .ogg, etc... @@ -92,7 +95,7 @@ def encode_from_path(path: str) -> str: ) # Hack: until we properly extract audio from video files, force any video-based mime type to be read as audio else: mime_type = "application" - LOGGER.warn( + LOGGER.warning( f"We could not guess the mime type of file at {path}, we will try the generic mime type 'application', but this might not work with some files" ) return f"data:{mime_type};base64,{b64}" @@ -112,7 +115,7 @@ def create_web_component_html( js = requests.get(JS_BUNDLE_URL) fonts = requests.get(FONTS_BUNDLE_URL) if js.status_code != 200: - LOGGER.warn( + LOGGER.warning( f"Sorry, the JavaScript bundle that is supposed to be at {JS_BUNDLE_URL} returned a {js.status_code}. Your ReadAlong will be bundled using a version that may not be up-to-date. Please check your internet connection." ) with open( @@ -122,7 +125,7 @@ def create_web_component_html( else: js_raw = js.text if fonts.status_code != 200: - LOGGER.warn( + LOGGER.warning( f"Sorry, the fonts bundle that is supposed to be at {FONTS_BUNDLE_URL} returned a {fonts.status_code}. Your ReadAlong will be bundled using a version that may not be up-to-date. Please check your internet connection." 
) with open( diff --git a/readalongs/text/make_smil.py b/readalongs/text/make_smil.py index d3d91d7d..a766ad38 100644 --- a/readalongs/text/make_smil.py +++ b/readalongs/text/make_smil.py @@ -1,21 +1,16 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -################################################################### -# -# make_smil.py -# -# Turns alignment into formatted SMIL for ReadAlongs WebComponent -#################################################################### +""" +make_smil.py +Turns alignment into formatted SMIL for ReadAlongs WebComponent +""" -import argparse +from typing import List import chevron +from lxml import etree -from readalongs.text.util import save_txt - -SMIL_TEMPLATE = """ +SMIL_TEMPLATE = """\ + + {{#words}} @@ -35,59 +30,72 @@ END_SUBIDX = 3 -def parse_hypseg(text): - """Parse hypseg alignments file and return alignements - - Args: - text(str): hypseg text - - Returns: - dict: a dictionary of all start and end points for each word in text - """ - results = {"words": []} - tokens = text.strip().split() - # results["basename"] = tokens[BASENAME_IDX] - start = float(tokens[START_TIME_IDX]) * 0.01 - i = WORDS_IDX - while i < len(tokens): - word = tokens[i + WORD_SUBIDX] - end = tokens[i + END_SUBIDX] - end = float(end) * 0.01 - if word != "": - results["words"].append({"id": word, "start": start, "end": end}) - start = end - i += WORD_SPAN - return results - - -def make_smil(text_path: str, audio_path: str, results: dict) -> str: +def make_smil(text_path: str, audio_path: str, words: List[dict]) -> str: """Actually render the SMIL + words is a list of dicts with these elements: + { + "id": word id (str), + "start": word start time in seconds (float), + "end": word end time in seconds (float), + } + Args: - text_path(str): path to text - audio_path(str): path to audio - results(dict): all alignements + text_path (str): path to text + audio_path (str): path to audio + words (List[dict]): all alignments Returns: str: formatted SMIL """ - results["text_path"] = text_path - results["audio_path"] = audio_path - return chevron.render(SMIL_TEMPLATE, results) + return chevron.render( + SMIL_TEMPLATE, + {"text_path": text_path, "audio_path": audio_path, "words": words}, + ) + +def parse_smil(formatted_smil: str) -> List[dict]: + """Extract the list of words and their alignment from a SMIL file's content. -def go(seg_path, text_path, audio_path, output_path): - results = make_smil(text_path, audio_path, parse_hypseg(seg_path)) - save_txt(output_path, results) + Args: + formatted_smil (str): the raw, unparsed XML content of the .smil file + Returns: + List[dict]: a list of dicts with these elements: + { + "id": word id (str), + "start": word start time in seconds (float), + "end": word end time in seconds (float), + } + Raises: + ValueError if there is a problem parsing formatted_smil as valid SMIL + """ -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert XML to another orthography while preserving tags" - ) - parser.add_argument("input_seg", type=str, help="Input hypseg file") - parser.add_argument("text_path", type=str, help="Text filename") - parser.add_argument("audio_path", type=str, help="Audio filename") - parser.add_argument("output", type=str, help="Output SMIL file") - args = parser.parse_args() - go(args.input_seg, args.text_path, args.audio_path, args.output) + please_msg = "Please make sure your SMIL file is valid."
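+ # For reference, a minimal sketch of the SMIL shape this parser expects,
+ # using the namespace and attributes read below; the id, file names and
+ # clip times are illustrative only:
+ #
+ # <smil xmlns="http://www.w3.org/ns/SMIL">
+ #   <body>
+ #     <par id="par-w1">
+ #       <text src="sample.xml#w1"/>
+ #       <audio src="sample.mp3" clipBegin="0.14" clipEnd="0.64"/>
+ #     </par>
+ #   </body>
+ # </smil>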
+ + try: + xml = etree.fromstring(formatted_smil) + except etree.ParseError as e: + raise ValueError(f"Invalid SMIL file: {e}. {please_msg}") from e + ns = {"smil": "http://www.w3.org/ns/SMIL"} + + words = [] + for par_el in xml.xpath(".//smil:par", namespaces=ns): + text_src = par_el.find("smil:text", namespaces=ns).attrib["src"] + _, _, text_id = text_src.partition("#") + if not text_id: + raise ValueError(f"Missing word id. {please_msg}") + audio_el = par_el.find("smil:audio", namespaces=ns) + try: + clip_begin = float(audio_el.attrib["clipBegin"]) + clip_end = float(audio_el.attrib["clipEnd"]) + except KeyError as e: + raise ValueError(f"Missing 'clipBegin' or 'clipEnd'. {please_msg}") from e + except ValueError as e: + raise ValueError( + f"Invalid 'clipBegin' or 'clipEnd': {e}. {please_msg}" + ) from e + + words.append({"id": text_id, "start": clip_begin, "end": clip_end}) + + return words diff --git a/readalongs/text/tokenize_xml.py b/readalongs/text/tokenize_xml.py index a3e06d4d..b53a9227 100644 --- a/readalongs/text/tokenize_xml.py +++ b/readalongs/text/tokenize_xml.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ################################################## # # tokenize_xml.py @@ -30,21 +27,12 @@ ################################################## -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse from copy import deepcopy from lxml import etree from readalongs.log import LOGGER -from readalongs.text.util import ( - get_lang_attrib, - is_do_not_align, - load_xml, - save_xml, - unicode_normalize_xml, -) +from readalongs.text.util import get_lang_attrib, is_do_not_align, unicode_normalize_xml def tokenize_xml_in_place(xml): @@ -57,10 +45,16 @@ etree: tokenized xml """ - from g2p.mappings.tokenizer import get_tokenizer # Defer expensive import + # Defer expensive import, and use the new version, but keep it + # compatible with older versions of g2p for at least a little while. + try: + from g2p import make_tokenizer + except ImportError: + from g2p import get_tokenizer as make_tokenizer def add_word_children(element): """Recursive helper for tokenize_xml_in_place()""" + tag = etree.QName(element.tag).localname nsmap = element.nsmap if hasattr(element, "nsmap") else element.getroot().nsmap if tag in ["w", "teiHeader", "head"]: # don't do anything to existing words!
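The try/except import above is the whole compatibility story: newer g2p releases expose make_tokenizer, while older ones called it get_tokenizer. A minimal sketch of how the deferred tokenizer then gets used, assuming g2p's tokenizer API; the language code and sample sentence are illustrative only:

    try:
        from g2p import make_tokenizer
    except ImportError:  # older g2p releases used the name get_tokenizer
        from g2p import get_tokenizer as make_tokenizer

    tokenizer = make_tokenizer("fra")  # illustrative language code
    for unit in tokenizer.tokenize_text("bonjour le monde"):
        # each unit is a dict flagging whether it is a word or inter-word text
        print(unit["text"], unit["is_word"])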
@@ -78,7 +72,7 @@ def add_word_children(element): new_element.attrib[key] = value lang = get_lang_attrib(element) - tokenizer = get_tokenizer(lang) + tokenizer = make_tokenizer(lang) if element.text: new_element.text = "" for unit in tokenizer.tokenize_text(element.text): @@ -129,19 +123,3 @@ def tokenize_xml(xml): return xml LOGGER.info("Words (<w>) not present; tokenizing") return tokenize_xml_in_place(xml) - - -def go(input_filename, output_filename): - xml = load_xml(input_filename) - xml = tokenize_xml(xml) - save_xml(output_filename, xml) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Convert XML to another orthography while preserving tags" - ) - parser.add_argument("input", type=str, help="Input XML") - parser.add_argument("output", type=str, help="Output XML") - args = parser.parse_args() - go(args.input, args.output) diff --git a/readalongs/text/util.py b/readalongs/text/util.py index f95f44dd..159699a8 100644 --- a/readalongs/text/util.py +++ b/readalongs/text/util.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - ########################################### # # util.py @@ -69,7 +66,27 @@ def get_attrib_recursive(element, *attribs): return None -def get_lang_attrib(element): +def iterate_over_text(element: etree.ElementTree): + """Iterate over all actual text contained within element and its sub-elements + + Yields: + (language_code, text) pairs + """ + lang = get_lang_attrib(element) + if element.text: + yield (lang, element.text) + for child in element: + yield from iterate_over_text(child) + if child.tail: + yield (lang, child.tail) + + +def get_word_text(word_element: etree.ElementTree) -> str: + """Given a word element, extract all its text""" + return "".join(text for _, text in iterate_over_text(word_element)) + + +def get_lang_attrib(element: etree.ElementTree): """Return the xml:lang (in priority) or lang (fallback) attribute from element or its closest ancestor that has either, or None when neither is found.
""" @@ -93,18 +110,18 @@ def load_xml_zip(zip_path, input_path): def load_xml_with_encoding(input_path): - """ etree.fromstring messes up on declared encodings """ + """etree.fromstring messes up on declared encodings""" return etree.parse(input_path) def write_xml(output_filelike, xml): - """ Write XML to already opened file-like object """ + """Write XML to already opened file-like object""" output_filelike.write(etree.tostring(xml, encoding="utf-8", xml_declaration=True)) output_filelike.write("\n".encode("utf-8")) def save_xml(output_path, xml): - """ Save XML to specific PATH """ + """Save XML to specific PATH""" ensure_dirs(output_path) with open(output_path, "wb") as fout: write_xml(fout, xml) @@ -120,7 +137,7 @@ def save_xml_zip(zip_path, output_path, xml): def load_txt(input_path): - with open(input_path, "r", encoding="utf-8") as fin: + with open(input_path, "r", encoding="utf-8-sig") as fin: return fin.read() @@ -144,7 +161,7 @@ def save_txt_zip(zip_path, output_path, txt): def load_json(input_path): - with open(input_path, "r", encoding="utf-8") as fin: + with open(input_path, "r", encoding="utf-8-sig") as fin: return json.load(fin, object_pairs_hook=OrderedDict) @@ -178,32 +195,46 @@ def copy_file_to_zip(zip_path, origin_path, destination_path): - Insert Title Here + {title} - - Insert Title Here Too + + {header} + {subheader} - - + + """ def save_minimal_index_html( - output_path, tokenized_xml_basename, smil_basename, audio_basename + output_path, + tokenized_xml_basename, + smil_basename, + audio_basename, + title, + header, + subheader, + theme, ): with open(output_path, "w", encoding="utf-8") as fout: fout.write( MINIMAL_INDEX_HTML_TEMPLATE.format( - tokenized_xml_basename, smil_basename, audio_basename + title=title, + text=tokenized_xml_basename, + smil=smil_basename, + audio=audio_basename, + theme=theme, + header=header, + subheader=subheader, ) ) @@ -218,7 +249,7 @@ def unicode_normalize_xml(element): def parse_time(time_string: str) -> int: - """ Parse a time stamp in h/m/s(default)/ms or any combination of these units. + """Parse a time stamp in h/m/s(default)/ms or any combination of these units. Args: time_string (str): timestamp, e.g., "0.23s", "5.234" (implied s), "1234 ms", diff --git a/readalongs/util.py b/readalongs/util.py index 4909a539..7cd3d5bb 100644 --- a/readalongs/util.py +++ b/readalongs/util.py @@ -1,6 +1,5 @@ import re from collections.abc import Iterable -from itertools import tee import click @@ -8,16 +7,16 @@ LANG_NAMES = None -def getLangsDeferred() -> Iterable: +def get_langs_deferred() -> Iterable: """Lazilly get the list of language codes supported by g2p library Yields an Iterable in such a way that the g2p database is only loaded when the results are iterated over, rather than when this function is called. """ - yield from getLangs()[0] + yield from get_langs()[0] -def getLangs(): +def get_langs(): """Get the list of language codes and names supported by the g2p library Returns: @@ -39,23 +38,22 @@ def getLangs(): import g2p.mappings.langs as g2p_langs from networkx import has_path - # LANGS_AVAILABLE in g2p lists langs inferred by the directory structure of + # langs_available in g2p lists langs inferred by the directory structure of # g2p/mappings/langs, but in ReadAlongs, we need all input languages to any mappings. # E.g., for Michif, we need to allow crg-dv and crg-tmd, but not crg, which is what - # LANGS_AVAILABLE contains. So we define our own list of languages here. - LANGS_AVAILABLE = [] + # langs_available contains. 
So we define our own list of languages here. + langs_available = [] - # Set up LANG_NAMES hash table for studio UI to - # properly name the dropdown options - LANG_NAMES = {"eng": "English"} + # this will be the set of all langs in g2p + "eng", which we need temporarily + full_lang_names = {"eng": "English"} - for k, v in g2p_langs.LANGS.items(): + for _, v in g2p_langs.LANGS.items(): for mapping in v["mappings"]: # add mapping to names hash table - LANG_NAMES[mapping["in_lang"]] = mapping["language_name"] + full_lang_names[mapping["in_lang"]] = mapping["language_name"] # add input id to all available langs list - if mapping["in_lang"] not in LANGS_AVAILABLE: - LANGS_AVAILABLE.append(mapping["in_lang"]) + if mapping["in_lang"] not in langs_available: + langs_available.append(mapping["in_lang"]) # get the key from all networks in g2p module that have a path to 'eng-arpabet', # which is needed for the readalongs @@ -63,7 +61,7 @@ def getLangs(): # Filter out *-norm and crk-no-symbols, these are just intermediate representations. LANGS = [ x - for x in LANGS_AVAILABLE + for x in langs_available if not x.endswith("-ipa") and not x.endswith("-equiv") and not x.endswith("-no-symbols") @@ -75,37 +73,60 @@ def getLangs(): LANGS += ["eng"] # Sort LANGS so the -h messages list them alphabetically LANGS = sorted(LANGS) + + # Set up LANG_NAMES hash table for studio UI to properly name the dropdown options + LANG_NAMES = {lang_code: full_lang_names[lang_code] for lang_code in LANGS} + return LANGS, LANG_NAMES -class JoinerCallback: +# For backwards compatibility, we keep the old names getLangs and getLangsDeferred around. +# For example, ReadAlongsDesktop +# (https://github.com/tobyatgithub/ReadalongsDesktop) depended on the old name, +# and even when it's updated, it'll be helpful to avoid breaking older versions. +getLangs = get_langs +getLangsDeferred = get_langs_deferred + + +class JoinerCallbackForClick: """Command-line parameter validation for multiple-value options. The values can be repeated by giving the option multiple times on the command line, or by joining them with strings matching joiner_re (colon or comma, arbitrarily mixed, by default). - Matching is case insensitive. + Matching is case insensitive iff drop_case is True. """ - def __init__(self, valid_values: Iterable, joiner_re=r"[,:]"): - self.valid_values = valid_values + def __init__(self, valid_values: Iterable, joiner_re=r"[,:]", drop_case=False): + """Get a joiner callback. + + Args: + valid_values: list of valid values for the multi-value option + joiner_re: regex for how the user may join multiple values + drop_case: when true, processed results will be converted to lowercase + """ + self.valid_values = valid_values # ***do not convert this to a list here!*** self.joiner_re = joiner_re + self.drop_case = drop_case # This signature meets the requirements of click.option's callback parameter: - def __call__(self, _ctx, _param, value_groups): - # Defer potentially expensive expansion of valid_values until we really need it. - self.valid_values, valid_values_iterator = tee(self.valid_values, 2) - lc_valid_values = [valid_value.lower() for valid_value in valid_values_iterator] + def __call__(self, _ctx=None, _param=None, value_groups=()): + # Potentially expensive expansion actually required here, so do it now.
+ self.valid_values = list(self.valid_values) + if self.drop_case: + self.valid_values = [value.lower() for value in self.valid_values] results = [ value.strip() for value_group in value_groups for value in re.split(self.joiner_re, value_group) ] + if self.drop_case: + results = [value.lower() for value in results] for value in results: - if value.lower() not in lc_valid_values: + if value not in self.valid_values: raise click.BadParameter( - f"'{value}' is not one of {self.quoted_list(lc_valid_values)}." + f"'{value}' is not one of {self.quoted_list(self.valid_values)}." ) return results @@ -113,3 +134,20 @@ def __call__(self, _ctx, _param, value_groups): def quoted_list(values): """Display a list of values quoted, for easy reading in error messages.""" return ", ".join("'" + v + "'" for v in values) + + +def get_obsolete_callback_for_click(message): + """Click callback for telling the user an option is obsolete in a helpful way. + + Args: + message (str): message telling the user what the option is replaced by + """ + + def _callback(_ctx, param, value_groups): + if value_groups: + joiner = "' / '" + raise click.BadParameter( + f"The '{joiner.join(param.opts)}' option is obsolete.\n" + message + ) + + return _callback diff --git a/readalongs/views.py b/readalongs/views.py index 3852ac00..9facdef6 100644 --- a/readalongs/views.py +++ b/readalongs/views.py @@ -1,30 +1,25 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -####################################################################### -# -# views.py -# -# Views for ReadAlong Studio web application -# Interactions are described as websocket events and responses -# Corresponding JavaScript is found in readalongs/static/js/main.js -# -####################################################################### +""" +views.py: Views for ReadAlong Studio web application + +Interactions are described as websocket events and responses +Corresponding JavaScript is found in readalongs/static/js/main.js +""" import io import os +import re from datetime import datetime from pathlib import Path -from subprocess import run from tempfile import mkdtemp from zipfile import ZipFile from flask import abort, redirect, render_template, request, send_file, session, url_for from flask_socketio import emit +from readalongs.api import align from readalongs.app import app, socketio from readalongs.log import LOGGER -from readalongs.util import getLangs +from readalongs.util import get_langs ALLOWED_TEXT = ["txt", "xml", "docx"] ALLOWED_AUDIO = ["wav", "mp3"] @@ -44,6 +39,17 @@ def allowed_file(filename: str) -> bool: return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS +def safe_decode(byte_seq: bytes) -> str: + """Convert byte_seq to str if it's valid utf8, otherwise return its str rep + + Does not raise any exceptions: non-utf8 inputs will yield escaped specials. 
+ """ + try: + return byte_seq.decode() + except UnicodeDecodeError: + return str(byte_seq) + + def uploaded_files(dir_path: str) -> dict: """Returns all files that have been uploaded @@ -87,7 +93,7 @@ def update_session_config(**kwargs) -> dict: @app.route("/") def home(): - """ Home View - go to Step 1 which is for uploading files """ + """Home View - go to Step 1 which is for uploading files""" return redirect(url_for("steps", step=1)) @@ -133,14 +139,20 @@ def remove_file(): return redirect(url_for("steps", step=1)) +def option_to_kwargs(option: str) -> str: + if option[0:2] == "--": + option = option[2:] + return option.replace("-", "_") + + @app.route("/step/") def steps(step): - """ Go through steps """ + """Go through steps""" if step == 1: session.clear() session["temp_dir"] = mkdtemp() temp_dir = session["temp_dir"] - langs, lang_names = getLangs() + langs, lang_names = get_langs() return render_template( "upload.html", uploaded=uploaded_files(temp_dir), @@ -150,35 +162,47 @@ def steps(step): return render_template("preview.html") elif step == 3: if "audio" not in session or "text" not in session: - log = "Sorry, it looks like something is wrong with your audio or text. Please try again" + log = "Sorry, it looks like something is wrong with your audio or text. Please try again." + data = {"log": log} + elif session["text"].endswith("txt") and not session.get("config", {}).get( + "lang" + ): + log = "Sorry, the language setting is required for plain text files. Please try again." + data = {"log": log} else: - flags = ["--force-overwrite"] - for option in ["--closed-captioning", "--save-temps", "--text-grid"]: - if session["config"].get(option, False): - flags.append(option) + kwargs = dict() + kwargs["force_overwrite"] = True + kwargs["save_temps"] = session["config"].get("--save-temps", False) + kwargs["output_formats"] = [] + if session["config"].get("--closed-captioning", False): + kwargs["output_formats"].append("srt") + if session["config"].get("--text-grid", False): + kwargs["output_formats"].append("TextGrid") if session["text"].endswith("txt"): - flags.append("--text-input") - flags.append("--language") - flags.append(session["config"]["lang"]) + kwargs["language"] = [session["config"]["lang"]] + timestamp = str(int(datetime.now().timestamp())) output_base = "aligned" + timestamp - args = ( - ["readalongs", "align"] - + flags - + [ - session["text"], - session["audio"], - os.path.join(session["temp_dir"], output_base), - ] - ) - LOGGER.warning(args) + + kwargs["textfile"] = session["text"] + kwargs["audiofile"] = session["audio"] + kwargs["output_base"] = os.path.join(session["temp_dir"], output_base) + LOGGER.info(kwargs) + _, audio_ext = os.path.splitext(session["audio"]) data = {"audio_ext": audio_ext, "base": output_base} + (status, exception, log_text) = align(**kwargs) + status_text = "OK" if status == 0 else "Error" if session["config"].get("show-log", False): - log = run(args, capture_output=True, check=False) - data["log"] = log + data["log"] = f"Status: {status_text}" + if exception: + data["log"] += f"; Exception: {exception!r}" + data["log_lines"] = list(re.split(r"\r?\n", log_text)) else: - run(args, check=False) + if status != 0 or exception: + # Always display errors, even when logs are not requested + data["log"] = f"Status: {status_text}; Exception: {exception!r}" + data["audio_path"] = os.path.join( session["temp_dir"], output_base, output_base + audio_ext ) @@ -213,7 +237,7 @@ def show_zip(base): with ZipFile(data, mode="w") as z: for fname in 
files_to_download: path = os.path.join(session["temp_dir"], base, fname) - if fname.startswith("aligned"): + if fname.startswith("aligned") or fname == "index.html": z.write(path, fname) data.seek(0) @@ -231,7 +255,7 @@ def show_zip(base): @app.route("/file/", methods=["GET"]) def return_temp_file(fname): fn, _ = os.path.splitext(fname) - LOGGER.warning(session["temp_dir"]) + LOGGER.info(session["temp_dir"]) path = os.path.join(session["temp_dir"], fn, fname) if os.path.exists(path): return send_file(path) diff --git a/readalongs/waveform2svg/audio_util.py b/readalongs/waveform2svg/audio_util.py index 46a4c24f..878b82bc 100644 --- a/readalongs/waveform2svg/audio_util.py +++ b/readalongs/waveform2svg/audio_util.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ################################################### @@ -29,7 +29,7 @@ def smooth(x, window_size=5): - """ Smooth the waveform to look... well, smooth """ + """Smooth the waveform to look... well, smooth""" if window_size < 3: return x s = np.r_[2 * x[0] - x[window_size - 1 :: -1], x, 2 * x[-1] - x[-1:-window_size:-1]] @@ -39,8 +39,8 @@ def smooth(x, window_size=5): def load_smil(input_path): - """ Get the bucketed max and min value from a sequence of WAV files as - expressed in a SMIL document """ + """Get the bucketed max and min value from a sequence of WAV files as + expressed in a SMIL document""" xml = load_xml(input_path) dirname = os.path.dirname(input_path) data = None diff --git a/readalongs/waveform2svg/make_all_svgs.py b/readalongs/waveform2svg/make_all_svgs.py index cb5bcca1..972fec57 100644 --- a/readalongs/waveform2svg/make_all_svgs.py +++ b/readalongs/waveform2svg/make_all_svgs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ######################################################## @@ -12,7 +12,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import argparse -from io import open from audio_util import save_txt from pitch2svg import make_pitch_svg diff --git a/readalongs/waveform2svg/pitch2svg.py b/readalongs/waveform2svg/pitch2svg.py index 1128617c..4880f0db 100644 --- a/readalongs/waveform2svg/pitch2svg.py +++ b/readalongs/waveform2svg/pitch2svg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ################################################### @@ -17,15 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_literals import argparse -import os -from io import open from math import floor import chevron import librosa -import numpy as np -from readalongs.log import LOGGER from readalongs.waveform2svg.audio_util import ( SAMPLE_RATE, load_wav_or_smil, diff --git a/readalongs/waveform2svg/units2svg.py b/readalongs/waveform2svg/units2svg.py index 054da3fd..d5320adf 100644 --- a/readalongs/waveform2svg/units2svg.py +++ b/readalongs/waveform2svg/units2svg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ################################################### @@ -19,15 +19,12 @@ import argparse import os from collections import OrderedDict -from io import open import chevron import librosa -import numpy as np from audio_util import save_txt from lxml import etree -from readalongs.log import LOGGER from readalongs.text.util import xpath_default FMIN = 80 diff --git a/readalongs/waveform2svg/waveform2svg.py b/readalongs/waveform2svg/waveform2svg.py index cf67cd23..2098b597 100644 --- 
a/readalongs/waveform2svg/waveform2svg.py +++ b/readalongs/waveform2svg/waveform2svg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- ################################################### @@ -14,14 +14,11 @@ from __future__ import absolute_import, division, print_function, unicode_literals import argparse -import os -from io import open from math import ceil, floor import chevron import numpy as np -from readalongs.log import LOGGER from readalongs.waveform2svg.audio_util import load_wav_or_smil, save_txt, smooth SVG_TEMPLATE = """ diff --git a/readalongs/web_api.py b/readalongs/web_api.py new file mode 100644 index 00000000..44ac68d2 --- /dev/null +++ b/readalongs/web_api.py @@ -0,0 +1,428 @@ +""" +REST-ish Web API for ReadAlongs Studio text manipulation operations using FastAPI. + +See https://readalong-studio.herokuapp.com/api/v1/docs for the documentation. + +You can spin up this Web API for development purposes with: + cd readalongs/ + PRODUCTION= uvicorn readalongs.web_api:web_api_app --reload +- The --reload switch will watch for changes under the directory where it's + running and reload the code whenever it changes, so it's best run in readalongs/ +- PRODUCTION= tells the API app to run in non-production mode, i.e., in debug mode, + and automatically add the header "access-control-allow-origin: *" to each + response so you won't get CORS errors using this locally with Studio-Web. + +You can also spin up the API with a production-grade server (on Linux, not Windows) using gunicorn: + gunicorn -w 4 -k uvicorn.workers.UvicornWorker readalongs.web_api:web_api_app + +Once spun up, the documentation and API playground will be visible at +http://localhost:8000/api/v1/docs +""" + +import io +import os +import tempfile +from enum import Enum +from textwrap import dedent +from typing import Dict, List, Optional, Union + +from fastapi import Body, FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from lxml import etree +from pydantic import BaseModel, Field +from starlette.background import BackgroundTask + +from readalongs.align import create_tei_from_text, save_label_files, save_subtitles +from readalongs.log import LOGGER +from readalongs.text.add_ids_to_xml import add_ids +from readalongs.text.convert_xml import convert_xml +from readalongs.text.make_dict import make_dict_object +from readalongs.text.make_fsg import make_jsgf +from readalongs.text.make_smil import parse_smil +from readalongs.text.tokenize_xml import tokenize_xml +from readalongs.util import get_langs + +# Create the app +web_api_app = FastAPI() +# Create the v1 version of the API +v1 = FastAPI() +# Call get_langs() when the server loads to load the languages into memory +LANGS = get_langs() + +if os.getenv("PRODUCTION", True): + origins = [ + "https://readalong-studio.mothertongues.org", + ] # Allow requests from mt app +else: + origins = ["*"] # Allow requests from any origin +web_api_app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["GET", "POST"], + allow_headers=["*"], +) + + +class RequestBase(BaseModel): + """Base request for assemble""" + + text_languages: List[str] + debug: bool = False + + +class PlainTextRequest(RequestBase): + """Request to assemble with input as plain text""" + + text: str + + +class XMLRequest(RequestBase): + """Request to assemble with input as XML""" + + xml: str + + +class AssembleResponse(BaseModel): + """Response from assemble with the XML
prepared and the rest.""" + + lexicon: Dict[str, str] # A dictionary of the form {word_id: pronunciation} + jsgf: str # The JSGF-formatted grammar in plain text + text_ids: str # The text ID input for the decoder in plain text + processed_xml: str # The processed XML is returned as a string + input: Optional[Union[XMLRequest, PlainTextRequest]] + parsed: Optional[str] + tokenized: Optional[str] + g2ped: Optional[str] + + +@v1.get("/langs", response_model=Dict[str, str]) +async def langs() -> Dict[str, str]: + """Return the list of supported languages and their names as a dict. + + Returns: + langs as dict with language codes as keys and the full language name as + values, e.g.: + `{ + "alq": "Algonquin", + "atj": "Atikamekw", + "lc3": "Third Language Name", + ... + }` + """ + + return LANGS[1] + + +@v1.post("/assemble", response_model=AssembleResponse) +async def assemble( + request: Union[XMLRequest, PlainTextRequest] = Body( + examples={ + "text": { + "summary": "A basic example with plain text input", + "value": { + "text": "hej verden", + "text_languages": ["dan", "und"], + "debug": False, + }, + }, + "xml": { + "summary": "A basic example with xml input", + "value": { + "xml": "

hej verden

", + "text_languages": ["dan", "und"], + "debug": False, + }, + }, + } + ) +): + """Create an input TEI from the given text (as plain text or XML). + Also creates the required grammar, pronunciation dictionary, + and text needed by the decoder. + + Encoding: all input and output is in UTF-8. + + Args (as dict items in the request body): + - text_languages: the list of languages for g2p processing + - debug: set to true for debugging (default: False) + - either text or xml: + - text: the input text as plain text + - xml: the input text as a readalongs-compatible XML structure + + Returns (as dict items in the response body): + - lexicon: maps word IDs to their pronunciation + - jsgf: grammar for the forced aligner + - text_ids: the list of word_ids as a space-separated string + - processed_xml: the XML with all the readalongs info in it + """ + + if isinstance(request, XMLRequest): + try: + parsed = etree.fromstring(bytes(request.xml, encoding="utf-8")) + except etree.XMLSyntaxError as e: + raise HTTPException( + status_code=422, detail="XML provided is not valid" + ) from e + elif isinstance(request, PlainTextRequest): + parsed = io.StringIO(request.text).readlines() + parsed = etree.fromstring( + bytes( + create_tei_from_text(parsed, text_languages=request.text_languages), + encoding="utf-8", + ) + ) + # tokenize + tokenized = tokenize_xml(parsed) + # add ids + ids_added = add_ids(tokenized) + # g2p + g2ped, valid = convert_xml(ids_added) + if not valid: + raise HTTPException( + status_code=422, + detail="g2p could not be performed, please check your text or your language code", + ) + # create grammar + dict_data, jsgf, text_input = create_grammar(g2ped) + response = { + "lexicon": dict_data, + "jsgf": jsgf, + "text_ids": text_input, + "processed_xml": etree.tostring(g2ped, encoding="utf8").decode(), + } + + if request.debug: + response["input"] = request.dict() + response["parsed"] = etree.tostring(parsed, encoding="utf8") + response["tokenized"] = etree.tostring(tokenized, encoding="utf8") + response["g2ped"] = etree.tostring(g2ped, encoding="utf8") + return response + + +def create_grammar(xml): + """Create the grammar and dictionary data from w elements in the given XML""" + + word_elements = xml.xpath("//w") + dict_data = make_dict_object(word_elements) + fsg_data = make_jsgf(word_elements, filename="test") + text_data = " ".join(xml.xpath("//w/@id")) + return dict_data, fsg_data, text_data + + +class FormatName(Enum): + """The different formats supported to represent readalong alignments""" + + TEXTGRID = "textgrid" # Praat TextGrid format + EAF = "eaf" # ELAN EAF format + SRT = "srt" # SRT subtitle format + VTT = "vtt" # VTT subtitle format + + +class ConvertRequest(BaseModel): + """Convert Request contains the RAS-processed XML and SMIL alignments""" + + audio_duration: float = Field( + example=2.01, + gt=0.0, + title="The duration of the audio used to create the alignment, in seconds.", + ) + + xml: str = Field( + title="The processed_xml returned by /assemble.", + example=dedent( + """\ + + + + +
+

+ hej verden +

+
+ +
+
""" + ), + ) + + smil: str = Field( + title="The result of aligning xml to the audio with SoundSwallower(.js)", + example=dedent( + """\ + + + + + + + + + + """ + ), + ) + + +class SubtitleTier(Enum): + """Which tier of the alignment information is returned""" + + SENTENCE = "sentence" + WORD = "word" + + +@v1.post("/convert_alignment/{output_format}") +async def convert_alignment( # noqa: C901 + request: ConvertRequest, + output_format: FormatName, + tier: Union[SubtitleTier, None] = None, +) -> FileResponse: + """Convert an alignment to a different format. + + Encoding: all input and output is in UTF-8. + + Path Parameter: + - output_format: Format to convert to, one of textgrid (Praat TextGrid), + eaf (ELAN EAF), srt (SRT subtitles), or vtt (VTT subtitles). + + Query Parameter: + - tier: for srt and vtt outputs, whether the subtitles should be at the + sentence (this is the default) or word level. + + Args (as dict items in the request body): + - audio_duration: duration in seconds of the audio file used to create the alignment + - xml: the XML file produced by /assemble + - smil: the SMIL file produced by SoundSwallower(.js) + + Formats supported: + - TextGrid: Praat TextGrid file format + - eaf: ELAN eaf file format + - srt: SRT subtitle format (at the sentence or word level, based on tier) + - vtt: WebVTT subtitle format (at the sentence or word level, based on tier) + + Data privacy consideration: due to limitations of the libraries used to perform + some of these conversions, the output files will be temporarily stored on disk, + but they get deleted immediately as this endpoint returns its output or reports + any error. + + Returns: a file in the format requested + """ + try: + parsed_xml = etree.fromstring(bytes(request.xml, encoding="utf-8")) + except etree.XMLSyntaxError as e: + raise HTTPException(status_code=422, detail="XML provided is not valid") from e + + try: + words = parse_smil(request.smil) + except ValueError as e: + raise HTTPException(status_code=422, detail="SMIL provided is not valid") from e + + # Data privacy consideration: we have to make sure this temporary directory gets + # deleted after the call returns, as we promise in the API documentation. 
+ temp_dir_object = tempfile.TemporaryDirectory() + temp_dir_name = temp_dir_object.name + cleanup = BackgroundTask(temp_dir_object.cleanup) + prefix = os.path.join(temp_dir_name, "aligned") + LOGGER.info("Temporary directory: %s", temp_dir_name) + + try: + if output_format == FormatName.TEXTGRID: + try: + save_label_files( + words, parsed_xml, request.audio_duration, prefix, "textgrid" + ) + except Exception as e: + raise HTTPException( + status_code=422, + detail="XML+SMIL file pair provided cannot be converted", + ) from e + return FileResponse( + prefix + ".TextGrid", + background=cleanup, + media_type="text/plain", + filename="aligned.TextGrid", + ) + + elif output_format == FormatName.EAF: + try: + save_label_files( + words, parsed_xml, request.audio_duration, prefix, "eaf" + ) + except Exception as e: + raise HTTPException( + status_code=422, + detail="XML+SMIL file pair provided cannot be converted", + ) from e + return FileResponse( + prefix + ".eaf", + background=cleanup, + media_type="text/xml", + filename="aligned.eaf", + ) + + elif output_format == FormatName.SRT: + try: + save_subtitles(words, parsed_xml, prefix, "srt") + except Exception as e: + raise HTTPException( + status_code=422, + detail="XML+SMIL file pair provided cannot be converted", + ) from e + if tier == SubtitleTier.WORD: + return FileResponse( + prefix + "_words.srt", + background=cleanup, + media_type="text/plain", + filename="aligned_words.srt", + ) + else: + return FileResponse( + prefix + "_sentences.srt", + background=cleanup, + media_type="text/plain", + filename="aligned_sentences.srt", + ) + + elif output_format == FormatName.VTT: + try: + save_subtitles(words, parsed_xml, prefix, "vtt") + except Exception as e: + raise HTTPException( + status_code=422, + detail="XML+SMIL file pair provided cannot be converted", + ) from e + if tier == SubtitleTier.WORD: + return FileResponse( + prefix + "_words.vtt", + background=cleanup, + media_type="text/plain", + filename="aligned_words.vtt", + ) + else: + return FileResponse( + prefix + "_sentences.vtt", + background=cleanup, + media_type="text/plain", + filename="aligned_sentences.vtt", + ) + + else: + raise HTTPException( + status_code=500, + detail="If this happens, FastAPI Enum validation didn't work so this is a bug!", + ) + + except Exception: + # We don't normally use such a global exception, but in this case we really + # need to make sure the temporary directory is cleaned up, so this except + # catches any and all problems and wipes the temporary data + temp_dir_object.cleanup() + raise + + +# Mount the v1 version of the API to the root of the app +web_api_app.mount("/api/v1", v1) diff --git a/requirements.api.txt b/requirements.api.txt new file mode 100644 index 00000000..05f56516 --- /dev/null +++ b/requirements.api.txt @@ -0,0 +1,3 @@ +# These are dependencies required by the production Web API +gunicorn +uvicorn diff --git a/requirements.ci.txt b/requirements.ci.txt new file mode 100644 index 00000000..def6f9e9 --- /dev/null +++ b/requirements.ci.txt @@ -0,0 +1,5 @@ +# These are dependencies required by our continuous integration and testing pipelines +codecov +coverage +pip-licenses +-r requirements.api.txt diff --git a/requirements.dev.txt b/requirements.dev.txt index 98ecdc63..668a6356 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,4 +1,11 @@ +# This is a set of development dependencies +black~=22.0 +flake8>=4.0.1 +gitlint-core==0.17.0 +isort>=5.10.1 +mypy>=0.941 pre-commit>=2.6.0 -black==19.10b0 -flake8>=3.8.3 -isort>=5.4.2 
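+# Type stubs so mypy can type-check code that uses these libraries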
+types-python-slugify>=5.0.3 +types-pyyaml>=6.0.5 +types-requests>=2.27.11 +types-setuptools>=57.4.9 diff --git a/requirements.min.txt b/requirements.min.txt new file mode 100644 index 00000000..6a20acf0 --- /dev/null +++ b/requirements.min.txt @@ -0,0 +1,18 @@ +# This is the minimal set of dependencies required for the readalongs package +chevron==0.14.0 +click==8.0.4 +coloredlogs==10.0 +fastapi==0.78.0 +Flask>=2.0.0 +Flask-Session==0.3.2 +flask-socketio==4.3.2 +g2p>=0.5.20210825 +lxml==4.9.1 +networkx==2.5 +numpy>=1.16.4 +pydub==0.23.1 +pympi-ling==1.69 +python-slugify==5.0.0 +soundswallower~=0.4.1 +webvtt-py==0.4.2 +werkzeug==2.0.3 diff --git a/requirements.txt b/requirements.txt index aa4d610e..89c49607 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,4 @@ -chevron==0.14.0 -coloredlogs==10.0 -Flask>=2.0.0 -Flask-Session==0.3.2 -flask-socketio==4.3.2 -g2p>=0.5.20210825 -lxml==4.6.5 -networkx==2.5 -numpy>=1.16.4 -panphon>=0.14 -soundswallower==0.1.1 -pydub==0.23.1 -pympi-ling==1.69 -python-slugify==5.0.0 -six==1.12.0 -tqdm==4.31.1 -webvtt-py==0.4.2 +# Heroku can only read requirements from "requirements.txt", +# so we separate the minimal library from the rest of the dependencies. +-r requirements.min.txt +-r requirements.api.txt diff --git a/run.py b/run.py index 8cdf4944..a0f8be30 100644 --- a/run.py +++ b/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ Run ReadAlong Studio as web application diff --git a/setup.cfg b/setup.cfg index 5f571805..8d4c021f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,3 +9,9 @@ ensure_newline_before_comments=True [mypy] ignore_missing_imports = True + +[flake8] +ignore = E203, E266, E501, W503 +max-line-length = 88 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 diff --git a/setup.py b/setup.py index d5e7d174..4ae97624 100644 --- a/setup.py +++ b/setup.py @@ -9,20 +9,28 @@ version_path = os.path.join(os.path.dirname(readalongs.__file__), "_version.py") VERSION = readalongs.VERSION + "." 
+ build_no -with open(version_path, "w") as f: +with open(version_path, "w", newline="\n", encoding="utf-8") as f: print(f'__version__ = "{VERSION}"', file=f) -with open("requirements.txt") as f: +with open("requirements.min.txt") as f: required = f.read().splitlines() setup( name="readalongs", + license="MIT", python_requires=">=3.7", version=VERSION, - long_description="ReadAlong Studio", + description="ReadAlong Studio", + long_description="ReadAlong Studio, audiobook alignment for Indigenous languages", + platforms=["any"], packages=find_packages(exclude=["test"]), include_package_data=True, zip_safe=False, install_requires=required, entry_points={"console_scripts": ["readalongs = readalongs.cli:cli"]}, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], ) diff --git a/test/basic_test_case.py b/test/basic_test_case.py index 3542a59b..8889c30d 100644 --- a/test/basic_test_case.py +++ b/test/basic_test_case.py @@ -1,7 +1,7 @@ """Common base class for the ReadAlongs test suites""" -import os import tempfile +from pathlib import Path from unittest import TestCase from readalongs.app import app @@ -10,19 +10,33 @@ class BasicTestCase(TestCase): """A Basic Unittest build block class that comes bundled with - a temporary directory (tempdir), and access to an app runner - (self.runner) + a temporary directory (self.tempdir), the path to the test data (self.data_dir), + and access to an app runner (self.runner) + + For convenience, self.tempdir and self.data_dir are pathlib.Path objects + that can be used either with os.path functions or the shorter Path operators. + E.g., these two lines are equivalent: + text_file = os.path.join(self.data_dir, "ej-fra.txt") + text_file = self.data_dir / "ej-fra.txt" """ LOGGER.setLevel("DEBUG") - data_dir = os.path.join(os.path.dirname(__file__), "data") + data_dir = Path(__file__).parent / "data" # Set this to True to keep the temp dirs after running, for manual inspection # but please don't push a commit setting this to True! + # To keep temp dirs for just one subclass, add this line to its setUp() + # function, before the call to super().setUp(): + # self.keep_temp_dir_after_running = True keep_temp_dir_after_running = False def setUp(self): - """Create a temporary directory, self.tempdir, and a test runner, self.runner""" + """Create a temporary directory, self.tempdir, and a test runner, self.runner + + If a subclass needs its own setUp() function, make sure to call + super().setUp() + at the beginning of it. + """ app.logger.setLevel("DEBUG") self.runner = app.test_cli_runner() tempdir_prefix = f"tmpdir_{type(self).__name__}_" @@ -35,8 +49,14 @@ def setUp(self): # Alternative tempdir code keeps it after running, for manual inspection: self.tempdir = tempfile.mkdtemp(prefix=tempdir_prefix, dir=".") print("tmpdir={}".format(self.tempdir)) + self.tempdir = Path(self.tempdir) def tearDown(self): - """Clean up the temporary directory""" + """Clean up the temporary directory + + If a subclass needs its own tearDown() function, make sure to call + super().tearDown() + at the end of it. + """ if not self.keep_temp_dir_after_running: self.tempdirobj.cleanup() diff --git a/test/data/ej-fra-anchors.xml b/test/data/ej-fra-anchors.xml index 5ac95e85..8200e723 100644 --- a/test/data/ej-fra-anchors.xml +++ b/test/data/ej-fra-anchors.xml @@ -1,8 +1,5 @@ -
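Once the app above is mounted and served, a client can exercise the conversion endpoint over plain HTTP. A hedged sketch of such a call (the host and port are assumptions based on common uvicorn defaults; the request shape matches the convert_alignment docstring above and the test_web_api.py cases further down):

import requests

payload = {
    "audio_duration": 83.1,  # seconds of audio behind the alignment
    "xml": open("aligned.xml", encoding="utf-8").read(),    # output of /assemble
    "smil": open("aligned.smil", encoding="utf-8").read(),  # output of SoundSwallower(.js)
}
# tier only applies to srt and vtt outputs; "sentence" is the default
response = requests.post(
    "http://localhost:8000/api/v1/convert_alignment/srt",
    params={"tier": "word"},
    json=payload,
)
response.raise_for_status()
with open("aligned_words.srt", "wb") as f:
    f.write(response.content)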
diff --git a/test/data/ej-fra-anchors2.xml b/test/data/ej-fra-anchors2.xml index ce7af6c2..32573ff9 100644 --- a/test/data/ej-fra-anchors2.xml +++ b/test/data/ej-fra-anchors2.xml @@ -1,8 +1,5 @@ - diff --git a/test/data/ej-fra-converted.xhtml b/test/data/ej-fra-converted.xhtml index 0eaf0592..c4c746a4 100644 --- a/test/data/ej-fra-converted.xhtml +++ b/test/data/ej-fra-converted.xhtml @@ -1,9 +1,6 @@ - Book - + Book

diff --git a/test/data/ej-fra-converted.xml b/test/data/ej-fra-converted.xml index 0988f873..616210c1 100644 --- a/test/data/ej-fra-converted.xml +++ b/test/data/ej-fra-converted.xml @@ -1,8 +1,5 @@ -

diff --git a/test/data/ej-fra-dna.xml b/test/data/ej-fra-dna.xml index b581c7da..a1fe1e53 100644 --- a/test/data/ej-fra-dna.xml +++ b/test/data/ej-fra-dna.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/ej-fra-package.xml b/test/data/ej-fra-package.xml index 1adc6910..188a478f 100644 --- a/test/data/ej-fra-package.xml +++ b/test/data/ej-fra-package.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/ej-fra-silence-bad.xml b/test/data/ej-fra-silence-bad.xml index b069aa96..e34159ef 100644 --- a/test/data/ej-fra-silence-bad.xml +++ b/test/data/ej-fra-silence-bad.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/ej-fra-silence.xml b/test/data/ej-fra-silence.xml index 416e99a4..dac1e681 100644 --- a/test/data/ej-fra-silence.xml +++ b/test/data/ej-fra-silence.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/ej-fra.xml b/test/data/ej-fra.xml index 76ee1230..94bfe357 100644 --- a/test/data/ej-fra.xml +++ b/test/data/ej-fra.xml @@ -1,8 +1,5 @@ -
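The /assemble endpoint these fixtures feed can also be driven in-process, without a server, through FastAPI's TestClient. A minimal sketch mirroring the test_web_api.py cases below (the French sample sentence is an assumption for illustration):

from fastapi.testclient import TestClient

from readalongs.web_api import web_api_app

client = TestClient(web_api_app)
response = client.post(
    "/api/v1/assemble",
    json={"text": "Ceci est un test.", "text_languages": ["fra"]},
)
assert response.status_code == 200
# response.json() carries the assembled inputs the aligner needs next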
diff --git a/test/data/fra-prepared.xml b/test/data/fra-prepared.xml index 8eaf24a6..12a19f30 100644 --- a/test/data/fra-prepared.xml +++ b/test/data/fra-prepared.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/fra-tokenized.xml b/test/data/fra-tokenized.xml index 894347ce..7b1eee82 100644 --- a/test/data/fra-tokenized.xml +++ b/test/data/fra-tokenized.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/mixed-langs.g2p.xml b/test/data/mixed-langs.g2p.xml index 215e4859..6463c660 100644 --- a/test/data/mixed-langs.g2p.xml +++ b/test/data/mixed-langs.g2p.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/mixed-langs.tokenized.xml b/test/data/mixed-langs.tokenized.xml index 992c133f..74c1d8da 100644 --- a/test/data/mixed-langs.tokenized.xml +++ b/test/data/mixed-langs.tokenized.xml @@ -1,8 +1,5 @@ -
diff --git a/test/data/mixed-langs.xml b/test/data/mixed-langs.xml index 09606424..083c1b22 100644 --- a/test/data/mixed-langs.xml +++ b/test/data/mixed-langs.xml @@ -1,8 +1,5 @@ -
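For orientation alongside the SMIL handling above and the test_smil.py suite below, here is an illustrative fragment in the shape parse_smil consumes, with a simplified stand-in parser. The namespace and element layout are assumptions based on SMIL 3.0; the authoritative implementation lives in readalongs.text.make_smil and raises ValueError on malformed input:

from lxml import etree

SAMPLE_SMIL = """\
<smil xmlns="http://www.w3.org/ns/SMIL" version="3.0">
    <body>
        <par id="par-w1">
            <text src="my_text_path#w1"/>
            <audio src="my_audio_path" clipBegin="0.01" clipEnd="0.75"/>
        </par>
    </body>
</smil>
"""

def parse_smil_sketch(smil_text: str) -> list:
    ns = {"smil": "http://www.w3.org/ns/SMIL"}
    root = etree.fromstring(smil_text.encode("utf-8"))
    words = []
    for par in root.findall(".//smil:par", namespaces=ns):
        text_el = par.find("smil:text", namespaces=ns)
        audio_el = par.find("smil:audio", namespaces=ns)
        words.append({
            "id": text_el.get("src").split("#")[1],     # word id after the '#'
            "start": float(audio_el.get("clipBegin")),  # seconds
            "end": float(audio_el.get("clipEnd")),
        })
    return words

print(parse_smil_sketch(SAMPLE_SMIL))
# [{'id': 'w1', 'start': 0.01, 'end': 0.75}] -- the shape test_smil.py expects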
diff --git a/test/data/patrickxtlan.xml b/test/data/patrickxtlan.xml index 3ae19c77..9456a6e6 100644 --- a/test/data/patrickxtlan.xml +++ b/test/data/patrickxtlan.xml @@ -3,5 +3,6 @@

Patrickxtła̱n Patrickxtła̱n + fooPatrickbarxtła̱nbaz

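The new patrickxtła̱n test line mixes free text (foo, bar, baz) with nested sub-word elements, the case exercised by test_convert_xml_subwords in test_g2p_cli.py below. A sketch of how such mixed content can be flattened into one word string, consistent with the get_word_text() expectations in test_misc.py (an illustration, not the readalongs implementation):

from lxml import etree

def gather_word_text(word_el) -> str:
    # itertext() yields the element's own text plus the text and tails of
    # all nested elements, but not the tail of the word element itself.
    return "".join(word_el.itertext())

print(gather_word_text(etree.fromstring("<w>text<subw>sub</subw>tail</w>")))  # textsubtail
print(gather_word_text(etree.fromstring("<w><syl>syl1</syl><syl>syl2</syl></w>")))  # syl1syl2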
diff --git a/test/data/sample-config.json b/test/data/sample-config.json index 93b1e7e6..df3537c6 100644 --- a/test/data/sample-config.json +++ b/test/data/sample-config.json @@ -1,4 +1,9 @@ { + "title": "My awesome read-along", + "header": "A story in my language", + "subheader": "Read by me", + "theme": "light", + "images": { "0": "image-for-page1.jpg", diff --git a/test/run.py b/test/run.py index 9869572e..6efb1d0d 100755 --- a/test/run.py +++ b/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ Top-level runner for our test suites @@ -14,38 +14,43 @@ """ import os +import re import sys from unittest import TestLoader, TestSuite, TextTestRunner from test_align_cli import TestAlignCli from test_anchors import TestAnchors +from test_api import TestAlignApi from test_audio import TestAudio from test_config import TestConfig from test_dna_text import TestDNAText from test_dna_utils import TestDNAUtils from test_force_align import TestForceAlignment, TestXHTML from test_g2p_cli import TestG2pCli -from test_indices import TestIndices +from test_make_xml_cli import TestMakeXMLCli from test_misc import TestMisc from test_package_urls import TestPackageURLs -from test_prepare_cli import TestPrepareCli from test_silence import TestSilence from test_temp_file import TestTempFile from test_tokenize_cli import TestTokenizeCli from test_tokenize_xml import TestTokenizer +from test_web_api import TestWebApi +from test_smil import TestSmilUtilities from readalongs.log import LOGGER -loader = TestLoader() +LOADER = TestLoader() e2e_tests = [ - loader.loadTestsFromTestCase(test) for test in (TestForceAlignment, TestXHTML) + LOADER.loadTestsFromTestCase(test) for test in (TestForceAlignment, TestXHTML) ] -indices_tests = [loader.loadTestsFromTestCase(test) for test in [TestIndices]] +api_tests = [ + LOADER.loadTestsFromTestCase(test) for test in [TestWebApi] +] # TODO: add some load testing with https://locust.io/ other_tests = [ - loader.loadTestsFromTestCase(test) + LOADER.loadTestsFromTestCase(test) for test in [ TestAnchors, TestConfig, @@ -54,43 +59,84 @@ TestTokenizer, TestTokenizeCli, TestTempFile, - TestPrepareCli, + TestMakeXMLCli, TestAudio, TestAlignCli, + TestAlignApi, TestG2pCli, TestMisc, TestSilence, + TestSmilUtilities, TestPackageURLs, + TestWebApi, ] ] -def run_tests(suite): - """Run the specified test suite""" +def list_tests(suite: TestSuite): + for subsuite in suite: + for match in re.finditer(r"tests=\[([^][]+)\]>", str(subsuite)): + yield from match[1].split(", ") + + +def describe_suite(suite: TestSuite): + full_suite = LOADER.discover(os.path.dirname(__file__)) + full_list = list(list_tests(full_suite)) + requested_list = list(list_tests(suite)) + requested_set = set(requested_list) + print("Test suite includes:", *sorted(requested_list), sep="\n") + print( + "\nTest suite excludes:", + *sorted(test for test in full_list if test not in requested_set), + sep="\n" + ) + + +def run_tests(suite: str, describe: bool = False) -> bool: + """Run the specified test suite. + + Args: + suite: one of "all", "dev", etc., specifying which suite to run + describe: if True, list all the test cases instead of running them.
+ + Returns: True iff success + """ if suite == "e2e": - suite = TestSuite(e2e_tests) + test_suite = TestSuite(e2e_tests) + elif suite == "api": + test_suite = TestSuite(api_tests) elif suite == "dev": - suite = TestSuite(indices_tests + other_tests + e2e_tests) + test_suite = TestSuite(other_tests + e2e_tests) elif suite in ("prod", "all"): - suite = loader.discover(os.path.dirname(__file__)) + test_suite = LOADER.discover(os.path.dirname(__file__)) elif suite == "other": - suite = TestSuite(other_tests) + test_suite = TestSuite(other_tests) else: LOGGER.error( "Sorry, you need to select a Test Suite to run, one of: " "api, dev, all (or prod), e2e, other" ) - sys.exit(1) + return False - runner = TextTestRunner(verbosity=3) - return runner.run(suite) + if describe: + describe_suite(test_suite) + return True + else: + runner = TextTestRunner(verbosity=3) + return runner.run(test_suite).wasSuccessful() if __name__ == "__main__": + describe = "--describe" in sys.argv + if describe: + sys.argv.remove("--describe") + try: - result = run_tests(sys.argv[1]) - if not result.wasSuccessful(): - raise Exception("Some tests failed. Please see log above.") + result = run_tests(sys.argv[1], describe) + if not result: + LOGGER.error("Some tests failed. Please see log above.") + sys.exit(1) except IndexError: - print("Please specify a test suite to run: i.e. 'dev' or 'all'") + LOGGER.error("Please specify a test suite to run: e.g. 'dev' or 'all'") + sys.exit(1) diff --git a/test/sound_swallower_stub.py b/test/sound_swallower_stub.py index 807a83eb..96c12b03 100644 --- a/test/sound_swallower_stub.py +++ b/test/sound_swallower_stub.py @@ -44,46 +44,31 @@ def SoundSwallowerStub(*segments): class SoundSwallowerDecoderStub: """Stub class so we don't really call the SoundSwallower decoder""" - class Segment: + class Seg: def __init__(self, segment_desc): """Init self from "word_id:start:end" description, e.g.
"p0s0w0:0:1".""" - self.word, s, e = segment_desc.split(":") - self.start_frame = int(s) - self.end_frame = int(e) + self.text, s, e = segment_desc.split(":") + self.start = float(s) / 100 + self.duration = (float(e) - float(s)) / 100 def __repr__(self): - return f'Segment(word="{self.word}", start_frame={self.start_frame}, end_frame={self.end_frame})' + return ( + f'Seg(text="{self.text}", start={self.start}, duration={self.duration})' + ) class Config: def __init__(self, *args): pass - def set_boolean(self, *args): - pass - - def set_string(self, *args): - pass - - def set_float(self, *args): - pass - - def set_int(self, *args): - pass - - def get_float(self, *args): - return 1.0 - - def get_int(self, name): - if name == "-frate": + def __getitem__(self, key): + if key == "frate": # Pretend the framerate is always 1000, so the stub times are all in ms return 1000 else: - return 1 + return "SPAM" def __init__(self, *outputs): - self._segments = [ - SoundSwallowerDecoderStub.Segment(segment) for segment in outputs - ] + self._segments = [SoundSwallowerDecoderStub.Seg(segment) for segment in outputs] def __call__(self, *args): return self @@ -97,6 +82,7 @@ def process_raw(self, *args, **kwargs): def end_utt(self): pass + @property def seg(self): return self._segments diff --git a/test/test_align_cli.py b/test/test_align_cli.py index 3f19e34a..c03856d6 100755 --- a/test/test_align_cli.py +++ b/test/test_align_cli.py @@ -1,10 +1,12 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ Unit test suite for the readalongs align CLI command """ import os +import pathlib +import tempfile from os.path import exists, join from unittest import main @@ -41,9 +43,11 @@ def test_invoke_align(self): "srt:TextGrid,eaf", # tests that we can give -o multiple values, separated by : or , "-l", "fra", + "--align-mode", + "auto", "--config", join(self.data_dir, "sample-config.json"), - join(self.data_dir, "ej-fra.txt"), + self.add_bom(join(self.data_dir, "ej-fra.txt")), join(self.data_dir, "ej-fra.m4a"), output, ], @@ -75,6 +79,10 @@ def test_invoke_align(self): exists(join(output, "tempfiles", "output.tokenized.xml")), "alignment with -s should have created tempfiles/output.tokenized.xml", ) + with open( + join(output, "tempfiles", "output.tokenized.xml"), "r", encoding="utf-8" + ) as f: + self.assertNotIn("\ufeff", f.read()) self.assertTrue( exists(join(output, "assets", "image-for-page1.jpg")), "alignment with image files should have copied image-for-page1.jpg to assets", @@ -82,6 +90,7 @@ def test_invoke_align(self): self.assertIn("image-for-page2.jpg is accessible ", results.stdout) os.unlink("image-for-page1.jpg") self.assertFalse(exists("image-for-page1.jpg")) + self.assertIn("Align mode strict succeeded for sequence 0.", results.stdout) # print(results.stdout) # Move the alignment output to compare with further down @@ -97,10 +106,12 @@ def test_invoke_align(self): [ "-o", "xhtml", + "--align-mode", + "moderate", "-s", "--config", join(self.data_dir, "sample-config.json"), - join(self.data_dir, "ej-fra-dna.xml"), + self.add_bom(join(self.data_dir, "ej-fra-dna.xml")), join(self.data_dir, "ej-fra.m4a"), output, ], @@ -120,6 +131,9 @@ def test_invoke_align(self): exists(join(output, "assets", "image-for-page1.jpg")), "image-for-page1.jpg was not on disk, cannot have been copied", ) + self.assertIn( + "Align mode moderate succeeded for sequence 0.", results_dna.stdout + ) # Functionally the same as self.assertTrue(filecmp.cmp(f1, f2)), but show where # the differences are if the files are not identical @@ 
-174,6 +188,8 @@ def test_align_with_package(self): output, "-o", "html", + "--config", + self.add_bom(self.data_dir / "sample-config.json"), ], ) # print(results_html.output) @@ -401,9 +417,11 @@ def test_infer_plain_text_or_xml(self): self.assertIn("No input language specified for plain text", results.output) # XML with guess by contents - infile3 = write_file( - join(self.tempdir, "infile3"), - "blah blah", + infile3 = self.add_bom( + write_file( + join(self.tempdir, "infile3"), + "blah blah", + ) ) with SoundSwallowerStub("word:0:1"): results = self.runner.invoke( @@ -447,19 +465,155 @@ def test_infer_plain_text_or_xml(self): self.assertNotEqual(results.exit_code, 0) self.assertIn("Error parsing XML", results.output) + def test_obsolete_switches(self): # Giving -i switch generates an obsolete-switch error message with SoundSwallowerStub("word:0:1"): results = self.runner.invoke( align, [ "-i", - infile5, + join(self.data_dir, "fra.txt"), join(self.data_dir, "noise.mp3"), join(self.tempdir, "outdir6"), ], ) self.assertNotEqual(results.exit_code, 0) - self.assertIn("The -i option is obsolete.", results.output) + self.assertIn("is obsolete.", results.output) + + # Giving --g2p-verbose switch generates an obsolete-switch error message + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "--g2p-verbose", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir7"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete.", results.output) + + # Giving --g2p-fallback switch generates an obsolete-switch error message + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "--g2p-fallback", + "fra:end:und", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir8"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete.", results.output) + + def test_oo_option(self): + """Exercise the hidden -oo / --output-orth option""" + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "eng-arpabet", + join(self.data_dir, "ej-fra.xml"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir9"), + ], + ) + self.assertEqual(results.exit_code, 0) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "not-an-alphabet", + join(self.data_dir, "ej-fra.xml"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir10"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn("not-an-alphabet", results.output) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "dan-ipa", + join(self.data_dir, "ej-fra.xml"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir11"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn("no path", results.output) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "dan-ipa", + "-l", + "eng", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir12"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn('Cannot g2p "eng" to output orthography', results.output) + + def add_bom(self, filename): + """Create a temporary copy of filename with a BOM in it,
in self.tempdir""" + # We pepper calls to add_bom() around the test suite, to make sure all + # different kinds of input files are accepted with and without a BOM + output_file = tempfile.NamedTemporaryFile( + mode="wb", + dir=self.tempdir, + delete=False, + prefix="bom_", + suffix=os.path.basename(filename), + ) + output_file.write(b"\xef\xbb\xbf") + with open(filename, "rb") as file_binary: + output_file.write(file_binary.read()) + output_file.close() + return output_file.name + + def test_add_bom(self): + """Make sure add_bom does what we mean it to, i.e., test the test harness.""" + + def slurp_bin(filename): + with open(filename, "rb") as f: + return f.read() + + def slurp_text(filename, encoding): + with open(filename, "r", encoding=encoding) as f: + return f.read() + + base_file = write_file(self.tempdir / "add-bom-input.txt", "Random Text été") + bom_file = self.add_bom(base_file) + self.assertEqual( + slurp_text(base_file, "utf-8"), slurp_text(bom_file, "utf-8-sig") + ) + self.assertEqual( + slurp_text(bom_file, "utf-8"), "\ufeff" + slurp_text(base_file, "utf-8") + ) + self.assertNotEqual(slurp_bin(base_file), slurp_bin(bom_file)) + self.assertEqual(b"\xef\xbb\xbf" + slurp_bin(base_file), slurp_bin(bom_file)) + + bom_file_pathlib = self.add_bom(pathlib.Path(base_file)) + self.assertEqual( + slurp_text(base_file, "utf-8"), slurp_text(bom_file_pathlib, "utf-8-sig") + ) if __name__ == "__main__": diff --git a/test/test_anchors.py b/test/test_anchors.py index c54e0fd0..f8c6860f 100755 --- a/test/test_anchors.py +++ b/test/test_anchors.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Unit testing for the anchors functionality in readalongs align""" @@ -8,6 +8,7 @@ from basic_test_case import BasicTestCase from readalongs.align import align_audio +from readalongs.log import LOGGER class TestAnchors(BasicTestCase): @@ -70,6 +71,30 @@ def test_anchors_outer_too(self): f"{partial_wav_file} should not be empty", ) + def test_anchors_align_modes(self): + xml_with_anchors = """ + Bonjour. + + Ceci ne peut pas être aligné avec du bruit. + + + """ + xml_file = os.path.join(self.tempdir, "text-with-anchors.xml") + with open(xml_file, "wt", encoding="utf8") as f: + print(xml_with_anchors, file=f) + with self.assertLogs(LOGGER, level="INFO") as cm: + results = align_audio( + xml_file, + os.path.join(self.data_dir, "noise.mp3"), + ) + words = results["words"] + self.assertEqual(len(words), 10) + logger_output = "\n".join(cm.output) + self.assertIn("Align mode strict succeeded for sequence 0.", logger_output) + self.assertIn("Align mode strict failed for sequence 1.", logger_output) + self.assertIn("Align mode moderate failed for sequence 1.", logger_output) + self.assertIn("Align mode loose succeeded for sequence 1.", logger_output) + if __name__ == "__main__": main() diff --git a/test/test_api.py b/test/test_api.py new file mode 100755 index 00000000..4d08c6a7 --- /dev/null +++ b/test/test_api.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +""" +Test suite for the API way to call align +""" + +import os +from unittest import main + +import click +from basic_test_case import BasicTestCase +from sound_swallower_stub import SoundSwallowerStub + +import readalongs.api as api +from readalongs.log import LOGGER + + +class TestAlignApi(BasicTestCase): + """Test suite for the API way to call align()""" + + def test_call_align(self): + # We deliberately pass pathlib.Path objects as input, to make sure the + # API accepts them too. 
+ langs = ("fra",) # make sure language can be an iterable, not just a list. + with SoundSwallowerStub("t0b0d0p0s0w0:920:1520", "t0b0d0p0s1w0:1620:1690"): + (status, exception, log) = api.align( + self.data_dir / "ej-fra.txt", + self.data_dir / "ej-fra.m4a", + self.tempdir / "output", + langs, + output_formats=["html", "TextGrid", "srt"], + ) + self.assertEqual(status, 0) + self.assertTrue(exception is None) + self.assertIn("Words (<w>) not present; tokenizing", log) + expected_output_files = ( + "output.smil", + "output.xml", + "output.m4a", + "output.TextGrid", + "output_sentences.srt", + "output_words.srt", + "index.html", + "output.html", + ) + for f in expected_output_files: + self.assertTrue( + (self.tempdir / "output" / f).exists(), + f"successful alignment should have created {f}", + ) + self.assertEqual( + list(langs), + ["fra"], + "Make sure the API call doesn't modify my variables", + ) + + (status, exception, log) = api.align("", "", self.tempdir / "errors") + self.assertNotEqual(status, 0) + self.assertFalse(exception is None) + + def test_call_make_xml(self): + (status, exception, log) = api.make_xml( + self.data_dir / "ej-fra.txt", self.tempdir / "prepared.xml", ("fra", "eng") + ) + self.assertEqual(status, 0) + self.assertTrue(exception is None) + self.assertIn("Wrote ", log) + with open(self.tempdir / "prepared.xml") as f: + xml_text = f.read() + self.assertIn('xml:lang="fra" fallback-langs="eng,und"', xml_text) + + (status, exception, log) = api.make_xml( + self.data_dir / "ej-fra.txt", + self.tempdir / "bad.xml", + ("fra", "not-a-lang"), + ) + self.assertNotEqual(status, 0) + self.assertTrue(isinstance(exception, click.BadParameter)) + + (status, exception, log) = api.make_xml( + self.data_dir / "file-not-found.txt", self.tempdir / "none.xml", ("fra",) + ) + self.assertNotEqual(status, 0) + self.assertTrue(isinstance(exception, click.UsageError)) + + def test_deprecated_prepare(self): + with self.assertLogs(LOGGER, level="WARNING") as cm: + api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",)) + self.assertIn("deprecated", "\n".join(cm.output)) + + +if __name__ == "__main__": + main() diff --git a/test/test_audio.py b/test/test_audio.py index 8e27e1ff..6c8b1001 100755 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for various audio contents handling methods""" @@ -48,9 +48,14 @@ def align(self, input_text_path, input_audio_path, output_path, flags): def test_mute_section(self): """Should mute section of audio""" + max_before = self.audio_segment[1000:2000].max muted_segment = mute_section(self.audio_segment, 1000, 2000) muted_section = muted_segment[1000:2000] - self.assertLessEqual(muted_section.max, 1) + # This worked with pydub 0.23.1, but it does not work with 0.25.1 + # self.assertLessEqual(muted_section.max, 1) + # Muting applies a gain of -120, so the result is not necessarily 0, + # it's just much smaller.
+ self.assertLessEqual(muted_section.max, max_before / 1000) def test_remove_section(self): """Should remove section of audio""" @@ -84,6 +89,8 @@ def test_align_sample(self): "pip install --force-reinstall --upgrade might be required " "if dependencies changed.", ) + # Make sure ss logs are disabled + self.assertNotIn("Current configuration", process.stderr) def test_align_removed(self): """Try aligning section with removed audio""" @@ -95,7 +102,7 @@ # Align input_text_path = os.path.join(self.data_dir, "audio_sample.txt") input_audio_path = audio_output_path - flags = ["-l", "eng"] + flags = ["-l", "eng", "--debug-aligner"] output_path = os.path.join(self.tempdir, "output_removed") process = self.align(input_text_path, input_audio_path, output_path, flags) if process.returncode != 0: @@ -109,6 +116,8 @@ "pip install --force-reinstall --upgrade might be required " "if dependencies changed.", ) + # Make sure ss logs are enabled + self.assertIn("Current configuration", process.stderr) def test_align_muted(self): """Try aligning section with muted audio""" diff --git a/test/test_config.py b/test/test_config.py index a724f773..9ef01859 100755 --- a/test/test_config.py +++ b/test/test_config.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for loading the config.json configuration file for readalongs align""" @@ -55,7 +55,7 @@ def test_arbitrary_xml(self): # bad xml raises lxml.etree.XMLSyntaxError with self.assertRaises(etree.XMLSyntaxError): new_xml = add_supplementary_xml( - self.xml, {"xml": [{"xpath": "//div[1]", "value": "bloop"}]}, + self.xml, {"xml": [{"xpath": "//div[1]", "value": "bloop"}]} ) # if xpath isn't valid, log warning diff --git a/test/test_dna_text.py b/test/test_dna_text.py index 25127398..a7426ec0 100755 --- a/test/test_dna_text.py +++ b/test/test_dna_text.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test handling of DNA text in tokenization""" diff --git a/test/test_dna_utils.py b/test/test_dna_utils.py index 7218a1dd..4022a6fb 100755 --- a/test/test_dna_utils.py +++ b/test/test_dna_utils.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for DNA segment manipulation methods""" @@ -129,7 +129,7 @@ def test_segment_intersection(self): self.assertEqual( segment_intersection( - segments_from_pairs((10, 30)), segments_from_pairs((19, 19)), + segments_from_pairs((10, 30)), segments_from_pairs((19, 19)) ), segments_from_pairs((19, 19)), ) diff --git a/test/test_force_align.py b/test/test_force_align.py index 4793fe8b..15480b66 100755 --- a/test/test_force_align.py +++ b/test/test_force_align.py @@ -1,16 +1,25 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ Test force-alignment with SoundSwallower FSG search from Python API """ import os +import shutil import unittest +import wave +from tempfile import TemporaryDirectory from basic_test_case import BasicTestCase from lxml import etree +from soundswallower import get_model_path -from readalongs.align import align_audio, convert_to_xhtml, create_input_tei +from readalongs.align import ( + align_audio, + convert_to_xhtml, + create_input_tei, + get_word_texts_and_sentences, +) from readalongs.log import LOGGER from readalongs.portable_tempfile import PortableNamedTemporaryFile from readalongs.text.util import load_txt, save_xml @@ -23,7 +32,7 @@ def test_align(self): """Basic alignment test case with XML input""" xml_path = os.path.join(self.data_dir, "ej-fra.xml") wav_path =
os.path.join(self.data_dir, "ej-fra.m4a") - results = align_audio(xml_path, wav_path, unit="w") + results = align_audio(xml_path, wav_path, unit="w", debug_aligner=True) # Verify that the same IDs are in the output converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml") @@ -52,6 +61,113 @@ def test_align_text(self): for w, xw in zip(words, xml_words): self.assertEqual(xw.attrib["id"], w["id"]) + # White-box testing to make sure srt, TextGrid and vtt output will have the + # sentences collected correctly. + words, sentences = get_word_texts_and_sentences( + results["words"], results["tokenized"] + ) + self.assertEqual(len(sentences), 7) + self.assertEqual(len(words), 99) + + def make_element(tag, text="", tail=""): + """Convenient Element constructor wrapper""" + el = etree.Element(tag) + el.text = text + el.tail = tail + return el + + # Do some word doctoring to make sure sub-word units don't cause trouble + # This might be nicer in a different test case, but I want to reuse + # results from the call above, so I'm glomming it on here... + xml = results["tokenized"] + for i, word_el in enumerate(xml.xpath(".//w")): + if i == 1: + # Modify the <w>'s own text + word_el.text += " stuff" + elif i == 2: + # Whole text in one <subw> element + word_el.text = "" + word_el.append(make_element("subw", "subwordtext")) + elif i == 3: + # <w> with three clean <syl> elements + word_el.text = "" + for i in range(3): + word_el.append(make_element("syl", "syl;")) + elif i == 4: + # Messy <w> is still valid structure + word_el.text = "head text;" + word_el.append(make_element("syl", "syllable text;", "syl tail;")) + word_el.tail = "tail from the word itself is ignored;" + # etree.dump(word_el) + elif i == 5: + # Nested sub elements + word_el.append(make_element("syl", "syl;", "tail;")) + word_el[0].append(make_element("subsyl", "sub;")) + word_el.append(make_element("syl", "another syl;")) + break + _, sentences = get_word_texts_and_sentences( + results["words"], results["tokenized"] + ) + self.assertEqual( + [w["text"] for w in sentences[1]], + [ + "Je stuff", + "subwordtext", + "syl;syl;syl;", + "head text;syllable text;syl tail;", + "Joanissyl;sub;tail;another syl;", + ], + ) + + def test_align_switch_am(self): + """Alignment test case with an alternate acoustic model and custom + noise dictionary.""" + xml_path = os.path.join(self.data_dir, "ej-fra.xml") + wav_path = os.path.join(self.data_dir, "ej-fra.m4a") + # Try with some extra stuff in the noisedict + with TemporaryDirectory(prefix="readalongs_am_") as tempdir: + custom_am_path = os.path.join(tempdir, "en-us") + shutil.copytree(get_model_path("en-us"), custom_am_path) + with open(os.path.join(custom_am_path, "noisedict"), "at") as fh: + fh.write(";; here is a comment\n") + fh.write("[BOGUS] SIL\n") + results = align_audio( + xml_path, wav_path, unit="w", config={"acoustic_model": custom_am_path} + ) + # Try with no noisedict + os.remove(os.path.join(custom_am_path, "noisedict")) + results = align_audio( + xml_path, wav_path, unit="w", config={"acoustic_model": custom_am_path} + ) + # Verify that the same IDs are in the output + converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml") + xml = etree.parse(converted_path).getroot() + words = results["words"] + xml_words = xml.xpath(".//w") + self.assertEqual(len(words), len(xml_words)) + for w, xw in zip(words, xml_words): + self.assertEqual(xw.attrib["id"], w["id"]) + + def test_align_fail(self): + """Alignment test case with bad audio that should fail.""" + xml_path = os.path.join(self.data_dir, "ej-fra.xml") + with
PortableNamedTemporaryFile(suffix=".wav") as tf: + with wave.open(tf, "wb") as writer: + writer.setnchannels(1) + writer.setsampwidth(2) + writer.setframerate(16000) + writer.writeframes(b"\x00\x00") + with self.assertRaises(RuntimeError): + _ = align_audio(xml_path, tf.name, unit="w") + + def test_bad_align_mode(self): + with self.assertRaises(AssertionError): + _ = align_audio( + os.path.join(self.data_dir, "ej-fra.xml"), + os.path.join(self.data_dir, "noise.mp3"), + alignment_mode="invalid-mode", + ) + class TestXHTML(BasicTestCase): """Test converting the output to xhtml""" @@ -66,7 +182,7 @@ def test_convert(self): txt = load_txt(tf.name) self.maxDiff = None self.assertEqual( - txt, load_txt(os.path.join(self.data_dir, "ej-fra-converted.xhtml")), + txt, load_txt(os.path.join(self.data_dir, "ej-fra-converted.xhtml")) ) diff --git a/test/test_g2p_cli.py b/test/test_g2p_cli.py index 366f2888..48a7c2bc 100755 --- a/test/test_g2p_cli.py +++ b/test/test_g2p_cli.py @@ -1,8 +1,9 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for the readalongs g2p CLI command""" import os +import re from unittest import main from basic_test_case import BasicTestCase @@ -10,11 +11,21 @@ from sound_swallower_stub import SoundSwallowerStub from readalongs.align import align_audio -from readalongs.cli import align, g2p, prepare, tokenize +from readalongs.cli import align, g2p, make_xml, tokenize from readalongs.log import LOGGER from readalongs.text.convert_xml import convert_xml +def run_convert_xml(input_string): + """wrap convert_xml to make unit testing easier""" + return etree.tounicode(convert_xml(etree.fromstring(input_string))[0]) + + +def two_xml_elements(xml_text): + """Extract the opening part of the leading two XML elements in xml_text""" + return xml_text[: 1 + xml_text.find(">", 1 + xml_text.find(">"))] + + class TestG2pCli(BasicTestCase): """Test suite for the readalongs g2p CLI command""" @@ -70,15 +81,31 @@ def test_mixed_langs(self): f"output {g2p_file} and reference {ref_file} differ.", ) - # Write text to a temp file, pass it through prepare -l lang, and then tokenize, + def test_invoke_with_obsolete_switches(self): + """Using obsolete options should yield a helpful error message""" + + input_file = os.path.join(self.data_dir, "fra-tokenized.xml") + g2p_file = os.path.join(self.tempdir, "obsolete1.xml") + results = self.runner.invoke( + g2p, ["--g2p-fallback", "fra:und", input_file, g2p_file] + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete", results.output) + + g2p_file = os.path.join(self.tempdir, "obsolete2.xml") + results = self.runner.invoke(g2p, ["--g2p-verbose", input_file, g2p_file]) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete", results.output) + + # Write text to a temp file, pass it through make-xml -l lang, and then tokenize, # saving the final results into filename. # filename is assumed to be inside self.tempdir, so we count on tearDown() to clean up. 
- def write_prepare_tokenize(self, text, lang, filename): + def write_make_xml_tokenize(self, text, lang, filename): """Create the input file for some test cases in this suite""" with open(filename + ".input.txt", "w", encoding="utf8") as f: print(text, file=f) self.runner.invoke( - prepare, + make_xml, [ "-l", lang, @@ -92,7 +119,7 @@ def write_prepare_tokenize(self, text, lang, filename): def test_english_oov(self): """readalongs g2p should handle English OOVs correctly""" tok_file = os.path.join(self.tempdir, "tok.xml") - self.write_prepare_tokenize("This is a froobnelicious OOV.", "eng", tok_file) + self.write_make_xml_tokenize("This is a froobnelicious OOV.", "eng", tok_file) results = self.runner.invoke(g2p, [tok_file]) if self.show_invoke_output: print( @@ -106,7 +133,7 @@ def test_english_oov(self): # with a fall back to und, it works tok_file_with_fallback = os.path.join(self.tempdir, "fallback.xml") - self.write_prepare_tokenize( + self.write_make_xml_tokenize( "This is a froobnelicious OOV.", "eng:und", tok_file_with_fallback ) results = self.runner.invoke(g2p, [tok_file_with_fallback, "-"]) @@ -122,7 +149,7 @@ def test_french_oov(self): """readalongs g2p should handle French OOVs correctly""" tok_file = os.path.join(self.tempdir, "tok.xml") g2p_file = os.path.join(self.tempdir, "g2p.xml") - self.write_prepare_tokenize( + self.write_make_xml_tokenize( "Le ñ n'est pas dans l'alphabet français.", "fra", tok_file ) results = self.runner.invoke(g2p, [tok_file, g2p_file]) @@ -137,7 +164,7 @@ def test_french_oov(self): # with a fall back to und, it works tok_file2 = os.path.join(self.tempdir, "tok2.xml") - self.write_prepare_tokenize( + self.write_make_xml_tokenize( "Le ñ n'est pas dans l'alphabet français.", "fra:und", tok_file2 ) g2p_file2 = os.path.join(self.tempdir, "g2p-fallback.xml") @@ -154,7 +181,7 @@ def test_three_way_fallback(self): """readalongs g2p --g2p-fallback with multi-step cascades""" tok_file = os.path.join(self.tempdir, "text.tokenized.xml") g2p_file = os.path.join(self.tempdir, "text.g2p.xml") - self.write_prepare_tokenize( + self.write_make_xml_tokenize( "In French été works but Nunavut ᓄᓇᕗᑦ does not.", "eng:fra:iku", tok_file ) # Here we also test generating the output filename from the input filename @@ -172,7 +199,7 @@ def test_three_way_fallback(self): # Run with verbose output and look for the warning messages results = self.runner.invoke( - g2p, ["--g2p-verbose", tok_file, g2p_file + "verbose"], + g2p, ["--debug-g2p", tok_file, g2p_file + "verbose"] ) if self.show_invoke_output: print( @@ -186,7 +213,7 @@ def test_three_way_fallback(self): # this text also works with "und", now that we use unidecode tok_file2 = os.path.join(self.tempdir, "text.tokenized2.xml") - self.write_prepare_tokenize( + self.write_make_xml_tokenize( "In French été works but Nunavut ᓄᓇᕗᑦ does not.", "eng:und", tok_file2 ) results = self.runner.invoke(g2p, [tok_file2, "-"]) @@ -214,6 +241,7 @@ def test_align_with_error(self): ) self.assertNotEqual(results.exit_code, 0) self.assertIn("could not be g2p", results.output) + self.assertNotIn("Number of aligned segments", results.output) with SoundSwallowerStub("t0b0d0p0s0w0:920:1620", "t0b0d0p0s1w0:1620:1690"): results = self.runner.invoke( @@ -236,8 +264,8 @@ def test_align_with_error(self): ) self.assertIn("Trying fallback: fra", results.output) self.assertIn("Trying fallback: iku", results.output) - # We get the found segments printed only if g2p succeeded: - self.assertIn("Segment: t0b0d0p0s0w0", results.output) + 
self.assertNotIn("could not be g2p", results.output) + self.assertIn("Number of aligned segments", results.output) def test_with_stdin(self): """readalongs g2p running with stdin as input""" @@ -287,10 +315,6 @@ def test_align_with_preg2p(self): self.assertIn("HH EH Y", dict_file) # "Hej" in dan self.assertIn("D G IY T UW P IY D", dict_file) # pre-g2p'd OOV - def run_convert_xml(self, input_string): - """wrap convert_xml to make unit testing easier""" - return etree.tounicode(convert_xml(etree.fromstring(input_string))[0]) - def test_convert_xml(self): """unit testing for readalongs.text.convert_xml.convert_xml() @@ -298,12 +322,12 @@ def test_convert_xml(self): It's not very well named, but it still needs unit testing. :) """ self.assertEqual( - self.run_convert_xml("wordnot word"), + run_convert_xml("wordnot word"), 'wordnot word', ) self.assertEqual( - self.run_convert_xml( + run_convert_xml( 'Patrickxtła̱n' ), 'Patrick' @@ -311,10 +335,88 @@ def test_convert_xml(self): ) self.assertEqual( - self.run_convert_xml('Patrickxtła̱n'), + run_convert_xml('Patrickxtła̱n'), 'Patrickxtła̱n', ) + def test_convert_xml_with_newlines(self): + """Newlines inside words are weird, but they should not cause errors""" + + def compact_arpabet(xml_string: str) -> str: + etree_root = etree.fromstring(xml_string) + arpabet = etree_root[0].attrib["ARPABET"] + return re.sub(r"\s+", " ", arpabet) + + converted_1 = run_convert_xml( + """ + first part of the word + second part of the word + """ + ) + converted_2 = run_convert_xml( + "first part of the wordsecond part of the word" + ) + self.assertEqual(compact_arpabet(converted_1), compact_arpabet(converted_2)) + + def test_convert_xml_subwords(self): + """Unit testing for reintroducing subword units""" + self.assertEqual( + run_convert_xml( + 'Patrickxtła̱n' + ), + 'Patrick' + 'xtła̱n', + ) + + self.assertEqual( + run_convert_xml( + 'fooPatrickbarxtła̱nbaz' + ), + '' + 'fooPatrickbarxtła̱nbaz', + ) + + converted_by_syllable = run_convert_xml( + 'abcdefghi' + ) + converted_as_a_whole = run_convert_xml('abcdefghi') + self.assertEqual( + two_xml_elements(converted_by_syllable), + two_xml_elements(converted_as_a_whole), + ) + + moh_eg_with_highlights = "tatiatkèn:sehkwe'" + moh_eg_merged = "tatiatkèn:sehkwe'" + self.assertEqual(two_xml_elements(moh_eg_merged), "") + self.assertEqual( + two_xml_elements(run_convert_xml(moh_eg_with_highlights)), + two_xml_elements(run_convert_xml(moh_eg_merged)), + ) + + moh_example_input_full = """ + + + + tati + atkèn:se + hkwe' + + + """ + _ = run_convert_xml(moh_example_input_full) + + example_with_fallback_lang = """ + + cecinot_really_iku + """ + with self.assertLogs(LOGGER, level="WARNING") as cm: + result = run_convert_xml(example_with_fallback_lang) + self.assertIn("S AH S IY not_really_iku", result) + logger_output = "\n".join(cm.output) + self.assertIn( + 'No valid g2p conversion found for "not_really_iku"', logger_output + ) + def test_convert_xml_invalid(self): """test readalongs.text.convert_xml.convert_xml() with invalid input""" xml = etree.fromstring('valid') @@ -341,11 +443,12 @@ def test_invalid_langs_in_xml(self): """ ) with self.assertLogs(LOGGER, level="WARNING") as cm: - c_xml, valid = convert_xml(xml) + c_xml, valid = convert_xml(xml, verbose_warnings=True) self.assertFalse(valid) logger_output = "\n".join(cm.output) - self.assertIn('"foo": invalid language code', logger_output) - self.assertIn('"crx-syl": no path to "eng-arpabet"', logger_output) + self.assertIn("No lang", logger_output) + self.assertIn("foo", 
logger_output) + self.assertIn('no path from "crx-syl"', logger_output) if __name__ == "__main__": diff --git a/test/test_indices.py b/test/test_indices.py deleted file mode 100755 index 59c85c64..00000000 --- a/test/test_indices.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 - -"""Test suite for handling g2p indices""" - -from unittest import TestCase, main - -from g2p import make_g2p -from g2p.mappings import Mapping -from g2p.transducer import Transducer - -from readalongs.log import LOGGER - - -class TestIndices(TestCase): - """Test suite for handling g2p indices""" - - def test_basic_composition(self): - """Indices mapped through a two-step basic composition""" - mapping = Mapping([{"in": "a", "out": "b"}]) - transducer = Transducer(mapping) - tg = transducer("abba") - self.assertEqual(tg.output_string, "bbbb") - self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2), (3, 3)]) - - def test_tiered_composition(self): - """Indices mapped through a more complex, three-step composition""" - transducer = make_g2p("dan", "eng-arpabet") - tg = transducer("hej") - self.assertEqual(tg.output_string, "HH EH Y ") - self.assertEqual( - tg.edges, - [ - [(0, 0), (1, 1), (2, 2)], - [(0, 0), (1, 1), (2, 2)], - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5), (2, 6), (2, 7)], - ], - ) - self.assertEqual( - tg.pretty_edges(), - [ - [["h", "h"], ["e", "ɛ"], ["j", "j"]], - [["h", "h"], ["ɛ", "ɛ"], ["j", "j"]], - [ - ["h", "H"], - ["h", "H"], - ["h", " "], - ["ɛ", "E"], - ["ɛ", "H"], - ["ɛ", " "], - ["j", "Y"], - ["j", " "], - ], - ], - ) - - def test_composition_with_none(self): - transducer = make_g2p("ctp", "eng-arpabet") - tg = transducer("Qne\u1D2C") - self.assertEqual(tg.output_string, "HH N EY ") - self.assertEqual( - tg.edges, - [ - [(0, 0), (1, 1), (2, 2), (3, None)], - [(0, 0), (1, 1), (2, 2), (2, 3)], - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (3, 6), (3, 7)], - ], - ) - self.assertEqual( - tg.pretty_edges(), - [ - [["q", "ʔ"], ["n", "n"], ["e", "e"], ["ᴬ", None]], - [["ʔ", "ʔ"], ["n", "n"], ["e", "e"], ["e", "ː"]], - [ - ["ʔ", "H"], - ["ʔ", "H"], - ["ʔ", " "], - ["n", "N"], - ["n", " "], - ["e", "E"], - ["ː", "Y"], - ["ː", " "], - ], - ], - ) - - def test_fra(self): - transducer = make_g2p("fra", "eng-arpabet") - tg = transducer("mais") - self.assertEqual(tg.output_string, "M EH ") - - -if __name__ == "__main__": - LOGGER.setLevel("DEBUG") - main() diff --git a/test/test_prepare_cli.py b/test/test_make_xml_cli.py similarity index 67% rename from test/test_prepare_cli.py rename to test/test_make_xml_cli.py index d5b09975..7cb59f75 100755 --- a/test/test_prepare_cli.py +++ b/test/test_make_xml_cli.py @@ -1,6 +1,6 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python -"""Test suite for the readalongs prepare CLI command""" +"""Test suite for the readalongs make_xml CLI command""" import io import os @@ -10,13 +10,13 @@ from basic_test_case import BasicTestCase -from readalongs.align import create_input_tei -from readalongs.cli import align, prepare +from readalongs.align import create_input_tei, create_tei_from_text +from readalongs.cli import align, make_xml from readalongs.log import LOGGER -class TestPrepareCli(BasicTestCase): - """Test suite for the readalongs prepare CLI command""" +class TestMakeXMLCli(BasicTestCase): + """Test suite for the readalongs make-xml CLI command""" def setUp(self): super().setUp() @@ -25,56 +25,63 @@ def setUp(self): pass def test_invoke_prepare(self): - """Basic usage of readalongs prepare""" + """Basic usage of deprecated readalongs prepare""" results 
= self.runner.invoke( - prepare, + make_xml, ["-l", "atj", "-d", self.empty_file, os.path.join(self.tempdir, "delme")], ) self.assertEqual(results.exit_code, 0) - self.assertRegex(results.stdout, "Running readalongs prepare") - # print('Prepare.stdout: {}'.format(results.stdout)) + + def test_invoke_make_xml(self): + """Basic usage of readalongs make-xml""" + results = self.runner.invoke( + make_xml, + ["-l", "atj", "-d", self.empty_file, os.path.join(self.tempdir, "delme")], + ) + self.assertEqual(results.exit_code, 0) + self.assertRegex(results.stdout, "Running readalongs make-xml") def test_no_lang(self): - """Error case: readalongs prepare without the mandatory -l switch""" + """Error case: readalongs make-xml without the mandatory -l switch""" results = self.runner.invoke( - prepare, [self.empty_file, self.empty_file + ".xml"] + make_xml, [self.empty_file, self.empty_file + ".xml"] ) self.assertNotEqual(results.exit_code, 0) self.assertRegex(results.stdout, "Missing.*language") def test_inputfile_not_exist(self): """Error case: input file does not exist""" - results = self.runner.invoke(prepare, "-l atj /file/does/not/exist delme") + results = self.runner.invoke(make_xml, "-l atj /file/does/not/exist delme") self.assertNotEqual(results.exit_code, 0) self.assertRegex(results.stdout, "No such file or directory") def test_outputfile_exists(self): - """Existing output file should not be overwritten by readalongs prepare by default""" + """Existing output file should not be overwritten by readalongs make-xml by default""" results = self.runner.invoke( - prepare, + make_xml, ["-l", "atj", self.empty_file, os.path.join(self.tempdir, "exists")], ) results = self.runner.invoke( - prepare, + make_xml, ["-l", "atj", self.empty_file, os.path.join(self.tempdir, "exists")], ) self.assertNotEqual(results.exit_code, 0) self.assertRegex(results.stdout, "exists.*overwrite") def test_output_exists(self): - """Make sure readalongs prepare create the expected output file""" + """Make sure readalongs make-xml creates the expected output file""" xmlfile = os.path.join(self.tempdir, "fra.xml") results = self.runner.invoke( - prepare, ["-l", "fra", os.path.join(self.data_dir, "fra.txt"), xmlfile] + make_xml, ["-l", "fra", os.path.join(self.data_dir, "fra.txt"), xmlfile] ) self.assertEqual(results.exit_code, 0) self.assertTrue(os.path.exists(xmlfile), "output xmlfile did not get created") def test_output_correct(self): - """Make sure the contents of readalongs prepare's output file is correct.""" + """Make sure the contents of readalongs make-xml's output file are correct.""" input_file = os.path.join(self.data_dir, "fra.txt") xml_file = os.path.join(self.tempdir, "fra.xml") - results = self.runner.invoke(prepare, ["-l", "fra", input_file, xml_file]) + results = self.runner.invoke(make_xml, ["-l", "fra", input_file, xml_file]) self.assertEqual(results.exit_code, 0) ref_file = os.path.join(self.data_dir, "fra-prepared.xml") @@ -89,8 +96,8 @@ def test_input_is_stdin(self): - """Validate that readalongs prepare can use stdin as input""" - results = self.runner.invoke(prepare, "-l fra -", input="Ceci est un test.") + """Validate that readalongs make-xml can use stdin as input""" + results = self.runner.invoke(make_xml, "-l fra -", input="Ceci est un test.") # LOGGER.warning("Output: {}".format(results.output)) # LOGGER.warning("Exception: {}".format(results.exception)) self.assertEqual(results.exit_code, 0) self.assertIn('',
results.output) - results = self.runner.invoke(prepare, ["-l", "fra,iku:und", input_file, "-"]) + results = self.runner.invoke(make_xml, ["-l", "fra,iku:und", input_file, "-"]) self.assertEqual(results.exit_code, 0) self.assertIn('', results.output) results = self.runner.invoke( - prepare, ["-l", "fra:iku", "-l", "und", input_file, "-"] + make_xml, ["-l", "fra:iku", "-l", "und", input_file, "-"] ) self.assertEqual(results.exit_code, 0) self.assertIn('', results.output) - def test_prepare_invalid_lang(self): + def test_make_xml_invalid_lang(self): input_file = os.path.join(self.data_dir, "fra.txt") results = self.runner.invoke( - prepare, ["-l", "fra:notalang:und", input_file, "-"] + make_xml, ["-l", "fra:notalang:und", input_file, "-"] ) self.assertNotEqual(results.exit_code, 0) self.assertRegex(results.output, r"Invalid value.*'notalang'") - def test_prepare_invalid_utf8_input(self): + def test_make_xml_invalid_utf8_input(self): noise_file = os.path.join(self.data_dir, "noise.mp3") # Read noise.mp3 as if it was utf8 text, via create_input_tei(input_file_handle) - results = self.runner.invoke(prepare, ["-l", "fra", noise_file, "-"]) + results = self.runner.invoke(make_xml, ["-l", "fra", noise_file, "-"]) self.assertNotEqual(results.exit_code, 0) self.assertIn("provide a correctly encoded utf-8", results.output) # Read noise.mp3 as if it was utf8 text, via create_input_tei(input_file_name) results = self.runner.invoke( - prepare, ["-l", "fra", noise_file, os.path.join(self.tempdir, "noise.xml")] + make_xml, ["-l", "fra", noise_file, os.path.join(self.tempdir, "noise.xml")] ) self.assertNotEqual(results.exit_code, 0) self.assertIn("provide a correctly encoded utf-8", results.output) @@ -231,6 +238,19 @@ def test_prepare_invalid_utf8_input(self): self.assertNotEqual(results.exit_code, 0) self.assertIn("provide a correctly encoded utf-8", results.output) + def test_blank_lines_stripped(self): + """Blank lines for paragraph and page breaks are allowed to have whitespace""" + input_text_with_spaces = "Ceci est un test\n \nParagraphe\n\t \n \nPage\n" + input_text_stripped = "Ceci est un test\n\nParagraphe\n\n\nPage\n" + + def text2lines(text: str): + return io.StringIO(text).readlines() + + self.assertEqual( + create_tei_from_text(text2lines(input_text_with_spaces), ["fra"]), + create_tei_from_text(text2lines(input_text_stripped), ["fra"]), + ) + if __name__ == "__main__": main() diff --git a/test/test_misc.py b/test/test_misc.py index 21a96347..76fd13a9 100755 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for misc stuff that don't need their own stand-alone suite""" @@ -10,8 +10,13 @@ from test_dna_utils import segments_from_pairs from readalongs.align import split_silences -from readalongs.text.util import get_attrib_recursive, get_lang_attrib, parse_time -from readalongs.util import JoinerCallback +from readalongs.text.util import ( + get_attrib_recursive, + get_lang_attrib, + get_word_text, + parse_time, +) +from readalongs.util import JoinerCallbackForClick class TestMisc(TestCase): @@ -154,13 +159,35 @@ def test_get_attrib_recursive(self): # get_attrib_recursive() --EJJ Nov 2021 def test_joiner_callback(self): - cb = JoinerCallback(iter("qwer")) # iterable over four characters + cb = JoinerCallbackForClick(iter("qwer")) # iterable over four characters self.assertEqual(cb(None, None, ["e:r"]), ["e", "r"]) self.assertEqual(cb(None, None, ["q,w"]), ["q", "w"]) with self.assertRaises(click.BadParameter): cb(None, 
None, ["q:e", "a,w"]) self.assertEqual(cb(None, None, ["r:q", "w"]), ["r", "q", "w"]) + def test_get_word_text(self): + self.assertEqual( + get_word_text(etree.fromstring("<w>basicword</w>")), + "basicword", + ) + self.assertEqual( + get_word_text(etree.fromstring("<w><subw>subwcase</subw></w>")), + "subwcase", + ) + self.assertEqual( + get_word_text(etree.fromstring("<w><syl>syl1</syl><syl>syl2</syl></w>")), + "syl1syl2", + ) + self.assertEqual( + get_word_text(etree.fromstring("<w>text<subw>sub</subw>tail</w>")), + "textsubtail", + ) + self.assertEqual( + get_word_text(etree.fromstring("<w>a<syl>b<subsyl>c</subsyl></syl>d</w>")), + "abcd", + ) + if __name__ == "__main__": main() diff --git a/test/test_package_urls.py b/test/test_package_urls.py index 4f76598f..d27df58a 100755 --- a/test/test_package_urls.py +++ b/test/test_package_urls.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python from unittest import main diff --git a/test/test_silence.py b/test/test_silence.py index 212ace8a..9b6691e2 100755 --- a/test/test_silence.py +++ b/test/test_silence.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for inserting silences into a readalong""" diff --git a/test/test_smil.py b/test/test_smil.py new file mode 100644 index 00000000..4217b7c3 --- /dev/null +++ b/test/test_smil.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python + +""" +Unit test suite for the smil writing and parsing utilities +""" + +from textwrap import dedent +from unittest import main + +from basic_test_case import BasicTestCase + +from readalongs.text.make_smil import make_smil, parse_smil + + +class TestSmilUtilities(BasicTestCase): + """Unit test suite for the smil writing and parsing utilities""" + + def setUp(self): + super().setUp() + self.words = [ + {"id": "w1", "start": 0.01, "end": 0.75}, + {"id": "w2", "start": 0.8, "end": 1.04}, + # Make one of the IDs contain a non-ASCII UTF-8 character, to test that it is handled correctly.
+ {"id": "wé3", "start": 1.2, "end": 1.33}, + ] + self.smil = dedent( + """\ + + + + + + + + + + + + + + """ + ) + + def test_make_smil(self): + text_path = "my_text_path" + audio_path = "my_audio_path" + smil = make_smil(text_path, audio_path, self.words) + self.assertEqual(smil, self.smil) + + def test_parse_smil(self): + words = parse_smil(self.smil) + self.assertEqual(words, self.words) + + def test_parse_bad_smil(self): + with self.assertRaises(ValueError): + _ = parse_smil("this is not XML") + + missing_id = dedent( + """\ + + + + + + + + """ + ) + with self.assertRaises(ValueError): + _ = parse_smil(missing_id) + + missing_clip_end = dedent( + """\ + + + + + + + + """ + ) + with self.assertRaises(ValueError): + _ = parse_smil(missing_clip_end) + + bad_float = dedent( + """\ + + + + + + + + """ + ) + with self.assertRaises(ValueError): + _ = parse_smil(bad_float) + + +if __name__ == "__main__": + main() diff --git a/test/test_temp_file.py b/test/test_temp_file.py index 0f38924b..e468f6ac 100755 --- a/test/test_temp_file.py +++ b/test/test_temp_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test PortableNamedTemporaryFile class""" diff --git a/test/test_tokenize_cli.py b/test/test_tokenize_cli.py index 87497cc7..59e6f58f 100755 --- a/test/test_tokenize_cli.py +++ b/test/test_tokenize_cli.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Test suite for readalongs tokenize""" @@ -8,7 +8,7 @@ from basic_test_case import BasicTestCase -from readalongs.cli import prepare, tokenize +from readalongs.cli import make_xml, tokenize # from readalongs.log import LOGGER @@ -17,12 +17,13 @@ class TestTokenizeCli(BasicTestCase): """Test suite for the readalongs tokenize CLI command""" def setUp(self): - """setUp() creates self.tempdir and prepares an XML file for use in other tests""" + """setUp() creates self.tempdir and makes an XML file for use in other tests""" super().setUp() self.xmlfile = os.path.join(self.tempdir, "fra.xml") _ = self.runner.invoke( - prepare, ["-l", "fra", os.path.join(self.data_dir, "fra.txt"), self.xmlfile] + make_xml, + ["-l", "fra", os.path.join(self.data_dir, "fra.txt"), self.xmlfile], ) def test_invoke_tok(self): diff --git a/test/test_tokenize_xml.py b/test/test_tokenize_xml.py index 84dfa8e0..4fd3599e 100755 --- a/test/test_tokenize_xml.py +++ b/test/test_tokenize_xml.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Unit test suite for our XML tokenizer module""" diff --git a/test/test_web_api.py b/test/test_web_api.py new file mode 100755 index 00000000..c0a2e93c --- /dev/null +++ b/test/test_web_api.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python + +import os +import re +from textwrap import dedent +from unittest import main + +from basic_test_case import BasicTestCase +from fastapi.testclient import TestClient +from lxml import etree + +from readalongs.log import LOGGER +from readalongs.text.add_ids_to_xml import add_ids +from readalongs.text.convert_xml import convert_xml +from readalongs.text.tokenize_xml import tokenize_xml +from readalongs.util import get_langs +from readalongs.web_api import FormatName, create_grammar, web_api_app + +API_CLIENT = TestClient(web_api_app) + + +class TestWebApi(BasicTestCase): + def slurp_data_file(self, filename: str) -> str: + """Convenience function to slurp a whole file in self.data_dir""" + with open(os.path.join(self.data_dir, filename), encoding="utf8") as f: + return f.read().strip() + + def test_assemble_from_plain_text(self): + # Test the assemble endpoint 
+API_CLIENT = TestClient(web_api_app)
+
+
+class TestWebApi(BasicTestCase):
+    def slurp_data_file(self, filename: str) -> str:
+        """Convenience function to slurp a whole file in self.data_dir"""
+        with open(os.path.join(self.data_dir, filename), encoding="utf8") as f:
+            return f.read().strip()
+
+    def test_assemble_from_plain_text(self):
+        # Test the assemble endpoint with plain text
+        request = {
+            "text": self.slurp_data_file("ej-fra.txt"),
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        self.assertEqual(response.status_code, 200)
+
+    def test_bad_path(self):
+        # Test a request to a path that doesn't exist
+        response = API_CLIENT.get("/pathdoesntexist")
+        self.assertEqual(response.status_code, 404)
+
+    def test_bad_method(self):
+        # Test a request to a valid path with a bad method
+        response = API_CLIENT.get("/api/v1/assemble")
+        self.assertEqual(response.status_code, 405)
+
+    def test_assemble_from_xml(self):
+        # Test the assemble endpoint with XML
+        request = {
+            "encoding": "utf-8",  # for backward compat, make sure the encoding is allowed but ignored
+            "xml": self.slurp_data_file("ej-fra.xml"),
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        self.assertEqual(response.status_code, 200)
+
+    def test_bad_xml(self):
+        # Test the assemble endpoint with invalid XML
+        request = {
+            "xml": "this is not xml",
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        self.assertEqual(response.status_code, 422)
+
+    def test_create_grammar(self):
+        # Test the create grammar function
+        parsed = etree.fromstring(
+            bytes(self.slurp_data_file("ej-fra.xml"), encoding="utf8")
+        )
+        tokenized = tokenize_xml(parsed)
+        ids_added = add_ids(tokenized)
+        g2ped, valid = convert_xml(ids_added)
+        word_dict, fsg, text = create_grammar(g2ped)
+        self.assertTrue(valid)
+        self.assertIn("Auto-generated JSGF grammar", fsg)
+        self.assertEqual(len(word_dict), len(text.split()))
+        self.assertEqual(len(word_dict), 99)
+
+    def test_bad_g2p(self):
+        # Test the assemble endpoint with invalid g2p languages
+        request = {
+            "text": "blah blah",
+            "text_languages": ["test"],
+        }
+        with self.assertLogs(LOGGER, "ERROR"):
+            response = API_CLIENT.post("/api/v1/assemble", json=request)
+        self.assertEqual(response.status_code, 422)
+
+    def test_langs(self):
+        # Test the langs endpoint
+        response = API_CLIENT.get("/api/v1/langs")
+        self.assertEqual(response.json(), get_langs()[1])
+        self.assertEqual(set(response.json().keys()), set(get_langs()[0]))
+
+    def test_debug(self):
+        # Test the assemble endpoint with debug mode on
+        request = {
+            "text": self.slurp_data_file("ej-fra.txt"),
+            "debug": True,
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        content = response.json()
+        self.assertEqual(content["input"], request)
+        self.assertGreater(len(content["tokenized"]), 10)
+        self.assertGreater(len(content["parsed"]), 10)
+        self.assertGreater(len(content["g2ped"]), 10)
+
+        # Test that debug mode is off by default
+        request = {
+            "text": "Ceci est un test.",
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        content = response.json()
+        self.assertIsNone(content["input"])
+        self.assertIsNone(content["tokenized"])
+        self.assertIsNone(content["parsed"])
+        self.assertIsNone(content["g2ped"])
+
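+    # Shared fixtures for the convert_alignment tests below: a tiny two-word
+    # read-along ("hej é" / "verden à") and the SMIL alignment that maps each
+    # word's id to its clip times within an 83.1 second audio file.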
+    hej_verden_xml = dedent(
+        """\
+        <?xml version='1.0' encoding='utf-8'?>
+        <TEI>
+            <text xml:lang="dan">
+                <body>
+                    <div type="page">
+                        <p>
+                            <s id="s0"><w id="wé1">hej é</w> <w id="wé2">verden à</w></s>
+                        </p>
+                    </div>
+                </body>
+            </text>
+        </TEI>
+        """
+    )
+
+    hej_verden_smil = dedent(
+        """\
+        <smil xmlns="http://www.w3.org/ns/SMIL" version="3.0">
+            <body>
+                <par id="par-wé1">
+                    <text src="hej-verden.xml#wé1"/>
+                    <audio src="hej-verden.mp3" clipBegin="17.745" clipEnd="58.6"/>
+                </par>
+                <par id="par-wé2">
+                    <text src="hej-verden.xml#wé2"/>
+                    <audio src="hej-verden.mp3" clipBegin="58.6" clipEnd="82.19"/>
+                </par>
+            </body>
+        </smil>
+        """
+    )
+
+    def test_convert_to_TextGrid(self):
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/textgrid", json=request)
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("aligned.TextGrid", response.headers["content-disposition"])
+        self.assertEqual(
+            response.text,
+            dedent(
+                """\
+                File type = "ooTextFile"
+                Object class = "TextGrid"
+
+                xmin = 0.000000
+                xmax = 83.100000
+                tiers? <exists>
+                size = 2
+                item []:
+                    item [1]:
+                        class = "IntervalTier"
+                        name = "Sentence"
+                        xmin = 0.000000
+                        xmax = 83.100000
+                        intervals: size = 3
+                        intervals [1]:
+                            xmin = 0.000000
+                            xmax = 17.745000
+                            text = ""
+                        intervals [2]:
+                            xmin = 17.745000
+                            xmax = 82.190000
+                            text = "hej é verden à"
+                        intervals [3]:
+                            xmin = 82.190000
+                            xmax = 83.100000
+                            text = ""
+                    item [2]:
+                        class = "IntervalTier"
+                        name = "Word"
+                        xmin = 0.000000
+                        xmax = 83.100000
+                        intervals: size = 4
+                        intervals [1]:
+                            xmin = 0.000000
+                            xmax = 17.745000
+                            text = ""
+                        intervals [2]:
+                            xmin = 17.745000
+                            xmax = 58.600000
+                            text = "hej é"
+                        intervals [3]:
+                            xmin = 58.600000
+                            xmax = 82.190000
+                            text = "verden à"
+                        intervals [4]:
+                            xmin = 82.190000
+                            xmax = 83.100000
+                            text = ""
+                """
+            ),
+        )
+
+    def test_convert_to_eaf(self):
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/eaf", json=request)
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("<ANNOTATION_DOCUMENT", response.text)
+        self.assertIn("aligned.eaf", response.headers["content-disposition"])
+
+    def test_convert_to_srt(self):
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post(
+            "/api/v1/convert_alignment/srt?tier=sentence", json=request
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("aligned_sentences.srt", response.headers["content-disposition"])
+        self.assertEqual(
+            response.text.replace("\r", ""),  # CRLF->LF, in case we're on Windows.
+            dedent(
+                """\
+                1
+                00:00:17,745 --> 00:01:22,190
+                hej é verden à
+
+                """
+            ),
+        )
+
+        response = API_CLIENT.post(
+            "/api/v1/convert_alignment/srt?tier=word", json=request
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("aligned_words.srt", response.headers["content-disposition"])
+        self.assertEqual(
+            response.text.replace("\r", ""),  # CRLF->LF, in case we're on Windows.
+            dedent(
+                """\
+                1
+                00:00:17,745 --> 00:00:58,600
+                hej é
+
+                2
+                00:00:58,600 --> 00:01:22,190
+                verden à
+
+                """
+            ),
+        )
+
+    def test_convert_to_vtt(self):
+        request = {
+            "encoding": "utf-8",  # for backward compat, make sure the encoding is allowed but ignored
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post(
+            "/api/v1/convert_alignment/vtt?tier=sentence", json=request
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("aligned_sentences.vtt", response.headers["content-disposition"])
+        self.assertEqual(
+            response.text.replace("\r", ""),  # CRLF->LF, in case we're on Windows.
+            dedent(
+                """\
+                WEBVTT
+
+                00:00:17.745 --> 00:01:22.190
+                hej é verden à
+                """
+            ),
+        )
+
+        response = API_CLIENT.post(
+            "/api/v1/convert_alignment/vtt?tier=word", json=request
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("aligned_words.vtt", response.headers["content-disposition"])
+        self.assertEqual(
+            response.text.replace("\r", ""),  # CRLF->LF, in case we're on Windows.
+            dedent(
+                """\
+                WEBVTT
+
+                00:00:17.745 --> 00:00:58.600
+                hej é
+
+                00:00:58.600 --> 00:01:22.190
+                verden à
+                """
+            ),
+        )
+
+    def test_convert_to_TextGrid_errors(self):
+        request = {
+            "audio_duration": 83.1,
+            "xml": "this is not XML",
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/textgrid", json=request)
+        self.assertEqual(response.status_code, 422, "Invalid XML should fail.")
+
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": "This is not SMIL",
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/textgrid", json=request)
+        self.assertEqual(response.status_code, 422, "Invalid SMIL should fail.")
+
+        request = {
+            "audio_duration": -10.0,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/textgrid", json=request)
+        self.assertEqual(response.status_code, 422, "Negative duration should fail.")
+
+    def test_cleanup_temp_dir(self):
+        """Make sure convert's temporary directory actually gets deleted."""
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        with self.assertLogs(LOGGER, "INFO") as log_cm:
+            response = API_CLIENT.post(
+                "/api/v1/convert_alignment/textgrid", json=request
+            )
+        self.assertEqual(response.status_code, 200)
+        # print(log_cm.output)
+        match = re.search(
+            "Temporary directory: (.*)($|\r|\n)", "\n".join(log_cm.output)
+        )
+        self.assertIsNotNone(match)
+        self.assertFalse(os.path.isdir(match[1]))
+
+    def test_cleanup_even_if_error(self):
+        # This is seriously white-box testing... this XML has IDs that don't
+        # match those in the SMIL file, which will cause an exception deeper
+        # in the code, after the temporary directory has been created.  Here
+        # we exercise catching that exception in a sane way, with a 422 status
+        # code, while also making sure the temporary directory gets deleted.
+        mismatch_xml = dedent(
+            """\
+            <?xml version='1.0' encoding='utf-8'?>
+            <TEI>
+                <text xml:lang="dan">
+                    <body>
+                        <div type="page">
+                            <p>
+                                <s id="s0"><w id="mismatch1">hej é</w> <w id="mismatch2">verden à</w></s>
+                            </p>
+                        </div>
+                    </body>
+                </text>
+            </TEI>
+            """
+        )
+        request = {
+            "audio_duration": 83.1,
+            "xml": mismatch_xml,
+            "smil": self.hej_verden_smil,
+        }
+        for format_name in FormatName:
+            with self.assertLogs(LOGGER, "INFO") as log_cm:
+                response = API_CLIENT.post(
+                    f"/api/v1/convert_alignment/{format_name.value}", json=request
+                )
+            self.assertEqual(response.status_code, 422)
+            # print(log_cm.output)
+            match = re.search(
+                "Temporary directory: (.*)($|\r|\n)", "\n".join(log_cm.output)
+            )
+            self.assertIsNotNone(match)
+            self.assertFalse(os.path.isdir(match[1]))
+
+    def test_convert_to_bad_format(self):
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment/badformat", json=request)
+        self.assertEqual(response.status_code, 422)
+
+        request = {
+            "audio_duration": 83.1,
+            "xml": self.hej_verden_xml,
+            "smil": self.hej_verden_smil,
+        }
+        response = API_CLIENT.post("/api/v1/convert_alignment", json=request)
+        self.assertEqual(response.status_code, 404)
+
+        response = API_CLIENT.post(
+            "/api/v1/convert_alignment/vtt?tier=badtier", json=request
+        )
+        self.assertEqual(response.status_code, 422)
+
+
+if __name__ == "__main__":
+    main()