diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml index 64ae95f..6859d6f 100644 --- a/.github/workflows/runtests.yml +++ b/.github/workflows/runtests.yml @@ -20,6 +20,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.8", "3.9", "3.10"] + testsuite: ["minimal", "full"] steps: - uses: actions/checkout@v2 - name: set up python ${{ matrix.python-version }} @@ -27,14 +28,12 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: 'pip' - - name: install system dependencies - if: runner.os != 'Windows' + - name: install system dependencies (linux) + if: runner.os == 'Linux' # only managed to install system dependencies on Linux runners run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt update - sudo apt install libgmp-dev libmpfr-dev libmpc-dev - fi + sudo apt update + sudo apt install libgmp-dev libmpfr-dev libmpc-dev - name: install python dependencies run: | python -m pip install --upgrade pip @@ -42,8 +41,11 @@ jobs: - name: run tox (linux) # since system dependencies could only be installed on Linux runners, we run the "full" suite only on Linux ... if: runner.os == 'Linux' - run: tox -e py-full -- --hypothesis-profile=ci - - name: run tox (macos or windows) - # ... on all other OS we run the "recommendedextra" suite - if: runner.os != 'Linux' + run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci + - name: run tox (macos or windows - minimal) + if: runner.os != 'Linux' && matrix.testsuite == 'minimal' + run: tox -e py-minimal -- --hypothesis-profile=ci + - name: run tox (macos or windows - recommendedextra) + # ... 
on all other OS we run the "recommendedextra" suite instead of the "full" suite + if: runner.os != 'Linux' && matrix.testsuite == 'full' run: tox -e py-recommendedextra -- --hypothesis-profile=ci diff --git a/README.rst b/README.rst index 6feb639..5bdf640 100644 --- a/README.rst +++ b/README.rst @@ -14,12 +14,37 @@ The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org `_. -.. note:: Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API - for text processing and mining which is incompatible with prior versions. It's advisable to first read the - first three chapters of the `tutorial `_ - to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely - remove the old version prior to upgrading. See the - `installation instructions `_. +**Upgrade note:** + +Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API +for text processing and mining which is incompatible with prior versions. It's advisable to first read the +first three chapters of the `tutorial `_ +to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely +remove the old version prior to upgrading. See the +`installation instructions `_. + +Requirements and installation +----------------------------- + +**tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).** + +The tmtoolkit package is highly modular and tries to install as few dependencies as possible. For requirements and +installation procedures, please have a look at the +`installation section in the documentation `_. For short, +the recommended way of installing tmtoolkit is to create and activate a +`Python Virtual Environment ("venv") `_ and then install tmtoolkit with +a recommended set of dependencies and a list of language models via the following: + +.. 
code-block:: text + + pip install -U "tmtoolkit[recommended]" + # add or remove language codes in the list for installing the models that you need; + # don't use spaces in the list of languages + python -m tmtoolkit setup en,de + +Again, you should have a look at the detailed +`installation instructions `_ in order to install additional +packages that enable more features such as topic modeling. Features -------- @@ -93,14 +118,8 @@ Limits * all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example `Gensim `_ supports) -Requirements and installation -============================== - -For requirements and installation procedures, please have a look at the -`installation section in the documentation `_. - License -======= +------- Code licensed under `Apache License 2.0 `_. See `LICENSE `_ file. diff --git a/conftest.py b/conftest.py index bbda0eb..6b973c7 100644 --- a/conftest.py +++ b/conftest.py @@ -7,4 +7,5 @@ from hypothesis import settings, HealthCheck # profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck -settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, )) +# and set the timeout deadline very high (60 sec.) +settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000) diff --git a/doc/source/install.rst b/doc/source/install.rst index 0b3b3f3..9fb3e66 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -68,21 +68,22 @@ on the preferred package for topic modeling: # you may also select several topic modeling packages pip install -U "tmtoolkit[recommended,lda,sklearn,gensim]" -The minimal installation will only install a base set of dependencies and will only enable the modules for BoW +The **minimal** installation will only install a base set of dependencies and will only enable the modules for BoW statistics, token sequence operations, topic modeling and utility functions. 
You can install it as follows: .. code-block:: text + # alternative installation if you only want to install a minimum set of dependencies pip install -U tmtoolkit -.. note:: - The tmtoolkit package is about 7MB big, because it contains some example corpora. +.. note:: The tmtoolkit package is about 7MB big, because it contains some example corpora. -After that, you should initially run tmtoolkit's setup routine. This makes sure that all required data files are +**After that, you should initially run tmtoolkit's setup routine.** This makes sure that all required data files are present and downloads them if necessary. You should specify a list of languages for which language models should be downloaded and installed. The list of available language models corresponds with the models provided by `SpaCy `_ (except for "multi-language"). You need to specify the two-letter ISO -language code for the language models that you want to install. E.g. in order to install models for English and German: +language code for the language models that you want to install. **Don't use spaces in the list of languages.** +E.g. in order to install models for English and German: .. code-block:: text diff --git a/doc/source/version_history.rst b/doc/source/version_history.rst index 8324f17..ada6e30 100644 --- a/doc/source/version_history.rst +++ b/doc/source/version_history.rst @@ -3,6 +3,11 @@ Version history =============== +0.11.1 - 2022-02-10 +------------------- + +- show better error messages when dependencies for optional module ``corpus`` are not met +- fix a SciPy deprecation warning 0.11.0 - 2022-02-08 ------------------- diff --git a/examples/README.md b/examples/README.md index 0549399..f56d6cc 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,3 +1,6 @@ # Examples -This folder contains very few examples for *tmtoolkit*. The majority of examples is available as Jupyter Notebooks as part of the [documentation](https://tmtoolkit.readthedocs.io/). 
You may download these notebooks from the [documentation source](https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/doc/source) and run them on your computer. +This folder contains very few examples for *tmtoolkit*. The majority of examples is available as Jupyter Notebooks as +part of the [documentation](https://tmtoolkit.readthedocs.io/). You may download these notebooks from +the [documentation source](https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/doc/source) and run them +on your computer. diff --git a/examples/benchmark_en_newsarticles.py b/examples/benchmark_en_newsarticles.py index d6dd430..423fc5c 100644 --- a/examples/benchmark_en_newsarticles.py +++ b/examples/benchmark_en_newsarticles.py @@ -1,6 +1,14 @@ """ Benchmarking script that loads and processes English language test corpus with Corpus in parallel. +This example requires that you have installed tmtoolkit with the recommended set of packages and have installed an +English language model for spaCy: + + pip install -U "tmtoolkit[recommended]" + python -m tmtoolkit setup en + +For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html + To benchmark whole script with `time` from command line run: PYTHONPATH=.. /usr/bin/time -v python benchmark_en_newsarticles.py [NUMBER OF WORKERS] diff --git a/examples/bundestag18_tfidf.py b/examples/bundestag18_tfidf.py index 818d66d..fa30016 100644 --- a/examples/bundestag18_tfidf.py +++ b/examples/bundestag18_tfidf.py @@ -9,6 +9,14 @@ The data for the debates comes from offenesparlament.de, see https://github.com/Datenschule/offenesparlament-data.
+This example requires that you have installed tmtoolkit with the recommended set of packages and have installed a +German language model for spaCy: + + pip install -U "tmtoolkit[recommended]" + python -m tmtoolkit setup de + +For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html + Markus Konrad June 2019 / Feb. 2022 """ diff --git a/examples/gensim_evaluation.py b/examples/gensim_evaluation.py index 1588eac..b0cba19 100644 --- a/examples/gensim_evaluation.py +++ b/examples/gensim_evaluation.py @@ -1,11 +1,17 @@ """ An example for topic modeling evaluation with gensim. -Please note that this is just an example for showing how to perform Topic Model evaluation with Gensim. The +Please note that this is just an example for showing how to perform topic model evaluation with Gensim. The preprocessing of the data is just done quickly and probably not the best way for the given data. -**Important note for Windows users:** -You need to wrap all of the following code in a `if __name__ == '__main__'` block (just as in `lda_evaluation.py`). +This example requires that you have installed tmtoolkit with the recommended set of packages plus Gensim and have +installed a German language model for spaCy: + + pip install -U "tmtoolkit[recommended,gensim]" + python -m tmtoolkit setup de + +For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html + """ diff --git a/examples/topicmod_lda.py b/examples/topicmod_lda.py index 3656663..5213c25 100644 --- a/examples/topicmod_lda.py +++ b/examples/topicmod_lda.py @@ -2,6 +2,14 @@ An example for topic modeling with LDA with focus on the new plotting functions in `tmtoolkit.corpus.visualize` and in `tmtoolkit.topicmod.visualize`.
+This example requires that you have installed tmtoolkit with the recommended set of packages plus "lda" and have +installed an English language model for spaCy: + + pip install -U "tmtoolkit[recommended,lda]" + python -m tmtoolkit setup en + +For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html + .. codeauthor:: Markus Konrad """ diff --git a/setup.py b/setup.py index 434f059..ec46243 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ from setuptools import setup, find_packages __title__ = 'tmtoolkit' -__version__ = '0.11.0' +__version__ = '0.11.1' __author__ = 'Markus Konrad' __license__ = 'Apache License 2.0' diff --git a/tests/test_corpus.py b/tests/test_corpus.py index ade1414..dc13669 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -5,6 +5,7 @@ .. codeauthor:: Markus Konrad """ + import math import os.path import random @@ -20,8 +21,8 @@ import pytest from hypothesis import given, strategies as st, settings -if not find_spec('spacy'): - pytest.skip("skipping tmtoolkit.corpus tests (spacy not installed)", allow_module_level=True) +if any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')): + pytest.skip("skipping tmtoolkit.corpus tests (required packages not installed)", allow_module_level=True) import spacy from spacy.tokens import Doc diff --git a/tests/test_corpusimport.py b/tests/test_corpusimport.py new file mode 100644 index 0000000..ed91921 --- /dev/null +++ b/tests/test_corpusimport.py @@ -0,0 +1,23 @@ +""" +Tests for importing optional tmtoolkit.corpus module. + +..
codeauthor:: Markus Konrad +""" + +from importlib.util import find_spec + +import pytest + + +def test_import_corpus(): + if any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')): + with pytest.raises(RuntimeError, match='^the required package'): + from tmtoolkit import corpus + with pytest.raises(RuntimeError, match='^the required package'): + from tmtoolkit.corpus import Corpus + else: + from tmtoolkit import corpus + from tmtoolkit.corpus import Corpus + import spacy + import bidict + import loky diff --git a/tests/test_topicmod_model_stats.py b/tests/test_topicmod_model_stats.py index f2ba85b..58207b6 100644 --- a/tests/test_topicmod_model_stats.py +++ b/tests/test_topicmod_model_stats.py @@ -376,11 +376,6 @@ def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_): def test_filter_topics(): - try: - import tmtoolkit.corpus - except ImportError: - pytest.skip("text processing dependencies not installed") - vocab = np.array(['abc', 'abcd', 'cde', 'efg', 'xyz']) distrib = np.array([ # top 3 terms: [0.6, 0.3, 0.05, 0.025, 0.025], # abc, abcd, cde diff --git a/tmtoolkit/__init__.py b/tmtoolkit/__init__.py index 9b5e9cc..eb416d8 100644 --- a/tmtoolkit/__init__.py +++ b/tmtoolkit/__init__.py @@ -8,7 +8,7 @@ import logging __title__ = 'tmtoolkit' -__version__ = '0.11.0' +__version__ = '0.11.1' __author__ = 'Markus Konrad' __license__ = 'Apache License 2.0' @@ -19,5 +19,5 @@ from . import bow, topicmod, tokenseq, types, utils -if find_spec('spacy') and find_spec('globre'): +if not any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')): from . 
import corpus diff --git a/tmtoolkit/corpus/__init__.py b/tmtoolkit/corpus/__init__.py index 10b8b47..7b408d8 100644 --- a/tmtoolkit/corpus/__init__.py +++ b/tmtoolkit/corpus/__init__.py @@ -10,6 +10,12 @@ from importlib.util import find_spec +for pkg in ('spacy', 'bidict', 'loky'): + if find_spec(pkg) is None: + raise RuntimeError(f'the required package "{pkg}" for text processing is not installed; did you install ' + f'tmtoolkit with "recommended" or "textproc" option? see ' + f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information') + from ..tokenseq import strip_tags, numbertoken_to_magnitude, simplify_unicode_chars from ._common import DEFAULT_LANGUAGE_MODELS, LANGUAGE_LABELS, simplified_pos diff --git a/tmtoolkit/corpus/_corpus.py b/tmtoolkit/corpus/_corpus.py index 6a954c9..1e2c6e8 100644 --- a/tmtoolkit/corpus/_corpus.py +++ b/tmtoolkit/corpus/_corpus.py @@ -208,8 +208,11 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N # model meta information try: model_info = spacy.info(language_model) - except RuntimeError: - raise ValueError(f'language model "{language_model}" cannot be loaded; are you sure it is installed?') + except (RuntimeError, SystemExit): + raise RuntimeError(f'language model "{language_model}" cannot be loaded; are you sure it is installed? 
' + f'see https://spacy.io/models or ' + f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information ' + f'on installing language models') # the default pipeline compenents for SpaCy language models – these would be loaded *and enabled* if not # explicitly excluded diff --git a/tmtoolkit/topicmod/model_stats.py b/tmtoolkit/topicmod/model_stats.py index 9167a7b..1949bc7 100644 --- a/tmtoolkit/topicmod/model_stats.py +++ b/tmtoolkit/topicmod/model_stats.py @@ -564,8 +564,6 @@ def filter_topics(search_pattern, vocab, topic_word_distrib, top_n=None, thresh= If `return_words_and_matches` is True, this function additionally returns a NumPy array with the top words for each topic and a NumPy array with the pattern matches for each topic. - .. note:: Using this function requires that you've installed tmtoolkit with the `[textproc]` option. - .. seealso:: See :func:`tmtoolkit.tokenseq.token_match` for filtering options. :param search_pattern: single match pattern string or list of match pattern strings