diff --git a/.github/stale.yml b/.github/stale.yml deleted file mode 100644 index 3425f7c..0000000 --- a/.github/stale.yml +++ /dev/null @@ -1,11 +0,0 @@ -daysUntilStale: 60 -daysUntilClose: 7 -exemptLabels: - - pinned - - security -staleLabel: wontfix -markComment: > - This issue has been automatically marked as stale because it has not had - recent activity. It will be closed if no further activity occurs. Thank you - for your contributions. -closeComment: false diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml index 3276891..6859d6f 100644 --- a/.github/workflows/runtests.yml +++ b/.github/workflows/runtests.yml @@ -20,6 +20,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.8", "3.9", "3.10"] + testsuite: ["minimal", "full"] steps: - uses: actions/checkout@v2 - name: set up python ${{ matrix.python-version }} @@ -27,14 +28,12 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: 'pip' - - name: install system dependencies - if: runner.os != 'Windows' + - name: install system dependencies (linux) + if: runner.os == 'Linux' # only managed to install system dependencies on Linux runners run: | - if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt update - sudo apt install libgmp-dev libmpfr-dev libmpc-dev - fi + sudo apt update + sudo apt install libgmp-dev libmpfr-dev libmpc-dev - name: install python dependencies run: | python -m pip install --upgrade pip @@ -42,8 +41,11 @@ jobs: - name: run tox (linux) # since system dependencies could only be installed on Linux runners, we run the "full" suite only on Linux ... if: runner.os == 'Linux' - run: tox -e py-full - - name: run tox (macos or windows) - # ... 
on all other OS we run the "recommendedextra" suite - if: runner.os != 'Linux' - run: tox -e py-recommendedextra + run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci + - name: run tox (macos or windows - minimal) + if: runner.os != 'Linux' && matrix.testsuite == 'minimal' + run: tox -e py-minimal -- --hypothesis-profile=ci + - name: run tox (macos or windows - recommendedextra) + # ... on all other OS we run the "recommendedextra" suite instead of the "full" suite + if: runner.os != 'Linux' && matrix.testsuite == 'full' + run: tox -e py-recommendedextra -- --hypothesis-profile=ci diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000..df5265b --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,23 @@ +name: Close inactive issues +on: + schedule: + - cron: "23 3 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v3 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} + diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..6b973c7 --- /dev/null +++ b/conftest.py @@ -0,0 +1,11 @@ +""" +Configuration for tests with pytest + +.. codeauthor:: Markus Konrad +""" + +from hypothesis import settings, HealthCheck + +# profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck +# and set the timeout deadline very high (60 sec.) 
+settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000) diff --git a/doc/source/version_history.rst b/doc/source/version_history.rst index 8324f17..ada6e30 100644 --- a/doc/source/version_history.rst +++ b/doc/source/version_history.rst @@ -3,6 +3,11 @@ Version history =============== +0.11.1 - 2022-02-10 +------------------- + +- show better error messages when dependencies for optional module ``corpus`` are not met +- fix a SciPy deprecation warning 0.11.0 - 2022-02-08 ------------------- diff --git a/examples/minimal_tfidf.py b/examples/minimal_tfidf.py new file mode 100644 index 0000000..f19ca01 --- /dev/null +++ b/examples/minimal_tfidf.py @@ -0,0 +1,33 @@ +""" +A minimal example to showcase a few features of tmtoolkit. + +Markus Konrad +Feb. 2022 +""" + +from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm +from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table + + +# load built-in sample dataset and use 4 worker processes +corp = Corpus.from_builtin_corpus('en-News100', max_workers=4) + +# investigate corpus as dataframe +toktbl = tokens_table(corp) +print(toktbl) + +# apply some text normalization +lemmatize(corp) +to_lowercase(corp) + +# build sparse document-token matrix (DTM) +# document labels identify rows, vocabulary tokens identify columns +mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True) + +# apply tf-idf transformation to DTM +# operation is applied on sparse matrix and uses little memory +tfidf_mat = tfidf(mat) + +# show top 5 tokens per document ranked by tf-idf +top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5) +print(top_tokens) diff --git a/setup.py b/setup.py index 90ffe23..ec46243 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,7 @@ """ tmtoolkit setuptools based setup module + +.. 
codeauthor:: Markus Konrad """ import os @@ -8,7 +10,7 @@ from setuptools import setup, find_packages __title__ = 'tmtoolkit' -__version__ = '0.11.1.dev' +__version__ = '0.11.1' __author__ = 'Markus Konrad' __license__ = 'Apache License 2.0' diff --git a/tests/_testtools.py b/tests/_testtools.py index 72fd261..67d7a68 100644 --- a/tests/_testtools.py +++ b/tests/_testtools.py @@ -32,7 +32,7 @@ def strategy_dtm(): def strategy_dtm_small(): - return strategy_2d_array(int, 0, 10, min_side=2, max_side=10) + return strategy_2d_array(int, 0, 10, min_side=2, max_side=6) def strategy_2d_prob_distribution(): diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 04d8162..dc13669 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -1687,8 +1687,10 @@ def test_kwic_table_hypothesis(corpora_en_serial_and_parallel_module, **args): else: assert s in dkwic_tok else: - if len(corp[lbl]) > 1: - assert all([args['glue'] in x for x in dkwic[matchattr]]) + # disabled since this is not always the case: if the keyword is in a very small document or at the + # start or end of a sentence, there may not be the "glue" string in the context + # if len(corp[lbl]) > 1: + # assert all([args['glue'] in x for x in dkwic[matchattr]]) if not args['inverse']: assert all([s in x for x in dkwic[matchattr]]) diff --git a/tests/test_topicmod_model_stats.py b/tests/test_topicmod_model_stats.py index f2ba85b..58207b6 100644 --- a/tests/test_topicmod_model_stats.py +++ b/tests/test_topicmod_model_stats.py @@ -376,11 +376,6 @@ def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_): def test_filter_topics(): - try: - import tmtoolkit.corpus - except ImportError: - pytest.skip("text processing dependencies not installed") - vocab = np.array(['abc', 'abcd', 'cde', 'efg', 'xyz']) distrib = np.array([ # top 3 terms: [0.6, 0.3, 0.05, 0.025, 0.025], # abc, abcd, cde diff --git a/tests/test_topicmod_visualize.py b/tests/test_topicmod_visualize.py index 827606b..8c5d104 100644 --- 
a/tests/test_topicmod_visualize.py +++ b/tests/test_topicmod_visualize.py @@ -1,5 +1,4 @@ import os -import random import pytest from hypothesis import given, strategies as st, settings @@ -10,7 +9,7 @@ from ._testtools import strategy_2d_prob_distribution from tmtoolkit.utils import empty_chararray -from tmtoolkit.topicmod import model_io, visualize, evaluate +from tmtoolkit.topicmod import model_io, visualize def test_generate_wordclouds_for_topic_words(): diff --git a/tmtoolkit/__init__.py b/tmtoolkit/__init__.py index 752ede1..eb416d8 100644 --- a/tmtoolkit/__init__.py +++ b/tmtoolkit/__init__.py @@ -8,7 +8,7 @@ import logging __title__ = 'tmtoolkit' -__version__ = '0.11.1.dev' +__version__ = '0.11.1' __author__ = 'Markus Konrad' __license__ = 'Apache License 2.0' diff --git a/tmtoolkit/bow/dtm.py b/tmtoolkit/bow/dtm.py index 6225247..b605aad 100644 --- a/tmtoolkit/bow/dtm.py +++ b/tmtoolkit/bow/dtm.py @@ -132,7 +132,7 @@ def dtm_to_gensim_corpus(dtm): else: dtm_sparse = dtm_t else: - from scipy.sparse.csc import csc_matrix + from scipy.sparse import csc_matrix dtm_sparse = csc_matrix(dtm_t) return gensim.matutils.Sparse2Corpus(dtm_sparse) diff --git a/tmtoolkit/topicmod/model_stats.py b/tmtoolkit/topicmod/model_stats.py index 9167a7b..1949bc7 100644 --- a/tmtoolkit/topicmod/model_stats.py +++ b/tmtoolkit/topicmod/model_stats.py @@ -564,8 +564,6 @@ def filter_topics(search_pattern, vocab, topic_word_distrib, top_n=None, thresh= If `return_words_and_matches` is True, this function additionally returns a NumPy array with the top words for each topic and a NumPy array with the pattern matches for each topic. - .. note:: Using this function requires that you've installed tmtoolkit with the `[textproc]` option. - .. seealso:: See :func:`tmtoolkit.tokenseq.token_match` for filtering options. :param search_pattern: single match pattern string or list of match pattern strings