From 04d818c0f9b9a9a349b785c1ee6f273aeb288551 Mon Sep 17 00:00:00 2001 From: ByteYJ <145717481+ByteYJ@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:00:48 -0400 Subject: [PATCH] Initial commit --- .dvc/.gitignore | 3 + .dvc/config | 0 .dvcignore | 3 + .github/ISSUE_TEMPLATE/bug_report.md | 35 +++ .github/ISSUE_TEMPLATE/user-story.md | 25 ++ .github/workflows/python_package.yml | 36 +++ .github/workflows/ruff.yml | 8 + .gitignore | 129 ++++++++ CHANGELOG.md | 33 ++ CONTRIBUTIONS.md | 38 +++ LICENSE.md | 21 ++ README.md | 231 ++++++++++++++ dag_workflow.png | Bin 0 -> 20788 bytes data/.gitignore | 1 + data/gutenberg/austen-emma.txt | 25 ++ data/gutenberg/austen-persuasion.txt | 24 ++ data/gutenberg/austen-sense.txt | 22 ++ data/gutenberg/bible-kjv.txt | 32 ++ data/gutenberg/blake-poems.txt | 47 +++ data/gutenberg/bryant-stories.txt | 40 +++ data/gutenberg/burgess-busterbrown.txt | 22 ++ data/gutenberg/carroll-alice.txt | 21 ++ data/gutenberg/chesterton-ball.txt | 21 ++ data/gutenberg/chesterton-brown.txt | 20 ++ data/gutenberg/chesterton-thursday.txt | 20 ++ docs/Makefile | 20 ++ docs/conf.py | 55 ++++ docs/index.rst | 24 ++ docs/make.bat | 35 +++ dvc.lock | 18 ++ dvc.yaml | 8 + notebooks/word_count_prototype.ipynb | 372 +++++++++++++++++++++++ pyproject.toml | 47 +++ src/cdstemplate/__init__.py | 0 src/cdstemplate/corpus_counter_script.py | 68 +++++ src/cdstemplate/utils.py | 12 + src/cdstemplate/word_count.py | 114 +++++++ tests/test_word_count.py | 105 +++++++ 38 files changed, 1735 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/user-story.md create mode 100644 .github/workflows/python_package.yml create mode 100644 .github/workflows/ruff.yml create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTIONS.md create mode 100644 LICENSE.md create mode 100644 README.md 
create mode 100644 dag_workflow.png create mode 100644 data/.gitignore create mode 100644 data/gutenberg/austen-emma.txt create mode 100644 data/gutenberg/austen-persuasion.txt create mode 100644 data/gutenberg/austen-sense.txt create mode 100644 data/gutenberg/bible-kjv.txt create mode 100644 data/gutenberg/blake-poems.txt create mode 100644 data/gutenberg/bryant-stories.txt create mode 100644 data/gutenberg/burgess-busterbrown.txt create mode 100644 data/gutenberg/carroll-alice.txt create mode 100644 data/gutenberg/chesterton-ball.txt create mode 100644 data/gutenberg/chesterton-brown.txt create mode 100644 data/gutenberg/chesterton-thursday.txt create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 dvc.lock create mode 100644 dvc.yaml create mode 100644 notebooks/word_count_prototype.ipynb create mode 100644 pyproject.toml create mode 100644 src/cdstemplate/__init__.py create mode 100644 src/cdstemplate/corpus_counter_script.py create mode 100644 src/cdstemplate/utils.py create mode 100644 src/cdstemplate/word_count.py create mode 100644 tests/test_word_count.py diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..aa9237f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,35 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Data Resources** +Please provide any data resources, such as model files, input data or configuration files associated with this issue. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Python version [e.g. 3.8] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/user-story.md b/.github/ISSUE_TEMPLATE/user-story.md new file mode 100644 index 0000000..75c9f44 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/user-story.md @@ -0,0 +1,25 @@ +--- +name: User Story +about: Describe context and goals for providing value to users in this project. +title: '' +labels: '' +assignees: '' + +--- + +As a [user concerned] +I want [goal] +so that [reason] + +### Timebox [optional] +What's the maximum amount of time that should be spent working on this? + +### Definition of Done +A checklist of things that need to happen in order for this story to be successfully completed +- [ ] Implement and check-in code changes +- [ ] Test code changes +- [ ] Documentation updated (if needed) +- [ ] Code review + +### Subtasks +You can add additional sub-tasks if you'd like to break down the work further. 
diff --git a/.github/workflows/python_package.yml b/.github/workflows/python_package.yml new file mode 100644 index 0000000..daa092a --- /dev/null +++ b/.github/workflows/python_package.yml @@ -0,0 +1,36 @@ +name: Python package + +on: + pull_request: + push: + branches: [ $default-branch ] + + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 + pip install .[test] + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest \ No newline at end of file diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 0000000..b268138 --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,8 @@ +name: Ruff +on: [push, pull_request] +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: chartboost/ruff-action@v1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a 
template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d00ecbc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,33 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +You should also add project tags for each release in Github, see [Managing releases in a repository](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository). + +## [2.0.0] - 2024-05-29 +### Added +- Added example auto-built Sphinx documentation in the `docs` folder +- Github workflow for running ruff linter +- A note about conda dependencies to README +- A note about using docker containers to README +- Ruff as a linter for development +### Changed +- All build and packaging switched to use only pyproject.toml +- Minimum python version changed to 3.10 +- Github workflow checks python versions 3.10, 3.11, 3.12 +- Updated DVC version to avoid `ImportError: cannot import name 'fsspec_loop'` in older versions +### Removed +- Removed setup.cfg + +## [1.0.0] - 2022-05-23 +### Added +- README and CHANGELOG +- cdstemplate packages for computing word count from input text +- corpus_counter_script.py as a user-facing script with argparse examples +- Tests of cdstemplate packages +- A github workflow to trigger tests on pull request to the main branch +- Sample text data from Project Gutenberg +- Data Version Control stage for the corpus_counter_script.py +- A sample Jupyter notebook that plots the most frequent words in the Gutenberg data diff --git a/CONTRIBUTIONS.md b/CONTRIBUTIONS.md new file mode 100644 index 0000000..94614f5 --- /dev/null +++ b/CONTRIBUTIONS.md @@ -0,0 +1,38 @@ +# Contribution Guidelines +This is a community-driven, open source project that welcomes all contributions. Whether you're a seasoned contributor or new to the project, we're grateful for all contributions. + +## Community standards + +We are an inclusive community that values open dialogue, mutual respect, and fair treatment. Every submission will be treated equally and we encourage those with diverse backgrounds and perspectives to contribute. 
+ +We are part of the University of Massachusetts Amherst, so we adhere to the [UMass Code of Student Conduct](https://www.umass.edu/dean_students/codeofconduct). + +## Getting started +Before contributing to the project, take a look at the README file, which contains information about system requirements, environment setup steps, and a project summary. + +Further documentation for this project is found in the docs folder. + +## Selecting an issue +Issues that are open for contribution are given the following labels: +- good-first-issue + - Issues with this tag are suited for those that do not have previous experience with the project. +- help-wanted + - Issues with this tag are open for contribution and are suited for those with experience in contribution. + +## Submitting contributions + +To contribute to the project, do the following: +- [Fork and clone](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the repository +- Create a [branch](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-and-deleting-branches-within-your-repository) for your issue +- Make a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) to the main branch of the upstream repository + - Title your pull request with the issue you fixed + - For example, "Fixed upload error to resolve Issue #987" + - Include a short description of the changes you made + +## Issue reporting and help +Report bugs, issues, or suggested features to *insert email*. + +Direct all questions to *insert email*, but keep in mind that we are a small team and may take a while to respond. 
+ + + diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..00a65ef --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 University of Massachusetts Amherst, Center for Data Science + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b131646 --- /dev/null +++ b/README.md @@ -0,0 +1,231 @@ +# PythonProjectTemplate + +This repository presents some opinionated guidelines for creating a data science and machine learning project in Python, using the simple example of scripts for counting words in text documents. By following these guidelines you can make it easy for your code to be tested and understood by others (or yourself months from now), so they can reproduce your experiments. + +These are just guidelines, not strict rules, so feel free to alter them to meet your needs. 
Just keep in mind the goal is that others can understand and run your code, even if you aren't around to ask questions to! + +This template draws a lot of inspiration from [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/). Please read their awesome explanations! + +# Getting Started +## Installing Dependencies and Packages +Use these steps for setting up a development environment to install and work with code in this template: +1) Set up a Python 3 virtual environment using [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html#) or [Virtualenv](https://virtualenv.pypa.io/en/latest/index.html). Read [Python Virtual Environments: A Primer](https://realpython.com/python-virtual-environments-a-primer/#the-virtualenv-project) for details on how to get started with virtual environments and why you need them. For a _really detailed_ explanation, see [An unbiased evaluation of environment management and packaging tools](https://alpopkes.com/posts/python/packaging_tools/). +2) Activate your virtual environment. + +3) Install the package. + - If you want to just use the scripts and package features, install the project by running `pip install .` from the root directory. + - If you will be changing the code and running tests, you can install it by running `pip install -e .[test,dev]`. The `-e/--editable` flag means local changes to the project code will always be available when the package is imported. You wouldn't use this in production, but it's useful for development. 
+ + +For example, if you use Conda, you would run the following to create an environment named `template` with python version 3.10, then activate it and install the package in developer mode: +``` +$ conda create -n template python=3.10 -y +Collecting package metadata (current_repodata.json): done +Solving environment: done + +## Package Plan ## + + environment location: /home/virginia/miniconda3/envs/template + + added / updated specs: + - python=3.10 + + + +The following NEW packages will be INSTALLED: + + package | build + ---------------------------|----------------- +... + +$ conda activate template +$ pip install -e .[test,dev] +Obtaining file:///home/virginia/workspace/PythonProjectTemplate + Installing build dependencies ... done + Getting requirements to build wheel ... done + Installing backend dependencies ... done + Preparing wheel metadata ... done +Collecting numpy +... +``` + +## Specifying Requirements +In order for users to install your package and all the libraries it depends on by running `pip install`, you need to provide a `pyproject.toml` file. This has two important sections: +- `project`: List project metadata and version information and all library requirements/dependencies, including for testing or development environments. This is the main file you will work with and add requirements to. Some dependencies +- `build-system`: Define the build tool that is used to package and distribute your code. For this project, we use [SetupTools](https://setuptools.pypa.io/en/latest/userguide/quickstart.html). + +If you'd like to learn more about python packaging, refer to [the Python Packaging User Guide](https://packaging.python.org/en/latest/) or [PEP 517](https://peps.python.org/pep-0517/#build-requirements). 
+ +### Requirements via conda environment files +[Anaconda](https://www.anaconda.com/download/) and its bare bones counterpart, [Miniconda](https://docs.anaconda.com/free/miniconda/index.html), are especially useful if your project depends on libraries that are difficult to install in the standard pythonic way, such as [GPU libraries](https://docs.anaconda.com/free/working-with-conda/packages/gpu-packages/). If this is the case, you should also share a [Conda environment file](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-file-manually) with your code. The conda virtual environment will need to be created and activated before any `pip install` steps. Installations with conda dependencies are usually a little more complicated, so make sure you include step-by-step instructions in documentation. + +### Containerized applications +In cases when it's important that your software work exactly the same on every operating system or you want to abstract away difficult installation steps for end users, you can consider creating a [Docker container](https://www.docker.com/resources/what-container/). This is often appropriate when deploying services in the cloud or providing an application for a tech-savvy person to use on their own. However, it's not necessary for most of our projects. + + +## Directory Structure +So what does each file in this repository do? +``` +. 
+├── src + ├── cdstemplate # The python package root - Any code you'd like to be able to import lives here +   ├── corpus_counter_script.py # A script that takes a list of documents as input and outputs a CSV of word counts +   ├── __init__.py # Indicates that this directory is a python package, you can put special import instructions here +    ├── word_count.py # A module that has functions and classes to import +   └── utils.py # A module that handles logging and other internals +├── CHANGELOG.md # Versioning information +├── dag_workflow.png # An image that is linked to in this README +├── data # Data files which may or may not be tracked in Git, but we reserve a folder for them so that users can all have the same relative paths +   ├── gutenberg # Sample text input files, the raw inputs to our experiment pipeline. +   └── gutenberg_counts.csv # The expected output file for our experiment. It's generated by `dvc repro` and is ignored by git. +├── docs # Sphinx auto-documentation uses this folder to run its scripts and store documentation +   ├── _build # Contains the Sphinx doctree and html documentation source code + ├── doctrees # A folder with doctree construction information +   └── html # A folder that contains the html code for all automatically created documentation +   ├── _static # A folder that can contain static code +   ├── _templates # A folder that can contain Sphinx templates + ├── conf.py # A function that configures Sphinx according to user specifications + ├── index.rst # A directory that users can input new functions into for auto-documentation + ├── make.bat # A function that runs auto-documentation + └── Makefile # A function that creates html documentation based on functions in the index.rst file +├── dvc.lock # Data Version Control uses this file to compare experiment versions. It's tracked in Git, but don't edit it manually. 
+├── dvc.yaml # Create the Data Version Control pipeline stages here +├── notebooks +   └── word_count_prototype.ipynb # A jupyter notebook that makes pretty plots +├── pyproject.toml # Project metadata, dependencies and build tools are declared for proper installation and packaging. +├── README.md # You're reading it now! +└── tests + └── test_word_count.py # Unit and smoke tests for the word_count module +├── .dvc # The configuration file for Data Version Control +├── .github + └── workflows/python_package.yml # Github Workflow file, configures running tests on Github every time a pull request to the main branch is made +├── .gitignore # Lists files that should not be included in version control, created from Github's template .gitignore for Python. +└── .dvcignore # Lists files that Data Version Control should skip when checking for changes in stage dependencies. +``` + + +# Communication Tools and Code +When you work with others, it's not just about the code! + +The README, CHANGELOG and docstrings are just as important. + +- _README.md_ : Summarize the project's purpose and give installation instructions. +- _CHANGELOG.md_ : Tell the user what has changed between versions and why, see [Keep A CHANGELOG](https://keepachangelog.com/en/1.0.0/) +- docstrings: Appear directly in your code and give an overview of each function or object. They can be printed using `help(object)` from the python interpreter or used to automatically generate API documentation with a tool like [Sphinx](https://www.sphinx-doc.org/en/master/index.html). There are many different docstring formats. Your team can choose any they like, just be consistent. This template uses [reStructuredText style](https://peps.python.org/pep-0287/). +- Sphinx : Create html documentation for your functions based on the docstrings you write in the code. Use [Sphinx](https://www.sphinx-doc.org/en/master/index.html) to streamline the documentation process. 
+ +Read [Real Python's Documenting Python Code: A Complete Guide](https://realpython.com/documenting-python-code/) for more ideas about effectively documenting code. The `.md` files are written using [Markdown](https://www.markdownguide.org/), a handy formatting language that is automatically rendered in Github. + +# Tests +Although it's [impossible to generally prove that your code is bug-free](https://en.wikipedia.org/wiki/Undecidable_problem), automated testing is a valuable tool. It provides: +- Proof that your code works as intended in most common examples and important edge cases +- Instant feedback on whether changes to the code broke its functionality +- Examples of how to use the code, a type of documentation + +This repository has tests configured using [pytest](https://pytest.org/) and the Github action defined in `.github/workflows/python_package.yml` will run tests every time you make a pull request to the main branch of the repository. [Unittest](https://docs.python.org/3/library/unittest.html#module-unittest) and [nose2](https://docs.nose2.io/en/latest/) are other common test frameworks for python. + +You can run tests locally using `pytest` or `python -m pytest` from the command line from the root of the repository or configure them to be [run with a debugger in your IDE](https://code.visualstudio.com/docs/python/testing). For example: +``` +$ pytest +======================== test session starts ======================== +platform linux -- Python 3.10.4, pytest-7.1.2, pluggy-1.0.0 +rootdir: /home/virginia/workspace/PythonProjectTemplate +collected 2 items + +tests/test_sample_module.py . 
+``` + +Read the following articles for tips on writing your own tests: +- [Getting Started With Testing in Python](https://realpython.com/python-testing/) +- [13 Tips for Writing Useful Unit Tests](https://betterprogramming.pub/13-tips-for-writing-useful-unit-tests-ca20706b5368) +- [Why Good Developers Write Bad Unit Tests](https://mtlynch.io/good-developers-bad-tests) + +# Reproducible Experiments +In practice, data science often relies on pipelining many operations together to prepare data, extract features, then train and evaluate models or produce analysis. Whether someone can reproduce your experiments depends on how clearly you lay out the pipeline and parameters that you use for each 'node' in the pipeline, including stating where to find the input data and how it should be formatted. + +In practice, you should write scripts that are flexible enough to change the parameters you'd like to experiment with and define the pipeline using a directed acyclic graph (DAG), where the outputs from earlier steps become the dependencies for later ones. It's good practice to draw out the DAG for your experiment first, noting inputs, outputs and parameters, before you code scripts for the pipeline, like this: + +![DAG diagram](./dag_workflow.png) + +## Reusable Scripts +Our 'experiment' here is simply counting the occurrence of words from a set of documents, in the form of text files, then writing the counts of each word to a CSV file. This operation is made available to users via the `cdstemplate.corpus_counter_script` and by using the [`argparse` command-line parsing library](https://docs.python.org/3/library/argparse.html#module-argparse), we clearly describe the expected input parameters and options, which can be displayed using the `--help` flag. 
There are [other command-line parsers](https://realpython.com/comparing-python-command-line-parsing-libraries-argparse-docopt-click/) you can use, but `argparse` comes with python, so you don't need to add an extra requirement. + + +Since we have made the package installable and defined it as the `corpus-counter` script in `pyproject.toml`, users can run it using `corpus-counter`, `python -m cdstemplate.corpus_counter_script` or `python src/cdstemplate/corpus_counter_script.py`, but all work the same way: +``` +$ corpus-counter --help +usage: corpus-counter [-h] [--case-insensitive] csv documents [documents ...] + +A script to generate counts of tokens in a corpus + +positional arguments: + csv Path to the output CSV storing token counts. Required. + documents Paths to at least one raw text document that make up the corpus. Required. + +options: + -h, --help show this help message and exit + --case-insensitive, -c + Default is to have case sensitive tokenization. Use this flag to make the token counting + case insensitive. Optional. +$ python src/cdstemplate/corpus_counter_script.py --help +usage: corpus_counter_script.py [-h] [--case-insensitive] +... +$ python -m cdstemplate.corpus_counter_script --help +usage: corpus_counter_script.py [-h] [--case-insensitive] + csv documents [documents ...] + +A script to generate counts of tokens in a corpus +... 
+``` + +Using the help message, we can understand how to run the script to count all the words in the text files in `data/gutenberg` in a case-insensitive way, saving word counts to a new csv file, `data/gutenberg_counts.csv`: +``` +$ corpus-counter data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive +INFO : 2023-12-08 12:26:10,770 : cdstemplate.corpus_counter_script : Command line arguments: Namespace(csv='data/gutenberg_counts.csv', documents=['data/gutenberg/austen-emma.txt', 'data/gutenberg/austen-persuasion.txt', 'data/gutenberg/austen-sense.txt', 'data/gutenberg/bible-kjv.txt', 'data/gutenberg/blake-poems.txt', 'data/gutenberg/bryant-stories.txt', 'data/gutenberg/burgess-busterbrown.txt', 'data/gutenberg/carroll-alice.txt', 'data/gutenberg/chesterton-ball.txt', 'data/gutenberg/chesterton-brown.txt', 'data/gutenberg/chesterton-thursday.txt'], case_insensitive=True) +DEBUG : 2023-12-08 12:26:10,771 : cdstemplate.word_count : CorpusCounter instantiated, tokenization pattern: \s, case insensitive: True +INFO : 2023-12-08 12:26:10,771 : cdstemplate.corpus_counter_script : Tokenizing document number 0: data/gutenberg/austen-emma.txt +DEBUG : 2023-12-08 12:26:10,771 : cdstemplate.word_count : Tokenizing '[Emma by Jane Austen 1816] +... +``` + +## Data Dependencies Tools +[Build automation tools](https://en.wikipedia.org/wiki/Build_automation) like [Make](https://en.wikipedia.org/wiki/Make_(software)) have been used to resolve dependencies and compile software since the 1970s. Build automation can also be used in data science and machine learning workflows for [many of the same reasons](https://en.wikipedia.org/wiki/Build_automation#Advantages), like eliminating redundant tasks, maintaining history and improved quality and consistency through automating processes. Using a build tool can also be a documentation and communication tool, since it declares the most common ways to run code and reproduce experiments. 
+ +In the Machine Learning Operations (MLOps) community these automation tools are often called [task or workflow orchestration](https://www.datarevenue.com/en-blog/airflow-vs-luigi-vs-argo-vs-mlflow-vs-kubeflow). There are many options, such as [Airflow](https://airflow.apache.org/), [Luigi](https://github.com/spotify/luigi), [MLflow](https://mlflow.org/), [Kubeflow](https://www.kubeflow.org/) and [iterative.ai's DVC and CML](https://iterative.ai/), all with various additional features for versioning experiments, scheduling and visualizations, but at the core they are all built on the same dependency graph principle as the OG [Make](https://opensource.com/article/18/8/what-how-makefile). + +Some of these tools can take a lot of work to set up, so discuss the trade-offs with your team to decide what you'd like to use. In the early stages of a project, we recommend using something easy to set up, like [DVC](https://dvc.org/) or [Make](https://opensource.com/article/18/8/what-how-makefile). + +### DVC Example +In this repository, we have set up a pipeline using [DVC](https://dvc.org/), which has the added benefit of versioning data and experiments. DVC is especially easy to set up for Python projects, because it can be installed via pip in the project requirements and integrates with git. See [DVC Get Started documentation](https://dvc.org/doc/start) for instructions on setting up DVC in your own repository. + +The stages in our word count experiment pipeline are configured in `dvc.yaml`. As described in the previous section, this takes the `data/gutenberg` files as input and produces `data/gutenberg_counts.csv` as the final product. Since `data/gutenberg_counts.csv` should be generated whenever the data or scripts change, it is managed by DVC and ignored by git. You can re-run the pipeline steps by running `dvc repro`. 
+``` +$ dvc repro +Running stage 'count-words': +> python cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive +INFO : 2022-05-23 11:18:42,813 : __main__ : Command line arguments: Namespace(csv='data/gutenberg_counts.csv', documents=['data/gutenberg/austen-emma.txt', 'data/gutenberg/austen-persuasion.txt', 'data/gutenberg/austen-sense.txt', 'data/gutenberg/bible-kjv.txt', 'data/gutenberg/blake-poems.txt', 'data/gutenberg/bryant-stories.txt', 'data/gutenberg/burgess-busterbrown.txt', 'data/gutenberg/carroll-alice.txt', 'data/gutenberg/chesterton-ball.txt', 'data/gutenberg/chesterton-brown.txt', 'data/gutenberg/chesterton-thursday.txt'], case_insensitive=True) +... +$ dvc repro +Stage 'count-words' didn't change, skipping +Data and pipelines are up to date. +``` + + +You can see the stages in the DAG by running `dvc dag`; in our case it's just a single step called `count-words`: +``` +$ dvc dag ++-------------+ +| count-words | ++-------------+ +``` + +## A Note on Notebooks +We have also included an example Jupyter notebook, `notebooks/word_count_prototype.ipynb`. + +Jupyter notebooks are useful tools for exploratory data analysis, prototyping baseline models and creating visualizations. However, they are _not_ an acceptable way to hand off code for others to reproduce. Have you ever tried to run someone else's notebook, only to find out a cell was deleted, and you have no idea what it was supposed to do? + +[Don't put data science notebooks into production](https://martinfowler.com/articles/productize-data-sci-notebooks.html); they are [hard to test, version, parametrize and keep track of state](https://www.reddit.com/r/datascience/comments/ezh50g/jupyter_notebooks_in_productionno_just_no/). 
+ +There _are_ [companies that use notebooks in production architecture](https://blog.goodaudience.com/inside-netflixs-notebook-driven-architecture-aedded32145e), but they have entire Devops organizations to help configure deployment and _still_ use workflow tools like [papermill](https://papermill.readthedocs.io/en/latest/) and Airflow to parametrize notebook dependencies. Unless you are willing to put in the effort to parametrize your notebooks in pipeline workflows, don't use them when stability and reproducibility matter. + +Best practices for working with notebooks are changing as they become more popular. However, for now most of these services are too expensive for our partners or difficult to configure. You can use a notebook for prototyping and exploratory analysis, but once the project moves forward, use [`nbconvert`](https://linuxhint.com/convert-jupyter-notebook-python/) to convert the notebook to python code, then add some tests! \ No newline at end of file diff --git a/dag_workflow.png b/dag_workflow.png new file mode 100644 index 0000000000000000000000000000000000000000..129f78d4453d05f6f6b29afc94658137f00fcbe6 GIT binary patch literal 20788 zcmeIac{J4h8$UdyZb~HyZA`lsQFo{;GZhj-l5CAJNwOu`mzhb4M9tkgMNCnaY-2Cm z2$3enZV*$%V1~&uGiHC@qx=3Ye>~^>{&>!Lp7WgVAI==}`MkI5eeKuln)tKUW~*0j zS_y$bR-2zTIR}Br2q2K9I&y!2Z%!5-V}Xw)f#=LlLh@U;On`rud7iL30f7|am3i5I(}xMeshl9&VE+_BUZnrvyWaiBCThyxUF*&h|BoL1F?lBwt1tLesF5>UY^I(f`t@rt^GiMQ zLjY&cC%@ne8b3uCocoZFGdq<~bJSn~5B1lKg9rb}o_P@UNbbi@1*0mR&q(gGP|CGPiL7TFj5A@k&B(h4E2fSV zKi00%k-S0OJr4Z{TZ&mGk`pQlN0!zW$uG50JEkuf4Wv-uDV4Ze7dRsde3i5fi4y*r63;7%4QH+q>ju%?bge%{Hb}E?e=! 
zj*<*#e%5OV-wf`g`xr+QbR4F9&D_dkH$S%bi5vI%4uuD{GslWs^PG=F@e+G4NO1Bp zhcvJG-3?fBvl?9As}o)4%>1Ha+T_v|EtYqRbrHuuFpQ z1&1jAVl5QozN%MEI$Kq1>~!{ek?B|7)7<0|CilTdX^Ec7392Yr(JqePeVPkKv^k=QE(X3%%8&gT*Y@Kr~z3*ZJbb zdvk=WDV#t!(kF%x_us3-yu7}oR^LSa#npFE)o1A&*$N6YBk0lrWo$AxYI^ux20V4) zSHtW2xowbx7wcfT<~pLpR>I9s)?2DclRFQb6#uY6jJrntvETW*iIU`MKw9?-h_ME< zK8bqe6(T5#L-h;&dM&EAuRD;EZieQX7F(u;z`c3tWpHm1dvIxOo1B2SyKH0F+pOjH z-JocMiR@Ly1$J1gR>HGAw1o!G907mi;a|=fkb~>N>hXSmV0J7;>tG+4(La^Fc0f1o zcFx!$>ZqCYH#|v7!a&NbsByZ5Ew$t{4lHT1X~%p_RHN2OEV#}jaoEK-19lLq3JTiO zpjY2z>dz0b!315-@I+_q_U5F=!lx&2E)GFIWot85EN&`eU#h7(t~H@w7)QD>B1Amj zJs<-a=s3oN0-a**`RXQ-(q&?GML3w5bViN!(f{WGgkdK1rq%csPV}F3Fgr!=&Zg?- zY1`P*b5NBdgy4LbCd6?+Sl)!{$xIs)i_s3P9MU8=hZI_kGd@x{6-B?7lXKxmCPF}P-x37gtH&$IXwphK7O59DpfAT-E}bc z^}3H{G-u>AY;zQt2pu{ulIJf~bEKo3pF#wLy7~9wzO(&N(aZSCsvRA3Jj7|4R|-r)xSiTb^h^*=(K4krI90?d!wxFUcLa5$$pGev8vn3&T`ya_aLN-G9ZO54M zT4x*e>a{Gb?gel8W8iV`Y?B1#AZqbXDI_+`WLT~8Ny>P$Ap9id%K@RVdL1~6?_uV? zQr^KW+0Wq9qd4q{E;C6^r*=%be_Q#H4Qey~+%-;Lsiw{NLDMUDIOVYOf@ zQ|7D@OHdn5cWAk@e8q;0E~l}l1>qyixHwiSNxoi#Y5bz54A?ohVT@Fl`_SfF`;Ne7 zoUCF{vRe$oPSerM^(g2UIUw*(F@NB?ZVx`z>iER#YgR=jf$=+;#$fPGqxi8}hwAB; zVkI*cfh5Cx6eMV2`x|sl9a)t`RFEpv3y)}o0wr@QdV0ijJO;gI2iVwG5$~Wi`2X|b z|Mj(>c6=TFDIZ%0zP<*&&YI?Z`!pw6f{+s&yk533wgG&uMXyG zMkh+X+xJ!!$gh+WPqz_0AnJ3AJ)uW~aVD2z2pFl8jUo7q>RB3Z0N7%yH$8^>MGg_Yki+j_rUo z7A8oA5Rg8%85p|rhkYUfXFuWJIhRMmLT7|(H^rum*V=>`=kjYEKnHv=;Y{BqDiCF+ zg>B0thGg}&v)d#I?l!HoXkgy~fd&$Z3J0&7)m)EHdh5* z1#{`fu<{s4vI-qjUM;z8Pe&6~>tIiScW_E8sjacP7))EX?WEX&7SO{S$8%a}(cN-_ zv%m-3@FBU?&@3*h_QEIACaI(KuG7i!>FIBl# zXNsJONuV-(!23_SUP^dM@Kln_)RvJ~UmNRTfE1YtSuHrFF{$H+m32E}a?=1vBE8@h z;@|8&F)U@25SoX8UH}PcvZNTMhY+m)7W5SY&GZXUYOptV0xV(-q#H zZRhiOIRE59-VgaJfU7=7`EJld5j?skWh$+wV2FkfhQ#pOD+O_I5zSR`HrXOZj5Lze zMeWe`Vnta~UqpNdQj%!?p_%6q5=$m?sU+ARjjM&PZ6XEZ zXm6s4l6@A{Bf#ji8R`}k z=>^dy8LD0r8Nz1J^aa5EoC}47xO{uGzRBq7_TPe$ET@I2!<-&La0UvJjM!MQmso3H zJM)MV$4jY{FV?$xDBk5!*K{Y)(^J%Dq9jKZ`4Yj4#5&#=Oi2P}goB+%;k`#}d^ph2 
z9F~v*e38idsKhG)V+moQNtPdh}Q6wyj?;RPqDneonOw_LFG-L2I(TLzkm zggUQ8mjbKSxYDO(n}BzFq?-G((JSp|CtI0=bG^ay+NPte_-sds_9j;ci8uhYRiSBb z8@)oyMikjl^vRPx(!@C55s|HJ2eay&)oJbxeZuG7>+nG5{>iczF&~<-t-tQi>XvP^ zn1cBYSILj(7szKCY~&bkDp&v3DNy{GR(51{LR%?v^vHeq-qKDSsEGaJOlJSxCfjp>i|m9-{4)98NEL|mBma_kI6 z^d!PdAz2DI*1A0Hie=ND>Ttcv;cf{EC4bv-n(xUzG(laL@JEbBTwGX8LNJMfbpUUm1`HI9*ZXuHd1P@Ao{D}!-h5l za^P!L9meIrwfw1D$MIf)=sTheFFZPA;McsGN**cWSEX-wT^cRJrv=k_CkCeJEDok@ zuDGW}LzCtzi&ay_6F{*B+zV_3ff(31(0-FX$s6qti?uq-yU*dj8sr{k7DOD{+M;by z;a}t0@hj>@MU2Q)b=|W&BFCXB?U?Z`EDZ(2qfqDF&veygHezvBw453tx^**W*udeU zk-_yL-kbwuL|N^jt2AarMiUj@^82rZ)ZDJoxz-tJH9K>T>L20%;&q(J<88?mtTqbX zU^p8oNjK4m45!JF-*`Bl@9&|z(pN_Yiw0U>ioVx=oixri)R$$`Z zSUOJcXKEaxJW5bXXq+k%e&`4<*k>K|K+RAG`{{`(`OzMT-3)UEYA<@yFtYGHl@jc8 zOukCME$zEE!W1+|luKvEc+f{0?p9kJzE~6rNj=lac4M6p$*` z_|^XM0&ygsDRcE}F{ z554Gi_q!{>b7igVEWHCxluX*4fPcxT!@ve>o7U}e3Y(bbSW@rf#jh;WmvR@&Hk>dfwnz375 zck+b*GP*r;9`zRr>l7bZHsI|(_r34JCI0S&jQjEs_STni(-HB+6fJAteayJ3X9gCU zq4O6Av`p86XrvyAxQ9var0>$o>WVXM3QKtQ)(VYiqQ9Z-LpLp80qeCy+SkE;45*@Q zmBebr@dHtf8RUSN2_6Vo9Yjwd+6m}UG+r|06+SX85yR=m$x|KT;HRu37b1=DcfAA$ zvxtP=C%u+2AF^Jz7+mpD!d|tlOqH#ai$SPYs_1@F?3jah7{H@WFb?;3F7TqxS>13s z?poHawYg>wGcJP#ee0*Dm5g*;&oR!8%QCf@yO*Fgap6`O&Ar_;c{e*)ZI?*Ti#JK8 z^;S7-V=Co~=zc#I2F=C_sLSgxAGDejGxn-~^n{*9c6@N_3`e+}x~gA}TltEAEI=QB z@$L3^42*UG^^CGZIQhrbZIQiA@IdEj>RS<`h;?LF9W3#s-yfyUmF84!4QA(V=Od=k z)z0VlZrF7ai;?&LEZ}?Z3t!vfvJldTd&+StWey*A!6GY*u!zy&rspdJ2XB;ZoUZgG znb}Oes;Lw2)?cc2FZgAmwU_9ApPku*3#9_0YIs{g`TP5rwBe8sa$mxKwblnR&1SvM zA>DZ5T{K*9p9HVIzhkxpvE_69jcQsU@Yu;Uv@YgFDsH!yBd9ZV&I1=iqovQWA5 z4C4xtdlPZyO@XX6ZEGq2Hm`tq@|B- zNsiO1EGQ9=_bz;WUr;ji?9}l~ zbbxQXkD3{HlbP{mT|x3Q_dy1(rII^r=_yU^fxWFC7~MDf3qj-Ho!(x7)9?pYdhI%K ztsQn?s>yf&6Vxu&&Gmt9z%gogXy{S)3)>{l0D^>k92EV^uWDXUz8cw6m&OP^-BDpZ zm%z)+e5rfDIFa<_gNf=PKj@mioJugJ%e_OM4CxqSDzZ{;?=xEx27KUABw*EiUuRs2EoP+eh> z{Oib6XGhIGEtSmhk}0!+Aq56iwg+CE{j;e%pH|RZ6#N2t`f3HWvnT1*yunVdilC@G zo5$KC@966g?5ErQKFKu5R=r1JWoBg}w!9=^8PUAIXhHo?plt(B9qvubTlU5|Fa3>(-oMY40M1o~Z?3=eM5}a=XL>6G(yww5 
z@1%zqz?4jhh!4$$dsJz8&aSX10!_&Pptdr{(#z!;iFNoIgDjuw( z?aJ#tn)w;7aUTzL9}#)b*Cp=;nI1Vor;61AHh|g8+uyS*v-sWdu{`G)-&cvKb zikuF@?W(~?^JgjFOIIT~A6`!_%OsZslQWzE^$dnnMPXa^cCH`zv5zk&*v2`+-qb4> z=*PVly5bdr8{AxU@LQRUZzNVtP^;DSxoo7ih}FA0uQTGk^<2Em!K9w>O|Ar*YmITQ zyAwXVB*oI#`FBWq0Y{UmP~>DX8rlV((i9^gT=u+MzlX3aU^r$cYxCA=Q(?1=LTg-2 zM-ee;OljA&pYJ}euo_LW^{Hp8X_i!EaQ_B%x?&;HmyfaP%=Hl!s?7)(O_wVc$<}Hq zb2*u+I{KQF$0_qemV3WiP0sKez?D^#z-NK>{U>g~#Es)2?+GJFD<0(tPC+HH8kYzX2?NR;|A$7=b6OMft_@du*bdm_HJ=93ndqF$OaOQK?KXj)g7pi}ql~h%fp)U;xfS89EiQkS+ z>}O^sXnPH21P_)?tBO>SnBy<5ZER)w7gD&CRZxsDW&yqBnm1ri>eX!k$twHFf-bS3p40 zB^QKg4QVpc1sNjDV(rIIJ$^P}z$|cwQb(WnhmZt-LL!3<$`_oPSKp;D=h%$VZ{$$N zkc$?482)O9gS;&Y`pQ$9yNVd5{S|bw?Ly}|*?H_y2F^w;v2(#sc6LLmHKaoQRi|f< z0)IkOrbO@{4_)bN{#^6}1b9JryGp)J{Cq1;=^uO1Nb#|LG+q_@>cK=SAf>wVoJ=(8boSWbwC@B+NT_j-!C6}@v zkHPV|jvUc5bHZkZzKAp-7^_dlBmidfXrf&1GrSZw@xcUfW6X&>=oals~gpD*^tn-OgWI=8=RQ4H~&9X_gcO@h?YDnu$L6XiAo9**9oXXj40!AGG8bA zr-o|yRLB#@q45*p?Vj2L{@R;|b}_LclCZZYxNH4i17j~J`z`eKK1n)5fyDv}WR%=B zUs(u~CjI*!YG3>lL;?+*GF3mp?WzV2fltn6MJ8Ev&b-bSAB+5OoEl11$YILhor2C- z?c%#-5BN|EB#rFgYTvMH-7Ml&L~q^=E!*uGQ@4rmb-dQB3EW{xy1zj*f@8K?lM6D~ z4&1jzPPU*TD2?c4{Hr+{_@u~;fRSZ;yZbYhp7EFHbMUs2PY5QTf&%K=Lh2Of;#HP- z36Czx>NU|pUwd*CmEiHkpj4jB86d}&t7%VmcK6yYV@HrEtmy>H~8C&oV0VF)x`nwA$?#S$6qE6GN~QrAe_$8VxrMc+9w{4WVypP3M{F*4`g#)m<|PzQ%xZOpK>Ix_RKdKWz%ZL6XqO<23r&y%fl?p4fA3KmBU}@JD|yx2`8~zHb+li zGGN>$BDa!k-efma@8DI>SaBb{`4M1aR>2~Y>19qjelT3w@Uf%LMZ(vq6 zQC1_kr+!z;yX!Ig$R|leO8z>KBt{Ypos5#64B9IQvgJf=1=M{0R%U^lQ=p*gaFt<~ zwvRWd^@pR7`5q{^@@pt(cUd}2POyJ`(-*t(`w%f(9#$HZ^cuWsOd99?W%Ut8x{tEpj`t!& zvJwhY64@<53=*=KuJLQ3*d~PKN>M0g!JYFIvKsgqRHWWi;M2i9cnPDoYIvbMOHM>g zqGF`SeC4x;;!~5ZS&2knMZuTz_5{$-cINeS@=Ir`{Ucb%GGk3?OJ?4u?Bz)d)Qi>g z%%12!@86K-^eQL5aw+QKFZbt2%OH=!PFi)=*li4r4{UhfCKIGOU=0NYo{>=K3z4v} ziC&PAxATb^Zr;nY+*i0wn$j4$42K<&Oxvafk`^;jjSC`{0^*<`(~@H^d(?yb1th>n zOGq@+T_-UYSKV*TytrZcRaL>)40L|c=20K)tZKp0KgyoU@v@7|no7^^E_)3Fxpv{H z-m|IjSna0~uUc2@GPS)MNoS%kqC#{+&&TI-=*m2@ks$oDjc^(=b&1wIeb4=($k9<7 
zd^3!e=1MlAu*(rA!`0K$dkjce(%!^4K=d+Kwu9o_#K`bREk%CFKpK&@e)jRz_8ZdV z2y)3^3FtkQKDJc*QNrLxz&d-l4PA z+;4UoS14N|)A(4Jp+ehUW)IEg8G3p)l+gL{*6Ap^TC5W@7JkEnFoOw*1qG5?E^%)SGNF z*elO!IZqD)Gp1AaY;)!NyAho)=wFeA39b=iFL;tK4Ij1=!&B^K3yb5uU<)#NJa~4h6j1UHEW9T71!uLw{WUg?c5U*(m$W32!UCr)wMpk7&D} z^T#IY^M&e*pbsew1sE?7mHb#ZsJU1)zEbhB>tKlWSgeNw23Jq0xXwx0pPx#)WU)mx zvwjBU2;#KI1f$uj?C0AuG}I5ULGG&s`ElXsi_OmvJR@oN_7!ve`*+~@Y-ifUjTpnX zM{-Y4Wykx1{lmFT;Kb7lV&7)Vw1jz|3VPMYc}@vy7;6Ew%IukR6WJ8MmcQ(p2-)c! z?`@75b?>OgH=3v1Yh?H%7I=2t!`hQwhy7O$_7q;7yj(g_GtRF{`?in@pRBGg2`*q% z=f?5<*QeAs+U@Kx91-z%oUb>=h7M*2Bw19C^gX8(-VUu!tM8E}bVm63+76wxR@bLu zF#_A9MNVyu=T4HGHeD#PdLmXIP!YX&!WnKo+CbYuY7Zto+Ha&%5RsV0BKHJDWwB^! zxg$rb$d^*)7Y~*Kt#JEGK<20GFy3O6VQRmU8?D|>d_iN$6k=jUAW)hgGE=!P4n~<))m+_-;DFI4GjE6_3YEN)a={?t?#NdyhWHNP1-EwiWjU+N5OV z6MJn`I?RtJLUm=svp%q9NqMoRPAXYDUcSj;A1YXQ$v^6)ZvIl|4T?>nS$?+CRbaC= z>8jegcQlzdX7mI@QE>FN8nL^hru(Lz`94IFS4Mc#(QrSjC=5)KP*Zc!3TqH)J1d}* zJ4Qp9MNHmNHrts{1LChcSBl%iCw`}B?3A#CCoG4$eNA!1ULpZj&KjxT{h;1`Eh7um z-j7qH?8Pj@!p^QS*I#HnQAv*0!J0S@n4i>l>xCC%{_NBtMZGj!KiWa3>%Kr~Z#z;= zY8`7yc9v2(gH2HIcXl9Kk9CB4yU6;HherK9wJk7`4qD+nt45rBKwx!pK(CO=-Egbv zTIGl`Htvj>5M9*g9sosQ)tch0w!}4U?Xd-kmz$}kO^L>lLRbBb$`jJ;x5-?y1<^FRq@SXQV*d&0M<-*z-Z1H4Sncd}rsZvh)9&pC1gNAg%66`F#yfQmCTPBQBxmBeTJ2cLwa zxKa57Av#yy3Z6y@?H_!~QQuL)OyJyzk~@S8R*P%e-ebO)`WpCJ@+scx;;)VOOn;P{ z<7Wn}4_3U~6SUD2sNZ;xyVjIhGiWK=HQp+Z0kV^b-V+@jANWN~uC{0m;u*UAC?bQ_ zO@x-~nfhovudq^s*8L?dA~-h%gsr-S0MaeR$DA2Zid#duc1%#k%bBv2ozF+TA}=IY zc#;aNM=SrmixdhM0xgz zG4&3SP%;&lso7)tuQQ1FwvXA2#~O5&uY>+V85^=iLtb*d2TIUCm^qWKX+Z6b{6l-S z5mu5BS~Kj#-=?;pipF%BpTG=vvfsR>5KFRytMTl%(Ib;L{&bM|J@R0@vciu#$7Rq+ z^HM=eXl#yqdQFLp!TCvn0u+$>l?av3I80!6-G{#Yxv+K?ZP9!vja=ia-4v(iRRLQR zx3op5i(hO(%}DOYZ{-B!58UHKVa@xiCsk#ILP#={=`Iw8(++8pdh;t_RM#7MCZBdS0D7hU5Fd)<%O&4dMCwxmqy`JVQTXE6jfLTMV% zBRRQofYi=Ge>x!XAbLzL^bN<6XR6ax0c@d)d?U~MDUI~=KgiBzJ+tU+mSlWDLai4$ zfKgcFhb)_%c@6VTY5IXsWu*U2Y3P=tj_fb+ouGhQrT?C;u?Bd(w|YDW8>5eTLuPLW 
zyiC-LIByLI%|BXraj~>LrlBM+0(?So-m;ZMV-i{n_J_S_w2!?yMkGq*f^v!pZteHM<5(@HF?$QZKkz zik6vkI6hv1H1Fpi5b7@Yv8OiW;~UyNkY)H0dH7utr)+_O;;a9|K?7AJUM`uf6Ju&Dy_wvC*?=ytPKo?2hq)HwxA~dA1niVs0 zHCcTfPb5LcYxENIb)3%!X_jEz#eR(&IO!gpB0>I3(xorMtWCciQE47_X8`40hA>l! z15#5Y0}Oihr>`UjgXKv585&CZF4b@m0j9wlT3!D2n*Q8NP>h@re2l(b41kp!;NSp+ zMu-%WR|hi>I;X6AaS=HNYOIx0B52U6p8!EsmL3R-Az3pEMpCrUMZ*z}iz)^KKnGP) z7kP52=f&|DaGZZ(YW@;PGnx5OEt_)$j!PPuQk0GwpPET8I8KcNuBj9f)`gIi9sd1z zvX>MbGCVGta{!Z+Nw*@cbiCWUW9N+AyZ1O{x7^gDLGLSwD~|meM6)eGGO1i3zWvS| zUc2GbV>9*2sRz9MVJ`rp1r%$6SzBGfDiU=Ea+frYN#SBrkPrp=(f>CZhI+!A8(nxV z#|EI|r0_5UP@X@i=goEkXej{Wt9@{;QX%oaGg<}UasaZ0Sl=}bz!?_^01O7eU-h+v z|FR(;C;oZ#-+-@Ywv`G@{Zc+e1kg18a4d6%x8H3eim*=G zP-NG_i~eZY!d0W~`OzME_R0tf3XW)J5r}ac*P(eTqnE2bUx|YLVNLZ^28bME0L{r+ zw@4Kk4QBC5FnU`@6rp;p6u=~K{tY&?r&az?0Z&e}FFyGP-Bo?}vA6X7yJG?Yk}W7k zZi&1hUIPT^-!DX78u{rAmQFr5Ct57<%YKm}to|!v@GKZ+X(Y`xHS5*tEgExxZ1NPZ zyZ|odUmVyLH=ZJC1z{_8Iq?`MFTB%V)eZIn(1`!QxfZ06b3VwocmlvxGr$zE69n_5 zs74>p#e}DtV+v7WU>ook8sa+u{Mit@JSRcGO%Is@SnB5h{sfzmYs*1OdL~t@r4Fgb z@A!baMB~8_a=Dmb`0M*301Ub<04D{^9>vOXtfg=vzr#}gx=Uvv0f^oE(9+x%fTC;e zJK#ykbUz!hYY~#@c58S@6t=2o5vM5Ah69*4q|uU{{FTw*o~3B6=UL+^NCnYx!a0gy z@j4CKpuag;Nx76SMHL#&%T%|`!5F)LPgplxBLe8PE8y$MG&))j_ymB@SADBiMLO#y z!~}=)U&rNKDQ{xI{=~|@s(gP0?!00moDf_|4^*W{WP<=90E&wmb2ViDFPXRJ+TZ{s z0kCEpHUQ3O)vM1iX;WQ@&R>SUi5jq=;+hJ-1&tlsk2keSN^b_m#eY!B6N0%NQ7Z(# z_i7b_9^S&d>|Mc!)R)Wr0^6&fjGujXpTgylzi45^X_T0n6~fx>)9td<)teOr zkX6W)LflZ40FV_=_#QCmGIU^EhA9P-pK4z>`lYwZ_Pgvp#rcs7XHG!fjsT%9S+@|q zb^P!$DVh<0I91jyp!(0X;jFKqlKCsO77^o?Z)+vejscR)nUC5JjGUH>r0_zlY?;|2 zeIO|&-=(fxifF`BYc}h^^P7tqzR4B!%#& zRR5Z{p|h5{2hLjx-o1iGLt_C{Yznx)ySIC@N$7Uf#iV`lB9|cCRt<0DOHIh+OhkJO zG#2Qf#aDOhGZT_Xi#TL8H@xY4++QFtISF5Ne~~d|&a{Ef1||%%Avy>P9J9FBznS}H z{0%6asmG-k?~g%K*8p&A-M1!%$|ezyxoHtbDO-yc5_fzfP_%#;fBi@jE(o!aQbemc zcLqD6L{Q%DcC*9u7?{9k@qBvV*>#I=OZ|m-1$QCN9G5R6{J6;jccc!WW(HTzKp8Fk{A&yxnR9 z|I9S0QTsrG&V8h_iq|GmiDrim#>?rAm(atMD(jY)wig>C+K-%T>{D0+s;Tn6byB3X 
z)I6O}rtFF0CNa1d|9f1=|J?$#x6)wO?e%#w)FG*Rwx0jVd`Rpds_Bn{a7@61y`@4T zCR-04F~Hq(2EwtR%Dy@bzEaQDlcZ7*hQHx*OWcA%w}ENbtTDE0Oopfh{@EYzEDqC> zw0NfMe{=lx)s1t~l!Rb*V}X6LLaj4k#$3&%Ux5A2RbQ=QC0Hz+u6eh_o9 z9r`ET)gI8$Z}=_=u$^f#%|Kw&HBLc@#nR@HlK}79-}62behVwyjQU#qwJ+gn1~kQ3 z+QD?8{DDaUU+Qf|?SU}0zy+r$ErEM4q`wIMW~pYxkGuv)d_r>rd`=tk51ka3Yx6HD z^Ov}tkK)Meu$g(HO+sRzc} zkZAR7NU=Q7BoA#^bBH}9#gs?J?0`H5D&b&6`w!=?UqdsV(Cnn^W_egc10(%~gnnd* z-LeIMFRiSgi*r~9H7{pSC%M<8BnKZ5GkCGLeH-X-|X>M6sZ@c-WcihNE#*F?DRtmjwFY>ZI zEbWbOC`Z1zura{pR8ZXlQ0MLYWFRU)*=~J2EqIV5&eDDG3#ummR84E=Ay*}&4o*1a za>viifL@FXC|GL#S_Z5RE$#ejU$O%NS{@W2@+MO^bkuMH!N}6%h68Z#=jolK1#FdS z{iQA3yP-;|mLPZI7CiG)+L@qgx1HMb>FK!THuZ;@_RAqm=>+>vN9#^QakZPONtv~q zbUPItM-KV73nb@RyKoN_)5~C5u?=YU#7XlfU|Rag6%eTx2r-I33$3m-vzreSy3DU4 zc;}_>Iv%uagBtnMr<>zS-z$4Bl8RNAo%;(HNl!1XzxrdXs!UW7-%&@w!%NL&XQwGwxohef* zXPSpuDe&elaf^_?{i5muT)9*LISAlUki;{tIfQ%tXvlNW>jOFUCj0Ma0GkgsS*QCn)fcHf%yUHk3>m{$`f$F!2 zx%)f9+mJ+s-9bYtfF8;MZB@QNY<2;`+y?@gWH9Rr@GfXTs^1NK*^|I=1F0Ay5U(dd z%>VcL|0wf6Cis6)Jm#l*N&IeR4OtSx8O6@^_eot4`ws$=gu-hZES_RX14^k77PSzu z3!Rd@mDc410evpC|7jvQ0?qB;d?geuJY1X!jkqjso#^ma9!-)gl@zLtm3`@)*)zpabDy*npg6V_cq20 zu>W|ljw1lE#Nu)Oj3q*=-(o?JJfVW6|K&c5m)>Lj{gTjKmX-9ouW11<;^bVBCcGQH zE(u*|l8@dmGsP6Ry;bERaw8*w`)r~j(aUx3#h%Oro|^D8@&I#7{=8t;*BNcmnLDx##Fl}q+)1Doq( z6A7}Vdyr^47It7mudx$aC|7*QDx&&o(uNRFzqPra{H5r@fW@;z_cx;?**nZ^h6Rbxk%B4L`6+=y3+uTnD5ppD5A`td?hB0JpF zt;OJe!rG)*-MsQ@4Cy(I>u+=rIy=ivaGuk#f?cz7Lv}IIeI;PY2^Q^g`wg!~h1z^@ z#4IKVK3P}+_M#y+6EW$8ww(&;bnlr&y|=vi^{J*itU#D{Qfv9Q2nd2Lczb+cwDN#F z6JUev@zs|H%fJL1lz?&g;^@s8xD}vvzw#=^>PLW6Eh6Lc#>%hzmyhs|=3q!Vi0Y84 z=88J8E54ds;U|r8Ku#R;6rmtsN4AV$CZ*|`sPTyoPm}n_f@F(LsT9uw6iGUTkJEwB zCmAf2U8^Z1WHXDHPb5ti;d2K-Pb$P+N-=Of(CxXO(4&V8pN^k7K;jzyjK1MC0W!Qn zh(@is$#oq3(|OS>Zkl1Nk zyH&0Yn1uPP#O>l{(4Q^;W?yo^9UopAe5HJnYe_y76Uni}SDO!1_!VFM0E}x0@L`X8 zB%|Q0tJiGzFd}TX;!irOt9M4eHJS`v@jd6Nb7PdNw4kSlp{>8nV7Pa)%6%&qQp7)U z;rY)3`YKKn>^E^75F9-#Mj4o^_BjyF^JSm|Zsn})wD8yROmP%|vk-;2M>0B#e 
z5cKq=sEp74>bc;fexf3q%lQYzIbY&dzGszIG2SC_-H#)Kk?Qr+l|p};)jrP9oG4P& zs0oa;wq!owPFQE;I*RpBk!^}nwN z%x#0=e@-t>rg5V?mwLujNBCh8%SvTANVN0kgo!nnp9P!huU}2tZLy4;wF@&wRhm15 zEYp0WWha#lBf-xZ@9H-b+uM44iWn9op0;~_~Tzo{;l)sw=+Ra=N}(PA%SQEQrUldM{t0SktVNx`%Pk^oS^If zWfzGAqg8Wa`w&pcSLQ^V`U{frwE~9cK86D@9x-6wrc@+O+QTTAC52Bh{V|NhzA3tx zCqz7|&L`~6LjvAj2M(l+!Wv=N!wJRvu`Gq0gPU~QUS8|tACszdRBtugs94%*D;%Uf z@8B!{tuhrk8`C^cSPQ`A7E)5ipkq%{C9D6Q@Tp&%2q~F#CHryEq4> z=^03Ctwx}_%X8GW9KoCW#+zzLrW76zOQfxY?39n_^FbAxyA%=Rzqw%-MKapVXQ0 z>_Ce~ES*6Ip7C^7kk#mbWMtU8Dr2IvBtdKKn@CY7m;5w0(+W7nzYW0^Do&8UXlicR zSO0u{fMjN1adB-{oOu1R$A)W9#n0Hu`N>D}7TDoBR-G9KzQ%&}S<=oKmBhJ@u95re!SYEdo8Se__dQxb)tmMPtRF#4 z9JM~6bjdM_tCI_x`n^8E#_k}pCz0gE7gl?y?wC}KSLRT7Ne-4ojE{8LgXj(I??s$S590{h0b)EXBk)d3V!+EUt{*j)L?9;)q#N`UhXE2 z-F1Kdw~{C}g()a!{S$V4DFZF7ZWni-9FkUnrN4*-pK_gE$lViOo3N>`%I4o9IzO2H_4S&ZSY1N)P+`jF=bs#;kk*&Z%>1p9i->CQ1L zp51+{-JJM856ZM?!K}P)JBb@*tUK%elJDL<8(@j|d|^uN@(5R5K6zP<=4D4N^MysO z5Lj`k+RkJDIoj8NS$^k3g=uuI<#DS^-!do`C?aE4v-s+Z3Qd3f@sxR9ie9NNd8`;u zgwBR*Z7yfgYW)AvHv=mLd%u>lZ>qQCOMrQRDc76}^YK2FLfb#2ayHQF#Erh$C(i2+ z;_pz-uFEn;{=E;cq>L-v*O&RwsHxVT-9pq;c8U3a%ouLRkaOZ04Ymsn>YUWq*wrT1 zKI6`-Q?64C?pS`*uKV8GY9LXjj8cp*xmeBpZVhezS8M3~=)!BZ8xShzjqP3yHH7vz z7J(lPV%fuu=#mea)-Nm2?tSw-n!}` z=6faWQ52^2=(9Fle`ty-u{`UyvI{jRC18&SY$npoCwkt=mXW$N5(sr^* z-uaIwH$_y?{716>7c~Dbo$$ZR$^XZ4N{ZX1pbt-4^;m?d?H`Vp1nW6V#-+8%#h>R> z6OwWJ@BR6G-N0T5DAdb9Ahn-?KLddr1SNj(kq8hh;A01fA^wjZ?JMq+C|vhboLEkh RTL1$g=BCyr`6u0O{a?@Fj(q?C literal 0 HcmV?d00001 diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..ff8bf23 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/gutenberg_counts.csv diff --git a/data/gutenberg/austen-emma.txt b/data/gutenberg/austen-emma.txt new file mode 100644 index 0000000..44c118e --- /dev/null +++ b/data/gutenberg/austen-emma.txt @@ -0,0 +1,25 @@ +[Emma by Jane Austen 1816] + +VOLUME I + +CHAPTER I + + +Emma Woodhouse, handsome, clever, and rich, with a comfortable home +and happy disposition, seemed to unite 
some of the best blessings +of existence; and had lived nearly twenty-one years in the world +with very little to distress or vex her. + +She was the youngest of the two daughters of a most affectionate, +indulgent father; and had, in consequence of her sister's marriage, +been mistress of his house from a very early period. Her mother +had died too long ago for her to have more than an indistinct +remembrance of her caresses; and her place had been supplied +by an excellent woman as governess, who had fallen little short +of a mother in affection. + +Sixteen years had Miss Taylor been in Mr. Woodhouse's family, +less as a governess than a friend, very fond of both daughters, +but particularly of Emma. Between _them_ it was more the intimacy +of sisters. Even before Miss Taylor had ceased to hold the nominal +office of governess, the mildness o \ No newline at end of file diff --git a/data/gutenberg/austen-persuasion.txt b/data/gutenberg/austen-persuasion.txt new file mode 100644 index 0000000..356fa8b --- /dev/null +++ b/data/gutenberg/austen-persuasion.txt @@ -0,0 +1,24 @@ +[Persuasion by Jane Austen 1818] + + +Chapter 1 + + +Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, +for his own amusement, never took up any book but the Baronetage; +there he found occupation for an idle hour, and consolation in a +distressed one; there his faculties were roused into admiration and +respect, by contemplating the limited remnant of the earliest patents; +there any unwelcome sensations, arising from domestic affairs +changed naturally into pity and contempt as he turned over +the almost endless creations of the last century; and there, +if every other leaf were powerless, he could read his own history +with an interest which never failed. This was the page at which +the favourite volume always opened: + + "ELLIOT OF KELLYNCH HALL. + +"Walter Elliot, born March 1, 1760, married, July 15, 1784, Elizabeth, +daughter of James Stevenson, Esq. 
of South Park, in the county of +Gloucester, by which lady (who died 1800) he has issue Elizabeth, +born June 1, 1785; Ann \ No newline at end of file diff --git a/data/gutenberg/austen-sense.txt b/data/gutenberg/austen-sense.txt new file mode 100644 index 0000000..b958d68 --- /dev/null +++ b/data/gutenberg/austen-sense.txt @@ -0,0 +1,22 @@ +[Sense and Sensibility by Jane Austen 1811] + +CHAPTER 1 + + +The family of Dashwood had long been settled in Sussex. +Their estate was large, and their residence was at Norland Park, +in the centre of their property, where, for many generations, +they had lived in so respectable a manner as to engage +the general good opinion of their surrounding acquaintance. +The late owner of this estate was a single man, who lived +to a very advanced age, and who for many years of his life, +had a constant companion and housekeeper in his sister. +But her death, which happened ten years before his own, +produced a great alteration in his home; for to supply +her loss, he invited and received into his house the family +of his nephew Mr. Henry Dashwood, the legal inheritor +of the Norland estate, and the person to whom he intended +to bequeath it. In the society of his nephew and niece, +and their children, the old Gentleman's days were +comfortably spent. His attachment to them all increased. +The constant attention \ No newline at end of file diff --git a/data/gutenberg/bible-kjv.txt b/data/gutenberg/bible-kjv.txt new file mode 100644 index 0000000..7a8ae82 --- /dev/null +++ b/data/gutenberg/bible-kjv.txt @@ -0,0 +1,32 @@ +[The King James Bible] + +The Old Testament of the King James Bible + +The First Book of Moses: Called Genesis + + +1:1 In the beginning God created the heaven and the earth. + +1:2 And the earth was without form, and void; and darkness was upon +the face of the deep. And the Spirit of God moved upon the face of the +waters. + +1:3 And God said, Let there be light: and there was light. 
+ +1:4 And God saw the light, that it was good: and God divided the light +from the darkness. + +1:5 And God called the light Day, and the darkness he called Night. +And the evening and the morning were the first day. + +1:6 And God said, Let there be a firmament in the midst of the waters, +and let it divide the waters from the waters. + +1:7 And God made the firmament, and divided the waters which were +under the firmament from the waters which were above the firmament: +and it was so. + +1:8 And God called the firmament Heaven. And the evening and the +morning were the second day. + +1:9 And God said, Let the waters under the heav \ No newline at end of file diff --git a/data/gutenberg/blake-poems.txt b/data/gutenberg/blake-poems.txt new file mode 100644 index 0000000..7ebd928 --- /dev/null +++ b/data/gutenberg/blake-poems.txt @@ -0,0 +1,47 @@ +[Poems by William Blake 1789] + + +SONGS OF INNOCENCE AND OF EXPERIENCE +and THE BOOK of THEL + + + SONGS OF INNOCENCE + + + INTRODUCTION + + Piping down the valleys wild, + Piping songs of pleasant glee, + On a cloud I saw a child, + And he laughing said to me: + + "Pipe a song about a Lamb!" + So I piped with merry cheer. + "Piper, pipe that song again;" + So I piped: he wept to hear. + + "Drop thy pipe, thy happy pipe; + Sing thy songs of happy cheer:!" + So I sang the same again, + While he wept with joy to hear. + + "Piper, sit thee down and write + In a book, that all may read." + So he vanish'd from my sight; + And I pluck'd a hollow reed, + + And I made a rural pen, + And I stain'd the water clear, + And I wrote my happy songs + Every child may joy to hear. + + + THE SHEPHERD + + How sweet is the Shepherd's sweet lot! + From the morn to the evening he stays; + He shall follow his sheep all the day, + And his tongue shall be filled with praise. 
+ + For he hears the lambs' innocent call, + And \ No newline at end of file diff --git a/data/gutenberg/bryant-stories.txt b/data/gutenberg/bryant-stories.txt new file mode 100644 index 0000000..e07d807 --- /dev/null +++ b/data/gutenberg/bryant-stories.txt @@ -0,0 +1,40 @@ +[Stories to Tell to Children by Sara Cone Bryant 1918] + + +TWO LITTLE RIDDLES IN RHYME + + + There's a garden that I ken, + Full of little gentlemen; + Little caps of blue they wear, + And green ribbons, very fair. + (Flax.) + + From house to house he goes, + A messenger small and slight, + And whether it rains or snows, + He sleeps outside in the night. + (The path.) + + + + +THE LITTLE YELLOW TULIP + + +Once there was a little yellow Tulip, and she lived down in a little +dark house under the ground. One day she was sitting there, all by +herself, and it was very still. Suddenly, she heard a little _tap, tap, +tap_, at the door. + +"Who is that?" she said. + +"It's the Rain, and I want to come in," said a soft, sad, little voice. + +"No, you can't come in," the little Tulip said. + +By and by she heard another little _tap, tap, tap_ on the window-pane. + +"Who is there?" she said. + +The same soft little voice answered, "It's the \ No newline at end of file diff --git a/data/gutenberg/burgess-busterbrown.txt b/data/gutenberg/burgess-busterbrown.txt new file mode 100644 index 0000000..ff5e628 --- /dev/null +++ b/data/gutenberg/burgess-busterbrown.txt @@ -0,0 +1,22 @@ +[The Adventures of Buster Bear by Thornton W. Burgess 1920] + +I + +BUSTER BEAR GOES FISHING + + +Buster Bear yawned as he lay on his comfortable bed of leaves and +watched the first early morning sunbeams creeping through the Green +Forest to chase out the Black Shadows. Once more he yawned, and slowly +got to his feet and shook himself. Then he walked over to a big +pine-tree, stood up on his hind legs, reached as high up on the trunk of +the tree as he could, and scratched the bark with his great claws. 
After +that he yawned until it seemed as if his jaws would crack, and then sat +down to think what he wanted for breakfast. + +While he sat there, trying to make up his mind what would taste best, he +was listening to the sounds that told of the waking of all the little +people who live in the Green Forest. He heard Sammy Jay way off in the +distance screaming, "Thief! Thief!" and grinned. "I wonder," thought +Buster, "if some one has stolen Sammy's breakfast, or if he has stolen +th \ No newline at end of file diff --git a/data/gutenberg/carroll-alice.txt b/data/gutenberg/carroll-alice.txt new file mode 100644 index 0000000..da72958 --- /dev/null +++ b/data/gutenberg/carroll-alice.txt @@ -0,0 +1,21 @@ +[Alice's Adventures in Wonderland by Lewis Carroll 1865] + +CHAPTER I. Down the Rabbit-Hole + +Alice was beginning to get very tired of sitting by her sister on the +bank, and of having nothing to do: once or twice she had peeped into the +book her sister was reading, but it had no pictures or conversations in +it, 'and what is the use of a book,' thought Alice 'without pictures or +conversation?' + +So she was considering in her own mind (as well as she could, for the +hot day made her feel very sleepy and stupid), whether the pleasure +of making a daisy-chain would be worth the trouble of getting up and +picking the daisies, when suddenly a White Rabbit with pink eyes ran +close by her. + +There was nothing so VERY remarkable in that; nor did Alice think it so +VERY much out of the way to hear the Rabbit say to itself, 'Oh dear! +Oh dear! I shall be late!' (when she thought it over afterwards, it +occurred to her that she ought to have wondered at this, but at the time +it all seemed quite natural); but \ No newline at end of file diff --git a/data/gutenberg/chesterton-ball.txt b/data/gutenberg/chesterton-ball.txt new file mode 100644 index 0000000..7efd363 --- /dev/null +++ b/data/gutenberg/chesterton-ball.txt @@ -0,0 +1,21 @@ +[The Ball and The Cross by G.K. 
Chesterton 1909] + + +I. A DISCUSSION SOMEWHAT IN THE AIR + +The flying ship of Professor Lucifer sang through the skies like +a silver arrow; the bleak white steel of it, gleaming in the +bleak blue emptiness of the evening. That it was far above the +earth was no expression for it; to the two men in it, it seemed +to be far above the stars. The professor had himself invented +the flying machine, and had also invented nearly everything in +it. Every sort of tool or apparatus had, in consequence, to the +full, that fantastic and distorted look which belongs to the +miracles of science. For the world of science and evolution is +far more nameless and elusive and like a dream than the world of +poetry and religion; since in the latter images and ideas remain +themselves eternally, while it is the whole idea of evolution +that identities melt into each other as they do in a nightmare. + +All the tools of Professor Lucifer were the ancient human tools +gone mad, grown into \ No newline at end of file diff --git a/data/gutenberg/chesterton-brown.txt b/data/gutenberg/chesterton-brown.txt new file mode 100644 index 0000000..85c5d03 --- /dev/null +++ b/data/gutenberg/chesterton-brown.txt @@ -0,0 +1,20 @@ +[The Wisdom of Father Brown by G. K. Chesterton 1914] + + +I. The Absence of Mr Glass + + +THE consulting-rooms of Dr Orion Hood, the eminent criminologist +and specialist in certain moral disorders, lay along the sea-front +at Scarborough, in a series of very large and well-lighted french windows, +which showed the North Sea like one endless outer wall of blue-green marble. +In such a place the sea had something of the monotony of a blue-green dado: +for the chambers themselves were ruled throughout by a terrible tidiness +not unlike the terrible tidiness of the sea. It must not be supposed +that Dr Hood's apartments excluded luxury, or even poetry. +These things were there, in their place; but one felt that +they were never allowed out of their place. 
Luxury was there: +there stood upon a special table eight or ten boxes of the best cigars; +but they were built upon a plan so that the strongest were always +nearest the wall and the mildest nearest the window. A tantalus +containing three kinds of sp \ No newline at end of file diff --git a/data/gutenberg/chesterton-thursday.txt b/data/gutenberg/chesterton-thursday.txt new file mode 100644 index 0000000..e44bd92 --- /dev/null +++ b/data/gutenberg/chesterton-thursday.txt @@ -0,0 +1,20 @@ +[The Man Who Was Thursday by G. K. Chesterton 1908] + +To Edmund Clerihew Bentley + +A cloud was on the mind of men, and wailing went the weather, +Yea, a sick cloud upon the soul when we were boys together. +Science announced nonentity and art admired decay; +The world was old and ended: but you and I were gay; +Round us in antic order their crippled vices came-- +Lust that had lost its laughter, fear that had lost its shame. +Like the white lock of Whistler, that lit our aimless gloom, +Men showed their own white feather as proudly as a plume. +Life was a fly that faded, and death a drone that stung; +The world was very old indeed when you and I were young. +They twisted even decent sin to shapes not to be named: +Men were ashamed of honour; but we were not ashamed. +Weak if we were and foolish, not thus we failed, not thus; +When that black Baal blocked the heavens he had no hymns from us +Children we were--our forts of sand were even as weak as eve, +High as they went we piled them up to break that b \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..00e9eb6 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,55 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- Project information ----------------------------------------------------- + +project = 'testdoc' +copyright = '2023, Luke Ruud' +author = 'Luke Ruud' + +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon" +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..7024bb2 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. testdoc documentation master file, created by + sphinx-quickstart on Mon Jul 24 13:17:26 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to testdoc's documentation! +=================================== + + +.. automodule:: cdstemplate.word_count + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..954237b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..aa51e59 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,18 @@ +schema: '2.0' +stages: + count-words: + cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv + data/gutenberg/*.txt --case-insensitive + deps: + - path: data/gutenberg + md5: 41d960155f1a7f55480c03cea68ba2a7.dir + size: 10940 + nfiles: 11 + - path: src/cdstemplate/corpus_counter_script.py + hash: md5 + md5: a4bb400c0cfd7050ac4b761b550a0a56 + size: 2582 + outs: + - path: data/gutenberg_counts.csv + md5: 74abc508b4e4015ab4136405df251a57 + size: 4922 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..ed00753 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,8 @@ +stages: + count-words: + cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive + deps: + - src/cdstemplate/corpus_counter_script.py + - data/gutenberg + outs: + - data/gutenberg_counts.csv \ No newline at end of file diff --git a/notebooks/word_count_prototype.ipynb b/notebooks/word_count_prototype.ipynb new file mode 100644 index 0000000..25fb9a2 --- /dev/null +++ b/notebooks/word_count_prototype.ipynb @@ -0,0 +1,372 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word Count Prototype\n", + "This notebook presents a prototype of our word count experiment example. It calls our `cdstemplate` library directly, to avoid reduplicating code, then creates a plot of the most frequent words in the corpus. Notebooks are a great way to create visualizations, which often need to be tweaked for readability and aesthetics. 
\n", + "\n", + "Don't forget to restart your kernel and re-run the notebook completely before you commit or share it with others! This helps avoid problems arising from deleted or reordered cells." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# This cell imports packages and defines any experimental set-up \n", + "# Try to make experimental parameters easy for others to to find by including them in a few cells at the beginning of your notebook\n", + "import logging\n", + "from pathlib import Path\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from cdstemplate import word_count\n", + "\n", + "# You can include this to see log messages from the packages you're using\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# A relative path to the input data\n", + "input_txt_dir = \"../data/gutenberg\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:cdstemplate.word_count:Adding 181 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 122 word types\n", + "INFO:cdstemplate.word_count:Adding 189 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 105 word types\n", + "INFO:cdstemplate.word_count:Adding 180 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 92 word types\n", + "INFO:cdstemplate.word_count:Adding 192 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 95 word types\n", + "INFO:cdstemplate.word_count:Adding 246 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 83 word types\n", + "INFO:cdstemplate.word_count:Adding 189 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 83 word types\n", + 
"INFO:cdstemplate.word_count:Adding 258 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 82 word types\n", + "INFO:cdstemplate.word_count:Adding 204 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 57 word types\n", + "INFO:cdstemplate.word_count:Adding 197 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 90 word types\n", + "INFO:cdstemplate.word_count:Adding 180 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 74 word types\n", + "INFO:cdstemplate.word_count:Adding 182 token(s) case insensitively\n", + "INFO:cdstemplate.word_count:Vocabulary size increased by 66 word types\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokenizing file: ../data/gutenberg/austen-persuasion.txt\n", + "Tokenizing file: ../data/gutenberg/chesterton-ball.txt\n", + "Tokenizing file: ../data/gutenberg/chesterton-brown.txt\n", + "Tokenizing file: ../data/gutenberg/carroll-alice.txt\n", + "Tokenizing file: ../data/gutenberg/bryant-stories.txt\n", + "Tokenizing file: ../data/gutenberg/burgess-busterbrown.txt\n", + "Tokenizing file: ../data/gutenberg/blake-poems.txt\n", + "Tokenizing file: ../data/gutenberg/bible-kjv.txt\n", + "Tokenizing file: ../data/gutenberg/chesterton-thursday.txt\n", + "Tokenizing file: ../data/gutenberg/austen-sense.txt\n", + "Tokenizing file: ../data/gutenberg/austen-emma.txt\n" + ] + } + ], + "source": [ + "# Add counts for each document\n", + "corpus_counter = word_count.CorpusCounter()\n", + "for txt_file in Path(input_txt_dir).glob(\"*.txt\"):\n", + " print(\"Tokenizing file:\", txt_file)\n", + " txt_contents = txt_file.read_text()\n", + " corpus_counter.add_doc(txt_contents)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocab size: 949\n" + ] + } + ], 
+ "source": [ + "# How many unique words appeared in our corpus?\n", + "print(\"Vocab size:\", corpus_counter.get_vocab_size())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokencount
616\"Drop1
91\"ELLIOT1
571\"I1
475\"It's2
484\"No,1
\n", + "
" + ], + "text/plain": [ + " token count\n", + "616 \"Drop 1\n", + "91 \"ELLIOT 1\n", + "571 \"I 1\n", + "475 \"It's 2\n", + "484 \"No, 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get the dataframe we'll work with for the display\n", + "word_count_df = corpus_counter.get_token_counts_as_dataframe()\n", + "display(word_count_df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokencount
29the127
10of72
38and62
16a43
162to38
.........
259one3
348what3
376when3
704firmament3
147white3
\n", + "

100 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " token count\n", + "29 the 127\n", + "10 of 72\n", + "38 and 62\n", + "16 a 43\n", + "162 to 38\n", + ".. ... ...\n", + "259 one 3\n", + "348 what 3\n", + "376 when 3\n", + "704 firmament 3\n", + "147 white 3\n", + "\n", + "[100 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# We only want to include to top 100 most frequent words in our plot\n", + "top_words_df = word_count_df.sort_values(\"count\", ascending=False).head(100)\n", + "display(top_words_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Top 100 Most Frequent Words in a Subset of Project Gutenberg Texts')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Make the plot and customize how it is displayed\n", + "plt.figure(figsize= (20, 15))\n", + "fig = sns.barplot(data = top_words_df, x = \"count\", y = \"token\")\n", + "fig.set_title(\"Top 100 Most Frequent Words in a Subset of Project Gutenberg Texts\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "8db04dca66b9396af2474eca4189d3a8ab65d348a7a173a34f354ffe25d5d9d4" + }, + "kernelspec": { + "display_name": "Python 3.10.4 ('template')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1d8a32b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +# A build system is required to convert your code into a distributable package. +# setuptools is the oldest and most common build tool, but we also like Poetry +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "cdstemplate" +version = "2.0.0" +description = "A template repo for data science and machine learning projects at UMass Center for Data Science." +readme = "README.md" + +# What version of python does your library work with? 
+requires-python = ">=3.10"
+
+# Metadata about your package in case you upload it to PYPI
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+# All the dependencies needed for running your module go here
+dependencies = [
+    "dvc>=2.42.0",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+]
+
+[project.optional-dependencies]
+# Extra dependencies only needed for running tests go here
+test = ["pytest"]
+
+# Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here
+dev = [
+    "ruff",
+    "jupyter",
+    "matplotlib",
+    "seaborn",
+    "sphinx",
+]
+
+# If your project contains scripts you'd like to be available on the command line, you can define them here.
+# The value must be of the form "PACKAGE_NAME:MODULE_NAME.FUNCTION_NAME"
+[project.scripts]
+corpus-counter = "cdstemplate:corpus_counter_script.main_cli"
diff --git a/src/cdstemplate/__init__.py b/src/cdstemplate/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cdstemplate/corpus_counter_script.py b/src/cdstemplate/corpus_counter_script.py
new file mode 100644
index 0000000..404550d
--- /dev/null
+++ b/src/cdstemplate/corpus_counter_script.py
@@ -0,0 +1,68 @@
+"""An example of a script you can run. It tokenizes a folder of input documents and
+writes the corpus counts to a user-specified CSV file
+"""
+# Import modules, functions and classes from external libraries
+import argparse
+import logging
+from pathlib import Path
+
+# Import the code from this project needed for this script
+from cdstemplate import word_count, utils
+
+logger = logging.getLogger(__name__)
+
+def main_cli():
+    """A wrapper function that defines command line arguments and help messages for
+    when the user wants to run this module's code as a script.
+    """
+    # The argument parser gives nice ways to include help messages and specify which arguments
+    # are required or optional, see https://docs.python.org/3/library/argparse.html#prog for usage instructions
+    parser = argparse.ArgumentParser(
+        description="A script to generate counts of tokens in a corpus"
+    )
+
+    parser.add_argument(
+        "csv", help="Path to the output CSV storing token counts. Required."
+    )
+
+    parser.add_argument(
+        "documents",
+        nargs="+",
+        help="Paths to at least one raw text document that make up the corpus. Required.",
+    )
+    parser.add_argument(
+        "--case-insensitive",
+        "-c",
+        action="store_true",
+        help="Default is to have case sensitive tokenization. Use this flag to make the token counting case insensitive. Optional.",
+    )
+
+    args = parser.parse_args()
+    utils.configure_logging()
+    logger.info("Command line arguments: %s", args)
+    main(args.csv, args.documents, args.case_insensitive)
+
+
+def main(csv_out, documents, case_insensitive=False):
+    """Determine cumulative word counts for a list of documents and write the results to a CSV file
+
+    :param csv_out: output CSV file path
+    :type csv_out: str or Path
+    :param documents: list of paths to documents to parse word counts from
+    :type documents: list of str
+    :param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False
+    :type case_insensitive: bool, optional
+    """
+    cc = word_count.CorpusCounter(case_insensitive=case_insensitive)
+    for i, doc in enumerate(documents):
+        if i % 2 == 0:
+            logger.info("Tokenizing document number %s: %s", i, doc)
+        cc.add_doc(Path(doc).read_text())
+
+    cc.save_token_counts(csv_out)
+
+
+# The entry point of your script - if a user runs it from the command line, for example using `python -m PACKAGE.MODULE`
+# or `python SCRIPT.py`, this is what will be run.
+if __name__ == "__main__":
+    main_cli()
diff --git a/src/cdstemplate/utils.py b/src/cdstemplate/utils.py
new file mode 100644
index 0000000..320e122
--- /dev/null
+++ b/src/cdstemplate/utils.py
@@ -0,0 +1,12 @@
+"""A module for important set-up and configuration functionality, but doesn't implement the library's key features.
+"""
+import logging
+
+
+def configure_logging():
+    """A helper method that configures logging, usable by any script in this library.
+    """
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="%(levelname)s : %(asctime)s : %(name)s : %(message)s",
+    )
diff --git a/src/cdstemplate/word_count.py b/src/cdstemplate/word_count.py
new file mode 100644
index 0000000..8037dce
--- /dev/null
+++ b/src/cdstemplate/word_count.py
@@ -0,0 +1,114 @@
+"""An example of a module with functions and a class that can be imported once the package is installed.
+This module provides operations for tokenization and tracking cumulative word counts in a set of documents.
+"""
+from collections import Counter
+import logging
+import re
+
+import pandas as pd
+
+# You should use logging instead of print statements in code others will use,
+# so they can customize how much detail to see from your package
+# Refer to https://realpython.com/python-logging/ for detailed examples.
+logger = logging.getLogger(__name__)
+
+
+def tokenize(text, pattern=r"\s"):
+    """Returns a list of strings, the text split into tokens based on the regex pattern to identify boundaries.
+
+    :param text: the document to tokenize
+    :type text: str
+    :param pattern: regex string to split the text on
+    :type pattern: str
+    """
+    logger.debug("Tokenizing '%s' with pattern '%s'", text, pattern)
+
+    tokenized = re.split(pattern, text)
+    logger.debug("%s token(s) found.", len(tokenized))
+    return tokenized
+
+
+class CorpusCounter:
+    """A simple class object that tracks document and token counts in a corpus.
+    """
+
+    def __init__(self, tokenization_pattern=r"\s", case_insensitive=False):
+        """Constructor instantiates with empty counters
+
+        :param tokenization_pattern: An optional tokenization pattern so that you are consistently tokenizing all documents the same. Defaults to splitting on whitespace
+        :param case_insensitive: Set to True to downcase tokens before counting, defaults to False
+        """
+        self.token_counter = Counter()
+        self.doc_counter = 0
+        self.tokenization_pattern = tokenization_pattern
+        self.case_insensitive = case_insensitive
+        logger.debug(
+            "CorpusCounter instantiated, tokenization pattern: %s, case insensitive: %s",
+            tokenization_pattern,
+            case_insensitive,
+        )
+
+    def add_tokenized_doc(self, token_list):
+        """Tallies an already tokenized document in the corpus, skipping empty-string tokens.
+
+        :param token_list: A tokenized document
+        :type token_list: list or iterable of strings
+        """
+        before_vocab_size = self.get_vocab_size()
+        non_empty_tokens = [w for w in token_list if w != ""]
+        if self.case_insensitive:
+            logger.info("Adding %s token(s) case insensitively", len(non_empty_tokens))
+            self.token_counter.update([w.lower() for w in non_empty_tokens])
+        else:
+            logger.info("Adding %s token(s) case sensitively", len(non_empty_tokens))
+            self.token_counter.update(non_empty_tokens)
+        after_vocab_size = self.get_vocab_size()
+
+        logger.info(
+            "Vocabulary size increased by %s word types",
+            after_vocab_size - before_vocab_size,
+        )
+
+        self.doc_counter += 1
+
+    def add_doc(self, untokenized_doc):
+        """Tokenizes a document and adds it to the corpus.
+
+        :param untokenized_doc: The document to count tokens for
+        :type untokenized_doc: str
+        """
+        tokenized = tokenize(untokenized_doc, self.tokenization_pattern)
+        self.add_tokenized_doc(tokenized)
+
+    def get_token_count(self, token):
+        """Returns the count of a given token in the corpus
+
+        :param token: The token to retrieve counts of
+        :type token: str
+        """
+        return self.token_counter[token]
+
+    def get_vocab_size(self):
+        """Returns vocabulary size (number of unique tokens)
+        """
+        return len(self.token_counter)
+
+    def get_token_counts_as_dataframe(self):
+        """Returns the token counts of the corpus as a Pandas DataFrame with columns 'token', 'count'
+        """
+        dataframe = pd.DataFrame.from_records(
+            list(self.token_counter.items()), columns=["token", "count"]
+        )
+        dataframe = dataframe.sort_values("token")
+        return dataframe
+
+    def save_token_counts(self, csv_file):
+        """Saves the counts of tokens in the corpus to a specified
+        CSV file in alphabetical order
+
+        :param csv_file: Path to desired CSV output file
+        :type csv_file: str or Path
+        """
+        logger.info("Saving token counts to %s", csv_file)
+        self.get_token_counts_as_dataframe().to_csv(csv_file, index=False, header=True)
+
diff --git a/tests/test_word_count.py b/tests/test_word_count.py
new file mode 100644
index 0000000..aa91336
--- /dev/null
+++ b/tests/test_word_count.py
@@ -0,0 +1,105 @@
+"""Tests for the cdstemplate.word_count methods and classes.
+
+In pytest, each individual test is a python function that starts with `test`.
+"""
+# Import your library for testing
+from cdstemplate import word_count
+
+
+def test_tokenize_document():
+    my_document = "It was all very well to say `Drink me,' but the wise little Alice was not going to do that in a hurry."
+
+    expected_tokens = [
+        "It",
+        "was",
+        "all",
+        "very",
+        "well",
+        "to",
+        "say",
+        "`Drink",
+        "me,'",
+        "but",
+        "the",
+        "wise",
+        "little",
+        "Alice",
+        "was",
+        "not",
+        "going",
+        "to",
+        "do",
+        "that",
+        "in",
+        "a",
+        "hurry.",
+    ]
+
+    assert word_count.tokenize(my_document) == expected_tokens
+
+
+def test_tokenize_change_pattern():
+    formatted_document = "here's-a-document-with-strange-formatting"
+    expected_tokens = ["here's", "a", "document", "with", "strange", "formatting"]
+    assert word_count.tokenize(formatted_document, pattern="-") == expected_tokens
+
+
+def test_corpus_counter_init():
+    cc = word_count.CorpusCounter()
+    assert cc.doc_counter == 0
+    assert cc.get_token_count("word") == 0
+    assert not cc.case_insensitive
+    assert cc.tokenization_pattern == r"\s"
+
+
+def test_corpus_counter_add_docs():
+    cc = word_count.CorpusCounter()
+    cc.add_doc("a b a word")
+    assert cc.doc_counter == 1
+    assert cc.get_token_count("a") == 2
+    assert cc.get_token_count("b") == 1
+    assert cc.get_token_count("word") == 1
+    cc.add_tokenized_doc(["Word", "word", "b"])
+    assert cc.get_token_count("a") == 2
+    assert cc.get_token_count("b") == 2
+    assert cc.get_token_count("word") == 2
+    assert cc.get_token_count("Word") == 1
+
+
+def test_corpus_counter_add_empty_doc():
+    cc = word_count.CorpusCounter()
+    cc.add_doc("")
+    assert cc.doc_counter == 1
+    assert len(cc.token_counter) == 0
+
+
+def test_corpus_counter_case_insensitive():
+    cc = word_count.CorpusCounter(case_insensitive=True)
+    cc.add_doc("A a B b")
+    assert cc.get_token_count("a") == 2
+    assert cc.get_token_count("b") == 2
+    assert cc.get_token_count("A") == 0
+    assert cc.get_token_count("B") == 0
+
+
+def test_corpus_counter_to_dataframe():
+    cc = word_count.CorpusCounter()
+    cc.add_doc("A a B b")
+    dataframe = cc.get_token_counts_as_dataframe()
+    assert dataframe.shape == (4, 2)
+    assert list(dataframe.columns) == ["token", "count"]
+    assert set(dataframe["token"]) == set(["A", "a", "B", "b"])
+
+
+# The tmp_path fixture allows you to save results to a temporary directory
+# that will automatically be cleaned up by the OS later
+def test_corpus_counter_save_csv(tmp_path):
+    my_csv = tmp_path / "token_count.csv"
+    cc = word_count.CorpusCounter()
+    cc.add_doc("a b c")
+    cc.add_doc("a x y z")
+    cc.save_token_counts(my_csv)
+    assert my_csv.exists()
+    assert my_csv.is_file()
+    expected_csv = "token,count\na,2\nb,1\nc,1\nx,1\ny,1\nz,1\n"
+    assert my_csv.read_text() == expected_csv