diff --git a/.all-contributorsrc b/.all-contributorsrc index 5164f9e..584146d 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -3,7 +3,7 @@ "README.md" ], "imageSize": 100, - "commit": false, + "commit": true, "commitConvention": "angular", "contributors": [ { @@ -26,7 +26,10 @@ "code", "review", "test", - "blog" + "blog", + "example", + "doc", + "platform" ] }, { @@ -64,6 +67,7 @@ } ], "contributorsPerLine": 7, + "linkToUsage": true, "skipCi": true, "repoType": "github", "repoHost": "https://github.com", diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index de49a27..515659b 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -1,4 +1,4 @@ -name: style + docs check +name: Check style + docs + types on: pull_request: @@ -12,9 +12,9 @@ jobs: working-directory: . steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.10" cache: 'pip' @@ -22,7 +22,17 @@ jobs: - name: Install package with dependencies run: pip install -e ".[dev]" if: steps.python-cache.outputs.cache-hit != 'true' + + # check code style - name: Run black run: black src --check --diff + + # check docs - name: Check that documentation can be built run: tox -e docs + + # check types with mypy + - name: Install mypy + run: pip install mypy + - name: Check types in python src directory; install needed types + run: mypy --install-types --non-interactive src diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 15b2668..b9678e9 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,14 +11,14 @@ on: env: # python version used to calculate and submit code coverage - COV_PYTHON_VERSION: "3.10" + COV_PYTHON_VERSION: "3.11" jobs: python-unit: runs-on: ubuntu-latest strategy: matrix: - python: ["3.8", "3.9", "3.10", "3.11"] + python: ["3.9", "3.10", "3.11", 
"3.12"] defaults: run: working-directory: . diff --git a/CHANGELOG.md b/CHANGELOG.md index 672b526..e42714f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Change Log +## 0.2 + +- Undate and UndateInterval now include an optional label for named dates or time periods +- Support partially known dates with missing digits (e.g. 1991-1?-10) +- Rich comparison checks for Undate + - improved equality check; now supports comparing Undate object with day precision to datetime.date + - implementations and tests for comparison, sorting and contains (`>`, `<`, `>=`, `<=`, and `in`) +- static method to initialize an Undate object from a datetime.date (used for comparisons) +- Example Jupyter notebook comparing Undate duration calculation against + dates and durations in the [Shakespeare and Company Project](https://shakespeareandco.princeton.edu/) [events dataset](https://doi.org/10.34770/nz90-ym25) +- Preliminary support for parsing Extended Date Time Format (EDTF) level 0 and some of level 1 and transforming into Undate objects +- Dropped support for python 3.8; added python 3.12 +- Python type improvements and preliminary type checking with mypy + ## 0.1 Pre-alpha version with preliminary `Undate` and `UndateInterval` classes diff --git a/README.md b/README.md index 034fef0..f26fd20 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ # undate-python + +[![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](#contributors-) + **undate** is a python library for working with uncertain or partially known dates. @@ -59,12 +62,9 @@ This repository uses [git-flow](https://github.com/nvie/gitflow) branching conve - **main** will always contain the most recent release - **develop** branch is the latest version of work in progress -Pull requests should be made against the **develop** branch. +Pull requests for new features should be made against the **develop** branch. 
-It is recommended to install git flow (on OSX, use brew or ports, e.g.: `brew install git-flow`; -on Ubuntu/Debian, `apt-get install git-flow`) and then initialize it in this repository -via `git flow init` and accept the defaults. Then you can use `git flow feature start` -to create feature development branches. +It is recommended to install git flow (on OSX, use brew or ports, e.g.: `brew install git-flow`; on Ubuntu/Debian, `apt-get install git-flow`) and then initialize it in this repository via `git flow init` and accept the defaults. Then you can use `git flow feature start` to create feature development branches. Alternately, you can check out the develop branch (`git checkout develop`) and create your branches manually based on develop (`git checkout -b feature/xxx-name`). @@ -96,6 +96,18 @@ To run all the tests in a single test file, use pytest and specify the path to t To test cases by name, use pytest: `pytest -k test_str` +### Check python types + +Python typing is currently only enforced by a CI check action using `mypy`. +To run mypy locally, first install mypy and the necessary typing libraries: +```sh +pip install mypy +mypy --install-types +``` + +Once mypy is installed, run `mypy src/` to check. + + ### Create documentation ```sh @@ -111,12 +123,21 @@ tox -e docs Cole Crawford
Cole Crawford

💻 👀 ⚠️ - Rebecca Sutton Koeser
Rebecca Sutton Koeser

💻 👀 ⚠️ 📝 + Rebecca Sutton Koeser
Rebecca Sutton Koeser

💻 👀 ⚠️ 📝 💡 📖 📦 Robert Casties
Robert Casties

🔣 Julia Damerow
Julia Damerow

💻 👀 ⚠️ 📋 Malte Vogl
Malte Vogl

💻 👀 ⚠️ 📖 + + + + + Add your contributions + + + + diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb new file mode 100644 index 0000000..6af42bb --- /dev/null +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -0,0 +1,2666 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "s_holu9LI6q1" + }, + "source": [ + "# Compare partial date duration logic\n", + "\n", + "Comparing `UndateInterval` with similar work from Shakespeare and Company Project (S&co for short).\n", + "\n", + "This notebook compares the `UndateInterval` duration calculation for date ranges between partially known dates with the similar logic implemented in the [Shakespeare and Company Project](https://shakespeareandco.princeton.edu/) [events dataset](https://doi.org/10.34770/nz90-ym25). Event start and end dates are in ISO8601 format and include as much precision for the date as is known; format is one of: YYYY, YYYY-MM, YYYY-MM-DD, --MM-DD \n", + "\n", + "Deciding how to calculate date ranges may be contextual; current UndateInterval logic includes both the start and the end date, while the S&co logic does not - so they are off by one. Once we make that adjustment, the borrowing durations in the S&co data match the logic in Undate.\n", + "\n", + "Subscription durations in S&co are sometimes known to be for a particular term (e.g. 
a year or six months) but without specific dates, perhaps only a year or year and month; Undate calculates durations based on the earliest and latest days in the range, so it overestimates these durations.\n", + "\n", + "*Notebook authored by Rebecca Sutton Koeser, 2023.*\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate/bin/python -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + }, + "id": "Q7KZRmj_4ySW", + "outputId": "ee3cacd7-c347-437a-ee8e-91a4086d6e88" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/tn/1_gbhpks7hqbkbln2gjhcdvm0000gp/T/ipykernel_37223/1465467117.py:6: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
0Generic1920NaNhttps://shakespeareandco.princeton.edu/members...Raymonde LinossierLinossier, RaymondeNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...Pigs Is PigsNaNButler, Ellis Parker1906.0NaNLending Library CardSylvia Beach, Raymonde Linossier Lending Libra...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/00...
1Subscription1921NaNhttps://shakespeareandco.princeton.edu/members...Mme GarretaGarreta, MmeNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
2Borrow19221922-08-23https://shakespeareandco.princeton.edu/members...Mr. RhysRhys, Mr.NaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/c...TyphoonNaNConrad, Joseph1902.0NaNLending Library CardSylvia Beach, Rhys Lending Library Card, Box 4...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/67...
3Generic1922NaNhttps://shakespeareandco.princeton.edu/members...Ernest WalshWalsh, ErnestNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...The Pretty LadyNaNBennett, Arnold1918.0NaNLending Library CardSylvia Beach, Ernest Walsh Lending Library Car...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/af...
4Subscription1922NaNhttps://shakespeareandco.princeton.edu/members...Mr. LincolnLincoln, Mr.NaN7.0NaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " event_type start_date end_date \\\n", + "0 Generic 1920 NaN \n", + "1 Subscription 1921 NaN \n", + "2 Borrow 1922 1922-08-23 \n", + "3 Generic 1922 NaN \n", + "4 Subscription 1922 NaN \n", + "\n", + " member_uris member_names \\\n", + "0 https://shakespeareandco.princeton.edu/members... Raymonde Linossier \n", + "1 https://shakespeareandco.princeton.edu/members... Mme Garreta \n", + "2 https://shakespeareandco.princeton.edu/members... Mr. Rhys \n", + "3 https://shakespeareandco.princeton.edu/members... Ernest Walsh \n", + "4 https://shakespeareandco.princeton.edu/members... Mr. Lincoln \n", + "\n", + " member_sort_names subscription_price_paid subscription_deposit \\\n", + "0 Linossier, Raymonde NaN NaN \n", + "1 Garreta, Mme NaN NaN \n", + "2 Rhys, Mr. NaN NaN \n", + "3 Walsh, Ernest NaN NaN \n", + "4 Lincoln, Mr. NaN 7.0 \n", + "\n", + " subscription_duration subscription_duration_days ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... \n", + "\n", + " item_uri item_title \\\n", + "0 https://shakespeareandco.princeton.edu/books/b... Pigs Is Pigs \n", + "1 NaN NaN \n", + "2 https://shakespeareandco.princeton.edu/books/c... Typhoon \n", + "3 https://shakespeareandco.princeton.edu/books/b... The Pretty Lady \n", + "4 NaN NaN \n", + "\n", + " item_volume item_authors item_year item_notes \\\n", + "0 NaN Butler, Ellis Parker 1906.0 NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN Conrad, Joseph 1902.0 NaN \n", + "3 NaN Bennett, Arnold 1918.0 NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "0 Lending Library Card Sylvia Beach, Raymonde Linossier Lending Libra... \n", + "1 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "2 Lending Library Card Sylvia Beach, Rhys Lending Library Card, Box 4... \n", + "3 Lending Library Card Sylvia Beach, Ernest Walsh Lending Library Car... 
\n", + "4 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest \\\n", + "0 https://figgy.princeton.edu/concern/scanned_re... \n", + "1 NaN \n", + "2 https://figgy.princeton.edu/concern/scanned_re... \n", + "3 https://figgy.princeton.edu/concern/scanned_re... \n", + "4 NaN \n", + "\n", + " source_image \n", + "0 https://iiif.princeton.edu/loris/figgy_prod/00... \n", + "1 NaN \n", + "2 https://iiif.princeton.edu/loris/figgy_prod/67... \n", + "3 https://iiif.princeton.edu/loris/figgy_prod/af... \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# load the 1.2 version of S&co events dataset\n", + "# for convenience, we load a copy from GitHub\n", + "\n", + "events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n", + "events_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0Y6CsfIAJoqi" + }, + "source": [ + "## Define a method to parse dates and calculate duration\n", + "\n", + "Define a method to initialize an `UndateInterval` from start and end date strings in ISO format as used in S&co datasets\n", + "\n", + "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code; because the S&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "y_MqgrQW64uI" + }, + "outputs": [], + "source": [ + "from undate.undate import UndateInterval\n", + "from undate.dateformat.iso8601 import ISO8601DateFormat\n", + "\n", + "def undate_duration(start_date, end_date):\n", + " isoformat = ISO8601DateFormat()\n", + "\n", + " unstart = isoformat.parse(start_date)\n", + " unend = isoformat.parse(end_date)\n", + " interval = UndateInterval(earliest=unstart, latest=unend)\n", + "\n", + " # subtract one here for simplicity of comparison,\n", + " # to reconcile the difference in how Undate and S&co duration logic count days\n", + "\n", + " return interval.duration().days - 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JBVWMB7lJbYB" + }, + "source": [ + "## Compare subscription event durations\n", + "\n", + "S&co data includes membership subscriptions with known duration; the dataset includes them in a human readable format (`subscription_duration`) and in a numeric form (`subscription_duration_days`).\n", + "\n", + "Select subscription events with available duration information to compare with Undate logic." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "c8iPHU5K58cz", + "outputId": "c0cc72ef-ed0b-4a30-d7b5-ea21ef0582c7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_days
28Arthur Elliott Felkin192719281 year365.0
70Geraldine Deknatel;William Deknatel193119321 year365.0
233Mrs. G. S. Madam1921-071921-081 month31.0
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.0
260Victor Llona1923-061923-104 months122.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "28 Arthur Elliott Felkin 1927 1928 \n", + "70 Geraldine Deknatel;William Deknatel 1931 1932 \n", + "233 Mrs. G. S. Madam 1921-07 1921-08 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", + "260 Victor Llona 1923-06 1923-10 \n", + "\n", + " subscription_duration subscription_duration_days \n", + "28 1 year 365.0 \n", + "70 1 year 365.0 \n", + "233 1 month 31.0 \n", + "234 5 months 153.0 \n", + "260 4 months 122.0 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# identify subscription events with duration information\n", + "subs_duration = events_df[events_df.subscription_duration_days.notna()]\n", + "# limit to fields that are relevant for this exploration\n", + "subs_duration = subs_duration[['member_names', 'start_date', 'end_date', 'subscription_duration', 'subscription_duration_days']]\n", + "subs_duration.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subscription duration exploration\n", + "\n", + "Briefly explore the duration data information for these subscriptions.\n", + "\n", + "What do the duration day values look like? What range of values?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9i0aN7iQ6voY", + "outputId": "fe1ac93f-5571-4bd3-e4c1-06e90cf33f5c" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "subscription_duration_days\n", + "31.0 2997\n", + "30.0 1975\n", + "92.0 936\n", + "91.0 397\n", + "365.0 337\n", + " ...
\n", + "69.0 1\n", + "36.0 1\n", + "73.0 1\n", + "574.0 1\n", + "171.0 1\n", + "Name: count, Length: 133, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# What do the subscription duration day values look like?\n", + "subs_duration.subscription_duration_days.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aGqi4LRp60tV", + "outputId": "fbd61c94-41ab-40a7-87c2-cf0548c75d5a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "count 9146.000000\n", + "mean 72.142685\n", + "std 81.559368\n", + "min 1.000000\n", + "25% 30.000000\n", + "50% 31.000000\n", + "75% 91.000000\n", + "max 574.000000\n", + "Name: subscription_duration_days, dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subs_duration.subscription_duration_days.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Do we have any subscriptions with known duration but unknown start or end date?" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "pUsAb16MKqvb", + "outputId": "27f3b8e7-c5a5-4297-eb7e-e37e81945dda" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
\n", + "

0 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", + "Index: []\n", + "\n", + "[0 rows x 28 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# events with unknown start date\n", + "subs_duration[subs_duration.start_date.isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "0odaog0eK0CN", + "outputId": "1e8814ff-0043-4969-b1d1-7574c3e82008" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_days
13168Jean (Bakewell) Connolly / Mrs. Cyril Connolly1932-10-06NaNNaN31.0
13686Stanislas Pascal Franchot1933-03-02NaNNaN31.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "13168 Jean (Bakewell) Connolly / Mrs. Cyril Connolly 1932-10-06 NaN \n", + "13686 Stanislas Pascal Franchot 1933-03-02 NaN \n", + "\n", + " subscription_duration subscription_duration_days \n", + "13168 NaN 31.0 \n", + "13686 NaN 31.0 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# events with unknown end date\n", + "subs_duration[subs_duration.end_date.isna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two one-month subscriptions with known start date but end date not set. Exclude those from our comparison." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "jwvN9-CgLQRx" + }, + "outputs": [], + "source": [ + "# omit events with unknown end date since we can't recalculate duration\n", + "# (duration in the dataset is based on the subscription duration)\n", + "subs_duration = subs_duration[subs_duration.end_date.notna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Calculate durations with Undate and compare" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "Z-CVWd3z7Jb6", + "outputId": "d52d57d4-9803-4bfa-9708-bdf149c7098b" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_duration
28Arthur Elliott Felkin192719281 year365.0730
70Geraldine Deknatel;William Deknatel193119321 year365.0730
233Mrs. G. S. Madam1921-071921-081 month31.061
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.0180
260Victor Llona1923-061923-104 months122.0152
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "28 Arthur Elliott Felkin 1927 1928 \n", + "70 Geraldine Deknatel;William Deknatel 1931 1932 \n", + "233 Mrs. G. S. Madam 1921-07 1921-08 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", + "260 Victor Llona 1923-06 1923-10 \n", + "\n", + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 \n", + "70 1 year 365.0 730 \n", + "233 1 month 31.0 61 \n", + "234 5 months 153.0 180 \n", + "260 4 months 122.0 152 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add a new field for duration as calculated by Undate using the method defined previously\n", + "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "subs_duration.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "fVf6M2E2LgnH", + "outputId": "87e6585a-670d-466e-d206-caabaaa48df9" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_duration
28Arthur Elliott Felkin192719281 year365.0730
70Geraldine Deknatel;William Deknatel193119321 year365.0730
233Mrs. G. S. Madam1921-071921-081 month31.061
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.0180
260Victor Llona1923-061923-104 months122.0152
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "28 Arthur Elliott Felkin 1927 1928 \n", + "70 Geraldine Deknatel;William Deknatel 1931 1932 \n", + "233 Mrs. G. S. Madam 1921-07 1921-08 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", + "260 Victor Llona 1923-06 1923-10 \n", + "\n", + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 \n", + "70 1 year 365.0 730 \n", + "233 1 month 31.0 61 \n", + "234 5 months 153.0 180 \n", + "260 4 months 122.0 152 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compare undate duration with dataset duration\n", + "subs_duration.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "drnCqTtsL835", + "outputId": "dc042b74-295a-436c-9c70-c6014d986cf7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28Arthur Elliott Felkin192719281 year365.0730365.0
70Geraldine Deknatel;William Deknatel193119321 year365.0730365.0
233Mrs. G. S. Madam1921-071921-081 month31.06130.0
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.018027.0
260Victor Llona1923-061923-104 months122.015230.0
........................
35114Capon1941-11-241941-12-241 month30.0300.0
35115Mme Domer1941-11-241941-12-241 month30.0300.0
35116Quesney1941-12-041942-01-041 month31.0310.0
35118Mlle Renauld1941-12-081942-03-083 months90.0900.0
35119Kohlberg1941-12-091942-01-091 month31.0310.0
\n", + "

9144 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " member_names start_date \\\n", + "28 Arthur Elliott Felkin 1927 \n", + "70 Geraldine Deknatel;William Deknatel 1931 \n", + "233 Mrs. G. S. Madam 1921-07 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 \n", + "260 Victor Llona 1923-06 \n", + "... ... ... \n", + "35114 Capon 1941-11-24 \n", + "35115 Mme Domer 1941-11-24 \n", + "35116 Quesney 1941-12-04 \n", + "35118 Mlle Renauld 1941-12-08 \n", + "35119 Kohlberg 1941-12-09 \n", + "\n", + " end_date subscription_duration subscription_duration_days \\\n", + "28 1928 1 year 365.0 \n", + "70 1932 1 year 365.0 \n", + "233 1921-08 1 month 31.0 \n", + "234 1922-02 5 months 153.0 \n", + "260 1923-10 4 months 122.0 \n", + "... ... ... ... \n", + "35114 1941-12-24 1 month 30.0 \n", + "35115 1941-12-24 1 month 30.0 \n", + "35116 1942-01-04 1 month 31.0 \n", + "35118 1942-03-08 3 months 90.0 \n", + "35119 1942-01-09 1 month 31.0 \n", + "\n", + " undate_duration duration_diff \n", + "28 730 365.0 \n", + "70 730 365.0 \n", + "233 61 30.0 \n", + "234 180 27.0 \n", + "260 152 30.0 \n", + "... ... ... 
\n", + "35114 30 0.0 \n", + "35115 30 0.0 \n", + "35116 31 0.0 \n", + "35118 90 0.0 \n", + "35119 31 0.0 \n", + "\n", + "[9144 rows x 7 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what's the difference between the two?\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z3i984igMNjm", + "outputId": "c8a3580e-a36a-4756-d427-286ba8e5cf91" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "duration_diff\n", + " 0.0 9065\n", + " 30.0 30\n", + " 29.0 21\n", + " 1.0 10\n", + "-1.0 9\n", + " 28.0 4\n", + " 365.0 2\n", + " 27.0 1\n", + " 2.0 1\n", + "-3.0 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subs_duration['duration_diff'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uu9kmAA_gm5o" + }, + "source": [ + "### Investigate discrepancies" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "gdenGvR1MkUG", + "outputId": "589b6b49-3f9c-42d5-e01f-326401007878" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28Arthur Elliott Felkin192719281 year365.0730365.0
70Geraldine Deknatel;William Deknatel193119321 year365.0730365.0
233Mrs. G. S. Madam1921-071921-081 month31.06130.0
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.018027.0
260Victor Llona1923-061923-104 months122.015230.0
261Mrs. L. McNair1923-081923-091 month31.06029.0
271René Martin1924-021924-031 month29.05930.0
272Nigel Monro1924-021924-042 months60.08929.0
293Madeleine Lorsignol1926-031926-107 months214.024430.0
313M. Mathieu1926-111926-121 month30.06030.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "28 Arthur Elliott Felkin 1927 1928 \n", + "70 Geraldine Deknatel;William Deknatel 1931 1932 \n", + "233 Mrs. G. S. Madam 1921-07 1921-08 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", + "260 Victor Llona 1923-06 1923-10 \n", + "261 Mrs. L. McNair 1923-08 1923-09 \n", + "271 RenΓ© Martin 1924-02 1924-03 \n", + "272 Nigel Monro 1924-02 1924-04 \n", + "293 Madeleine Lorsignol 1926-03 1926-10 \n", + "313 M. Mathieu 1926-11 1926-12 \n", + "\n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 \n", + "70 1 year 365.0 730 \n", + "233 1 month 31.0 61 \n", + "234 5 months 153.0 180 \n", + "260 4 months 122.0 152 \n", + "261 1 month 31.0 60 \n", + "271 1 month 29.0 59 \n", + "272 2 months 60.0 89 \n", + "293 7 months 214.0 244 \n", + "313 1 month 30.0 60 \n", + "\n", + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "233 30.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "261 29.0 \n", + "271 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "313 30.0 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# investigate the ones with larger differences\n", + "subset_subdurations = subs_duration[subs_duration.duration_diff != 0]\n", + "subset_subdurations.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9_w1Cwl2N81d", + "outputId": "c0733942-16cd-42bf-c9a3-abbf250e44f5" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "subscription_duration\n", + "1 month 38\n", + "3 months 12\n", + "2 months 7\n", + "6 months 6\n", + "4 months 5\n", + "5 months 3\n", + "1 year 2\n", + "7 months 2\n", + "8 months 2\n", + "11 months 1\n", + "10 months 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "# too many to look at once, can we segment by subscription duration?\n", + "subset_subdurations.subscription_duration.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "snv1qguUOHPB", + "outputId": "dce76078-236b-48ee-9607-5d702cf4ee04" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
233Mrs. G. S. Madam1921-071921-081 month31.06130.0
261Mrs. L. McNair1923-081923-091 month31.06029.0
271René Martin1924-021924-031 month29.05930.0
313M. Mathieu1926-111926-121 month30.06030.0
354Emmanuel Leopold1928-021928-031 month29.05930.0
356Louis Lozowick1928-021928-031 month29.05930.0
393B. Malbert1929-081929-091 month31.06029.0
394M. McPherson1929-081929-091 month31.06029.0
430R. L. Lowey1930-051930-061 month31.06029.0
444Marguerite Gay Hutchinson1930-111930-121 month30.06030.0
462Rubin Goldberg1931-051931-061 month31.06029.0
464E. H. Morgan1931-061931-071 month30.06030.0
466Mona Millard1931-071931-081 month31.06130.0
468Elaine Cammett1931-081931-091 month31.06029.0
472Frederick McWilliam1931-091931-101 month30.06030.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date subscription_duration \\\n", + "233 Mrs. G. S. Madam 1921-07 1921-08 1 month \n", + "261 Mrs. L. McNair 1923-08 1923-09 1 month \n", + "271 RenΓ© Martin 1924-02 1924-03 1 month \n", + "313 M. Mathieu 1926-11 1926-12 1 month \n", + "354 Emmanuel Leopold 1928-02 1928-03 1 month \n", + "356 Louis Lozowick 1928-02 1928-03 1 month \n", + "393 B. Malbert 1929-08 1929-09 1 month \n", + "394 M. McPherson 1929-08 1929-09 1 month \n", + "430 R. L. Lowey 1930-05 1930-06 1 month \n", + "444 Marguerite Gay Hutchinson 1930-11 1930-12 1 month \n", + "462 Rubin Goldberg 1931-05 1931-06 1 month \n", + "464 E. H. Morgan 1931-06 1931-07 1 month \n", + "466 Mona Millard 1931-07 1931-08 1 month \n", + "468 Elaine Cammett 1931-08 1931-09 1 month \n", + "472 Frederick McWilliam 1931-09 1931-10 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 30.0 \n", + "261 31.0 60 29.0 \n", + "271 29.0 59 30.0 \n", + "313 30.0 60 30.0 \n", + "354 29.0 59 30.0 \n", + "356 29.0 59 30.0 \n", + "393 31.0 60 29.0 \n", + "394 31.0 60 29.0 \n", + "430 31.0 60 29.0 \n", + "444 30.0 60 30.0 \n", + "462 31.0 60 29.0 \n", + "464 30.0 60 30.0 \n", + "466 31.0 61 30.0 \n", + "468 31.0 60 29.0 \n", + "472 30.0 60 30.0 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# lots of one-month subscriptions, what do the discrepancies look like?\n", + "subset_subdurations[subset_subdurations.subscription_duration == '1 month'].head(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rm4jqlA4hq9E" + }, + "source": [ + "The first set of these are calculated differently because they are partial dates; undate logic calculates based on earliest possible date through last possible date, but we have additional information in these cases that is project-specific and undate can't take into account, i.e. 
subscription duration is one month starting sometime in a known year or month.\n", + "\n", + "The handful towards the end that are off by one in either direction (+/-) are a little more concerning... (potential bug in S&co code? or value calculated based on known semantic duration?)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "TEL7qdNhOXHL", + "outputId": "50e051d5-18ae-4f24-a229-fc02fb610ed8" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28Arthur Elliott Felkin192719281 year365.0730365.0
70Geraldine Deknatel;William Deknatel193119321 year365.0730365.0
234Anne Moderwell;Hiram Moderwell / H. K. Moderwell1921-091922-025 months153.018027.0
260Victor Llona1923-061923-104 months122.015230.0
272Nigel Monro1924-021924-042 months60.08929.0
293Madeleine Lorsignol1926-031926-107 months214.024430.0
321Thomas MacGreevy1927-031928-0211 months337.036528.0
331Arthur Moss1927-071927-103 months92.012230.0
337Ruth Meyer1927-101928-068 months244.027329.0
349René Leroi1928-011928-043 months91.012029.0
388Gabriel Mourey1929-061930-0410 months304.033329.0
408F. Marsh1930-011930-043 months90.011929.0
409Mrs. Herbert Meyer1930-011930-043 months90.011929.0
412Jacques Delmond1930-011930-098 months243.027229.0
415Loren Mozley1930-021930-064 months120.014929.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date \\\n", + "28 Arthur Elliott Felkin 1927 1928 \n", + "70 Geraldine Deknatel;William Deknatel 1931 1932 \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell 1921-09 1922-02 \n", + "260 Victor Llona 1923-06 1923-10 \n", + "272 Nigel Monro 1924-02 1924-04 \n", + "293 Madeleine Lorsignol 1926-03 1926-10 \n", + "321 Thomas MacGreevy 1927-03 1928-02 \n", + "331 Arthur Moss 1927-07 1927-10 \n", + "337 Ruth Meyer 1927-10 1928-06 \n", + "349 RenΓ© Leroi 1928-01 1928-04 \n", + "388 Gabriel Mourey 1929-06 1930-04 \n", + "408 F. Marsh 1930-01 1930-04 \n", + "409 Mrs. Herbert Meyer 1930-01 1930-04 \n", + "412 Jacques Delmond 1930-01 1930-09 \n", + "415 Loren Mozley 1930-02 1930-06 \n", + "\n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 \n", + "70 1 year 365.0 730 \n", + "234 5 months 153.0 180 \n", + "260 4 months 122.0 152 \n", + "272 2 months 60.0 89 \n", + "293 7 months 214.0 244 \n", + "321 11 months 337.0 365 \n", + "331 3 months 92.0 122 \n", + "337 8 months 244.0 273 \n", + "349 3 months 91.0 120 \n", + "388 10 months 304.0 333 \n", + "408 3 months 90.0 119 \n", + "409 3 months 90.0 119 \n", + "412 8 months 243.0 272 \n", + "415 4 months 120.0 149 \n", + "\n", + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "321 28.0 \n", + "331 30.0 \n", + "337 29.0 \n", + "349 29.0 \n", + "388 29.0 \n", + "408 29.0 \n", + "409 29.0 \n", + "412 29.0 \n", + "415 29.0 " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# durations other than one month\n", + "subset_subdurations[subset_subdurations.subscription_duration != '1 month'].head(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2tk6N7SXKKCu" + }, + "source": [ + "## Compare Borrow event durations\n", + "\n", + "S&co data also includes borrowing 
events with known duration; it uses the same format as subscriptions (`subscription_duration` and `subscription_duration_days`).\n", + "\n", + "Select borrow events with available duration information to compare with Undate logic." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "fA1Nedmz6cyF", + "outputId": "5230d5ad-fec4-4353-a0d2-9676d1aa776d" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_dateborrow_duration_days
602G. E. Pulsford--01-07--01-136.0
603G. E. Pulsford--01-12--01-208.0
604Robert D. Sage--01-16--02-1631.0
605Gertrude Stein--01-19--01-245.0
606G. E. Pulsford--01-20--01-288.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date borrow_duration_days\n", + "602 G. E. Pulsford --01-07 --01-13 6.0\n", + "603 G. E. Pulsford --01-12 --01-20 8.0\n", + "604 Robert D. Sage --01-16 --02-16 31.0\n", + "605 Gertrude Stein --01-19 --01-24 5.0\n", + "606 G. E. Pulsford --01-20 --01-28 8.0" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n", + "# limit to fields we care about for this check\n", + "borrow_duration = borrow_duration[['member_names', 'start_date', 'end_date', 'borrow_duration_days']]\n", + "borrow_duration.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "KPOBIRsTUKM9", + "outputId": "4a251445-e7c7-4250-82df-ece0bc9a3d56" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_dateborrow_duration_days
29903Henri Michaux1961-06-301961-10-0496.0
29904Henri Michaux1961-06-301961-10-0496.0
29905Henri Michaux1961-06-301961-10-0496.0
29907Ann Samyn1961-10-041962-03-21168.0
29908Ann Samyn1961-10-041962-03-21168.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date borrow_duration_days\n", + "29903 Henri Michaux 1961-06-30 1961-10-04 96.0\n", + "29904 Henri Michaux 1961-06-30 1961-10-04 96.0\n", + "29905 Henri Michaux 1961-06-30 1961-10-04 96.0\n", + "29907 Ann Samyn 1961-10-04 1962-03-21 168.0\n", + "29908 Ann Samyn 1961-10-04 1962-03-21 168.0" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "borrow_duration.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "39nEPZva8jDo", + "outputId": "6cff4de2-c188-43ad-dc75-684c4d461029" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_dateborrow_duration_daysundate_duration
602G. E. Pulsford--01-07--01-136.06
603G. E. Pulsford--01-12--01-208.08
604Robert D. Sage--01-16--02-1631.031
605Gertrude Stein--01-19--01-245.05
606G. E. Pulsford--01-20--01-288.08
607Gertrude Stein--01-24--03-2055.055
608Gertrude Stein--01-24--03-2055.055
609Gertrude Stein--01-24--03-2055.055
610Gertrude Stein--01-24--05-30126.0126
611Gertrude Stein--01-24--05-30126.0126
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date borrow_duration_days undate_duration\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6\n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8\n", + "604 Robert D. Sage --01-16 --02-16 31.0 31\n", + "605 Gertrude Stein --01-19 --01-24 5.0 5\n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8\n", + "607 Gertrude Stein --01-24 --03-20 55.0 55\n", + "608 Gertrude Stein --01-24 --03-20 55.0 55\n", + "609 Gertrude Stein --01-24 --03-20 55.0 55\n", + "610 Gertrude Stein --01-24 --05-30 126.0 126\n", + "611 Gertrude Stein --01-24 --05-30 126.0 126" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add a new field for duration as calculated by undate\n", + "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "borrow_duration.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "rL5S47wPWfd-", + "outputId": "127af40e-0037-4f99-d590-9cc2466a206b" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_dateborrow_duration_daysundate_durationduration_diff
602G. E. Pulsford--01-07--01-136.060.0
603G. E. Pulsford--01-12--01-208.080.0
604Robert D. Sage--01-16--02-1631.0310.0
605Gertrude Stein--01-19--01-245.050.0
606G. E. Pulsford--01-20--01-288.080.0
607Gertrude Stein--01-24--03-2055.0550.0
608Gertrude Stein--01-24--03-2055.0550.0
609Gertrude Stein--01-24--03-2055.0550.0
610Gertrude Stein--01-24--05-30126.01260.0
611Gertrude Stein--01-24--05-30126.01260.0
\n", + "
" + ], + "text/plain": [ + " member_names start_date end_date borrow_duration_days \\\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 \n", + "603 G. E. Pulsford --01-12 --01-20 8.0 \n", + "604 Robert D. Sage --01-16 --02-16 31.0 \n", + "605 Gertrude Stein --01-19 --01-24 5.0 \n", + "606 G. E. Pulsford --01-20 --01-28 8.0 \n", + "607 Gertrude Stein --01-24 --03-20 55.0 \n", + "608 Gertrude Stein --01-24 --03-20 55.0 \n", + "609 Gertrude Stein --01-24 --03-20 55.0 \n", + "610 Gertrude Stein --01-24 --05-30 126.0 \n", + "611 Gertrude Stein --01-24 --05-30 126.0 \n", + "\n", + " undate_duration duration_diff \n", + "602 6 0.0 \n", + "603 8 0.0 \n", + "604 31 0.0 \n", + "605 5 0.0 \n", + "606 8 0.0 \n", + "607 55 0.0 \n", + "608 55 0.0 \n", + "609 55 0.0 \n", + "610 126 0.0 \n", + "611 126 0.0 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what's the difference between the two?\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DQumLSXZW7r6", + "outputId": "fc5196d6-9d9a-430e-ecb2-c142676c3614" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "duration_diff\n", + "0.0 19728\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what do the duration differences look like?\n", + "borrow_duration.duration_diff.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r0TUYWzSXIil" + }, + "source": [ + "Woohoo, everything matches! 
🎉\n", + "\n", + "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in the undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., December to January), which has now been corrected.\n", + "\n", + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "-Bq76gtDWljg", + "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
member_namesstart_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [member_names, start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", + "Index: []" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "borrow_duration[borrow_duration.duration_diff != 0]" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "Rebecca Sutton Koeser" + } + ], + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.cfg b/setup.cfg index 8d04412..dc228d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,10 +15,10 @@ keywords = "dates dating uncertainty uncertain-dates unknown partially-known dig classifiers = Development Status :: 2 - Pre-Alpha Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Intended Audience :: Developers License :: OSI Approved :: Apache Software License Operating System :: OS Independent @@ -40,6 +40,11 @@ packages = find: python_requires = >=3.8 install_requires = python-dateutil + lark + +[options.package_data] +* = + *.lark [options.extras_require] all = @@ -58,23 +63,24 @@ test = pytest-ordering pytest-cov docs = - sphinx + sphinx<7.0.0 sphinx_rtd_theme m2r2 +# pin sphinx because 7.0 currently not compatible with rtd theme [options.packages.find] where = src [tox:tox] -envlist = py38, py39, py310, py311 +envlist = py39, py310, py311, py312 isolated_build = True [gh-actions] 
python = - 3.8: py38 3.9: py39 3.10: py310 3.11: py311 + 3.12: py312 [pytest] minversion = 6.0 diff --git a/src/undate/__init__.py b/src/undate/__init__.py index 3dc1f76..d3ec452 100644 --- a/src/undate/__init__.py +++ b/src/undate/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/src/undate/dateformat/base.py b/src/undate/dateformat/base.py index eafebe5..f4435f4 100644 --- a/src/undate/dateformat/base.py +++ b/src/undate/dateformat/base.py @@ -15,7 +15,7 @@ import importlib import logging import pkgutil -from typing import Dict +from typing import Dict, Type from functools import lru_cache # functools.cache not available until 3.9 @@ -41,11 +41,12 @@ def to_string(self, undate) -> str: # cache import class method to ensure we only import once @classmethod @lru_cache - def import_formatters(cls): + def import_formatters(cls) -> int: """Import all undate.dateformat formatters so that they will be included in available formatters even if not explicitly imported. Only import once. 
returns the count of modules imported.""" + logger.debug("Loading formatters under undate.dateformat") import undate.dateformat @@ -65,7 +66,7 @@ def import_formatters(cls): return import_count @classmethod - def available_formatters(cls) -> Dict[str, "BaseDateFormat"]: + def available_formatters(cls) -> Dict[str, Type["BaseDateFormat"]]: # ensure undate formatters are imported cls.import_formatters() return {c.name: c for c in cls.__subclasses__()} # type: ignore diff --git a/src/undate/dateformat/edtf/__init__.py b/src/undate/dateformat/edtf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/undate/dateformat/edtf/edtf.lark b/src/undate/dateformat/edtf/edtf.lark new file mode 100644 index 0000000..6b8e5aa --- /dev/null +++ b/src/undate/dateformat/edtf/edtf.lark @@ -0,0 +1,64 @@ +%import common.ESCAPED_STRING -> STRING +%import common.INT -> INT +%import common.WS +%ignore WS + +// --- EDTF / ISO 8601-2 --- + +?edtf: edtf_level0 | edtf_level1 + +// --- EDTF Level 0 / ISO 8601-1 --- + +?edtf_level0: date | timeinterval +// not implementing datetime for now + +date: year | year "-" month | year "-" month "-" day + +year: INT +month: /(0[1-9])|(1[0-2])/ +day: /([0-2][1-9])|(3[0-1])/ + +timeinterval: date "/" date + + +// EDTF Level 1 + +?edtf_level1: date_level1 | extended_interval + +// qualification may occur at the end of the date +qualification: uncertain | approximate | uncertain_approximate + +uncertain: "?" +approximate: "~" +uncertain_approximate: "%" + +// The character 'X' may be used in place of one or more rightmost +// digits to indicate that the value of that digit is unspecified +unspecified: /X/ +?year_unspecified: /\d+/ unspecified+ +?month_unspecified: "0".."1"? unspecified ~ 1..2 +//?year_month_unspecified: year_l1 "-" month_unspecified +?day_unspecified: "0".."3"? 
unspecified ~ 1..2 + +// 'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999. +year_fivedigitsplus: /Y\d{5,}/ +?year_l1: year_fivedigitsplus | year | year_unspecified + +// The values 21, 22, 23, 24 may be used used to signify +// ' Spring', 'Summer', 'Autumn', 'Winter', respectively, +// in place of a month value (01 through 12) for a year-and-month format string. +season: /2[1-4]/ +?year_season: year_l1 "-" season + +date_level1: (year_l1 + | year_l1 "-" (month | month_unspecified) + | year_l1 "-" (month | month_unspecified) "-" (day | day_unspecified) + | year_season) qualification? + +// unknown date: double dot or empty string +unknown_date: ".."? +extended_interval: date_level1 "/" date_level1 + | date_level1 "/" unknown_date + | unknown_date "/" date_level1 + +// negative calendar year? \ No newline at end of file diff --git a/src/undate/dateformat/edtf/parser.py b/src/undate/dateformat/edtf/parser.py new file mode 100644 index 0000000..8826b2d --- /dev/null +++ b/src/undate/dateformat/edtf/parser.py @@ -0,0 +1,46 @@ +import os.path + +from lark import Lark + + +grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") + +with open(grammar_path) as grammar: + edtf_parser = Lark(grammar.read(), start="edtf") + + +# testcases = [ +# "1984", +# "1984-05", +# "1984-12", +# "1001-03-30", +# "1000/2000", +# "1000-01/2000-05-01", +# # level 1 +# "Y170000002", +# "2001-21", # spring 2001 +# # qualifiers +# "1984?", +# "2004-06~", +# "2004-06-11%", +# # unspecified digits from right +# "201X", +# "20XX", +# "2004-XX", +# "1985-04-XX", +# "1985-XX-XX", +# # open ended intervals +# "1985-04-12/..", +# "1985-04/..", +# "../1985-04-12", +# "/1985-04-12", +# "1984-13", +# ] + +# for testcase in testcases: +# print(f"\n{testcase}") +# tree = edtf_parser.parse(testcase) +# print(tree.pretty()) + + +# error_cases = ["1984-13", 
"Y1702"] diff --git a/src/undate/dateformat/edtf/transformer.py b/src/undate/dateformat/edtf/transformer.py new file mode 100644 index 0000000..cca3609 --- /dev/null +++ b/src/undate/dateformat/edtf/transformer.py @@ -0,0 +1,70 @@ +from lark import Transformer, Tree, Token +from undate.undate import Undate, UndateInterval + + +class EDTFTransformer(Transformer): + """transform edtf parse tree and return Undate or UndateInterval""" + + INT = int + + def timeinterval(self, items): + # transformed result from parser should be two undate objects; + # combine into an interval + return UndateInterval(*items) + + def date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one value; + # anonymous tokens convert to their value + value = child.children[0] + # convert to integer when possible; otherwise pass as string + try: + value = int(value) + except ValueError: + value = str(value) + parts[str(child.data)] = value + + return Undate(**parts) + + def extended_interval(self, items): + # same as level 1 time interval, except one item may be None + # for an open-ended range + return self.timeinterval(items) + + def unknown_date(self, token): + # unknown date for interval should be passed in as None + return None + + def get_values(self, items): + # get a list of values from tokens; recurses to get subtree tokens + values = [] + for i in items: + if isinstance(i, Token): + values.append(str(i)) + if isinstance(i, Tree): + values.extend(self.get_values(i.children)) + return values + + def year_unspecified(self, items): + # combine parts (numeric & unknown) into a single string + value = "".join(self.get_values(items)) + return Tree(data="year", children=[value]) + + def month_unspecified(self, items): + value = "".join(self.get_values(items)) + return Tree(data="month", children=[value]) + + def day_unspecified(self, items): + value = "".join(self.get_values(items)) + return Tree(data="day", 
children=[value]) + + def date_level1(self, items): + return self.date(items) + + def year_fivedigitsplus(self, token): + # rule callback receives a list with one token; strip the leading Y and convert to integer + # TODO: undate is currently limited to 4-digit years + # (datetime max year of 9999) + return Tree(data="year", children=[int(str(token[0]).lstrip("Y"))]) diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 22aff07..f1c5cca 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -28,7 +28,12 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: if len(parts) == 1: return self._parse_single_date(parts[0]) elif len(parts) == 2: - return UndateInterval(*[self._parse_single_date(p) for p in parts]) + # date range; parse both parts and initialize an interval + start, end = [self._parse_single_date(p) for p in parts] + return UndateInterval(start, end) + else: + # more than two parts = unexpected input + raise ValueError def _parse_single_date(self, value: str) -> Undate: # split single iso date into parts; convert to int or None @@ -48,17 +53,16 @@ def to_string(self, undate: Undate) -> str: date_parts: List[Union[str, None]] = [] # for each part of the date that is known, generate the string format # then combine - for date_portion, known in undate.known_values.items(): - if known: + # TODO: should error if we have year and day but no month + for date_portion, iso_format in self.iso_format.items(): + if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year # results in leading zero in some environments # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) else: - date_parts.append( - undate.earliest.strftime(self.iso_format[date_portion]) - ) + date_parts.append(undate.earliest.strftime(iso_format)) elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/src/undate/undate.py 
b/src/undate/undate.py index 566f869..ee94bb6 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,5 +1,7 @@ import datetime from calendar import monthrange +from enum import IntEnum +import re # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None from typing import Optional, Dict, Union @@ -9,66 +11,168 @@ from undate.dateformat.base import BaseDateFormat -# duration of a single day +#: duration of a single day ONE_DAY = datetime.timedelta(days=1) +class DatePrecision(IntEnum): + """date precision, to indicate date precision independent from how much + of the date is known.""" + + # numbers should be set to allow logical greater than / less than + # comparison, e.g. year precision > month + + #: day + DAY = 1 + #: month + MONTH = 2 + #: year + YEAR = 3 + + def __str__(self): + return f"{self.name}" + + class Undate: """Simple object for representing uncertain, fuzzy or partially unknown dates""" - DEFAULT_FORMAT = "ISO8601" + DEFAULT_FORMAT: str = "ISO8601" - earliest: Union[datetime.date, None] = None - latest: Union[datetime.date, None] = None + #: symbol for unknown digits within a date value + MISSING_DIGIT: str = "X" + + earliest: datetime.date + latest: datetime.date #: A string to label a specific undate, e.g. "German Unity Date 2022" for Oct. 3, 2022. #: Labels are not taken into account when comparing undate objects. label: Union[str, None] = None - formatter: Union[BaseDateFormat, None] = None + formatter: BaseDateFormat + #: precision of the date (day, month, year, etc.) 
+ precision: DatePrecision + + #: known non-leap year + NON_LEAP_YEAR: int = 2022 def __init__( self, - year: Optional[int] = None, - month: Optional[int] = None, - day: Optional[int] = None, + year: Optional[Union[int, str]] = None, + month: Optional[Union[int, str]] = None, + day: Optional[Union[int, str]] = None, formatter: Optional[BaseDateFormat] = None, label: Optional[str] = None, ): - # TODO: support initializing for unknown values in each of these - # e.g., maybe values could be string or int; if string with - # unknown digits, calculate min/max for unknowns + # keep track of initial values and which values are known + self.initial_values: Dict[str, Optional[Union[int, str]]] = { + "year": year, + "month": month, + "day": day, + } + if day: + self.precision = DatePrecision.DAY + elif month: + self.precision = DatePrecision.MONTH + elif year: + self.precision = DatePrecision.YEAR + + # TODO: refactor partial date min/max calculations + + if year is not None: + try: + year = int(year) + # update initial value since it is used to determine + # whether or not year is known + self.initial_values["year"] = year + min_year = max_year = year + except ValueError: + # year is a string that can't be converted to int + min_year = int(str(year).replace(self.MISSING_DIGIT, "0")) + max_year = int(str(year).replace(self.MISSING_DIGIT, "9")) + else: + min_year = datetime.MINYEAR + max_year = datetime.MAXYEAR + + # if month is passed in as a string but completely unknown, + # treat as none + # TODO: we should preserve this information somehow; + # difference between just a year and an unknown month within a year + # maybe in terms of granularity / size ? 
+ if month == "XX": + month = None + + min_month = 1 + max_month = 12 + if month is not None: + try: + # treat as an integer if we can + month = int(month) + # update initial value + self.initial_values["month"] = month + min_month = max_month = month + except ValueError: + # if not, calculate min/max for missing digits + min_month, max_month = self._missing_digit_minmax( + str(month), min_month, max_month + ) + + # similar to month above — unknown day, but day-level granularity + if day == "XX": + day = None + + if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): + day = int(day) + # update initial value - fully known day + self.initial_values["day"] = day + min_day = max_day = day + else: + # if we have no day or partial day, calculate min / max + min_day = 1 + # if we know year and month (or max month), calculate exactly + if year and month: + _, max_day = monthrange(int(year), max_month) + elif year is None and month: + # If we have a month but no year, + # calculate based on a known non-leap year + # (better than just setting 31, but still not great) + _, max_day = monthrange(self.NON_LEAP_YEAR, max_month) + else: + max_day = 31 + + # if day is partially specified, narrow min/max further + if day is not None: + min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(year or datetime.MINYEAR, month or 1, day or 1) - # if day is unknown but we have year and month, calculate max day - if day is None and year and month: - _, maxday = monthrange(year, month) - elif day is None and year is None and month: - # TODO: what to do if we don't have year and month? 
- # This will produce bad data if the year is a leap year and the month is February - # 2022 chosen below as it is not a not leap year - # Better than just setting 31, but still not great - _, maxday = monthrange(2022, month) - else: - maxday: int = 31 - self.latest = datetime.date( - year or datetime.MAXYEAR, month or 12, day or maxday - ) - # keep track of which values are known - self.known_values: Dict[str, bool] = { - "year": year is not None, - "month": month is not None, - "day": day is not None, - } + self.earliest = datetime.date(min_year, min_month, min_day) + self.latest = datetime.date(max_year, max_month, max_day) - if not formatter: - # TODO subclass definitions not available unless they are imported where Undate() is called - formatter = BaseDateFormat.available_formatters()[self.DEFAULT_FORMAT]() + if formatter is None: + # import all subclass definitions; initialize the default + formatter_cls = BaseDateFormat.available_formatters()[self.DEFAULT_FORMAT] + formatter = formatter_cls() self.formatter = formatter self.label = label def __str__(self) -> str: + # if any portion of the date is partially known, construct + # pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits + # (temporary, should switch to default format that can handle it, e.g. 
EDTF) + if any(self.is_partially_known(part) for part in ["year", "month", "day"]): + # initial values could be either string or int + year = self.initial_values["year"] + month = self.initial_values["month"] + day = self.initial_values["day"] + # if integer, convert to string with correct number of digits + # replace unknown year with - for --MM or --MM-DD format + parts = [ + f"{year:04d}" if isinstance(year, int) else year or "-", + f"{month:02d}" if isinstance(month, int) else month, + f"{day:02d}" if isinstance(day, int) else day, + ] + # combine, skipping any values that are None + return "-".join([str(p) for p in parts if p is not None]) + return self.formatter.to_string(self) def __repr__(self) -> str: @@ -76,24 +180,192 @@ def __repr__(self) -> str: return "" % (self.label, self) return "" % self - def __eq__(self, other: "Undate") -> bool: - # question: should label be taken into account when checking equality? - # for now, assuming label differences don't matter for comparing dates - return ( + def _comparison_type(self, other: object) -> "Undate": + """Common logic for type handling in comparison methods. + Converts to Undate object if possible, otherwise returns + NotImplemented. 
Currently only supports conversion + from :class:`datetime.date` + """ + + # support datetime.date by converting to undate + if isinstance(other, datetime.date): + other = Undate.from_datetime_date(other) + + # recommended to support comparison with arbitrary objects + if not isinstance(other, Undate): + return NotImplemented + + return other + + def __eq__(self, other: object) -> bool: + # Note: assumes label differences don't matter for comparing dates + + other = self._comparison_type(other) + + # unsupported type: return the sentinel so python falls back to default handling + if other is NotImplemented: + return NotImplemented + + # check for apparent equality + looks_equal = ( self.earliest == other.earliest and self.latest == other.latest - and self.known_values == other.known_values + and self.initial_values == other.initial_values + ) + # if everything looks the same, check for any unknowns in initial values + # the same unknown date should NOT be considered equal + # (but do we need a different equivalence check for this?) + + # NOTE: assumes that partially known values can only be written + # in one format (i.e. X for missing digits). 
+ # If we support other formats, will need to normalize to common + # internal format for comparison + if looks_equal and any("X" in str(val) for val in self.initial_values.values()): + return False + + return looks_equal + + def __lt__(self, other: object) -> bool: + other = self._comparison_type(other) + + # if this date ends before the other date starts, + # return true (this date is earlier, so it is less) + if self.latest < other.earliest: + return True + + # if the other one ends before this one starts, + # return false (this date is later, so it is not less) + if other.latest < self.earliest: + return False + + # if it does not, check if one is included within the other + # (e.g., single date within the same year) + # comparison for those cases is not currently supported + elif other in self or self in other: + raise NotImplementedError( + "Can't compare when one date falls within the other" + ) + # NOTE: unsupported comparisons are supposed to return NotImplemented + # However, doing that in this case results in a confusing TypeError! + # TypeError: '<' not supported between instances of 'Undate' and 'Undate' + # How to handle when the comparison is ambiguous / indeterminate? + # we may need a tribool / ternary type (true, false, unknown), + # but not sure what python builtin methods will do with it (unknown = false?) + + # for any other case (i.e., self == other), return false + return False + + def __gt__(self, other: object) -> bool: + # define gt ourselves so we can support > comparison with datetime.date, + # but rely on existing less than implementation. 
+ # strictly greater than must rule out equals + return not (self < other or self == other) + + def __le__(self, other: Union["Undate", datetime.date]) -> bool: + return self == other or self < other + + def __contains__(self, other: object) -> bool: + # if the two dates are strictly equal, don't consider + # either one as containing the other + other = self._comparison_type(other) + + if self == other: + return False + + return ( + self.earliest <= other.earliest + and self.latest >= other.latest + # is precision sufficient for comparing partially known dates? + and self.precision > other.precision ) + @staticmethod + def from_datetime_date(dt_date): + """Initialize an :class:`Undate` object from a :class:`datetime.date`""" + return Undate(dt_date.year, dt_date.month, dt_date.day) + @property def known_year(self) -> bool: - return self.known_values["year"] + return self.is_known("year") + + def is_known(self, part: str) -> bool: + """Check if a part of the date (year, month, day) is known. + Returns False if unknown or only partially known.""" + # TODO: should we use constants or enum for values? + + # if we have an integer, then consider the date known + # if we have a string, then it is only partially known; return false + return isinstance(self.initial_values[part], int) + + def is_partially_known(self, part: str) -> bool: + return isinstance(self.initial_values[part], str) def duration(self) -> datetime.timedelta: - # what is the duration of this date? - # subtract earliest from latest, and add a day to count the starting day + """What is the duration of this date? 
+ Calculate based on earliest and latest date within range, + taking into account the precision of the date even if not all + parts of the date are known.""" + + # if precision is a single day, duration is one day + # no matter when it is or what else is known + if self.precision == DatePrecision.DAY: + return ONE_DAY + + # if precision is month and year is unknown, + # calculate month duration within a single year (not min/max) + if self.precision == DatePrecision.MONTH: + latest = self.latest + if not self.known_year: + # if year is unknown, calculate month duration in + # a single year + latest = datetime.date( + self.earliest.year, self.latest.month, self.latest.day + ) + delta = latest - self.earliest + ONE_DAY + # month duration can't ever be more than 31 days + # (could we ever know if it's smaller?) + + # if granularity == month but not known month, duration = 31 + if delta.days > 31: + return datetime.timedelta(days=31) + return delta + + # otherwise, calculate based on earliest/latest range + + # subtract earliest from latest and add a day to count start day return self.latest - self.earliest + ONE_DAY + def _missing_digit_minmax( + self, value: str, min_val: int, max_val: int + ) -> tuple[int, int]: + # given a possible range, calculate min/max values for a string + # with a missing digit + + # assuming two digit only (i.e., month or day) + possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] + # ensure input value has two digits + value = "%02s" % value + # generate regex where missing digit matches anything + val_pattern = re.compile(value.replace(self.MISSING_DIGIT, ".")) + # identify all possible matches, then get min and max + matches = [val for val in possible_values if val_pattern.match(val)] + min_match = min(matches) + max_match = max(matches) + + # split input string into a list so we can update individually + new_min_val = list(value) + new_max_val = list(value) + for i, digit in enumerate(value): + # replace the corresponding 
digit with our min and max + if digit == self.MISSING_DIGIT: + new_min_val[i] = min_match[i] + new_max_val[i] = max_match[i] + + # combine the lists of digits back together and convert to int + min_val = int("".join(new_min_val)) + max_val = int("".join(new_max_val)) + return (min_val, max_val) + class UndateInterval: """A date range between two uncertain dates. @@ -107,12 +379,15 @@ class UndateInterval: """ # date range between two uncertain dates + earliest: Union[Undate, None] + latest: Union[Undate, None] + label: Union[str, None] def __init__( self, - earliest: Union[Undate, None] = None, - latest: Union[Undate, None] = None, - label: Union[str, None] = None, + earliest: Optional[Undate] = None, + latest: Optional[Undate] = None, + label: Optional[str] = None, ): # for now, assume takes two undate objects self.earliest = earliest @@ -140,6 +415,10 @@ def duration(self) -> datetime.timedelta: """ # what is the duration of this date range? + # if range is open-ended, can't calculate + if self.earliest is None or self.latest is None: + return NotImplemented + # if both years are known, subtract end of range from beginning of start if self.latest.known_year and self.earliest.known_year: return self.latest.latest - self.earliest.earliest + ONE_DAY @@ -148,13 +427,18 @@ def duration(self) -> datetime.timedelta: elif not self.latest.known_year and not self.earliest.known_year: # under what circumstances can we assume that if both years # are unknown the dates are in the same year or sequential? 
- duration = self.latest.earliest - self.earliest.earliest + ONE_DAY + duration = self.latest.earliest - self.earliest.earliest # if we get a negative, we've wrapped from end of one year - # to the beginning of the next + # to the beginning of the next; + # recalculate assuming second date is in the subsequent year if duration.days < 0: end = self.latest.earliest + relativedelta(years=1) duration = end - self.earliest.earliest + # add the additional day *after* checking for a negative + # or after recalculating with adjusted year + duration += ONE_DAY + return duration else: diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py new file mode 100644 index 0000000..5a2b8ea --- /dev/null +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -0,0 +1,46 @@ +import pytest + +from undate.dateformat.edtf.parser import edtf_parser + +# for now, just test that valid dates can be parsed + +testcases = [ + "1984", + "1984-05", + "1984-12", + "1001-03-30", + "1000/2000", + "1000-01/2000-05-01", + # level 1 + "Y170000002", + "2001-21", # spring 2001 + # qualifiers + "1984?", + "2004-06~", + "2004-06-11%", + # unspecified digits from right + "201X", + "20XX", + "2004-XX", + "1985-04-XX", + "1985-XX-XX", + # open ended intervals + "1985-04-12/..", + "1985-04/..", + "../1985-04-12", + "/1985-04-12", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert edtf_parser.parse(date_string) + + +error_cases = ["1984-13", "Y1702"] + + +@pytest.mark.parametrize("date_string", error_cases) +def test_should_error(date_string): + with pytest.raises(Exception): + edtf_parser.parse(date_string) diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py new file mode 100644 index 0000000..3271b8b --- /dev/null +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -0,0 +1,46 @@ +import pytest + +from undate.undate import 
Undate, UndateInterval +from undate.dateformat.edtf.parser import edtf_parser +from undate.dateformat.edtf.transformer import EDTFTransformer + +# for now, just test that valid dates can be parsed + +testcases = [ + ("1984", Undate(1984)), + ("1984-05", Undate(1984, 5)), + ("1984-12", Undate(1984, 12)), + ("1001-03-30", Undate(1001, 3, 30)), + ("1000/2000", UndateInterval(Undate(1000), Undate(2000))), + ("1000-01/2000-05-01", UndateInterval(Undate(1000, 1), Undate(2000, 5, 1))), + # # level 1 + # NOTE: undate currently doesn't support most of the level 1 functionality + # NOTE: undate currently doesn't support years beyond 9999 (datetime.MAXYEAR) + # ("Y17000002", Undate(17000002)), + # "2001-21", # spring 2001 + # # qualifiers + # "1984?", + # "2004-06~", + # "2004-06-11%", + # # unspecified digits from right + ("201X", Undate("201X")), + ("20XX", Undate("20XX")), + ("2004-XX", Undate(2004, "XX")), + ("1985-04-XX", Undate(1985, 4, "XX")), + ("1985-XX-XX", Undate(1985, "XX", "XX")), + # # open ended intervals + ("1985-04-12/..", UndateInterval(Undate(1985, 4, 12), None)), + ("1985-04/..", UndateInterval(Undate(1985, 4), None)), + ("../1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), + ("/1985-04-12", UndateInterval(None, Undate(1985, 4, 12))), +] + + +@pytest.mark.parametrize("date_string,expected", testcases) +def test_transform(date_string, expected): + transformer = EDTFTransformer() + # parse the input string, then transform to undate object + parsetree = edtf_parser.parse(date_string) + # since the same unknown date is not considered strictly equal, + # compare object representations + assert repr(transformer.transform(parsetree)) == repr(expected) diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py index 63568f0..3687a37 100644 --- a/tests/test_dateformat/test_base.py +++ b/tests/test_dateformat/test_base.py @@ -31,9 +31,12 @@ def test_parse_to_string(self): BaseDateFormat().to_string(1991) -@pytest.mark.first def 
test_import_formatters_import_only_once(caplog): - # run first so we can confirm it runs once + # clear the cache, since any instantiation of an Undate + # object anywhere in the test suite will populate it + BaseDateFormat.import_formatters.cache_clear() + + # run first, and confirm it runs and loads formatters with caplog.at_level(logging.DEBUG): import_count = BaseDateFormat.import_formatters() # should import at least one thing (iso8601) diff --git a/tests/test_undate.py b/tests/test_undate.py index 9217ea4..cf0d9ce 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,8 +1,13 @@ -from datetime import timedelta +from datetime import timedelta, date import pytest -from undate.undate import Undate, UndateInterval +from undate.undate import Undate, UndateInterval, DatePrecision + + +class TestDatePrecision: + def test_str(self): + assert str(DatePrecision.YEAR) == "YEAR" class TestUndate: @@ -12,6 +17,17 @@ def test_str(self): assert str(Undate(2022)) == "2022" assert str(Undate(month=11, day=7)) == "--11-07" + def test_partially_known_str(self): + assert str(Undate("19XX")) == "19XX" + assert str(Undate(2022, "1X")) == "2022-1X" + assert str(Undate(2022, 11, "2X")) == "2022-11-2X" + assert str(Undate(month="1X", day=7)) == "--1X-07" + + # TODO: should not allow initializing year/day without month; + # should we infer unknown month? or raise an exception? 
+ # assert str(Undate(2022, day="2X")) == "2022-XX-2X" # currently returns 2022-2X + # assert str(Undate(2022, day=7)) == "2022-XX-07" @ currently returns 2022-07 + def test_repr(self): assert repr(Undate(2022, 11, 7)) == "" assert ( @@ -19,17 +35,118 @@ def test_repr(self): == "" ) + def test_init_str(self): + assert Undate("2000").earliest.year == 2000 + # single or double digit string month should be ok + assert Undate("2000", "2").earliest.month == 2 + assert Undate("2000", "02").earliest.month == 2 + + def test_init_partially_known_year(self): + uncertain1900s = Undate("19XX") + assert uncertain1900s.earliest.year == 1900 + assert uncertain1900s.latest.year == 1999 + + uncertain1x = Undate("1X05") + assert uncertain1x.earliest.year == 1005 + assert uncertain1x.latest.year == 1905 + + uncertain18x7 = Undate("18X7") + assert uncertain18x7.earliest.year == 1807 + assert uncertain18x7.latest.year == 1897 + + def test_init_partially_known_month(self): + uncertain_fall = Undate(1900, "1X") + assert uncertain_fall.earliest.month == 10 + assert uncertain_fall.latest.month == 12 + + uncertain_notfall = Undate(1900, "0X") + assert uncertain_notfall.earliest.month == 1 + assert uncertain_notfall.latest.month == 9 + + # unlikely case, but now possible to calculate + assert Undate(1900, "X1").earliest.month == 1 + assert Undate(1900, "X1").latest.month == 11 + + # treat as unknown but allow + unknown_month = Undate(1900, "XX") + assert unknown_month.earliest.month == 1 + assert unknown_month.latest.month == 12 + assert str(unknown_month) == "1900-XX" + + def test_init_partially_known_day(self): + uncertain_day = Undate(1900, 1, "XX") # treat as None + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "1X") + assert uncertain_day.earliest.day == 10 + assert uncertain_day.latest.day == 19 + + uncertain_day = Undate(1900, 1, "0X") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 9 + 
uncertain_day = Undate(1900, 1, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 29 + uncertain_day = Undate(1900, 1, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "X5") + assert uncertain_day.earliest.day == 5 + assert uncertain_day.latest.day == 25 + + uncertain_day = Undate(1900, 1, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + # month with only 30 days + uncertain_day = Undate(1900, 6, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 21 # doesn't go to 31 + uncertain_day = Undate(1900, 6, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 30 + + # special cases + # february! 28 days usually + uncertain_day = Undate(1900, 2, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 28 + # february in a leap year + uncertain_day = Undate(2024, 2, "2X") + assert uncertain_day.latest.day == 29 + + def test_init_invalid(self): + with pytest.raises(ValueError): + Undate("19xx") + def test_invalid_date(self): # invalid month should raise an error with pytest.raises(ValueError): Undate(1990, 22) + def test_from_datetime_date(self): + undate_from_date = Undate.from_datetime_date(date(2001, 3, 5)) + assert isinstance(undate_from_date, Undate) + assert undate_from_date == Undate(2001, 3, 5) + def test_eq(self): assert Undate(2022) == Undate(2022) assert Undate(2022, 10) == Undate(2022, 10) assert Undate(2022, 10, 1) == Undate(2022, 10, 1) assert Undate(month=2, day=7) == Undate(month=2, day=7) + def test_eq_datetime_date(self): + # support comparisons with datetime objects for full day-precision + assert Undate(2022, 10, 1) == date(2022, 10, 1) + assert Undate(2022, 10, 1) != date(2022, 10, 2) + assert Undate(1980, 10, 1) != date(2022, 10, 1) + + # other date precisions are not equal + assert Undate(2022) != date(2022, 
10, 1) + assert Undate(2022, 10) != date(2022, 10, 1) + def test_not_eq(self): assert Undate(2022) != Undate(2023) assert Undate(2022, 10) != Undate(2022, 11) @@ -38,6 +155,140 @@ def test_not_eq(self): assert Undate(2022) != Undate(2022, 10) assert Undate(2022, 10) != Undate(2022, 10, 1) + # partially unknown dates should NOT be considered equal + assert Undate("19XX") != Undate("19XX") + assert Undate(1980, "XX") != Undate(1980, "XX") + + testdata_lt_gt = [ + # dates to test for gt/lt comparison: earlier date, later date + # - simple cases: same precision where one date is clearly earlier + (Undate(2022), Undate(2023)), + (Undate(1991, 1), Undate(1991, 5)), + (Undate(1856, 3, 3), Undate(1856, 3, 21)), + # - mixed precision where one date is clearly earlier + (Undate(1991, 1), Undate(2000)), + (Undate(1856, 3, 3), Undate(1901)), + # partially known digits where comparison is possible + (Undate("19XX"), Undate("20XX")), + (Undate(1900, "0X"), Undate(1900, "1X")), + # compare with datetime.date objects + (Undate("19XX"), date(2020, 1, 1)), + (Undate(1991, 1), date(1992, 3, 4)), + ] + + @pytest.mark.parametrize("earlier,later", testdata_lt_gt) + def test_lt(self, earlier, later): + assert earlier < later + assert later > earlier + + testdata_lte_gte = testdata_lt_gt.copy() + # add a few exactly equal cases + testdata_lte_gte.extend( + [ + (Undate(1601), Undate(1601)), + (Undate(1991, 1), Undate(1991, 1)), + (Undate(1492, 5, 3), Undate(1492, 5, 3)), + # compare with datetime.date also + (Undate(1492, 5, 3), date(1492, 5, 3)), + ] + ) + + def test_lt_when_eq(self): + # strict less than / greater should return false when equal + assert not Undate(1900) > Undate(1900) + assert not Undate(1900) < Undate(1900) + # same for datetime.date + assert not Undate(1903, 1, 5) < date(1903, 1, 5) + assert not Undate(1903, 1, 5) > date(1903, 1, 5) + + @pytest.mark.parametrize("earlier,later", testdata_lte_gte) + def test_lte(self, earlier, later): + assert earlier <= later + assert 
later >= earlier + + def test_lt_notimplemented(self): + # how to compare mixed precision where dates overlap? + # if the second date falls *within* earliest/latest, + # then it is not clearly less; not implemented? + with pytest.raises(NotImplementedError, match="date falls within the other"): + assert Undate(2022) < Undate(2022, 5) + + # same if we attempt to compare in the other direction + with pytest.raises(NotImplementedError, match="date falls within the other"): + assert Undate(2022, 5) < Undate(2022) + + testdata_contains = [ + # first date falls within the range of the other + # dates within range: middle, start, end, varying precision + (Undate(2022, 6), Undate(2022)), + (Undate(2022, 1, 1), Undate(2022)), + (Undate(2022, 12, 31), Undate(2022)), + (Undate(2022, 6, 15), Undate(2022, 6)), + # support contains with datetime.date + (date(2022, 6, 1), Undate(2022)), + (date(2022, 6, 1), Undate(2022, 6)), + ] + + @pytest.mark.parametrize("date1,date2", testdata_contains) + def test_contains(self, date1, date2): + assert date1 in date2 + + testdata_not_contains = [ + # dates not in range + (Undate(1980), Undate(2020)), + (Undate(1980), Undate(2020, 6)), + (Undate(1980, 6), Undate(2020, 6)), + # support contains with datetime.date + (date(1980, 6, 1), Undate(2022)), + (date(3001, 6, 1), Undate(2022, 6)), + # partially known dates that are similar but same precision, + # so one does not contain the other + (Undate("199X"), Undate("19XX")), + # - specific month to unknown month + (Undate(1980, 6), Undate(1980, "XX")), + # some of these might overlap, but we don't have enough + # information to determine + # - unknown month to unknown month + (Undate(1980, "XX"), Undate(1980, "XX")), + # - partially unknown month to unknown month + (Undate(1801, "1X"), Undate(1801, "XX")), + ] + + @pytest.mark.parametrize("date1,date2", testdata_not_contains) + def test_not_contains(self, date1, date2): + assert date1 not in date2 + + def test_sorting(self): + # sorting should be 
possible based on gt/lt + # test simple cases for sorting + d1980 = Undate(1980) + d2002_10 = Undate(2002, 10) + d2002_12 = Undate(2002, 12) + d2012_05_01 = Undate(2012, 5, 1) + + assert sorted([d2012_05_01, d2002_12, d2002_10, d1980]) == [ + d1980, + d2002_10, + d2002_12, + d2012_05_01, + ] + + # what about semi-ambigous cases? + d1991_XX = Undate(1991, "XX") + d1992_01_XX = Undate(1992, 1, "XX") + assert sorted([d1992_01_XX, d1991_XX, d1980]) == [d1980, d1991_XX, d1992_01_XX] + + # what about things we can't compare? + d1991 = Undate(1991) + d1991_02 = Undate(1991, 2) + # for now, this will raise a not implemented error + with pytest.raises(NotImplementedError): + sorted([d1991_02, d1991, d1991_XX]) + + # TODO: partially known year? + # someyear = Undate("1XXX") + # assert sorted([d1991, someyear]) == [someyear, d1991] + def test_duration(self): day_duration = Undate(2022, 11, 7).duration() assert isinstance(day_duration, timedelta) @@ -56,9 +307,40 @@ def test_duration(self): leapyear_duration = Undate(2024).duration() assert leapyear_duration.days == 366 + def test_partiallyknown_duration(self): + # day in unknown month/year + assert Undate(day=5).duration().days == 1 + assert Undate(year=1900, month=11, day="2X").duration().days == 1 + + # month in unknown year + assert Undate(month=6).duration().days == 30 + # partially known month + assert Undate(year=1900, month="1X").duration().days == 31 + # what about february? 
+ # could vary with leap years, but assume non-leapyear + assert Undate(month=2).duration().days == 28 + def test_known_year(self): assert Undate(2022).known_year is True assert Undate(month=2, day=5).known_year is False + # partially known year is not known + assert Undate("19XX").known_year is False + # fully known string year should be known + assert Undate("1900").known_year is True + + def test_is_known_month(self): + assert Undate(2022).is_known("month") is False + assert Undate(2022, 2).is_known("month") is True + assert Undate(2022, "5").is_known("month") is True + assert Undate(2022, "1X").is_known("month") is False + assert Undate(2022, "XX").is_known("month") is False + + def test_is_known_day(self): + assert Undate(1984).is_known("day") is False + assert Undate(month=1, day=3).is_known("day") is True + assert Undate(month=1, day="5").is_known("day") is True + assert Undate(month=1, day="X5").is_known("day") is False + assert Undate(month=1, day="XX").is_known("day") is False class TestUndateInterval: @@ -136,4 +418,15 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 31 + assert month_noyear_duration.days == 32 + # this seems wrong, but we currently count both start and end dates + + # real case from Shakespeare and Company Project data; + # second date is a year minus one day in the future + month_noyear_duration = UndateInterval( + Undate(None, 6, 7), Undate(None, 6, 6) + ).duration() + assert month_noyear_duration.days == 365 + + # duration is not supported for open-ended intervals + assert UndateInterval(Undate(2000), None).duration() == NotImplemented