From 4f97dea7f6c06512dd65311ed916f1f2048b33cf Mon Sep 17 00:00:00 2001
From: Andy Shapiro
Date: Mon, 11 Nov 2024 21:28:24 -0500
Subject: [PATCH] Api updates (#79)

* pluralize sessions in batch
* fix type annotation
* updates
* improve coverage
* prepare 24.1a5.dev
* fix docs
* update docs
* remove extra whitespace

---
 docs/source/development.md                    |  39 ++++++
 .../source/recipes/custom-excel-exports.ipynb |   8 +-
 src/pybmds/__init__.py                        |   2 +-
 src/pybmds/batch.py                           | 114 +++++++++++++-----
 tests/test_pybmds/models/test_multi_tumor.py  |   2 +-
 tests/test_pybmds/test_batch.py               |  44 +++++--
 6 files changed, 166 insertions(+), 43 deletions(-)

diff --git a/docs/source/development.md b/docs/source/development.md
index 59fd3af0..3971b8ec 100644
--- a/docs/source/development.md
+++ b/docs/source/development.md
@@ -90,6 +90,45 @@ make docs-clean           # Clean documentation
 
 Using the `make serve` command is recommended for editing documentation; it updates the preview in realtime as files are saved.
 
+## Versioning
+
+We use [calendar versioning](https://calver.org/) for `pybmds`, where:
+
+* `major` is the year of the release (ex: `28` for a 2028 release)
+* `minor` is incremented for each release of the calendar year, starting at `1`
+* `aN` is the alpha release for testing, where `N` starts at `1`
+* `dev` is any upcoming pre-release currently under development
+
+As an example, consider the scenario where we're beginning development of our first release in 2028:
+
+* In `pybmds.__init__`, set `__version__ = "28.1a1.dev"`
+* Iterate until we're ready for an alpha release
+  * Update the version to `28.1a1`, and git tag the release `28.1a1`
+  * Immediately change the `main` branch to `28.1a2.dev`
+* Begin testing of `28.1a1`
+  * If changes are needed, iterate on `28.1a2.dev`
+  * If changes are not needed, release `28.1` by changing the version and minting a tag
+
+The [packaging](https://packaging.pypa.io/en/stable/index.html) package implements [PEP440](https://peps.python.org/pep-0440/), and can be used to check candidate versions:
+
+```python
+from packaging.version import Version
+
+Version('28.1a1.dev')
+# _Version(release=(28, 1), pre=('a', 1), dev=('dev', 0))
+```
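+
+Versions written this way also order and classify as expected; the snippet below is a quick sanity check (illustrative version numbers, using only the public `packaging` API):
+
+```python
+from packaging.version import Version
+
+# dev pre-releases sort before their alpha, which sorts before the final release
+assert Version("28.1a1.dev") < Version("28.1a1") < Version("28.1")
+assert Version("28.1a1").is_prerelease
+assert Version("28.1a1.dev").is_devrelease
+```
+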
 ### Priors Report
 
 The `pybmds` package includes Bayesian priors and frequentist parameter initialization settings that have been tuned to help improve model fit performance. To generate a report of the settings in all permutations, run the following command:
diff --git a/docs/source/recipes/custom-excel-exports.ipynb b/docs/source/recipes/custom-excel-exports.ipynb
index d31e28f9..8de4af43 100644
--- a/docs/source/recipes/custom-excel-exports.ipynb
+++ b/docs/source/recipes/custom-excel-exports.ipynb
@@ -101,9 +101,9 @@
     "    sess.add_model(Model, settings)\n",
     "\n",
     "    option_sets = [\n",
-    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1, \n",
+    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1,\n",
     "         pybmds.ContinuousDistType.normal),\n",
-    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1, \n",
+    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1,\n",
     "         pybmds.ContinuousDistType.normal_ncv),\n",
     "    ]\n",
     "    sessions = []\n",
@@ -225,7 +225,7 @@
    "outputs": [],
    "source": [
     "rows = []\n",
-    "for i, session in enumerate(sess_batch.session):\n",
+    "for i, session in enumerate(sess_batch.sessions):\n",
     "    for j, model in enumerate(session.models):\n",
     "        data = {\n",
     "            \"session_index\": i,\n",
@@ -356,7 +356,7 @@
    },
    "outputs": [],
    "source": [
-    "model = sess_batch.session[0].models[0]\n",
+    "model = sess_batch.sessions[0].models[0]\n",
     "res = model.results"
    ]
   },
diff --git a/src/pybmds/__init__.py b/src/pybmds/__init__.py
index f149e011..07b2fcbb 100644
--- a/src/pybmds/__init__.py
+++ b/src/pybmds/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "24.1a4"
+__version__ = "24.1a5"  # see docs/development for versioning
 
 from .batch import BatchResponse, BatchSession  # noqa: F401
 from .constants import DistType as ContinuousDistType  # noqa: F401
diff --git a/src/pybmds/batch.py b/src/pybmds/batch.py
index ab3f0ea2..3a886303 100644
--- a/src/pybmds/batch.py
+++ b/src/pybmds/batch.py
@@ -10,7 +10,8 @@
 import pandas as pd
 from tqdm import tqdm
 
-from .datasets.base import DatasetBase
+from .constants import Dtype
+from .datasets.base import DatasetType
 from .models.multi_tumor import Multitumor
 from .reporting.styling import Report, write_citation
 from .session import Session
@@ -25,11 +26,24 @@ class BatchBase:
     pass
 
 
+def _make_zip(data: str, archive: Path):
+    with zipfile.ZipFile(
+        archive, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
+    ) as zf:
+        zf.writestr("data.json", data=data)
+
+
+def _load_zip(archive: Path) -> str:
+    with zipfile.ZipFile(archive) as zf:
+        with zf.open("data.json") as f:
+            return f.read().decode()
+
+
 class BatchSession(BatchBase):
     def __init__(self, sessions: list[Session] | None = None):
         if sessions is None:
             sessions = []
-        self.session: list[Session] = sessions
+        self.sessions: list[Session] = sessions
         self.errors = []
 
     def df_summary(self) -> pd.DataFrame:
@@ -43,13 +57,13 @@ def df_summary(self) -> pd.DataFrame:
                 ),
                 clean=False,
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
     def df_dataset(self) -> pd.DataFrame:
         data: list[dict] = []
-        for idx, session in enumerate(self.session):
+        for idx, session in enumerate(self.sessions):
             data.extend(
                 session.dataset.rows(
                     extras=dict(
@@ -64,11 +78,16 @@ def df_dataset(self) -> pd.DataFrame:
 
     def df_params(self) -> pd.DataFrame:
         data: list[dict] = []
-        for idx, session in enumerate(self.session):
+        for idx, session in enumerate(self.sessions):
             for model_index, model in enumerate(session.models):
                 if model.has_results:
+                    func = (
+                        model.results.parameter_rows
+                        if session.dataset.dtype is Dtype.NESTED_DICHOTOMOUS
+                        else model.results.parameters.rows
+                    )
                     data.extend(
-                        model.results.parameters.rows(
+                        func(
                             extras=dict(
                                 session_index=idx,
                                 session_id=session.id,
@@ -122,7 +141,7 @@ def to_docx(
         if report is None:
             report = Report.build_default()
 
-        for session in self.session:
+        for session in self.sessions:
             session.to_docx(
                 report,
                 header_level=header_level,
@@ -133,7 +152,7 @@ def to_docx(
                 session_inputs_table=session_inputs_table,
             )
 
-        if citation and len(self.session) > 0:
+        if citation and len(self.sessions) > 0:
             write_citation(report, header_level=header_level)
 
         return report.document
@@ -144,16 +163,19 @@ def serialize(self) -> str:
 
         Returns:
             str: A JSON string
         """
-        return json.dumps([session.to_dict() for session in self.session])
+        return json.dumps([session.to_dict() for session in self.sessions])
 
     @classmethod
     def execute(
-        cls, datasets: list[DatasetBase], runner: Callable, nprocs: int | None = None
+        cls,
+        datasets: list[DatasetType],
+        runner: Callable[[DatasetType], BatchResponse],
+        nprocs: int | None = None,
     ) -> Self:
         """Execute sessions using multiple processors.
 
         Args:
-            datasets (list[DatasetBase]): The datasets to execute
+            datasets (list[DatasetType]): The datasets to execute
             runner (Callable[dataset] -> BatchResponse): The method which executes a session
             nprocs (Optional[int]): the number of processors to use; defaults to N-1. If 1 is
                 specified, the batch session is called linearly without a process pool
@@ -187,9 +209,9 @@ def execute(
         if result.success:
             if isinstance(result.content, list):
                 for item in result.content:
-                    batch.session.append(Session.from_serialized(item))
+                    batch.sessions.append(Session.from_serialized(item))
             else:
-                batch.session.append(Session.from_serialized(result.content))
+                batch.sessions.append(Session.from_serialized(result.content))
         else:
             batch.errors.append(result.content)
 
@@ -216,10 +238,7 @@ def load(cls, archive: Path) -> Self:
         Returns:
             BatchSession: An instance of this session
         """
-        with zipfile.ZipFile(archive) as zf:
-            with zf.open("data.json") as f:
-                data = f.read()
-        return BatchSession.deserialize(data)
+        return BatchSession.deserialize(_load_zip(archive))
 
     def save(self, archive: Path):
         """Save Session to a compressed zipfile
 
         Args:
             archive (Path): The zipfile path
         """
-        with zipfile.ZipFile(
-            archive, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
-        ) as zf:
-            zf.writestr("data.json", data=self.serialize())
+        return _make_zip(self.serialize(), archive)
 
 
 class MultitumorBatch(BatchBase):
     def __init__(self, sessions: list[Multitumor] | None = None):
         if sessions is None:
             sessions = []
-        self.session: list[Multitumor] = sessions
+        self.sessions: list[Multitumor] = sessions
         self.errors = []
 
     def to_docx(
@@ -260,20 +276,42 @@ def to_docx(
         if report is None:
             report = Report.build_default()
 
-        for session in self.session:
+        for session in self.sessions:
             session.to_docx(
                 report,
                 header_level=header_level,
                 citation=False,
             )
 
-        if citation and len(self.session) > 0:
+        if citation and len(self.sessions) > 0:
             write_citation(report, header_level=header_level)
 
         return report.document
 
     def serialize(self) -> str:
-        return json.dumps([session.to_dict() for session in self.session])
+        return json.dumps([session.to_dict() for session in self.sessions])
+
+    @classmethod
+    def execute(cls, datasets: list[dict], runner: Callable, nprocs: int | None = None) -> Self:
+        """Execute sessions using multiple processors.
+
+        Args:
+            datasets (list[dict]): The datasets to execute
+            runner (Callable[dict] -> Multitumor): The method which executes a session.
+            nprocs (Optional[int]): the number of processors to use; defaults to N-1. If 1 is
+                specified, the batch session is called sequentially
+
+        Returns:
+            A MultitumorBatch with sessions executed.
+        """
+        if nprocs is None:
+            nprocs = max(os.cpu_count() - 1, 1)
+
+        if nprocs > 1:
+            raise NotImplementedError("Not implemented (yet)")
+
+        sessions = [runner(dataset) for dataset in tqdm(datasets, desc="Executing...")]
+        return cls(sessions=sessions)
 
     @classmethod
     def deserialize(cls, data: str) -> Self:
@@ -287,7 +325,7 @@ def df_summary(self) -> pd.DataFrame:
                 extras=dict(session_index=idx),
                 clean=False,
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -296,7 +334,7 @@ def df_dataset(self) -> pd.DataFrame:
             session.datasets_df(
                 extras=dict(session_index=idx),
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -305,7 +343,7 @@ def df_params(self) -> pd.DataFrame:
             session.params_df(
                 extras=dict(session_index=idx),
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -320,3 +358,23 @@ def to_excel(self, path: Path | None = None) -> Path | BytesIO:
             for name, df in data.items():
                 df.to_excel(writer, sheet_name=name, index=False)
         return f
+
+    @classmethod
+    def load(cls, archive: Path) -> Self:
+        """Load a Session from a compressed zipfile
+
+        Args:
+            archive (Path): The zipfile path
+
+        Returns:
+            MultitumorBatch: An instance of this session
+        """
+        return cls.deserialize(_load_zip(archive))
+
+    def save(self, archive: Path):
+        """Save Session to a compressed zipfile
+
+        Args:
+            archive (Path): The zipfile path
+        """
+        return _make_zip(self.serialize(), archive)
diff --git a/tests/test_pybmds/models/test_multi_tumor.py b/tests/test_pybmds/models/test_multi_tumor.py
index 0e763f87..889ac4b2 100644
--- a/tests/test_pybmds/models/test_multi_tumor.py
+++ b/tests/test_pybmds/models/test_multi_tumor.py
@@ -31,7 +31,7 @@ def test_execute(self, mt_datasets, rewrite_data_files, data_path):
         df = session.datasets_df()
 
         # docx
-        docx = session.to_docx()
+        docx = session.to_docx(all_models=True, bmd_cdf_table=True)
 
         if rewrite_data_files:
             (data_path / "reports/multitumor.txt").write_text(text)
diff --git a/tests/test_pybmds/test_batch.py b/tests/test_pybmds/test_batch.py
index 50133250..aff9d297 100644
--- a/tests/test_pybmds/test_batch.py
+++ b/tests/test_pybmds/test_batch.py
@@ -1,6 +1,8 @@
 import tempfile
 from pathlib import Path
 
+import pytest
+
 import pybmds
 from pybmds.batch import BatchResponse, BatchSession, MultitumorBatch
 from pybmds.session import Session
@@ -17,10 +19,10 @@ def _batch_run(ds):
 class TestBatchSession:
     def test_execute(self, ddataset2):
         batch = BatchSession.execute([ddataset2], _batch_run, nprocs=1)
-        assert len(batch.session) == 1
+        assert len(batch.sessions) == 1
 
         batch = BatchSession.execute([ddataset2, ddataset2], _batch_run, nprocs=2)
-        assert len(batch.session) == 2
+        assert len(batch.sessions) == 2
 
     def test_exports_dichotomous(self, ddataset2, rewrite_data_files, data_path):
         datasets = [ddataset2]
@@ -29,17 +31,17 @@ def test_exports_dichotomous(self, ddataset2, rewrite_data_files, data_path):
             session = Session(dataset=dataset)
             session.add_default_models()
             session.execute_and_recommend()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
             session = Session(dataset=dataset)
             session.add_default_bayesian_models()
             session.execute()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check zip
         zf = Path(tempfile.NamedTemporaryFile().name)
@@ -49,7 +51,7 @@
             assert zf.exists()
             # load
             batch3 = BatchSession.load(zf)
-            assert len(batch3.session) == 2
+            assert len(batch3.sessions) == 2
         finally:
             zf.unlink()
 
@@ -68,12 +70,12 @@ def test_exports_continuous(self, cdataset2, cidataset, rewrite_data_files, data
             session = pybmds.Session(dataset=dataset)
             session.add_model(pybmds.Models.Power)
             session.execute_and_recommend()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check exports
         excel = batch.to_excel()
@@ -95,12 +97,36 @@ def test_exports(self, mt_datasets, rewrite_data_files, data_path):
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check exports
         excel = batch.to_excel()
         docx = batch.to_docx()
 
+        # check load/save
+        zf = Path(tempfile.NamedTemporaryFile().name)
+        try:
+            # save
+            batch.save(zf)
+            assert zf.exists()
+            # load
+            batch2 = MultitumorBatch.load(zf)
+            assert len(batch2.sessions) == 1
+        finally:
+            zf.unlink()
+
         if rewrite_data_files:
             (data_path / "reports/batch-multitumor.xlsx").write_bytes(excel.getvalue())
             docx.save(data_path / "reports/batch-multitumor.docx")
+
+    def test_execute(self, mt_datasets):
+        def _batch_run(ds):
+            sess = pybmds.Multitumor(datasets=ds["datasets"])
+            sess.execute()
+            return sess
+
+        batch = MultitumorBatch.execute([{"datasets": mt_datasets}], _batch_run, nprocs=1)
+        assert len(batch.sessions) == 1
+
+        with pytest.raises(NotImplementedError):
+            MultitumorBatch.execute([{"datasets": mt_datasets}], _batch_run, nprocs=2)
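
Example of the updated API (a usage sketch, not part of the patch: `my_datasets` is a placeholder for a list of dichotomous tumor datasets built elsewhere, and the runner mirrors the `_batch_run` helper added in the tests):

```python
from pathlib import Path

import pybmds
from pybmds.batch import MultitumorBatch


def runner(ds: dict) -> pybmds.Multitumor:
    # execute one multitumor session per batch entry
    sess = pybmds.Multitumor(datasets=ds["datasets"])
    sess.execute()
    return sess


# nprocs > 1 raises NotImplementedError for MultitumorBatch in this release
batch = MultitumorBatch.execute([{"datasets": my_datasets}], runner, nprocs=1)
print(len(batch.sessions))  # note: renamed from `batch.session` in this patch

# round-trip through the new zip-based save/load helpers
batch.save(Path("batch.zip"))
restored = MultitumorBatch.load(Path("batch.zip"))
assert len(restored.sessions) == len(batch.sessions)
```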