From 4f97dea7f6c06512dd65311ed916f1f2048b33cf Mon Sep 17 00:00:00 2001
From: Andy Shapiro
Date: Mon, 11 Nov 2024 21:28:24 -0500
Subject: [PATCH] Api updates (#79)

* pluralize sessions in batch
* fix type annotation
* updates
* improve coverage
* prepare 24.1a5.dev
* fix docs
* update docs
* remove extra whitespace

---
 docs/source/development.md                    |  39 ++++++
 .../source/recipes/custom-excel-exports.ipynb |   8 +-
 src/pybmds/__init__.py                        |   2 +-
 src/pybmds/batch.py                           | 114 +++++++++++++-----
 tests/test_pybmds/models/test_multi_tumor.py  |   2 +-
 tests/test_pybmds/test_batch.py               |  44 +++++--
 6 files changed, 166 insertions(+), 43 deletions(-)

diff --git a/docs/source/development.md b/docs/source/development.md
index 59fd3af0..3971b8ec 100644
--- a/docs/source/development.md
+++ b/docs/source/development.md
@@ -90,6 +90,45 @@ make docs-clean           # Clean documentation
 
 Using the `make serve` command is recommended for editing documentation; it updates the preview in realtime as files are saved.
 
+## Versioning
+
+We use [calendar versioning](https://calver.org/) for `pybmds`, where:
+
+* `major` is the year of the release (ex: `28` for a 2028 release)
+* `minor` is incremented for each release of the calendar year, starting at `1`
+* `aN` is the alpha release for testing, where `N` starts at `1`
+* `dev` is any upcoming pre-release currently under development
+
+As an example, consider the scenario where we're beginning development of our first release in 2028:
+
+* In `pybmds.__init__`, set `__version__ = "28.1a1.dev"`
+* Iterate until we're ready for an alpha release
+  * Update the version to `28.1a1`, and git tag the release `28.1a1`
+  * Immediately change the `main` branch to `28.1a2.dev`
+* Begin testing of `28.1a1`
+  * If changes are needed, iterate on `28.1a2.dev`
+  * If changes are not needed, release `28.1` by changing the version and minting a tag
+
+The [packaging](https://packaging.pypa.io/en/stable/index.html) package implements [PEP440](https://peps.python.org/pep-0440/), and can be used to check candidate versions:
+
+```python
+from packaging.version import Version
+
+Version('28.1a1.dev')
+# _Version(release=(28, 1), pre=('a', 1), dev=('dev', 0))
+```
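+
+Versions written this way also order and classify as expected; the snippet below is a quick sanity check (illustrative version numbers, using only the public `packaging` API):
+
+```python
+from packaging.version import Version
+
+# dev pre-releases sort before their alpha, which sorts before the final release
+assert Version("28.1a1.dev") < Version("28.1a1") < Version("28.1")
+assert Version("28.1a1").is_prerelease
+assert Version("28.1a1.dev").is_devrelease
+```
+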
 ### Priors Report
 
 The `pybmds` package includes Bayesian priors and frequentist parameter initialization settings that have been tuned to help improve model fit performance. To generate a report of the settings in all permutations, run the following command:
diff --git a/docs/source/recipes/custom-excel-exports.ipynb b/docs/source/recipes/custom-excel-exports.ipynb
index d31e28f9..8de4af43 100644
--- a/docs/source/recipes/custom-excel-exports.ipynb
+++ b/docs/source/recipes/custom-excel-exports.ipynb
@@ -101,9 +101,9 @@
     "    sess.add_model(Model, settings)\n",
     "\n",
     "    option_sets = [\n",
-    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1, \n",
+    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1,\n",
     "         pybmds.ContinuousDistType.normal),\n",
-    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1, \n",
+    "        (pybmds.ContinuousRiskType.RelativeDeviation, 0.1,\n",
     "         pybmds.ContinuousDistType.normal_ncv),\n",
     "    ]\n",
     "    sessions = []\n",
@@ -225,7 +225,7 @@
    "outputs": [],
    "source": [
     "rows = []\n",
-    "for i, session in enumerate(sess_batch.session):\n",
+    "for i, session in enumerate(sess_batch.sessions):\n",
     "    for j, model in enumerate(session.models):\n",
     "        data = {\n",
     "            \"session_index\": i,\n",
@@ -356,7 +356,7 @@
    },
    "outputs": [],
    "source": [
-    "model = sess_batch.session[0].models[0]\n",
+    "model = sess_batch.sessions[0].models[0]\n",
     "res = model.results"
    ]
   },
diff --git a/src/pybmds/__init__.py b/src/pybmds/__init__.py
index f149e011..07b2fcbb 100644
--- a/src/pybmds/__init__.py
+++ b/src/pybmds/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "24.1a4"
+__version__ = "24.1a5"  # see docs/development for versioning
 
 from .batch import BatchResponse, BatchSession  # noqa: F401
 from .constants import DistType as ContinuousDistType  # noqa: F401
diff --git a/src/pybmds/batch.py b/src/pybmds/batch.py
index ab3f0ea2..3a886303 100644
--- a/src/pybmds/batch.py
+++ b/src/pybmds/batch.py
@@ -10,7 +10,8 @@
 import pandas as pd
 from tqdm import tqdm
 
-from .datasets.base import DatasetBase
+from .constants import Dtype
+from .datasets.base import DatasetType
 from .models.multi_tumor import Multitumor
 from .reporting.styling import Report, write_citation
 from .session import Session
@@ -25,11 +26,24 @@ class BatchBase:
     pass
 
 
+def _make_zip(data: str, archive: Path):
+    with zipfile.ZipFile(
+        archive, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
+    ) as zf:
+        zf.writestr("data.json", data=data)
+
+
+def _load_zip(archive: Path) -> str:
+    with zipfile.ZipFile(archive) as zf:
+        with zf.open("data.json") as f:
+            return f.read().decode()
+
+
 class BatchSession(BatchBase):
     def __init__(self, sessions: list[Session] | None = None):
         if sessions is None:
             sessions = []
-        self.session: list[Session] = sessions
+        self.sessions: list[Session] = sessions
         self.errors = []
 
     def df_summary(self) -> pd.DataFrame:
@@ -43,13 +57,13 @@ def df_summary(self) -> pd.DataFrame:
                 ),
                 clean=False,
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
     def df_dataset(self) -> pd.DataFrame:
         data: list[dict] = []
-        for idx, session in enumerate(self.session):
+        for idx, session in enumerate(self.sessions):
             data.extend(
                 session.dataset.rows(
                     extras=dict(
@@ -64,11 +78,16 @@ def df_dataset(self) -> pd.DataFrame:
 
     def df_params(self) -> pd.DataFrame:
         data: list[dict] = []
-        for idx, session in enumerate(self.session):
+        for idx, session in enumerate(self.sessions):
             for model_index, model in enumerate(session.models):
                 if model.has_results:
+                    func = (
+                        model.results.parameter_rows
+                        if session.dataset.dtype is Dtype.NESTED_DICHOTOMOUS
+                        else model.results.parameters.rows
+                    )
                     data.extend(
-                        model.results.parameters.rows(
+                        func(
                             extras=dict(
                                 session_index=idx,
                                 session_id=session.id,
@@ -122,7 +141,7 @@ def to_docx(
         if report is None:
             report = Report.build_default()
 
-        for session in self.session:
+        for session in self.sessions:
             session.to_docx(
                 report,
                 header_level=header_level,
@@ -133,7 +152,7 @@ def to_docx(
                 session_inputs_table=session_inputs_table,
             )
 
-        if citation and len(self.session) > 0:
+        if citation and len(self.sessions) > 0:
             write_citation(report, header_level=header_level)
 
         return report.document
@@ -144,16 +163,19 @@ def serialize(self) -> str:
 
         Returns:
             str: A JSON string
         """
-        return json.dumps([session.to_dict() for session in self.session])
+        return json.dumps([session.to_dict() for session in self.sessions])
 
     @classmethod
     def execute(
-        cls, datasets: list[DatasetBase], runner: Callable, nprocs: int | None = None
+        cls,
+        datasets: list[DatasetType],
+        runner: Callable[[DatasetType], BatchResponse],
+        nprocs: int | None = None,
     ) -> Self:
         """Execute sessions using multiple processors.
 
         Args:
-            datasets (list[DatasetBase]): The datasets to execute
+            datasets (list[DatasetType]): The datasets to execute
             runner (Callable[dataset] -> BatchResponse): The method which executes a session
             nprocs (Optional[int]): the number of processors to use; defaults to N-1. If 1 is
                 specified, the batch session is called linearly without a process pool
@@ -187,9 +209,9 @@ def execute(
         if result.success:
             if isinstance(result.content, list):
                 for item in result.content:
-                    batch.session.append(Session.from_serialized(item))
+                    batch.sessions.append(Session.from_serialized(item))
             else:
-                batch.session.append(Session.from_serialized(result.content))
+                batch.sessions.append(Session.from_serialized(result.content))
         else:
             batch.errors.append(result.content)
 
@@ -216,10 +238,7 @@ def load(cls, archive: Path) -> Self:
         Returns:
             BatchSession: An instance of this session
         """
-        with zipfile.ZipFile(archive) as zf:
-            with zf.open("data.json") as f:
-                data = f.read()
-        return BatchSession.deserialize(data)
+        return BatchSession.deserialize(_load_zip(archive))
 
     def save(self, archive: Path):
         """Save Session to a compressed zipfile
 
         Args:
             archive (Path): The zipfile path
         """
-        with zipfile.ZipFile(
-            archive, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
-        ) as zf:
-            zf.writestr("data.json", data=self.serialize())
+        return _make_zip(self.serialize(), archive)
 
 
 class MultitumorBatch(BatchBase):
     def __init__(self, sessions: list[Multitumor] | None = None):
         if sessions is None:
             sessions = []
-        self.session: list[Multitumor] = sessions
+        self.sessions: list[Multitumor] = sessions
         self.errors = []
 
     def to_docx(
@@ -260,20 +276,42 @@ def to_docx(
         if report is None:
             report = Report.build_default()
 
-        for session in self.session:
+        for session in self.sessions:
             session.to_docx(
                 report,
                 header_level=header_level,
                 citation=False,
             )
 
-        if citation and len(self.session) > 0:
+        if citation and len(self.sessions) > 0:
             write_citation(report, header_level=header_level)
 
         return report.document
 
     def serialize(self) -> str:
-        return json.dumps([session.to_dict() for session in self.session])
+        return json.dumps([session.to_dict() for session in self.sessions])
+
+    @classmethod
+    def execute(cls, datasets: list[dict], runner: Callable, nprocs: int | None = None) -> Self:
+        """Execute sessions using multiple processors.
+
+        Args:
+            datasets (list[dict]): The datasets to execute
+            runner (Callable[dict] -> Multitumor): The method which executes a session.
+            nprocs (Optional[int]): the number of processors to use; defaults to N-1. If 1 is
+                specified, the batch session is called sequentially
+
+        Returns:
+            A MultitumorBatch with sessions executed.
+        """
+        if nprocs is None:
+            nprocs = max(os.cpu_count() - 1, 1)
+
+        if nprocs > 1:
+            raise NotImplementedError("Not implemented (yet)")
+
+        sessions = [runner(dataset) for dataset in tqdm(datasets, desc="Executing...")]
+        return cls(sessions=sessions)
 
     @classmethod
     def deserialize(cls, data: str) -> Self:
@@ -287,7 +325,7 @@ def df_summary(self) -> pd.DataFrame:
                 extras=dict(session_index=idx),
                 clean=False,
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -296,7 +334,7 @@ def df_dataset(self) -> pd.DataFrame:
             session.datasets_df(
                 extras=dict(session_index=idx),
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -305,7 +343,7 @@ def df_params(self) -> pd.DataFrame:
             session.params_df(
                 extras=dict(session_index=idx),
             )
-            for idx, session in enumerate(self.session)
+            for idx, session in enumerate(self.sessions)
         ]
         return pd.concat(dfs).dropna(axis=1, how="all").fillna("")
 
@@ -320,3 +358,23 @@ def to_excel(self, path: Path | None = None) -> Path | BytesIO:
             for name, df in data.items():
                 df.to_excel(writer, sheet_name=name, index=False)
         return f
+
+    @classmethod
+    def load(cls, archive: Path) -> Self:
+        """Load a Session from a compressed zipfile
+
+        Args:
+            archive (Path): The zipfile path
+
+        Returns:
+            MultitumorBatch: An instance of this session
+        """
+        return cls.deserialize(_load_zip(archive))
+
+    def save(self, archive: Path):
+        """Save Session to a compressed zipfile
+
+        Args:
+            archive (Path): The zipfile path
+        """
+        return _make_zip(self.serialize(), archive)
diff --git a/tests/test_pybmds/models/test_multi_tumor.py b/tests/test_pybmds/models/test_multi_tumor.py
index 0e763f87..889ac4b2 100644
--- a/tests/test_pybmds/models/test_multi_tumor.py
+++ b/tests/test_pybmds/models/test_multi_tumor.py
@@ -31,7 +31,7 @@ def test_execute(self, mt_datasets, rewrite_data_files, data_path):
         df = session.datasets_df()
 
         # docx
-        docx = session.to_docx()
+        docx = session.to_docx(all_models=True, bmd_cdf_table=True)
 
         if rewrite_data_files:
             (data_path / "reports/multitumor.txt").write_text(text)
diff --git a/tests/test_pybmds/test_batch.py b/tests/test_pybmds/test_batch.py
index 50133250..aff9d297 100644
--- a/tests/test_pybmds/test_batch.py
+++ b/tests/test_pybmds/test_batch.py
@@ -1,6 +1,8 @@
 import tempfile
 from pathlib import Path
 
+import pytest
+
 import pybmds
 from pybmds.batch import BatchResponse, BatchSession, MultitumorBatch
 from pybmds.session import Session
@@ -17,10 +19,10 @@ def _batch_run(ds):
 class TestBatchSession:
     def test_execute(self, ddataset2):
         batch = BatchSession.execute([ddataset2], _batch_run, nprocs=1)
-        assert len(batch.session) == 1
+        assert len(batch.sessions) == 1
 
         batch = BatchSession.execute([ddataset2, ddataset2], _batch_run, nprocs=2)
-        assert len(batch.session) == 2
+        assert len(batch.sessions) == 2
 
     def test_exports_dichotomous(self, ddataset2, rewrite_data_files, data_path):
         datasets = [ddataset2]
@@ -29,17 +31,17 @@ def test_exports_dichotomous(self, ddataset2, rewrite_data_files, data_path):
             session = Session(dataset=dataset)
             session.add_default_models()
             session.execute_and_recommend()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
             session = Session(dataset=dataset)
             session.add_default_bayesian_models()
             session.execute()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check zip
         zf = Path(tempfile.NamedTemporaryFile().name)
@@ -49,7 +51,7 @@
             assert zf.exists()
             # load
             batch3 = BatchSession.load(zf)
-            assert len(batch3.session) == 2
+            assert len(batch3.sessions) == 2
         finally:
             zf.unlink()
 
@@ -68,12 +70,12 @@ def test_exports_continuous(self, cdataset2, cidataset, rewrite_data_files, data
             session = pybmds.Session(dataset=dataset)
             session.add_model(pybmds.Models.Power)
             session.execute_and_recommend()
-            batch.session.append(session)
+            batch.sessions.append(session)
 
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check exports
         excel = batch.to_excel()
@@ -95,12 +97,36 @@ def test_exports(self, mt_datasets, rewrite_data_files, data_path):
         # check serialization/deserialization
         data = batch.serialize()
         batch2 = batch.deserialize(data)
-        assert len(batch2.session) == len(batch.session)
+        assert len(batch2.sessions) == len(batch.sessions)
 
         # check exports
         excel = batch.to_excel()
         docx = batch.to_docx()
 
+        # check load/save
+        zf = Path(tempfile.NamedTemporaryFile().name)
+        try:
+            # save
+            batch.save(zf)
+            assert zf.exists()
+            # load
+            batch2 = MultitumorBatch.load(zf)
+            assert len(batch2.sessions) == 1
+        finally:
+            zf.unlink()
+
         if rewrite_data_files:
             (data_path / "reports/batch-multitumor.xlsx").write_bytes(excel.getvalue())
             docx.save(data_path / "reports/batch-multitumor.docx")
+
+    def test_execute(self, mt_datasets):
+        def _batch_run(ds):
+            sess = pybmds.Multitumor(datasets=ds["datasets"])
+            sess.execute()
+            return sess
+
+        batch = MultitumorBatch.execute([{"datasets": mt_datasets}], _batch_run, nprocs=1)
+        assert len(batch.sessions) == 1
+
+        with pytest.raises(NotImplementedError):
+            MultitumorBatch.execute([{"datasets": mt_datasets}], _batch_run, nprocs=2)
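
Example of the updated API (a usage sketch, not part of the patch: `my_datasets` is a placeholder for a list of dichotomous tumor datasets built elsewhere, and the runner mirrors the `_batch_run` helper added in the tests):

```python
from pathlib import Path

import pybmds
from pybmds.batch import MultitumorBatch


def runner(ds: dict) -> pybmds.Multitumor:
    # execute one multitumor session per batch entry
    sess = pybmds.Multitumor(datasets=ds["datasets"])
    sess.execute()
    return sess


# nprocs > 1 raises NotImplementedError for MultitumorBatch in this release
batch = MultitumorBatch.execute([{"datasets": my_datasets}], runner, nprocs=1)
print(len(batch.sessions))  # note: renamed from `batch.session` in this patch

# round-trip through the new zip-based save/load helpers
batch.save(Path("batch.zip"))
restored = MultitumorBatch.load(Path("batch.zip"))
assert len(restored.sessions) == len(batch.sessions)
```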