Skip to content

Commit

Permalink
ML: Use testbook instead of pytest-notebook
Browse files Browse the repository at this point in the history
testbook, although its most recent release dates back to 2021, provides
a more convenient interface, effectively only running the notebooks
to completion, without bothering about detailed cell comparison.

https://pypi.org/project/testbook/

`cratedb_rag_customer_support_langchain.ipynb` needs this, because the
bottom half, where it connects to Jina API, needs to be masked.
  • Loading branch information
amotl committed Apr 19, 2024
1 parent 9e66e2c commit 8328f99
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 100 deletions.
32 changes: 0 additions & 32 deletions topic/machine-learning/automl/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,6 @@ xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = false
# Default cell timeout is 120 seconds. For heavy computing, it needs to be increased.
nb_exec_timeout = 240
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
# Compensate other outputs.
'"/cells/*/outputs/*/data/text/html" "T_....." "T_na"',
'"/cells/*/outputs/*/data/text/plain" "IPython.core.display.HTML object" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/text/plain" "pandas.io.formats.style.Styler at 0x.+" "pandas.io.formats.style.Styler"',
'"/cells/*/outputs/*/data/application/vnd.jupyter.widget-view+json" "model_id: .+" "model_id: na"',
'"/cells/*/outputs/*/data/text/html" "\>\d+\.\d+\<\/td\>" "0.3333"',
]
# `vector_search.py` does not include any output(s).
nb_diff_ignore = [
"/metadata/language_info",
"/metadata/widgets",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
"/cells/*/outputs/*/metadata/nbreg",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
# Ignore all cell output. It is too tedious to compare and maintain.
# The validation hereby extends exclusively to the _execution_ of notebook cells,
# able to catch syntax errors, module import flaws, and runtime errors.
# However, the validation will not catch any regressions on actual cell output,
# or whether any output is produced at all.
"/cells/*/outputs",
]

[tool.coverage.run]
branch = false

Expand Down
1 change: 1 addition & 0 deletions topic/machine-learning/automl/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ plotly<5.21
pycaret[models,parallel,test]==3.3.1
pydantic<2
python-dotenv<2
sqlalchemy==2.*

# Development.
# mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@main
23 changes: 15 additions & 8 deletions topic/machine-learning/automl/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@

import pytest
from cratedb_toolkit.util import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_notebook, pytest_module_function
from pueblo.testing.folder import str_list, list_python_files
from pueblo.testing.notebook import generate_tests
from pueblo.testing.snippet import pytest_module_function
from testbook import testbook

HERE = Path(__file__).parent

Expand Down Expand Up @@ -57,15 +59,20 @@ def churn_dataset(cratedb):
cratedb.run_sql("REFRESH TABLE pycaret_churn;")


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
def pytest_generate_tests(metafunc):
"""
From individual Jupyter Notebook file, collect cells as pytest
test cases, and run them.
Generate pytest test case per Jupyter Notebook.
"""
here = Path(__file__).parent
generate_tests(metafunc, path=here)


Not using `NBRegressionFixture`, because it would manually need to be configured.
def test_notebook(notebook):
"""
Execute Jupyter Notebook, one test case per .ipynb file.
"""
pytest_notebook(request=request, filepath=notebook)
with testbook(notebook) as tb:
tb.execute()


@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
Expand Down
6 changes: 0 additions & 6 deletions topic/machine-learning/llm-langchain/conftest.py

This file was deleted.

18 changes: 0 additions & 18 deletions topic/machine-learning/llm-langchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,6 @@ xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = true
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
]
# `vector_search.py` does not include any output(s).
nb_diff_ignore = [
"/metadata/language_info",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",

# Do not compare details of cell outputs.
# It is impossible to maintain efficiently.
"/cells/*/outputs",
]

[tool.coverage.run]
branch = false

Expand Down
1 change: 1 addition & 0 deletions topic/machine-learning/llm-langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pydantic>=1,<3
pypdf<5
python-dotenv<2
requests-cache<2
sqlalchemy==2.*
unstructured<0.12
google-cloud-aiplatform
langchain-google-vertexai
Expand Down
39 changes: 27 additions & 12 deletions topic/machine-learning/llm-langchain/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import pytest

from cratedb_toolkit.io.sql import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_module_function, pytest_notebook
from nbclient.exceptions import CellExecutionError
from pueblo.testing.folder import str_list, list_python_files
from pueblo.testing.notebook import generate_tests
from pueblo.testing.snippet import pytest_module_function
from testbook import testbook

HERE = Path(__file__).parent

Expand All @@ -26,20 +29,32 @@ def reset_database(cratedb):
time.sleep(0.01)


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
def pytest_generate_tests(metafunc):
"""
From individual Jupyter Notebook file, collect cells as pytest
test cases, and run them.
Not using `NBRegressionFixture`, because it would manually need to be configured.
Generate pytest test case per Jupyter Notebook.
"""
here = Path(__file__).parent
generate_tests(metafunc, path=here)

# Skip Vertex AI examples, because authenticating is more complicated.
if "vertexai" in str(notebook):
raise pytest.skip("Skipping Vertex AI due to lack of authentication")

pytest_notebook(request=request, filepath=notebook)
def test_notebook(notebook):
    """
    Execute Jupyter Notebook, one test case per .ipynb file.

    The test is skipped for Vertex AI notebooks, and when a notebook cell
    fails with a message containing the `[skip-notebook]` marker. Any other
    cell execution failure is re-raised, so that it fails the test instead
    of being silently swallowed.
    """
    # Skip Vertex AI examples, because authenticating is more complicated.
    if "vertexai" in notebook.name:
        raise pytest.skip(f"Skipping Vertex AI due to lack of authentication: {notebook.name}")

    with testbook(notebook) as tb:
        try:
            tb.execute()

        # Skip notebook if `pytest.exit()` is invoked, usually by
        # `getenvpass()`, when authentication token is not given.
        except CellExecutionError as ex:
            msg = str(ex)
            if "[skip-notebook]" in msg:
                raise pytest.skip(msg)
            # Not a requested skip: this is a genuine notebook failure,
            # so propagate it to make the test case fail.
            raise


@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
Expand Down
16 changes: 0 additions & 16 deletions topic/machine-learning/mlops-mlflow/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,6 @@ xfail_strict = true
markers = [
]

# pytest-notebook settings
nb_test_files = true
nb_coverage = true
nb_diff_replace = [
# Compensate output of `crash`.
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
]
# `vector_search.py` does not include any output(s).
nb_diff_ignore = [
"/metadata/language_info",
"/cells/*/execution_count",
"/cells/*/outputs/*/execution_count",
# Ignore images.
"/cells/*/outputs/*/data/image/png",
]

[tool.coverage.run]
branch = false

Expand Down
1 change: 1 addition & 0 deletions topic/machine-learning/mlops-mlflow/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ distributed>=2024.4.1 # Python 3.11.9 breaks previous Dask
mlflow-cratedb==2.11.3
pydantic<3
salesforce-merlion>=2,<3
sqlalchemy==2.*

# Development.
# mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@main
23 changes: 15 additions & 8 deletions topic/machine-learning/mlops-mlflow/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import pytest

from cratedb_toolkit.util import DatabaseAdapter
from pueblo.testing.folder import str_list, list_notebooks, list_python_files
from pueblo.testing.snippet import pytest_module_function, pytest_notebook
from pueblo.testing.folder import str_list, list_python_files
from pueblo.testing.notebook import generate_tests
from pueblo.testing.snippet import pytest_module_function
from testbook import testbook

HERE = Path(__file__).parent

Expand All @@ -22,15 +24,20 @@ def db_init(cratedb):
cratedb.run_sql("DROP TABLE IF EXISTS machine_data;")


@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
def test_notebook(request, notebook: str):
def pytest_generate_tests(metafunc):
"""
From individual Jupyter Notebook file, collect cells as pytest
test cases, and run them.
Generate pytest test case per Jupyter Notebook.
"""
here = Path(__file__).parent
generate_tests(metafunc, path=here)


Not using `NBRegressionFixture`, because it would manually need to be configured.
def test_notebook(notebook):
"""
Execute Jupyter Notebook, one test case per .ipynb file.
"""
pytest_notebook(request=request, filepath=notebook)
with testbook(notebook) as tb:
tb.execute()


@pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
Expand Down

0 comments on commit 8328f99

Please sign in to comment.