diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 445a259..5744e99 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,9 +8,10 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: matrix: + os: [ubuntu-latest, macos-latest] python-version: ["3.11", "3.12"] steps: @@ -30,7 +31,15 @@ jobs: - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH - - name: Test - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: ./scripts/run_tests.sh + - run: pytest tests/ --junitxml=junit/test-results-${{ matrix.os }}-${{ matrix.python-version }}.xml --cov=rlevaluation --cov-report=html:coverage/cov-${{ matrix.os }}-${{ matrix.python-version }}.html + + - name: Upload pytest test results + uses: actions/upload-artifact@v4 + with: + name: pytest-results-${{ matrix.os }}-${{ matrix.python-version }} + path: | + junit/test-results-${{ matrix.os }}-${{ matrix.python-version }}.xml + coverage/cov-${{ matrix.os }}-${{ matrix.python-version }}.html + + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/pyproject.toml b/pyproject.toml index 81371d8..f13aaf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,8 @@ license = {text = "MIT"} dev = [ "pip", "ruff", + "pytest", + "pytest-cov", "commitizen", "pre-commit", "matplotlib", diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh deleted file mode 100755 index 49ea816..0000000 --- a/scripts/run_tests.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e -# pyright --stats - -export PYTHONPATH=RlEvaluation -python3 -m unittest discover -p "*test_*.py" diff --git a/tests/_utils/test_data.py b/tests/_utils/test_data.py index 238478e..fe1d7b4 100644 --- a/tests/_utils/test_data.py +++ b/tests/_utils/test_data.py @@ -1,62 +1,60 @@ -import unittest import numpy as np import pandas as pd from rlevaluation._utils.data import normalizeDataType, make_wide_format, is_wide_format from 
tests.test_utils.mock_data import generate_split_over_seed -class TestData(unittest.TestCase): - def test_normalizeDataType(self): - # turn pandas dataframe into a numpy array - test_data = pd.DataFrame({ - 'alpha': [0.01, 0.01, 0.1], - 'results': [1, 2, 3], - }) - got = normalizeDataType(test_data, 2, 'results') - self.assertIsInstance(got, np.ndarray) - self.assertEqual(np.ndim(got), 2) - - # keep numpy array untouched - test_data = np.array([ - [1, 2, 3], - [2, 3, 4], - [3, 4, 5], - ]) - got = normalizeDataType(test_data, 2) - self.assertIsInstance(got, np.ndarray) - self.assertEqual(np.ndim(got), 2) - - # TODO: test shape normalization - - def test_make_wide_format(self): - # works for one results column - df = generate_split_over_seed() - - hypers = {'stepsize', 'optimizer'} - metrics = {'results'} - - got = make_wide_format(df, hypers=hypers, metrics=metrics, seed_col='run') - - self.assertEqual(len(got), 6) - self.assertEqual(got.iloc[0]['results'].shape, (10, 300)) - - # works for two results columns - df2 = df.copy() - df2['results-2'] = df2['results'] * 2 - metrics = {'results', 'results-2'} - - got = make_wide_format(df2, hypers=hypers, metrics=metrics, seed_col='run') - - self.assertEqual(len(got), 6) - self.assertEqual(got.iloc[0]['results'].shape, (10, 300)) - self.assertEqual(got.iloc[0]['results-2'].shape, (10, 300)) - - # should not change already wide data - self.assertFalse(is_wide_format(df, metrics, 'run')) - self.assertFalse(is_wide_format(df2, metrics, 'run')) - - got = make_wide_format(df2, hypers=hypers, metrics=metrics, seed_col='run') - self.assertTrue(is_wide_format(got, metrics, 'run')) - - got2 = make_wide_format(got, hypers=hypers, metrics=metrics, seed_col='run') - self.assertEqual(id(got), id(got2)) +def test_normalizeDataType(): + # turn pandas dataframe into a numpy array + test_data = pd.DataFrame({ + 'alpha': [0.01, 0.01, 0.1], + 'results': [1, 2, 3], + }) + got = normalizeDataType(test_data, 2, 'results') + assert 
isinstance(got, np.ndarray) + assert np.ndim(got) == 2 + + # keep numpy array untouched + test_data = np.array([ + [1, 2, 3], + [2, 3, 4], + [3, 4, 5], + ]) + got = normalizeDataType(test_data, 2) + assert isinstance(got, np.ndarray) + assert np.ndim(got) == 2 + + # TODO: test shape normalization + +def test_make_wide_format(): + # works for one results column + df = generate_split_over_seed() + + hypers = {'stepsize', 'optimizer'} + metrics = {'results'} + + got = make_wide_format(df, hypers=hypers, metrics=metrics, seed_col='run') + + assert len(got) == 6 + assert got.iloc[0]['results'].shape == (10, 300) + + # works for two results columns + df2 = df.copy() + df2['results-2'] = df2['results'] * 2 + metrics = {'results', 'results-2'} + + got = make_wide_format(df2, hypers=hypers, metrics=metrics, seed_col='run') + + assert len(got) == 6 + assert got.iloc[0]['results'].shape == (10, 300) + assert got.iloc[0]['results-2'].shape == (10, 300) + + # should not change already wide data + assert not is_wide_format(df, metrics, 'run') + assert not is_wide_format(df2, metrics, 'run') + + got = make_wide_format(df2, hypers=hypers, metrics=metrics, seed_col='run') + assert is_wide_format(got, metrics, 'run') + + got2 = make_wide_format(got, hypers=hypers, metrics=metrics, seed_col='run') + assert id(got) == id(got2) diff --git a/tests/test_hypers.py b/tests/test_hypers.py index 3ff08de..e21c3fa 100644 --- a/tests/test_hypers.py +++ b/tests/test_hypers.py @@ -1,18 +1,16 @@ -import unittest import pandas as pd from rlevaluation.hypers import select_best_hypers, Preference from rlevaluation.config import data_definition -class TestHypers(unittest.TestCase): - def test_select_best_hypers(self): - test_data = pd.DataFrame({ - 'alpha': [0.1, 0.01, 0.001], - 'seed': [0, 0, 0], - 'result': [0, 2, 1], - }) +def test_select_best_hypers(): + test_data = pd.DataFrame({ + 'alpha': [0.1, 0.01, 0.001], + 'seed': [0, 0, 0], + 'result': [0, 2, 1], + }) - d = 
data_definition(hyper_cols=['alpha']) + d = data_definition(hyper_cols=['alpha']) - best = select_best_hypers(test_data, 'result', Preference.high, data_definition=d) - self.assertEqual(best.best_configuration[0], 0.01) + best = select_best_hypers(test_data, 'result', Preference.high, data_definition=d) + assert best.best_configuration[0] == 0.01