Merge branch 'main' into 159-feat-bms-return-variable-number-of-models
Showing 14 changed files with 1,013 additions and 127 deletions.
@@ -1,2 +1,128 @@

from enum import Enum
from typing import Callable, Iterable, Tuple

import numpy as np


def weber_filter(values):
    return filter(lambda s: s[0] <= s[1], values)


def train_test_filter(
    seed: int = 180, train_p: float = 0.5
) -> Tuple[Callable[[Iterable], Iterable], Callable[[Iterable], Iterable]]:
    """
    A pipeline filter which pseudorandomly assigns values from the input into "train" or "test"
    groups. This is particularly useful when working with streams of data of potentially
    unbounded length.

    This isn't a great method for small datasets, as it doesn't guarantee producing training
    and test sets which are as close as possible to the specified desired proportions.
    Consider using the scikit-learn `train_test_split` for cases where it's practical to
    enumerate the full dataset in advance.

    Args:
        seed: random number generator seeding value
        train_p: proportion of data which go into the training set. A float between 0 and 1.

    Returns:
        a tuple of callables `(train_filter, test_filter)` which split the input data
        into two complementary streams.

    Examples:
        We can create complementary train and test filters using the function:
        >>> train_filter, test_filter = train_test_filter(train_p=0.6, seed=180)

        The `train_filter` generates a sequence of ~60% of the input list –
        in this case, 15 of 20 datapoints.
        Note that the exact split would be 12 of 20 data points.
        Again, for data with bounded length it is advisable
        to use the scikit-learn `train_test_split` instead.
        >>> list(train_filter(range(20)))
        [0, 2, 3, 4, 5, 6, 9, 10, 11, 12, 15, 16, 17, 18, 19]

        When we run the `test_filter`, it fills in the gaps, giving us the remaining 5 values:
        >>> list(test_filter(range(20)))
        [1, 7, 8, 13, 14]

        We can continue to generate new values for as long as we like using the same filter and the
        continuation of the input range:
        >>> list(train_filter(range(20, 40)))
        [20, 22, 23, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39]

        ... and some more:
        >>> list(train_filter(range(40, 50)))
        [41, 42, 44, 45, 46, 49]

        As the number of samples grows, the fraction in the train and test sets
        will approach `train_p` and `1 - train_p`.

        The `test_filter` fills in the gaps again:
        >>> list(test_filter(range(20, 30)))
        [21, 24, 25, 26]

        If you rerun the *same* `test_filter` on a fresh range, then the results will be different
        to the first time around:
        >>> list(test_filter(range(20)))
        [5, 10, 13, 17, 18]

        ... but if you regenerate the `test_filter`, it'll reproduce the original sequence:
        >>> _, test_filter_regenerated = train_test_filter(train_p=0.6, seed=180)
        >>> list(test_filter_regenerated(range(20)))
        [1, 7, 8, 13, 14]

        It also works on tuple-valued lists:
        >>> from itertools import product
        >>> train_filter_tuple, test_filter_tuple = train_test_filter(train_p=0.3, seed=42)
        >>> list(test_filter_tuple(product(["a", "b"], [1, 2, 3])))
        [('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 3)]

        >>> list(train_filter_tuple(product(["a", "b"], [1, 2, 3])))
        [('b', 2)]

        ... and on unbounded iterators:
        >>> from itertools import count, takewhile
        >>> train_filter_unbounded, test_filter_unbounded = train_test_filter(train_p=0.5, seed=21)
        >>> list(takewhile(lambda s: s < 90, count(79)))
        [79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]

        >>> train_pool = train_filter_unbounded(count(79))
        >>> list(takewhile(lambda s: s < 90, train_pool))
        [82, 85, 86, 89]

        >>> test_pool = test_filter_unbounded(count(79))
        >>> list(takewhile(lambda s: s < 90, test_pool))
        [79, 80, 81, 83, 84, 87, 88]

        >>> list(takewhile(lambda s: s < 110, test_pool))
        [91, 93, 94, 97, 100, 105, 106, 109]
    """

    test_p = 1 - train_p

    _TrainTest = Enum("_TrainTest", ["train", "test"])

    def train_test_stream():
        """Generates a pseudorandom stream of _TrainTest.train and _TrainTest.test."""
        rng = np.random.default_rng(seed)
        while True:
            yield rng.choice([_TrainTest.train, _TrainTest.test], p=(train_p, test_p))

    def _factory(allow):
        """Factory to make complementary generators which split their input
        corresponding to the values of the pseudorandom train_test_stream."""
        _stream = train_test_stream()

        def _generator(values):
            """Generator which yields items from `values` depending on
            whether the corresponding item from the `_stream`
            matches the `allow` parameter."""
            for v, train_test in zip(values, _stream):
                if train_test == allow:
                    yield v

        return _generator

    return _factory(_TrainTest.train), _factory(_TrainTest.test)
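The docstring above points to the scikit-learn `train_test_split` for datasets that can be enumerated up front. For contrast, here is a minimal sketch of that bounded-data alternative; it assumes scikit-learn is installed, and the variable names are illustrative rather than taken from this commit:

from sklearn.model_selection import train_test_split

# A bounded dataset: the full pool of 20 conditions is known in advance.
X = list(range(20))

# train_test_split yields exactly the requested proportions (12 train / 8 test here),
# unlike the streaming filters above, which only approach train_p asymptotically.
X_train, X_test = train_test_split(X, train_size=0.6, random_state=180)
print(len(X_train), len(X_test))  # 12 8

The trade-off is that `train_test_split` must materialise the whole pool in memory, whereas the streaming filters can consume an unbounded iterator.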
@@ -0,0 +1,66 @@

import itertools
from typing import Iterable, List

import numpy as np


def model_disagreement_sampler(X: np.ndarray, models: List, num_samples: int = 1):
    """
    A sampler that returns selected samples of independent variables
    for which the models disagree the most in terms of their predictions.

    Args:
        X: pool of IV conditions to evaluate in terms of model disagreement
        models: list of scikit-learn (regression or classification) models to compare
        num_samples: number of samples to select

    Returns: sampled pool
    """

    if isinstance(X, Iterable):
        X = np.array(list(X))

    X_predict = np.array(X)
    if len(X_predict.shape) == 1:
        X_predict = X_predict.reshape(-1, 1)

    model_disagreement = list()

    # collect disagreements for each model pair
    for model_a, model_b in itertools.combinations(models, 2):

        # determine the prediction method
        if hasattr(model_a, "predict_proba") and hasattr(model_b, "predict_proba"):
            model_a_predict = model_a.predict_proba
            model_b_predict = model_b.predict_proba
        elif hasattr(model_a, "predict") and hasattr(model_b, "predict"):
            model_a_predict = model_a.predict
            model_b_predict = model_b.predict
        else:
            raise AttributeError(
                "Models must both have a `predict_proba` or `predict` method."
            )

        # get predictions from both models
        y_a = model_a_predict(X_predict)
        y_b = model_b_predict(X_predict)

        assert y_a.shape == y_b.shape, "Models must have the same output shape."

        # determine the disagreement between the two models in terms of mean squared error
        if len(y_a.shape) == 1:
            disagreement = (y_a - y_b) ** 2
        else:
            disagreement = np.mean((y_a - y_b) ** 2, axis=1)

        model_disagreement.append(disagreement)

    assert len(model_disagreement) >= 1, "No disagreements to compare."

    # sum up all model disagreements
    summed_disagreement = np.sum(model_disagreement, axis=0)

    # sort the summed disagreements and select the top n
    idx = (-summed_disagreement).argsort()[:num_samples]

    return X[idx]
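A minimal usage sketch for the sampler follows. The toy data and models are illustrative assumptions rather than part of this commit, and the import path for `model_disagreement_sampler` depends on where this module lands in the package:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Toy training data: the tree can fit the quadratic relationship, the linear
# model cannot, so the two regressors disagree most at large |x|.
X_train = np.linspace(-3, 3, 30).reshape(-1, 1)
y_train = X_train.ravel() ** 2

models = [
    LinearRegression().fit(X_train, y_train),
    DecisionTreeRegressor(random_state=0).fit(X_train, y_train),
]

# Pool of candidate IV conditions; the sampler returns the num_samples
# conditions on which the models' predictions differ most (squared error).
X_pool = np.linspace(-5, 5, 50)
selected = model_disagreement_sampler(X_pool, models, num_samples=3)
print(selected)

Because the linear fit is nearly flat on this symmetric training set while the tree tracks the quadratic, the summed disagreement peaks toward the edges of the pool, so the selected conditions should have large |x|.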