Skip to content

Commit

Permalink
Merge branch 'main' into 159-feat-bms-return-variable-number-of-models
Browse files Browse the repository at this point in the history
  • Loading branch information
TheLemonPig authored Dec 7, 2022
2 parents ee06f86 + 7a8ed4a commit aed52ab
Show file tree
Hide file tree
Showing 14 changed files with 1,013 additions and 127 deletions.
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/ambv/black
rev: 22.8.0
rev: 22.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
Expand All @@ -10,6 +10,7 @@ repos:
args:
- "--profile=black"
- "--filter-files"
- "--project=autora"
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
Expand All @@ -19,7 +20,7 @@ repos:
- "--extend-ignore=E203"
- "--per-file-ignores=__init__.py:F401"
- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v0.971"
rev: "v0.991"
hooks:
- id: mypy
additional_dependencies: [types-requests]
Expand Down
126 changes: 126 additions & 0 deletions autora/experimentalist/filter.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,128 @@
from enum import Enum
from typing import Callable, Iterable, Tuple

import numpy as np


def weber_filter(values):
    """Yield only the stimulus pairs from `values` whose first element
    does not exceed the second (i.e. s0 <= s1)."""
    return (pair for pair in values if pair[0] <= pair[1])


def train_test_filter(
    seed: int = 180, train_p: float = 0.5
) -> Tuple[Callable[[Iterable], Iterable], Callable[[Iterable], Iterable]]:
    """
    A pipeline filter which pseudorandomly assigns values from the input into "train" or "test"
    groups. This is particularly useful when working with streams of data of potentially
    unbounded length.
    This isn't a great method for small datasets, as it doesn't guarantee producing training
    and test sets which are as close as possible to the specified desired proportions.
    Consider using the scikit-learn `train_test_split` for cases where it's practical to
    enumerate the full dataset in advance.
    Args:
        seed: random number generator seeding value
        train_p: proportion of data which go into the training set. A float between 0 and 1.
    Returns:
        a tuple of callables `(train_filter, test_filter)` which split the input data
        into two complementary streams.
    Examples:
        We can create complementary train and test filters using the function:
        >>> train_filter, test_filter = train_test_filter(train_p=0.6, seed=180)
        The `train_filter` generates a sequence of ~60% of the input list –
        in this case, 15 of 20 datapoints.
        Note that the correct split would be 12 of 20 data points.
        Again, for data with bounded length it is advisable
        to use scikit-learn `train_test_split` instead.
        >>> list(train_filter(range(20)))
        [0, 2, 3, 4, 5, 6, 9, 10, 11, 12, 15, 16, 17, 18, 19]
        When we run the `test_filter`, it fills in the gaps, giving us the remaining 5 values:
        >>> list(test_filter(range(20)))
        [1, 7, 8, 13, 14]
        We can continue to generate new values for as long as we like using the same filter and the
        continuation of the input range:
        >>> list(train_filter(range(20, 40)))
        [20, 22, 23, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39]
        ... and some more.
        >>> list(train_filter(range(40, 50)))
        [41, 42, 44, 45, 46, 49]
        As the number of samples grows, the fraction in the train and test sets
        will approach `train_p` and `1 - train_p`.
        The test_filter fills in the gaps again.
        >>> list(test_filter(range(20, 30)))
        [21, 24, 25, 26]
        If you rerun the *same* test_filter on a fresh range, then the results will be different
        to the first time around:
        >>> list(test_filter(range(20)))
        [5, 10, 13, 17, 18]
        ... but if you regenerate the test_filter, it'll reproduce the original sequence
        >>> _, test_filter_regenerated = train_test_filter(train_p=0.6, seed=180)
        >>> list(test_filter_regenerated(range(20)))
        [1, 7, 8, 13, 14]
        It also works on tuple-valued lists:
        >>> from itertools import product
        >>> train_filter_tuple, test_filter_tuple = train_test_filter(train_p=0.3, seed=42)
        >>> list(test_filter_tuple(product(["a", "b"], [1, 2, 3])))
        [('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 3)]
        >>> list(train_filter_tuple(product(["a","b"], [1,2,3])))
        [('b', 2)]
        >>> from itertools import count, takewhile
        >>> train_filter_unbounded, test_filter_unbounded = train_test_filter(train_p=0.5, seed=21)
        >>> list(takewhile(lambda s: s < 90, count(79)))
        [79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
        >>> train_pool = train_filter_unbounded(count(79))
        >>> list(takewhile(lambda s: s < 90, train_pool))
        [82, 85, 86, 89]
        >>> test_pool = test_filter_unbounded(count(79))
        >>> list(takewhile(lambda s: s < 90, test_pool))
        [79, 80, 81, 83, 84, 87, 88]
        >>> list(takewhile(lambda s: s < 110, test_pool))
        [91, 93, 94, 97, 100, 105, 106, 109]
    """

    # Closed set of group labels, created via the Enum functional API.
    _TrainTest = Enum("_TrainTest", ["train", "test"])

    probabilities = (train_p, 1 - train_p)

    def _assignments():
        """Infinite pseudorandom stream of _TrainTest.train / _TrainTest.test labels.
        Each call re-seeds its own generator, so two streams built from the same
        `seed` produce identical label sequences (this is what makes the train
        and test filters exactly complementary)."""
        rng = np.random.default_rng(seed)
        while True:
            yield rng.choice([_TrainTest.train, _TrainTest.test], p=probabilities)

    def _make_filter(target):
        """Build a filter that keeps only values whose corresponding label in
        its private assignment stream equals `target`. The stream is created
        once per filter and advances lazily as values are consumed."""
        assignment_stream = _assignments()

        def _filtered(values):
            return (
                value
                for value, group in zip(values, assignment_stream)
                if group == target
            )

        return _filtered

    return _make_filter(_TrainTest.train), _make_filter(_TrainTest.test)
66 changes: 66 additions & 0 deletions autora/experimentalist/sampler/model_disagreement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import itertools
from typing import Iterable, List

import numpy as np


def model_disagreement_sampler(X: np.ndarray, models: List, num_samples: int = 1):
    """
    A sampler that returns selected samples for independent variables
    for which the models disagree the most in terms of their predictions.
    Args:
        X: pool of IV conditions to evaluate in terms of model disagreement
        models: List of Scikit-learn (regression or classification) models to compare
        num_samples: number of samples to select
    Returns: Sampled pool
    Raises:
        ValueError: if fewer than two models are supplied, or if a model pair
            produces predictions with mismatched shapes.
        AttributeError: if a model pair does not share a `predict_proba` or
            `predict` method.
    """

    # Validate early with an explicit exception instead of a post-loop assert
    # (asserts are stripped under `python -O` and give an unhelpful message).
    if len(models) < 2:
        raise ValueError("Must provide at least two models to compare.")

    # Materialize generic iterables (e.g. generators) so numpy indexing works below.
    if isinstance(X, Iterable):
        X = np.array(list(X))

    # Scikit-learn predictors expect 2D input of shape (n_samples, n_features).
    X_predict = np.array(X)
    if len(X_predict.shape) == 1:
        X_predict = X_predict.reshape(-1, 1)

    model_disagreement = list()

    # collect disagreements for each pairwise combination of models
    for model_a, model_b in itertools.combinations(models, 2):

        # determine the prediction method: prefer class probabilities when both
        # models expose them (classification), otherwise fall back to `predict`
        if hasattr(model_a, "predict_proba") and hasattr(model_b, "predict_proba"):
            model_a_predict = model_a.predict_proba
            model_b_predict = model_b.predict_proba
        elif hasattr(model_a, "predict") and hasattr(model_b, "predict"):
            model_a_predict = model_a.predict
            model_b_predict = model_b.predict
        else:
            raise AttributeError(
                "Models must both have `predict_proba` or `predict` method."
            )

        # get predictions from both models
        y_a = model_a_predict(X_predict)
        y_b = model_b_predict(X_predict)

        if y_a.shape != y_b.shape:
            raise ValueError("Models must have same output shape.")

        # determine the disagreement between the two models in terms of
        # per-condition squared error, averaged over output dimensions
        # when the predictions are multi-dimensional
        if len(y_a.shape) == 1:
            disagreement = (y_a - y_b) ** 2
        else:
            disagreement = np.mean((y_a - y_b) ** 2, axis=1)

        model_disagreement.append(disagreement)

    # sum up all pairwise model disagreements per condition
    summed_disagreement = np.sum(model_disagreement, axis=0)

    # sort the summed disagreements in descending order and select the top n
    idx = (-summed_disagreement).argsort()[:num_samples]

    return X[idx]
Loading

0 comments on commit aed52ab

Please sign in to comment.