-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQueriesCommittee.py
151 lines (99 loc) · 4.82 KB
/
QueriesCommittee.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from collections import Counter
from typing import Callable, Union
import numpy as np
from scipy.stats import entropy
from sklearn.exceptions import NotFittedError
from random import shuffle
from sklearn.metrics import pairwise_distances
import CommitteeClass
from Arguments import multi_argmax, shuffled_argmax
from CommitteeClass import CommitteeRegressor
from scipy.spatial.distance import cosine, euclidean
def vote_entropy(committee: 'CommitteeClass', X_val, **predict_proba_kwargs):
    """Compute the vote-entropy disagreement of the committee for each sample.

    For every sample, counts how the committee members' predicted labels are
    distributed over ``committee.classes_`` and returns the entropy of that
    empirical distribution (higher = more disagreement).

    :param committee: fitted committee exposing ``vote()``, ``classes_`` and
        ``__len__``. NOTE(review): ``vote()`` is called without ``X_val`` —
        presumably the committee caches the evaluation pool; confirm.
    :param X_val: samples being scored; only ``X_val.shape[0]`` is used here.
    :param predict_proba_kwargs: accepted for interface symmetry; unused.
    :return: 1-D array of length ``X_val.shape[0]`` with the vote entropy per
        sample, or ``None`` if the committee is not fitted.
    """
    n_learners = len(committee)
    try:
        votes = committee.vote()
    except NotFittedError:
        # Best-effort error path kept from the original: report and return None.
        return print('There was an error in the vote function from CommitteeClass')
    # BUG FIX: the second dimension must be the number of classes, not the
    # number of learners -- the loop below indexes by class index and raised
    # IndexError whenever there were more classes than committee members.
    p_vote = np.zeros(shape=(X_val.shape[0], len(committee.classes_)))
    for vote_idx, vote in enumerate(votes):
        vote_counter = Counter(vote)
        for class_idx, class_label in enumerate(committee.classes_):
            # Fraction of learners that voted for this class on this sample.
            p_vote[vote_idx, class_idx] = vote_counter[class_label] / n_learners
    entr = entropy(p_vote, axis=1)
    return entr
def vote_entropy_sampling(committee: CommitteeClass, X_val, n_instances: int = 1,
                          random_tie_break=False, **predict_proba_kwargs):
    """Pick the ``n_instances`` samples with the highest vote entropy.

    :param committee: committee passed through to :func:`vote_entropy`.
    :param X_val: candidate samples to score.
    :param n_instances: number of sample indices to return.
    :param random_tie_break: when True, shuffle before the argmax so equal
        scores are broken at random.
    :param predict_proba_kwargs: forwarded to :func:`vote_entropy`.
    :return: the indices selected by ``multi_argmax`` / ``shuffled_argmax``.
    """
    scores = vote_entropy(committee, X_val, **predict_proba_kwargs)
    if random_tie_break:
        return shuffled_argmax(scores, n_instances=n_instances)
    return multi_argmax(scores, n_instances=n_instances)
def KLMaxDisagreement(committee: CommitteeClass, X_val, **predict_proba_kwargs):
    """Max-disagreement score: largest KL divergence of any member from consensus.

    Each member's class-probability prediction is compared (KL divergence)
    against the consensus (the mean over members); the per-sample score is
    the maximum divergence across members.

    :param committee: fitted committee exposing ``vote_proba()`` (expected
        shape ``(n_samples, n_learners, n_classes)`` given the indexing
        below), ``__len__`` and iteration.
    :param X_val: samples being scored; only ``X_val.shape[0]`` is used here.
    :param predict_proba_kwargs: accepted for interface symmetry; unused.
    :return: 1-D array of per-sample max KL divergences, or ``None`` if the
        committee is not fitted.
    """
    try:
        member_probas = committee.vote_proba()
    except NotFittedError:
        return print('There was an error in the vote_proba function from CommitteeClass')
    # Consensus distribution: average the members' probabilities per sample.
    consensus_t = np.transpose(np.mean(member_probas, axis=1))
    divergences = np.zeros(shape=(X_val.shape[0], len(committee)))
    for member_idx, _ in enumerate(committee):
        # entropy(pk, qk) computes KL(pk || qk) column-wise.
        divergences[:, member_idx] = entropy(
            np.transpose(member_probas[:, member_idx, :]), qk=consensus_t
        )
    return np.max(divergences, axis=1)
def max_disagreement_sampling(committee: CommitteeClass, X_val, n_instances: int = 1,
                              random_tie_break=False, **disagreement_measure_kwargs):
    """Pick the ``n_instances`` samples with the highest KL max-disagreement.

    :param committee: committee passed through to :func:`KLMaxDisagreement`.
    :param X_val: candidate samples to score.
    :param n_instances: number of sample indices to return.
    :param random_tie_break: when True, shuffle before the argmax so equal
        scores are broken at random.
    :param disagreement_measure_kwargs: forwarded to :func:`KLMaxDisagreement`.
    :return: the indices selected by ``multi_argmax`` / ``shuffled_argmax``.
    """
    scores = KLMaxDisagreement(committee, X_val, **disagreement_measure_kwargs)
    if random_tie_break:
        return shuffled_argmax(scores, n_instances=n_instances)
    return multi_argmax(scores, n_instances=n_instances)
def consensus_entropy(committee: 'CommitteeClass', X_val, **predict_kwargs):
    """Entropy of the committee's consensus class-probability prediction.

    :param committee: fitted committee exposing ``predict_proba``.
    :param X_val: samples to score, forwarded to ``predict_proba``.
    :param predict_kwargs: forwarded to ``predict_proba``.
    :return: per-sample entropy of the consensus probabilities, or ``None``
        if the committee is not fitted.
    """
    try:
        proba = committee.predict_proba(X_val, **predict_kwargs)
    except NotFittedError:
        # BUG FIX: the original printed and then fell through to use the
        # unbound `proba`, raising NameError. Return after reporting, matching
        # the best-effort error behavior of the sibling disagreement helpers.
        return print('Issue with predict proba of the committees')
    # entropy() works column-wise, so transpose to get one entropy per sample.
    entr = np.transpose(entropy(np.transpose(proba)))
    return entr
def consensus_entropy_sampling(committee: CommitteeClass, X_val, n_instances: int = 1, random_tie_break=False,
                               **disagreement_measure_kwargs):
    """Pick the ``n_instances`` samples with the highest consensus entropy.

    :param committee: committee passed through to :func:`consensus_entropy`.
    :param X_val: candidate samples to score.
    :param n_instances: number of sample indices to return.
    :param random_tie_break: when True, shuffle before the argmax so equal
        scores are broken at random.
    :param disagreement_measure_kwargs: forwarded to :func:`consensus_entropy`.
    :return: the indices selected by ``multi_argmax`` / ``shuffled_argmax``.
    """
    scores = consensus_entropy(committee, X_val, **disagreement_measure_kwargs)
    if random_tie_break:
        return shuffled_argmax(scores, n_instances=n_instances)
    return multi_argmax(scores, n_instances=n_instances)
def get_cluster_samples(data, num_clusters: int = 5, max_epoch: int = 5, limit: int = 5000):
    """Cosine-cluster ``data`` and return a representative sample set.

    Builds ``num_clusters`` clusters, refines assignments for up to
    ``max_epoch`` passes (stopping early once no item changes cluster), then
    returns the concatenation of the clusters' centroid items, outlier items
    and 3 random items per cluster.

    NOTE(review): ``CosineClusters`` is not imported anywhere in this file —
    as written, calling this function raises NameError; confirm the missing
    import. ``limit`` is currently unused because the subsampling code below
    is commented out.
    """
    #if limit > 0:
    # shuffle(data)
    # data = data[:limit]
    cosine_clusters = CosineClusters(num_clusters)
    cosine_clusters.add_random_training_items(data)
    # Reassign items until assignments stabilize or max_epoch is reached.
    for i in range(0, max_epoch):
        print("Epoch " + str(i))
        added = cosine_clusters.add_items_to_best_cluster(data)
        if added == 0:
            break
    centroids = cosine_clusters.get_centroids()
    outliers = cosine_clusters.get_outliers()
    randoms = cosine_clusters.get_randoms(3, verbose=True)
    return centroids + outliers + randoms
def similarize_distance(distance_measure: Callable) -> Callable:
    """Wrap a distance function into a similarity function.

    The returned callable forwards its arguments to ``distance_measure`` and
    maps the resulting distance ``d`` to ``1 / (1 + d)``: distance 0 yields
    similarity 1, and larger distances approach 0.
    """
    def as_similarity(*args, **kwargs):
        d = distance_measure(*args, **kwargs)
        return 1 / (1 + d)
    return as_similarity
# Ready-made similarity versions of the scipy distance measures:
# similarity = 1 / (1 + distance), so identical vectors score 1.0.
cosine_similarity = similarize_distance(cosine)
euclidean_similarity = similarize_distance(euclidean)
def information_density(X, metric: Union[str, Callable] = 'euclidean') -> np.ndarray:
    """Mean pairwise similarity of each sample in ``X`` to the whole set.

    Converts the full pairwise distance matrix to similarities via
    ``1 / (1 + d)`` and averages each row, so samples in dense regions get
    higher scores.

    :param X: samples accepted by ``sklearn.metrics.pairwise_distances``.
    :param metric: distance metric name or callable, forwarded unchanged.
    :return: 1-D array with one density score per sample.
    """
    distances = pairwise_distances(X, X, metric=metric)
    similarity_mtx = 1 / (1 + distances)
    return similarity_mtx.mean(axis=1)
def max_std_sampling(regressor: CommitteeRegressor, X_unlabeled: np.ndarray, n_instances: int = 1,
                     random_tie_break = False, **predict_kwargs):
    '''
    Regressor standard deviation sampling strategy: select the samples whose
    committee predictions have the largest standard deviation.
    :param regressor: committee regressor supporting ``predict(..., return_std=True)``.
    :param X_unlabeled: pool of candidate samples.
    :param n_instances: number of sample indices to return.
    :param random_tie_break: when True, shuffle before the argmax so equal
        scores are broken at random.
    :param predict_kwargs: forwarded to ``regressor.predict``.
    :return: tuple of (selected indices, per-sample standard deviations).
    '''
    _, spread = regressor.predict(X_unlabeled, return_std=True, **predict_kwargs)
    # Flatten to one value per sample regardless of the predictor's output shape.
    spread = spread.reshape(X_unlabeled.shape[0])
    if random_tie_break:
        return shuffled_argmax(spread, n_instances=n_instances), spread
    return multi_argmax(spread, n_instances=n_instances), spread