ntucllab · CoolJosh0221 · Oct 26, 2024 · Nov 16, 2024 · Dec 21, 2024 · Dec 22, 2024
diff --git a/.gitignore b/.gitignore
@@ -61,3 +61,9 @@ target/
 examples/australian.txt
 examples/diabetes.txt
 examples/heart.txt
+
+libact/query_strategies/_hintsvm.c
+.devcontainer/
+.github/dependabot.yml
+
+temp/
diff --git a/README.md b/README.md
@@ -21,16 +21,16 @@ Comments and questions on the package is welcomed at `libact-users@googlegroups.
 
 # Basic Dependencies
 
-* Python 2.7, 3.3, 3.4, 3.5, 3.6
+* Python 2.7, 3.3, 3.4, 3.5, 3.6, 3.10
 
-* Python dependencies
+* Debian (>= 7) / Ubuntu (>= 14.04)
 ```
-pip install -r requirements.txt
+sudo apt-get install build-essential gfortran libatlas-base-dev liblapacke-dev python3-dev
 ```
 
-* Debian (>= 7) / Ubuntu (>= 14.04)
+* Python dependencies
 ```
-sudo apt-get install build-essential gfortran libatlas-base-dev liblapacke-dev python3-dev
+pip install -r requirements.txt
 ```
 
 * Arch
@@ -66,9 +66,10 @@ python setup.py install --user
 ```
 
 To build and install from source for all users on Unix/Linux:
+
+**(This is the recommended method for Python 3.10 users)**
 ```
-python setup.py build
-sudo python setup.py install
+pip install -e .
 ```
 
 ## Installation Options
@@ -154,4 +155,4 @@ If you find this package useful, please cite the original works (see Reference o
 
 # Acknowledgments
 
-The authors thank Chih-Wei Chang and other members of the [Computational Learning Lab](https://learner.csie.ntu.edu.tw/) at National Taiwan University for valuable discussions and various contributions to making this package better.
+The authors thank Chih-Wei Chang and other members of the [Computational Learning Lab](https://learner.csie.ntu.edu.tw/) at National Taiwan University for valuable discussions and various contributions to making this package better.
diff --git a/examples/plot.py b/examples/plot.py
@@ -10,15 +10,12 @@
 
 import numpy as np
 import matplotlib.pyplot as plt
-try:
-    from sklearn.model_selection import train_test_split
-except ImportError:
-    from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 
 # libact classes
 from libact.base.dataset import Dataset, import_libsvm_sparse
 from libact.models import LogisticRegression
-from libact.query_strategies import RandomSampling, UncertaintySampling
+from libact.query_strategies import RandomSampling, UncertaintySampling, VarianceReduction, HintSVM
 from libact.labelers import IdealLabeler
 
 
@@ -64,6 +61,8 @@ def main():
     trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
         split_train_test(dataset_filepath, test_size, n_labeled)
     trn_ds2 = copy.deepcopy(trn_ds)
+    trn_ds3 = copy.deepcopy(trn_ds)
+    trn_ds4 = copy.deepcopy(trn_ds)
     lbr = IdealLabeler(fully_labeled_trn_ds)
 
     quota = len(y_train) - n_labeled    # number of samples to query
@@ -78,20 +77,33 @@ def main():
     model = LogisticRegression()
     E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
 
+    # qs3 = VarianceReduction(trn_ds3, model=LogisticRegression())
+    # E_in_3, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
+
+    qs4 = HintSVM(trn_ds4)
+    E_in_4, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
+
     # Plot the learning curves of UncertaintySampling, RandomSampling and
     # HintSVM. The x-axis is the number of queries, and the y-axis is the
     # corresponding error rate (dashed: Ein, solid: Eout).
     query_num = np.arange(1, quota + 1)
-    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
-    plt.plot(query_num, E_in_2, 'r', label='random Ein')
-    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
-    plt.plot(query_num, E_out_2, 'k', label='random Eout')
+    plt.plot(query_num, E_in_1, 'b', label='qs Ein',
+             linestyle='dashed')
+    plt.plot(query_num, E_out_1, 'b', label='qs Eout')
+    plt.plot(query_num, E_in_2, 'r', label='random Ein',
+             linestyle='dashed')
+    plt.plot(query_num, E_out_2, 'r', label='random Eout')
+    # plt.plot(query_num, E_in_3, 'g', label='vr Ein',  linestyle='dashed')
+    # plt.plot(query_num, E_out_3, 'g', label='vr Eout')
+    plt.plot(query_num, E_in_4, 'k', label='SVM Ein',
+             linestyle='dashed')
+    plt.plot(query_num, E_out_4, 'k', label='SVM Eout')
     plt.xlabel('Number of Queries')
     plt.ylabel('Error')
     plt.title('Experiment Result')
     plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                fancybox=True, shadow=True, ncol=5)
-    plt.show()
+    plt.savefig("test.png")
 
 
 if __name__ == '__main__':

diff --git a/libact/base/dataset.py b/libact/base/dataset.py
@@ -34,12 +34,14 @@ class Dataset(object):
     """
 
     def __init__(self, X=None, y=None):
-        if X is None: X = np.array([])
+        if X is None:
+            X = np.array([])
         elif not isinstance(X, sp.csr_matrix):
             X = np.array(X)
 
-        if y is None: y = []
-        y = np.array(y)
+        if y is None:
+            y = []
+        y = np.array(y, dtype=object)
 
         self._X = X
         self._y = y
@@ -123,7 +125,7 @@ def append(self, feature, label=None):
         """
         if isinstance(self._X, np.ndarray):
             self._X = np.vstack([self._X, feature])
-        else: # sp.csr_matrix
+        else:  # sp.csr_matrix
             self._X = sp.vstack([self._X, feature])
         self._y = np.append(self._y, label)
 
@@ -218,7 +220,7 @@ def labeled_uniform_sample(self, sample_size, replace=True):
         sample_size
         """
         idx = np.random.choice(np.where(self.get_labeled_mask())[0],
-                               size=sample_size, replace=replace )
+                               size=sample_size, replace=replace)
         return Dataset(self._X[idx], self._y[idx])
 
 

diff --git a/libact/query_strategies/_hintsvm.pyx b/libact/query_strategies/_hintsvm.pyx
@@ -1,9 +1,9 @@
 #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 
-import  numpy as np
+import numpy as np
 cimport numpy as np
 from libc.stdlib cimport free
-cimport _hintsvm
+from . cimport _hintsvm
 
 cdef extern from *:
     ctypedef struct svm_parameter:

diff --git a/libact/query_strategies/density_weighted_meta.py b/libact/query_strategies/density_weighted_meta.py
@@ -86,13 +86,12 @@ def __init__(self, dataset, base_query_strategy, similarity_metric=None,
         else:
             self.clustering_method = KMeans(
                 n_clusters=5, random_state=self.random_state_)
-        
+
         if similarity_metric is not None:
             self.similarity_metric = similarity_metric
         else:
             self.similarity_metric = cosine_similarity
 
-
     @inherit_docstring_from(QueryStrategy)
     def update(self, entry_id, label):
         pass
@@ -104,7 +103,7 @@ def _get_scores(self):
         scores = self.base_query_strategy._get_scores()
         _, X_pool = dataset.get_unlabeled_entries()
         unlabeled_entry_ids, base_scores = zip(*scores)
-        
+
         self.clustering_method.fit(X)
         pool_cluster = self.clustering_method.predict(X_pool)
         cluster_center = self.clustering_method.cluster_centers_
@@ -126,6 +125,7 @@ def make_query(self):
         dataset = self.dataset
 
         unlabeled_entry_ids, scores = zip(*self._get_scores())
-        ask_id = self.random_state_.choice(np.where(scores == np.max(scores))[0])
+        ask_id = self.random_state_.choice(
+            np.where(scores == np.max(scores))[0])
 
-        return unlabeled_entry_ids[ask_id]
+        return unlabeled_entry_ids[ask_id]
diff --git a/libact/query_strategies/density_weighted_uncertainty_sampling.py b/libact/query_strategies/density_weighted_uncertainty_sampling.py
@@ -101,7 +101,7 @@ def __init__(self, *args, **kwargs):
         dis = np.zeros((len(all_x), self.n_clusts))
         for i in range(self.n_clusts):
             dis[:, i] = np.exp(-np.einsum('ij,ji->i', (all_x - centers[i]),
-                (all_x - centers[i]).T) / 2 / self.sigma)
+                                          (all_x - centers[i]).T) / 2 / self.sigma)
 
         # EM procedure to estimate the prior
         for _ in range(self.max_iter):
@@ -153,6 +153,7 @@ def make_query(self):
 
         return unlabeled_entry_ids[ask_id]
 
+
 class DensityWeightedLogisticRegression(object):
     """Density Weighted Logistic Regression
 
@@ -203,15 +204,16 @@ def __init__(self, density_estimate, centers, C):
 
     def _likelihood(self, w, X, y):
         w = w.reshape(-1, 1)
-        sigmoid = lambda t: 1. / (1. + np.exp(-t))
+        def sigmoid(t): return 1. / (1. + np.exp(-t))
         # w --> shape = (d+1, 1)
-        L = lambda w: (self.C/2. * np.dot(w[:-1].T, w[:-1]) - \
-                np.sum(np.log(
-                    np.sum(self.density *
-                        sigmoid(np.dot(y,
-                                       (np.dot(self.centers, w[:-1]) + w[-1]).T)
-                        ), axis=1)
-                ), axis=0))[0][0]
+
+        def L(w): return (self.C/2. * np.dot(w[:-1].T, w[:-1]) -
+                          np.sum(np.log(
+                              np.sum(self.density *
+                                     sigmoid(np.dot(y,
+                                                    (np.dot(self.centers, w[:-1]) + w[-1]).T)
+                                             ), axis=1)
+                          ), axis=0))[0][0]
 
         return L(w)
 
@@ -235,7 +237,7 @@ def predict(self):
 
         """
         if self.w_ is not None:
-            sigmoid = lambda t: 1. / (1. + np.exp(-t))
+            def sigmoid(t): return 1. / (1. + np.exp(-t))
             return sigmoid(np.dot(self.centers, self.w_[:-1]) + self.w_[-1])
         else:
             # TODO the model is not trained

diff --git a/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py b/libact/query_strategies/multilabel/cost_sensitive_reference_pair_encoding.py
@@ -73,7 +73,8 @@ class CostSensitiveReferencePairEncoding(QueryStrategy):
 
     def __init__(self, dataset, scoring_fn, model, base_model, n_models=100,
                  n_jobs=1, random_state=None):
-        super(CostSensitiveReferencePairEncoding, self).__init__(dataset=dataset)
+        super(CostSensitiveReferencePairEncoding,
+              self).__init__(dataset=dataset)
 
         self.model_ = model
         self.csrpe_ = CSRPE(scoring_fn=scoring_fn, base_clf=base_model,
@@ -94,12 +95,12 @@ def make_query(self):
         Z = self.csrpe_.predicted_code(X_pool)
         predZ = self.csrpe_.encode(predY)
 
-        dist = paired_distances(Z, predZ, metric=hamming) # z1 z2
-        dist2 = self.csrpe_.predict_dist(X_pool) # z1 zt
-        #dist3 = self.csrpe.distance(predZ) # z2 zt
+        dist = paired_distances(Z, predZ, metric=hamming)  # z1 z2
+        dist2 = self.csrpe_.predict_dist(X_pool)  # z1 zt
+        # dist3 = self.csrpe.distance(predZ) # z2 zt
 
         dist = dist + dist2
-        #dist = dist + dist3
+        # dist = dist + dist3
 
         ask_id = self.random_state_.choice(
             np.where(np.isclose(dist, np.max(dist)))[0])
@@ -127,8 +128,10 @@ def train(self, X, y):
         self.n_samples = np.shape(X)[0]
         self.n_labels = np.shape(y)[1]
 
-        score0 = self.scoring_fn(y, np.tile(self.rep_label[0], (self.n_samples, 1)))
-        score1 = self.scoring_fn(y, np.tile(self.rep_label[1], (self.n_samples, 1)))
+        score0 = self.scoring_fn(y, np.tile(
+            self.rep_label[0], (self.n_samples, 1)))
+        score1 = self.scoring_fn(y, np.tile(
+            self.rep_label[1], (self.n_samples, 1)))
         lbl = (((score1 - score0) > 0) + 0.0)
 
         weight = np.abs(score1 - score0)
@@ -153,8 +156,8 @@ def __init__(self, scoring_fn, base_clf, n_clfs, n_jobs,
                  metric='euclidean', random_state=None):
         self.scoring_fn = scoring_fn
         self.base_clf = base_clf
-        self.nn_ = NearestNeighbors(1, algorithm='ball_tree',
-                metric=metric, n_jobs=n_jobs)
+        self.nn_ = NearestNeighbors(n_neighbors=1, algorithm='ball_tree',
+                                    metric=metric, n_jobs=n_jobs)
         self.n_clfs = n_clfs
         self.random_state_ = seed_random_state(random_state)
 
@@ -224,4 +227,4 @@ def predict_dist(self, X):
         encoded = self.predicted_code(X)
         dist, _ = self.nn_.kneighbors(encoded, 1, return_distance=True)
         dist = dist.reshape(-1)
-        return dist
+        return dist
diff --git a/libact/query_strategies/tests/test_density_weighted_meta.py b/libact/query_strategies/tests/test_density_weighted_meta.py
@@ -24,7 +24,8 @@ def setUp(self):
         self.quota = 10
 
     def test_density_weighted_meta_uncertainty_lc(self):
-        trn_ds = Dataset(self.X[:20], np.concatenate([self.y[:6], [None] * 14]))
+        trn_ds = Dataset(self.X[:20], np.concatenate(
+            [self.y[:6], [None] * 14]))
         base_qs = UncertaintySampling(
             trn_ds, method='lc',
             model=LogisticRegression(solver='liblinear', multi_class="ovr"))
@@ -37,7 +38,8 @@ def test_density_weighted_meta_uncertainty_lc(self):
             beta=1.0, random_state=1126)
         model = LogisticRegression(solver='liblinear', multi_class="ovr")
         qseq = run_qs(trn_ds, qs, self.y, self.quota)
-        assert_array_equal(qseq, np.array([13, 18,  9, 12,  8, 16, 10, 19, 15, 17]))
+        assert_array_equal(qseq, np.array(
+            [18, 13,  9, 12,  8, 16, 10, 19, 15,  7]))
 
 
 if __name__ == '__main__':

diff --git a/libact/query_strategies/tests/test_realdata.py b/libact/query_strategies/tests/test_realdata.py
@@ -153,7 +153,7 @@ def test_DensityWeightedUncertaintySampling(self):
         qs = DWUS(trn_ds, random_state=1126)
         qseq = run_qs(trn_ds, qs, self.y, self.quota)
         assert_array_equal(
-            qseq, np.array([30, 179, 104, 186, 28, 65, 142, 62, 257, 221]))
+            qseq, np.array([257, 220, 179,  84, 208,  70, 245,  62,  50,  69]))
 
 
 if __name__ == '__main__':

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
-setuptools
-numpy
-scipy
-scikit-learn>=0.24
-matplotlib
-Cython
-joblib
+setuptools==75.8.0
+numpy==2.2.2
+scipy==1.15.1
+scikit-learn==1.6.1
+matplotlib==3.10.0
+Cython==3.0.11
+joblib==1.4.2
diff --git a/setup.py b/setup.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python
 
-from io import open # python 2 compatibility
+from io import open  # python 2 compatibility
 import os
 from setuptools import setup, Extension
 import sys
 
 BUILD_HINTSVM = int(os.environ.get("LIBACT_BUILD_HINTSVM", 1))
-BUILD_VARIANCE_REDUCTION = int(os.environ.get("LIBACT_BUILD_VARIANCE_REDUCTION", 1))
+BUILD_VARIANCE_REDUCTION = int(os.environ.get(
+    "LIBACT_BUILD_VARIANCE_REDUCTION", 1))
 
 
 on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
@@ -85,11 +86,12 @@
     name='libact',
     version='0.1.6',
     description='Pool-based active learning in Python',
-    long_description=open('README.md', 'r', newline='', encoding='utf-8').read(),
+    long_description=open('README.md', 'r', newline='',
+                          encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     author='Y.-Y. Yang, S.-C. Lee, Y.-A. Chung, T.-E. Wu, H.-T. Lin',
     author_email='[email protected], [email protected], '
-        '[email protected], [email protected], [email protected]',
+    '[email protected], [email protected], [email protected]',
     url='https://github.com/ntucllab/libact',
     cmdclass=cmdclasses,
     setup_requires=setup_requires,