Bug fix and more realistic test

usc-isi-i2 · Sep 26, 2017 · 2aeac30 · 2aeac30
1 parent bf43afb
commit 2aeac30
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 17 deletions.
diff --git a/dsbox/overfitdetector/detector.py b/dsbox/overfitdetector/detector.py
@@ -42,7 +42,7 @@ class Detector(SupervisedLearnerPrimitiveBase[Input, Output, Params]):
     def __init__(self, *, n_sample_instances: int = 500, n_sample_iterations: int = 100, columns: list = list(),
                  model: SupervisedLearnerPrimitiveBase = None) -> None:
         super().__init__()
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.DEBUG)
         self.__logger = logging.getLogger(__name__)
         self.n_sample_iterations = n_sample_iterations
         self.n_sample_instances = n_sample_instances
@@ -373,19 +373,33 @@ def find_matching_rows(self, row_values, columns, training_data, num_neighbors=1
 
                 matches = training_data
                 for (col, val) in zip(col_set, vals_for_col_set):
+                    tmp_matches = []
                     if self.is_number(val):
-                        lower_bound = float(val) * (1.0 - real_value_extend)
-                        upper_bound = float(val)*(1.0 + real_value_extend)
-                        matches = matches[matches[col] >= lower_bound]
-                        matches = matches[matches[col] <= upper_bound]
+                        if isinstance(val, float):
+                            lower_bound = float(val) * (1.0 - real_value_extend)
+                            upper_bound = float(val) * (1.0 + real_value_extend)
+                            tmp_matches_lwr = matches[matches[col] >= lower_bound]
+                            tmp_matches_upr = matches[matches[col] <= upper_bound]
+                            tmp_matches = pandas.concat([tmp_matches_lwr, tmp_matches_upr])
+                        else:
+                            tmp_matches = matches[matches[col] == val]
                     else:
                         if val.isalnum():
-                            matches = matches[matches[col] == '%s' % val]
+                            tmp_matches = matches[matches[col] == '%s' % val]
                         else:
-                            matches = matches[matches[col] == val]
+                            tmp_matches = matches[matches[col] == val]
+
+                    if len(tmp_matches) > 0:  # do the conjunction, but only as long as we have some data...
+                        matches = tmp_matches
+                    else:
+                        break
+
+                    self.__logger.debug("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))
+                    #print("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))
 
-                self.__logger.debug("Dataframe query: %s found %d matches." % (qry_string, len(matches)))
                 if len(matches) > 0:
+                    self.__logger.debug("Col set yielded %d matches total" % (len(matches)))
+                    #print("Col set yielded %d matches total" % (len(matches)))
                     all_matches = all_matches.append(matches)
 
             if len(all_matches) > 0:

diff --git a/tests/dsbox/overfit-detector/test_data/trainData.csv.gz b/tests/dsbox/overfit-detector/test_data/trainData.csv.gz
diff --git a/tests/dsbox/overfit-detector/test_data/trainTargets.csv.gz b/tests/dsbox/overfit-detector/test_data/trainTargets.csv.gz
diff --git a/tests/dsbox/overfit-detector/test_detector.py b/tests/dsbox/overfit-detector/test_detector.py
@@ -5,6 +5,7 @@
 import numpy as np
 from dsbox.overfitdetector.detector import Detector
 from sklearn.linear_model import LogisticRegression
+from sklearn import preprocessing
 import logging
 import math
 
@@ -13,7 +14,10 @@ class Detectorests(unittest.TestCase):
     def setUp(self):
         self.__detector = Detector()
         self.__dir_path = os.getcwd()
-        self.__detector.set_logger(logging.ERROR)
+        logging.basicConfig(level=logging.DEBUG)
+
+        self.__train_data_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainData.csv.gz"
+        self.__train_labels_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainTargets.csv.gz"
 
         datas = {
             "indep1": [],
@@ -48,15 +52,28 @@ def setUp(self):
         self.__test_df = pd.DataFrame(datas)
 
     def test_detector(self):
-        data = np.array([1., 2., 3., 4.])
-        labels = np.array(1)
-        for i in range(10):
-            data = np.vstack([data, [1., 2., 3., 4.]])
-            labels = np.append(labels, 1)
-        for i in range(10):
-            data = np.vstack([data, [2., 3., 4., 5.]])
-            labels = np.append(labels, 0)
 
+        #data = np.array([1., 2., 3., 4.])
+        #labels = np.array(1)
+        #for i in range(10):
+        #    data = np.vstack([data, [1., 2., 3., 4.]])
+        #    labels = np.append(labels, 1)
+        #for i in range(10):
+        #    data = np.vstack([data, [2., 3., 4., 5.]])
+        #    labels = np.append(labels, 0)
+
+        data = pd.read_csv(self.__train_data_file, header=0).fillna(0.0).replace('', '0')
+        del data['d3mIndex']
+        labels = pd.read_csv(self.__train_labels_file, header=0).fillna(0.0).replace('', '0')['Hall_of_Fame']
+
+        # Label-encode the categorical columns ('Player', 'Position') of the training data;
+        # the target column ('Hall_of_Fame') is used as read from the file and is not encoded here
+        trainDataLabelEncoders = dict()
+        for col in ['Player', 'Position']:
+            trainDataLabelEncoders[col] = preprocessing.LabelEncoder().fit(data[col])
+            data[col] = trainDataLabelEncoders[col].transform(data[col])
+
+        # Train the model
         mdl = LogisticRegression().fit(data, labels)
         dd = Detector(n_sample_instances=10, n_sample_iterations=20, columns=['0', '1', '2', '3'], model=mdl)
         dd.set_training_data(inputs=data, outputs=labels)