Skip to content

Commit

Permalink
Bug fix and more realistic test
Browse files (browse the repository at this point in the history)
  • Loading branch information
mmichelsonIF committed Sep 26, 2017
1 parent bf43afb commit 2aeac30
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 17 deletions.
30 changes: 22 additions & 8 deletions dsbox/overfitdetector/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Detector(SupervisedLearnerPrimitiveBase[Input, Output, Params]):
def __init__(self, *, n_sample_instances: int = 500, n_sample_iterations: int = 100, columns: list = list(),
model: SupervisedLearnerPrimitiveBase = None) -> None:
super().__init__()
logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.DEBUG)
self.__logger = logging.getLogger(__name__)
self.n_sample_iterations = n_sample_iterations
self.n_sample_instances = n_sample_instances
Expand Down Expand Up @@ -373,19 +373,33 @@ def find_matching_rows(self, row_values, columns, training_data, num_neighbors=1

matches = training_data
for (col, val) in zip(col_set, vals_for_col_set):
tmp_matches = []
if self.is_number(val):
lower_bound = float(val) * (1.0 - real_value_extend)
upper_bound = float(val)*(1.0 + real_value_extend)
matches = matches[matches[col] >= lower_bound]
matches = matches[matches[col] <= upper_bound]
if isinstance(val, float):
lower_bound = float(val) * (1.0 - real_value_extend)
upper_bound = float(val) * (1.0 + real_value_extend)
tmp_matches_lwr = matches[matches[col] >= lower_bound]
tmp_matches_upr = matches[matches[col] <= upper_bound]
tmp_matches = pandas.concat([tmp_matches_lwr, tmp_matches_upr])
else:
tmp_matches = matches[matches[col] == val]
else:
if val.isalnum():
matches = matches[matches[col] == '%s' % val]
tmp_matches = matches[matches[col] == '%s' % val]
else:
matches = matches[matches[col] == val]
tmp_matches = matches[matches[col] == val]

if len(tmp_matches) > 0: # do the conjunction, but only as long as we have some data...
matches = tmp_matches
else:
break

self.__logger.debug("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))
#print("Dataframe query: (%s, %s) found %d matches." % (col, str(val), len(matches)))

self.__logger.debug("Dataframe query: %s found %d matches." % (qry_string, len(matches)))
if len(matches) > 0:
self.__logger.debug("Col set yielded %d matches total" % (len(matches)))
#print("Col set yielded %d matches total" % (len(matches)))
all_matches = all_matches.append(matches)

if len(all_matches) > 0:
Expand Down
Binary file not shown.
Binary file not shown.
35 changes: 26 additions & 9 deletions tests/dsbox/overfit-detector/test_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from dsbox.overfitdetector.detector import Detector
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import logging
import math

Expand All @@ -13,7 +14,10 @@ class Detectorests(unittest.TestCase):
def setUp(self):
self.__detector = Detector()
self.__dir_path = os.getcwd()
self.__detector.set_logger(logging.ERROR)
logging.basicConfig(level=logging.DEBUG)

self.__train_data_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainData.csv.gz"
self.__train_labels_file = self.__dir_path+"/tests/dsbox/overfit-detector/test_data/trainTargets.csv.gz"

datas = {
"indep1": [],
Expand Down Expand Up @@ -48,15 +52,28 @@ def setUp(self):
self.__test_df = pd.DataFrame(datas)

def test_detector(self):
data = np.array([1., 2., 3., 4.])
labels = np.array(1)
for i in range(10):
data = np.vstack([data, [1., 2., 3., 4.]])
labels = np.append(labels, 1)
for i in range(10):
data = np.vstack([data, [2., 3., 4., 5.]])
labels = np.append(labels, 0)

#data = np.array([1., 2., 3., 4.])
#labels = np.array(1)
#for i in range(10):
# data = np.vstack([data, [1., 2., 3., 4.]])
# labels = np.append(labels, 1)
#for i in range(10):
# data = np.vstack([data, [2., 3., 4., 5.]])
# labels = np.append(labels, 0)

data = pd.read_csv(self.__train_data_file, header=0).fillna(0.0).replace('', '0')
del data['d3mIndex']
labels = pd.read_csv(self.__train_labels_file, header=0).fillna(0.0).replace('', '0')['Hall_of_Fame']

# Encode the categorical data in training data
# Encode the categorical data in the test targets, uses the first target of the dataset as a target
trainDataLabelEncoders = dict()
for col in ['Player', 'Position']:
trainDataLabelEncoders[col] = preprocessing.LabelEncoder().fit(data[col])
data[col] = trainDataLabelEncoders[col].transform(data[col])

# Train the model
mdl = LogisticRegression().fit(data, labels)
dd = Detector(n_sample_instances=10, n_sample_iterations=20, columns=['0', '1', '2', '3'], model=mdl)
dd.set_training_data(inputs=data, outputs=labels)
Expand Down

0 comments on commit 2aeac30

Please sign in to comment.