-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_model.py
69 lines (55 loc) · 2.67 KB
/
test_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import email
import sklearn.pipeline
import sklearn.feature_extraction.text
import sklearn.tree
import sklearn.decomposition
import features
import pandas
import numpy as np
# Wraps any function as a transformer that maps it over every sample
class FunctionMapper(sklearn.pipeline.BaseEstimator, sklearn.pipeline.TransformerMixin):
    """Stateless transformer that applies ``function`` to every sample.

    Parameters
    ----------
    function : callable
        Called once per element of the input; its return values become the
        transformed samples.
    """

    def __init__(self, function):
        self.function = function

    def fit(self, x, y=None):
        """Learn nothing; return ``self`` so the step can be used in a pipeline."""
        return self

    def transform(self, x, y=None):
        """Return a list holding ``function(sample)`` for each sample in ``x``."""
        # Build a real list instead of returning ``map(...)``: on Python 3
        # ``map`` is a one-shot iterator, so a downstream step that walks the
        # data more than once (e.g. the branches of a FeatureUnion) would see
        # an empty sequence after the first pass.
        return [self.function(sample) for sample in x]
# Maps a feature function over every sample and vectorizes the results with a DictVectorizer
class FunctionTransformer(sklearn.pipeline.BaseEstimator, sklearn.pipeline.TransformerMixin):
    """Apply a feature function to every sample and vectorize the results.

    Parameters
    ----------
    function : callable
        Called once per sample. Its result is fed to a ``DictVectorizer``, so
        it is expected to return a mapping of feature name to value.

    Attributes
    ----------
    model : sklearn.feature_extraction.DictVectorizer
        The vectorizer that turns the per-sample results into a matrix. Its
        vocabulary is learned in ``fit`` / ``fit_transform``.
    """

    def __init__(self, function):
        self.function = function
        self.model = sklearn.feature_extraction.DictVectorizer()

    def _extract(self, x):
        """Return a list with ``function`` applied to each sample in ``x``."""
        return [self.function(sample) for sample in x]

    def fit(self, x, y=None):
        """Learn the feature vocabulary from ``x`` and return ``self``."""
        self.model.fit(self._extract(x), y)
        return self

    def fit_transform(self, x, y=None, **fit_params):
        """Learn the vocabulary from ``x`` and return its feature matrix.

        Equivalent to ``fit(x, y).transform(x)`` but runs ``function`` over
        the samples only once.
        """
        return self.model.fit_transform(self._extract(x), y)

    def transform(self, x, y=None):
        """Return the feature matrix for ``x`` using the fitted vocabulary.

        ``fit`` (or ``fit_transform``) must have been called first.
        """
        # Use ``transform`` rather than ``fit_transform`` here: re-fitting the
        # vectorizer on every call would give held-out data its own columns,
        # which need not match the columns the downstream model was trained on.
        return self.model.transform(self._extract(x))
if __name__ == '__main__':
    # These submodules are used below but are not imported at the top of the
    # file; import them explicitly instead of relying on another sklearn
    # import having loaded them as a side effect.
    import sklearn.cross_validation
    import sklearn.preprocessing

    # Parse each raw message, build several feature groups side by side,
    # fill in missing values and train a decision tree on the result.
    pipeline = sklearn.pipeline.Pipeline([
        ('transform_email', FunctionMapper(email.message_from_string)),
        ('generate_features', sklearn.pipeline.FeatureUnion([
            ('content_type_features', FunctionTransformer(features.generate_content_type)),
            ('email_counts_features', FunctionTransformer(features.generate_email_counts)),
            ('case_ratio_features', FunctionTransformer(features.generate_upper_to_lower_case_ratios)),
            ('email_chain_features', FunctionTransformer(features.generate_subject_is_chain)),
            ('link_features', FunctionTransformer(features.generate_number_of_links)),
            ('mailing_list_features', FunctionTransformer(features.generate_is_mailing_list)),
            # Bag of words: stringify the parsed message, TF-IDF it, then
            # reduce the sparse matrix to 200 components.
            ('bag_of_words_features', sklearn.pipeline.Pipeline([
                ('extract_payload', FunctionMapper(str)),
                ('generate_bow', sklearn.feature_extraction.text.TfidfVectorizer()),
                ('pca', sklearn.decomposition.TruncatedSVD(n_components=200)),
            ])),
        ], n_jobs=2)),
        ('replace_nans', sklearn.preprocessing.Imputer(missing_values='NaN', strategy='mean')),
        ('train_tree', sklearn.tree.DecisionTreeClassifier()),
    ])

    # Load processed data
    dataset = pandas.read_msgpack('./data/development.msg', encoding='latin-1')

    # TODO: This subsampling (keep roughly 10% of the rows, unseeded) is just
    # for fast iteration while testing
    mask = np.random.rand(len(dataset)) < 0.1
    dataset = dataset[mask]

    # Separate features and labels. The samples are deliberately not called
    # ``features`` so the imported ``features`` module is not shadowed.
    emails = dataset['email'].values
    labels = (dataset['class'] == 1).values  # class 1 is the positive label

    # 10-fold cross-validation, scored by area under the ROC curve
    scores = sklearn.cross_validation.cross_val_score(
        pipeline, emails, labels, cv=10, scoring='roc_auc')
    print(scores)