# naive_bayes.py (forked from eriklindernoren/ML-From-Scratch)
from __future__ import division, print_function
import math
import numpy as np
from sklearn import datasets

from mlfromscratch.utils.data_manipulation import train_test_split, normalize
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.utils import Plot

class NaiveBayes():
    """The Gaussian Naive Bayes classifier."""

    def __init__(self):
        self.classes = None
        self.X = None
        self.y = None
        # Gaussian probability distribution parameters, per class and feature
        self.parameters = []

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        # Reset the parameters so repeated calls to fit() do not accumulate
        self.parameters = []
        # Calculate the mean and variance of each feature for each class
        for i, c in enumerate(self.classes):
            # Only select the rows where the label equals the given class
            x_where_c = X[np.where(y == c)]
            # Add the mean and variance for each feature
            self.parameters.append([])
            for j in range(x_where_c.shape[1]):
                col = x_where_c[:, j]
                parameters = {"mean": col.mean(), "var": col.var()}
                self.parameters[i].append(parameters)

    # Gaussian probability density function of a feature value x
    def _calculate_probability(self, mean, var, x):
        # eps guards against division by zero when a feature has zero
        # variance within a class (a small stability tweak)
        eps = 1e-4
        coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var + eps)))
        return coeff * exponent
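
    # Worked check (added note): for a standard normal (mean=0, var=1) at
    # x=0 the density is 1/sqrt(2*pi), roughly 0.3989, so
    # _calculate_probability(0, 1, 0) should return approximately that
    # value (a hair less here because of the eps guard).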

    # Calculate the prior of class c
    # (number of samples where class == c / total number of samples)
    def _calculate_prior(self, c):
        # Select the rows where the class label is c
        x_where_c = self.X[np.where(self.y == c)]
        n_class_instances = np.shape(x_where_c)[0]
        n_total_instances = np.shape(self.X)[0]
        return n_class_instances / n_total_instances
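
    # Example (added note): iris is balanced (50 samples per class), so with
    # a random 50/50 train/test split each prior lands near 1/3.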

    # Classify using Bayes' rule: P(Y|X) = P(X|Y)*P(Y)/P(X)
    # P(X|Y) - Likelihood. Gaussian distribution (given by _calculate_probability)
    # P(Y)   - Prior (given by _calculate_prior)
    # P(X)   - The evidence. It is the same for every class, so it does not
    #          change the argmax and is ignored.
    # Classify the sample as the class that results in the largest posterior
    # P(Y|X)
    def _classify(self, sample):
        posteriors = []
        # Go through the list of classes
        for i, c in enumerate(self.classes):
            prior = self._calculate_prior(c)
            posterior = prior
            # Multiply with the feature likelihoods.
            # Naive assumption (conditional independence):
            # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
            for j, params in enumerate(self.parameters[i]):
                sample_feature = sample[j]
                # Determine P(x|Y)
                prob = self._calculate_probability(
                    params["mean"], params["var"], sample_feature)
                # Multiply with the rest
                posterior *= prob
            # Total: P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
            posteriors.append(posterior)
        # Return the class corresponding to the largest posterior
        return self.classes[np.argmax(posteriors)]
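
    # A numerically safer variant (a sketch added here, not part of the
    # original): multiplying many small densities can underflow to 0.0, so
    # the usual fix is to sum log-probabilities instead. This assumes
    # _calculate_probability never returns exactly zero, which the eps
    # guard above helps ensure.
    def _classify_log(self, sample):
        log_posteriors = []
        for i, c in enumerate(self.classes):
            # log P(Y) + sum_j log P(x_j|Y) preserves the argmax of the
            # product P(Y)*P(x1|Y)*...*P(xN|Y)
            log_posterior = math.log(self._calculate_prior(c))
            for j, params in enumerate(self.parameters[i]):
                prob = self._calculate_probability(
                    params["mean"], params["var"], sample[j])
                log_posterior += math.log(prob)
            log_posteriors.append(log_posterior)
        return self.classes[np.argmax(log_posteriors)]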

    # Predict the class labels of the samples in X
    def predict(self, X):
        y_pred = []
        for sample in X:
            y = self._classify(sample)
            y_pred.append(y)
        return y_pred


def main():
    data = datasets.load_iris()
    X = normalize(data.data)
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    clf = NaiveBayes()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimensionality to two using PCA and plot the results
    Plot().plot_in_2d(X_test, y_pred, title="Naive Bayes",
        accuracy=accuracy, legend_labels=data.target_names)
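

# Optional sanity check (a sketch, not part of the original script):
# scikit-learn ships its own Gaussian Naive Bayes, which should reach a
# comparable accuracy on the same split. scikit-learn is already a
# dependency here via `datasets`.
def sklearn_baseline(X_train, y_train, X_test, y_test):
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))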


if __name__ == "__main__":
    main()