# 3-ID3.py: an ID3 decision tree learner for the PlayTennis dataset (tennis.csv).

import math
import pandas as pd
from collections import Counter

# pandas removed DataFrame.from_csv; read_csv with index_col=0 matches its
# old default of treating the first column as the index.
tennis_df = pd.read_csv('tennis.csv', index_col=0)

# Shuffle the data. OPTIONAL: with this line enabled, accuracy changes each
# time the script runs, because the train/test split below differs each time.
tennis_df = tennis_df.sample(frac=1).reset_index(drop=True)
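# For a reproducible shuffle one could instead pass random_state (a standard
# pandas argument; the seed value 0 here is an arbitrary choice):
# tennis_df = tennis_df.sample(frac=1, random_state=0).reset_index(drop=True)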


class Node:
    def __init__(self, data, attribute=None):
        self.decision_attribute = attribute  # attribute this node splits on
        self.child = {}                      # attribute value -> child Node
        self.data = data                     # training rows reaching this node
        self.decision = None                 # class label, set only on leaves


# Shannon entropy H = -sum(p * log2(p)) over a list of class probabilities.
def calculate_entropy(probs):
    return sum([-prob * math.log(prob, 2) for prob in probs])
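# Hedged sanity check: the classic 14-row PlayTennis table has 9 Yes and
# 5 No labels, so calculate_entropy([9/14, 5/14]) is about 0.940 bits.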


def splitData(a_list, attribute, Class):
    # Rows of a_list whose value for `attribute` equals `Class`.
    return a_list[a_list[attribute] == Class]


def entropy(a_list, attribute='PlayTennis', Gain=False):
    cnt = Counter(a_list[attribute])  # count of each value of the attribute
    num_instances = len(a_list[attribute])
    probs = [x / num_instances for x in cnt.values()]  # x is the count of each value
    if not Gain:
        return calculate_entropy(probs)
    print(cnt.items())  # trace: value counts for the attribute being scored
    gain = 0
    for Class, prob in zip(cnt.keys(), probs):
        gain += -prob * entropy(splitData(a_list, attribute, Class))
    return gain
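# With Gain=True the function returns -sum_v p(v) * H(S_v), the negated
# expected entropy after splitting on `attribute`; information_gain below
# adds H(S) to obtain the usual Gain(S, A) = H(S) - sum_v (|S_v|/|S|) * H(S_v).
# For reference, on the full textbook PlayTennis table Gain(S, Outlook) is
# about 0.246 bits (the shuffled training split here may give other values).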


def information_gain(data):
    Max_gain = -1
    Max_gain_Attribute = None
    for attribute in data.keys():  # find the attribute with maximum information gain
        if attribute == 'PlayTennis':  # skip the target column
            continue
        gain = entropy(data) + entropy(data, attribute, Gain=True)
        print("{} {}".format(gain, attribute))
        if gain > Max_gain:
            Max_gain = gain
            Max_gain_Attribute = attribute
    return Max_gain_Attribute


def id3(root):
    # End of the decision tree: only the target column remains, or a single
    # row is left. The leaf's decision is the majority class of its rows.
    if len(root.data.keys()) == 1 or len(root.data) == 1:
        cnt = Counter(root.data['PlayTennis'])
        root.decision = cnt.most_common(1)[0][0]  # 'Yes' or 'No'
        print("Decision=", root.decision)
        return
    Max_gain_Attribute = information_gain(root.data)
    root.decision_attribute = Max_gain_Attribute
    print("------------------------------")
    for attribute in set(root.data[Max_gain_Attribute]):
        # Split the data on each value observed in the table, then recurse
        # on the subset with the chosen column dropped.
        childData = splitData(root.data, Max_gain_Attribute, attribute)
        root.child[attribute] = Node(childData.drop([Max_gain_Attribute], axis=1))
        id3(root.child[attribute])


def predict(example, root):
    if root.decision is not None:
        return root.decision
    try:
        # The attribute value may be absent from this part of the tree, since
        # id3 trained on split data: e.g. Temp has (cool, hot, mild) overall,
        # but some branches only ever saw cool and hot.
        prediction = predict(example, root.child[example[root.decision_attribute]])
        return prediction
    except KeyError:
        return "No"  # fall back to a default class for unseen branch values


training_data = tennis_df.iloc[:-4]  # all but the last four instances
test_data = tennis_df.iloc[-4:]      # just the last four
root = Node(data=training_data)
id3(root)
print(test_data)
predictions = [predict(test, root) for _, test in test_data.iterrows()]
correct = test_data['PlayTennis']
print("predictions are : {}".format(predictions))
print("actual:{}".format(test_data['PlayTennis']))
print("ACCURACY:{}".format(sum([1 for x, y in zip(predictions, correct) if x == y]) / len(predictions)))


### OPTIONAL
def display_tree(root):
    if root.decision is not None:
        return "{" + str(root.decision) + "},"
    tree = "{" + str(root.decision_attribute) + ":"
    for attribute, node in root.child.items():
        tree += "{" + str(attribute) + ":" + display_tree(node) + "},"
    return tree + "}"


print(display_tree(root))
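# The string printed above nests {attribute:{value:...}} pairs, e.g. roughly
# {Outlook:{Sunny:{Humidity:{High:{No},Normal:{Yes},}},...}; the exact shape
# depends on the shuffled training split.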