-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathid3.py
121 lines (88 loc) · 2.99 KB
/
id3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 22:55:44 2018
@author: raghuvansh
"""
import numpy as np
from math import log
from data_loader import read_data
class Node:
def __init__(self, attribute):
self.attribute = attribute
self.children = []
self.answer = ''
def __str__(self):
return self.attribute
def subtables(data, col, delete):
dictionary = dict()
items = np.unique(data[:, col])
count = np.zeros((items.shape[0], 1), dtype=np.int64)
for x in range(items.shape[0]):
for y in range(data.shape[0]):
if data[y, col] == items[x]:
count[x] += 1
for x in range(items.shape[0]):
dictionary[items[x]] = np.empty((count[x][0], data.shape[1]), dtype='S32')
pos = 0
for y in range(data.shape[0]):
if data[y, col] == items[x]:
dictionary[items[x]][pos] = data[y]
pos += 1
if delete:
dictionary[items[x]] = np.delete(dictionary[items[x]], col, 1)
return items, dictionary
def entropy(S):
items = np.unique(S)
if items.size == 1:
return 0
counts = np.zeros((items.shape[0], 1))
sums = 0
for x in range(items.shape[0]):
counts[x] = sum(S == items[x]) / S.shape[0]
for count in counts:
sums += -1 * count * log(count, 2)
return sums
def info_gain(data, col):
items, dictionary = subtables(data, col, delete=False)
entropies = np.zeros((items.shape[0], 1))
for x in range(items.shape[0]):
entropies[x] = entropy(dictionary[items[x]][:, -1])
total_entropy = entropy(data[:, -1])
for x in range(entropies.shape[0]):
total_entropy -= entropies[x]
return total_entropy
def build_tree(data, metadata):
if np.unique(data[:, -1]).shape[0] == 1:
node = Node('')
node.answer = np.unique(data[:, -1])[0]
return node
gains = np.zeros((data.shape[1] - 1, 1))
for col in range(data.shape[1] - 1):
gains[col] = info_gain(data, col)
split = np.argmax(gains)
node = Node(metadata[split])
metadata = np.delete(metadata, split, 0)
items, dictionary = subtables(data, split, delete= True)
for x in range(items.shape[0]):
child = build_tree(dictionary[items[x]], metadata)
node.children.append((items[x], child))
return node
def empty(size):
s = ""
for x in range(size):
s += " "
return s
def print_tree(node, level):
if node.answer != "":
print(empty(level), node.answer)
return
print(empty(level), node.attribute)
for value, n in node.children:
print(empty(level + 1), value)
print_tree(n, level + 2)
filename = input()
metadata, traindata = read_data(filename)
data = np.asarray(traindata)
node = build_tree(data, metadata)
print_tree(node, 0)