# AutoDocClass.py
# Data Loading
import preproc
import pandas as pd
from time import time
# Document class names mapped to their integer labels; the label of each
# class is simply its position in the list below (0-15).
C = {
    class_name: class_id
    for class_id, class_name in enumerate([
        'letter',
        'form',
        'email',
        'handwritten',
        'advertisement',
        'scientific report',
        'scientific publication',
        'specification',
        'file folder',
        'news article',
        'budget',
        'invoice',
        'presentation',
        'questionnaire',
        'resume',
        'memo',
    ])
}
# Trial Parameters
# Image size (rows, columns): used to reshape flattened samples and passed to
# preproc.create_sample / features.feat_extr.
Nr = Nc = 764
# Nbatch values given to preproc.create_sample for the 'train' and 'test' batches.
n_train, n_val = 14, 5
# Feature settings forwarded to features.feat_extr (feat_type, hash_size).
f_method = 'phash'
hash_s = 256
# Classifier selector forwarded to learning.learn (method).
c_method = 'ParForest'
# Root folder of the dataset; the label files are read from <dataset_path>labels/.
dataset_path = "/Users/robi/AnacondaProjects/AutoDocProject/rvl-cdip/"
# Output folder; not referenced anywhere else in the visible part of the script.
results_path = "/Users/robi/AnacondaProjects/AutoDocProject/Results_Images/"
# Read the train / test / val label files (whitespace-delimited text) into
# DataFrames, one per split.
def _read_labels(split):
    """Load <dataset_path>labels/<split>.txt as a DataFrame.

    sep=r"\s+" is the documented equivalent of delim_whitespace=True, which
    is deprecated since pandas 2.2.
    NOTE(review): the dtype keys only take effect if the file has a header
    row naming the columns 'Filename' and 'Class' -- confirm against the data.
    """
    return pd.read_csv(dataset_path + "labels/" + split + ".txt", sep=r"\s+",
                       dtype={'Filename': str, 'Class': int})


df_train = _read_labels("train")
df_test = _read_labels("test")
df_val = _read_labels("val")
# create_data: Read the dataset filenames per class and count number of images
dataset, nitems = preproc.create_data(df_train, df_test, df_val)
# create_sample: Get the 'train' batch (Nbatch=n_train; per the original note,
# that many images per class at full resolution) and its labels.
train_set, y_train = preproc.create_sample(dataset, nitems, dataset_path, Nr, Nc,
                                           label='train', Nbatch=n_train)
#%% Image Test
import matplotlib.pyplot as plt
import numpy as np
# Display row 88 of the training batch as an Nr x Nc grayscale image,
# together with its colour scale.
sample_img = np.reshape(train_set[88, :], (Nr, Nc))
gray_map = plt.get_cmap('gray')
plt.imshow(sample_img, cmap=gray_map)
plt.colorbar()
plt.show()
# Timing start, currently disabled:
# t0 = time()
#%% Feature Extraction
import features
# Extract f_method features from every training sample; hash_s is forwarded
# as the hash_size argument.
trfeat_set = features.feat_extr(
    train_set, Nr, Nc,
    feat_type=f_method,
    hash_size=hash_s,
)
# Left disabled: train_set is still displayed by the following Image Test cell.
# del train_set
#%% Image Test
import matplotlib.pyplot as plt
import numpy as np
# Side-by-side view of one training document (left) and the leading rN*rN
# entries of its feature vector drawn as a square image (right).
NN = 210  # row of train_set / trfeat_set to display
# Side length of the square patch: sqrt(hash_s**2 / 4), truncated to an int.
rN = int(np.sqrt(hash_s**2 // 4))
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(np.reshape(train_set[NN, :], (Nr, Nc)),
           cmap=plt.get_cmap('gray'), aspect='auto')
ax1.set_title("Original Document "+str(Nr)+"x"+str(Nc))
hash_img = ax2.imshow(np.reshape(trfeat_set[NN, :rN**2], (rN, rN)),
                      cmap=plt.get_cmap('gray'), aspect='auto')
# The title reports the shape that is actually drawn, i.e. (rN, rN).
ax2.set_title("Perceptual Hash "+str(rN)+"x"+str(rN))
# Axes.imshow() does not register a "current image" with pyplot, so a bare
# plt.colorbar() finds no mappable and raises RuntimeError. Hand the image
# over explicitly and attach the colour scale to the hash panel.
f.colorbar(hash_img, ax=ax2)
plt.show()
#%% Learning on training set
import learning
# Fit the classifier selected by c_method on the training features and labels.
# NOTE(review): the original note restricted this step to the phash / ahash /
# whash feature types; the end of that note was garbled, so confirm which
# feat_type values are actually supported here.
clasf = learning.learn(
    trfeat_set,
    y_train,
    method=c_method,
)
#%% Prediction on test set
# create_sample: Get the 'test' batch (Nbatch=n_val) and its labels.
# The call mirrors the one used for the training batch: dataset_path, Nr and
# Nc are passed in and two values come back. The previous form omitted those
# arguments and unpacked four values, which cannot match the same function.
# NOTE(review): preproc is not visible from this file -- confirm the signature.
test_set, y_test = preproc.create_sample(dataset, nitems, dataset_path, Nr, Nc,
                                         label='test', Nbatch=n_val)
tefeat_set = features.feat_extr(test_set, Nr, Nc, feat_type=f_method, hash_size=hash_s)
del test_set  # test_set is not used past this point
# Prediction
yp = clasf.predict(tefeat_set)
# Timing report, currently disabled:
#print("Done in %0.3fs" % (time()-t0))
#%% Validation
import validation
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
# Print numpy arrays with two decimals.
np.set_printoptions(precision=2)
# Confusion matrix of the true test labels against the predictions.
cnf_matrix = confusion_matrix(y_test, yp)
# Non-normalized plot, currently disabled:
#plt.figure()
#validation.plot_confusion_matrix(cnf_matrix, classes=C,
# title='Confusion matrix, without normalization')
# Normalized plot, titled with the feature and classifier method names.
# NOTE(review): classes=C passes the whole name->label dict; confirm that
# validation.plot_confusion_matrix expects a dict rather than a list of names.
plot_title = ''.join(('Normalized confusion matrix - ', f_method, '+', c_method))
plt.figure()
validation.plot_confusion_matrix(
    cnf_matrix,
    classes=C,
    normalize=True,
    title=plot_title,
)
plt.show()