-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.py
449 lines (345 loc) · 17.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
###########################################
# Project : Gcash screenshot parser
# Author: Alfred Abanto
#
# Description:
# Just a small proof of concept
# demonstrating how we can use OCR
# (Optical Character Recognition)
# to automate obtaining key information
# from gcash screenshots, intended
# for small businesses
#
# License: MIT
# Date: 14/11/2022
###########################################
import threading
import re
import tkinter as tk
import tkinter.messagebox
import tkinter.filedialog
import tkinter.scrolledtext
from tkinter.scrolledtext import ScrolledText
from tkinter import ttk
from tesserocr import PyTessBaseAPI
import xlsxwriter
from dateutil.parser import parse
from PyPDF2 import PdfReader
class Gcash_parser(tk.Tk):
def __init__(self):
#This calls allows the class the inherit tk.Tk()
#which is similar to simply calling root = Tk()
super().__init__()
#Set gui
self.title('Gcash Screenshot Parser')
self.geometry('900x500')
#Make list to lookup month abreviations
self.months = ['Jan', 'Feb', 'Mar' , 'Apr', 'May', 'Jun', 'Jul', \
'Aug', 'Sept', 'Sep', 'Oct', 'Nov', 'Dec']
#Dictionary to convert months (abbreviated or not) to numeric counterparts
self.mon2num_dict = {
'Jan' : 1,
'January' : 1,
'Feb' : 2,
'February' : 2,
'Mar' : 3,
'March' : 3,
'Apr' : 4,
'April' : 4,
'May': 5,
'Jun': 6,
'June': 6,
'Jul': 7,
'July': 7,
'Aug': 8,
'August': 8,
'Sep': 9,
'Sept': 9,
'September': 9,
'Oct': 10,
'October': 10,
'Nov': 11,
'November': 11,
'Dec': 12,
'December': 12
}
#Make list to temporarily store the last parse run
self.last_run = []
self.main_win = ttk.Frame(self)
self.main_win.pack()
#----------------Settup Buttons for interface--------------
self.btn_pnl = ttk.Frame(self.main_win)
self.btn_pnl.pack()
self.get_files_btn = ttk.Button(self.btn_pnl,text='Select image')
self.get_files_btn['command'] = self.multhithread_ocr
self.get_files_btn.grid(row=0, column=0, sticky='W')
self.export_to_xlsx_btn = ttk.Button(self.btn_pnl, text='Export to xlsx')
self.export_to_xlsx_btn['command'] = self.export_last_run
self.export_to_xlsx_btn.grid(row=0, column=2, sticky='W')
self.export_to_xlsx_btn['state'] = 'disable'
self.pw_label = ttk.Label(self.btn_pnl, text="PDF Password")
self.pw_label.grid(row=1, column=0,sticky='W')
self.pw_entry = ttk.Entry(self.btn_pnl,show="*", width=25)
self.pw_entry.grid(row=1,column=1,sticky="W")
self.get_pdf_btn = ttk.Button(self.btn_pnl,text='Select PDF')
self.get_pdf_btn['command'] = self.get_data_from_pdf
self.get_pdf_btn.grid(row=0, column=1, sticky='W')
#----------------Other Gui elements -----------------
#Scrolled text widget for logging
self.log_area = ScrolledText(self.main_win, width=500)
self.log_area.insert(tk.INSERT,"Please select screenshots to parse..")
self.log_area.pack(fill=tk.BOTH , expand=True)
#Password buffer and flag for protected pdfs
self.password_buffer = ''
self.password_entered = False
# Uses tesserocr to print info
def get_data_from_files(self):
#Clean last run cache and clear log area
self.last_run = []
self.log_area.delete('1.0', tk.END)
#Ask for screenshot files to parse
img_files = tkinter.filedialog.askopenfilenames(initialdir='./')
#Send message that no file was selected
if(len(img_files) == 0):
tkinter.messagebox.showerror('Selection error', 'Error: Files selected')
#Enable function button again
if(self.get_files_btn['state'] != tk.NORMAL):
self.get_files_btn['state'] = tk.NORMAL
return None
#Start process
self.log_area.insert(tk.INSERT,'Selected files: {}\n Starting image recognition'.format(len(img_files)))
# Api is automatically finalized when used in a with-statement (context manager).
# otherwise api.End() should be explicitly called when it's no longer needed.
with PyTessBaseAPI(path= './data/tessdata') as api:
for img in img_files:
#use api to convert recognized text to string of chars
#this is based on the original C++ api
api.SetImageFile(img)
raw_text = api.GetUTF8Text()
raw_list = raw_text.split('\n')
#Print only the valuable information to log
#Store these in variables
self.log_area.insert(tk.INSERT,'\n{}\n'.format(img))
date_str = ''
amnt = -1
ref_str = ''
#Flags to tell me if the key info has been found
fnd_amt = False
fnd_ref = False
fnd_dat = False
for line in raw_list:
#If amount has not been parsed just yet then look for it first
if(((line.find('Amount Due PHP') != -1) or \
(line.find('Amount Paid PHP') != -1) or \
(line.find('Amount') != -1) or \
(line.find('PHP') != -1) or \
(line.find('Total PHP') != -1) ) \
and not fnd_amt):
# Parse string containing figure :
# - remove trailing spaces and commas
# - convert to float
amnt = float( line[line.find('PHP') + len('PHP'):].strip().replace(',',''))
#Print for debugging
self.log_area.insert(tk.INSERT, 'Amount : PHP {}'.format(amnt) + '\n')
#Assert flag
fnd_amt = True
elif((line.find('Ref. No.') != -1) and not fnd_ref):
# Parse string containing ref. no. :
# - remove trailing spaces and commas
ref_str = line[line.find('Ref. No.') + len('Ref. No.'): ].strip().replace(' ','')
self.log_area.insert(tk.INSERT, 'Ref. No. : {} \n'.format(ref_str))
fnd_ref = True
else:
for month in self.months:
if((line.find(month) != -1) and not fnd_dat ):
date_str = line
# Parse string
try:
#Try to parse everything using dateutil
str_parsed = parse(date_str, fuzzy=True)
#If succesful use datetime methods to extract date/time
tm = str_parsed.time().strftime("%I:%M %p")
numeric_date = str_parsed.date()
except:
#Exception thrown check for errors in OCR
#check if day and year have been concatenated
date_list = date_str.split(' ')
if( (len(date_list[1]) > 2) and (len(date_list) < 4)):
#Assume error date format will be: %mm %dd%yyyy,%HH%MM
#Reconstruct malformed date string and parse with dateutil
day_yr_str = '{} {}'.format(date_list[1][0:2] , date_list[1][2:6])
new_date_str = parse('{} {}'.format(date_list[0], day_yr_str))
#If succesful use datetime methods to extract date/time
tm = new_date_str.time().strftime("%I:%M %p")
numeric_date = new_date_str.date()
#print("Exception resolved: {} {}".format(numeric_date, tm))
else:
#Unresolved exception, possibly caused by an Add
print("Exception: {}".format(date_str))
print("Exception not resolved")
continue
self.log_area.insert(tk.INSERT, 'Date : {}\n'.format(numeric_date))
fnd_dat = True
if(fnd_dat and fnd_amt and fnd_ref):
#After all key information is found store in cache for export
#Get file name only
img_filename = img.split('/')
img_filename = img_filename[-1]
#Store in last run cache
self.last_run.append([ img_filename, numeric_date, tm, ref_str ,amnt])
#Clean up
fnd_dat = False
fnd_amt = False
fnd_ref = False
#Reset flags
amnt = -1
ref_str = ''
date_str = ''
#Uncomment for debugging
# else:
# line = line.strip()
# if(len(line) != 0):
# print('Exception read: {}'.format(line))
self.log_area.insert(tk.INSERT,'\n--------------------------\n')
self.log_area.see('end')
self.log_area.insert(tk.INSERT,'Done! Files parsed: {} \n'.format(len(self.last_run)))
#Change state of export to xlsx button after first run
if(self.export_to_xlsx_btn['state'] != tk.NORMAL):
self.export_to_xlsx_btn['state'] = tk.NORMAL
#Enable function button again
if(self.get_files_btn['state'] != tk.NORMAL):
self.get_files_btn['state'] = tk.NORMAL
print('Done')
def export_last_run(self):
output_filename = tkinter.filedialog.asksaveasfilename( initialdir='./', defaultextension=".xlsx")
#Send message that no file was selected
if(len(output_filename) == 0):
tkinter.messagebox.showerror('Selection error', 'Error: No filename input')
return None
# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook(output_filename)
worksheet = workbook.add_worksheet()
# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0
#Add headers
worksheet.write(row, col, 'File name')
worksheet.write(row, col + 1, 'Date')
worksheet.write(row, col + 2, 'Time')
worksheet.write(row, col + 3, 'Ref. No.')
worksheet.write(row, col + 4, 'Amount')
#Increment row index
row += 1
# Iterate over the data and write it out row by row.
for filename, date, time, ref, amnt in (self.last_run):
worksheet.write(row, col, str(filename))
worksheet.write(row, col + 1, str(date))
worksheet.write(row, col + 2, str(time))
worksheet.write(row, col + 3, str(ref))
worksheet.write(row, col + 4, amnt)
row += 1
workbook.close()
self.log_area.insert(tk.INSERT,'Export Done! rows written: {} '.format(row - 1))
self.log_area.see('end')
#Get data from pdf
def get_data_from_pdf(self):
self.last_run = []
self.log_area.delete('1.0', tk.END)
#TODO Disable get data from pdf button
try:
#Ask for pdf file to parse
ENCRYPTED_FILE_PATH = tkinter.filedialog.askopenfilename(initialdir='./')
except:
tkinter.messagebox.showerror('Selection error', 'Error: No file selected')
#Enable function button again
if(self.get_files_btn['state'] != tk.NORMAL):
self.get_files_btn['state'] = tk.NORMAL
return None
#ENCRYPTED_FILE_PATH = '.\\transaction_history.pdf'
#print(ENCRYPTED_FILE_PATH)
with open(ENCRYPTED_FILE_PATH, mode='rb') as f:
reader = PdfReader(f)
if reader.is_encrypted:
#Get password for encrypted pdf
#Attempt to open secure pdf
try:
reader.decrypt(self.pw_entry.get())
except:
tkinter.messagebox.showerror('Security error', 'Error: Wrong password')
#Enable function button again
if(self.get_files_btn['state'] != tk.NORMAL):
self.get_files_btn['state'] = tk.NORMAL
return 0
#Uncomment to test if we opened it correctly by counting pages
#print('Number of pages {}'.format(len(reader.pages)))
reconstructed_lines = []
for page in reader.pages:
lines = page.extract_text()
lines_parsed = lines.split('\n')
#Declare buffer to repair fragmented lines
is_multiline = False
temp_str = ''
for line in lines_parsed:
#If last character on the string is a space assume that it is a fragment
if(line[-1] == ' ' and not is_multiline):
#Assert flag and store upper fragment
is_multiline = True
temp_str = line
elif(is_multiline):
#Concat lower fragment to upper fragment of the line
temp_str += line
#append reconstructed lines
reconstructed_lines.append(temp_str)
#De-assert flag
is_multiline = False
else:
#If line is complete then just append to buffer
reconstructed_lines.append(line)
headers = reconstructed_lines[0:4]
for header in headers:
self.log_area.insert(tk.INSERT,header + '\n')
#Trim headers and final balance, assume strict formatting
trans_data = reconstructed_lines[4:-2]
#Print transaction data
for line in trans_data:
#split line by space
line_list = line.split()
date = line_list[0].strip('\t').strip()
time = '{} {}'.format(line_list[1],line_list[2])
#strip the referrence number of any stray strings
ref_num = line_list[-3]
ref_num = re.sub('[^0-9]','', ref_num)
amnt = line_list[-2]
data_str = 'Date: {}\t Time: {}\t Ref No.: {}\t Amount: {}\n'.format(date, time, ref_num, amnt)
self.log_area.insert(tk.INSERT,data_str)
self.log_area.see('end')
#Get file name only
pdf_filename = ENCRYPTED_FILE_PATH.split('/')
pdf_filename = pdf_filename[-1]
#Store in last run cache
self.last_run.append([ pdf_filename, date, time, ref_num ,amnt])
#Enable function button again
if(self.get_pdf_btn['state'] != tk.NORMAL):
self.get_pdf_btn['state'] = tk.NORMAL
#Change state of export to xlsx button after first run
if(self.export_to_xlsx_btn['state'] != tk.NORMAL):
self.export_to_xlsx_btn['state'] = tk.NORMAL
#Clear password
self.pw_entry.delete(0,tk.END)
#Use multithreading to prevent process from blocking the main program window
def multhithread_pdf(self):
#Create new thread for printing
t2 = threading.Thread(target=self.get_data_from_pdf )
t2.start()
#Disable button to prevent users from running while we're parsing
self.get_pdf_btn['state'] = tk.DISABLED
#Use multithreading to prevent process from blocking the main program window
def multhithread_ocr(self):
#Create new thread for printing
t1 = threading.Thread(target=self.get_data_from_files )
t1.start()
#Disable button to prevent users from running while we're parsing
self.get_files_btn['state'] = tk.DISABLED
if __name__ == "__main__":
app = Gcash_parser()
app.mainloop()