-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_rename_ocr.py
125 lines (103 loc) · 3.32 KB
/
pdf_rename_ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
This script reads a pdf document (lien waivers in this case) via OCR and renames the file based on the generated text.
libraries used: pdf2image, pytesseract, openCV, re, os
@author: varun.singh
Install libraries commands:
pip install pdf2image
conda install -c conda-forge poppler
pip install opencv-python==4.5.5.64
Set tesseract_cmd to the full path of the tesseract executable (tesseract.exe on Windows), not just its folder:
pytesseract.pytesseract.tesseract_cmd = 'SYSTEM PATH TO TESSERACT.EXE'
"""
# import libraries
from pdf2image import convert_from_path
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = 'SYSTEM PATH TO TESSERACT.EXE'
import re
import os
"""
# Loop performs the following actions:
# 1. Open each pdf file in the working directory.
# 2. Convert file to image and save image.
# 3. Reading and parsing text from image via regex.
# 4. Rename file based on parsed text.
# 5. Fields that cannot be parsed are left blank in the new name. If the generated name is too long (>100 chars), the file instead keeps its original name with "notchanged" appended.
# Set working directory to folder containing pdf files
"""
# Patterns for the fields that make up the new file name. They are compiled
# once here instead of being rebuilt for every file in the loop.
_DATE_RE = re.compile(r'\s{1}\D{2,9}\s{1}\d{0,2},\s{1}\d{4}')  # Month DD, YYYY
_SUB_RE = re.compile(r'Undersigned Lienor.{2}(\w{2,15}\s){1,3}')
_INV_NO_RE = re.compile(r'Invoice/Payment Number.{4,7}')
_TO_COMPANY_RE = re.compile(r'to\D{5,}on the job of')

# Characters that are not allowed in a Windows file name (this also covers the
# POSIX path separator and control characters). OCR output can contain any of
# them, and a single one is enough to make os.rename() fail.
_INVALID_NAME_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Generated names longer than this are rejected.
_MAX_NAME_LENGTH = 100


def extract_field(pattern, text, start, end=None, drop=''):
    """Return one cleaned-up field from the OCR text.

    pattern -- compiled regex locating the field together with its label
    text    -- full OCR text of the page
    start   -- slice start that cuts the leading label off the match
    end     -- optional slice end that cuts a trailing label off the match
    drop    -- characters to delete from the value (newlines are always
               deleted)

    Returns '' when the pattern does not occur in the text.
    """
    match = pattern.search(text)
    if match is None:
        return ''
    value = match.group(0)[start:end]
    for char in drop + '\n':
        value = value.replace(char, '')
    return value


def parse_fields(text):
    """Parse the lien-waiver fields used in the file name from OCR text.

    Returns the tuple (lwType, sub, invNo, date, toCompany); every field that
    cannot be found is an empty string. lwType is 'LWU' when the text contains
    'UNCONDITIONAL' and 'LWC' otherwise.
    """
    lw_type = 'LWU' if 'UNCONDITIONAL' in text else 'LWC'
    # 20 = len('Undersigned Lienor') + the two wildcard characters after it.
    sub = extract_field(_SUB_RE, text, 20)
    # 24 = len('Invoice/Payment Number') + 2; the two skipped characters are
    # presumably a separator such as ': ' -- confirm against real documents.
    inv_no = extract_field(_INV_NO_RE, text, 24)
    # 1 skips the whitespace character the date pattern starts with.
    date = extract_field(_DATE_RE, text, 1, drop=',')
    # 3 skips 'to' plus the character after it; -14 removes 'on the job of'
    # plus the character in front of it.
    to_company = extract_field(_TO_COMPANY_RE, text, 3, -14, drop=',.')
    # NOTE: the payment amount used to be parsed here as well, but it was
    # never printed or used in the file name, so it is no longer extracted.
    return lw_type, sub, inv_no, date, to_company


def make_file_name(fields):
    """Join the parsed fields into a '.pdf' file name that is safe to use.

    The fields are separated by single spaces; characters that are invalid in
    file names are removed so that stray OCR characters cannot make the
    rename fail.
    """
    return _INVALID_NAME_CHARS_RE.sub('', ' '.join(fields)) + '.pdf'


for file in os.listdir('.'):
    # Compare the real extension: the previous "last three characters" test
    # also accepted names such as 'xpdf' that have no '.pdf' suffix to strip.
    stem, ext = os.path.splitext(file)
    if ext.lower() != '.pdf':
        continue
    print(file)
    # 500 dpi resolution for image conversion. Only the first page is read,
    # so only the first page is rasterised.
    pages = convert_from_path(file, 500, first_page=1, last_page=1)
    if not pages:
        print('Skipped ' + file + ': no pages could be converted')
        continue
    # save image
    imName = stem + '.jpg'
    pages[0].save(imName, 'JPEG')
    # read image (flag 0 = grayscale)
    image = cv2.imread(imName, 0)
    # generate text from image
    text = pytesseract.image_to_string(image, lang='eng')
    print()
    fields = parse_fields(text)
    # Printed in the order: type, subcontractor, invoice number, date, company
    for field in fields:
        print(field)
    createdName = make_file_name(fields)
    if len(createdName) > _MAX_NAME_LENGTH:
        # Name too long: keep the original name and mark the file instead.
        createdName = stem + "notchanged.pdf"
    if createdName == file:
        continue
    # os.rename() silently replaces an existing file on POSIX, so never
    # rename onto a name that is already taken.
    if os.path.exists(createdName):
        print('Skipped ' + file + ': "' + createdName + '" already exists')
        continue
    try:
        os.rename(file, createdName)
    except OSError as err:
        # Report the failure and carry on with the remaining files.
        print('Could not rename ' + file + ': ' + str(err))