-
Notifications
You must be signed in to change notification settings - Fork 0
/
7B. Tesseract_OCR_text_file.py
29 lines (23 loc) · 1.26 KB
/
7B. Tesseract_OCR_text_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# download and install Tesseract in your machine for your specific operating system (https://tesseract-ocr.github.io/tessdoc/Installation.html)
# install Pytesseract with pip using cmd ("pip install pytesseract")
from PIL import Image
import pytesseract
from pathlib import Path
# To use Tesseract from Python, you must either specify the path of tesseract.exe below or add it to the system PATH environment variable.
# With the default installation options, tesseract.exe is normally saved in C:/Users/AppData/Local/Tesseract-OCR/tesseract.exe
#pytesseract.pytesseract.tesseract_cmd = '' # insert your Tesseract path
# Here are three ways to customize the settings (engine mode, page segmentation mode, or both)
# Tesseract command-line options passed through pytesseract's `config` argument:
# `--oem` selects the OCR engine mode, `--psm` the page segmentation mode.
# Only the combined variant is used below; the other two are kept as
# ready-made alternatives.
custom_oem_config = r'--oem 3'
custom_psm_config = r'--psm 3'
custom_oem_psm_config = r'--oem 3 --psm 3'

# Folder holding the images to OCR, and the single text file that
# receives the recognized text of every image, one after another.
input_folder = Path('output/rotated')
file_path = 'output/file.txt'

# Open the output once in 'w' mode: this truncates any previous run's text and
# keeps a single handle for the whole run instead of reopening it per image.
with open(file_path, 'w', encoding='utf-8') as out:
    # iterdir() yields entries in arbitrary order; sort them so the text is
    # always concatenated in the same (lexicographic file-name) order.
    # NOTE: zero-padded names (page_01, page_02, ...) sort in page order.
    for img in sorted(input_folder.iterdir()):
        # Compare the extension case-insensitively so '.JPG' is not skipped.
        if img.suffix.lower() != '.jpg':
            continue
        # The context manager releases the image file as soon as OCR is done.
        with Image.open(img) as page:
            ocr_text = pytesseract.image_to_string(
                page, lang="ita", config=custom_oem_psm_config
            )
        out.write(ocr_text)