-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
143 lines (114 loc) · 4.18 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from io import BytesIO
from flask import Flask, jsonify, make_response, request, Response
from requests import get
from PIL import Image
import pytesseract
from config import TESSDATA_DIR
# Extra command-line option handed to pytesseract via ``config=`` so that
# Tesseract looks for its language data in TESSDATA_DIR (from the project's
# config module).
tessdata_dir_config = f'--tessdata-dir {TESSDATA_DIR}'
# Language codes reported by pytesseract for that tessdata directory.
# Evaluated when this module is imported; is_lang_supported() checks
# requested languages against this list.
available_langs = pytesseract.get_languages(config=tessdata_dir_config)
def str2bool(v):
    """Interpret a request parameter as a boolean flag.

    Args:
        v: A bool, a string, or a falsy placeholder such as None or "".

    Returns:
        ``v`` itself when it is already a bool; False for any other falsy
        value; otherwise True only if the lower-cased string is one of
        "yes", "true", "t" or "1".
    """
    accepted_words = ("yes", "true", "t", "1")
    if isinstance(v, bool):
        return v
    # bool(v) short-circuits for None / "" so .lower() is never called on them.
    return bool(v) and v.lower() in accepted_words
def parse_lang(lang):
    """Normalise a language parameter to the ``+``-joined form.

    Language codes may be separated by ``+``, by whitespace, or by a mix of
    both (for example ``"eng khm"``, ``"eng+khm"`` or ``" eng  khm "``).

    Args:
        lang: Raw parameter value, or None / "" when it was not supplied.

    Returns:
        The codes joined with ``+`` (e.g. ``"eng+khm"``), or None when the
        value is missing or contains no language code at all.
    """
    if not lang:
        return None
    # Treat '+' and every run of whitespace as a separator and drop empty
    # pieces, so repeated or surrounding separators cannot produce empty
    # codes such as "eng++khm".
    codes = lang.replace('+', ' ').split()
    if not codes:
        return None
    return '+'.join(codes)
def is_lang_supported(lang):
    """Return True when every language code in *lang* is available.

    Args:
        lang: Language spec in ``+``-joined form, e.g. ``"eng+khm"``; may be
            None or empty.

    Returns:
        False when *lang* is missing/empty or when any of its codes is not
        in the module-level ``available_langs``; True otherwise.
    """
    if not lang:
        return False
    # available_langs is looked up on every call, so a later rebinding of the
    # module-level list is honoured here.
    return all(code in available_langs for code in lang.split('+'))
def image_to_text(data, lang, dl_to_file=False):
    """Run Tesseract OCR on an image and wrap the result in a Flask response.

    Args:
        data: Image source handed to ``Image.open``; the callers in this file
            pass an upload stream or a ``BytesIO``.
        lang: Language spec such as ``"eng+khm"``; passed to pytesseract
            unchanged (may be None).
        dl_to_file: When true, return the recognised text as a plain-text
            attachment named ``ocr.txt`` instead of JSON.

    Returns:
        A Flask response: the text attachment, a JSON document (status 200)
        holding the text plus simple statistics, or a JSON error (status
        400) when the data cannot be decoded as an image.
    """
    try:
        # The context manager releases the source image as soon as the
        # grayscale copy has been produced.
        with Image.open(data) as source:
            grayscale = source.convert('L')  # mode 'L' is 8-bit grayscale
    except OSError:
        # Pillow signals unreadable or corrupt image data with OSError
        # (UnidentifiedImageError is a subclass); answer with a client error
        # instead of letting it surface as HTTP 500.
        return make_response(jsonify({
            'error': 'Unable to read image',
        }), 400)

    text = pytesseract.image_to_string(
        grayscale, config=tessdata_dir_config, lang=lang)

    if dl_to_file:
        return Response(text, mimetype='text/plain', headers={
            'Content-Disposition': 'attachment;filename=ocr.txt'
        })

    return make_response(jsonify({
        'text': text,
        'text_length': len(text),
        # split() with no argument splits on any whitespace run and yields
        # no empty strings, so empty text counts 0 words and words separated
        # by newlines are counted individually.
        'text_words': len(text.split()),
        'text_lines': len(text.splitlines()),
        # Only literal spaces are removed here; newlines are still counted.
        'text_chars': len(text.replace(' ', '')),
        'lang': lang,
    }), 200)
def create_app():
    """Build and return the Flask OCR application.

    Routes registered:
        ``/`` (GET, POST): OCR an uploaded ``file`` (POST only) or a remote
            image named by ``url``. Optional ``lang`` (e.g. ``eng+khm``) and
            ``dl`` (truthy to receive the text as a file attachment) are
            read from the query string or the form body.
        ``/langs`` (GET): list the available languages and report whether
            the optional ``lang`` parameter is supported.

    Returns:
        The configured ``Flask`` application.
    """
    # Imported locally because the module only imports ``get`` from requests.
    from requests.exceptions import RequestException

    allowed_mime_types = ('image/png', 'image/jpeg', 'image/jpg')
    # requests waits indefinitely unless a timeout is given (seconds).
    download_timeout = 10

    def is_allowed_image(content_type):
        """Check a Content-Type value, ignoring parameters and letter case."""
        base_type = (content_type or '').split(';', 1)[0].strip().lower()
        return base_type in allowed_mime_types

    app = Flask(__name__)

    @app.route('/', methods=['GET', 'POST'])
    def index():
        # Format for lang: eng+khm
        lang = parse_lang(
            request.args.get('lang') or request.form.get('lang'))
        dl = str2bool(request.args.get('dl') or request.form.get('dl'))

        if lang and not is_lang_supported(lang):
            return make_response(jsonify({
                'error': 'Language not supported',
                'lang': lang,
            }), 400)

        # An uploaded file takes precedence over a url parameter.
        if request.method == 'POST':
            file = request.files.get('file')
            if file:
                mime_type = file.content_type
                if is_allowed_image(mime_type):
                    return image_to_text(file.stream, lang, dl_to_file=dl)
                # The 'mine_type' key (sic) is kept for API compatibility.
                return make_response(jsonify({
                    'error': 'File only png/jpeg/jpg image is supported',
                    'mine_type': mime_type,
                }), 400)

        url = request.args.get('url') or request.form.get('url')
        if url:
            # NOTE(security): this fetches an arbitrary caller-supplied URL
            # from the server (SSRF exposure). Restrict schemes/hosts before
            # exposing this endpoint to untrusted clients.
            try:
                remote = get(url, timeout=download_timeout)
            except RequestException:
                # Invalid URL, connection failure, timeout, ...
                return make_response(jsonify({
                    'error': 'Unable to download image from url',
                    'url': url,
                }), 400)

            # .get() because a response is not guaranteed to carry the header.
            mime_type = remote.headers.get('Content-Type')
            if is_allowed_image(mime_type):
                return image_to_text(
                    BytesIO(remote.content), lang, dl_to_file=dl)
            return make_response(jsonify({
                'error': 'Only png/jpeg/jpg image is supported',
                'mine_type': mime_type,
            }), 400)

        # Neither a file nor a url was supplied: describe the expected input.
        return make_response(jsonify({
            'error': 'No file or url provided',
            'params': {
                'file': 'file',
                'url': 'https://example.com/image.png',
                'lang': 'eng+khm',
            },
            "langs": {
                "link": "/langs?lang=eng+khm",
            }
        }), 400)

    @app.route('/langs', methods=['GET'])
    def langs():
        global available_langs
        # Retry the lookup when the module-level list is empty.
        if not available_langs:
            available_langs = pytesseract.get_languages(
                config=tessdata_dir_config)

        lang = parse_lang(
            request.args.get('lang') or request.form.get('lang'))
        supported = is_lang_supported(lang)
        return make_response(jsonify({
            'supported': supported,
            'lang': lang,
            'langs': available_langs,
            "tessdata": "https://github.com/tesseract-ocr/tessdata.git",
        }), 200)

    return app