-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr.py
126 lines (118 loc) · 5.4 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import requests
import base64
import easyocr
from dotenv import load_dotenv
from general import relative_path, os, json, Image, tqdm
# Maximum file size in bytes, measured on the image once written in the WEBP format.
MAX_SIZE: int = 5_000_000

# OCR backend configuration: ENGINE[0] selects the backend in main(); the HTTP
# backends pass ENGINE[1] as the model name. Alternatives kept for quick switching:
# ENGINE: list[str] = ['easyocr']
# ENGINE: list[str] = ['anthropic', 'claude-3-5-sonnet-20240620', 'claude_3_5_sonnet']
ENGINE: list[str] = [
    'llama_cpp/v2/vision',
    'qwen-vl-next_b2583',
]

# The prompt sent to the vision backends is read once, when the module is imported.
with open(
    relative_path('data/cropped_images/prompt.md'),
    'r',
    encoding='utf-8',
) as _f:
    PROMPT = _f.read()

# Load environment variables from a .env file
# (presumably where ANTHROPIC_API_KEY comes from -- confirm against the deployment setup).
load_dotenv()
def decrease_size(input_path: str, output_path: str) -> None:
    """
    Convert an image to the format given by the extension of ``output_path`` and
    shrink its resolution until the written file is at most ``MAX_SIZE`` bytes.

    The image is first written at its original resolution. While the written file
    is too large, it is re-encoded from the original pixels with both sides scaled
    down by a further 10 %. ``output_path`` is written even when the input file is
    already small enough, because callers read ``output_path`` unconditionally and
    expect it to be in the target format.

    :param input_path: path of the image to read
    :param output_path: path of the image to write, preferably with .webp extension
    :raises ValueError: if the file is still larger than ``MAX_SIZE`` at the
        smallest resolution that can be produced
    :return: None
    """
    image_format = output_path.split('.')[-1].upper()
    if image_format == 'JPG':
        # Pillow names this format 'JPEG'; 'JPG' is not accepted by Image.save().
        image_format = 'JPEG'

    with Image.open(input_path) as img:
        width, height = img.size
        while width > 0 and height > 0:
            # Always resample from the original image so quality loss does not compound.
            img_resized = img.resize((width, height), Image.Resampling.LANCZOS)
            img_resized.save(output_path, format=image_format)
            written_size = os.path.getsize(output_path)
            if written_size <= MAX_SIZE:
                print(f"Reduced image size to {written_size} bytes.")
                return None
            width, height = int(width * 0.9), int(height * 0.9)

    # The loop only ends without returning once a side length has reached zero.
    raise ValueError("Could not reduce image size below MAX_SIZE by reducing resolution.")
def _ocr_with_easyocr() -> None:
    """
    Run EasyOCR on every PNG in 'data2/frames' and write the recognised text of
    each frame to 'data2/frames_ocr/<frame name>.txt', one detection per line.
    :return: None
    """
    # https://github.com/JaidedAI/EasyOCR
    reader = easyocr.Reader(['de', 'fr', 'en'], gpu=True)
    os.makedirs(relative_path('data2/frames_ocr'), exist_ok=True)
    for file in tqdm(os.listdir(relative_path('data2/frames'))):
        if not file.endswith('.png'):
            continue
        result = reader.readtext(relative_path(f'data2/frames/{file}'))
        with open(relative_path(f"data2/frames_ocr/{file.rsplit('.', 1)[0]}.txt"), 'w', encoding='utf-8') as f:
            # index 1 of each detection is written as its text
            f.write('\n'.join(i[1] for i in result))


def _ocr_with_anthropic() -> None:
    """
    Ask for a frame name, send that frame together with PROMPT to the Anthropic
    Messages API and save the JSON response to
    'data/cropped_images/frames_claude/<frame name>.json'.
    :return: None
    """
    file = input('Enter file name without extension: ')
    input_path = relative_path(f"data/cropped_images/frames/{file}.png")
    output_path = relative_path(f"tmp/frames_claude/{file}.webp")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # decrease size of the image to fit it into the requirements of anthropic
    decrease_size(input_path, output_path)
    with open(output_path, 'rb') as f:
        image = base64.b64encode(f.read()).decode('utf-8')
    # https://docs.anthropic.com/en/api/messages
    response = requests.post(
        url='https://api.anthropic.com/v1/messages',
        headers={
            'x-api-key': os.environ['ANTHROPIC_API_KEY'],
            'anthropic-version': '2023-06-01',
            'content-type': 'application/json',
        },
        data=json.dumps({
            'model': ENGINE[1],
            'max_tokens': 1024,
            'messages': [
                {
                    'role': 'user', 'content': [
                        {
                            'type': 'image',
                            'source': {
                                'type': 'base64',
                                'media_type': 'image/webp',
                                'data': image,
                            },
                        },
                        {
                            'type': 'text',
                            'text': PROMPT,
                        },
                    ],
                },
            ],
        }),
    )
    # The response body is stored as returned, so an API error message ends up in the file too.
    data = response.json()
    result_path = relative_path(f"data/cropped_images/frames_claude/{file}.json")
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(data)


def _ocr_with_local_vision() -> None:
    """
    Send every PNG in 'data/cropped_images/frames' together with PROMPT to the
    local vision endpoint and write the returned text to
    'data/cropped_images/frames_local/<frame name>.txt'.
    :raises requests.HTTPError: if the endpoint answers with an error status
    :return: None
    """
    os.makedirs(relative_path('data/cropped_images/frames_local'), exist_ok=True)
    for file in tqdm(os.listdir(relative_path('data/cropped_images/frames'))):
        if not file.endswith('.png'):
            continue
        stem = file.rsplit('.', 1)[0]
        input_path = relative_path(f"data/cropped_images/frames/{file}")
        output_path = relative_path(f"tmp/frames_local/{stem}.webp")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # convert the frame to WEBP and keep it within MAX_SIZE before handing it to the model
        decrease_size(input_path, output_path)
        response = requests.post(
            url='http://127.0.0.1:11434/llama_cpp/v2/vision',
            headers={
                'x-version': '2024-05-21',
                'content-type': 'application/json',
            },
            data=json.dumps({
                'task': PROMPT,
                'model': ENGINE[1],
                # the image is passed as a path, not as file content
                'image_path': output_path,
            }),
        )
        # Fail with the HTTP error itself instead of a KeyError on the missing 'text' field.
        response.raise_for_status()
        data: str = response.json()['text']
        with open(relative_path(f"data/cropped_images/frames_local/{stem}.txt"),
                  'w', encoding='utf-8') as f:
            f.write(data)
        print(data)


def main() -> None:
    """
    Calls the OCR engine selected by ENGINE[0] and saves the results:
    'easyocr' writes to 'data2/frames_ocr', 'anthropic' to
    'data/cropped_images/frames_claude' and 'llama_cpp/v2/vision' to
    'data/cropped_images/frames_local'.
    :raises ValueError: if ENGINE[0] is not one of the supported engines
    :return: None
    """
    if ENGINE[0] == 'easyocr':
        _ocr_with_easyocr()
    elif ENGINE[0] == 'anthropic':
        _ocr_with_anthropic()
    elif ENGINE[0] == 'llama_cpp/v2/vision':
        _ocr_with_local_vision()
    else:
        # A misspelled engine name used to make the script exit without doing anything.
        raise ValueError(f"Unknown OCR engine: {ENGINE[0]!r}")


if __name__ == '__main__':
    main()