-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
1765 lines (1478 loc) · 88.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import json
import random
import traceback
import fitz # PyMuPDF
import sys
import csv
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator
import os
import time
import re
import math
from collections import Counter
import requests
from scipy import stats
import pyperclip
import spacy
import re
from openai import OpenAI
from tqdm import tqdm
import psutil
import subprocess
def add_date_to_logs():
    """Stamp every known log file with a dated separator so that different
    processing rounds can be told apart when reading the logs later.

    Existing files get the separator appended; missing files are created
    with a creation-time header followed by the separator.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    banner = f"\n{'='*50}\n新的处理轮次开始于: {stamp}\n{'='*50}\n"
    log_names = (
        '_error_pdfs.log',
        '_invalid_pdfs_case.log',
        '_processed_pdfs.log',
        '_processed_pdfs_requests_dump.log',
        '_task_duration.log',
        '_scanned_pdfs_case.log',
        '_skipped_pdfs.log',
    )
    for name in log_names:
        already_there = os.path.exists(name)
        mode = 'a' if already_there else 'w'
        with open(name, mode, encoding='utf-8') as fh:
            if already_there:
                fh.write(banner)
            else:
                # Brand-new file: record its creation time before the banner.
                fh.write(f"日志文件创建于: {stamp}\n{banner}")
    print(f"已为所有日志文件添加日期标记: {stamp}")
# 在程序开始时调用此函数 (stamp the log files at program start)
add_date_to_logs()

# Load the skip / include path lists from optional text files, one path per line.
# Fix: blank lines are now filtered out — os.path.normpath('') yields '.', which
# would otherwise end up in the list and could spuriously match the current dir.
skip_paths = []
try:
    with open('skip_path_list.txt', 'r', encoding='utf-8') as skip_file:
        skip_paths = [os.path.normpath(line.strip()) for line in skip_file if line.strip()]
    print(f"已读取跳过路径列表: {skip_paths[:10]}")
except FileNotFoundError:
    print("未找到skip_path_list.txt文件,将处理所有PDF")

include_paths = []
try:
    with open('include_path_list.txt', 'r', encoding='utf-8') as include_file:
        include_paths = [os.path.normpath(line.strip()) for line in include_file if line.strip()]
    print(f"已读取包含路径列表: {include_paths[:10]}")
except FileNotFoundError:
    print("未找到include_path_list.txt文件,将处理所有PDF")
# nlp = spacy.load("en_core_web_sm")
def has_chinese(text):
    """Return True when *text* contains at least one CJK unified ideograph."""
    return re.search(r'[\u4e00-\u9fff]', text) is not None
def is_regular_content(text):
    """Heuristically decide whether *text* looks like regular prose worth translating.

    The text qualifies when it contains at least five English "words" of two
    or more letters.

    Fix: the previous sliding-window check `all(len(word) >= 2 for word in
    words[i:i+5])` was vacuously true — the regex below already guarantees
    every match has >= 2 letters — so the function always reduced to a simple
    word-count threshold. The dead commented-out spaCy/punctuation heuristics
    were removed as well.

    Args:
        text: arbitrary extracted text.

    Returns:
        bool: True if the text appears to be regular running prose.
    """
    words = re.findall(r'[a-zA-Z]{2,}', text)  # 提取至少包含2个字母的英文单词
    return len(words) >= 5
import hashlib
def get_pdf_md5(pdf_path):
    """Return the hex MD5 digest of the file at *pdf_path*, read in 4 KiB chunks."""
    digest = hashlib.md5()
    with open(pdf_path, "rb") as fh:
        while True:
            chunk = fh.read(4096)
            if not chunk:  # b"" signals end of file
                break
            digest.update(chunk)
    return digest.hexdigest()
def is_script_running(script_path):
    """Check whether some Python process is already running *script_path*.

    Scans all processes for a python interpreter whose last command-line
    argument contains the script's base name.

    Fixes: removed the leftover debug `print(cmdline)` that spammed stdout
    for every python process; guarded against `process.info['name']` and
    `process.info['cmdline']` being None (possible for zombie/restricted
    processes), which previously crashed `.lower()` / `len()`.

    NOTE(review): this will also match the *current* process if it was
    launched as `python <script>` — confirm whether callers expect that.

    Args:
        script_path: path to the script to look for.

    Returns:
        bool: True if a matching python process is found.
    """
    script_name = os.path.basename(script_path)
    for process in psutil.process_iter(['pid', 'name', 'cmdline']):
        try:
            name = process.info['name'] or ''
            if not name.lower().startswith('python'):
                continue
            cmdline = process.info['cmdline'] or []
            # The script is normally the interpreter's last argument.
            if len(cmdline) > 1 and script_name in cmdline[-1]:
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return False
def translate_paragraphs(paragraphs, pdf_path):
    """Translate the regular-prose paragraphs via the local translation service.

    Paragraphs that do not look like regular prose are passed through
    untouched; the result list preserves the original order. In dump mode
    (module-level `dump_json_only`) the request payload is recorded into
    `existing_data` instead of calling the service.
    """
    url = "http://localhost:8000/translate"
    # Partition paragraphs by index: only regular prose gets translated.
    regular_content = []
    non_regular_content = []
    for idx, text in enumerate(paragraphs):
        bucket = regular_content if is_regular_content(text) else non_regular_content
        bucket.append((idx, text))
    data = {
        "paragraphs": [text for _, text in regular_content],
        "target_lang": "zh",
        "test_mode": False  # True makes the service echo the originals back
    }
    if dump_json_only:
        # Dump mode: record the payload keyed by "<basename>_<md5>" and mark
        # each paragraph so dump output is recognizable downstream.
        pdf_md5 = get_pdf_md5(pdf_path)
        key = f"{os.path.basename(pdf_path)}_{pdf_md5}"
        existing_data[key] = data
        translated_paragraphs = ["DUMP_JSON_MODE_ " + text for _, text in regular_content]
    else:
        response = requests.post(url, json=data)
        if response.status_code != 200:
            raise Exception(f"Translation failed: {response.text}")
        translated_paragraphs = response.json()["translated_paragraphs"]
    # Stitch translated and untouched paragraphs back into positional order.
    result = [None] * len(paragraphs)
    for (idx, _), translated in zip(regular_content, translated_paragraphs):
        result[idx] = translated
    for idx, text in non_regular_content:
        result[idx] = text
    return result
import asyncio
import logging
from datetime import datetime
from openai import AsyncOpenAI
def translate_paragraphs_openai_stream(paragraphs, file_name):
    """Translate *paragraphs* concurrently through an OpenAI-compatible chat API.

    Paragraphs that do not look like regular prose (per is_regular_content)
    are returned unchanged. Each prose paragraph is retried up to 5 times on
    error; after that it is appended to '<file_name>_failed_paragraphs.log'
    and None is placed in its result slot.

    Args:
        paragraphs: list of paragraph strings; result order matches input.
        file_name: stem used for the error / failed-paragraph log file names.

    Returns:
        list: translated text, original text, or None per paragraph.
    """
    # NOTE(review): API key and base URL are hard-coded; they should come
    # from configuration/environment instead of source.
    client = AsyncOpenAI(
        api_key="sk-......",
        base_url='https://sapi.onechats.top/v1/'
    )
    sem = asyncio.Semaphore(100)  # cap concurrent in-flight requests at 100
    failure_counter = 0
    async def translate_paragraph(para, index):
        # Translate one paragraph under the semaphore, retrying on failure.
        async with sem:
            for attempt in range(5):  # retry at most 5 times
                try:
                    if is_regular_content(para):
                        # prompt = f"将以下文本翻译成中文:\n\n{para}\n\n翻译:"
                        response = await client.chat.completions.create(
                            messages=[
                                {"role": "system", "content": "Translate the text into Chinese. ONLY RETURN TRANSLATED TEXT AND NOTHING ELSE."},
                                {"role": "user", "content": para}
                            ],
                            model="gpt-3.5-turbo",
                            stream=False
                        )
                        translated_para = response.choices[0].message.content.strip()
                        print(f"\n已翻译段落 {index+1}/{len(paragraphs)}:\n{para[:60]}...\n{translated_para[:60]}...")
                        return translated_para
                    else:
                        # Non-prose content is passed through untranslated.
                        return para
                except Exception as e:
                    current_time = datetime.now().strftime("%H:%M:%S")
                    print(f"{current_time} 尝试 {attempt+1}\n - 段落:\n {para[:40]}\n - 错误: {str(e)}\n")
                    logging.error(f"{current_time} 尝试 {attempt+1}\n - 段落:\n {para}\n - 错误: {str(e)}\n")
                    await asyncio.sleep(1)  # wait 1 second before retrying
            # All retries exhausted: count the failure and log the paragraph.
            nonlocal failure_counter
            failure_counter += 1
            with open(f'{file_name}_failed_paragraphs.log', 'a') as f:
                f.write(para + '\n\n')
            return None
    async def translate_all():
        # Fan out one task per paragraph; gather preserves input order.
        tasks = [translate_paragraph(para, i) for i, para in enumerate(paragraphs)]
        return await asyncio.gather(*tasks)
    logging.basicConfig(filename=f'{file_name}_error.log', level=logging.ERROR)
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated since Python 3.10; asyncio.run() is the modern equivalent —
    # confirm no loop is already running before changing this.
    loop = asyncio.get_event_loop()
    translated_paragraphs = loop.run_until_complete(translate_all())
    print(f'翻译完成。失败次数: {failure_counter}。请查看日志文件了解详情。')
    return translated_paragraphs
def word_num(text):
    """Count words in *text*: CJK character count when any CJK character is
    present, otherwise the number of whitespace-separated tokens."""
    cjk_chars = re.findall(r'[\u4e00-\u9fff]', text)
    if cjk_chars:
        return len(cjk_chars)
    return len(text.split())
def is_scanned_pdf(pdf_path):
    """Guess whether the PDF at *pdf_path* is a scan (image-only document).

    Inspects up to the first 10 pages; any extractable text means the file
    is treated as a text PDF.

    Fix: the fitz document handle was never closed (leaked on every call,
    including the early `return False` path); it is now closed via
    try/finally.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        bool: True when no text is found on the sampled pages.
    """
    print(f"Checking if {pdf_path} is a scanned PDF...")
    doc = fitz.open(pdf_path)
    try:
        for i in range(min(10, len(doc))):
            if doc.load_page(i).get_text():
                return False  # extractable text found -> not a scanned PDF
        return True  # no text in the first pages, probably a scanned PDF
    finally:
        doc.close()
def get_toc_info_per_page(pdf_path):
    """Map each page number to the TOC entries whose span covers that page.

    Fix: the fitz document handle was never closed (resource leak); it is
    now closed via try/finally. The matching algorithm itself is unchanged.

    NOTE(review): a page on the boundary between two consecutive TOC entries
    collects both (the `<=` on both sides below), and the last TOC entry is
    assumed to extend to the end of the document — confirm this is intended.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        dict: {page_number (1-based): [{'level': int, 'title': str}, ...]};
        empty when the document has no table of contents.
    """
    doc = fitz.open(pdf_path)
    try:
        toc = doc.get_toc()  # entries are [level, title, start_page]
        print(f"TOC: {toc}")
        page_toc_dict = {}
        if not toc:
            return page_toc_dict
        for page in range(1, doc.page_count + 1):
            for i in range(len(toc) - 1):
                current_item, next_item = toc[i], toc[i+1]
                if current_item[2] <= page <= next_item[2]:
                    lvl, title, pagenum, *_ = current_item
                    if page not in page_toc_dict:
                        page_toc_dict[page] = []
                    page_toc_dict[page].append({'level': lvl, 'title': title.strip()})
            # The last TOC item is assumed to continue to the document's last page.
            if page >= toc[-1][2]:
                lvl, title, pagenum, *_ = toc[-1]
                if page not in page_toc_dict:
                    page_toc_dict[page] = []
                page_toc_dict[page].append({'level': lvl, 'title': title.strip()})
        return page_toc_dict
    finally:
        doc.close()
def extract_paragraphs(pdf_path, heuristic):
doc = fitz.open(pdf_path)
pdfdata = doc.tobytes()
doc_original = fitz.open("pdf", pdfdata)
paragraphs = []
hot_slot_page_num = ''
hot_slot_index = ''
line_number = 0
page_nums = []
toc_titles = []
toc_info = get_toc_info_per_page(pdf_path)
# for page, info in toc_info.items():
# print(f"Page {page} TOC Info: {info}")
# Store all blocks in a list
all_blocks_and_its_page = []
# store all aspect ratio of blocks in a list
all_blocks_aspect_ratio = []
# List to store span heights
span_heights = []
first_word_of_the_span = []
text_and_punctuations_before_indentation_dict = {}
all_line_heights = []
mean_line_height = 0
from tqdm import tqdm
pbar_pages = tqdm(enumerate(doc), total=len(doc), desc="正在提取文本块")
for i, page in pbar_pages:
text_and_punctuations_before_indentation_dict[i+1] = []
# pbar_pages.set_description(f"提取第 {i+1} 页的文本块")
pbar_pages.update(1)
data = page.get_text("dict")
# for block in data['blocks']:
for j, block in enumerate(data['blocks']):
if block['type'] == 0:
# mark the boundary of the block
# page.draw_rect(block['bbox'], color=(0, 1, 0), width=0.5)
all_blocks_and_its_page.append((page, block, i+1, j))
# calculate the aspect ratio of the block
block_aspect_ratio = (block['bbox'][2] - block['bbox'][0])/(block['bbox'][3] - block['bbox'][1])
all_blocks_aspect_ratio.append(block_aspect_ratio)
for line in block["lines"]:
line_height = line['bbox'][3] - line['bbox'][1]
all_line_heights.append(line_height)
for span in line["spans"]:
# Add the height of the span to the list
rect = fitz.Rect(span["bbox"])
height = math.ceil(rect.height) # Round height up to the nearest integer
span_heights.append(height)
first_word_of_the_span.append([span['text'][:20],height])
mean_line_height = np.mean(all_line_heights)
# Count occurrences of each span height
span_height_counter = Counter(span_heights)
# Get the most common span height
span_height = span_height_counter.most_common(1)[0][0]
print("The most common rounded-up span height is:", span_height)
paragraphs_to_be_translated_list = []
paragraphs_to_be_translated_bbox_list = []
paragraphs_to_be_translated_page_num_list = []
current_paragraph = ''
# create a list obj that can be assigned to a variable
current_block_bbox = [0,0,0,0]
header_detected_dict = {}
from tqdm import tqdm
pbar = tqdm(total=len(all_blocks_and_its_page), desc="处理文本块", colour="green")
for i, (page, block, page_number, block_index) in enumerate(all_blocks_and_its_page):
# pbar.set_description(f"处理第 {page_number} 页的第 {block_index + 1} 个文本块")
pbar.update(1)
last_line_right = None
last_line_left = None
merged_line = {}
block_number = i + 1
# check if header detected dict has key page number
if page_number not in header_detected_dict:
header_detected_dict[page_number] = False
top_right = (block['bbox'][2] - 15, block['bbox'][1] + 5) # top-right coordinates of the block
page.insert_text(top_right, str(block_number), fontsize=5, color=(1, 0, 0)) # insert block number
full_page_bbox = page.rect
upper_left_quarter = fitz.Rect(full_page_bbox.x0, full_page_bbox.y0, full_page_bbox.width / 2, full_page_bbox.height / 2)
lower_right_corner = fitz.Rect(full_page_bbox.width / 2, full_page_bbox.height * 3/4, full_page_bbox.width, full_page_bbox.height)
# draw a 5 pixel square at the upper left corner of the page, and 10 pixel on the upper right corner
page.draw_rect((0, 0, 5, 5), color=(1, 0, 0), fill=(1, 0, 0), width=0.5)
page.draw_rect((full_page_bbox.width - 10, 0, full_page_bbox.width, 10), color=(1, 0, 0), fill=(1, 0, 0), width=0.5)
for line in block['lines']:
# Merge spans on the same line
if merged_line and abs(merged_line['bbox'][1] - line['bbox'][1]) < 0.25 * span_height \
and line['bbox'][0] - merged_line['bbox'][2] > 0.1 * span_height:
merged_line['bbox'][2] = max(merged_line['bbox'][2], line['bbox'][2])
merged_line['bbox'][3] = max(merged_line['bbox'][3], line['bbox'][3])
merged_line['text'] += ' ' + ' '.join([span['text'] for span in line['spans']])
else:
# Process the merged line
if merged_line:
line_text = merged_line['text']
current_line_left, _, current_line_right, _ = merged_line['bbox']
if current_paragraph:
is_indented = last_line_left is not None and current_line_left - last_line_left > 0.7 * span_height
line_not_filled = last_line_right is not None and current_line_right - last_line_right > 4 * span_height
# Check if the last line ends with punctuation
if current_paragraph.strip():
last_line_ends_with_punctuation = current_paragraph.strip()[-1] in ['.', '!', '?', '。', '!', '?','"', '”']
else:
last_line_ends_with_punctuation = False
last_line_ends_with_punctuation_and_script = re.search(r'[.?!。!?”"] *\d+$', current_paragraph.strip()) is not None
last_line_ends_with_punctuation = last_line_ends_with_punctuation or last_line_ends_with_punctuation_and_script
if (is_indented or line_not_filled) and last_line_ends_with_punctuation:
text_and_punctuations_before_indentation_dict[page_number].append(current_paragraph[-10:].strip())
# check if the upper left corner of the block is in the upper left quarter of the page
# if block['bbox'][0] < upper_left_quarter.x1 and block['bbox'][1] < upper_left_quarter.y1:
# first_line_in_upper_left_quarter = True
# else:
# first_line_in_upper_left_quarter = False
# current_paragraph start with a lower case letter
if 000000 and (current_paragraph[0].islower() or has_chinese(current_paragraph)) and \
word_num(current_paragraph) > 20 and hot_slot_page_num == page_number -1:
# and first_line_in_upper_left_quarter # too hash
# current_paragraph is a continuation of the last paragraph
last_2_words = ' '.join(paragraphs[hot_slot_index].split()[-2:])
page.insert_text((0 + 5, block['bbox'][1] + 5), last_2_words, fontsize=5, color=(0, 0, 0))
paragraphs[hot_slot_index] += ' ' + current_paragraph
hot_slot_page_num = ''
hot_slot_index = ''
page.draw_rect((block['bbox'][0] - 7, block['bbox'][1], block['bbox'][0] - 5, block['bbox'][3]), color=(0, 1, 0), fill = (0, 1, 0), width=0.5)
page.insert_text((block['bbox'][0] - 7, block['bbox'][1] + 5), 'continued_1', fontsize=5, color=(0, 0, 0))
page.draw_rect((block['bbox'][2], merged_line['bbox'][1] - 2, block['bbox'][2] + 2, merged_line['bbox'][1]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2] + 2, merged_line['bbox'][1]), 'by_new_para', fontsize=5, color=(0, 0, 0))
else:
paragraphs.append(current_paragraph)
# 对文本块进行修订并插入模拟翻译
bbox = current_block_bbox
rect = fitz.Rect(bbox)
# # reversed_text = current_paragraph[::-1] # 反转文本作为模拟翻译
# # reversed_text = reversed_text.upper()
# reversed_text = '> ' + current_paragraph + ' <'
# # 使用红色注释覆盖原文本
# page.add_redact_annot(rect, text="")
# page.apply_redactions()
# # 插入模拟翻译后的文本,红色字体
# html = f'''
# {reversed_text}
# '''
# page.insert_htmlbox(rect, html, css="* {background-color: red; font-size: 30px;}")
# page.draw_rect(rect, color=(0, 1, 0), width=2)
# 将文本添加到待翻译列表
paragraphs_to_be_translated_list.append(current_paragraph)
paragraphs_to_be_translated_bbox_list.append(rect)
paragraphs_to_be_translated_page_num_list.append(page_number)
page_nums.append(page_number)
page.draw_rect((block['bbox'][2], merged_line['bbox'][1] - 2, block['bbox'][2] + 2, merged_line['bbox'][1]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2] + 2, merged_line['bbox'][1]), 'by_new_para', fontsize=5, color=(0, 0, 0))
titles_list = []
for info in toc_info.get(page_number, [{'title': 'N/A'}]):
titles_list.append(info['title'])
title_concat = ' | '.join(titles_list)
toc_titles.append(title_concat)
# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
current_paragraph = line_text
current_block_bbox = list(merged_line['bbox'])
page.draw_rect(merged_line['bbox'], color=(0, 1, 0), width=0.5)
else:
current_paragraph += ' ' + line_text
current_block_bbox[0] = min(current_block_bbox[0], merged_line['bbox'][0])
current_block_bbox[2] = max(current_block_bbox[2], merged_line['bbox'][2])
current_block_bbox[3] = max(current_block_bbox[3], merged_line['bbox'][3])
# Normal extended line, draw a black box
page.draw_rect(merged_line['bbox'], color=(0, 0, 0), width=0.5)
else:
current_paragraph = line_text
current_block_bbox = list(merged_line['bbox'])
last_line_right = current_line_right
last_line_left = current_line_left
# Start a new merged line
merged_line = {
'bbox': list(line['bbox']),
'text': ' '.join([span['text'] for span in line['spans']]),
}
merged_line['bbox'][0] = min(merged_line['bbox'][0], block['bbox'][0])
if 'small in relation to free-energy gradients' in merged_line['text']:
pass
# --- 计算行高百分比排名并插入信息 ---
line_height = merged_line['bbox'][3] - merged_line['bbox'][1]
# 计算相比平均行高的倍数
ratio_over_mena_line_height = line_height / mean_line_height
line_height_percentile = stats.percentileofscore(all_line_heights, line_height)
page.insert_text((merged_line['bbox'][0] - 20, (merged_line['bbox'][1] + merged_line['bbox'][3]) / 2),
f"{line_height_percentile:.0f}% ({ratio_over_mena_line_height:.1f})",
fontsize=5, color=(1, 0, 0))
# page.insert_text((merged_line['bbox'][0] - 7, merged_line['bbox'][1] + 4), str(line_number), fontsize=5, color=(0, 0, 0))
# line_number += 1
# if '432 Proceedings of the IEEE' in merged_line['text']:
# pass
# Don't forget to process the last merged line
if merged_line:
line_text = merged_line['text']
current_line_left, _, current_line_right, _ = merged_line['bbox']
if current_paragraph:
is_indented = last_line_left is not None and current_line_left - last_line_left > 0.7 * span_height
line_not_filled = last_line_right is not None and current_line_right - last_line_right > 4 * span_height
# Check if the last line ends with punctuation
if current_paragraph.strip():
last_line_ends_with_punctuation = current_paragraph.strip()[-1] in ['.', '!', '?', '。', '!', '?','"', '”']
else:
last_line_ends_with_punctuation = False
last_line_ends_with_punctuation_and_script = re.search(r'[.?!。!?”"] *\d+$', current_paragraph.strip()) is not None
last_line_ends_with_punctuation = last_line_ends_with_punctuation or last_line_ends_with_punctuation_and_script
if (is_indented or line_not_filled) and last_line_ends_with_punctuation:
text_and_punctuations_before_indentation_dict[page_number].append(current_paragraph[-10:].strip())
# check if the upper left corner of the block is in the upper left quarter of the page
# if block['bbox'][0] < upper_left_quarter.x1 and block['bbox'][1] < upper_left_quarter.y1:
# first_line_in_upper_left_quarter = True
# else:
# first_line_in_upper_left_quarter = False
# current_paragraph start with a lower case letter
if 000000 and (current_paragraph[0].islower() or has_chinese(current_paragraph)) and \
word_num(current_paragraph) > 20 and hot_slot_page_num == page_number -1:
# and first_line_in_upper_left_quarter # too hash
# current_paragraph is a continuation of the last paragraph
last_2_words = ' '.join(paragraphs[hot_slot_index].split()[-2:])
page.insert_text((0 + 5, block['bbox'][1] + 5), last_2_words, fontsize=5, color=(0, 0, 0))
paragraphs[hot_slot_index] += ' ' + current_paragraph
hot_slot_page_num = ''
hot_slot_index = ''
page.draw_rect((block['bbox'][0] - 7, block['bbox'][1], block['bbox'][0] - 5, block['bbox'][3]), color=(0, 1, 0), fill = (0, 1, 0), width=0.5)
page.insert_text((block['bbox'][0] - 7, block['bbox'][1] + 5), 'continued_2', fontsize=5, color=(0, 0, 0))
page.draw_rect((block['bbox'][2], block['bbox'][3] - 4, block['bbox'][2] + 4, block['bbox'][3]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2], block['bbox'][3] - 5), 'by_last_line_new_para', fontsize=5, color=(0, 0, 0))# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
else:
paragraphs.append(current_paragraph)
# 对文本块进行修订并插入模拟翻译
bbox = current_block_bbox
rect = fitz.Rect(bbox)
# # reversed_text = current_paragraph[::-1] # 反转文本作为模拟翻译
# # reversed_text = reversed_text.upper()
# reversed_text = '> ' + current_paragraph + ' <'
# # 使用红色注释覆盖原文本
# page.add_redact_annot(rect, text="")
# page.apply_redactions()
# # 插入模拟翻译后的文本,红色字体
# html = f'''
# {reversed_text}
# '''
# page.insert_htmlbox(rect, html, css="* {background-color: red; font-size: 30px;}")
# page.draw_rect(rect, color=(0, 1, 0), width=2)
# 将文本添加到待翻译列表
paragraphs_to_be_translated_list.append(current_paragraph)
paragraphs_to_be_translated_bbox_list.append(rect)
paragraphs_to_be_translated_page_num_list.append(page_number)
page_nums.append(page_number)
page.draw_rect((block['bbox'][2], merged_line['bbox'][1] - 2, block['bbox'][2] + 2, merged_line['bbox'][1]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2] + 2, merged_line['bbox'][1]), 'by_last_line_new_para', fontsize=5, color=(0, 0, 0))# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
titles_list = []
for info in toc_info.get(page_number, [{'title': 'N/A'}]):
titles_list.append(info['title'])
title_concat = ' | '.join(titles_list)
toc_titles.append(title_concat)
current_paragraph = line_text
current_block_bbox = list(merged_line['bbox'])
else:
current_paragraph += ' ' + line_text
current_block_bbox[0] = min(current_block_bbox[0], merged_line['bbox'][0])
current_block_bbox[2] = max(current_block_bbox[2], merged_line['bbox'][2])
current_block_bbox[3] = max(current_block_bbox[3], merged_line['bbox'][3])
else:
current_paragraph = line_text
current_block_bbox = list(merged_line['bbox'])
# look ahead for 未完待续 的迹象
last_line_ends_with_punctuation = False
if current_paragraph.strip():
last_line_ends_with_punctuation = current_paragraph.strip()[-1] in ['.', '!', '?', '。', '!', '?','"', '”']
last_line_ends_with_punctuation_and_script = re.search(r'[.?!。!?”"] *\d+$', current_paragraph.strip()) is not None
last_line_ends_with_punctuation = last_line_ends_with_punctuation or last_line_ends_with_punctuation_and_script
unfinished_column = False
just_first_line = False
just_sparsed_line = False
just_left_overhang = False
# if i + 2 < len(all_blocks_and_its_page):
if i + 1 < len(all_blocks_and_its_page):
# 是否未完待续
next_block = all_blocks_and_its_page[i + 1][1]
# next_block_2 = all_blocks_and_its_page[i + 2][1]
# 间隙蓝色方框
# also skip appending if this block is left and right-aligned with the next block and
# not ending with punctuation which means in a large chance it is a wrongly classified regular line in a block
left_indentation = block['bbox'][0] - next_block['bbox'][0]
left_overhang = next_block['bbox'][0] - block['bbox'][0]
horizontal_gap = next_block['bbox'][0] - block['bbox'][2]
# horizontal_gap_2 = next_block_2['bbox'][0] - block['bbox'][2]
vertical_gap = next_block['bbox'][1] - block['bbox'][3]
left_alignment = abs(block['bbox'][0] - next_block['bbox'][0])
right_alignment = abs(block['bbox'][2] - next_block['bbox'][2])
block_height = block['bbox'][3] - block['bbox'][1]
# draw a rect around the spacing between two blocks
page.draw_rect((block['bbox'][0], block['bbox'][3], next_block['bbox'][2], next_block['bbox'][1]), color=(0, 0, 1), width=0.5)
# insert text to mark the spacing between two blocks with round to 2 decimal places
text_ = str(round(left_alignment, 2))+' - '+str(round(right_alignment, 2))+' - '+str(round(vertical_gap, 2))
page.insert_text((block['bbox'][0] + 5, block['bbox'][3] + 5), text_, fontsize=5, color=(0, 0, 0))
# 右侧蓝色方框
# 未完的左侧 Column
# 有待继续
# Skip appending if the paragraph is not finished on this block
# vertical_alignment = abs(block['bbox'][1] - next_block['bbox'][1])
# horizontal_gap = next_block['bbox'][0] - block['bbox'][2]
if current_paragraph and 4 * span_height > horizontal_gap > span_height \
and not last_line_ends_with_punctuation:
# draw rect at the end of the paragraph
page.draw_rect((block['bbox'][2] + 3, block['bbox'][1], block['bbox'][2] + 5, block['bbox'][3]), color=(1, 0, 0), fill=(1, 0, 0), width=0.5)
page.insert_text((block['bbox'][2] + 5, block['bbox'][1] + 5), 'unfinished_column', fontsize=5, color=(0, 0, 0))
# forced to be false as delayed redact could be complex
unfinished_column = False
# continue
# 左侧红色方框
# 实为自然段首行
# 有待继续
# Also skip appending if this block is a wrongly classified first line of the next block
# right_alignment = abs(block['bbox'][2] - next_block['bbox'][2])
# left_indentation = block['bbox'][0] - next_block['bbox'][0]
# block_height = block['bbox'][3] - block['bbox'][1]
if right_alignment < span_height and 5 * span_height > left_indentation > 0.7 * span_height and vertical_gap < 0.5 * span_height and block_height < 2 * span_height:
# This block is a wrongly classified first line of the next block
page.draw_rect((block['bbox'][2] + 1, block['bbox'][1], block['bbox'][2] + 3, block['bbox'][3]), color=(1, 0, 0), width=0.5)
page.insert_text((block['bbox'][2] + 3, block['bbox'][1] + 5), 'just_first_line', fontsize=5, color=(0, 0, 0))
just_first_line = True
# continue
# 左侧红色方框
# 实为行间距较大的普通行
# 有待继续
# to deal with the large line spacing and thus wrongly classified regular line in a block, now consider the punctuation in this end
# if left_alignment < 5 and right_alignment < 5 and vertical_gap < 5:
# not even consider right alignment to include the last line of a paragraph that would normally not filled with text
# block_height = block['bbox'][3] - block['bbox'][1]
# if left_alignment < 0.7 * span_height and \
# (right_alignment < 0.7 * span_height or block['bbox'][2] - next_block['bbox'][2] > 0) and \
# vertical_gap < 0.5 * span_height: # 2023-10-2
# right_alignment < 0.7 * span_height and \ 2023-10-20
if left_alignment < 0.7 * span_height and \
0 < vertical_gap < 0.5 * span_height and block_height < 2 * span_height:
# vertical_gap < 0.5 * span_height and block_height < 2 * span_height: as some two or more lines are recognized as one
# only when first line of next block wasn't indentated
left_of_first_line_of_next_block = next_block['lines'][0]['bbox'][0]
if left_of_first_line_of_next_block - block['bbox'][0] < span_height:
line_height = merged_line['bbox'][3] - merged_line['bbox'][1]
line_height_percentile = stats.percentileofscore(all_line_heights, line_height)
# ratio_over_mena_line_height = line_height / mean_line_height
if not line_height_percentile > 80 and not line_height > 1.5 * mean_line_height:
page.draw_rect((block['bbox'][2] - 1, block['bbox'][1], block['bbox'][2] + 1, block['bbox'][3]), color=(1, 0, 0), width=0.5)
page.insert_text((block['bbox'][2] + 1, block['bbox'][1] + 5), 'just_sparsed_line', fontsize=5, color=(0, 0, 0))
# whenever we are here, hot slot flag should not apply any more
# hot_slot_page_num = ''
# hot_slot_index = ''
# no, not really
just_sparsed_line = True
# continue
# left_overhang = next_block['bbox'][0] - block['bbox'][0]
last_line_ends_with_punctuation = False
if current_paragraph.strip():
last_line_ends_with_punctuation = current_paragraph.strip()[-1] in ['.', '!', '?', '。', '!', '?','"', '”']
last_line_ends_with_punctuation_and_script = re.search(r'[.?!。!?”"] *\d+$', current_paragraph.strip()) is not None
last_line_ends_with_punctuation = last_line_ends_with_punctuation or last_line_ends_with_punctuation_and_script
# num_last_chars = min(len(current_paragraph.strip()), 5)
# current_paragraph_last_chars = current_paragraph.strip()[-num_last_chars:]
# block_height = block['bbox'][3] - block['bbox'][1]
if left_overhang > 0.7 * span_height and right_alignment < 0.7 * span_height and block_height < 2 * span_height \
and vertical_gap < 0.5 * span_height and not last_line_ends_with_punctuation:
page.draw_rect((block['bbox'][2] - 1, block['bbox'][1], block['bbox'][2] + 1, block['bbox'][3]), color=(1, 0, 0), width=0.5)
page.insert_text((block['bbox'][2] + 1, block['bbox'][1] + 5), 'just_left_overhang', fontsize=5, color=(0, 0, 0))
# page.insert_text((block['bbox'][2] + 1, block['bbox'][1] + 10), ''+current_paragraph_last_chars, fontsize=5, color=(0, 0, 0))
# page.insert_text((block['bbox'][2] + 1, block['bbox'][1] + 15), str(len(current_paragraph.strip())), fontsize=5, color=(0, 0, 0))
# whenever we are here, hot slot flag should not apply any more
# hot_slot_page_num = ''
# hot_slot_index = ''
# no, not really
just_left_overhang = True
# continue
# --- Finalize the accumulated paragraph -------------------------------------
# Decide whether `current_paragraph` (a) continues the previous page's open
# paragraph, (b) is a complete new paragraph, or (c) must stay open for the
# next page ("hot slot" mechanism).
if current_paragraph:
# check if the upper left corner of the block is in the upper left quarter of the page
# if block['bbox'][0] < upper_left_quarter.x1 and block['bbox'][1] < upper_left_quarter.y1:
# first_line_in_upper_left_quarter = True
# else:
# first_line_in_upper_left_quarter = False
# also make sure it's not a header by checking vertical spacing between this block and the next block
is_header = False
# aspect_ration = all_blocks_aspect_ratio[i]
# Header detection runs only on the first block of a page, and at most once
# per page. NOTE(review): `== False` is unidiomatic — prefer
# `not header_detected_dict[page_number]`.
if block_index == 0 and i + 1 < len(all_blocks_and_its_page) and header_detected_dict[page_number] == False:
# Skip appending if the paragraph is not finished on this block
next_block = all_blocks_and_its_page[i + 1][1]
left_alignment = abs(block['bbox'][0] - next_block['bbox'][0])
right_alignment = abs(block['bbox'][2] - next_block['bbox'][2])
vertical_gap = next_block['bbox'][1] - block['bbox'][3]
block_height = block['bbox'][3] - block['bbox'][1]
# A short block followed by a large vertical gap looks like a header;
# debug-mark with a small yellow square.
if block_height < 2 * span_height and vertical_gap > span_height:
page.draw_rect((block['bbox'][2], block['bbox'][1], block['bbox'][2] + 4, block['bbox'][1] + 4), fill=(1, 1, 0), width=0.5)
is_header = True
header_detected_dict[page_number] = True
# and hot_slot_page_num == page_number -1 and first_line_in_upper_left_quarter \ # maybe too hash
# NOTE(review): `000000` is the integer literal 0, so this entire
# cross-page merge branch is dead code (apparently a debug kill-switch);
# the `elif` below always runs instead. Drop the literal to re-enable
# merging a continuation into the paragraph held in the hot slot.
if 000000 and (not is_header) and (current_paragraph[0].islower() or has_chinese(current_paragraph)) and \
word_num(current_paragraph) > 20 and hot_slot_page_num == page_number -1 \
and not unfinished_column and not just_first_line and not just_sparsed_line and not just_left_overhang:
text_and_punctuations_before_indentation_dict[page_number].append(current_paragraph[-10:].strip())
# current_paragraph is a continuation of the last paragraph
last_2_words = ' '.join(paragraphs[hot_slot_index].split()[-2:])
page.insert_text((0 + 5, block['bbox'][1] + 5), last_2_words, fontsize=5, color=(0, 0, 0))
paragraphs[hot_slot_index] += ' ' + current_paragraph
# Clear the hot slot once the continuation has been absorbed.
hot_slot_page_num = ''
hot_slot_index = ''
page.draw_rect((block['bbox'][0] - 7, block['bbox'][1], block['bbox'][0] - 5, block['bbox'][3]), color=(0, 1, 0), fill = (0, 1, 0), width=0.5)
page.insert_text((block['bbox'][0] - 7, block['bbox'][1] + 5), 'continued_3', fontsize=5, color=(0, 0, 0))
page.draw_rect((block['bbox'][2], block['bbox'][3] - 4, block['bbox'][2] + 4, block['bbox'][3]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2], block['bbox'][3] - 5), 'para_end', fontsize=5, color=(0, 0, 0))# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
current_paragraph = ''
# Normal case: commit the paragraph, record its tail as indentation
# evidence, and queue it for translation.
elif not unfinished_column and not just_first_line and not just_sparsed_line and not just_left_overhang:
text_and_punctuations_before_indentation_dict[page_number].append(current_paragraph[-10:].strip())
paragraphs.append(current_paragraph)
# Redact the text block and insert a mock translation
bbox = current_block_bbox
rect = fitz.Rect(bbox)
# # reversed_text = current_paragraph[::-1] # reverse the text as a mock translation
# # reversed_text = reversed_text.upper()
# reversed_text = '> ' + current_paragraph + ' <'
# # cover the original text with a red redaction annotation
# page.add_redact_annot(rect, text="")
# page.apply_redactions()
# # insert the mock-translated text in red font
# html = f'''
# {reversed_text}
# '''
# # font-size 30
# page.insert_htmlbox(rect, html, css="* {background-color: red; font-size: 30px;}")
# page.draw_rect(rect, color=(0, 1, 0), width=2)
# Append the text, bbox and page number to the to-be-translated lists
paragraphs_to_be_translated_list.append(current_paragraph)
paragraphs_to_be_translated_bbox_list.append(rect)
paragraphs_to_be_translated_page_num_list.append(page_number)
page_nums.append(page_number)
page.draw_rect((block['bbox'][2], block['bbox'][3] - 4, block['bbox'][2] + 4, block['bbox'][3]), color=(0, 0, 0), fill = (0,0,0), width=0.5)
page.insert_text((block['bbox'][2], block['bbox'][3] - 5), 'para_end', fontsize=5, color=(0, 0, 0))# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
# toc_titles.append(toc_info.get(page_number, [{'title': 'N/A'}])[0]['title'])
# Concatenate every TOC title that maps to this page.
titles_list = []
for info in toc_info.get(page_number, [{'title': 'N/A'}]):
titles_list.append(info['title'])
title_concat = ' | '.join(titles_list)
toc_titles.append(title_concat)
# set hot slot index and page number as a flag if current paragraph is not ended with punctuation
# if current_paragraph.strip()[-1] not in ['.', '!', '?','。','!','?','"', '”']:
# if end with lower case letter, set it as hot slot
# check x1 of the last line of the block is almost the same as x1 of the block
line_bboxs = [line['bbox'] for line in block['lines']]
# check if the lower right corner of the block is in the lower right quarter of the page
if block['bbox'][2] > lower_right_corner.x0 and block['bbox'][3] > lower_right_corner.y0:
last_line_in_lower_right_corner = True
else:
last_line_in_lower_right_corner = False
# Paragraph looks unfinished: no final punctuation, last line reaches
# close to the block's right edge, and it sits at the page's lower-right
# corner -> remember it as the "hot slot" for the next page to merge into.
# if last_line_ends_with_lower_case and last_line_in_lower_right_corner:
if not last_line_ends_with_punctuation and block['bbox'][2] - line_bboxs[-1][2] < 3 * span_height and last_line_in_lower_right_corner:
if word_num(current_paragraph) > 20:
page.draw_rect((block['bbox'][2] + 5, block['bbox'][1], block['bbox'][2] + 7, block['bbox'][3]), color=(1, 0, 0), fill = (1, 0, 0), width=0.5)
page.insert_text((block['bbox'][2] + 7, block['bbox'][1] + 5), 'unfinished', fontsize=5, color=(0, 0, 0))
hot_slot_index = len(paragraphs) - 1
hot_slot_page_num = page_number
# check word count of last hot slot paragraph, update it if current paragraph is longer
# elif hot_slot_index != '' and len(paragraphs[hot_slot_index].split()) < word_num(current_paragraph):
# page.draw_rect((block['bbox'][2], block['bbox'][1], block['bbox'][2] + 5, block['bbox'][3]), color=(0, 0, 1), fill = (0,0,1), width=0.5)
# hot_slot_index = len(paragraphs) - 1
# hot_slot_page = page_number
# Reset accumulators for the next paragraph.
current_paragraph = ''
current_block_bbox = [0,0,0,0]
# --- Heuristic mode: rebuild paragraphs from raw page text ------------------
# Discards the layout-derived `paragraphs` and re-splits each page's raw text
# on sentence-ending punctuation followed by a newline. Splits whose tail does
# NOT match the indentation evidence gathered above
# (text_and_punctuations_before_indentation_dict) are considered false
# paragraph breaks and get tagged with the sentinel '<||--||>' so they can be
# logged; the sentinel is stripped again before the paragraph is stored.
# Ends with an early `return`, so none of the translation code below runs.
if heuristic:
paragraphs = []
paragraphs_fixed = []
with open(pdf_path[:-4] + '_paragraphs_fixed.txt', 'a') as f:
f.write(pdf_path[:-4] + '_paragraphs_fixed:\n\n')
doc_ = fitz.open(pdf_path)
for i, page in enumerate(doc_):
print(f"\n\nExtract paragraphs from page {i+1}...\n")
text = page.get_text('text')
# paragraph_break = re.compile(r'(?<=[.?!。!?”"] *\d+)\s*\n')
paragraph_break = re.compile(r'(?<=[.?!。!?”"])\s*\n')
page_paragraphs = re.split(paragraph_break, text)
text_and_punctuations_before_indentation_on_this_page_list = text_and_punctuations_before_indentation_dict.get(i+1, [])
# print(text_and_punctuations_before_indentation_on_this_page_list)
# Spaces are stripped on both sides so the endswith-style match is
# whitespace-insensitive.
text_and_punctuations_before_indentation_on_this_page_list = [text_and_punctuations.replace(' ', '') for text_and_punctuations in text_and_punctuations_before_indentation_on_this_page_list]
# print(text_and_punctuations_before_indentation_on_this_page_list)
# for text_and_punctuations in text_and_punctuations_before_indentation_on_this_page_list:
# if text_and_punctuations:
# text_and_punctuations_pattern = re.compile(re.escape(text_and_punctuations) + r'$')
# for paragraph in page_paragraphs:
# if not text_and_punctuations_pattern.search(paragraph):
# print(paragraph)
# text = text.replace(text_and_punctuations, text_and_punctuations + '|||||||||||||||||||')
for paragraph in page_paragraphs:
paragraph_ = paragraph.replace('\n', '')
paragraph_ = paragraph_.replace(' ', '')
match_any_text_and_punctuations = False
for text_and_punctuations in text_and_punctuations_before_indentation_on_this_page_list:
text_and_punctuations_pattern = re.compile(re.escape(text_and_punctuations) + r'$')
if text_and_punctuations_pattern.search(paragraph_):
match_any_text_and_punctuations = True
if not match_any_text_and_punctuations:
print('\nFalse positive paragraph_:')
print(text_and_punctuations_before_indentation_on_this_page_list)
print(paragraph_)
# NOTE(review): str.replace substitutes *every* occurrence of this
# paragraph text in the page; a repeated substring would be tagged
# more than once — verify this cannot happen in practice.
text = text.replace(paragraph, paragraph + '<||--||>')
with open(pdf_path[:-4] + '_paragraphs_fixed.txt', 'a') as f:
f.write(f'Page {i+1} paragraphs_fixed:\n{paragraph}\n\n')
# paragraph_break = re.compile(r'(?<=[.?!。!?”"] *\d+)\s*\n')
paragraph_break = re.compile(r'(?<=[.?!。!?”"])\s*\n')
page_paragraphs = re.split(paragraph_break, text)
for paragraph in page_paragraphs:
if paragraph.strip():
paragraph = paragraph.replace(' ', '')
paragraph = paragraph.replace('<||--||>', '')
# split lines in the same paragraph
lines = paragraph.split('\n')
# join them back with a space if in english
if not has_chinese(paragraph):
paragraph = ' '.join(lines)
else:
paragraph = ''.join(lines)
paragraphs.append(paragraph)
page_nums.append(i+1)
titles_list = []
for info in toc_info.get(i+1, [{'title': 'N/A'}]):
titles_list.append(info['title'])
title_concat = ' | '.join(titles_list)
toc_titles.append(title_concat)
# print(text_and_punctuations_before_indentation_dict)
# NOTE(review): page_nums/toc_titles may already hold entries appended by
# the layout pass above — confirm callers expect the combined lists.
return paragraphs, page_nums, toc_titles, doc, []
# doc.save(pdf_path[:-4] + "_marked.pdf")
# doc.save(pdf_path[:25] + "_layout_marked.pdf")
# print(span_heights)
# print(span_height)
# print(first_word_of_the_span)
# Strip spaces from Chinese paragraphs (spaces are extraction artifacts there).
# NOTE(review): `paragraphs_` is built but not referenced anywhere in this
# chunk — presumably consumed further down; verify it is not dead code.
paragraphs_ = []
for paragraph in paragraphs:
if has_chinese(paragraph):
paragraph = paragraph.replace(' ', '')
paragraphs_.append(paragraph)
###############################
# --- Run the translation backend -------------------------------------------
# Either stream through OpenAI or call a local translation server, retrying
# connection failures for up to 30 seconds.
print('等待翻译完成...')
if use_openai:
file_name = os.path.splitext(os.path.basename(pdf_path))[0]
paragraphs_translation_list = translate_paragraphs_openai_stream(paragraphs_to_be_translated_list, file_name)
print('使用OpenAI完成翻译!')
else:
# Wait up to 30 seconds for the local translation server to come up.
start_time = time.time()
retry_count = 0
while True:
try:
paragraphs_translation_list = translate_paragraphs(paragraphs_to_be_translated_list, pdf_path)
print('翻译完成!')
break
except requests.exceptions.ConnectionError:
if time.time() - start_time > 30:
raise Exception("本地翻译服务器30秒内未能启动,请检查服务器状态。")
retry_count += 1
print(f"正在尝试连接本地翻译服务器,第 {retry_count} 次重试...")
time.sleep(1)
# Early exit: caller only wanted the JSON dump, skip the PDF write-back.
if dump_json_only:
return
print("翻译结果:")
for i, translation in enumerate(paragraphs_translation_list, 1):
print(f"段落 {i}: {translation}")
# Mock translation: generate Chinese text of roughly the same word count.
# paragraphs_translation_list = []
# for paragraph in paragraphs_to_be_translated_list:
# word_count = len(paragraph.split())
# sample_text = """麦卡锡分别与信息论之父克劳德·香农和电气工程先驱纳撒尼尔·罗切斯特合作。麦卡锡在达特茅斯时说服明斯基、香农和罗切斯特帮助他组织"一项为期2个月,10人的人工智能研究,计划在1956年夏天进行。"人工智能这个术语是麦卡锡的创造;他想要将这个领域和一个名为控制论的相关努力区分开来。麦卡锡后来承认这个名字实际上并不受人欢迎——毕竟,目标是真正的,而非"人工"的智能——但"我必须给它一个名字,所以我给它起名为'人工智能'"。"""
# translated_paragraph = sample_text[:word_count * 2] # keep length close to the original word count
# paragraphs_translation_list.append(translated_paragraph)
# print(paragraphs_translation_list)
# print('翻译完成,使用了OpenAI!')
# --- Group translations by page number --------------------------------------
# Map page_num -> list of {original_paragraph, translated_paragraph, bbox}
# so the write-back loop below can process one page at a time.
paragraphs_to_be_translated_dict = {}
for original_paragraph, translated_paragraph, bbox, page_num in zip(
paragraphs_to_be_translated_list,
paragraphs_translation_list,
paragraphs_to_be_translated_bbox_list,
paragraphs_to_be_translated_page_num_list
):
if page_num not in paragraphs_to_be_translated_dict:
paragraphs_to_be_translated_dict[page_num] = []
paragraphs_to_be_translated_dict[page_num].append({
'original_paragraph': original_paragraph, # keep the original text
'translated_paragraph': translated_paragraph,
'bbox': bbox
})
# re-open another copy of the pdf
doc_translated = fitz.open(pdf_path)
# font_files = [
# "Deng.ttf", "Dengb.ttf", "Dengl.ttf", "FZSTK.TTF", "FZYTK.TTF",
# "HYZhongHeiTi-197.ttf", "msyh.ttc", "msyhbd.ttc", "msyhl.ttc",
# "simfang.ttf", "simhei.ttf", "simkai.ttf", "SIMLI.TTF", "simsun.ttc",
# "SIMYOU.TTF", "SourceHanSansCN-Bold.otf", "SourceHanSansCN-ExtraLight.otf",
# "SourceHanSansCN-Heavy.otf", "SourceHanSansCN-Light.otf",
# "SourceHanSansCN-Medium.otf", "SourceHanSansCN-Normal.otf",
# "SourceHanSansCN-Regular.otf", "STCAIYUN.TTF", "STFANGSO.TTF",
# "STHUPO.TTF", "STKAITI.TTF", "STLITI.TTF", "STSONG.TTF", "STXIHEI.TTF",
# "STXINGKA.TTF", "STXINWEI.TTF", "STZHONGS.TTF"
# ]
# font_files = ["STFANGSO.TTF"]
# font_files = ["STXIHEI.TTF"]
# font_files = ["SourceHanSansCN-Normal.otf"]
# font_files = ["simsun.ttc"]
# Single CJK font used for all inserted translations (FangZheng Song 3, GBK).
font_files = ["方正宋三_GBK.TTF"]
# font_files = [os.path.join("./fonts", font) for font in font_files]
# NOTE(review): os.chdir changes the *process-wide* cwd so the relative
# ./fonts paths resolve; `original_cwd` is saved here but a matching restore
# is not visible in this chunk — confirm it happens later.
original_cwd = os.getcwd()
current_file_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_file_dir)
# Forward slashes keep the url() usable in CSS on Windows as well.
font_files = [os.path.normpath(os.path.join("./fonts", font)).replace("\\", "/") for font in font_files]
# One @font-face rule per font, named fam0, fam1, ... for use in inline HTML.
css = " ".join([f"""@font-face {{font-family: fam{i}; src: url("{font}");}}""" for i, font in enumerate(font_files)])
from tqdm import tqdm
total_paragraphs = sum(len(paragraphs) for paragraphs in paragraphs_to_be_translated_dict.values())
pbar = tqdm(total=total_paragraphs, desc="替换页面和段落")
# --- Write translations back into the PDF (loop continues past this chunk) --
for i, page in enumerate(doc_translated):
# Save the page's original links (redaction can destroy link annotations).
original_links = [link for link in page.get_links()]
# print(f"页面 {i+1} 开始时的链接数: {len(original_links)}")
if i+1 in paragraphs_to_be_translated_dict:
for paragraph_data in paragraphs_to_be_translated_dict[i+1]:
original_paragraph = paragraph_data['original_paragraph']
translated_paragraph = paragraph_data['translated_paragraph']
# Skip non-body content (headers, references, etc.) but still
# advance the progress bar.
if not is_regular_content(original_paragraph):
pbar.update(1)
continue
# Blank out the original text; images=0 keeps images untouched.
rect = fitz.Rect(paragraph_data['bbox'])
page.add_redact_annot(rect, text="")
page.apply_redactions(images=0)
# Pick one of the registered @font-face families at random
# (with a single font configured this is deterministic).
random_font_index = random.randint(0, len(font_files)-1)
random_font = f"fam{random_font_index}"
html = f"""<span style="font-family:{random_font}; line-height: 1.5; letter-spacing: 0.05em;">{translated_paragraph}</span>"""