forked from Nagi-ovo/CHSI-Converter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_info.py
112 lines (101 loc) · 4.08 KB
/
extract_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
from pypdf import PdfReader
from pypinyin import lazy_pinyin
from flask import flash
def extract_text_from_pdf(pdf_path):
try:
pdf_file_obj = open(pdf_path, 'rb')
pdf_reader = PdfReader(pdf_file_obj)
text = ""
for page in pdf_reader.pages:
text += page.extract_text()
pdf_file_obj.close()
return text
except Exception as e:
flash(f"从PDF提取文本错误:{e}")
raise
def extract_info(patterns_dict, text):
results = {}
for prop, pattern in patterns_dict.items():
match = re.search(pattern, text)
if match:
if prop == 'Name':
pinyin = lazy_pinyin(match.group(1))
first_char = pinyin[0].capitalize()
remaining_chars = ''.join(pinyin[1:]).capitalize()
results[prop] = remaining_chars + ' ' + first_char
elif prop == 'Gender':
if match.group(1) == '男':
results[prop] = 'Male'
elif match.group(1) == '女':
results[prop] = 'Female'
else:
results[prop] = None
elif prop == 'Ethnic':
value = match.group(1)
pinyin_list = lazy_pinyin(value[:-1])
results[prop] = ''.join(pinyin_list).title()
elif prop == 'Date of Birth' or prop == 'Date of Enrollment':
value = match.group(1)
parts = value.split('年')
year = parts[0]
month_day = parts[1].split('月')
formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year
results[prop] = formatted_date
elif prop == 'Levels' :
results[prop] = 'Undergraduate'
elif prop == 'Form' :
results[prop] = 'General full-time remote study'
elif prop == 'Educational System':
results[prop] = match.group(1)+' years'
elif prop == 'Type':
results[prop] = 'General higher education'
elif prop == 'School Status':
value = match.group(1)
date_part = value.split(":")[1]
parts = date_part.split('年')
year = parts[0]
month_day = parts[1].split('月')
day = month_day[1].replace('日', '').replace(')', '') # Remove trailing bracket from the day
formatted_date = month_day[0] + '/' + day + '/' + year
results[prop] = 'Student registration (Expected graduation date: ' + formatted_date + ')'
elif prop == 'Update Date':
value = match.group(1)
parts = value.split('年')
year = parts[0]
month_day = parts[1].split('月')
formatted_date = month_day[0] + '/' + month_day[1].replace('日', '') + '/' + year
results[prop] = formatted_date
else:
results[prop] = match.group(1)
else:
results[prop] = None
return results
def extract_info_from_pdf(path):
text = extract_text_from_pdf(path)
def rc(pattern):
return re.compile(r'{}\s*([^\s]*)'.format(pattern))
patterns_dict = {
'Update Date': rc('更新日期:'),
'Name': rc('姓名'),
'Gender': rc('性别'),
'Id Number': rc('证件号码'),
'Ethnic': rc('民族'),
'Date of Birth': rc('出生日期 '),
'Institution': rc('院校'),
'Levels': rc('层次'),
'Faculties': rc('院系'),
'Class': rc('班级'),
'Major': rc('专业'),
'Student Number': rc('学号'),
'Form': rc('形式'),
'Date of Enrollment': rc('入学日期'),
'Educational System': rc('学制'),
'Type': rc('类型'),
'School Status': rc('学籍状态'),
}
# 获取匹配的信息
results = extract_info(patterns_dict, text)
# for prop, value in results.items():
# print(f'{prop}: {value}')
return results