-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_statistics.py
105 lines (92 loc) · 2.99 KB
/
doc_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
from doctools_lib import paragraph_iterator
from docx import Document
from colorama import Fore, init, Back
from pprint import pprint
def analize_figures(lines):
    """Collect the numbers of figure captions found in *lines*.

    A paragraph is treated as a figure caption when, after optional leading
    whitespace, it starts with ``Рис.`` or ``Рисунок``, followed by
    whitespace and a number built from digits and dots. After the number the
    paragraph must either continue with a dash (``–`` or ``-``) and any
    text, or contain nothing but dots/spaces up to its end.

    Collection can be toggled from inside the text: a paragraph matching the
    regex ``<f+stop>`` (e.g. ``<fstop>``) switches it off, and one matching
    ``<f+continue>`` switches it back on.

    Args:
        lines: Iterable of paragraph strings.

    Returns:
        List of the captured number strings, in order of appearance.
    """
    caption_re = re.compile(
        r'^\s*(Рис\.|Рисунок)\s+(?P<num>[\d.]+?)\s*([–-].*|[. ]*$)')
    pause_re = re.compile(r'<f+stop>')
    resume_re = re.compile(r'<f+continue>')

    numbers = []
    enabled = True
    for text in lines:
        # The resume marker is tested after the pause marker, so it wins
        # when both occur in the same paragraph.
        if pause_re.search(text):
            enabled = False
        if resume_re.search(text):
            enabled = True
        if enabled:
            hit = caption_re.match(text)
            if hit is not None:
                numbers.append(hit.group('num'))
    return numbers
def analize_formulas(lines):
    """Collect the numbers of numbered equations found in *lines*.

    A paragraph is treated as an equation when, read from its start, it
    contains no Cyrillic letters and ends with a parenthesised number made
    of digits, dots and hyphens (e.g. ``(1.2)`` or ``(2-3)``), optionally
    followed by trailing whitespace. Paragraphs containing Russian text
    before the parenthesis (such as a reference "см. формулу (1.1)") are
    therefore skipped.

    Collection can be toggled from inside the text: a paragraph matching the
    regex ``<e+stop>`` (e.g. ``<estop>``) switches it off, and one matching
    ``<e+continue>`` switches it back on.

    Args:
        lines: Iterable of paragraph strings.

    Returns:
        List of the captured number strings, in order of appearance.
    """
    stop_tag = re.compile(r'<e+stop>')
    continue_tag = re.compile(r'<e+continue>')
    # 'ё'/'Ё' sit outside the contiguous А–я code-point range, so they must
    # be listed explicitly; otherwise text using only those letters would be
    # mistaken for a formula line.
    pat = re.compile(r'[^а-яА-ЯёЁ]*\((?P<num>[\d.-]+)\)\s*$')

    rez = []
    do_analysis = True
    for p in lines:
        # The continue tag is tested after the stop tag, so it wins when
        # both occur in the same paragraph.
        if stop_tag.search(p):
            do_analysis = False
        if continue_tag.search(p):
            do_analysis = True
        if not do_analysis:
            continue
        m = pat.match(p)
        if m:
            rez.append(m.group('num'))
    return rez
def analize_tables(lines):
    """Collect the numbers of table captions found in *lines*.

    A paragraph is treated as a table caption when, after optional leading
    whitespace, it starts with ``Таб.``, ``Табл.`` or ``Таблица``, followed
    by whitespace and a number built from digits and dots. After the number
    the paragraph must either continue with a dash (``–`` or ``-``) and any
    text, or contain nothing but dots/spaces up to its end.

    Collection can be toggled from inside the text: a paragraph matching the
    regex ``<t+stop>`` (e.g. ``<tstop>``) switches it off, and one matching
    ``<t+continue>`` switches it back on.

    Args:
        lines: Iterable of paragraph strings.

    Returns:
        List of the captured number strings, in order of appearance.
    """
    stop_tag = re.compile(r'<t+stop>')
    continue_tag = re.compile(r'<t+continue>')
    # The dot in "Табл." is escaped so that only the literal abbreviation
    # matches; an unescaped dot would accept any character after "Табл"
    # (e.g. the word "Табло").
    pat = re.compile(
        r'^\s*(Таб\.|Таблица|Табл\.)\s+(?P<num>[\d.]+?)\s*([–-].*|[. ]*$)')

    rez = []
    do_analysis = True
    for p in lines:
        # The continue tag is tested after the stop tag, so it wins when
        # both occur in the same paragraph.
        if stop_tag.search(p):
            do_analysis = False
        if continue_tag.search(p):
            do_analysis = True
        if not do_analysis:
            continue
        m = pat.match(p)
        if m:
            rez.append(m.group('num'))
    return rez
def get_lines(doc):
    """Return the ``text`` of every paragraph yielded by ``paragraph_iterator(doc)``.

    Args:
        doc: Document object handed to ``paragraph_iterator``.

    Returns:
        List of paragraph texts, in iteration order.
    """
    return [paragraph.text for paragraph in paragraph_iterator(doc)]
def doc_analysis(doc):
    """Print a coloured report of figures, equations and tables in a docx.

    The document is opened with ``Document(doc)``, its paragraph texts are
    read once via ``get_lines``, and each analyser is run over them. For
    every category the count and the list of found numbers are printed,
    followed by a compact three-line summary of the counts.

    Args:
        doc: Value passed straight to ``Document`` (the command-line entry
            point passes the input docx file argument).
    """
    init(autoreset=True)
    banner = Fore.BLACK + Back.WHITE
    print(banner + 'Reading text...')
    lines = get_lines(Document(doc))
    print(banner + 'Done...')

    # (label, analyser) pairs in the order they are reported.
    sections = (
        ('Figures:', analize_figures),
        ('Equations:', analize_formulas),
        ('Tables:', analize_tables),
    )

    totals = []
    for label, analyser in sections:
        found = analyser(lines)
        print(Fore.GREEN + label, len(found))
        pprint(found, indent=5, width=40, compact=True)
        totals.append((label, len(found)))

    # Final summary: highlighted label, then the count after a tab.
    for label, count in totals:
        print(Back.YELLOW + Fore.BLACK + label +
              Fore.WHITE + Back.BLACK + '\t' + str(count))
if __name__ == '__main__':
    # Command-line entry point: takes one positional argument (the docx
    # file) and prints the statistics report for it.
    from argparse import ArgumentParser

    cli = ArgumentParser(description="""
Usage example: python doc_statistics.py test.docx
Gets number of figures, equations and tables from docx-file.
""")
    cli.add_argument('input', help='Input docx file')
    doc_analysis(cli.parse_args().input)