parser.py
# -*- coding: UTF-8 -*-
# __author__ = Huang Wenguan
# date : 2017.3.15
from bs4 import BeautifulSoup
import jieba
import os
import re

def parseHtmlfromFile(file):
    '''
    Given an HTML file, parse it and return the text found in it.
    For now we only extract text inside <div> tags.
    itype : str (path to an HTML file)
    rtype : str
    '''
    res = []
    try:
        with open(file) as f:
            soup = BeautifulSoup(f, "lxml")
        for div in soup.find_all('div'):
            res += list(div.stripped_strings)
    except Exception as e:
        print('something wrong in', str(file), ':', e)
        res.append(str(file))
    return ''.join(res)
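
# Illustrative note (not from the original file): for an HTML body like
# '<div>年度报告</div><div>2016年</div>', the call below returns the
# concatenated div text '年度报告2016年'; ''.join adds no separator, so
# adjacent fragments run together.
#
#     parseHtmlfromFile('report.html')   # 'report.html' is a hypothetical path
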
def jbTokenizer(raw_text):
    '''
    Tokenize raw text into a list of words with jieba.
    itype: str
    otype: list[str]
    '''
    seg_list = jieba.cut(raw_text)
    return list(seg_list)

def sentTokenizer(raw_text):
    '''
    Split the text into sentences first, then tokenize each sentence,
    so each sentence is encoded as a list of words.
    itype: str
    otype: list[list[str]]
    '''
    stop_punctuations = '[,。?()《》]'
    sentences = re.split(stop_punctuations, raw_text)
    return [jbTokenizer(s) for s in sentences]
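
# Illustrative sketch (not from the original file): the exact word boundaries
# depend on jieba's dictionary, but the shape of the result looks like
#
#     sentTokenizer('今天天气好,我们去公园')
#     # -> [['今天', '天气', '好'], ['我们', '去', '公园']]
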
def purify(raw_list, tag='whole'):
    '''
    Remove unnecessary elements such as
    1. tokens containing digits
    2. punctuation
    itype: list[str] (or list[list[str]] when tag == 'sent')
    otype: same shape as the input
    '''
    number_pattern = re.compile(r'.*\d.*')
    containNumber = lambda x: number_pattern.match(x)
    # ,。?()《》 are left out here: they were already consumed when splitting sentences
    punctuations = ',.()、--%//√: '
    if tag == 'whole':
        # `x in punctuations` is a substring test, so it also drops
        # multi-character tokens such as '--' and '//'
        return [x for x in raw_list if not containNumber(x) and x not in punctuations]
    elif tag == 'sent':
        return [purify(s) for s in raw_list]
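
# Illustrative sketch (not from the original file): digit-bearing tokens and
# punctuation tokens are dropped, everything else is kept, e.g.
#
#     purify(['增长', '5%', '、', '利润'])
#     # -> ['增长', '利润']
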
def iterateFolder(htmlpath, outputpath):
    '''
    Handle all files in htmlpath and save the tokenized output in outputpath.
    '''
    print('starting iterating ...')
    for parent, dirnames, filenames in os.walk(htmlpath):
        print('total in', str(len(filenames)))
        i = 0
        for filename in filenames:
            if 'html' not in filename:
                continue
            print('going for', str(i))
            full_name = os.path.join(parent, filename)
            raw_text = parseHtmlfromFile(full_name)
            # treat every file as a list of sentences, each a list of words;
            # use purify(jbTokenizer(raw_text)) instead to treat the whole
            # file as one flat list of words
            sentences_separated_result = sentTokenizer(raw_text)
            fine_result = purify(sentences_separated_result, 'sent')
            # write the result out, swapping the .html extension for .txt
            output_filename = os.path.splitext(filename)[0] + '.txt'
            output_fullname = os.path.join(outputpath, output_filename)
            with open(output_fullname, 'w') as output_file:
                output_file.write(str(fine_result))
            i += 1

def main():
    htmlpath = '/home/vincent/tmp/shang1000/out'
    outputpath = '/home/vincent/tmp/out1000_sentence'
    iterateFolder(htmlpath, outputpath)

if __name__ == "__main__":
    main()
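
# Illustrative sketch (not from the original file): processing a single file
# end to end; 'page.html' and 'page.txt' are hypothetical paths.
#
#     raw = parseHtmlfromFile('page.html')
#     tokens = purify(sentTokenizer(raw), 'sent')
#     with open('page.txt', 'w') as f:
#         f.write(str(tokens))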