CleanText.py
from dataloader.base import BaseLoader
from konlpy.tag import Mecab
import argparse
import copy
import os
import re
class BasicLoader(BaseLoader):
    """Loads raw .txt files from a directory and writes the cleaned text back in place."""

    def __init__(self):
        super().__init__()

    def parse_args(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('--data_input_path', type=str,
                            default='/Users/angeonhui/Desktop/data/crawled_txt/body',
                            help='Base path of the input texts.')
        self.args, remaining_args = parser.parse_known_args()
        return copy.deepcopy(self.args), remaining_args

    def prepare(self):
        # Build the list of text file names (without extension) under the input path.
        print(self.args.data_input_path)
        input_path = os.path.join(self.args.data_input_path)
        self.text_name_list = [os.path.splitext(f)[0] for f in os.listdir(input_path)
                               if f.lower().endswith('.txt') and f != '.txt']
        print(self.text_name_list)
        print('data: %d texts are prepared' % len(self.text_name_list))

    def get_num_texts(self):
        return len(self.text_name_list)

    def _get_input_text(self, text_index):
        text_path = os.path.join(self.args.data_input_path, '%s.txt' % self.text_name_list[text_index])
        return self._load_text(text_path)

    def _load_text(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    def _save_text(self, text, text_index):
        # Overwrite the original file with the cleaned text.
        save_path = os.path.join(self.args.data_input_path, '%s.txt' % self.text_name_list[text_index])
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(text)
mecab = Mecab()

def spacing_mecab(wrongSentence):
    """Re-space a sentence using Mecab POS tags: morphemes whose tag is in the
    list below (particles, endings, suffixes, some symbols) are glued to the
    previous morpheme; every other morpheme starts a new space-separated word."""
    tagged = mecab.pos(wrongSentence)
    corrected = ""
    for morph, tag in tagged:
        if tag in ('JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC',
                   'EP', 'EC', 'ETN', 'ETM', 'XPN', 'XSN', 'XSV', 'SA', 'SF'):
            corrected += morph
        else:
            corrected += " " + morph
    if not corrected:
        return corrected + '.'
    if corrected[0] == " ":
        corrected = corrected[1:]
    return corrected
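
# Illustrative (hypothetical) example: the exact segmentation depends on the
# installed Mecab dictionary, but a run-together sentence such as
#   spacing_mecab("아이들이학교에갑니다")
# would typically come back with word spacing restored, e.g. "아이들이 학교에 갑니다".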
def clean_sentence(txt):
    pattern = r'(\d\d\d-\d\d\d\d-\d\d\d\d)'  # remove phone numbers (000-0000-0000); \d is one digit
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'  # remove e-mail addresses
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # remove URLs
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'([ㄱ-ㅎㅏ-ㅣ]+)'  # remove stray Hangul consonants/vowels (jamo)
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'<[^>]*>'  # remove HTML tags
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'[^\w\s.]'  # remove special characters (keep word chars, whitespace, periods)
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'ˇˇ+'  # remove 'ˇˇ' runs left over from crawling
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'bsc'  # drop literal 'bsc' strings (crawl residue)
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r'body'  # drop literal 'body' strings (crawl residue)
    txt = re.sub(pattern=pattern, repl='', string=txt)
    pattern = r' +'  # drop remaining spaces; spacing is rebuilt later by spacing_mecab()
    txt = re.sub(pattern=pattern, repl='', string=txt)
    return txt
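
# Illustrative (hypothetical) example: phone numbers, e-mails, URLs and HTML tags
# are stripped and remaining spaces removed, so an input like
#   clean_sentence("문의 010-1234-5678 <b>안내</b> http://example.com")
# would come out roughly as "문의안내", ready for spacing_mecab() to re-space.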
# Clean every text file in place: strip noise, then restore word spacing.
basic_loader = BasicLoader()
basic_loader.parse_args()
basic_loader.prepare()
for idx in range(basic_loader.get_num_texts()):
    text = basic_loader._get_input_text(idx)
    text = clean_sentence(text)
    text = spacing_mecab(text)
    print(text)
    basic_loader._save_text(text, idx)
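
# Typical invocation (the path below is a placeholder; --data_input_path is the
# only option this script defines):
#   python CleanText.py --data_input_path /path/to/crawled_txt/body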