-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPreprocessing.py
67 lines (56 loc) · 2.02 KB
/
Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import collections
class Preprocessing:
    """Text-preprocessing helpers exposed as static methods.

    The individual steps (cleaning, case folding, tokenising, de-duplicating,
    stemming and stopword filtering) can be called on their own or chained
    through ``all_in_one`` / ``all_in_one_without_type``.

    Attributes:
        stopword: list of stopwords, one per line of ``stopword-list.txt``.
            The file is read relative to the current working directory when
            the class is defined.
    """

    # Read the stopword file once at class-definition time. The context
    # manager guarantees the handle is closed; the temporary name is removed
    # so it does not linger as a class attribute.
    with open("stopword-list.txt", "r") as _stopword_file:
        stopword = _stopword_file.read().split('\n')
    del _stopword_file

    # Any whitespace-delimited chunk that contains ".<tld>" is treated as a
    # link. Raw strings avoid invalid-escape-sequence warnings.
    # NOTE(review): "nel" in the TLD list may be a typo (e.g. for "nl") - confirm.
    _LINK_PATTERN = re.compile(
        r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*'
    )
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    # Single translation table: every punctuation character becomes a space
    # and every ASCII digit is deleted.
    _CLEAN_TABLE = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation), '1234567890'
    )

    # Stemmer instance, created lazily on the first call to ``stemming`` and
    # reused afterwards so the factory is not rebuilt on every call.
    _stemmer = None

    @staticmethod
    def cleaning(text):
        """Strip links, punctuation, digits and redundant whitespace.

        Args:
            text: the raw input string.

        Returns:
            The text with link-like chunks and punctuation replaced by
            spaces, digits removed, whitespace runs collapsed to a single
            space, and leading/trailing whitespace stripped.
        """
        without_links = Preprocessing._LINK_PATTERN.sub(' ', text)
        without_symbols = without_links.translate(Preprocessing._CLEAN_TABLE)
        return Preprocessing._WHITESPACE_PATTERN.sub(' ', without_symbols).strip()

    @staticmethod
    def case_folding(text):
        """Return the case-folded version of ``text``."""
        return text.casefold()

    @staticmethod
    def tokenisasi(text):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

    @staticmethod
    def filtering(text):
        """Drop every token that appears in ``Preprocessing.stopword``.

        Args:
            text: an iterable of string tokens.

        Returns:
            A list of the tokens that are not stopwords, in original order.
        """
        # Build the lookup set once per call so each membership test is O(1)
        # while still honouring any later changes made to ``stopword``.
        stopwords = frozenset(Preprocessing.stopword)
        return [token for token in text if token not in stopwords]

    @staticmethod
    def type(text):
        """Return the distinct tokens of ``text`` in first-seen order."""
        return list(collections.OrderedDict.fromkeys(text))

    @staticmethod
    def stemming(text):
        """Stem every token with the Sastrawi stemmer.

        Args:
            text: an iterable of string tokens.

        Returns:
            A list with the stemmed form of each token, in the same order.
        """
        if Preprocessing._stemmer is None:
            Preprocessing._stemmer = StemmerFactory().create_stemmer()
        stem = Preprocessing._stemmer.stem
        return [stem(token) for token in text]

    @staticmethod
    def _run_pipeline(text, deduplicate):
        """Run clean -> case-fold -> tokenise -> [dedupe] -> stem -> filter."""
        tokens = Preprocessing.tokenisasi(
            Preprocessing.case_folding(Preprocessing.cleaning(text))
        )
        if deduplicate:
            # De-duplication happens before stemming, so different surface
            # forms that share a stem can still yield repeated stems.
            tokens = Preprocessing.type(tokens)
        return Preprocessing.filtering(Preprocessing.stemming(tokens))

    @staticmethod
    def all_in_one(text):
        """Run the full pipeline, including token de-duplication.

        Args:
            text: the raw input string.

        Returns:
            A list of stemmed, stopword-filtered tokens.
        """
        return Preprocessing._run_pipeline(text, deduplicate=True)

    @staticmethod
    def all_in_one_without_type(text):
        """Run the pipeline without the de-duplication step.

        Prints a marker line and the raw input before processing.

        Args:
            text: the raw input string.

        Returns:
            A list of stemmed, stopword-filtered tokens (duplicates kept).
        """
        # Debug output kept as-is; callers may rely on it appearing on stdout.
        print('preprocessing')
        print(text)
        return Preprocessing._run_pipeline(text, deduplicate=False)