preprocessing.py
import re
import string

import unidecode
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

# The NLTK 'punkt' and 'stopwords' resources must be available, e.g. via:
#   import nltk; nltk.download('punkt'); nltk.download('stopwords')

def trim(text):
    """Strip leading and trailing whitespace."""
    return text.strip()


def text_lowercase(text):
    """Convert the text to lowercase."""
    return text.lower()


def keep_alphanumeric(text):
    """Replace every non-alphanumeric character (and underscores) with a space."""
    pattern = re.compile(r'[\W_]+', re.UNICODE)
    return pattern.sub(' ', text)


def letters_only(text):
    """Replace everything except ASCII letters with a space."""
    return re.sub('[^a-zA-Z]+', ' ', text)


def remove_punctuation(text):
    """Replace ASCII punctuation and a few common French symbols with spaces."""
    punctuation = string.punctuation + '’—°€'
    for punct in punctuation:
        text = text.replace(punct, ' ')
    return text


def remove_accents(text):
    """Transliterate accented characters to their ASCII equivalents."""
    return unidecode.unidecode(text)


def remove_digits(text):
    """Replace every digit with a space."""
    for digit in string.digits:
        text = text.replace(digit, ' ')
    return text


def tokenize(text):
    """Split the text into a list of tokens using NLTK's French tokenizer."""
    return word_tokenize(text, language='french')


def clean(text):
    """Basic cleaning: strip punctuation and accents, then lowercase."""
    text = remove_punctuation(text)
    text = remove_accents(text)
    text = text_lowercase(text)
    return text


def remove_stopwords(text):
    """Remove French stopwords from a whitespace-separated string."""
    stop_words = set(stopwords.words('french'))
    words = text.split(" ")
    words = [word for word in words if word not in stop_words]
    return " ".join(words)


def preprocessing(text):
    """Full pipeline: lowercase, strip punctuation, accents and digits,
    drop stopwords, then stem each token with the French Snowball stemmer."""
    text = text_lowercase(text)
    text = remove_punctuation(text)
    text = remove_accents(text)
    text = remove_digits(text)
    # Remove stopwords while the text is still a single string,
    # then tokenize and stem token by token.
    text = remove_stopwords(text)
    tokens = tokenize(text)
    stemmer = FrenchStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)


def preprocessing_all(text):
    """Pipeline keeping ASCII letters only, with stopword removal and stemming."""
    text = text_lowercase(text)
    text = remove_accents(text)
    text = letters_only(text)
    text = remove_stopwords(text)
    tokens = tokenize(text)
    stemmer = FrenchStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)


def preprocessing_no_stemmer(text):
    """Same as preprocessing_all, but without the stemming step."""
    text = text_lowercase(text)
    text = remove_accents(text)
    text = letters_only(text)
    text = remove_stopwords(text)
    tokens = tokenize(text)
    return ' '.join(tokens)
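

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how the three pipelines might be called on a short
# French sentence; the sample sentence is made up for demonstration and the
# NLTK 'punkt' and 'stopwords' resources are assumed to be downloaded.
if __name__ == '__main__':
    sample = "Les élèves réviseront 3 leçons avant l'examen !"
    print(preprocessing(sample))
    print(preprocessing_all(sample))
    print(preprocessing_no_stemmer(sample))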