This repository has been archived by the owner on Jan 17, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcarta_corpus_preproc.py
62 lines (50 loc) · 1.79 KB
/
carta_corpus_preproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
## Read the whole letter into a single string (the file contents).
## NOTE(review): no encoding is passed to open(), so the platform default is
## used -- confirm the encoding of carta.txt and consider passing it explicitly.
with open('carta.txt', 'r') as fd:
    ## read() yields the same string as joining every line of readlines().
    ## The with-statement closes the file, so no explicit close() is needed.
    text_raw = fd.read()
## Pre-processing setup: fetch the Portuguese stopword list and take the raw
## letter as the text to be processed. Lower-casing and stopword removal are
## applied later, paragraph by paragraph.
## NOTE(review): `stopwords` looks like NLTK's corpus reader, but its import is
## not visible in this file -- confirm.
stopwords_ptbr = stopwords.words('portuguese')
text_pp = text_raw

## Empty containers for each granularity of the corpus. Every "_nosw" name is
## the stopword-free counterpart of the name next to it.
text, text_nosw = [], []
paragraphs, paragraphs_nosw = [], []
sentences, sentences_nosw = [], []
words, words_nosw = [], []
## Walk the letter one paragraph at a time (paragraphs are separated by a
## blank line) and fill the paragraph / sentence / word lists, both for the
## raw text and for the lower-cased, stopword-free ("_nosw") version.
## A set gives a constant-time stopword test instead of a list scan per token.
stopwords_lookup = set(stopwords_ptbr)
for paragraph in text_pp.split('\n\n'):
    ## Fold the hard line breaks inside the paragraph into single spaces
    paragraph = ' '.join(paragraph.split('\n')).strip()
    if paragraph == '':
        ## Extra blank lines produce empty chunks; keep them out of every list
        continue

    paragraph_tkn = word_tokenize(paragraph.lower())
    paragraph_nosw_tkn = [w for w in paragraph_tkn if w not in stopwords_lookup]
    paragraph_nosw = ' '.join(paragraph_nosw_tkn)

    ## Appended together so paragraphs[i] and paragraphs_nosw[i] always refer
    ## to the same paragraph
    paragraphs.append(paragraph)
    paragraphs_nosw.append(paragraph_nosw)

    ## Raw sentences: split on periods, dropping empty pieces
    for sentence in paragraph.split('.'):
        sentence = sentence.strip()
        if sentence == '':
            continue
        sentences.append(sentence)
        for word in sentence.split(' '):
            word = word.strip()
            if word != '':
                words.append(word)

    ## Stopword-free sentences: cut the token stream at every '.' token.
    ## Splitting the joined string on ' . ' left a trailing ' .' on the last
    ## sentence of each paragraph and could emit empty strings. The extra '.'
    ## appended here is a sentinel that flushes the final sentence.
    sentence_tkn = []
    for token in paragraph_nosw_tkn + ['.']:
        if token != '.':
            sentence_tkn.append(token)
        elif sentence_tkn:
            sentences_nosw.append(' '.join(sentence_tkn))
            sentence_tkn = []

    ## Use the token list directly; ''.split(' ') would add an empty string
    ## when every word of the paragraph is a stopword
    words_nosw.extend(paragraph_nosw_tkn)

## Whole-letter strings: raw text on one line, and the stopword-free text
text = text_pp.replace('\n', ' ').strip()
text_nosw = ' '.join(paragraphs_nosw)

## Punctuation-free version used for the frequency count.
## Single punctuation characters are deleted in one pass. Spaced hyphens
## (' - ') cannot be matched character by character, so they are dropped as
## standalone '-' tokens after re-tokenizing; hyphens inside words are kept.
punct_table = str.maketrans('', '', ',:.;–—')
text_nosw_nopunct = text_nosw.translate(punct_table)
text_nosw_nopunct = ' '.join(
    tkn for tkn in word_tokenize(text_nosw_nopunct) if tkn != '-'
)
## Unigram statistics over the cleaned text. Both helpers are defined outside
## this file's visible code; the second argument (1) is presumably the n-gram
## order -- confirm against ngram_frequency's definition.
freq = ngram_frequency(text_nosw_nopunct, 1)
total = compute_probability(freq)

## Dump the frequency table to stdout
print(freq)

## Bar chart with one bar per key of freq, on a single 5x5 inch axes at 150 dpi
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5), dpi=150)
bar_labels = freq.keys()
bar_heights = freq.values()
bar1 = ax.bar(bar_labels, bar_heights)
## Optional, currently disabled: rotate the tick labels vertically, e.g.
## ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical', ha='right')