-
Notifications
You must be signed in to change notification settings - Fork 8
/
stats_gigaword.py
31 lines (29 loc) · 1.25 KB
/
stats_gigaword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from configs import output_dir
from collections import Counter
from nltk.stem import WordNetLemmatizer
import codecs
import os
def count_tokens(lines, progress_every=100000):
    """Count whitespace-separated tokens over an iterable of text lines.

    Args:
        lines: iterable of strings, one per corpus line (an open text
            file works).
        progress_every: print the 1-based number of lines read each time
            it reaches a multiple of this value.

    Returns:
        A Counter mapping each case-sensitive token to its frequency.
    """
    token_count = Counter()
    for line_no, line in enumerate(lines, 1):
        # split() with no arguments already discards surrounding
        # whitespace, so tokens need no further strip().
        token_count.update(line.split())
        if line_no % progress_every == 0:
            print(line_no)
    return token_count


def count_lemmas(token_count, lemmatize):
    """Aggregate token frequencies into lemma frequencies.

    ``lemmatize`` is called once per distinct token (on its lower-cased
    form) rather than once per occurrence; the resulting totals are the
    same as lemmatizing every occurrence, because every occurrence of a
    token maps to the same lemma.

    Args:
        token_count: mapping from token to frequency.
        lemmatize: callable taking a lower-cased token and returning its
            lemma.

    Returns:
        A Counter mapping each lemma to the summed frequency of all the
        tokens that reduce to it.
    """
    lemma_count = Counter()
    for tok, freq in token_count.items():
        lemma_count[lemmatize(tok.lower())] += freq
    return lemma_count


def write_counts(path, counts, cutoff_freq):
    """Write ``item<TAB>count`` lines for items seen at least cutoff_freq times.

    Args:
        path: destination file; it is written as UTF-8 and overwritten
            if it already exists.
        counts: mapping from item to frequency.
        cutoff_freq: minimum frequency an item needs to be written.
    """
    with codecs.open(path, 'w', 'utf-8') as f:
        for item, freq in counts.items():
            if freq >= cutoff_freq:
                f.write('%s\t%d\n' % (item, freq))


if __name__ == '__main__':
    cutoff_freq = 20
    preprocessed_gigaword_path = 'preprocessed-data/694cb4d/gigaword.txt'

    # Make sure the output directory exists before the long counting
    # pass, so a missing directory cannot waste the whole run.
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # For a quick debugging run, feed only the first lines of the file,
    # e.g. itertools.islice(f, 1000).
    with codecs.open(preprocessed_gigaword_path, 'r', 'utf-8') as f:
        token_count = count_tokens(f)

    wordnet_lemmatizer = WordNetLemmatizer()
    lemma_count = count_lemmas(token_count, wordnet_lemmatizer.lemmatize)

    write_counts(os.path.join(output_dir, 'token.lst'),
                 token_count, cutoff_freq)
    write_counts(os.path.join(output_dir, 'lemma.lst'),
                 lemma_count, cutoff_freq)