# collocation_analysis.py
"""
Collocation Analysis
--------------------
Looking at bi-grams and tri-grams, ordering and printing by statistics
"""
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, \
    TrigramAssocMeasures, TrigramCollocationFinder
from nlpkit.tokenizer import bigram_prep, trigram_prep, break_list_by_window, \
    allow_noun_adj_adp, tokens_to_texts, lemmatizing, is_noun_or_adj
from nlpkit.utils import lines_to_file
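
# The NLTK-based helpers below (top_unigram, top_bigram, top_trigram and the
# finder constructors) expect `tokenized_docs` to be a list of documents,
# each a list of tokens, e.g. (made-up sample):
#
#     tokenized_docs = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
#
# save_bigrams appears to rely on richer token objects that carry POS
# information (inferred from allow_noun_adj_adp and tokens_to_texts), so
# plain string tokens may not be enough there.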


# Very slow compared to library implementations: flatten_corpus.count(el)
# runs once per vocabulary entry, so this is O(corpus size * vocabulary size).
def top_unigram(tokenized_docs, nbr_of_tops=1, should_filter=False):
    if should_filter:
        filtered_tokenized_docs = [bigram_prep(doc) for doc in tokenized_docs]
    else:
        filtered_tokenized_docs = tokenized_docs
    flatten_corpus = [t for d in filtered_tokenized_docs for t in d]
    vocab = [[el, flatten_corpus.count(el)] for el in set(flatten_corpus)]
    sorted_vocab = sorted(vocab, key=lambda w: w[1], reverse=True)
    return sorted_vocab[0:nbr_of_tops]
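

# A minimal sketch of a faster equivalent using collections.Counter, which
# counts all tokens in a single pass. This helper is not part of the original
# module; the name top_unigram_fast is illustrative.
def top_unigram_fast(tokenized_docs, nbr_of_tops=1, should_filter=False):
    from collections import Counter
    if should_filter:
        tokenized_docs = [bigram_prep(doc) for doc in tokenized_docs]
    counts = Counter(t for d in tokenized_docs for t in d)
    return [[token, count] for token, count in counts.most_common(nbr_of_tops)]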


def top_trigram(tokenized_docs, as_str=True, should_filter=False, nbr_of_tops=1):
    trigrams_finder = create_trigram_finder(tokenized_docs, should_filter)
    most_common_as_tuple_list = [(w1, w2, w3, count)
                                 for (w1, w2, w3), count in trigrams_finder.ngram_fd.most_common()]
    tops = most_common_as_tuple_list[0:nbr_of_tops]
    if as_str:
        results = [['%s_%s_%s' % (t[0], t[1], t[2]), t[3]] for t in tops]
    else:
        results = tops
    return results


def top_bigram(tokenized_docs, as_str=True, should_filter=False, nbr_of_tops=1):
    bigrams_finder = create_bigram_finder(tokenized_docs, should_filter)
    most_common_as_tuple_list = [(w1, w2, count)
                                 for (w1, w2), count in bigrams_finder.ngram_fd.most_common()]
    tops = most_common_as_tuple_list[0:nbr_of_tops]
    if as_str:
        results = [['%s_%s' % (t[0], t[1]), t[2]] for t in tops]
    else:
        results = tops
    return results
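

# Illustrative usage with made-up documents; with as_str=True, top_bigram
# returns ['token1_token2', count] pairs:
#
#     docs = [['machine', 'learning', 'is', 'fun'],
#             ['machine', 'learning', 'rocks']]
#     top_bigram(docs, nbr_of_tops=1)  # -> [['machine_learning', 2]]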


def create_bigram_finder(tokenized_docs, should_filter=False):
    if should_filter:
        bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    else:
        bigrams_data_samples = tokenized_docs
    return BigramCollocationFinder.from_documents(bigrams_data_samples)


def create_trigram_finder(tokenized_docs, should_filter=False):
    if should_filter:
        trigrams_data_samples = [trigram_prep(doc) for doc in tokenized_docs]
    else:
        trigrams_data_samples = tokenized_docs
    return TrigramCollocationFinder.from_documents(trigrams_data_samples)
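

# Both finders also support NLTK's apply_freq_filter, which removes n-grams
# seen fewer than min_freq times before scoring. A minimal sketch (the
# min_freq value of 3 is an arbitrary example):
#
#     finder = create_bigram_finder(tokenized_docs)
#     finder.apply_freq_filter(3)
#     best_10 = finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)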


def save_trigrams(tokenized_docs, should_write_to_file=False):
    trigrams_finder = create_trigram_finder(tokenized_docs)
    trigram_measures = TrigramAssocMeasures()
    trigrams_scores = trigrams_finder.score_ngrams(trigram_measures.likelihood_ratio)
    trigrams_counts = ['%s_%s_%s,%d\n' % (w1, w2, w3, count)
                       for (w1, w2, w3), count in trigrams_finder.ngram_fd.most_common()]
    # Likelihood-ratio scores are floats, so format them with %f, not %d.
    trigrams_scores_as_str = ['%s_%s_%s,%f\n' % (w1, w2, w3, score)
                              for (w1, w2, w3), score in trigrams_scores]
    if should_write_to_file:
        with open('./output/trigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_counts, fout)
        with open('./output/trigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_scores_as_str, fout)


def is_nouns_or_adjs(bigram):
    return is_noun_or_adj(bigram[0]) and is_noun_or_adj(bigram[1])


def key_from_bigram(bigram):
    return bigram[0] + '_' + bigram[1]


def save_bigrams(tokenized_docs, should_write_to_file=False):
    # Keep only nouns, adjectives and adpositions, then build two parallel
    # views of each document: the POS-tagged tokens and their lemmatized text.
    samples_pos = [allow_noun_adj_adp(doc) for doc in tokenized_docs]
    samples_rawlem = [lemmatizing(tokens_to_texts(doc)) for doc in samples_pos]
    bigrams_pos = [break_list_by_window(s) for s in samples_pos]
    bigrams_rawlem = [break_list_by_window(s) for s in samples_rawlem]
    bigrams_count = {}
    for i, doc in enumerate(bigrams_pos):
        for j, bigram in enumerate(doc):
            # Count the lemmatized form only when both members of the
            # POS-tagged bigram are nouns or adjectives.
            if is_nouns_or_adjs(bigram):
                lem_bigram = bigrams_rawlem[i][j]
                k = key_from_bigram(lem_bigram)
                bigrams_count[k] = bigrams_count.get(k, 0) + 1
    sorted_bigrams = sorted(bigrams_count.items(), key=lambda b: b[1], reverse=True)
    bigrams_str = ['%s,%d\n' % (b[0], b[1]) for b in sorted_bigrams]
    if should_write_to_file:
        with open('./output/bigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_str, fout)


def DEPRECATED_save_bigrams(tokenized_docs, should_write_to_file=False):
    bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    bigram_measures = BigramAssocMeasures()
    bigrams_finder = BigramCollocationFinder.from_documents(bigrams_data_samples)
    bigrams_scores = bigrams_finder.score_ngrams(bigram_measures.likelihood_ratio)
    bigrams_counts = ['%s_%s,%d\n' % (w1, w2, count)
                      for (w1, w2), count in bigrams_finder.ngram_fd.most_common()]
    # Likelihood-ratio scores are floats, so format them with %f, not %d.
    bigrams_scores_as_str = ['%s_%s,%f\n' % (w1, w2, score)
                             for (w1, w2), score in bigrams_scores]
    if should_write_to_file:
        # Note: this writes to the same path as save_bigrams above.
        with open('./output/bigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_counts, fout)
        with open('./output/bigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_scores_as_str, fout)
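

if __name__ == '__main__':
    # A minimal end-to-end demo with made-up sample documents; only the
    # NLTK-backed helpers are exercised here, since the save_* functions
    # additionally need an ./output/ directory to exist.
    docs = [['natural', 'language', 'processing', 'is', 'useful'],
            ['natural', 'language', 'tools', 'help']]
    print(top_unigram(docs, nbr_of_tops=3))
    print(top_bigram(docs, nbr_of_tops=2))
    print(top_trigram(docs, nbr_of_tops=2))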