-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigrams.py
77 lines (61 loc) · 2.46 KB
/
bigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import nltk
from nltk.collocations import *
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# Fetch NLTK resources by name. These calls sit at module top level, so they
# run every time this module is imported.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably what `word_tokenize` needs, and
# 'wordnet' / 'omw-1.4' what `WordNetLemmatizer` needs -- confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# English stop words as a set. Not referenced again in the visible part of
# this file.
stopset = set(stopwords.words('english'))
# English stop words as a mutable list. `_stopwords` adds to it in place and
# `_collocations` tests membership against it.
stops = [word for word in stopwords.words('english')]
# Association-measure objects. `_collocations` scores bigrams with
# `bigram_measures.pmi`; `trigram_measures` is not used in the visible part of
# this file.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Single lemmatizer instance shared by every `_collocations` call.
wordnet_lemmatizer = WordNetLemmatizer()
def _stopwords(extend_stopwords=None) -> None:
    """Add user-supplied stop words to the module-level ``stops`` list.

    Args:
        extend_stopwords: a single word (``str``) or a collection of words
            (``list``, ``tuple``, ``set`` or ``frozenset``) to add. ``None``
            and values of any other type leave ``stops`` untouched.

    Returns:
        None. ``stops`` is mutated in place.
    """
    if isinstance(extend_stopwords, str):
        # Test for ``str`` first: a string is itself iterable, and iterating
        # it would add its individual characters instead of the word.
        new_words = [extend_stopwords]
    elif isinstance(extend_stopwords, (list, tuple, set, frozenset)):
        new_words = extend_stopwords
    else:
        return

    for word in new_words:
        # Skip words that are already present so that repeated calls with the
        # same extra words do not keep growing the shared list.
        if word not in stops:
            stops.append(word)
def _collocations(text:str) -> str:
    """Return the best-scoring bigram found in ``text``, or ``text`` itself.

    The text is tokenized, tokens whose lower-cased form is a stop word are
    dropped, and the remaining tokens are lemmatized. Bigrams are then built
    from that reduced token sequence and ranked with ``bigram_measures.pmi``;
    only the top one is kept. Because stop words are removed before pairing,
    the two words of the result need not have been adjacent in ``text``.

    Args:
        text: string that needs to be turned into a bigram.

    Returns:
        str: the top bigram joined as ``"first_second"``, or the unchanged
        input text when no bigram survives filtering.
    """
    # Build a lower-cased set once per call. Both the token pass and the
    # bigram word filter use it, so stop-word matching is case-insensitive in
    # both places and each lookup is O(1) instead of a list scan. Non-string
    # entries are skipped: they could never equal a token anyway.
    stop_set = {word.lower() for word in stops if isinstance(word, str)}

    def is_filtered(word):
        """Reject tokens shorter than three characters and stop words."""
        return len(word) < 3 or word.lower() in stop_set

    lemmas = [
        wordnet_lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.lower() not in stop_set
    ]

    finder = BigramCollocationFinder.from_words(lemmas)
    # Lemmatization can produce short tokens or stop words, so filter again
    # at the bigram level.
    finder.apply_word_filter(is_filtered)
    best = finder.nbest(bigram_measures.pmi, 1)
    if not best:
        # Nothing usable was found; fall back to the original text.
        return text

    first, second = best[0]
    return f"{first}_{second}"
def _create_bigrams(col: pd.Series) -> list:
    """Build one bigram (or fallback text) per string entry of ``col``.

    Args:
        col: column of text such as ``df['input_text']``.

    Returns:
        list: the ``_collocations`` result for every ``str`` value, in column
        order. Non-string values are skipped, so the list can be shorter
        than ``col`` and is not aligned with its index. If processing raises,
        the error is printed and ``None`` is returned instead of a list.

    Raises:
        TypeError: if ``col`` is not a ``pandas.Series``.
    """
    # ``isinstance`` also accepts Series subclasses, which an exact type
    # comparison would reject.
    if not isinstance(col, pd.Series):
        raise TypeError("The parameter passed must be a Series object.Pass the column you wish to create n-grams out of.")
    try:
        return [_collocations(value) for value in col if isinstance(value, str)]
    except Exception as e:
        # Best-effort: report the problem and signal failure with ``None``
        # rather than propagating, so callers keep the behaviour they rely on.
        print(e)
        return None
def runner(col: pd.Series, stopwords=None):
    """Register any extra stop words, then build bigrams for ``col``.

    Args:
        col: pandas Series of text to turn into bigrams.
        stopwords: optional extra stop word(s), forwarded unchanged to
            ``_stopwords`` before any bigram is built.

    Returns:
        Whatever ``_create_bigrams(col)`` returns.
    """
    # NOTE: inside this function the ``stopwords`` parameter shadows the
    # ``nltk.corpus.stopwords`` import of the same name.
    _stopwords(extend_stopwords=stopwords)
    bigrams = _create_bigrams(col)
    return bigrams