"""
Messy file for generating data for widget example
"""
import json
import pickle
from functools import partial
from typing import List

from gensim.models import Word2Vec
from nltk.corpus import inaugural, reuters, twitter_samples
from nltk.tokenize import WordPunctTokenizer
from openTSNE import TSNE
from sklearn.decomposition import PCA

from w2widget.doc2vec import Doc2Vec, calculate_inverse_frequency

# Uncomment to download the corpus used by the commented-out Twitter section below:
# import nltk
# nltk.download('twitter_samples')
### Twitter samples
# twitter_samples._readme = 'README.txt'
# print(twitter_samples.readme())
# tweets_path = twitter_samples.abspaths()[-1]
#
# tweets = []
# with twitter_samples.open(tweets_path) as f:
#     for line in f:
#         tweets.append(json.loads(line.strip()))
#
# docs = [tweet["text"] for tweet in tweets if "retweeted_status" not in tweet]
### Reuters
# Read each Reuters news article into a list of raw document strings
docs = []
for path in reuters.fileids():
    with reuters.open(path) as f:
        docs.append(f.read())
### Inaugural speeches
# docs = []
# for doc in inaugural.abspaths():
#     with inaugural.open(doc) as f:
#         docs.append(f.read())
## Text preprocessing
tokenizer = WordPunctTokenizer()


def tokenize_with_ws(text: str, tokenizer) -> List[str]:
    """Tokenize `text`, inserting a whitespace token after each whitespace-separated word."""
    return [x for y in [tokenizer(x) + [" "] for x in text.split()] for x in y]


# Attach the helper to the tokenizer instance with its tokenize method pre-bound
tokenizer.tokenize_with_ws = partial(tokenize_with_ws, tokenizer=tokenizer.tokenize)

# Lower-cased alphanumeric tokens per document (whitespace tokens are dropped by isalnum)
document_tokens = [
    [token.lower() for token in tokenizer.tokenize_with_ws(doc) if token.isalnum()]
    for doc in docs
]

# Tokens with whitespace preserved, saved for the widget example
tokens_with_ws = [tokenizer.tokenize_with_ws(doc) for doc in docs]
print("\nSaving tokens with white spaces...\n")
with open("data/tokens_with_ws.pkl", "wb") as f:
pickle.dump(tokens_with_ws, f)
## Train word2vec model
wv_model = Word2Vec(
    document_tokens,
    vector_size=200,
    window=10,
    workers=4,
    seed=42,
    epochs=10,
    min_count=2,
).wv
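
# Optional sanity check: nearest neighbours for an illustrative query word.
# "oil" is just an assumed frequent Reuters term; any vocabulary word works.
# print(wv_model.most_similar("oil", topn=5))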
## Reduce dimensions
normed_vectors = wv_model.get_normed_vectors()

# PCA down to 50 dimensions first, a common preprocessing step before t-SNE
pca = PCA(n_components=50)
pca_embedding = pca.fit_transform(normed_vectors)

TSNE_embedding = TSNE(
    n_components=2, learning_rate="auto", random_state=420, verbose=1
).fit(pca_embedding)
wv_tsne_word_embedding = TSNE_embedding.transform(pca_embedding)
print("\nSaving wv_model...\n")
with open("data/wv_model.pkl", "wb") as f:
pickle.dump(wv_model, f)
print("\nSaving wv_tsne embeddings...\n")
with open("data/wv_tsne_embedding.pkl", "wb") as f:
pickle.dump(wv_tsne_word_embedding, f)
## doc2vec
word_weights = calculate_inverse_frequency(document_tokens)
dv_model = Doc2Vec(wv_model, word_weights)
dv_model.add_doc2vec(document_tokens)
dv_model.reduce_dimensions()
dv_tsne_embedding = dv_model.TSNE_embedding_array
print("\nSaving dv_model...\n")
with open("data/dv_model.pkl", "wb") as f:
pickle.dump(dv_model, f)
print("\nSaving dv_tsne embeddings...\n")
with open("data/dv_tsne_embedding.pkl", "wb") as f:
pickle.dump(dv_tsne_embedding, f)
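
# A minimal sketch (not part of the original pipeline) of how the pickled
# artifacts written above could be loaded back for the widget example.
# File names match the ones used in this script; wiring the objects into the
# w2widget widgets is omitted, since that depends on the w2widget API.
#
# with open("data/wv_model.pkl", "rb") as f:
#     wv_model = pickle.load(f)
# with open("data/wv_tsne_embedding.pkl", "rb") as f:
#     wv_tsne_word_embedding = pickle.load(f)
# with open("data/tokens_with_ws.pkl", "rb") as f:
#     tokens_with_ws = pickle.load(f)
# with open("data/dv_model.pkl", "rb") as f:
#     dv_model = pickle.load(f)
# with open("data/dv_tsne_embedding.pkl", "rb") as f:
#     dv_tsne_embedding = pickle.load(f)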