preprocessor_clean.py
if __name__ == "__main__":  # sort of like with MPI, we need this guard to do multiprocessing on Windows
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.decomposition import NMF, PCA, TruncatedSVD, LatentDirichletAllocation
    import time
    import re  # regex

    start_time = time.time()
    raw_text = pickle.load(open('raw_text_data.pkl', 'rb'))
    # Make sure NaNs turn into strings
    # (We probably don't want this in the long run)
    raw_text = [str(x) for x in raw_text]
    print("Number of Samples:", len(raw_text))

    clean_text = [" ".join([                        # join the surviving words back together with spaces in between
                      re.sub(r'\W+', '',            # strip non-word characters (after the @/#/http checks below)
                             word.replace('"', '').lower())  # force lower case, remove double quotes
                      for word in tweet.split()     # go word by word and keep a word only if...
                      if len(word) > 2 and          # it is 3 characters or longer
                      not word.startswith('@') and  # and it doesn't start with @, #, or http
                      not word.startswith('#') and
                      not word.startswith('http')]
                  ).encode('ascii', errors='ignore')  # force ASCII encoding, ignore stray characters just in case
                  for tweet in raw_text]
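    # Illustrative sketch, not part of the original script: tracing one hypothetical tweet
    # through the cleaning rules above. A raw string like
    #   '@user Check out https://example.com #Topic "Great" day!!'
    # drops '@user', the URL, and '#Topic', lowercases and strips punctuation from the rest,
    # and comes out as the bytes object b'check out great day'.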
    stop_words = []  # the stop words file includes English, Spanish, and Catalan
    with open('stop_words.txt', 'r') as f:
        stop_words = [word.replace("\n", '') for word in f.readlines()]  # remove the \n's because the stop words weren't copied cleanly
    print("Stop word examples:", stop_words[:10])

    print("\n----20 TWEETS----")
    # Let's make sure this looks right...
    for tweet in clean_text[:20]:  # first 20 tweets!
        print(tweet)  # the b prefix means these are ASCII-encoded bytes
    print("--------------")

    tf_idf = TfidfVectorizer(min_df=10, stop_words=stop_words)
    # min_df=10 means ignore words that appear in fewer than 10 tweets
    # we pass our stop words list here too
    text_tf_idf = tf_idf.fit_transform(clean_text)  # as discussed,
    # fit_transform is shorthand for doing a .fit() then a .transform()
    # because 2 lines of code is already too much I guess...
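    # Equivalent two-step version of the line above (illustrative only):
    #   tf_idf.fit(clean_text)                       # learn the vocabulary and IDF weights
    #   text_tf_idf = tf_idf.transform(clean_text)   # then map tweets to the TF-IDF matrix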
print("Dumping feature names to disk...")
pickle.dump(tf_idf.get_feature_names(), open('TF_IDF_feature_names.pkl', 'wb'))
print("TF-IDF Sample:")
print(text_tf_idf[0],"\n") # Important note! TF-IDF spits out a sparse matrix by
# default so don't be shocked when the print statement gives you
# (0, INDEX) VALUE
# Its just a more concise way of writing it - we leave out all the 0 entries
# and only spit out the values and their indexes. Its 1D so the first index
# will always be 0
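    # Illustrative only: if you ever want the dense version of a row, scipy sparse
    # matrices expose .toarray(), e.g.
    #   dense_row = text_tf_idf[0].toarray()  # shape (1, n_vocabulary_terms)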
    # These two are basically interchangeable: (Go take a nap while they run...)
    # topic_model = NMF(n_components=1000, verbose=1, tol=.001, alpha=.1, l1_ratio=.2)
    topic_model = LatentDirichletAllocation(
        n_components=1000,  # n_topics was renamed n_components in newer scikit-learn
        n_jobs=-1, doc_topic_prior=50/1000.0, topic_word_prior=.1, verbose=100,
        learning_method='online',  # newer scikit-learn defaults to 'batch', so request online mode explicitly
        batch_size=int(len(raw_text)/10), max_doc_update_iter=1000,
        mean_change_tol=.0001, learning_offset=30)
    # for LDA:
    # n_jobs=-1 means run on every logical core
    # doc_topic_prior and topic_word_prior are the alpha and beta terms respectively
    # (the 50/n_topics heuristic gives doc_topic_prior = 50/1000 = 0.05 here)
    # we run LDA in online mode; just make sure batch_size is small enough to fit in RAM
    # learning_offset is how much to reduce the importance of early batches - higher values mean
    # early batches have less weight (early batches tend to dominate in online training of LDA)
    # for NMF:
    # alpha is how much to regularize
    # l1_ratio is how much of alpha to allocate to l1 vs l2 regularization (the microblogs paper did both, so we do too)
    # tol is how small the violation must be for NMF to stop iterating.
    # I set it higher than the default so it doesn't take as long to run. Smaller is usually better, though
    # Sure, let's compress to 1000 topics, why not
    # verbose prints out how close to convergence we are after each iteration
    # when the violation is less than .0001 by default, NMF is finished (set to .001 here)
    text_topic_model_W = topic_model.fit_transform(text_tf_idf)  # as with NMF, .fit_transform()
    # returns the document-topic matrix W; we can get the topic-word matrix H as follows:
    text_topic_model_H = topic_model.components_
    print("Topic Model Components:")
    print(text_topic_model_W[0])  # topic memberships of tweet 0
    print(len(text_topic_model_H[0]))
    print(text_topic_model_H[0])  # unnormalized word weights (roughly, relative word frequencies) within topic 0
    # components_ has shape (n_topics, n_words), so row 0 really is topic 0 - no transpose needed
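    # Illustrative sketch, not part of the original script: map topic 0's weights back to
    # words through the vectorizer's vocabulary to eyeball its top 10 terms.
    #   import numpy as np
    #   feature_names = tf_idf.get_feature_names()  # get_feature_names_out() on newer scikit-learn
    #   top_idx = np.argsort(text_topic_model_H[0])[::-1][:10]
    #   print([feature_names[i] for i in top_idx])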
    text_topic_model_WH = (text_topic_model_W, text_topic_model_H)
    pickle.dump(text_topic_model_WH, open('LDA_topics_WH.pkl', 'wb'), protocol=4)  # save both to disk so we
    pickle.dump(topic_model, open('LDA.pkl', 'wb'), protocol=4)                    # don't have to keep recalculating them later
    print("\n--- Completed in %s seconds! ---" % (time.time() - start_time))