Add rough example for making topic model predictions using Python.
chrisdubois committed Aug 19, 2015
1 parent 06225d2 commit d067e03
Showing 2 changed files with 111 additions and 1 deletion.
README.md (3 changes: 2 additions & 1 deletion)
@@ -6,7 +6,7 @@ guide](https://github.com/graphlab-code/how-to/blob/master/CONTRIBUTING.md).
Want to request a new How-To or have feedback on one listed below? Please open
a Git
[issue](https://github.com/graphlab-code/how-to/issues?q=is%3Aopen+is%3Aissue)
or send us [feedback](http://dato.com/company/contact.html).

Data Ingress
-------------
@@ -38,6 +38,7 @@ Text Analytics
---------------
* [Find the unique words used in an SArray of text documents](sarray_vocabulary.py)
* [Compute word frequencies for each word in an SArray of text documents](word_frequency.py)
* [Example of making predictions using topic model parameters](predict_topic_model.py)

Image Analytics
---------------
predict_topic_model.py (109 changes: 109 additions & 0 deletions)
@@ -0,0 +1,109 @@
import numpy as np
import graphlab as gl

def predict(document_bow, word_topic_counts, topic_counts, vocab,
            alpha=0.1, beta=0.01, num_burnin=5):
    """
    Make predictions for a single document.

    Parameters
    ----------
    document_bow : dict
        Dictionary with words as keys and word frequencies as values.
    word_topic_counts : numpy array, num_vocab x num_topics
        Number of times a given word has ever been assigned to a topic.
    topic_counts : numpy vector of length num_topics
        Number of times any word has been assigned to a topic.
    vocab : dict
        Words are keys and a unique integer id is the value.
    alpha : float
        Hyperparameter. See topic_model docs.
    beta : float
        Hyperparameter. See topic_model docs.
    num_burnin : int
        Number of iterations of Gibbs sampling to perform at predict time.

    Returns
    -------
    out : numpy array of length num_topics
        Probabilities that the document belongs to each topic.
    """
    num_vocab, num_topics = word_topic_counts.shape

    # Proportion of each topic in this test doc
    doc_topic_counts = np.zeros(num_topics)
    # Topic assignment of each unique in-vocabulary word
    doc_topic_assignments = []
    # Keep only words seen during training so that assignment indices line up
    # with the sampling loop below.
    # NB: we are assuming document_bow doesn't change during prediction.
    doc_words = [(word, freq) for word, freq in document_bow.items()
                 if word in vocab]

    # Initialize assignments and counts
    for word, freq in doc_words:
        topic = np.random.randint(num_topics)
        doc_topic_assignments.append(topic)
        doc_topic_counts[topic] += freq

    # Sample topic assignments for the test document
    for _ in range(num_burnin):
        for i, (word, freq) in enumerate(doc_words):
            word_id = vocab[word]

            # Get old topic and decrement counts
            topic = doc_topic_assignments[i]
            doc_topic_counts[topic] -= freq

            # Sample a new topic: the probability of topic k is proportional to
            # (doc_topic_counts[k] + alpha)
            #     * (word_topic_counts[word_id, k] + beta)
            #     / (topic_counts[k] + num_vocab * beta)
            gamma = np.zeros(num_topics)  # store unnormalized probabilities
            for k in range(num_topics):
                gamma[k] = (doc_topic_counts[k] + alpha) \
                    * (word_topic_counts[word_id, k] + beta) \
                    / (topic_counts[k] + num_vocab * beta)
            gamma = gamma / gamma.sum()  # normalize to probabilities
            topic = np.random.choice(num_topics, p=gamma)

            # Use new topic to increment counts
            doc_topic_assignments[i] = topic
            doc_topic_counts[topic] += freq

    # Create predictions
    predictions = np.zeros(num_topics)
    total_doc_topic_counts = doc_topic_counts.sum()
    for k in range(num_topics):
        predictions[k] = (doc_topic_counts[k] + alpha) \
            / (total_doc_topic_counts + num_topics * alpha)
    return predictions / predictions.sum()


if __name__ == '__main__':
    docs = gl.SFrame({'text': [{'first': 5, 'doc': 1}, {'second': 3, 'doc': 5}]})
    m = gl.topic_model.create(docs)

    # Get test document in bag-of-words format
    document_bow = docs['text'][0]

    # Input: global parameters from the trained model

    # Number of times each word in the vocabulary has ever been assigned to
    # topic k (in any document). You can make an approximate version of this
    # by multiplying m['topics'] by some large number (e.g. the number of
    # tokens in the corpus) that indicates how strongly you "believe" in these
    # topics, then flooring the result to integer counts.
    prior_strength = 1000000
    word_topic_counts = np.array(m['topics']['topic_probabilities'])
    word_topic_counts = np.floor(prior_strength * word_topic_counts)
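
    # Illustrative sanity check (an addition to the original example): predict()
    # assumes the approximate count matrix has one row per vocabulary word and
    # one column per topic, which this assert makes explicit.
    assert word_topic_counts.shape == (len(m['topics']), m['num_topics'])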

    # Number of times any word has been assigned to each topic.
    topic_counts = word_topic_counts.sum(0)

    # Get vocabulary lookup
    num_topics = m['num_topics']
    vocab = {}
    for i, w in enumerate(m['topics']['vocabulary']):
        vocab[w] = i
    num_vocab = len(vocab)

    # Make prediction on test document
    probs = predict(document_bow, word_topic_counts, topic_counts, vocab)
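
    # Minimal usage sketch (added for illustration, using only names defined
    # above): print the predicted topic probabilities for the test document,
    # then apply the same routine to every document in the toy corpus.
    print(probs)
    all_probs = [predict(doc, word_topic_counts, topic_counts, vocab)
                 for doc in docs['text']]
    print(all_probs)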
