Add rough example for making topic model predictions using Python.
chrisdubois committed Aug 19, 2015
1 parent 06225d2 commit d067e03
Showing 2 changed files with 111 additions and 1 deletion.
README.md (3 changes: 2 additions & 1 deletion)
@@ -6,7 +6,7 @@ guide](https://github.com/graphlab-code/how-to/blob/master/CONTRIBUTING.md).
Want to request a new How-To or have feedback on one listed below? Please open
a Git
[issue](https://github.com/graphlab-code/how-to/issues?q=is%3Aopen+is%3Aissue)
or send us [feedback](http://dato.com/company/contact.html).

Data Ingress
-------------
@@ -38,6 +38,7 @@ Text Analytics
---------------
* [Find the unique words used in an SArray of text documents](sarray_vocabulary.py)
* [Compute word frequencies for each word in an SArray of text documents](word_frequency.py)
* [Example of making predictions using topic model parameters](predict_topic_model.py)

Image Analytics
---------------
predict_topic_model.py (109 changes: 109 additions & 0 deletions)
@@ -0,0 +1,109 @@
import numpy as np
import graphlab as gl

def predict(document_bow, word_topic_counts, topic_counts, vocab,
            alpha=0.1, beta=0.01, num_burnin=5):
    """
    Make predictions for a single document.

    Parameters
    ----------
    document_bow : dict
        Dictionary with words as keys and word frequencies as values.
    word_topic_counts : numpy array, num_vocab x num_topics
        Number of times a given word has ever been assigned to a topic.
    topic_counts : numpy vector of length num_topics
        Number of times any word has been assigned to a topic.
    vocab : dict
        Words are keys and a unique integer id is the value.
    alpha : float
        Hyperparameter. See topic_model docs.
    beta : float
        Hyperparameter. See topic_model docs.
    num_burnin : int
        Number of iterations of Gibbs sampling to perform at predict time.

    Returns
    -------
    out : numpy array of length num_topics
        Probabilities that the document belongs to each topic.
    """
    num_vocab, num_topics = word_topic_counts.shape

    # Proportion of each topic in this test doc
    doc_topic_counts = np.zeros(num_topics)
    # Topic assignment of each unique in-vocabulary word
    doc_topic_assignments = []
    # Keep only words seen during training so that assignment indices line up
    # with the sampling loop below.
    # NB: we are assuming document_bow doesn't change during prediction.
    doc_words = [(word, freq) for word, freq in document_bow.items()
                 if word in vocab]

    # Initialize assignments and counts
    for word, freq in doc_words:
        topic = np.random.randint(num_topics)
        doc_topic_assignments.append(topic)
        doc_topic_counts[topic] += freq

    # Sample topic assignments for the test document
    for _ in range(num_burnin):
        for i, (word, freq) in enumerate(doc_words):
            word_id = vocab[word]

            # Get old topic and decrement counts
            topic = doc_topic_assignments[i]
            doc_topic_counts[topic] -= freq

            # Sample a new topic: the probability of topic k is proportional to
            # (doc_topic_counts[k] + alpha)
            #     * (word_topic_counts[word_id, k] + beta)
            #     / (topic_counts[k] + num_vocab * beta)
            gamma = np.zeros(num_topics)  # store unnormalized probabilities
            for k in range(num_topics):
                gamma[k] = (doc_topic_counts[k] + alpha) \
                    * (word_topic_counts[word_id, k] + beta) \
                    / (topic_counts[k] + num_vocab * beta)
            gamma = gamma / gamma.sum()  # normalize to probabilities
            topic = np.random.choice(num_topics, p=gamma)

            # Use new topic to increment counts
            doc_topic_assignments[i] = topic
            doc_topic_counts[topic] += freq

    # Create predictions
    predictions = np.zeros(num_topics)
    total_doc_topic_counts = doc_topic_counts.sum()
    for k in range(num_topics):
        predictions[k] = (doc_topic_counts[k] + alpha) \
            / (total_doc_topic_counts + num_topics * alpha)
    return predictions / predictions.sum()


if __name__ == '__main__':
    docs = gl.SFrame({'text': [{'first': 5, 'doc': 1}, {'second': 3, 'doc': 5}]})
    m = gl.topic_model.create(docs)

    # Get test document in bag-of-words format
    document_bow = docs['text'][0]

    # Input: global parameters from the trained model

    # Number of times each word in the vocabulary has ever been assigned to
    # topic k (in any document). You can make an approximate version of this
    # by multiplying m['topics'] by some large number (e.g. the number of
    # tokens in the corpus) that indicates how strongly you "believe" in these
    # topics, then flooring the result to integer counts.
    prior_strength = 1000000
    word_topic_counts = np.array(m['topics']['topic_probabilities'])
    word_topic_counts = np.floor(prior_strength * word_topic_counts)
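
    # Illustrative sanity check (an addition to the original example): predict()
    # assumes the approximate count matrix has one row per vocabulary word and
    # one column per topic, which this assert makes explicit.
    assert word_topic_counts.shape == (len(m['topics']), m['num_topics'])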

    # Number of times any word has been assigned to each topic.
    topic_counts = word_topic_counts.sum(0)

    # Get vocabulary lookup
    num_topics = m['num_topics']
    vocab = {}
    for i, w in enumerate(m['topics']['vocabulary']):
        vocab[w] = i
    num_vocab = len(vocab)

    # Make prediction on test document
    probs = predict(document_bow, word_topic_counts, topic_counts, vocab)
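
    # Minimal usage sketch (added for illustration, using only names defined
    # above): print the predicted topic probabilities for the test document,
    # then apply the same routine to every document in the toy corpus.
    print(probs)
    all_probs = [predict(doc, word_topic_counts, topic_counts, vocab)
                 for doc in docs['text']]
    print(all_probs)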
