From d067e03004b778c02cf14c8a739a8c2efff0ae10 Mon Sep 17 00:00:00 2001
From: Chris DuBois
Date: Tue, 18 Aug 2015 18:00:04 -0700
Subject: [PATCH] Add rough example for making topic model predictions using
 Python.

---
 README.md              |   3 +-
 predict_topic_model.py | 109 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 predict_topic_model.py

diff --git a/README.md b/README.md
index cfde82f..f625fb9 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ guide](https://github.com/graphlab-code/how-to/blob/master/CONTRIBUTING.md).
 Want to request a new How-To or have feedback on one listed below? Please open
 a Git
 [issue](https://github.com/graphlab-code/how-to/issues?q=is%3Aopen+is%3Aissue)
-or send us [feedback](http://dato.com/company/contact.html).
+or send us [feedback](http://dato.com/company/contact.html).
 
 Data Ingress
 -------------
@@ -38,6 +38,7 @@ Text Analytics
 ---------------
 * [Find the unique words used in an SArray of text documents](sarray_vocabulary.py)
 * [Compute word frequencies for each word in an SArray of text documents](word_frequency.py)
+* [Example of making predictions using topic model parameters](predict_topic_model.py)
 
 Image Analytics
 ---------------
diff --git a/predict_topic_model.py b/predict_topic_model.py
new file mode 100644
index 0000000..3771973
--- /dev/null
+++ b/predict_topic_model.py
@@ -0,0 +1,109 @@
+import numpy as np
+import graphlab as gl
+
+def predict(document_bow, word_topic_counts, topic_counts, vocab,
+            alpha=0.1, beta=0.01, num_burnin=5):
+    """
+    Make predictions for a single document.
+
+    Parameters
+    ----------
+    document_bow : dict
+        Dictionary with words as keys and word counts as values.
+
+    word_topic_counts : numpy array, num_vocab x num_topics
+        Number of times each word has ever been assigned to each topic.
+
+    topic_counts : numpy vector of length num_topics
+        Number of times any word has been assigned to each topic.
+
+    vocab : dict
+        Maps each word to a unique integer id.
+
+    alpha : float
+        Hyperparameter. See the topic_model docs.
+
+    beta : float
+        Hyperparameter. See the topic_model docs.
+
+    num_burnin : int
+        Number of iterations of Gibbs sampling to perform at predict time.
+
+    Returns
+    -------
+    out : numpy array of length num_topics
+        Probabilities that the document belongs to each topic.
+
+    """
+    num_vocab, num_topics = word_topic_counts.shape
+
+    # Number of words in this test document assigned to each topic
+    doc_topic_counts = np.zeros(num_topics)
+    # Topic assignment for each unique in-vocabulary word
+    doc_topic_assignments = []
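+
+    # What follows is a sketch of prediction-time ("fold-in") inference: after
+    # a random initialization, each word's topic is resampled for num_burnin
+    # sweeps from the usual collapsed Gibbs conditional,
+    #   p(z = k)  proportional to  (doc_topic_counts[k] + alpha)
+    #                              * (word_topic_counts[word_id, k] + beta)
+    #                              / (topic_counts[k] + num_vocab * beta),
+    # holding the global counts fixed. This is not guaranteed to match the
+    # sampler GraphLab Create uses internally, but it follows the same model.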
+    # Keep only words seen during training; out-of-vocabulary words are
+    # skipped. NB: document_bow is assumed not to change during prediction.
+    in_vocab_words = [(word, freq) for word, freq in document_bow.iteritems()
+                      if word in vocab]
+
+    # Initialize assignments and counts
+    for i, (word, freq) in enumerate(in_vocab_words):
+        topic = np.random.randint(0, num_topics)
+        doc_topic_assignments.append(topic)
+        doc_topic_counts[topic] += freq
+
+    # Sample topic assignments for the test document
+    for burnin in range(num_burnin):
+        for i, (word, freq) in enumerate(in_vocab_words):
+            word_id = vocab[word]
+
+            # Get old topic and decrement counts
+            topic = doc_topic_assignments[i]
+            doc_topic_counts[topic] -= freq
+
+            # Sample a new topic from the collapsed conditional
+            gamma = np.zeros(num_topics)  # unnormalized probabilities
+            for k in range(num_topics):
+                gamma[k] = (doc_topic_counts[k] + alpha) * \
+                    (word_topic_counts[word_id, k] + beta) / \
+                    (topic_counts[k] + num_vocab * beta)
+            gamma = gamma / gamma.sum()  # normalize to probabilities
+            topic = np.random.choice(num_topics, p=gamma)
+
+            # Use new topic to increment counts
+            doc_topic_assignments[i] = topic
+            doc_topic_counts[topic] += freq
+
+    # Create predictions
+    predictions = np.zeros(num_topics)
+    total_doc_topic_counts = doc_topic_counts.sum()
+    for k in range(num_topics):
+        predictions[k] = (doc_topic_counts[k] + alpha) / \
+            (total_doc_topic_counts + num_topics * alpha)
+    return predictions / predictions.sum()
+
+
+if __name__ == '__main__':
+    docs = gl.SFrame({'text': [{'first': 5, 'doc': 1}, {'second': 3, 'doc': 5}]})
+    m = gl.topic_model.create(docs)
+
+    # Get a test document in bag-of-words format
+    document_bow = docs['text'][0]
+
+    # Inputs: global parameters from the trained model.
+    #
+    # word_topic_counts: number of times each word in the vocabulary has ever
+    # been assigned to topic k (in any document). You can make an approximate
+    # version by multiplying the probabilities in m['topics'] by some large
+    # number (e.g. the number of tokens in the corpus) that indicates how
+    # strongly you "believe" in these topics, then flooring to integer counts.
+    prior_strength = 1000000
+    word_topic_counts = np.array(m['topics']['topic_probabilities'])
+    word_topic_counts = np.floor(prior_strength * word_topic_counts)
+
+    # Number of times any word has been assigned to each topic.
+    topic_counts = word_topic_counts.sum(0)
+
+    # Get vocabulary lookup
+    num_topics = m['num_topics']
+    vocab = {}
+    for i, w in enumerate(m['topics']['vocabulary']):
+        vocab[w] = i
+    num_vocab = len(vocab)
+
+    # Make prediction on the test document
+    probs = predict(document_bow, word_topic_counts, topic_counts, vocab)
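+
+    # Quick illustrative check (not part of the model API): the returned
+    # vector is normalized, so it should sum to 1, and argmax gives the
+    # single most likely topic for the test document.
+    print probs
+    print 'Most likely topic:', probs.argmax()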