This repository has been archived by the owner on Jul 16, 2021. It is now read-only.

Latent Dirichlet Allocation #172

Open · wants to merge 17 commits into base: master
Changes from 6 commits
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -17,4 +17,4 @@ datasets = []
[dependencies]
num = { version = "0.1.35", default-features = false }
rand = "0.3.14"
rulinalg = "0.3.7"
rulinalg = "0.4.0"
4 changes: 2 additions & 2 deletions examples/k-means_generating_cluster.rs
@@ -24,10 +24,10 @@ fn generate_data(centroids: &Matrix<f64>,

for _ in 0..points_per_centroid {
// Generate points from each centroid
-for centroid in centroids.iter_rows() {
+for centroid in centroids.row_iter() {
// Generate a point randomly around the centroid
let mut point = Vec::with_capacity(centroids.cols());
-for feature in centroid {
+for feature in centroid.iter() {
point.push(feature + normal_rv.ind_sample(&mut rng));
}

191 changes: 191 additions & 0 deletions examples/lda_gen.rs
@@ -0,0 +1,191 @@
/// An example of how Latent Dirichlet Allocation (LDA) can be used. This example begins by
/// generating a distribution of words to categories. This distribution is creatred so that
[Owner] Typo here: distribution is CREATED.

/// there are `topic_count` topics, and each word in the vocabulary is assigned to exactly two
/// topics with equal probability. (The distribution of words is printed to the screen as a
/// chart. Each entry in the chart corresponds to a word in the vocabulary, arranged into a
/// square for easy viewing.) Documents are then generated from these distributions: each
/// document draws its own topic mixture from a symmetric Dirichlet, and each word is drawn
/// from a topic sampled from that mixture.
///
/// Once the documents are created, the example uses LDA to attempt to reverse engineer the
/// distribution of words, and prints the results to the screen for comparison.

extern crate rusty_machine;
extern crate rand;
extern crate rulinalg;

use rusty_machine::linalg::{Matrix, BaseMatrix, Vector};
use rusty_machine::learning::UnSupModel;
use rusty_machine::learning::lda::LDA;

use rand::{thread_rng, Rng};
use rand::distributions::{gamma, IndependentSample};

use std::cmp::max;

/// Given `topic_count` topics, this function will create a distribution of words for each
/// topic. For simplicity, this function assumes that the total number of words in the corpus
/// will be `(topic_count / 2)^2`.
fn generate_word_distribution(topic_count: usize) -> Matrix<f64> {
[Owner] Minor point: Potential issue here with assuming topic_count is divisible by 2. From what I can tell this shouldn't break anything but I wanted to draw your attention to it regardless.

let width = topic_count / 2;
let vocab_size = width * width;
let initial_value = 1.0 / width as f64;
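// Note: rulinalg's `from_fn` passes the closure arguments in (col, row) order, as the
// parameter names below indicate; `row` indexes topics here and `col` indexes words.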
Matrix::from_fn(topic_count, vocab_size, |col, row| {
if row < width {
// Horizontal topics
if col / width == row {
initial_value
} else {
0.0
}
} else {
// Vertical topics
if col % width == (row - width) {
initial_value
} else {
0.0
}
}
})
}
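
To make the reviewer's divisibility assumption explicit and spot-check the output, here is an illustrative sketch (not part of the PR; it relies on the `(col, row)` closure order noted above):

#[test]
fn word_distribution_spot_check() {
    let topic_count = 10; // width 5, 25-word vocabulary
    assert!(topic_count % 2 == 0, "generate_word_distribution assumes an even topic count");

    let dist = generate_word_distribution(topic_count);
    // Topic 0 is the first "horizontal" topic: probability 0.2 on each of the
    // five words in the first row of the 5x5 vocabulary grid, 0.0 elsewhere.
    assert_eq!(dist[[0, 0]], 0.2);
    assert_eq!(dist[[0, 4]], 0.2);
    assert_eq!(dist[[0, 5]], 0.0);
}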

/// Draws a single sample from a symmetric `count`-dimensional Dirichlet distribution with
/// concentration `alpha`, by normalizing `count` i.i.d. Gamma(`alpha`, 1) draws.
fn get_dirichlet(count: usize, alpha: f64) -> Vector<f64> {
let mut rng = thread_rng();
let g_dist = gamma::Gamma::new(alpha, 1.0);
let result = Vector::from_fn(count, |_| {
g_dist.ind_sample(&mut rng)
});
let sum = result.sum();
result / sum
}
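
A quick property check of the sampler (a sketch, not part of the diff; the tolerance is arbitrary): a draw should be a valid probability vector.

#[test]
fn dirichlet_sample_is_probability_vector() {
    let theta = get_dirichlet(10, 0.1);
    assert!((theta.sum() - 1.0).abs() < 1e-10);
    assert!(theta.iter().all(|&p| p >= 0.0));
}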

/// Generates a document from the given word distribution. The document's topic mixture is
/// sampled from a Dirichlet distribution; for each word, a topic is drawn from that mixture
/// and the word is then drawn from the selected topic.
fn generate_document(word_distribution: &Matrix<f64>, topic_count: usize, vocab_size: usize, document_length: usize, alpha: f64) -> Vec<usize> {
let mut document = vec![0; vocab_size];
let topic_distribution = get_dirichlet(topic_count, alpha);
for _ in 0..document_length {
let topic = choose_from(&topic_distribution);
let word = choose_from(&word_distribution.row(topic).into());
document[word] += 1;
}
document
}

/// Generate a collection of documents based on the word distribution
fn generate_documents(word_distribution: &Matrix<f64>, topic_count: usize, vocab_size: usize, document_count: usize, document_length: usize, alpha: f64) -> Matrix<usize> {
let mut documents = Vec::with_capacity(vocab_size * document_count);
for _ in 0..document_count {
documents.append(&mut generate_document(word_distribution, topic_count, vocab_size, document_length, alpha));
}
Matrix::new(document_count, vocab_size, documents)
}
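
The result is a bag-of-words count matrix, one row per document. A sketch of the expected shape (illustrative numbers, not part of the PR):

#[test]
fn documents_have_expected_shape() {
    let word_distribution = generate_word_distribution(10);
    let docs = generate_documents(&word_distribution, 10, 25, 500, 100, 0.1);
    // 500 documents over a 25-word vocabulary, 100 word tokens per document.
    assert_eq!((docs.rows(), docs.cols()), (500, 25));
    assert_eq!(docs.row(0).raw_slice().iter().sum::<usize>(), 100);
}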

/// Chooses an index from a vector of probabilities by inverse-CDF sampling.
fn choose_from(probability: &Vector<f64>) -> usize {
let mut rng = thread_rng();
let selection: f64 = rng.next_f64();
let mut total: f64 = 0.0;
for (index, p) in probability.iter().enumerate() {
total += *p;
if total >= selection {
return index;
}
}
return probability.size() - 1;
}
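
`choose_from` walks the cumulative distribution until it passes a uniform draw; a degenerate distribution makes the behaviour easy to check (a sketch, not part of the diff):

#[test]
fn choose_from_degenerate_distribution() {
    // With all mass on the first entry the running total reaches 1.0 at index 0,
    // and next_f64() is always < 1.0, so index 0 is returned deterministically.
    let v = Vector::new(vec![1.0, 0.0, 0.0]);
    assert_eq!(choose_from(&v), 0);
}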

/// Displays the distribution of words in a topic as a square chart.
fn topic_to_string(topic: &Vector<f64>, width: usize, topic_index: usize) -> String {
let max = topic.iter().fold(0.0, |a, b|{
if a > *b {
a
} else {
*b
}
});
let mut result = String::with_capacity(topic.size() * (topic.size()/width) + 18);
result.push_str(&format!("Topic {}\n", topic_index));
result.push_str("-------\n");
for (index, element) in topic.iter().enumerate() {
let col = index % width;
let out = element / max * 9.0;
if out >= 1.0 {
result.push_str(&(out as u32).to_string());
} else {
result.push('.');
}
if col == width - 1 {
result.push('\n');
}
}
result
}
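
For width 5, the first horizontal topic renders like this (derived from the code above: every non-zero entry equals the shared maximum, so it prints as 9):

Topic 1
-------
99999
.....
.....
.....
.....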


/// Prints a collection of multiline strings in columns
fn print_multi_line(o: &Vec<String>, column_width: usize) {
let o_split:Vec<_> = o.iter().map(|col| {col.split('\n').collect::<Vec<_>>()}).collect();
let mut still_printing = true;
let mut line_index = 0;
while still_printing {
let mut gap = 0;
still_printing = false;
for col in o_split.iter() {
if col.len() > line_index {
if gap > 0 {
print!("{:width$}", "", width=column_width * gap);
gap = 0;
}
let line = col[line_index];
print!("{:width$}", line, width=column_width);
still_printing = true;
} else {
gap += 1;
}
}
print!("\n");
line_index += 1;

}
}


/// Prints the word distribution within topics
fn print_topic_distribution(dist: &Matrix<f64>, topic_count: usize, width: usize) {
let top_strings = &dist.row_iter().take(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1)).collect();
let bottom_strings = &dist.row_iter().skip(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1 + topic_count / 2)).collect();

print_multi_line(top_strings, max(12, width + 1));
print_multi_line(bottom_strings, max(12, width + 1));
}

pub fn main() {
// Set initial constants
// These can be changed as you wish
let topic_count = 28;
let document_length = 100;
let document_count = 500;
let alpha = 0.1;

// Don't change these though; they are calculated based on the above
let width = topic_count / 2;
let vocab_count = width * width;
println!("Creating word distribution");
let word_distribution = generate_word_distribution(topic_count);
println!("Distrbution generated:");
print_topic_distribution(&word_distribution, topic_count, width);
println!("Generating documents");
let input = generate_documents(&word_distribution, topic_count, vocab_count, document_count, document_length, alpha);
let lda = LDA::new(topic_count, alpha, 0.1);
println!("Predicting word distrbution from generated documents");
let result = lda.predict(&(input, 30)).unwrap();
let dist = result.phi();
println!("Prediction completed. Predicted word distribution:");
println!("(Should be similar generated distribution above)", );

print_topic_distribution(&dist, topic_count, width);


}
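
A note on `lda.predict(&(input, 30))`: the second element of the tuple appears to be the number of sampling iterations, though that is inferred from the call site rather than confirmed here. Once merged, the example would be run with `cargo run --example lda_gen`.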
12 changes: 6 additions & 6 deletions examples/naive_bayes_dogs.rs
@@ -135,16 +135,16 @@ fn main() {
// Score how well we did.
let mut hits = 0;
let unprinted_total = test_set_size.saturating_sub(10) as usize;
-for (dog, prediction) in test_dogs.iter().zip(predictions.iter_rows()).take(unprinted_total) {
-evaluate_prediction(&mut hits, dog, prediction);
+for (dog, prediction) in test_dogs.iter().zip(predictions.row_iter()).take(unprinted_total) {
+evaluate_prediction(&mut hits, dog, prediction.raw_slice());
}

if unprinted_total > 0 {
println!("...");
}
-for (dog, prediction) in test_dogs.iter().zip(predictions.iter_rows()).skip(unprinted_total) {
-let (actual_color, accurate) = evaluate_prediction(&mut hits, dog, prediction);
+for (dog, prediction) in test_dogs.iter().zip(predictions.row_iter()).skip(unprinted_total) {
+let (actual_color, accurate) = evaluate_prediction(&mut hits, dog, prediction.raw_slice());
println!("Predicted: {:?}; Actual: {:?}; Accurate? {:?}",
dog.color, actual_color, accurate);
}
50 changes: 47 additions & 3 deletions src/analysis/score.rs
@@ -44,9 +44,24 @@ pub fn accuracy<I>(outputs: I, targets: I) -> f64
correct as f64 / len
}


/// Returns the fraction of output rows which match their targets.
-pub fn row_accuracy(outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f64 {
-accuracy(outputs.iter_rows(), targets.iter_rows())
+pub fn row_accuracy<T: PartialEq>(outputs: &Matrix<T>, targets: &Matrix<T>) -> f64 {

assert!(outputs.rows() == targets.rows());
[Owner] I'm a little confused by this change. I'm guessing it comes from the rulinalg-0.4 PR?

[Author] Indeed, I am pretty certain I took this one directly from the linalg bump PR.

[Owner] I've now merged a new linalg bump PR into master. I guess you'll want to rebase from that (or just C+P the relevant modules onto a new branch :) )


let len = outputs.rows() as f64;

let correct = outputs.row_iter()
.zip(targets.row_iter())
.filter(|&(ref x, ref y)| x.raw_slice()
.iter()
.zip(y.raw_slice())
.all(|(v1, v2)| v1 == v2))
.count();
correct as f64 / len

// Row doesn't impl PartialEq
// accuracy(outputs.row_iter(), targets.row_iter())
}

/// Returns the precision score for 2 class classification.
@@ -212,7 +227,8 @@ pub fn neg_mean_squared_error(outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f64
#[cfg(test)]
mod tests {
use linalg::Matrix;
-use super::{accuracy, precision, recall, f1, neg_mean_squared_error};
+use super::{accuracy, precision, recall, f1, neg_mean_squared_error, row_accuracy};


#[test]
fn test_accuracy() {
@@ -331,6 +347,34 @@ mod tests {
f1(outputs.iter(), targets.iter());
}

#[test]
fn test_row_accuracy() {
let outputs = matrix![1, 0;
0, 1;
1, 0];
let targets = matrix![1, 0;
0, 1;
1, 0];
assert_eq!(row_accuracy(&outputs, &targets), 1.0);

let outputs = matrix![1, 0;
0, 1;
1, 0];
let targets = matrix![0, 1;
0, 1;
1, 0];
assert_eq!(row_accuracy(&outputs, &targets), 2. / 3.);

let outputs = matrix![1., 0.;
0., 1.;
1., 0.];
let targets = matrix![0., 1.;
0., 1.;
1., 0.];
assert_eq!(row_accuracy(&outputs, &targets), 2. / 3.);
}


#[test]
fn test_neg_mean_squared_error_1d() {
let outputs = Matrix::new(3, 1, vec![1f64, 2f64, 3f64]);
10 changes: 5 additions & 5 deletions src/data/transforms/minmax.rs
@@ -90,7 +90,7 @@ impl<T: Float> Transformer<Matrix<T>> for MinMaxScaler<T> {
// https://github.com/AtheMathmo/rulinalg/pull/115
let mut input_min_max = vec![(T::max_value(), T::min_value()); features];

-for row in inputs.iter_rows() {
+for row in inputs.row_iter() {
for (idx, (feature, min_max)) in row.into_iter().zip(input_min_max.iter_mut()).enumerate() {
if !feature.is_finite() {
return Err(Error::new(ErrorKind::InvalidData,
@@ -145,12 +145,12 @@ impl<T: Float> Transformer<Matrix<T>> for MinMaxScaler<T> {
Err(Error::new(ErrorKind::InvalidData,
"Input data has different number of columns from fitted data."))
} else {
-for row in inputs.iter_rows_mut() {
-utils::in_place_vec_bin_op(row, scales.data(), |x, &y| {
+for mut row in inputs.row_iter_mut() {
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), scales.data(), |x, &y| {
*x = *x * y;
});

-utils::in_place_vec_bin_op(row, consts.data(), |x, &y| {
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), consts.data(), |x, &y| {
*x = *x + y;
});
}
@@ -174,7 +174,7 @@ impl<T: Float> Invertible<Matrix<T>> for MinMaxScaler<T> {
"Inputs have different feature count than transformer."));
}

-for row in inputs.iter_rows_mut() {
+for mut row in inputs.row_iter_mut() {
for i in 0..features {
row[i] = (row[i] - consts[i]) / scales[i];
}
12 changes: 6 additions & 6 deletions src/data/transforms/standardize.rs
@@ -118,10 +118,10 @@ impl<T: Float + FromPrimitive> Transformer<Matrix<T>> for Standardizer<T> {
Err(Error::new(ErrorKind::InvalidData,
"Input data has different number of columns from fitted data."))
} else {
-for row in inputs.iter_rows_mut() {
+for mut row in inputs.row_iter_mut() {
// Subtract the mean
-utils::in_place_vec_bin_op(row, means.data(), |x, &y| *x = *x - y);
-utils::in_place_vec_bin_op(row, variances.data(), |x, &y| {
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), means.data(), |x, &y| *x = *x - y);
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), variances.data(), |x, &y| {
*x = (*x * self.scaled_stdev / y.sqrt()) + self.scaled_mean
});
}
@@ -143,13 +143,13 @@ impl<T: Float + FromPrimitive> Invertible<Matrix<T>> for Standardizer<T> {
"Inputs have different feature count than transformer."));
}

-for row in inputs.iter_rows_mut() {
-utils::in_place_vec_bin_op(row, &variances.data(), |x, &y| {
+for mut row in inputs.row_iter_mut() {
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), &variances.data(), |x, &y| {
*x = (*x - self.scaled_mean) * y.sqrt() / self.scaled_stdev
});

// Add the mean
-utils::in_place_vec_bin_op(row, &means.data(), |x, &y| *x = *x + y);
+utils::in_place_vec_bin_op(&mut row.raw_slice_mut(), &means.data(), |x, &y| *x = *x + y);
}

Ok(inputs)