-
Notifications
You must be signed in to change notification settings - Fork 150
Latent Dirichlet Allocation #172
base: master
Are you sure you want to change the base?
Changes from 6 commits
4a06bb8
dd18418
8e21fba
0f7db31
2bea906
057c2b8
e34f191
96b2896
3d507c8
e52ee17
39bf12b
89b1ca9
8be4d20
8689743
fa1b0ec
b96e49b
174a522
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
/// An example of how Latent Dirichlet Allocation (LDA) can be used. This example begins by
/// generating a distribution of words to categories. This distribution is created so that
/// there are 10 topics. Each of the 25 words is assigned to two topics with equal probability.
/// (The distribution of words is printed to the screen as a chart. Each entry in the chart
/// corresponds to a word in the vocabulary, arranged into a square for easy viewing.) Documents
/// are then generated based on these distributions (each topic is assumed equally likely to be
/// assigned to a document, but each document has only one topic).
///
/// Once the documents are created, the example uses LDA to attempt to reverse engineer the
/// distribution of words, and prints the results to the screen for comparison.
|
||
extern crate rusty_machine; | ||
extern crate rand; | ||
extern crate rulinalg; | ||
|
||
use rusty_machine::linalg::{Matrix, BaseMatrix, Vector}; | ||
use rusty_machine::learning::UnSupModel; | ||
use rusty_machine::learning::lda::LDA; | ||
|
||
use rand::{thread_rng, Rng}; | ||
use rand::distributions::{gamma, IndependentSample}; | ||
|
||
use std::cmp::max; | ||
|
||
/// Given `topic_count` topics, this function will create a distrbution of words for each | ||
/// topic. For simplicity, this function assumes that the total number of words in the corpus | ||
/// will be `(topic_count / 2)^2`. | ||
fn generate_word_distribution(topic_count: usize) -> Matrix<f64> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor point: Potential issue here with assuming |
||
let width = topic_count / 2; | ||
let vocab_size = width * width; | ||
let initial_value = 1.0 / width as f64; | ||
Matrix::from_fn(topic_count, vocab_size, |col, row| { | ||
if row < width { | ||
// Horizontal topics | ||
if col / width == row { | ||
initial_value | ||
} else { | ||
0.0 | ||
} | ||
} else { | ||
//Vertical topics | ||
if col % width == (row - width) { | ||
initial_value | ||
} else { | ||
0.0 | ||
} | ||
} | ||
}) | ||
} | ||
|
||
/// Samples `count` times from a dirichlet distribution with alpha as given and | ||
/// beta 1.0. | ||
fn get_dirichlet(count: usize, alpha: f64) -> Vector<f64> { | ||
let mut rng = thread_rng(); | ||
let g_dist = gamma::Gamma::new(alpha, 1.0); | ||
let result = Vector::from_fn(count, |_| { | ||
g_dist.ind_sample(&mut rng) | ||
}); | ||
let sum = result.sum(); | ||
result / sum | ||
} | ||
|
||
/// Generates a document based on a word distributiion as given. The topics are randomly sampled | ||
/// from a dirichlet distribution and then the word sampled from the selected topic. | ||
fn generate_document(word_distribution: &Matrix<f64>, topic_count:usize, vocab_size: usize, document_length: usize, alpha: f64) -> Vec<usize> { | ||
let mut document = vec![0; vocab_size]; | ||
let topic_distribution = get_dirichlet(topic_count, alpha); | ||
for _ in 0..document_length { | ||
let topic = choose_from(&topic_distribution); | ||
let word = choose_from(&word_distribution.row(topic).into()); | ||
document[word] += 1; | ||
} | ||
document | ||
} | ||
|
||
/// Generate a collection of documents based on the word distribution | ||
fn generate_documents(word_distribution: &Matrix<f64>, topic_count: usize, vocab_size: usize, document_count: usize, document_length: usize, alpha: f64) -> Matrix<usize> { | ||
let mut documents = Vec::with_capacity(vocab_size * document_count); | ||
for _ in 0..document_count { | ||
documents.append(&mut generate_document(word_distribution, topic_count, vocab_size, document_length, alpha)); | ||
} | ||
Matrix::new(document_count, vocab_size, documents) | ||
} | ||
|
||
/// Chooses from a vector of probailities. | ||
fn choose_from(probability: &Vector<f64>) -> usize { | ||
let mut rng = thread_rng(); | ||
let selection:f64 = rng.next_f64(); | ||
let mut total:f64 = 0.0; | ||
for (index, p) in probability.iter().enumerate() { | ||
total += *p; | ||
if total >= selection { | ||
return index; | ||
} | ||
} | ||
return probability.size() - 1; | ||
} | ||
|
||
/// Displays the distrbution of words to a topic as a square graph | ||
fn topic_to_string(topic: &Vector<f64>, width: usize, topic_index: usize) -> String { | ||
let max = topic.iter().fold(0.0, |a, b|{ | ||
if a > *b { | ||
a | ||
} else { | ||
*b | ||
} | ||
}); | ||
let mut result = String::with_capacity(topic.size() * (topic.size()/width) + 18); | ||
result.push_str(&format!("Topic {}\n", topic_index)); | ||
result.push_str("-------\n"); | ||
for (index, element) in topic.iter().enumerate() { | ||
let col = index % width; | ||
let out = element / max * 9.0; | ||
if out >= 1.0 { | ||
result.push_str(&(out as u32).to_string()); | ||
} else { | ||
result.push('.'); | ||
} | ||
if col == width - 1 { | ||
result.push('\n'); | ||
} | ||
} | ||
result | ||
} | ||
|
||
|
||
/// Prints a collection of multiline strings in columns | ||
/// Prints a collection of multi-line strings side by side in fixed-width
/// columns. Columns with fewer lines than the longest simply stop
/// contributing; blank padding for an exhausted column is emitted only when
/// a later column on the same line still has content.
///
/// Takes `&[String]` rather than `&Vec<String>` (clippy `ptr_arg`); existing
/// callers passing `&Vec<String>` continue to work via deref coercion.
fn print_multi_line(o: &[String], column_width: usize) {
    // Split every input string into its lines up front.
    let columns: Vec<Vec<&str>> = o.iter().map(|col| col.split('\n').collect()).collect();
    let mut line_index = 0;
    loop {
        let mut printed_any = false;
        // Exhausted columns seen since the last printed one; their padding is
        // deferred so trailing blanks are never emitted.
        let mut pending_gap = 0;
        for col in &columns {
            if let Some(line) = col.get(line_index) {
                if pending_gap > 0 {
                    print!("{:width$}", "", width = column_width * pending_gap);
                    pending_gap = 0;
                }
                print!("{:width$}", line, width = column_width);
                printed_any = true;
            } else {
                pending_gap += 1;
            }
        }
        println!();
        if !printed_any {
            break;
        }
        line_index += 1;
    }
}
|
||
|
||
/// Prints the word distribution within topics | ||
fn print_topic_distribution(dist: &Matrix<f64>, topic_count: usize, width: usize) { | ||
let top_strings = &dist.row_iter().take(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1)).collect(); | ||
let bottom_strings = &dist.row_iter().skip(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1 + topic_count / 2)).collect(); | ||
|
||
print_multi_line(top_strings, max(12, width + 1)); | ||
print_multi_line(bottom_strings, max(12, width + 1)); | ||
} | ||
|
||
pub fn main() { | ||
// Set initial constants | ||
// These can be changed as you wish | ||
let topic_count = 28; | ||
let document_length = 100; | ||
let document_count = 500; | ||
let alpha = 0.1; | ||
|
||
// Don't change these though; they are calculated based on the above | ||
let width = topic_count / 2; | ||
let vocab_count = width * width; | ||
println!("Creating word distribution"); | ||
let word_distribution = generate_word_distribution(topic_count); | ||
println!("Distrbution generated:"); | ||
print_topic_distribution(&word_distribution, topic_count, width); | ||
println!("Generating documents"); | ||
let input = generate_documents(&word_distribution, topic_count, vocab_count, document_count, document_length, alpha); | ||
let lda = LDA::new(topic_count, alpha, 0.1); | ||
println!("Predicting word distrbution from generated documents"); | ||
let result = lda.predict(&(input, 30)).unwrap(); | ||
let dist = result.phi(); | ||
println!("Prediction completed. Predicted word distribution:"); | ||
println!("(Should be similar generated distribution above)", ); | ||
|
||
print_topic_distribution(&dist, topic_count, width); | ||
|
||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,9 +44,24 @@ pub fn accuracy<I>(outputs: I, targets: I) -> f64 | |
correct as f64 / len | ||
} | ||
|
||
|
||
/// Returns the fraction of outputs rows which match their target. | ||
pub fn row_accuracy(outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f64 { | ||
accuracy(outputs.iter_rows(), targets.iter_rows()) | ||
pub fn row_accuracy<T: PartialEq>(outputs: &Matrix<T>, targets: &Matrix<T>) -> f64 { | ||
|
||
assert!(outputs.rows() == targets.rows()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm a little confused by this change. I'm guessing it comes from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, I am pretty certain I took this one directly from the linalg bump PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've now merged a new linalg bump PR into master. I guess you'll want to rebase from that (or just C+P the relevant modules onto a new branch :) ) |
||
let len = outputs.rows() as f64; | ||
|
||
let correct = outputs.row_iter() | ||
.zip(targets.row_iter()) | ||
.filter(|&(ref x, ref y)| x.raw_slice() | ||
.iter() | ||
.zip(y.raw_slice()) | ||
.all(|(v1, v2)| v1 == v2)) | ||
.count(); | ||
correct as f64 / len | ||
|
||
// Row doesn't impl PartialEq | ||
// accuracy(outputs.row_iter(), targets.row_iter()) | ||
} | ||
|
||
/// Returns the precision score for 2 class classification. | ||
|
@@ -212,7 +227,8 @@ pub fn neg_mean_squared_error(outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f | |
#[cfg(test)] | ||
mod tests { | ||
use linalg::Matrix; | ||
use super::{accuracy, precision, recall, f1, neg_mean_squared_error}; | ||
use super::{accuracy, precision, recall, f1, neg_mean_squared_error, row_accuracy}; | ||
|
||
|
||
#[test] | ||
fn test_accuracy() { | ||
|
@@ -331,6 +347,34 @@ mod tests { | |
f1(outputs.iter(), targets.iter()); | ||
} | ||
|
||
/// `row_accuracy` should report the fraction of exactly-matching rows, for
/// both integer and floating-point element types.
#[test]
fn test_row_accuracy() {
    // All rows identical -> perfect accuracy.
    let outputs = matrix![1, 0;
                          0, 1;
                          1, 0];
    let targets = matrix![1, 0;
                          0, 1;
                          1, 0];
    assert_eq!(row_accuracy(&outputs, &targets), 1.0);

    // First row differs -> two of three rows match.
    let outputs = matrix![1, 0;
                          0, 1;
                          1, 0];
    let targets = matrix![0, 1;
                          0, 1;
                          1, 0];
    assert_eq!(row_accuracy(&outputs, &targets), 2. / 3.);

    // Same mismatch with f64 elements.
    let outputs = matrix![1., 0.;
                          0., 1.;
                          1., 0.];
    let targets = matrix![0., 1.;
                          0., 1.;
                          1., 0.];
    assert_eq!(row_accuracy(&outputs, &targets), 2. / 3.);
}
|
||
|
||
#[test] | ||
fn test_neg_mean_squared_error_1d() { | ||
let outputs = Matrix::new(3, 1, vec![1f64, 2f64, 3f64]); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo here:
distribution is CREATED
.