
Comparing changes

base repository: PRML/PRMLT
base: v1.1
head repository: PRML/PRMLT
compare: master
Showing with 785 additions and 496 deletions.
  1. +126 −0 Contents.m
  2. +21 −0 LICENSE
  3. +18 −17 README.md
  4. +2 −5 chapter01/entropy.m
  5. +7 −8 chapter04/logitBin.m
  6. +0 −39 chapter05/mlp.m
  7. +63 −0 chapter05/mlpClass.m
  8. +19 −0 chapter05/mlpClassPred.m
  9. +0 −13 chapter05/mlpPred.m
  10. +62 −0 chapter05/mlpReg.m
  11. +17 −0 chapter05/mlpRegPred.m
  12. +1 −1 chapter06/knGauss.m
  13. +8 −11 chapter06/knKmeans.m
  14. +0 −1 chapter07/rvmRegSeq.m
  15. +12 −0 chapter08/MRF/mrfBethe.m
  16. +56 −0 chapter08/MRF/mrfBp.m
  17. +11 −0 chapter08/MRF/mrfGibbs.m
  18. +21 −0 chapter08/MRF/mrfIsGa.m
  19. +34 −0 chapter08/MRF/mrfMf.m
  20. 0 chapter08/{ → NaiveBayes}/nbBern.m
  21. 0 chapter08/{ → NaiveBayes}/nbBernPred.m
  22. +2 −2 chapter08/{ → NaiveBayes}/nbGauss.m
  23. 0 chapter08/{ → NaiveBayes}/nbGaussPred.m
  24. +0 −11 chapter08/betheEnergy.m
  25. +0 −9 chapter08/gibbsEnergy.m
  26. +0 −20 chapter08/im2mrf.m
  27. +0 −30 chapter08/isingMeanField.m
  28. +0 −62 chapter08/mrfBelProp.m
  29. +0 −55 chapter08/mrfExpProp.m
  30. +0 −31 chapter08/mrfMeanField.m
  31. +1 −1 chapter09/kmeansRnd.m
  32. +1 −1 chapter09/kmedoids.m
  33. +5 −5 chapter09/linRegEm.m
  34. +3 −3 chapter10/linRegVb.m
  35. +3 −3 chapter10/rvmRegVb.m
  36. +1 −3 chapter11/discreteRnd.m
  37. +6 −6 chapter12/ppcaVb.m
  38. +2 −2 chapter13/HMM/hmmEm.m
  39. +2 −2 chapter13/LDS/kalmanFilter.m
  40. +17 −17 chapter13/LDS/kalmanSmoother.m
  41. +43 −36 chapter13/LDS/ldsEm.m
  42. +20 −0 chapter13/LDS/ldsPca.m
  43. +15 −13 chapter13/LDS/ldsRnd.m
  44. +0 −1 chapter14/mixLinReg.m
  45. +7 −0 common/log1mexp.m
  46. +5 −4 common/log1pexp.m
  47. +3 −3 demo/ch04/logitBin_demo.m
  48. +32 −9 demo/ch05/mlp_demo.m
  49. +22 −0 demo/ch06/knKmeans_demo.m
  50. +1 −1 demo/ch07/rvmBinEm_demo.m
  51. +1 −1 demo/ch07/rvmBinFp_demo.m
  52. +34 −42 demo/ch08/mrf_demo.m
  53. +0 −13 demo/ch09/rvmBinEm_demo.m
  54. +2 −4 demo/ch12/ppcaVb_demo.m
  55. +79 −11 demo/ch13/lds_demo.m
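To reproduce this file list locally, a diffstat over the same refs should work (assuming the v1.1 tag and master branch are present in the clone):

```console
git clone https://github.com/PRML/PRMLT.git
cd PRMLT
git diff --stat v1.1 master
```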
126 changes: 126 additions & 0 deletions Contents.m
@@ -0,0 +1,126 @@
% CHAPTER01
% condEntropy - Compute conditional entropy z=H(x|y) of two discrete variables x and y.
% entropy - Compute entropy z=H(x) of a discrete variable x.
% jointEntropy - Compute joint entropy z=H(x,y) of two discrete variables x and y.
% mutInfo - Compute mutual information I(x,y) of two discrete variables x and y.
% nmi - Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y.
% nvi - Compute normalized variation information z=(1-I(x,y)/H(x,y)) of two discrete variables x and y.
% relatEntropy - Compute relative entropy (a.k.a. KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y.
% CHAPTER02
% logDirichlet - Compute log pdf of a Dirichlet distribution.
% logGauss - Compute log pdf of a Gaussian distribution.
% logKde - Compute log pdf of kernel density estimator.
% logMn - Compute log pdf of a multinomial distribution.
% logMvGamma - Compute logarithm of the multivariate Gamma function.
% logSt - Compute log pdf of a Student's t distribution.
% logVmf - Compute log pdf of a von Mises-Fisher distribution.
% logWishart - Compute log pdf of a Wishart distribution.
% CHAPTER03
% linReg - Fit linear regression model y=w'x+w0
% linRegFp - Fit empirical Bayesian linear model with Mackay fixed point method (p.168)
% linRegPred - Compute linear regression model response y = w'*X+w0 and likelihood
% linRnd - Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta)
% CHAPTER04
% binPlot - Plot binary classification result for 2d data
% fda - Fisher (linear) discriminant analysis
% logitBin - Logistic regression for binary classification optimized by Newton-Raphson method.
% logitBinPred - Prediction of binary logistic regression model
% logitMn - Multinomial regression for multiclass problem (Multinomial likelihood)
% logitMnPred - Prediction of multiclass (multinomial) logistic regression model
% sigmoid - Sigmoid function
% softmax - Softmax function
% CHAPTER05
% mlpClass - Train a multilayer perceptron neural network for classification with backpropagation
% mlpClassPred - Multilayer perceptron classification prediction
% mlpReg - Train a multilayer perceptron neural network for regression with backpropagation
% mlpRegPred - Multilayer perceptron regression prediction
% CHAPTER06
% kn2sd - Transform a kernel matrix (or inner product matrix) to a squared distance matrix
% knCenter - Center the data in the kernel space
% knGauss - Gaussian (RBF) kernel K = exp(-|x-y|^2/(2s^2))
% knKmeans - Perform kernel kmeans clustering.
% knKmeansPred - Prediction for kernel kmeans clustering
% knLin - Linear kernel (inner product)
% knPca - Kernel PCA
% knPcaPred - Prediction for kernel PCA
% knPoly - Polynomial kernel k(x,y)=(x'y+c)^o
% knReg - Gaussian process (kernel) regression
% knRegPred - Prediction for Gaussian Process (kernel) regression model
% sd2kn - Transform a squared distance matrix to a kernel matrix.
% CHAPTER07
% rvmBinFp - Relevance Vector Machine (ARD sparse prior) for binary classification.
% rvmBinPred - Predict the label for binary logistic regression model
% rvmRegFp - Relevance Vector Machine (ARD sparse prior) for regression
% rvmRegPred - Compute RVM regression model response y = w'*X+w0 and likelihood
% rvmRegSeq - Sparse Bayesian Regression (RVM) using sequential algorithm
% CHAPTER08
% MRF
% mrfBethe - Compute Bethe energy
% mrfBp - Undirected graph belief propagation for MRF
% mrfGibbs - Compute Gibbs energy
% mrfIsGa - Construct a latent Ising MRF with Gaussian observation
% mrfMf - Mean field for MRF
% NaiveBayes
% nbBern - Naive Bayes classifier with independent Bernoulli.
% nbBernPred - Prediction of naive Bayes classifier with independent Bernoulli.
% nbGauss - Naive Bayes classifier with independent Gaussian
% nbGaussPred - Prediction of naive Bayes classifier with independent Gaussian.
% CHAPTER09
% kmeans - Perform kmeans clustering.
% kmeansPred - Prediction for kmeans clustering
% kmeansRnd - Generate samples from a Gaussian mixture distribution with common variances (kmeans model).
% kmedoids - Perform k-medoids clustering.
% kseeds - Perform kmeans++ seeding
% linRegEm - Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4)
% mixBernEm - Perform EM algorithm for fitting the Bernoulli mixture model.
% mixBernRnd - Generate samples from a Bernoulli mixture distribution.
% mixGaussEm - Perform EM algorithm for fitting the Gaussian mixture model.
% mixGaussPred - Predict label and responsibility for Gaussian mixture model.
% mixGaussRnd - Generate samples from a Gaussian mixture model.
% rvmBinEm - Relevance Vector Machine (ARD sparse prior) for binary classification.
% rvmRegEm - Relevance Vector Machine (ARD sparse prior) for regression
% CHAPTER10
% linRegVb - Variational Bayesian inference for linear regression.
% mixGaussEvidence - Variational lower bound of the model evidence (log of marginal likelihood)
% mixGaussVb - Variational Bayesian inference for Gaussian mixture.
% mixGaussVbPred - Predict label and responsibility for Gaussian mixture model trained by VB.
% rvmRegVb - Variational Bayesian inference for RVM regression.
% CHAPTER11
% dirichletRnd - Generate samples from a Dirichlet distribution.
% discreteRnd - Generate samples from a discrete distribution (multinomial).
% Gauss - Class for Gaussian distribution used by Dirichlet process
% gaussRnd - Generate samples from a Gaussian distribution.
% GaussWishart - Class for Gaussian-Wishart distribution used by Dirichlet process
% mixDpGb - Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model.
% mixDpGbOl - Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model.
% mixGaussGb - Collapsed Gibbs sampling for Dirichlet process (infinite) Gaussian mixture model (a.k.a. DPGM).
% mixGaussSample - Generate samples from a Gaussian mixture model with Gaussian-Wishart prior.
% CHAPTER12
% fa - Perform EM algorithm for factor analysis model
% pca - Principal component analysis
% pcaEm - Perform EM-like algorithm for PCA (by Sam Roweis).
% pcaEmC - Perform constrained EM-like algorithm for PCA.
% ppcaEm - Perform EM algorithm to maximize likelihood of probabilistic PCA model.
% ppcaRnd - Generate data from probabilistic PCA model
% ppcaVb - Perform variational Bayesian inference for probabilistic PCA model.
% CHAPTER13
% HMM
% hmmEm - EM algorithm to fit the parameters of HMM model (a.k.a. Baum-Welch algorithm)
% hmmFilter - HMM forward filtering algorithm.
% hmmRnd - Generate a data sequence from a hidden Markov model.
% hmmSmoother - HMM smoothing algorithm (normalized forward-backward or normalized alpha-beta algorithm).
% hmmViterbi - Viterbi algorithm (calculated in log scale to improve numerical stability).
% LDS
% kalmanFilter - Kalman filter (forward algorithm for linear dynamic system)
% kalmanSmoother - Kalman smoother (forward-backward algorithm for linear dynamic system)
% ldsEm - EM algorithm for parameter estimation of linear dynamic system.
% ldsPca - Subspace method for learning linear dynamic system.
% ldsRnd - Generate a data sequence from linear dynamic system.
% CHAPTER14
% adaboostBin - Adaboost for binary classification (weak learner: kmeans)
% adaboostBinPred - Prediction of binary Adaboost
% mixLinPred - Prediction function for mixture of linear regression
% mixLinReg - Mixture of linear regression
% mixLinRnd - Generate data from mixture of linear model
% mixLogitBin - Mixture of logistic regression model for binary classification optimized by Newton-Raphson method
% mixLogitBinPred - Prediction function for mixture of logistic regression
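Contents.m is MATLAB's standard per-folder index, so `help` can display it once the package is installed, and the listed functions can be called directly. A minimal sketch using a few of the Chapter 1 functions above (the random data is made up for illustration):

```matlab
% Minimal sketch, assuming init.m has added the toolbox folders to the path.
x = randi(4, 1, 1000);   % a discrete variable with 4 states
y = randi(4, 1, 1000);   % a second, independent variable
Hx  = entropy(x);        % H(x), close to log2(4) = 2 bits here
Ixy = mutInfo(x, y);     % I(x,y), near 0 for independent x and y
z   = nmi(x, y);         % normalized mutual information in [0,1]
```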
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Mo Chen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
35 changes: 18 additions & 17 deletions README.md
@@ -1,36 +1,37 @@
Introduction
-------
This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
This Matlab package implements machine learning algorithms described in the great textbook:
Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).

Note: this package requires Matlab **R2016b** or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
It is written purely in Matlab language. It is self-contained. There is no external dependency.

Description
-------
While developing this package, I stick to following principles
Note: this package requires Matlab **R2016b** or later, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires Statistics Toolbox (for some simple random number generators) and Image Processing Toolbox (for reading image data).

* Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
* Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.
* Robust: Many tricks for numerical stability are applied, such as probability computation in log scale and square root matrix update to enforce matrix symmetry, etc.
* Learnable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
* Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
Design Goal
-------
* Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted.
* Efficient: Many tricks for speeding up Matlab code are applied (e.g. vectorization, matrix factorization, etc.). Usually, functions in this package are orders of magnitude faster than Matlab builtin ones (e.g. kmeans).
* Robust: Many tricks for numerical stability are applied, such as computing probability in the logarithm domain (see the sketch after this list), square root matrix update to enforce matrix symmetry/PD, etc.
* Readable: The code is heavily commented. Corresponding formulas in PRML are annotated. Symbols are in sync with the book.
* Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
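The log-domain computation the Robust bullet refers to is what common/log1pexp.m (touched in this diff) exists for. A tiny sketch of the idea; the guarded formula here is an assumption about the technique, not a quote of that file:

```matlab
% Naive log(1+exp(a)) overflows for large a; the guarded form does not.
a = 800;
naive  = log(1 + exp(a));                  % Inf: exp(800) overflows double
stable = max(a, 0) + log1p(exp(-abs(a)));  % 800: mathematically identical
```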

Installation
-------
1. Download the package to your local path (e.g. PRMLT/) by running: `git clone https://github.com/PRML/PRMLT.git`.
1. Download the package to a local folder (e.g. ~/PRMLT/) by running:
```console
git clone https://github.com/PRML/PRMLT.git
```
2. Run Matlab and navigate to the folder (~/PRMLT/), then run the init.m script.

2. Run Matlab and navigate to PRMLT/, then run the init.m script.

3. Run some demos in PRMLT/demo directory. Enjoy!
3. Run some demos in ~/PRMLT/demo folder. Enjoy!
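Steps 2 and 3, spelled out at the MATLAB prompt (the clone location and choice of demo are just examples):

```matlab
cd ~/PRMLT      % folder from step 1
init            % init.m adds the toolbox subfolders to the path
cd demo/ch06    % any demo folder works
knKmeans_demo   % e.g. the kernel kmeans demo added in this diff
```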

FeedBack
-------
If you found any bug or have any suggestion, please do file issues. I am graceful for any feedback and will do my best to improve this package.
If you find any bug or have any suggestion, please do file issues. I am grateful for any feedback and will do my best to improve this package.

License
-------
Currently Released Under GPLv3

Released under MIT license

Contact
-------
7 changes: 2 additions & 5 deletions chapter01/entropy.m
@@ -6,10 +6,7 @@
% z: entropy z=H(x)
% Written by Mo Chen (sth4nth@gmail.com).
n = numel(x);
[u,~,x] = unique(x);
k = numel(u);
idx = 1:n;
Mx = sparse(idx,x,1,n,k,n);
Px = nonzeros(mean(Mx,1));
[~,~,x] = unique(x);
Px = accumarray(x, 1)/n;
Hx = -dot(Px,log2(Px));
z = max(0,Hx);
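Assembled from the hunk above, the post-change entropy.m reads roughly as follows; unique maps the raw values to indices 1..k, and accumarray counts them (the x(:) reshape is a defensive assumption, since accumarray wants a column):

```matlab
function z = entropy(x)
% Compute entropy z = H(x) of a discrete variable x (reconstructed sketch).
n = numel(x);
[~,~,x] = unique(x);          % map raw values to indices 1..k
Px = accumarray(x(:), 1)/n;   % empirical probability of each value
Hx = -dot(Px, log2(Px));
z = max(0, Hx);               % clamp tiny negative round-off to zero
```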
15 changes: 7 additions & 8 deletions chapter04/logitBin.m
@@ -1,16 +1,16 @@
function [model, llh] = logitBin(X, y, lambda, eta)
function [model, llh] = logitBin(X, y, lambda)
% Logistic regression for binary classification optimized by Newton-Raphson method.
% Input:
% X: d x n data matrix
% z: 1 x n label (0/1)
% y: 1 x n label (0/1)
% lambda: regularization parameter
% eta: step size
% alpha: step size
% Output:
% model: trained model structure
% llh: loglikelihood
% Written by Mo Chen (sth4nth@gmail.com).
if nargin < 4
eta = 1e-1;
alpha = 1e-1;
end
if nargin < 3
lambda = 1e-4;
@@ -20,18 +20,17 @@
tol = 1e-4;
epoch = 200;
llh = -inf(1,epoch);
h = 2*y-1;
w = rand(d,1);
for t = 2:epoch
a = w'*X;
llh(t) = -(sum(log1pexp(-h.*a))+0.5*lambda*dot(w,w))/n; % 4.89
if llh(t)-llh(t-1) < tol; break; end
llh(t) = (dot(a,y)-sum(log1pexp(a))-0.5*lambda*dot(w,w))/n; % 4.90
if abs(llh(t)-llh(t-1)) < tol; break; end
z = sigmoid(a); % 4.87
g = X*(z-y)'+lambda*w; % 4.96
r = z.*(1-z); % 4.98
Xw = bsxfun(@times, X, sqrt(r));
H = Xw*Xw'+lambda*eye(d); % 4.97
w = w-eta*(H\g);
w = w-alpha*(H\g); % 4.92
end
llh = llh(2:t);
model.w = w;
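A hedged usage sketch for the revised three-argument interface (synthetic, roughly separable data; the plotted log-likelihood should rise toward a plateau):

```matlab
% Sketch, assuming the PRMLT path has been set up by init.m.
d = 2; n = 200;
X = randn(d, n);
y = double(sum(X, 1) > 0);            % 1 x n labels in {0,1}
[model, llh] = logitBin(X, y, 1e-4);  % lambda = 1e-4, default step size
plot(llh); xlabel('iteration'); ylabel('log-likelihood');
```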
39 changes: 0 additions & 39 deletions chapter05/mlp.m

This file was deleted.

63 changes: 63 additions & 0 deletions chapter05/mlpClass.m
@@ -0,0 +1,63 @@
function [model, L] = mlpClass(X, y, k, lambda)
% Train a multilayer perceptron neural network for multiclass classification with backpropagation
% logistic activation function is used.
% Input:
% X: d x n data matrix
% y: 1 x n label vector
% k: T x 1 vector to specify number of hidden nodes in each layer
% lambda: regularization parameter
% Output:
% model: model structure
% L: (regularized cross entropy) loss
% Written by Mo Chen (sth4nth@gmail.com).
if nargin < 4
lambda = 1e-2;
end
eta = 1e-3;
tol = 1e-4;
maxiter = 50000;
L = inf(1,maxiter);

Y = sparse(y,1:numel(y),1);
k = [size(X,1);k(:);size(Y,1)];
T = numel(k)-1;
W = cell(T,1);
b = cell(T,1);
for t = 1:T
W{t} = randn(k(t),k(t+1));
b{t} = randn(k(t+1),1);
end
R = cell(T,1);
Z = cell(T+1,1);
Z{1} = X;
for iter = 2:maxiter
% forward
for t = 1:T-1
Z{t+1} = sigmoid(W{t}'*Z{t}+b{t}); % 5.10 5.113
end
Z{T+1} = softmax(W{T}'*Z{T}+b{T});

% loss
E = Z{T+1};
Wn = cellfun(@(x) dot(x(:),x(:)),W); % |W|^2
L(iter) = -dot(Y(:),log(E(:)))+0.5*lambda*sum(Wn);
if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end

% backward
R{T} = Z{T+1}-Y;
for t = T-1:-1:1
df = Z{t+1}.*(1-Z{t+1}); % h'(a)
R{t} = df.*(W{t+1}*R{t+1}); % 5.66
end

% gradient descent
for t=1:T
dW = Z{t}*R{t}'+lambda*W{t}; % 5.67
db = sum(R{t},2);
W{t} = W{t}-eta*dW; % 5.43
b{t} = b{t}-eta*db;
end
end
L = L(2:iter);
model.W = W;
model.b = b;
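In PRML's notation, the backward loop implements the back-propagation recursion that the % 5.66 comment points at, with the logistic derivative h'(a) = z(1-z), and the update step uses the regularized gradient of (5.67):

$$\delta^{(t)} = z^{(t+1)}\,(1-z^{(t+1)}) \odot \big(W^{(t+1)}\,\delta^{(t+1)}\big), \qquad \nabla_{W^{(t)}} E = Z^{(t)}\,(\delta^{(t)})^{\mathsf{T}} + \lambda W^{(t)}$$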
19 changes: 19 additions & 0 deletions chapter05/mlpClassPred.m
@@ -0,0 +1,19 @@
function [y, P] = mlpClassPred(model, X)
% Multilayer perceptron classification prediction
% logistic activation function is used.
% Input:
% model: model structure
% X: d x n data matrix
% Output:
% y: 1 x n label vector
% P: k x n probability matrix
% Written by Mo Chen (sth4nth@gmail.com).
W = model.W;
b = model.b;
T = length(W);
Z = X;
for t = 1:T-1
Z = sigmoid(W{t}'*Z+b{t});
end
P = softmax(W{T}'*Z+b{T});
[~,y] = max(P,[],1);
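An end-to-end sketch pairing mlpClass with mlpClassPred; kmeansRnd's (d, k, n) argument order is an assumption based on chapter09's generators:

```matlab
% Hedged sketch: train on synthetic clustered data, then predict on it.
[X, y] = kmeansRnd(2, 3, 500);         % 2-d data, 3 classes, 500 points
[model, L] = mlpClass(X, y, [10; 10]); % two hidden layers of 10 units each
[t, P] = mlpClassPred(model, X);
fprintf('training accuracy: %.2f\n', mean(t == y));
plot(L); xlabel('iteration'); ylabel('regularized loss');
```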