From 58dd5112032819bce5c2fb7fa84c3e1c1dc38fc3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 29 Aug 2016 01:03:12 +0800
Subject: [PATCH 001/119] added bp ep stubs

---
 chapter08/fgSp.m       | 7 +++++++
 chapter08/hmmBp.m      | 6 ++++++
 chapter08/mrfSp.m      | 5 +++++
 chapter10/gpEp.m       | 7 +++++++
 chapter10/hmmEp.m      | 7 +++++++
 chapter10/mixGaussEp.m | 7 +++++++
 6 files changed, 39 insertions(+)
 create mode 100644 chapter08/fgSp.m
 create mode 100644 chapter08/hmmBp.m
 create mode 100644 chapter08/mrfSp.m
 create mode 100644 chapter10/gpEp.m
 create mode 100644 chapter10/hmmEp.m
 create mode 100644 chapter10/mixGaussEp.m

diff --git a/chapter08/fgSp.m b/chapter08/fgSp.m
new file mode 100644
index 0000000..a41b324
--- /dev/null
+++ b/chapter08/fgSp.m
@@ -0,0 +1,7 @@
+function model = fgSp(A)
+% sum product belief propagation on factor graph
+% support parallel schedule and serial schdule
+% A: affinity matrix (sparse) of a MRF graph
+
+
+
diff --git a/chapter08/hmmBp.m b/chapter08/hmmBp.m
new file mode 100644
index 0000000..0c86c69
--- /dev/null
+++ b/chapter08/hmmBp.m
@@ -0,0 +1,6 @@
+function [ output_args ] = hmmBp( input_args )
+% sum product belief propagation for HMM model
+% support parallel schedule and serial schdule
+% A: affinity matrix (sparse) of a MRF graph
+
+
diff --git a/chapter08/mrfSp.m b/chapter08/mrfSp.m
new file mode 100644
index 0000000..1e4671d
--- /dev/null
+++ b/chapter08/mrfSp.m
@@ -0,0 +1,5 @@
+function model = mrfSp(A)
+% sum product belief propagation on Markov random field (undirected graphical model)
+% support parallel schedule and serial schdule
+% A: affinity matrix (sparse) of a MRF graph
+
diff --git a/chapter10/gpEp.m b/chapter10/gpEp.m
new file mode 100644
index 0000000..0ee4a59
--- /dev/null
+++ b/chapter10/gpEp.m
@@ -0,0 +1,7 @@
+function [ output_args ] = gpEp( input_args )
+%GPEP Summary of this function goes here
+%   Detailed explanation goes here
+
+
+end
+
diff --git a/chapter10/hmmEp.m b/chapter10/hmmEp.m
new file mode 100644
index 0000000..00d0807
--- /dev/null
+++ b/chapter10/hmmEp.m
@@ -0,0 +1,7 @@
+function [ output_args ] = hmmEp( input_args )
+%HMMEP Summary of this function goes here
+%   Detailed explanation goes here
+
+
+end
+
diff --git a/chapter10/mixGaussEp.m b/chapter10/mixGaussEp.m
new file mode 100644
index 0000000..6d56405
--- /dev/null
+++ b/chapter10/mixGaussEp.m
@@ -0,0 +1,7 @@
+function [ output_args ] = mixGaussEp( input_args )
+%MIXGAUSSEP Summary of this function goes here
+%   Detailed explanation goes here
+
+
+end
+

From 7067c9450d80b78cbabb44e2e9db33d3d5f6d751 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 9 Feb 2017 21:20:26 +0800
Subject: [PATCH 002/119] minor tweak

---
 common/logsumexp.m   | 2 +-
 common/normalize.m   | 2 +-
 common/standardize.m | 5 -----
 common/unitize.m     | 2 +-
 4 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/common/logsumexp.m b/common/logsumexp.m
index 9838e4f..1098342 100644
--- a/common/logsumexp.m
+++ b/common/logsumexp.m
@@ -2,7 +2,7 @@
 % Compute log(sum(exp(X),dim)) while avoiding numerical underflow.
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
-if nargin == 1, 
+if nargin == 1
     % Determine which dimension sum will use
     dim = find(size(X)~=1,1);
     if isempty(dim), dim = 1; end
diff --git a/common/normalize.m b/common/normalize.m
index be81e2c..c7ae7a1 100644
--- a/common/normalize.m
+++ b/common/normalize.m
@@ -2,7 +2,7 @@
 % Normalize the vectors to be summing to one
 %   By default dim = 1 (columns).
 % Written by Michael Chen (sth4nth@gmail.com).
-if nargin == 1, 
+if nargin == 1
     % Determine which dimension sum will use
     dim = find(size(X)~=1,1);
     if isempty(dim), dim = 1; end
diff --git a/common/standardize.m b/common/standardize.m
index 233fadd..14321ab 100644
--- a/common/standardize.m
+++ b/common/standardize.m
@@ -2,11 +2,6 @@
 % Unitize the vectors to be unit length
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
-if nargin == 1, 
-    % Determine which dimension sum will use
-    dim = find(size(X)~=1,1);
-    if isempty(dim), dim = 1; end
-end
 X = bsxfun(@minux,X,mean(X,2));
 s = sqrt(mean(sum(X.^2,1)));
 Y = X/s;
\ No newline at end of file
diff --git a/common/unitize.m b/common/unitize.m
index feb12bb..22297be 100644
--- a/common/unitize.m
+++ b/common/unitize.m
@@ -2,7 +2,7 @@
 % Unitize the vectors to be unit length
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
-if nargin == 1, 
+if nargin == 1
     % Determine which dimension sum will use
     dim = find(size(X)~=1,1);
     if isempty(dim), dim = 1; end

From 03bdc614a3438b17985ae508965496e4a0ffe819 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 14 Feb 2017 16:59:36 +0800
Subject: [PATCH 003/119] tweak logsumexp a little. nothing serious

---
 chapter04/softmax.m |  2 +-
 common/logsumexp.m  | 13 ++++---------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/chapter04/softmax.m b/chapter04/softmax.m
index e7ab72e..429e5d5 100644
--- a/chapter04/softmax.m
+++ b/chapter04/softmax.m
@@ -2,7 +2,7 @@
 % Softmax function
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
-if nargin == 1, 
+if nargin == 1
     % Determine which dimension sum will use
     dim = find(size(x)~=1,1);
     if isempty(dim), dim = 1; end
diff --git a/common/logsumexp.m b/common/logsumexp.m
index 1098342..67b36bc 100644
--- a/common/logsumexp.m
+++ b/common/logsumexp.m
@@ -3,15 +3,10 @@
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin == 1
-    % Determine which dimension sum will use
     dim = find(size(X)~=1,1);
     if isempty(dim), dim = 1; end
 end
-
-% subtract the largest in each dim
-y = max(X,[],dim);
-s = y+log(sum(exp(bsxfun(@minus,X,y)),dim));   % TODO: use log1p
-i = isinf(y);
-if any(i(:))
-    s(i) = y(i);
-end
\ No newline at end of file
+a = max(X,[],dim);
+s = a+log(sum(exp(X-a),dim));   % TODO: use log1p
+i = isinf(a);
+s(i) = a(i);
\ No newline at end of file

From 03857d11afa607f643c7e6f516164c1e4ca8d658 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 14 Feb 2017 18:33:09 +0800
Subject: [PATCH 004/119] tweak softmax

---
 chapter04/softmax.m | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/chapter04/softmax.m b/chapter04/softmax.m
index 429e5d5..3f82baf 100644
--- a/chapter04/softmax.m
+++ b/chapter04/softmax.m
@@ -1,10 +1,10 @@
-function s = softmax(x, dim)
+function [Y,s] = softmax(X, dim)
 % Softmax function
 %   By default dim = 1 (columns).
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin == 1
-    % Determine which dimension sum will use
-    dim = find(size(x)~=1,1);
+    dim = find(size(X)~=1,1);
     if isempty(dim), dim = 1; end
 end
-s = exp(bsxfun(@minus,x,logsumexp(x,dim)));
+s = logsumexp(X,dim);
+Y = exp(X-s);

From da5b8c690264db83eb3c5c6469dbc2a9b4fb5a0b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 27 Feb 2017 00:18:10 +0800
Subject: [PATCH 005/119] remove empty stub functions

---
 chapter08/fgSp.m       | 7 -------
 chapter08/hmmBp.m      | 6 ------
 chapter08/mrfSp.m      | 5 -----
 chapter10/gpEp.m       | 7 -------
 chapter10/hmmEp.m      | 7 -------
 chapter10/mixGaussEp.m | 7 -------
 6 files changed, 39 deletions(-)
 delete mode 100644 chapter08/fgSp.m
 delete mode 100644 chapter08/hmmBp.m
 delete mode 100644 chapter08/mrfSp.m
 delete mode 100644 chapter10/gpEp.m
 delete mode 100644 chapter10/hmmEp.m
 delete mode 100644 chapter10/mixGaussEp.m

diff --git a/chapter08/fgSp.m b/chapter08/fgSp.m
deleted file mode 100644
index a41b324..0000000
--- a/chapter08/fgSp.m
+++ /dev/null
@@ -1,7 +0,0 @@
-function model = fgSp(A)
-% sum product belief propagation on factor graph
-% support parallel schedule and serial schdule
-% A: affinity matrix (sparse) of a MRF graph
-
-
-
diff --git a/chapter08/hmmBp.m b/chapter08/hmmBp.m
deleted file mode 100644
index 0c86c69..0000000
--- a/chapter08/hmmBp.m
+++ /dev/null
@@ -1,6 +0,0 @@
-function [ output_args ] = hmmBp( input_args )
-% sum product belief propagation for HMM model
-% support parallel schedule and serial schdule
-% A: affinity matrix (sparse) of a MRF graph
-
-
diff --git a/chapter08/mrfSp.m b/chapter08/mrfSp.m
deleted file mode 100644
index 1e4671d..0000000
--- a/chapter08/mrfSp.m
+++ /dev/null
@@ -1,5 +0,0 @@
-function model = mrfSp(A)
-% sum product belief propagation on Markov random field (undirected graphical model)
-% support parallel schedule and serial schdule
-% A: affinity matrix (sparse) of a MRF graph
-
diff --git a/chapter10/gpEp.m b/chapter10/gpEp.m
deleted file mode 100644
index 0ee4a59..0000000
--- a/chapter10/gpEp.m
+++ /dev/null
@@ -1,7 +0,0 @@
-function [ output_args ] = gpEp( input_args )
-%GPEP Summary of this function goes here
-%   Detailed explanation goes here
-
-
-end
-
diff --git a/chapter10/hmmEp.m b/chapter10/hmmEp.m
deleted file mode 100644
index 00d0807..0000000
--- a/chapter10/hmmEp.m
+++ /dev/null
@@ -1,7 +0,0 @@
-function [ output_args ] = hmmEp( input_args )
-%HMMEP Summary of this function goes here
-%   Detailed explanation goes here
-
-
-end
-
diff --git a/chapter10/mixGaussEp.m b/chapter10/mixGaussEp.m
deleted file mode 100644
index 6d56405..0000000
--- a/chapter10/mixGaussEp.m
+++ /dev/null
@@ -1,7 +0,0 @@
-function [ output_args ] = mixGaussEp( input_args )
-%MIXGAUSSEP Summary of this function goes here
-%   Detailed explanation goes here
-
-
-end
-

From a08bbf1f67f868eeee30d704291853eb28499a53 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 27 Feb 2017 00:25:23 +0800
Subject: [PATCH 006/119] Update TODO.txt

---
 TODO.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/TODO.txt b/TODO.txt
index 09388db..4b7292b 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,4 +1,3 @@
 TODO: 
-ch10: EP
 ch13: LDS numerical stability (numerical stable (square root) version of Kalman filter and smoother)
 ch05: MLP bias and gradient unit

From a765f9e6c405f47638306655b926c035dffefe9a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 9 Mar 2017 23:39:23 +0800
Subject: [PATCH 007/119] refactored HMM code

---
 chapter13/HMM/hmmEm.m           | 37 +++++++++++++++++++++------------
 chapter13/HMM/hmmFilter.m       | 31 ++++++++++++++++++---------
 chapter13/HMM/hmmFilter_.m      | 19 -----------------
 chapter13/HMM/hmmRecSmoother_.m | 23 --------------------
 chapter13/HMM/hmmSmoother.m     | 37 ++++++++++++++++++++++-----------
 chapter13/HMM/hmmSmoother_.m    | 29 --------------------------
 chapter13/HMM/hmmViterbi.m      | 36 +++++++++++++++++++++-----------
 chapter13/HMM/hmmViterbi_.m     | 25 ----------------------
 demo/ch13/hmm_demo.m            | 17 +++++++--------
 9 files changed, 102 insertions(+), 152 deletions(-)
 delete mode 100644 chapter13/HMM/hmmFilter_.m
 delete mode 100644 chapter13/HMM/hmmRecSmoother_.m
 delete mode 100644 chapter13/HMM/hmmSmoother_.m
 delete mode 100644 chapter13/HMM/hmmViterbi_.m

diff --git a/chapter13/HMM/hmmEm.m b/chapter13/HMM/hmmEm.m
index c14a897..ef5829e 100644
--- a/chapter13/HMM/hmmEm.m
+++ b/chapter13/HMM/hmmEm.m
@@ -8,37 +8,48 @@
 %   llh: loglikelihood
 % Written by Mo Chen (sth4nth@gmail.com).
 n = size(x,2);
-d = max(x);
-X = sparse(x,1:n,1,d,n);
-
+X = sparse(x,1:n,1);
+d = size(X,1);
 if isstruct(init)   % init with a model
     A = init.A;
     E = init.E;
     s = init.s;
 elseif numel(init) == 1  % random init with latent k
     k = init;
+    s = normalize(rand(k,1),1);  
     A = normalize(rand(k,k),2);
     E = normalize(rand(k,d),2);
-    s = normalize(rand(k,1),1);
 end
-M = E*X;
-
 tol = 1e-4;
 maxIter = 100;
 llh = -inf(1,maxIter);
 for iter = 2:maxIter
+    M = E*X;
 %     E-step
-    [gamma,alpha,beta,c] = hmmSmoother_(M,A,s);
-    llh(iter) = sum(log(c(c>0)));
+    [gamma,alpha,beta,c] = hmmSmoother(M,A,s);
+    llh(iter) = mean(log(c));
     if llh(iter)-llh(iter-1) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
 %     M-step 
-    A = normalize(A.*(alpha(:,1:n-1)*bsxfun(@times,beta(:,2:n).*M(:,2:n),1./c(2:end))'),2);      % 13.19
     s = gamma(:,1);                                                                             % 13.18
-    M = bsxfun(@times,gamma*X',1./sum(gamma,2))*X;                                          
+    A = normalize(A.*(alpha(:,1:n-1)*(beta(:,2:n).*M(:,2:n)./c(2:n))'),2);      % 13.19 13.43 13.65
+    E = bsxfun(@times,gamma*X',1./sum(gamma,2));                                                 % 13.23
 end
-llh = llh(2:iter);
+model.s = s;
 model.A = A;
 model.E = E;
-model.s = s;
-
+llh = llh(2:iter);
 
+function [gamma, alpha, beta, c] = hmmSmoother(M, A, s)
+[K,T] = size(M);
+At = A';
+c = zeros(1,T);
+alpha = zeros(K,T);
+[alpha(:,1),c(1)] = normalize(s.*M(:,1),1);
+for t = 2:T
+    [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);  % 13.59
+end
+beta = ones(K,T);
+for t = T-1:-1:1
+    beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1);   % 13.62
+end
+gamma = alpha.*beta;                  % 13.64
diff --git a/chapter13/HMM/hmmFilter.m b/chapter13/HMM/hmmFilter.m
index 1be2c63..c6fd1da 100644
--- a/chapter13/HMM/hmmFilter.m
+++ b/chapter13/HMM/hmmFilter.m
@@ -1,20 +1,31 @@
-function [alpha, energy] = hmmFilter(x, model)
-% HMM forward filtering algorithm. This is a wrapper function which transform input and call underlying algorithm
-% Unlike the method described in the book of PRML, the alpha returned is the normalized version: alpha(t)=p(z_t|x_{1:t})
-% Computing unnormalized version alpha(t)=p(z_t,x_{1:t}) is numerical unstable, which grows exponential fast to infinity.
+function [alpha, llh] = hmmFilter0(model, x)
+% HMM forward filtering algorithm. 
+% The alpha returned by this function is the normalized version (posterior): alpha(t)=p(z_t|x_{1:t})
+% Unnormalized version (joint distribution): alpha(t)=p(z_t,x_{1:t}) is numerical unstable.
 % Input:
 %   x: 1 x n integer vector which is the sequence of observations
-%   model:  model structure
+%   model: model structure which contains
+%       model.s: k x 1 start probability vector
+%       model.A: k x k transition matrix
+%       model.E: k x d emission matrix
 % Output:
 %   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:t})
-%   enery: loglikelihood
+%   llh: loglikelihood
 % Written by Mo Chen (sth4nth@gmail.com).
+s = model.s;
 A = model.A;
 E = model.E;
-s = model.s;
 
 n = size(x,2);
-d = max(x);
-X = sparse(x,1:n,1,d,n);
+X = sparse(x,1:n,1);
 M = E*X;
-[alpha, energy] = hmmFilter_(M, A, s);
\ No newline at end of file
+
+[K,T] = size(M);
+At = A';
+llh = zeros(1,T);
+alpha = zeros(K,T);
+[alpha(:,1),llh(1)] = normalize(s.*M(:,1),1);
+for t = 2:T
+    [alpha(:,t),llh(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);    % 13.59
+end
+llh = sum(log(llh(llh>0)));
\ No newline at end of file
diff --git a/chapter13/HMM/hmmFilter_.m b/chapter13/HMM/hmmFilter_.m
deleted file mode 100644
index ffd8fcc..0000000
--- a/chapter13/HMM/hmmFilter_.m
+++ /dev/null
@@ -1,19 +0,0 @@
-function [alpha, energy] = hmmFilter_(M, A, s)
-% Implmentation function of HMM forward filtering algorithm.
-% Input:
-%   M: k x n emmision data matrix M=E*X
-%   A: k x k transition matrix
-%   s: k x 1 starting probability (prior)
-% Output:
-%   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:t})
-%   enery: loglikelihood
-% Written by Mo Chen (sth4nth@gmail.com).
-[K,T] = size(M);
-At = A';
-energy = zeros(1,T);
-alpha = zeros(K,T);
-[alpha(:,1),energy(1)] = normalize(s.*M(:,1),1);
-for t = 2:T
-    [alpha(:,t),energy(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);    % 13.59
-end
-energy = sum(log(energy(energy>0)));
\ No newline at end of file
diff --git a/chapter13/HMM/hmmRecSmoother_.m b/chapter13/HMM/hmmRecSmoother_.m
deleted file mode 100644
index 8c4139b..0000000
--- a/chapter13/HMM/hmmRecSmoother_.m
+++ /dev/null
@@ -1,23 +0,0 @@
-function [ gamma, c ] = hmmRecSmoother_( M, A, s )
-% Forward-backward (recursive gamma no alpha-beta) alogrithm for HMM to compute posterior p(z_i|x)
-% Input:
-%   x: 1xn observation
-%   s: kx1 starting probability of p(z_1|s)
-%   A: kxk transition probability
-%   E: kxd emission probability
-% Output:
-%   gamma: 1xn posterier p(z_i|x)
-%   llh: loglikelihood or evidence lnp(x)
-% Written by Mo Chen sth4nth@gmail.com
-[K,T] = size(M);
-At = A';
-c = zeros(1,T); % normalization constant
-gamma = zeros(K,T);
-[gamma(:,1),c(1)] = normalize(s.*M(:,1),1);
-for t = 2:T
-    [gamma(:,t),c(t)] = normalize((At*gamma(:,t-1)).*M(:,t),1);  % 13.59
-end
-for t = T-1:-1:1
-    gamma(:,t) = normalize(bsxfun(@times,A,gamma(:,t)),1)*gamma(:,t+1);
-end
-
diff --git a/chapter13/HMM/hmmSmoother.m b/chapter13/HMM/hmmSmoother.m
index aa904ed..01bbdac 100644
--- a/chapter13/HMM/hmmSmoother.m
+++ b/chapter13/HMM/hmmSmoother.m
@@ -1,24 +1,37 @@
-function [gamma, alpha, beta, c] = hmmSmoother(x, model)
-% HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm). This is a wrapper function which transform input and call underlying algorithm
-% Unlike the method described in the book of PRML, the alpha and beta
-% returned is the normalized.
-% Computing unnormalized version alpha and beta is numerical unstable, which grows exponential fast to infinity.
+function [gamma, alpha, beta, c] = hmmSmoother0(model, x)
+% HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm).
+% The alpha and beta returned by this function are the normalized version.
 % Input:
 %   x: 1 x n integer vector which is the sequence of observations
-%   model:  model structure
+%   model: model structure which contains
+%       model.s: k x 1 start probability vector
+%       model.A: k x k transition matrix
+%       model.E: k x d emission matrix
 % Output:
 %   gamma: k x n matrix of posterior gamma(t)=p(z_t,x_{1:T})
 %   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:T})
 %   beta: k x n matrix of posterior beta(t)=gamma(t)/alpha(t)
-%   c: loglikelihood
+%   c: 1 x n normalization constant vector
 % Written by Mo Chen (sth4nth@gmail.com).
+s = model.s;
 A = model.A;
 E = model.E;
-s = model.s;
 
 n = size(x,2);
-d = max(x);
-X = sparse(x,1:n,1,d,n);
+X = sparse(x,1:n,1);
 M = E*X;
-[gamma, alpha, beta, c] = hmmSmoother_(M, A, s);
-% [gamma,c] = hmmRecSmoother_(M, A, s);
\ No newline at end of file
+
+[K,T] = size(M);
+At = A';
+c = zeros(1,T); % normalization constant
+alpha = zeros(K,T);
+[alpha(:,1),c(1)] = normalize(s.*M(:,1),1);
+for t = 2:T
+    [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);  % 13.59
+end
+beta = ones(K,T);
+for t = T-1:-1:1
+    beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1);   % 13.62
+end
+gamma = alpha.*beta;                  % 13.64
+
diff --git a/chapter13/HMM/hmmSmoother_.m b/chapter13/HMM/hmmSmoother_.m
deleted file mode 100644
index f6d2a71..0000000
--- a/chapter13/HMM/hmmSmoother_.m
+++ /dev/null
@@ -1,29 +0,0 @@
-function [gamma, alpha, beta, c] = hmmSmoother_(M, A, s)
-% Implmentation function HMM smoothing alogrithm.
-% Unlike the method described in the book of PRML, the alpha and beta
-% returned is the normalized.
-% Computing unnormalized version alpha and beta is numerical unstable, which grows exponential fast to infinity.
-% Input:
-%   M: k x n emmision data matrix M=E*X
-%   A: k x k transition matrix
-%   s: k x 1 start prior probability
-% Output:
-%   gamma: k x n matrix of posterior gamma(t)=p(z_t,x_{1:T})
-%   alpha: k x n matrix of posterior alpha(t)=p(z_t|x_{1:T})
-%   beta: k x n matrix of posterior beta(t)=gamma(t)/alpha(t)
-%   c: loglikelihood
-% Written by Mo Chen (sth4nth@gmail.com).
-[K,T] = size(M);
-At = A';
-c = zeros(1,T); % normalization constant
-alpha = zeros(K,T);
-[alpha(:,1),c(1)] = normalize(s.*M(:,1),1);
-for t = 2:T
-    [alpha(:,t),c(t)] = normalize((At*alpha(:,t-1)).*M(:,t),1);  % 13.59
-end
-beta = ones(K,T);
-for t = T-1:-1:1
-    beta(:,t) = A*(beta(:,t+1).*M(:,t+1))/c(t+1);   % 13.62
-end
-gamma = alpha.*beta;                  % 13.64
-
diff --git a/chapter13/HMM/hmmViterbi.m b/chapter13/HMM/hmmViterbi.m
index 1f76a01..44ae94b 100644
--- a/chapter13/HMM/hmmViterbi.m
+++ b/chapter13/HMM/hmmViterbi.m
@@ -1,19 +1,31 @@
-function [z, llh] = hmmViterbi(x, model)
-% Viterbi algorithm calculated in log scale to improve numerical stability.
-% This is a wrapper function which transform input and call underlying algorithm
+function [z, llh] = hmmViterbi(model, x)
+% Viterbi algorithm (calculated in log scale to improve numerical stability).
 % Input:
 %   x: 1 x n integer vector which is the sequence of observations
-%   model:  model structure
+%   model: model structure which contains
+%       model.s: k x 1 start probability vector
+%       model.A: k x k transition matrix
+%       model.E: k x d emission matrix
 % Output:
 %   z: 1 x n latent state
 %   llh:  loglikelihood
 % Written by Mo Chen (sth4nth@gmail.com).
-A = model.A;
-E = model.E;
-s = model.s;
-
 n = size(x,2);
-d = max(x);
-X = sparse(x,1:n,1,d,n);
-M = E*X;
-[z,llh] = hmmViterbi_(M, A, s);
+X = sparse(x,1:n,1);
+s = log(model.s);
+A = log(model.A);
+M = log(model.E*X);
+
+k = numel(s);
+Z = zeros(k,n);
+Z(:,1) = 1:k;
+v = s(:)+M(:,1);
+for t = 2:n
+    [v,idx] = max(bsxfun(@plus,A,v),[],1);    % 13.68
+    v = v(:)+M(:,t);
+    Z = Z(idx,:);
+    Z(:,t) = 1:k;
+end
+[llh,idx] = max(v);
+z = Z(idx,:);
+
diff --git a/chapter13/HMM/hmmViterbi_.m b/chapter13/HMM/hmmViterbi_.m
deleted file mode 100644
index 07480be..0000000
--- a/chapter13/HMM/hmmViterbi_.m
+++ /dev/null
@@ -1,25 +0,0 @@
-function [z, llh] = hmmViterbi_(M, A, s)
-% Implmentation function of Viterbi algorithm. 
-% Input:
-%   M: k x n emmision data matrix M=E*X
-%   A: k x k transition matrix
-%   s: k x 1 starting probability (prior)
-% Output:
-%   z: 1 x n latent state
-%   llh:  loglikelihood
-% Written by Mo Chen (sth4nth@gmail.com).
-[k,n] = size(M);
-Z = zeros(k,n);
-A = log(A);
-M = log(M);
-Z(:,1) = 1:k;
-v = log(s(:))+M(:,1);
-for t = 2:n
-    [v,idx] = max(bsxfun(@plus,A,v),[],1);    % 13.68
-    v = v(:)+M(:,t);
-    Z = Z(idx,:);
-    Z(:,t) = 1:k;
-end
-[llh,idx] = max(v);
-z = Z(idx,:);
-
diff --git a/demo/ch13/hmm_demo.m b/demo/ch13/hmm_demo.m
index 58156b8..66e994f 100644
--- a/demo/ch13/hmm_demo.m
+++ b/demo/ch13/hmm_demo.m
@@ -3,13 +3,12 @@
 d = 3;
 k = 2;
 n = 10000;
-[x, model] = hmmRnd(d, k, n);
-%%
-[z,p] = hmmViterbi(x,model);
-%%
-[alpha,llh] = hmmFilter(x,model);
-%%
-[gamma,alpha,beta,c] = hmmSmoother(x,model);
-%%
-[model, llh] = hmmEm(x,k);
+%% Viterbi algorithm
+[z, llh] = hmmViterbi(model, x);
+%% HMM filter (forward algorithm)
+[alpha, llh] = hmmFilter(model, x);
+%% HMM smoother (forward backward)
+[gamma,alpha,beta,c] = hmmSmoother(model, x);
+%% Baum-Welch algorithm
+[model, llh] = hmmEm(x,init);
 plot(llh)

From 0d88ef5e06ba5dd492917e9cff6e7374c8c6eb88 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 9 Mar 2017 23:54:39 +0800
Subject: [PATCH 008/119] minor fix for function names of HMM

---
 chapter13/HMM/hmmFilter.m   | 2 +-
 chapter13/HMM/hmmSmoother.m | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter13/HMM/hmmFilter.m b/chapter13/HMM/hmmFilter.m
index c6fd1da..dad93a2 100644
--- a/chapter13/HMM/hmmFilter.m
+++ b/chapter13/HMM/hmmFilter.m
@@ -1,4 +1,4 @@
-function [alpha, llh] = hmmFilter0(model, x)
+function [alpha, llh] = hmmFilter(model, x)
 % HMM forward filtering algorithm. 
 % The alpha returned by this function is the normalized version (posterior): alpha(t)=p(z_t|x_{1:t})
 % Unnormalized version (joint distribution): alpha(t)=p(z_t,x_{1:t}) is numerical unstable.
diff --git a/chapter13/HMM/hmmSmoother.m b/chapter13/HMM/hmmSmoother.m
index 01bbdac..fb97ec5 100644
--- a/chapter13/HMM/hmmSmoother.m
+++ b/chapter13/HMM/hmmSmoother.m
@@ -1,4 +1,4 @@
-function [gamma, alpha, beta, c] = hmmSmoother0(model, x)
+function [gamma, alpha, beta, c] = hmmSmoother(model, x)
 % HMM smoothing alogrithm (normalized forward-backward or normalized alpha-beta algorithm).
 % The alpha and beta returned by this function are the normalized version.
 % Input:

From ccedd275ad070e3b89662bc1a8dc35657e4f6083 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 00:20:53 +0800
Subject: [PATCH 009/119] update LDS

---
 chapter13/LDS/TODO.txt         |  2 --
 chapter13/LDS/kalmanFilter.m   |  8 +++++---
 chapter13/LDS/kalmanSmoother.m |  6 ++++--
 chapter13/LDS/ldsEm.m          | 21 ++++++++++++++++++---
 demo/ch13/lds_demo.m           | 10 +++++-----
 5 files changed, 32 insertions(+), 15 deletions(-)
 delete mode 100644 chapter13/LDS/TODO.txt

diff --git a/chapter13/LDS/TODO.txt b/chapter13/LDS/TODO.txt
deleted file mode 100644
index 19e6c48..0000000
--- a/chapter13/LDS/TODO.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-(1) test against matlab implementation of kalman filter
-(2) simplify ldsEm with less parameters (G=diag(g), S=I) 
diff --git a/chapter13/LDS/kalmanFilter.m b/chapter13/LDS/kalmanFilter.m
index 83cf979..0005ee6 100644
--- a/chapter13/LDS/kalmanFilter.m
+++ b/chapter13/LDS/kalmanFilter.m
@@ -1,5 +1,7 @@
-function [mu, V, llh] = kalmanFilter(X, model)
-% Kalman filter 
+function [mu, V, llh] = kalmanFilter(model, X)
+% Kalman filter (forward algorithm for linear dynamic system)
+% NOTE: This is the exact implementation of the Kalman filter algorithm in PRML.
+% However, this algorithm is not practical. It is numerical unstable. 
 % Input:
 %   X: d x n data matrix
 %   model: model structure
@@ -23,7 +25,7 @@
 I = eye(k);
 
 PC = P*C';
-R = (C*PC+S);
+R = C*PC+S;
 K = PC/R;                                        % 13.97
 mu(:,1) = mu0+K*(X(:,1)-C*mu0);                     % 13.94
 V(:,:,1) = (I-K*C)*P;                               % 13.95
diff --git a/chapter13/LDS/kalmanSmoother.m b/chapter13/LDS/kalmanSmoother.m
index c0aa02a..8254230 100644
--- a/chapter13/LDS/kalmanSmoother.m
+++ b/chapter13/LDS/kalmanSmoother.m
@@ -1,5 +1,7 @@
-function [nu, U, Ezz, Ezy, llh] = kalmanSmoother(X, model)
+function [nu, U, Ezz, Ezy, llh] = kalmanSmoother(model, X)
 % Kalman smoother (forward-backward algorithm for linear dynamic system)
+% NOTE: This is the exact implementation of the Kalman smoother algorithm in PRML.
+% However, this algorithm is not practical. It is numerical unstable. 
 % Input:
 %   X: d x n data matrix
 %   model: model structure
@@ -28,7 +30,7 @@
 
 % forward
 PC = P0*C';
-R = (C*PC+S);
+R = C*PC+S;
 K = PC/R;
 mu(:,1) = mu0+K*(X(:,1)-C*mu0);
 V(:,:,1) = (I-K*C)*P0;
diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index e9d548b..7f283e4 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -1,5 +1,8 @@
-function [model, llh] = ldsEm(X, model)
+function [model, llh] = ldsEm(X, init)
 % EM algorithm for parameter estimation of linear dynamic system.
+% NOTE: This is the exact implementation of the EM algorithm in PRML.
+% However, this algorithm is not practical. It is numerical unstable and 
+% there is too much redundant degree of freedom. 
 % Input:
 %   X: d x n data matrix
 %   model: prior model structure
@@ -7,12 +10,24 @@
 %   model: trained model structure
 %   llh: loglikelihood
 % Written by Mo Chen (sth4nth@gmail.com).
-tol = 1e-4;
+d = size(X,1);
+if isstruct(init)   % init with a model
+    model = init;
+elseif numel(init) == 1  % random init with latent k
+    k = init;
+    model.A = randn(k,k);
+    model.G = iwishrnd(eye(k),k);
+    model.C = randn(d,k);
+    model.S = iwishrnd(eye(d),d);
+    model.mu0 = randn(k,1);
+    model.P0 = iwishrnd(eye(k),k);
+end
+tol = 1e-2;
 maxIter = 100;
 llh = -inf(1,maxIter);
 for iter = 2:maxIter
 %     E-step
-    [nu, U, Ezz, Ezy, llh(iter)] = kalmanSmoother(X, model);
+    [nu, U, Ezz, Ezy, llh(iter)] = kalmanSmoother(model,X);
     if llh(iter)-llh(iter-1) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
 %     M-step 
     model = maximization(X, nu, U, Ezz, Ezy);
diff --git a/demo/ch13/lds_demo.m b/demo/ch13/lds_demo.m
index 4181123..8c0b30e 100644
--- a/demo/ch13/lds_demo.m
+++ b/demo/ch13/lds_demo.m
@@ -6,9 +6,9 @@
 n = 100;
  
 [X,Z,model] = ldsRnd(d,k,n);
-[mu, V, llh] = kalmanFilter(X, model);
-
-[nu, U, Ezz, Ezy, llh] = kalmanSmoother(X, model);
-[model, llh] = ldsEm(X, model);
-plot(llh);
+[mu, V, llh] = kalmanFilter(model, X);
 
+[nu, U, Ezz, Ezy, llh] = kalmanSmoother(model, X);
+% [model, llh] = ldsEm(X,k);
+% plot(llh);
+% 

From 88f8e5a29248a646cdddbbd69a0aa1a9ad200295 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 00:24:18 +0800
Subject: [PATCH 010/119] minor fix

---
 chapter09/mixGaussPred.m    | 2 +-
 demo/ch09/mixGaussEm_demo.m | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter09/mixGaussPred.m b/chapter09/mixGaussPred.m
index ebebaa0..f614081 100644
--- a/chapter09/mixGaussPred.m
+++ b/chapter09/mixGaussPred.m
@@ -1,4 +1,4 @@
-function [label, R] = mixGaussPred(X, model)
+function [label, R] = mixGaussPred(model, X)
 % Predict label and responsibility for Gaussian mixture model.
 % Input:
 %   X: d x n data matrix
diff --git a/demo/ch09/mixGaussEm_demo.m b/demo/ch09/mixGaussEm_demo.m
index 95a14bb..a6ab295 100644
--- a/demo/ch09/mixGaussEm_demo.m
+++ b/demo/ch09/mixGaussEm_demo.m
@@ -16,6 +16,6 @@
 figure;
 plotClass(X1,z1);
 % predict
-z2 = mixGaussPred(X2,model);
+z2 = mixGaussPred(model,X2);
 figure;
 plotClass(X2,z2);
\ No newline at end of file

From 170e45b05df2a7df180ccbcdb60b6ba507154ab6 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 00:26:06 +0800
Subject: [PATCH 011/119] update todo

---
 TODO.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 4b7292b..de2a55b 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,3 +1 @@
-TODO: 
-ch13: LDS numerical stability (numerical stable (square root) version of Kalman filter and smoother)
-ch05: MLP bias and gradient unit
+ch05: MLP bias and gradient unit (2nd order)

From 018fe824d96943a50c4a6b3adc0e4b7564675192 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 00:34:13 +0800
Subject: [PATCH 012/119] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index d87c6e7..a8c7b35 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@ Introduction
 This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
+Note: this package requires Matlab R2016b or later, since it utilizes a new syntax of Matlab.
+
 Description
 -------
 The design goal of the code are as follows:

From 362845e062f61749775fc80821bf0166482a540c Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 02:03:36 +0800
Subject: [PATCH 013/119] add sample function of mixture model from prior

---
 chapter11/GaussWishart.m    | 15 +++++++++++++++
 chapter11/mixDpGb.m         |  4 ++--
 chapter11/mixGaussSample.m  | 18 ++++++++++++++++++
 demo/ch11/mixGaussGb_demo.m | 13 +++++++++----
 4 files changed, 44 insertions(+), 6 deletions(-)
 create mode 100644 chapter11/mixGaussSample.m

diff --git a/chapter11/GaussWishart.m b/chapter11/GaussWishart.m
index b802718..a867735 100644
--- a/chapter11/GaussWishart.m
+++ b/chapter11/GaussWishart.m
@@ -20,6 +20,10 @@
          function obj = clone(obj)
          end
          
+         function d = dim(obj)
+             d = numel(obj.m_);
+         end
+         
          function obj = addData(obj, X)
              kappa0 = obj.kappa_;
              m0 = obj.m_;
@@ -89,5 +93,16 @@
              c = gammaln((v+d)/2)-gammaln(v/2)-(d*log(v*pi)+2*sum(log(diag(U))))/2;
              y = c+o;
          end
+         
+         function [mu, Sigma] = sample(obj)
+%              Sample a Gaussian distribution from GaussianWishart prior
+             kappa = obj.kappa_;
+             m = obj.m_;
+             nu = obj.nu_;
+             U = obj.U_;
+             
+             Sigma = iwishrnd(U'*U,nu);
+             mu = gaussRnd(m,Sigma/kappa);
+         end
      end
 end
diff --git a/chapter11/mixDpGb.m b/chapter11/mixDpGb.m
index 4396c7b..e0e3ba2 100644
--- a/chapter11/mixDpGb.m
+++ b/chapter11/mixDpGb.m
@@ -14,7 +14,7 @@
 n = size(X,2);
 [label,Theta,w] = mixDpGbOl(X,alpha,theta);
 nk = n*w;
-maxIter = 200;
+maxIter = 50;
 llh = zeros(1,maxIter);
 for iter = 1:maxIter
     for i = randperm(n)
@@ -34,7 +34,7 @@
         llh(iter) = llh(iter)+sum(p-log(n));
         k = discreteRnd(exp(p-logsumexp(p)));
         if k == numel(Theta)+1                 % add extra cluster
-            Theta{k} = theta.clone.addSample(x);
+            Theta{k} = theta.clone().addSample(x);
             nk = [nk,1];
         else
             Theta{k} = Theta{k}.addSample(x);
diff --git a/chapter11/mixGaussSample.m b/chapter11/mixGaussSample.m
new file mode 100644
index 0000000..44d9aa5
--- /dev/null
+++ b/chapter11/mixGaussSample.m
@@ -0,0 +1,18 @@
+function [X, z] = mixGaussSample(Theta, w, n )
+% Generate samples from a Gaussian mixture model with GaussianWishart prior.
+% Input:
+%   Theta: cell of GaussianWishart priors of components
+%   w: weight of components
+%   n: number of data
+% Output:
+%   X: d x n data matrix
+%   z: 1 x n response variable
+% Written by Mo Chen (sth4nth@gmail.com).
+z = discreteRnd(w,n);
+d = Theta{1}.dim();
+X = zeros(d,n);
+for i = 1:numel(w)
+    idx = z==i;
+    [mu,Sigma] = Theta{i}.sample(); % invpd(wishrnd(W0,v0));
+    X(:,idx) = gaussRnd(mu,Sigma,sum(idx));
+end
diff --git a/demo/ch11/mixGaussGb_demo.m b/demo/ch11/mixGaussGb_demo.m
index 326c71c..9f8154f 100644
--- a/demo/ch11/mixGaussGb_demo.m
+++ b/demo/ch11/mixGaussGb_demo.m
@@ -3,9 +3,14 @@
 d = 2;
 k = 3;
 n = 500;
-[X,label] = mixGaussRnd(d,k,n);
-plotClass(X,label);
+[X,z] = mixGaussRnd(d,k,n);
+plotClass(X,z);
 
-[y,model] = mixGaussGb(X);
+[z,Theta,w,llh] = mixGaussGb(X);
 figure
-plotClass(X,y);
\ No newline at end of file
+plotClass(X,z);
+
+[X,z] = mixGaussSample(Theta,w,n);
+figure
+plotClass(X,z);
+

From 5074a1122cf75804ce7178d2d7654a976a34653d Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 02:06:51 +0800
Subject: [PATCH 014/119] remove todo.txt

---
 TODO.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 TODO.txt

diff --git a/TODO.txt b/TODO.txt
deleted file mode 100644
index de2a55b..0000000
--- a/TODO.txt
+++ /dev/null
@@ -1 +0,0 @@
-ch05: MLP bias and gradient unit (2nd order)

From 9e70e291a080b4465c32fadf4232209b409efb6e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 02:11:42 +0800
Subject: [PATCH 015/119] minor tweak logistic regression

---
 chapter04/logitBinPred.m | 2 +-
 chapter04/logitMnPred.m  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/chapter04/logitBinPred.m b/chapter04/logitBinPred.m
index 01ee10b..14cbed9 100644
--- a/chapter04/logitBinPred.m
+++ b/chapter04/logitBinPred.m
@@ -9,6 +9,6 @@
 % Written by Mo Chen (sth4nth@gmail.com).
 X = [X;ones(1,size(X,2))];
 w = model.w;
-p = exp(-log1pexp(-w'*X)); 
+p = sigmoid(w'*X);
 y = round(p);
 
diff --git a/chapter04/logitMnPred.m b/chapter04/logitMnPred.m
index 60010e0..f30db00 100644
--- a/chapter04/logitMnPred.m
+++ b/chapter04/logitMnPred.m
@@ -9,6 +9,5 @@
 % Written by Mo Chen (sth4nth@gmail.com).
 W = model.W;
 X = [X; ones(1,size(X,2))];
-A = W'*X;                                   
-P = exp(bsxfun(@minus,A,logsumexp(A,1)));  
+P = softmax(W'*X);
 [~, y] = max(P,[],1);
\ No newline at end of file

From 4db85d6585df4072b04431b0772541049fb6b835 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 15:51:05 +0800
Subject: [PATCH 016/119] fix hmm demo

---
 chapter13/HMM/hmmEm.m | 2 +-
 demo/ch13/hmm_demo.m  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/chapter13/HMM/hmmEm.m b/chapter13/HMM/hmmEm.m
index ef5829e..455106b 100644
--- a/chapter13/HMM/hmmEm.m
+++ b/chapter13/HMM/hmmEm.m
@@ -32,7 +32,7 @@
 %     M-step 
     s = gamma(:,1);                                                                             % 13.18
     A = normalize(A.*(alpha(:,1:n-1)*(beta(:,2:n).*M(:,2:n)./c(2:n))'),2);      % 13.19 13.43 13.65
-    E = bsxfun(@times,gamma*X',1./sum(gamma,2));                                                 % 13.23
+    E = (gamma*X')./sum(gamma,2);                            % 13.23
 end
 model.s = s;
 model.A = A;
diff --git a/demo/ch13/hmm_demo.m b/demo/ch13/hmm_demo.m
index 66e994f..59e68a5 100644
--- a/demo/ch13/hmm_demo.m
+++ b/demo/ch13/hmm_demo.m
@@ -3,6 +3,7 @@
 d = 3;
 k = 2;
 n = 10000;
+[x,model] = hmmRnd(d,k,n);
 %% Viterbi algorithm
 [z, llh] = hmmViterbi(model, x);
 %% HMM filter (forward algorithm)
@@ -10,5 +11,5 @@
 %% HMM smoother (forward backward)
 [gamma,alpha,beta,c] = hmmSmoother(model, x);
 %% Baum-Welch algorithm
-[model, llh] = hmmEm(x,init);
+[model, llh] = hmmEm(x,2);
 plot(llh)

From 63b08a17d9c0a663c2eee106bfc69e33c4aedc20 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 15:51:50 +0800
Subject: [PATCH 017/119] fix hmm demo

---
 demo/ch13/hmm_demo.m | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/demo/ch13/hmm_demo.m b/demo/ch13/hmm_demo.m
index 59e68a5..025f009 100644
--- a/demo/ch13/hmm_demo.m
+++ b/demo/ch13/hmm_demo.m
@@ -1,8 +1,5 @@
 % demos for HMM in ch13
-
-d = 3;
-k = 2;
-n = 10000;
+d = 3; k = 2; n = 10000;
 [x,model] = hmmRnd(d,k,n);
 %% Viterbi algorithm
 [z, llh] = hmmViterbi(model, x);
@@ -11,5 +8,5 @@
 %% HMM smoother (forward backward)
 [gamma,alpha,beta,c] = hmmSmoother(model, x);
 %% Baum-Welch algorithm
-[model, llh] = hmmEm(x,2);
+[model, llh] = hmmEm(x,k);
 plot(llh)

From df0f61ae34b78daf07282c36a1c74ecbb4d32140 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 10 Mar 2017 16:29:28 +0800
Subject: [PATCH 018/119] working on mlp

---
 chapter05/mlp.m | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
index 65c6467..294443f 100644
--- a/chapter05/mlp.m
+++ b/chapter05/mlp.m
@@ -1,5 +1,5 @@
-function [model, mse] = mlp(X, Y, h)
-% Multilayer perceptron
+function [model, mse] = mlp(X, Y, h, eta)
+% Train a multilayer perceptron neural network
 % Input:
 %   X: d x n data matrix
 %   Y: p x n response matrix
@@ -8,6 +8,9 @@
 %   model: model structure
 %   mse: mean square error
 % Written by Mo Chen (sth4nth@gmail.com).
+if nargin < 4
+    eta = 1/size(X,2);
+end
 h = [size(X,1);h(:);size(Y,1)];
 L = numel(h);
 W = cell(L-1);
@@ -16,8 +19,7 @@
 end
 Z = cell(L);
 Z{1} = X;
-eta = 1/size(X,2);
-maxiter = 2000;
+maxiter = 200;
 mse = zeros(1,maxiter);
 for iter = 1:maxiter
 %     forward
@@ -26,7 +28,7 @@
     end
 %     backward
     E = Y-Z{L};
-    mse(iter) = mean(dot(E(:),E(:)));
+    mse(iter) = mean(E.*E);
     for l = L-1:-1:1
         df = Z{l+1}.*(1-Z{l+1});
         dG = df.*E;

From 3979a238b0be614545e057a775849cba83d8ce10 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 01:17:42 +0800
Subject: [PATCH 019/119] minor fix

---
 chapter05/mlp.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
index 294443f..e19105c 100644
--- a/chapter05/mlp.m
+++ b/chapter05/mlp.m
@@ -28,7 +28,7 @@
     end
 %     backward
     E = Y-Z{L};
-    mse(iter) = mean(E.*E);
+    mse(iter) =  mean(dot(E,E),1);
     for l = L-1:-1:1
         df = Z{l+1}.*(1-Z{l+1});
         dG = df.*E;

From 3ca6be8655e249581e8f9774ec7f3a6dfc6e1f10 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 15:03:03 +0800
Subject: [PATCH 020/119] tweak kmeans

---
 chapter06/knKmeans.m    | 16 +++++++---------
 chapter09/kmeans.m      | 20 +++++++++-----------
 chapter09/kmeansPred.m  |  8 ++++----
 demo/ch06/knLin_demo.m  |  4 ++--
 demo/ch09/kmeans_demo.m |  2 +-
 5 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 62d4c5c..2796aba 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -1,5 +1,5 @@
-function [label, energy, model] = knKmeans(X, init, kn)
-% Perform kernel k-means clustering.
+function [label, model, energy] = knKmeans(X, init, kn)
+% Perform kernel kmeans clustering.
 % Input:
 %   K: n x n kernel matrix
 %   init: either number of clusters (k) or initial label (1xn)
@@ -21,15 +21,13 @@
     kn = @knGauss;
 end
 K = kn(X,X);
-last = 0;
+last = zeros(1,n);
 while any(label ~= last)
-    [u,~,label(:)] = unique(label);   % remove empty clusters
-    k = numel(u);
-    E = sparse(label,1:n,1,k,n,n);
-    E = spdiags(1./sum(E,2),0,k,k)*E;
+    [~,~,last(:)] = unique(label);   % remove empty clusters
+    E = sparse(last,1:n,1);
+    E = E./sum(E,2);
     T = E*K;
-    last = label;
-    [val, label] = max(bsxfun(@minus,T,diag(T*E')/2),[],1);
+    [val, label] = max(T-diag(T*E')/2,[],1);
 end
 energy = trace(K)-2*sum(val); 
 if nargout == 3
diff --git a/chapter09/kmeans.m b/chapter09/kmeans.m
index f5175be..29a2e6b 100644
--- a/chapter09/kmeans.m
+++ b/chapter09/kmeans.m
@@ -1,5 +1,5 @@
-function [label, energy, model] = kmeans(X, init)
-% Perform k-means clustering.
+function [label, m, energy] = kmeans(X, init)
+% Perform kmeans clustering.
 % Input:
 %   X: d x n data matrix
 %   init: k number of clusters or label (1 x n vector)
@@ -9,20 +9,18 @@
 %   model: trained model structure
 % Written by Mo Chen (sth4nth@gmail.com).
 n = size(X,2);
+idx = 1:n;
+last = zeros(1,n);
 if numel(init)==1
     k = init;
     label = ceil(k*rand(1,n));
 elseif numel(init)==n
     label = init;
 end
-last = 0;
 while any(label ~= last)
-    [u,~,label(:)] = unique(label);   % remove empty clusters
-    k = numel(u);
-    E = sparse(1:n,label,1,n,k,n);  % transform label into indicator matrix
-    m = X*(E*spdiags(1./sum(E,1)',0,k,k));    % compute centers 
-    last = label;
-    [val,label] = max(bsxfun(@minus,m'*X,dot(m,m,1)'/2),[],1); % assign labels
+    [~,~,last(:)] = unique(label);   % remove empty clusters
+    E = sparse(idx,last,1);  % transform label into indicator matrix
+    m = X*(E./sum(E,1));    % compute centers 
+    [val,label] = min(dot(m,m,1)'/2-m'*X,[],1); % assign labels
 end
-energy = dot(X(:),X(:))-2*sum(val); 
-model.means = m;
\ No newline at end of file
+energy = dot(X(:),X(:),1)+2*sum(val);
\ No newline at end of file
diff --git a/chapter09/kmeansPred.m b/chapter09/kmeansPred.m
index fc71464..83dc633 100644
--- a/chapter09/kmeansPred.m
+++ b/chapter09/kmeansPred.m
@@ -1,11 +1,11 @@
-function [label, energy] = kmeansPred(model, Xt)
+function [label, energy] = kmeansPred(m, X)
 % Prediction for kmeans clusterng
 % Input:
-%   model: trained model structure
-%   Xt: d x n testing data
+%   m: d x k cluster center matrix
+%   X: d x n testing data
 % Output:
 %   label: 1 x n cluster label
 %   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
-[val,label] = min(sqdist(model.means, Xt));
+[val,label] = min(dot(X,X,1)+dot(m,m,1)'-2*m'*X,[],1); % assign labels
 energy = sum(val);
\ No newline at end of file
diff --git a/demo/ch06/knLin_demo.m b/demo/ch06/knLin_demo.m
index 1073a74..9ae12b3 100644
--- a/demo/ch06/knLin_demo.m
+++ b/demo/ch06/knLin_demo.m
@@ -25,8 +25,8 @@
 n = 500;
 [X,y] = kmeansRnd(d,k,n);
 init = ceil(k*rand(1,n));
-[y_kn,en_kn,model_kn] = knKmeans(X,init,@knLin);
-[y_lin,en_lin,model_lin] = kmeans(X,init);
+[y_kn,model_kn,en_kn] = knKmeans(X,init,@knLin);
+[y_lin,model_lin,en_lin] = kmeans(X,init);
 
 idx = 1:2:n;
 Xt = X(:,idx);
diff --git a/demo/ch09/kmeans_demo.m b/demo/ch09/kmeans_demo.m
index 3083b94..4c22e94 100644
--- a/demo/ch09/kmeans_demo.m
+++ b/demo/ch09/kmeans_demo.m
@@ -3,7 +3,7 @@
 k = 3;
 n = 5000;
 [X,label] = kmeansRnd(d,k,n);
-y = kmeans(X,k);
+y = litekmeans(X,k);
 plotClass(X,label);
 figure;
 plotClass(X,y);

From e0593a1f6ba897e5dc38abb41ded6b62c407545c Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 15:09:24 +0800
Subject: [PATCH 021/119] tweak kmedoids

---
 chapter09/kmedoids.m      | 15 +++++++--------
 demo/ch09/kmedoids_demo.m |  4 +++-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/chapter09/kmedoids.m b/chapter09/kmedoids.m
index e633a1f..74d269c 100644
--- a/chapter09/kmedoids.m
+++ b/chapter09/kmedoids.m
@@ -1,12 +1,12 @@
-function [label, energy, index] = kmedoids(X, init)
+function [label, index, energy] = kmedoids0(X, init)
 % Perform k-medoids clustering.
 % Input:
 %   X: d x n data matrix
 %   init: k number of clusters or label (1 x n vector)
 % Output:
 %   label: 1 x n cluster label
-%   energy: optimization target value
 %   index: index of medoids
+%   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
 [d,n] = size(X);
 if numel(init)==1
@@ -15,15 +15,14 @@
 elseif numel(init)==n
     label = init;
 end
-X = bsxfun(@minus,X,mean(X,2));             % reduce chance of numerical problems
+X = X-mean(X,2);             % reduce chance of numerical problems
 v = dot(X,X,1);
-D = bsxfun(@plus,v,v')-2*(X'*X);            % Euclidean distance matrix
+D = v+v'-2*(X'*X);            % Euclidean distance matrix
 D(sub2ind([d,d],1:d,1:d)) = 0;              % reduce chance of numerical problems
-last = 0;
+last = zeros(1,n);
 while any(label ~= last)
-    [u,~,label(:)] = unique(label);   % remove empty clusters
-    [~, index] = min(D*sparse(1:n,label,1,n,numel(u),n),[],1);  % find k medoids
-    last = label;
+    [~,~,last(:)] = unique(label);   % remove empty clusters
+    [~, index] = min(D*sparse(1:n,last,1),[],1);  % find k medoids
     [val, label] = min(D(index,:),[],1);                % assign labels
 end
 energy = sum(val);
diff --git a/demo/ch09/kmedoids_demo.m b/demo/ch09/kmedoids_demo.m
index 90c764f..1f36b16 100644
--- a/demo/ch09/kmedoids_demo.m
+++ b/demo/ch09/kmedoids_demo.m
@@ -3,7 +3,9 @@
 k = 3;
 n = 5000;
 [X,label] = kmeansRnd(d,k,n);
-y = kmedoids(X,k);
+init = ceil(k*rand(1,n));
+[y, idx, v] = kmedoids(X,init);
 plotClass(X,label);
 figure;
 plotClass(X,y);
+

From 634f7b66f95e55a3747c9eef0b3aef7ebd96fda8 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 15:09:54 +0800
Subject: [PATCH 022/119] tweak kmedoids

---
 chapter09/kmedoids.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter09/kmedoids.m b/chapter09/kmedoids.m
index 74d269c..7499905 100644
--- a/chapter09/kmedoids.m
+++ b/chapter09/kmedoids.m
@@ -1,4 +1,4 @@
-function [label, index, energy] = kmedoids0(X, init)
+function [label, index, energy] = kmedoids(X, init)
 % Perform k-medoids clustering.
 % Input:
 %   X: d x n data matrix

From d854233615874f010766fd983cdb5570b1e84162 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 15:17:54 +0800
Subject: [PATCH 023/119] tweak knkmeans

---
 chapter06/knKmeans.m   |  2 +-
 demo/ch06/knLin_demo.m | 78 +++++++++++++++++++++---------------------
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 2796aba..80b6d5a 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -27,7 +27,7 @@
     E = sparse(last,1:n,1);
     E = E./sum(E,2);
     T = E*K;
-    [val, label] = max(T-diag(T*E')/2,[],1);
+    [val, label] = max(T-dot(T,E,2)/2,[],1);
 end
 energy = trace(K)-2*sum(val); 
 if nargout == 3
diff --git a/demo/ch06/knLin_demo.m b/demo/ch06/knLin_demo.m
index 9ae12b3..9259164 100644
--- a/demo/ch06/knLin_demo.m
+++ b/demo/ch06/knLin_demo.m
@@ -1,23 +1,23 @@
-%% Kernel regression with linear kernel is EQUIVALENT to linear regression
-clear; close all;
-n = 100;
-x = linspace(0,2*pi,n);   % test data
-t = sin(x)+rand(1,n)/2;
-
-lambda = 1e-4;
-model_kn = knReg(x,t,lambda,@knLin);
-model_lin = linReg(x,t,lambda);
-
-idx = 1:2:n;
-xt = x(:,idx);
-tt = t(idx);
-
-[y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt);
-[y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt);
-
-maxdiff(y_kn,y_lin)
-maxdiff(sigma_kn,sigma_lin)
-maxdiff(p_kn,p_lin)
+% %% Kernel regression with linear kernel is EQUIVALENT to linear regression
+% clear; close all;
+% n = 100;
+% x = linspace(0,2*pi,n);   % test data
+% t = sin(x)+rand(1,n)/2;
+% 
+% lambda = 1e-4;
+% model_kn = knReg(x,t,lambda,@knLin);
+% model_lin = linReg(x,t,lambda);
+% 
+% idx = 1:2:n;
+% xt = x(:,idx);
+% tt = t(idx);
+% 
+% [y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt);
+% [y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt);
+% 
+% maxdiff(y_kn,y_lin)
+% maxdiff(sigma_kn,sigma_lin)
+% maxdiff(p_kn,p_lin)
 %% Kernel kmeans with linear kernel is EQUIVALENT to kmeans
 clear; close all;
 d = 2;
@@ -40,22 +40,22 @@
 maxdiff(t_kn,t_lin)
 maxdiff(ent_kn,ent_lin)
 %% Kernel PCA with linear kernel is EQUIVALENT TO PCA
-clear; close all;
-d = 10;
-q = 2;
-n = 500;
-X = randn(d,n);
-
-
-model_kn = knPca(X,q,@knLin);
-idx = 1:2:n;
-Xt = X(:,idx);
-
-Y_kn = knPcaPred(model_kn,Xt);
-
-[U,L,mu,mse] = pca(X,q);
-Y_lin = U'*bsxfun(@minus,Xt,mu);   % projection
-
-
-R = Y_lin/Y_kn;    % the results are equivalent up to a rotation.
-maxdiff(R*R', eye(q))
+% clear; close all;
+% d = 10;
+% q = 2;
+% n = 500;
+% X = randn(d,n);
+% 
+% 
+% model_kn = knPca(X,q,@knLin);
+% idx = 1:2:n;
+% Xt = X(:,idx);
+% 
+% Y_kn = knPcaPred(model_kn,Xt);
+% 
+% [U,L,mu,mse] = pca(X,q);
+% Y_lin = U'*bsxfun(@minus,Xt,mu);   % projection
+% 
+% 
+% R = Y_lin/Y_kn;    % the results are equivalent up to a rotation.
+% maxdiff(R*R', eye(q))

From d051acda7364ad981093d917494c0b5152028b6e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 17:23:12 +0800
Subject: [PATCH 024/119] tweak sqdist

---
 common/sqdist.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/sqdist.m b/common/sqdist.m
index 31668b5..e836d50 100644
--- a/common/sqdist.m
+++ b/common/sqdist.m
@@ -5,4 +5,4 @@
 % Output:
 %   D: n1 x n2 square Euclidean distance matrix
 % Written by Mo Chen (sth4nth@gmail.com).
-D = bsxfun(@plus,dot(X2,X2,1),dot(X1,X1,1)')-2*(X1'*X2);
+D = dot(X1,X1,1)'+dot(X2,X2,1)-2*(X1'*X2);

From 57ec8a3352238a2fc35fadd22cab6a77922a5f3f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 17:46:49 +0800
Subject: [PATCH 025/119] tweak nb

---
 chapter08/nbBern.m     | 9 ++++-----
 chapter08/nbBernPred.m | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/chapter08/nbBern.m b/chapter08/nbBern.m
index 50a1726..51e9dee 100644
--- a/chapter08/nbBern.m
+++ b/chapter08/nbBern.m
@@ -6,12 +6,11 @@
 % Output:
 %   model: trained model structure
 % Written by Mo Chen (sth4nth@gmail.com).
-k = max(t);
 n = size(X,2);
-E = sparse(t,1:n,1,k,n,n);
-nk = full(sum(E,2));
-w = nk/n;
-mu = full(sparse(X)*E'*spdiags(1./nk,0,k,k));  
+E = sparse(1:n,t,1);
+nk = sum(E,1);
+w = full(nk/n);
+mu = X*(E./nk);  
 
 model.mu = mu;      % d x k means 
 model.w = w;
\ No newline at end of file
diff --git a/chapter08/nbBernPred.m b/chapter08/nbBernPred.m
index b7c5890..2f308df 100644
--- a/chapter08/nbBernPred.m
+++ b/chapter08/nbBernPred.m
@@ -10,6 +10,6 @@
 w = model.w;
 X = sparse(X);
 R = log(mu)'*X+log(1-mu)'*(1-X);
-R = bsxfun(@plus,R,log(w));
+R = bsxfun(@plus,R,log(w(:)));
 [~,y] = max(R,[],1);
 

From 467b74c5a8f5c819136e4368354deb9e2fa246ef Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 17:47:45 +0800
Subject: [PATCH 026/119] tweak nb

---
 chapter08/nbBern.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter08/nbBern.m b/chapter08/nbBern.m
index 51e9dee..f260b34 100644
--- a/chapter08/nbBern.m
+++ b/chapter08/nbBern.m
@@ -9,7 +9,7 @@
 n = size(X,2);
 E = sparse(1:n,t,1);
 nk = sum(E,1);
-w = full(nk/n);
+w = full(nk)/n;
 mu = X*(E./nk);  
 
 model.mu = mu;      % d x k means 

From 0c32f995d30d68a3f19f29b426b7234f5cab0af8 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 18:05:00 +0800
Subject: [PATCH 027/119] tweak mixBernEM

---
 chapter09/mixBernEm.m | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/chapter09/mixBernEm.m b/chapter09/mixBernEm.m
index 65612e4..4c1f36d 100644
--- a/chapter09/mixBernEm.m
+++ b/chapter09/mixBernEm.m
@@ -13,7 +13,7 @@
 X = sparse(X);
 n = size(X,2);
 label = ceil(k*rand(1,n));  % random initialization
-R = sparse(label,1:n,1,k,n,n);
+R = full(sparse(1:n,label,1));
 tol = 1e-8;
 maxiter = 500;
 llh = -inf(1,maxiter);
@@ -22,23 +22,20 @@
     [R, llh(iter)] = expectation(X,model);
     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end;
 end
-[~,label(:)] = max(R,[],1);
+[~,label(:)] = max(R,[],2);
 llh = llh(2:iter);
 
 function [R, llh] = expectation(X, model)
 mu = model.mu;
 w = model.w;
-n = size(X,2);
-R = full(log(mu)'*X+log(1-mu)'*(1-X));
-R = bsxfun(@plus,R,log(w));
-T = logsumexp(R,1);
-llh = sum(T)/n; % loglikelihood
-R = exp(bsxfun(@minus,R,T));
+R = X'*log(mu)+(1-X)'*log(1-mu)+log(w);
+T = logsumexp(R,2);
+llh = mean(T); % loglikelihood
+R = exp(R-T);
 
 function model = maximization(X, R)
-n = size(R,2);
-nk = full(sum(R,2));
-w = nk/n;
-mu = bsxfun(@times,full(X*R'),1./nk');
+nk = sum(R,1);
+w = nk/sum(nk);
+mu = (X*R)./nk;
 model.mu = mu;
 model.w = w;
\ No newline at end of file

From 9465a1f4998ac993529c1fbf264516f82239afab Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 18:07:57 +0800
Subject: [PATCH 028/119] tweak nbBernPred

---
 chapter08/nbBernPred.m | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/chapter08/nbBernPred.m b/chapter08/nbBernPred.m
index 2f308df..525a890 100644
--- a/chapter08/nbBernPred.m
+++ b/chapter08/nbBernPred.m
@@ -8,8 +8,5 @@
 % Written by Mo Chen (sth4nth@gmail.com).
 mu = model.mu;
 w = model.w;
-X = sparse(X);
-R = log(mu)'*X+log(1-mu)'*(1-X);
-R = bsxfun(@plus,R,log(w(:)));
-[~,y] = max(R,[],1);
+[~,y] = max(log(mu)'*X+log(1-mu)'*(1-X)+log(w(:)),[],1);
 

From be1bbc32f311956b9295e3c05d51b1e84345027e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 18:19:36 +0800
Subject: [PATCH 029/119] tweak logGauss

---
 chapter02/logGauss.m | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/chapter02/logGauss.m b/chapter02/logGauss.m
index edefd89..912e1c2 100644
--- a/chapter02/logGauss.m
+++ b/chapter02/logGauss.m
@@ -7,21 +7,13 @@
 % Output:
 %   y: 1 x n probability density in logrithm scale y=log p(x)
 % Written by Mo Chen (sth4nth@gmail.com).
-[d,k] = size(mu);
-if all(size(sigma)==d) && k==1   % one mu and one dxd sigma
-    X = bsxfun(@minus,X,mu);
-    [R,p]= chol(sigma);
-    if p ~= 0
-        error('ERROR: sigma is not PD.');
-    end
-    Q = R'\X;
-    q = dot(Q,Q,1);  % quadratic term (M distance)
-    c = d*log(2*pi)+2*sum(log(diag(R)));   % normalization constant
-    y = -0.5*(c+q);
-elseif size(sigma,1)==1 && size(sigma,2)==size(mu,2) % k mu and (k or one) scalar sigma
-    X2 = repmat(dot(X,X,1)',1,k);
-    D = bsxfun(@plus,X2-2*X'*mu,dot(mu,mu,1));
-    q = bsxfun(@times,D,1./sigma);  % M distance
-    c = d*(log(2*pi)+2*log(sigma));          % normalization constant
-    y = -0.5*bsxfun(@plus,q,c);
-end
\ No newline at end of file
+d = size(X,1);
+X = X-mu;
+[U,p]= chol(sigma);
+if p ~= 0
+    error('ERROR: sigma is not PD.');
+end
+Q = U'\X;
+q = dot(Q,Q,1);  % quadratic term (M distance)
+c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
+y = -(c+q)/2;

From 7c97784fe48b055841ccc0d493e2337885ae97bb Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 18:20:53 +0800
Subject: [PATCH 030/119] tweak logKde

---
 chapter02/logKde.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter02/logKde.m b/chapter02/logKde.m
index 1a869bd..98c6079 100644
--- a/chapter02/logKde.m
+++ b/chapter02/logKde.m
@@ -6,5 +6,5 @@
 % Output:
 %   z: probability density in logrithm scale z=log p(x|y)
 % Written by Mo Chen (sth4nth@gmail.com).
-D = bsxfun(@plus,full(dot(X,X,1)),full(dot(Y,Y,1))')-full(2*(Y'*X));
+D = dot(X,X,1)+dot(Y,Y,1)'-2*(Y'*X);
 z = logsumexp(D/(-2*sigma^2),1)-0.5*log(2*pi)-log(sigma*size(Y,2),1);

From 60f5c5ec2afbec8ff6be0bce5f9f5b53c0f1098a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 22:12:48 +0800
Subject: [PATCH 031/119] tweak gson

---
 chapter02/logMvGamma.m | 6 +++---
 common/gson.m          | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/chapter02/logMvGamma.m b/chapter02/logMvGamma.m
index fd27c5e..d1ee2b2 100644
--- a/chapter02/logMvGamma.m
+++ b/chapter02/logMvGamma.m
@@ -9,8 +9,8 @@
 % Output:
 %   y: m x n logarithm multivariate Gamma
 % Written by Michael Chen (sth4nth@gmail.com).
-s = size(x);
-x = reshape(x,1,prod(s));
+sz = size(x);
+x = reshape(x,1,prod(sz));
 x = bsxfun(@plus,repmat(x,d,1),(1-(1:d)')/2);
 y = d*(d-1)/4*log(pi)+sum(gammaln(x),1);
-y = reshape(y,s);
\ No newline at end of file
+y = reshape(y,sz);
\ No newline at end of file
diff --git a/common/gson.m b/common/gson.m
index 1fcdda4..66e4c46 100644
--- a/common/gson.m
+++ b/common/gson.m
@@ -4,10 +4,10 @@
 [d,n] = size(X);
 m = min(d,n);
 R = zeros(m,n);
-Q = zeros(d,m);
+Q = zeros(d,0);
 for i = 1:m
-    R(1:i-1,i) = Q(:,1:i-1)'*X(:,i);
-    v = X(:,i)-Q(:,1:i-1)*R(1:i-1,i);
+    R(1:i-1,i) = Q'*X(:,i);
+    v = X(:,i)-Q*R(1:i-1,i);
     R(i,i) = norm(v);
     Q(:,i) = v/R(i,i);
 end

From 3294bebfb0e6bd63480539805cef1d59d37315fa Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 22:26:49 +0800
Subject: [PATCH 032/119] remove gsog and mgsog

---
 common/gsog.m  | 14 --------------
 common/mgsog.m | 18 ------------------
 2 files changed, 32 deletions(-)
 delete mode 100644 common/gsog.m
 delete mode 100644 common/mgsog.m

diff --git a/common/gsog.m b/common/gsog.m
deleted file mode 100644
index 3a2b5b5..0000000
--- a/common/gsog.m
+++ /dev/null
@@ -1,14 +0,0 @@
-function [Q, R] = gsog(X)
-% Gram-Schmidt orthogonalization
-% Written by Mo Chen (sth4nth@gmail.com).
-[d,n] = size(X);
-m = min(d,n);
-R = eye(m,n);
-Q = zeros(d,m);
-D = zeros(1,m);
-for i = 1:m
-    R(1:i-1,i) = bsxfun(@times,Q(:,1:i-1),1./D(1:i-1))'*X(:,i);
-    Q(:,i) = X(:,i)-Q(:,1:i-1)*R(1:i-1,i);
-    D(i) = dot(Q(:,i),Q(:,i));
-end
-R(:,m+1:n) = bsxfun(@times,Q,1./D)'*X(:,m+1:n);
\ No newline at end of file
diff --git a/common/mgsog.m b/common/mgsog.m
deleted file mode 100644
index 003ce87..0000000
--- a/common/mgsog.m
+++ /dev/null
@@ -1,18 +0,0 @@
-function [Q, R] = mgsog(X)
-% Modified Gram-Schmidt orthogonalization
-% Written by Mo Chen (sth4nth@gmail.com).
-[d,n] = size(X);
-m = min(d,n);
-R = eye(m,n);
-Q = zeros(d,m);
-D = zeros(1,m);
-for i = 1:m
-    v = X(:,i);
-    for j = 1:i-1
-        R(j,i) = Q(:,j)'*v/D(j);
-        v = v-R(j,i)*Q(:,j);
-    end
-    Q(:,i) = v;
-    D(i) = dot(Q(:,i),Q(:,i));
-end
-R(:,m+1:n) = bsxfun(@times,Q,1./D)'*X(:,m+1:n);
\ No newline at end of file

From d389cfadf7e0ab8dfda82fdbb9d4e0c767bb1c9f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 22:48:37 +0800
Subject: [PATCH 033/119] tweak logMvGamma

---
 chapter02/logMvGamma.m | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/chapter02/logMvGamma.m b/chapter02/logMvGamma.m
index d1ee2b2..d33ed47 100644
--- a/chapter02/logMvGamma.m
+++ b/chapter02/logMvGamma.m
@@ -1,4 +1,4 @@
-function y = logMvGamma(x,d)
+function y = logMvGamma(x, d)
 % Compute logarithm multivariate Gamma function 
 % which is used in the probability density function of the Wishart and inverse Wishart distributions.
 % Gamma_d(x) = pi^(d(d-1)/4) \prod_(j=1)^d Gamma(x+(1-j)/2)
@@ -9,8 +9,5 @@
 % Output:
 %   y: m x n logarithm multivariate Gamma
 % Written by Michael Chen (sth4nth@gmail.com).
-sz = size(x);
-x = reshape(x,1,prod(sz));
-x = bsxfun(@plus,repmat(x,d,1),(1-(1:d)')/2);
-y = d*(d-1)/4*log(pi)+sum(gammaln(x),1);
-y = reshape(y,sz);
\ No newline at end of file
+y = d*(d-1)/4*log(pi)+sum(gammaln(x(:)+(1-(1:d))/2),2);
+y = reshape(y,size(x));
\ No newline at end of file

From a88a79dc2aad1f4bbdb6b35529d27a52adc419e7 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 22:54:24 +0800
Subject: [PATCH 034/119] fix rvmBinPred

---
 chapter07/rvmBinPred.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter07/rvmBinPred.m b/chapter07/rvmBinPred.m
index 2b00a0f..1bb97f5 100644
--- a/chapter07/rvmBinPred.m
+++ b/chapter07/rvmBinPred.m
@@ -11,5 +11,5 @@
 X = [X;ones(1,size(X,2))];
 X = X(index,:);
 w = model.w;
-p = exp(-log1pexp(w'*X)); 
+p = sigmoid(w'*X); 
 y = round(p);

From b208345359ba16972361cceb46afb4101699b458 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 23:10:53 +0800
Subject: [PATCH 035/119] roll back logGauss.m

---
 chapter02/logGauss.m | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/chapter02/logGauss.m b/chapter02/logGauss.m
index 912e1c2..edefd89 100644
--- a/chapter02/logGauss.m
+++ b/chapter02/logGauss.m
@@ -7,13 +7,21 @@
 % Output:
 %   y: 1 x n probability density in logrithm scale y=log p(x)
 % Written by Mo Chen (sth4nth@gmail.com).
-d = size(X,1);
-X = X-mu;
-[U,p]= chol(sigma);
-if p ~= 0
-    error('ERROR: sigma is not PD.');
-end
-Q = U'\X;
-q = dot(Q,Q,1);  % quadratic term (M distance)
-c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
-y = -(c+q)/2;
+[d,k] = size(mu);
+if all(size(sigma)==d) && k==1   % one mu and one dxd sigma
+    X = bsxfun(@minus,X,mu);
+    [R,p]= chol(sigma);
+    if p ~= 0
+        error('ERROR: sigma is not PD.');
+    end
+    Q = R'\X;
+    q = dot(Q,Q,1);  % quadratic term (M distance)
+    c = d*log(2*pi)+2*sum(log(diag(R)));   % normalization constant
+    y = -0.5*(c+q);
+elseif size(sigma,1)==1 && size(sigma,2)==size(mu,2) % k mu and (k or one) scalar sigma
+    X2 = repmat(dot(X,X,1)',1,k);
+    D = bsxfun(@plus,X2-2*X'*mu,dot(mu,mu,1));
+    q = bsxfun(@times,D,1./sigma);  % M distance
+    c = d*(log(2*pi)+2*log(sigma));          % normalization constant
+    y = -0.5*bsxfun(@plus,q,c);
+end
\ No newline at end of file

From 38c3f9b6a35b3293a70487f7183c7a87f38b64b2 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 11 Mar 2017 23:12:18 +0800
Subject: [PATCH 036/119] tweak knKmeansPred

---
 chapter06/knKmeansPred.m |  4 +--
 demo/ch06/knLin_demo.m   | 78 ++++++++++++++++++++--------------------
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/chapter06/knKmeansPred.m b/chapter06/knKmeansPred.m
index b4ae474..30dc653 100644
--- a/chapter06/knKmeansPred.m
+++ b/chapter06/knKmeansPred.m
@@ -14,7 +14,7 @@
 n = size(X,2);
 k = max(t);
 E = sparse(t,1:n,1,k,n,n);
-E = bsxfun(@times,E,1./sum(E,2));
-Z = bsxfun(@minus,E*kn(X,Xt),diag(E*kn(X,X)*E')/2);
+E = E./sum(E,2);
+Z = E*kn(X,Xt)-dot(E*kn(X,X),E,2)/2;
 [val, label] = max(Z,[],1);
 energy = sum(kn(Xt))-2*sum(val);
diff --git a/demo/ch06/knLin_demo.m b/demo/ch06/knLin_demo.m
index 9259164..9ae12b3 100644
--- a/demo/ch06/knLin_demo.m
+++ b/demo/ch06/knLin_demo.m
@@ -1,23 +1,23 @@
-% %% Kernel regression with linear kernel is EQUIVALENT to linear regression
-% clear; close all;
-% n = 100;
-% x = linspace(0,2*pi,n);   % test data
-% t = sin(x)+rand(1,n)/2;
-% 
-% lambda = 1e-4;
-% model_kn = knReg(x,t,lambda,@knLin);
-% model_lin = linReg(x,t,lambda);
-% 
-% idx = 1:2:n;
-% xt = x(:,idx);
-% tt = t(idx);
-% 
-% [y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt);
-% [y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt);
-% 
-% maxdiff(y_kn,y_lin)
-% maxdiff(sigma_kn,sigma_lin)
-% maxdiff(p_kn,p_lin)
+%% Kernel regression with linear kernel is EQUIVALENT to linear regression
+clear; close all;
+n = 100;
+x = linspace(0,2*pi,n);   % test data
+t = sin(x)+rand(1,n)/2;
+
+lambda = 1e-4;
+model_kn = knReg(x,t,lambda,@knLin);
+model_lin = linReg(x,t,lambda);
+
+idx = 1:2:n;
+xt = x(:,idx);
+tt = t(idx);
+
+[y_kn, sigma_kn,p_kn] = knRegPred(model_kn,xt,tt);
+[y_lin, sigma_lin,p_lin] = linRegPred(model_lin,xt,tt);
+
+maxdiff(y_kn,y_lin)
+maxdiff(sigma_kn,sigma_lin)
+maxdiff(p_kn,p_lin)
 %% Kernel kmeans with linear kernel is EQUIVALENT to kmeans
 clear; close all;
 d = 2;
@@ -40,22 +40,22 @@
 maxdiff(t_kn,t_lin)
 maxdiff(ent_kn,ent_lin)
 %% Kernel PCA with linear kernel is EQUIVALENT TO PCA
-% clear; close all;
-% d = 10;
-% q = 2;
-% n = 500;
-% X = randn(d,n);
-% 
-% 
-% model_kn = knPca(X,q,@knLin);
-% idx = 1:2:n;
-% Xt = X(:,idx);
-% 
-% Y_kn = knPcaPred(model_kn,Xt);
-% 
-% [U,L,mu,mse] = pca(X,q);
-% Y_lin = U'*bsxfun(@minus,Xt,mu);   % projection
-% 
-% 
-% R = Y_lin/Y_kn;    % the results are equivalent up to a rotation.
-% maxdiff(R*R', eye(q))
+clear; close all;
+d = 10;
+q = 2;
+n = 500;
+X = randn(d,n);
+
+
+model_kn = knPca(X,q,@knLin);
+idx = 1:2:n;
+Xt = X(:,idx);
+
+Y_kn = knPcaPred(model_kn,Xt);
+
+[U,L,mu,mse] = pca(X,q);
+Y_lin = U'*bsxfun(@minus,Xt,mu);   % projection
+
+
+R = Y_lin/Y_kn;    % the results are equivalent up to a rotation.
+maxdiff(R*R', eye(q))

From 23dc01d6a34dbe3f3d2d25599183744a6f328b5f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 12 Mar 2017 00:04:56 +0800
Subject: [PATCH 037/119] fix linRegPred knRegPred

---
 chapter02/logGauss.m   | 28 ++++++++++------------------
 chapter03/linRegPred.m |  3 +--
 chapter06/knRegPred.m  |  2 +-
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/chapter02/logGauss.m b/chapter02/logGauss.m
index edefd89..912e1c2 100644
--- a/chapter02/logGauss.m
+++ b/chapter02/logGauss.m
@@ -7,21 +7,13 @@
 % Output:
 %   y: 1 x n probability density in logrithm scale y=log p(x)
 % Written by Mo Chen (sth4nth@gmail.com).
-[d,k] = size(mu);
-if all(size(sigma)==d) && k==1   % one mu and one dxd sigma
-    X = bsxfun(@minus,X,mu);
-    [R,p]= chol(sigma);
-    if p ~= 0
-        error('ERROR: sigma is not PD.');
-    end
-    Q = R'\X;
-    q = dot(Q,Q,1);  % quadratic term (M distance)
-    c = d*log(2*pi)+2*sum(log(diag(R)));   % normalization constant
-    y = -0.5*(c+q);
-elseif size(sigma,1)==1 && size(sigma,2)==size(mu,2) % k mu and (k or one) scalar sigma
-    X2 = repmat(dot(X,X,1)',1,k);
-    D = bsxfun(@plus,X2-2*X'*mu,dot(mu,mu,1));
-    q = bsxfun(@times,D,1./sigma);  % M distance
-    c = d*(log(2*pi)+2*log(sigma));          % normalization constant
-    y = -0.5*bsxfun(@plus,q,c);
-end
\ No newline at end of file
+d = size(X,1);
+X = X-mu;
+[U,p]= chol(sigma);
+if p ~= 0
+    error('ERROR: sigma is not PD.');
+end
+Q = U'\X;
+q = dot(Q,Q,1);  % quadratic term (M distance)
+c = d*log(2*pi)+2*sum(log(diag(U)));   % normalization constant
+y = -(c+q)/2;
diff --git a/chapter03/linRegPred.m b/chapter03/linRegPred.m
index 0926ff0..9ddf650 100644
--- a/chapter03/linRegPred.m
+++ b/chapter03/linRegPred.m
@@ -26,7 +26,6 @@
 end
 
 if nargin == 3 && nargout == 3
-    p = exp(logGauss(t,y,sigma));
-%     p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
+    p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
 end
 
diff --git a/chapter06/knRegPred.m b/chapter06/knRegPred.m
index 8e89ad9..6910657 100755
--- a/chapter06/knRegPred.m
+++ b/chapter06/knRegPred.m
@@ -25,5 +25,5 @@
 end
 
 if nargin == 3 && nargout == 3
-    p = exp(logGauss(t,y,sigma));
+    p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
 end
\ No newline at end of file

From 81c2796be32f88405e9bf1f8c9b6913d450d25c3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 12 Mar 2017 00:10:11 +0800
Subject: [PATCH 038/119] fix rvmRegPred

---
 chapter07/rvmRegPred.m | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/chapter07/rvmRegPred.m b/chapter07/rvmRegPred.m
index 24ab3e8..8c80bd0 100644
--- a/chapter07/rvmRegPred.m
+++ b/chapter07/rvmRegPred.m
@@ -25,6 +25,5 @@
 end
 
 if nargin == 3 && nargout == 3
-    p = exp(logGauss(t,y,sigma));
-%     p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
+    p = exp(-0.5*(((t-y)./sigma).^2+log(2*pi))-log(sigma));
 end

From 3f04958673dc596597a5eaa3627c0590ead86986 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 12 Mar 2017 00:45:40 +0800
Subject: [PATCH 039/119] Update README.md

---
 README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index a8c7b35..a272327 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,11 @@ Description
 -------
 The design goal of the code are as follows:
 
-1. Clean: Code is very succinct. There are little nasty guarding code that distracts readers' attention. As a result, the core of the algorithms can be easily spot.
-2. Efficient: Many tricks for making Matlab scripts efficient were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementation. Usually, functions in this package are orders faster than Matlab builtin functions which provide the same functionality (eg. kmeans). If anyone found any Matlab implementation that is faster than mine, I am happy to further optimize.
+1. Succinct: Code is extremely terse. Minimizing the number of lines of code is one of the primary targets. As a result, the core of the algorithms can be easily spotted.
+2. Efficient: Many tricks for making Matlab scripts fast were applied (e.g. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders of magnitude faster than Matlab builtin functions which provide the same functionality (e.g. kmeans). If anyone finds any Matlab implementation that is faster than mine, I am happy to further optimize.
 3. Robust: Many numerical stability techniques are applied, such as probability computation in log scale to avoid numerical underflow and overflow, square root form update of symmetric matrix, etc.
-4. Easy to learn: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines.
-5. Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used  (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
-
+4. Easy to learn: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
+5. Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
 
 Installation
 -------
@@ -22,7 +21,11 @@ Installation
 
 2. Run Matlab and navigate to package location as working directory, then run the init.m script.
 
-3. Run some demos in the demo directory. Enjoy!
+3. Run some demos in the your_location/demo directory. Enjoy!
+
+FeedBack
+-------
+If you found any bugs or have any suggestion, please do fire issues. I am graceful for any feedback and do my best to improve this package.
 
 License
 -------

From 444f001c9fea6307513b22974dd3e9b7c0944104 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 12 Mar 2017 00:50:57 +0800
Subject: [PATCH 040/119] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a272327..5f53e50 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Introduction
 This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-Note: this package requires Matlab R2016b or later, since it utilizes a new syntax of Matlab.
+Note: this package requires Matlab R2016b or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
 
 Description
 -------
@@ -25,7 +25,7 @@ Installation
 
 FeedBack
 -------
-If you found any bugs or have any suggestion, please do fire issues. I am graceful for any feedback and do my best to improve this package.
+If you find any bug or have any suggestion, please do file an issue. I am grateful for any feedback and will do my best to improve this package.
 
 License
 -------

From e31258d89e81a875acc85cb228ab76ee72bba690 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 01:40:02 +0800
Subject: [PATCH 041/119] tweak kmeans to the extreme

---
 chapter09/kmeans.m     |  9 ++++-----
 chapter09/kmeansPred.m |  4 ++--
 chapter09/litekmeans.m | 17 +++++++++++++++++
 common/normalize.m     |  2 +-
 4 files changed, 24 insertions(+), 8 deletions(-)
 create mode 100644 chapter09/litekmeans.m

diff --git a/chapter09/kmeans.m b/chapter09/kmeans.m
index 29a2e6b..6e9caa1 100644
--- a/chapter09/kmeans.m
+++ b/chapter09/kmeans.m
@@ -1,12 +1,12 @@
-function [label, m, energy] = kmeans(X, init)
+function [label, mu, energy] = kmeans(X, init)
 % Perform kmeans clustering.
 % Input:
 %   X: d x n data matrix
 %   init: k number of clusters or label (1 x n vector)
 % Output:
 %   label: 1 x n cluster label
+%   mu: d x k center of clusters
 %   energy: optimization target value
-%   model: trained model structure
 % Written by Mo Chen (sth4nth@gmail.com).
 n = size(X,2);
 idx = 1:n;
@@ -19,8 +19,7 @@
 end
 while any(label ~= last)
     [~,~,last(:)] = unique(label);   % remove empty clusters
-    E = sparse(idx,last,1);  % transform label into indicator matrix
-    m = X*(E./sum(E,1));    % compute centers 
-    [val,label] = min(dot(m,m,1)'/2-m'*X,[],1); % assign labels
+    mu = X*normalize(sparse(idx,last,1),1);    % compute centers 
+    [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign labels
 end
 energy = dot(X(:),X(:),1)+2*sum(val);
\ No newline at end of file
diff --git a/chapter09/kmeansPred.m b/chapter09/kmeansPred.m
index 83dc633..7ed1278 100644
--- a/chapter09/kmeansPred.m
+++ b/chapter09/kmeansPred.m
@@ -1,4 +1,4 @@
-function [label, energy] = kmeansPred(m, X)
+function [label, energy] = kmeansPred(mu, X)
 % Prediction for kmeans clusterng
 % Input:
 %   model: dx k cluster center matrix
@@ -7,5 +7,5 @@
 %   label: 1 x n cluster label
 %   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
-[val,label] = min(dot(X,X,1)+dot(m,m,1)'-2*m'*X,[],1); % assign labels
+[val,label] = min(dot(X,X,1)+dot(mu,mu,1)'-2*mu'*X,[],1); % assign labels
 energy = sum(val);
\ No newline at end of file
diff --git a/chapter09/litekmeans.m b/chapter09/litekmeans.m
new file mode 100644
index 0000000..9bf7b37
--- /dev/null
+++ b/chapter09/litekmeans.m
@@ -0,0 +1,17 @@
+function [label, mu] = litekmeans(X, k)
+n = size(X,2);
+last = zeros(1,n);
+label = ceil(k*rand(1,n));
+while any(label ~= last)
+    [~,~,last(:)] = unique(label);            % remove empty clusters
+    mu = X*normalize(sparse(1:n,last,1),1);    % compute cluster centers 
+    [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
+end
+% Perform kmeans clustering.
+% Input:
+%   X: d x n data matrix
+%   k: number of clusters
+% Output:
+%   label: 1 x n cluster label
+%   mu: d x k center of clusters
+% Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file
diff --git a/common/normalize.m b/common/normalize.m
index c7ae7a1..9bca004 100644
--- a/common/normalize.m
+++ b/common/normalize.m
@@ -8,4 +8,4 @@
     if isempty(dim), dim = 1; end
 end
 s = sum(X,dim);
-Y = bsxfun(@times,X,1./s);
\ No newline at end of file
+Y = X./s;
\ No newline at end of file

From 3a9055dfbdb5e3ee78c4caea94da0deeacfcf041 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 01:57:54 +0800
Subject: [PATCH 042/119] tweak kmeans to the extreme

---
 chapter09/litekmeans.m | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/chapter09/litekmeans.m b/chapter09/litekmeans.m
index 9bf7b37..03afa99 100644
--- a/chapter09/litekmeans.m
+++ b/chapter09/litekmeans.m
@@ -1,17 +1,16 @@
-function [label, mu] = litekmeans(X, k)
-n = size(X,2);
-last = zeros(1,n);
-label = ceil(k*rand(1,n));
+function [label, mu] = litekmeans(X, label)
+idx = 1:size(X,2);
+last = idx;
 while any(label ~= last)
     [~,~,last(:)] = unique(label);            % remove empty clusters
-    mu = X*normalize(sparse(1:n,last,1),1);    % compute cluster centers 
+    mu = X*normalize(sparse(idx,last,1),1);    % compute cluster centers 
     [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
 end
 % Perform kmeans clustering.
 % Input:
 %   X: d x n data matrix
-%   k: number of clusters
+%   label: initial sample labels
 % Output:
-%   label: 1 x n cluster label
+%   label: 1 x n sample label
 %   mu: d x k center of clusters
 % Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file

From ef94fc35ca6d18e35685a9d5e9b227e25da9414f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 18:55:57 +0800
Subject: [PATCH 043/119] doc update

---
 chapter06/knKmeans.m   | 4 ++--
 chapter09/kmeans.m     | 2 +-
 chapter09/kmedoids.m   | 2 +-
 chapter09/litekmeans.m | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 80b6d5a..49c6c15 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -4,9 +4,9 @@
 %   K: n x n kernel matrix
 %   init: either number of clusters (k) or initial label (1xn)
 % Output:
-%   label: 1 x n clustering result label
-%   energy: optimization target value
+%   label: 1 x n sample labels
 %   model: trained model structure
+%   energy: optimization target value
 % Reference: Kernel Methods for Pattern Analysis
 % by John Shawe-Taylor, Nello Cristianini
 % Written by Mo Chen (sth4nth@gmail.com).
diff --git a/chapter09/kmeans.m b/chapter09/kmeans.m
index 6e9caa1..eb154b3 100644
--- a/chapter09/kmeans.m
+++ b/chapter09/kmeans.m
@@ -4,7 +4,7 @@
 %   X: d x n data matrix
 %   init: k number of clusters or label (1 x n vector)
 % Output:
-%   label: 1 x n cluster label
+%   label: 1 x n sample labels
 %   mu: d x k center of clusters
 %   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
diff --git a/chapter09/kmedoids.m b/chapter09/kmedoids.m
index 7499905..ff94a60 100644
--- a/chapter09/kmedoids.m
+++ b/chapter09/kmedoids.m
@@ -4,7 +4,7 @@
 %   X: d x n data matrix
 %   init: k number of clusters or label (1 x n vector)
 % Output:
-%   label: 1 x n cluster label
+%   label: 1 x n sample labels
 %   index: index of medoids
 %   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
diff --git a/chapter09/litekmeans.m b/chapter09/litekmeans.m
index 03afa99..8d68434 100644
--- a/chapter09/litekmeans.m
+++ b/chapter09/litekmeans.m
@@ -11,6 +11,6 @@
 %   X: d x n data matrix
 %   label: initial sample labels
 % Output:
-%   label: 1 x n sample label
+%   label: 1 x n sample labels
 %   mu: d x k center of clusters
 % Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file

From 3cc9d1396568679979794949c1970010ef08cd6a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 18:56:29 +0800
Subject: [PATCH 044/119] kmeanspp tbd

---
 chapter09/kmeanspp.m | 45 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 chapter09/kmeanspp.m

diff --git a/chapter09/kmeanspp.m b/chapter09/kmeanspp.m
new file mode 100644
index 0000000..c590175
--- /dev/null
+++ b/chapter09/kmeanspp.m
@@ -0,0 +1,45 @@
+function [label, mu, energy] =  kmeanspp(X, k)
+% Perform kmeans clustering.
+% Input:
+%   X: d x n data matrix
+%   k: number of clusters
+% Output:
+%   label: 1 x n sample labels
+%   mu: d x k center of clusters
+%   energy: optimization target value
+% Written by Mo Chen (sth4nth@gmail.com).
+[label, mu, energy] = kmeans(X, kseeds(X,k));
+
+% TBD: label and energy
+function [label, mu, energy] = kseeds(X, k)
+% kmeans++ seeding
+[d,n] = size(X);
+v = inf(1,n);
+mu = zeros(d,k);
+mu(:,1) = X(:,ceil(n*rand));
+label = zeros(1,n);
+for i = 2:k
+    X0 = X-mu(:,i-1);
+    [v,label] = min(v,dot(X0,X0,1));
+    mu(:,i) = X(:,randp(v));
+end
+energy = sum(v);
+
+% Done
+function idx = randp(p)
+% sample one of k by probability
+p = cumsum(p);
+p = p/p(end);
+idx = find(rand<p,1);
+
+% Done
+function [label, mu, energy] = kmeans(X, label)
+% standard kmeans (Lloyd iteration)
+idx = 1:size(X,2);
+last = idx;
+while any(label ~= last)
+    [~,~,last(:)] = unique(label);            % remove empty clusters
+    mu = X*normalize(sparse(idx,last,1),1);    % compute cluster centers 
+    [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
+end
+energy = dot(X(:),X(:),1)+2*sum(val);
\ No newline at end of file

From 5636167858b06e1c0c8db2b01579a7b864266bd2 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 22:47:08 +0800
Subject: [PATCH 045/119] reorgnize. I'm done with kmeans

---
 chapter09/kmeans.m      | 32 +++++++++++++++++------------
 chapter09/kmeanspp.m    | 45 -----------------------------------------
 chapter09/kseeds.m      | 17 ++++++++++++++++
 chapter09/litekmeans.m  | 16 ---------------
 common/randp.m          |  3 +++
 demo/ch09/kmeans_demo.m | 22 +++++++++++++++++++-
 6 files changed, 60 insertions(+), 75 deletions(-)
 delete mode 100644 chapter09/kmeanspp.m
 create mode 100644 chapter09/kseeds.m
 delete mode 100644 chapter09/litekmeans.m
 create mode 100644 common/randp.m

diff --git a/chapter09/kmeans.m b/chapter09/kmeans.m
index eb154b3..74fd82c 100644
--- a/chapter09/kmeans.m
+++ b/chapter09/kmeans.m
@@ -1,25 +1,31 @@
-function [label, mu, energy] = kmeans(X, init)
+function [label, mu, energy] = kmeans(X, m)
 % Perform kmeans clustering.
 % Input:
 %   X: d x n data matrix
-%   init: k number of clusters or label (1 x n vector)
+%   m: initialization parameter
 % Output:
 %   label: 1 x n sample labels
 %   mu: d x k center of clusters
 %   energy: optimization target value
 % Written by Mo Chen (sth4nth@gmail.com).
-n = size(X,2);
+label = init(X, m);
+n = numel(label);
 idx = 1:n;
 last = zeros(1,n);
-if numel(init)==1
-    k = init;
-    label = ceil(k*rand(1,n));
-elseif numel(init)==n
-    label = init;
-end
 while any(label ~= last)
-    [~,~,last(:)] = unique(label);   % remove empty clusters
-    mu = X*normalize(sparse(idx,last,1),1);    % compute centers 
-    [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign labels
+    [~,~,last(:)] = unique(label);                  % remove empty clusters
+    mu = X*normalize(sparse(idx,last,1),1);         % compute cluster centers 
+    [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1);  % assign sample labels
 end
-energy = dot(X(:),X(:),1)+2*sum(val);
\ No newline at end of file
+energy = dot(X(:),X(:),1)+2*sum(val);
+
+function label = init(X, m)
+[d,n] = size(X);
+if numel(m) == 1                           % random initialization
+    mu = X(:,randperm(n,m));
+    [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); 
+elseif all(size(m) == [1,n])               % init with labels
+    label = m;
+elseif size(m,1) == d                      % init with seeds (centers)
+    [~,label] = min(dot(m,m,1)'/2-m'*X,[],1); 
+end
\ No newline at end of file
diff --git a/chapter09/kmeanspp.m b/chapter09/kmeanspp.m
deleted file mode 100644
index c590175..0000000
--- a/chapter09/kmeanspp.m
+++ /dev/null
@@ -1,45 +0,0 @@
-function [label, mu, energy] =  kmeanspp(X, k)
-% Perform kmeans clustering.
-% Input:
-%   X: d x n data matrix
-%   k: number of clusters
-% Output:
-%   label: 1 x n sample labels
-%   mu: d x k center of clusters
-%   energy: optimization target value
-% Written by Mo Chen (sth4nth@gmail.com).
-[label, mu, energy] = kmeans(X, kseeds(X,k));
-
-% TBD: label and energy
-function [label, mu, energy] = kseeds(X, k)
-% kmeans++ seeding
-[d,n] = size(X);
-v = inf(1,n);
-mu = zeros(d,k);
-mu(:,1) = X(:,ceil(n*rand));
-label = zeros(1,n);
-for i = 2:k
-    X0 = X-mu(:,i-1);
-    [v,label] = min(v,dot(X0,X0,1));
-    mu(:,i) = X(:,randp(v));
-end
-energy = sum(v);
-
-% Done
-function idx = randp(p)
-% sample one of k by probability
-p = cumsum(p);
-p = p/p(end);
-idx = find(rand<p,1);
-
-% Done
-function [label, mu, energy] = kmeans(X, label)
-% standard kmeans (Lloyd iteration)
-idx = 1:size(X,2);
-last = idx;
-while any(label ~= last)
-    [~,~,last(:)] = unique(label);            % remove empty clusters
-    mu = X*normalize(sparse(idx,last,1),1);    % compute cluster centers 
-    [val,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
-end
-energy = dot(X(:),X(:),1)+2*sum(val);
\ No newline at end of file
diff --git a/chapter09/kseeds.m b/chapter09/kseeds.m
new file mode 100644
index 0000000..1f108bc
--- /dev/null
+++ b/chapter09/kseeds.m
@@ -0,0 +1,17 @@
+function mu = kseeds(X, k)
+% Perform kmeans++ seeding
+% Input:
+%   X: d x n data matrix
+%   k: number of seeds
+% Output:
+%   label: 1 x n sample labels
+%   mu: d x k seeds
+%   energy: kmeans target value
+% Written by Mo Chen (sth4nth@gmail.com).
+n = size(X,2);
+D = inf(1,n);
+mu = X(:,ceil(n*rand));                    % first seed: a sample picked uniformly at random
+for i = 2:k
+    D = min(D,sum((X-mu(:,i-1)).^2,1));    % squared distance from each sample to its closest seed so far
+    mu(:,i) = X(:,randp(D));               % draw the next seed with probability proportional to D
+end
diff --git a/chapter09/litekmeans.m b/chapter09/litekmeans.m
deleted file mode 100644
index 8d68434..0000000
--- a/chapter09/litekmeans.m
+++ /dev/null
@@ -1,16 +0,0 @@
-function [label, mu] = litekmeans(X, label)
-idx = 1:size(X,2);
-last = idx;
-while any(label ~= last)
-    [~,~,last(:)] = unique(label);            % remove empty clusters
-    mu = X*normalize(sparse(idx,last,1),1);    % compute cluster centers 
-    [~,label] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
-end
-% Perform kmeans clustering.
-% Input:
-%   X: d x n data matrix
-%   label: initial sample labels
-% Output:
-%   label: 1 x n sample labels
-%   mu: d x k center of clusters
-% Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file
diff --git a/common/randp.m b/common/randp.m
new file mode 100644
index 0000000..731c97e
--- /dev/null
+++ b/common/randp.m
@@ -0,0 +1,3 @@
+function i = randp(p)
+% Sample an integer in 1:k with probability proportional to p (p is a vector of k weights; it is normalized here)
+i = find(rand<cumsum(normalize(p)),1);   % first index whose normalized cumulative sum exceeds a uniform draw
diff --git a/demo/ch09/kmeans_demo.m b/demo/ch09/kmeans_demo.m
index 4c22e94..9a725f9 100644
--- a/demo/ch09/kmeans_demo.m
+++ b/demo/ch09/kmeans_demo.m
@@ -2,8 +2,28 @@
 d = 2;
 k = 3;
 n = 5000;
+%% Generate data
 [X,label] = kmeansRnd(d,k,n);
-y = litekmeans(X,k);
 plotClass(X,label);
+%% kmeans with random initialization 
+y = kmeans(X,k);
+figure;
+plotClass(X,y);
+%% kmeans init with labels
+y = kmeans(X,label);
+figure;
+plotClass(X,y);
+%% kmeans init with centers 
+mu = rand(d,k);
+y = kmeans(X,mu);
+figure;
+plotClass(X,y);
+%% kmeans init with kmeans++ seeding 
+y = kmeans(X,kseeds(X,k));
+figure;
+plotClass(X,y);
+%% kmeans++ seeding 
+mu = kseeds(X,k);
+[~,y] = min(dot(mu,mu,1)'/2-mu'*X,[],1); % assign sample labels
 figure;
 plotClass(X,y);

From 869634b4ce0b8bcaa7473d0b5a0d00d759fc29ef Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 13 Mar 2017 23:22:54 +0800
Subject: [PATCH 046/119] fix doc

---
 chapter09/kseeds.m | 2 --
 1 file changed, 2 deletions(-)

diff --git a/chapter09/kseeds.m b/chapter09/kseeds.m
index 1f108bc..ad37a4f 100644
--- a/chapter09/kseeds.m
+++ b/chapter09/kseeds.m
@@ -4,9 +4,7 @@
 %   X: d x n data matrix
 %   k: number of seeds
 % Output:
-%   label: 1 x n sample labels
 %   mu: d x k seeds
-%   energy: kmeans target value
 % Written by Mo Chen (sth4nth@gmail.com).
 n = size(X,2);
 D = inf(1,n);

From 2a9d1365caf615c4973e2fe839ba6a6899faf75a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 25 Mar 2017 03:14:53 +0800
Subject: [PATCH 047/119] add back the naive method of model evidence

---
 chapter10/mixGaussEvidence.m | 68 ++++++++++++++++++++++++++++++++++++
 chapter10/mixGaussVb.m       |  2 +-
 demo/ch10/mixGaussVb_demo.m  |  8 +++++
 3 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 chapter10/mixGaussEvidence.m

diff --git a/chapter10/mixGaussEvidence.m b/chapter10/mixGaussEvidence.m
new file mode 100644
index 0000000..4a53599
--- /dev/null
+++ b/chapter10/mixGaussEvidence.m
@@ -0,0 +1,68 @@
+function L = mixGaussEvidence(X, model, prior)
+% Variational lower bound of the model evidence (log of marginal)
+% This the method by the book. It is equivalent to the bound inside mixGaussVb.
+% Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474)
+% Written by Mo Chen (sth4nth@gmail.com).
+alpha0 = prior.alpha;
+kappa0 = prior.kappa;
+m0 = prior.m;
+v0 = prior.v;
+M0 = prior.M;
+
+alpha = model.alpha; % Dirichlet
+kappa = model.kappa;   % Gaussian
+m = model.m;         % Gaussian
+v = model.v;         % Wishart
+% M = model.M;         % Wishart: inv(W) = V'*V
+U = model.U;         % d x d x k stack; presumably Cholesky factors of M = inv(W) -- confirm against mixGaussVb
+R = model.R;         % n x k matrix (column sums give nk); presumably the responsibilities
+logR = model.logR;   % paired elementwise with R below; presumably log(R)
+
+[d,k] = size(m);
+nk = sum(R,1); % 10.51
+
+Elogpi = psi(0,alpha)-psi(0,sum(alpha));
+Epz = dot(nk,Elogpi);
+Eqz = dot(R(:),logR(:));
+logCalpha0 = gammaln(k*alpha0)-k*gammaln(alpha0);
+Eppi = logCalpha0+(alpha0-1)*sum(Elogpi);
+logCalpha = gammaln(sum(alpha))-sum(gammaln(alpha));
+Eqpi = dot(alpha-1,Elogpi)+logCalpha;
+
+U0 = chol(M0);       % upper-triangular Cholesky factor of the prior matrix M0
+sqrtR = sqrt(R);
+xbar = bsxfun(@times,X*R,1./nk); % 10.52
+
+logW = zeros(1,k);          % per-component terms, filled in the loop below
+trSW = zeros(1,k);
+trM0W = zeros(1,k);
+xbarmWxbarm = zeros(1,k);
+mm0Wmm0 = zeros(1,k);
+for i = 1:k
+    Ui = U(:,:,i);
+    logW(i) = -2*sum(log(diag(Ui)));      % log-determinant term taken from the diagonal of Ui
+    
+    Xs = bsxfun(@times,bsxfun(@minus,X,xbar(:,i)),sqrtR(:,i)');
+    V = chol(Xs*Xs'/nk(i));
+    Q = V/Ui;
+    trSW(i) = dot(Q(:),Q(:));  % equivalent to tr(SW)=trace(S/M)
+    Q = U0/Ui;
+    trM0W(i) = dot(Q(:),Q(:)); % same squared-Frobenius-norm trick, with U0 in place of V
+
+    q = Ui'\(xbar(:,i)-m(:,i));
+    xbarmWxbarm(i) = dot(q,q);
+    q = Ui'\(m(:,i)-m0);
+    mm0Wmm0(i) = dot(q,q);
+end
+ElogLambda = sum(psi(0,bsxfun(@minus,v+1,(1:d)')/2),1)+d*log(2)+logW; % 10.65
+Epmu = sum(d*log(kappa0/(2*pi))+ElogLambda-d*kappa0./kappa-kappa0*(v.*mm0Wmm0))/2;
+logB0 = v0*sum(log(diag(U0)))-0.5*v0*d*log(2)-logMvGamma(0.5*v0,d);
+EpLambda = k*logB0+0.5*(v0-d-1)*sum(ElogLambda)-0.5*dot(v,trM0W);
+
+Eqmu = 0.5*sum(ElogLambda+d*log(kappa/(2*pi)))-0.5*d*k;
+logB =  -v.*(logW+d*log(2))/2-logMvGamma(0.5*v,d);
+EqLambda = 0.5*sum((v-d-1).*ElogLambda-v*d)+sum(logB);
+
+EpX = 0.5*dot(nk,ElogLambda-d./kappa-v.*trSW-v.*xbarmWxbarm-d*log(2*pi));
+
+L = Epz-Eqz+Eppi-Eqpi+Epmu-Eqmu+EpLambda-EqLambda+EpX;   % Ep* terms are added, Eq* terms are subtracted
\ No newline at end of file
diff --git a/chapter10/mixGaussVb.m b/chapter10/mixGaussVb.m
index b784c8f..1daf32a 100644
--- a/chapter10/mixGaussVb.m
+++ b/chapter10/mixGaussVb.m
@@ -27,7 +27,7 @@
 for iter = 2:maxiter
     model = expect(X,model);
     model = maximize(X,model,prior);
-    L(iter) = bound(X,model,prior)/n;
+    L(iter) = bound(X,model,prior);
     if abs(L(iter)-L(iter-1)) < tol*abs(L(iter)); break; end
 end
 L = L(2:iter);
diff --git a/demo/ch10/mixGaussVb_demo.m b/demo/ch10/mixGaussVb_demo.m
index a91b6f2..ee336d5 100755
--- a/demo/ch10/mixGaussVb_demo.m
+++ b/demo/ch10/mixGaussVb_demo.m
@@ -15,6 +15,14 @@
 plotClass(X1,y1);
 figure;
 plot(L)
+% Model Evidence
+prior.alpha = 1;
+prior.kappa = 1;
+prior.m = mean(X1,2);
+prior.v = d+1;
+prior.M = eye(d);   % M = inv(W)
+L0 = mixGaussEvidence(X1, model, prior);
+L0-L(end)
 % Predict testing data
 [y2, R] = mixGaussVbPred(model,X2);
 figure;

From 8ffe5dc5d8575f670e0f27df0d7be04bd452f90a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 26 Mar 2017 03:49:44 +0800
Subject: [PATCH 048/119] fix comment

---
 chapter10/mixGaussEvidence.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter10/mixGaussEvidence.m b/chapter10/mixGaussEvidence.m
index 4a53599..046dd56 100644
--- a/chapter10/mixGaussEvidence.m
+++ b/chapter10/mixGaussEvidence.m
@@ -1,6 +1,6 @@
 function L = mixGaussEvidence(X, model, prior)
-% Variational lower bound of the model evidence (log of marginal)
-% This the method by the book. It is equivalent to the bound inside mixGaussVb.
+% Variational lower bound of the model evidence (log of marginal likelihood)
+% This function implements the method in the book PRML. It is equivalent to the bound inside mixGaussVb function.
 % Reference: Pattern Recognition and Machine Learning by Christopher M. Bishop (P.474)
 % Written by Mo Chen (sth4nth@gmail.com).
 alpha0 = prior.alpha;

From 330fe228a0d9e1706d7f6dab3782a178b8d2d20f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 27 Mar 2017 02:31:04 +0800
Subject: [PATCH 049/119] add lognormexp

---
 common/lognormexp.m | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 common/lognormexp.m

diff --git a/common/lognormexp.m b/common/lognormexp.m
new file mode 100644
index 0000000..10d9f95
--- /dev/null
+++ b/common/lognormexp.m
@@ -0,0 +1,10 @@
+function [Y,s] = lognormexp(X, dim)
+% Compute log(normalize(exp(x),dim)) while avoiding numerical underflow.
+%   By default dim = 1 (columns).
+% Written by Mo Chen (sth4nth@gmail.com).
+if nargin == 1
+    dim = find(size(X)~=1,1);
+    if isempty(dim), dim = 1; end
+end
+s = logsumexp(X,dim);
+Y = exp(X-s);

From 88a9ee42f94d02bcc339a0aa79e752da0ebdba38 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 29 Mar 2017 05:36:19 +0800
Subject: [PATCH 050/119] add common functions

---
 common/slice.m | 20 ++++++++++++++++++++
 common/sub.m   | 16 ++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 common/slice.m
 create mode 100644 common/sub.m

diff --git a/common/slice.m b/common/slice.m
new file mode 100644
index 0000000..bb1e57e
--- /dev/null
+++ b/common/slice.m
@@ -0,0 +1,20 @@
+function B = slice(A, dim, index)
+% slice(A,2,index) = A(:,index,:)
+sz = size(A);
+sz(dim) = numel(index);
+IDX = cell(1,ndims(A));
+for i = 1:ndims(A)               % build the subscript list for every dimension of A
+    if i == dim
+        idx = index;             % requested positions along the sliced dimension
+    else
+        idx = 1:sz(i);           % every position along the other dimensions
+    end
+    shape = ones(1,ndims(A));
+    shape(i) = sz(i);
+    idx = reshape(idx,shape);    % lay the subscripts out along dimension i
+    shape = sz;
+    shape(i) = 1;
+    idx = repmat(idx,shape);     % replicate over the remaining dimensions up to the output size sz
+    IDX{i} = idx(:);             % flatten: one subscript per output element
+end
+B = reshape(A(sub2ind(size(A),IDX{:})),sz);   % gather through linear indices, then restore the output shape
\ No newline at end of file
diff --git a/common/sub.m b/common/sub.m
new file mode 100644
index 0000000..6a800f1
--- /dev/null
+++ b/common/sub.m
@@ -0,0 +1,16 @@
+function B = sub(A, varargin)
+% submat(A,i,j,k) = A(i;j;k)
+assert(ndims(A)==numel(varargin));
+sz = cellfun(@numel,varargin);
+IDX = cell(1,ndims(A));
+for i = 1:ndims(A)               % build the subscript list for every dimension of A
+    idx = varargin{i};           % requested positions along dimension i
+    shape = ones(1,ndims(A));
+    shape(i) = sz(i);
+    idx = reshape(idx,shape);    % lay the subscripts out along dimension i
+    shape = sz;
+    shape(i) = 1;
+    idx = repmat(idx,shape);     % replicate over the remaining dimensions up to the output size sz
+    IDX{i} = idx(:);             % flatten: one subscript per output element
+end
+B = reshape(A(sub2ind(size(A),IDX{:})),sz);   % gather through linear indices, then restore the output shape
\ No newline at end of file

From fa8d013f9086aaf1e327dbfd669c579aafc85707 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 29 Mar 2017 05:38:01 +0800
Subject: [PATCH 051/119] fix doc

---
 common/slice.m | 1 +
 common/sub.m   | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/common/slice.m b/common/slice.m
index bb1e57e..56be588 100644
--- a/common/slice.m
+++ b/common/slice.m
@@ -1,5 +1,6 @@
 function B = slice(A, dim, index)
 % slice(A,2,index) = A(:,index,:)
+% Written by Mo Chen (sth4nth@gmail.com).
 sz = size(A);
 sz(dim) = numel(index);
 IDX = cell(1,ndims(A));
diff --git a/common/sub.m b/common/sub.m
index 6a800f1..8d7de28 100644
--- a/common/sub.m
+++ b/common/sub.m
@@ -1,5 +1,6 @@
 function B = sub(A, varargin)
-% submat(A,i,j,k) = A(i;j;k)
+% sub(A,i,j,k) = A(i;j;k)
+% Written by Mo Chen (sth4nth@gmail.com).
 assert(ndims(A)==numel(varargin));
 sz = cellfun(@numel,varargin);
 IDX = cell(1,ndims(A));

From 6de925744d2a36421794a0b24a50a3bc052d0e07 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 29 Mar 2017 22:36:32 -0700
Subject: [PATCH 052/119] add lattice

---
 common/lattice.m | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 common/lattice.m

diff --git a/common/lattice.m b/common/lattice.m
new file mode 100644
index 0000000..0f2f515
--- /dev/null
+++ b/common/lattice.m
@@ -0,0 +1,17 @@
+function A = lattice( sz )
+% Build the symmetric sparse adjacency matrix of a d-dimensional grid graph.
+% Nodes are numbered in column-major order; each node is linked to its
+% successor along every axis. Example: plot(graph(lattice([2,2,3])))
+% Input:
+%   sz: 1 x d size of lattice (a scalar sz gives a 1-D chain)
+% Output:
+%   A: prod(sz) x prod(sz) adjacency matrix of an undirected graph
+% Written by Mo Chen (sth4nth@gmail.com)
+sz = reshape(sz,1,[]);      % accept row or column size vectors
+d = numel(sz);
+n = prod(sz);
+M = reshape(1:n,[sz,1]);    % trailing 1 keeps reshape valid when d == 1
+S = arrayfun(@(i) reshape(slice(M,i,1:sz(i)-1),1,[]), 1:d,'UniformOutput',false);   % edge sources along axis i
+T = arrayfun(@(i) reshape(slice(M,i,2:sz(i)),1,[]), 1:d,'UniformOutput',false);     % matching successors
+A = sparse([S{:}],[T{:}],1,n,n);
+A = A+A';                   % add the reverse direction: undirected graph
\ No newline at end of file

From d1b3fe21e87cbcadff6f265ee109060528a08a09 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 2 Apr 2017 04:05:46 +0800
Subject: [PATCH 053/119] fix lognormexp

---
 common/lognormexp.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/lognormexp.m b/common/lognormexp.m
index 10d9f95..8db9c78 100644
--- a/common/lognormexp.m
+++ b/common/lognormexp.m
@@ -7,4 +7,4 @@
     if isempty(dim), dim = 1; end
 end
 s = logsumexp(X,dim);
-Y = exp(X-s);
+Y = X-s;

From 3aedcb4d0d9a4eae460ab002a3d4a18ddff29f94 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 12:53:51 +0800
Subject: [PATCH 054/119] add MRF mean field

---
 chapter08/betheEnergy.m |  11 ++++++
 chapter08/demo.m        |  76 ++++++++++++++++++++++++++++++++++++++++
 chapter08/gibbsEnergy.m |   9 +++++
 chapter08/im2mrf.m      |  20 +++++++++++
 chapter08/letterX.mat   | Bin 0 -> 273 bytes
 chapter08/meanField.m   |  38 ++++++++++++++++++++
 6 files changed, 154 insertions(+)
 create mode 100644 chapter08/betheEnergy.m
 create mode 100644 chapter08/demo.m
 create mode 100644 chapter08/gibbsEnergy.m
 create mode 100644 chapter08/im2mrf.m
 create mode 100644 chapter08/letterX.mat
 create mode 100644 chapter08/meanField.m

diff --git a/chapter08/betheEnergy.m b/chapter08/betheEnergy.m
new file mode 100644
index 0000000..d663e8b
--- /dev/null
+++ b/chapter08/betheEnergy.m
@@ -0,0 +1,11 @@
+function lnZ = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel)
+% Bethe estimate of lnZ (negative Bethe free energy); zero beliefs add 0 entropy (0*log(0)=0).
+% A: graph matrix (nonzero = edge); nodePot,nodeBel: k x n; edgePot,edgeBel: k x k x m
+edgePot = reshape(edgePot,[],size(edgePot,3));
+edgeBel = reshape(edgeBel,[],size(edgeBel,3));
+Ex = dot(nodeBel,nodePot,1);
+Exy = dot(edgeBel,edgePot,1);
+Hx = -dot(nodeBel,log(max(nodeBel,realmin)),1);     % clamp so 0*log(0) gives 0 instead of NaN
+Hxy = -dot(edgeBel,log(max(edgeBel,realmin)),1);
+d = full(sum(logical(A),1));                        % node degrees
+lnZ = -sum(Ex)-sum(Exy)-sum((d-1).*Hx)+sum(Hxy);
diff --git a/chapter08/demo.m b/chapter08/demo.m
new file mode 100644
index 0000000..8005d9a
--- /dev/null
+++ b/chapter08/demo.m
@@ -0,0 +1,76 @@
+clear; close all;
+% load letterA.mat;
+% X = A;
+load letterX.mat
+%% Original image
+epoch = 50;
+J = 1;   % ising parameter
+sigma = 1; % noise level
+
+img = double(X);
+img = sign(img-mean(img(:)));
+
+figure;
+subplot(2,3,1);
+imagesc(img);
+title('Original image');
+axis image;
+colormap gray;
+%% Noisy image
+y = img + sigma*randn(size(img)); % noisy signal
+subplot(2,3,2);
+imagesc(y);
+title('Noisy image');
+axis image;
+colormap gray;
+%% Mean Field
+[A, nodePot, edgePot] = im2mrf(y, sigma, J);
+[nodeBel, edgeBel, lnZ] = meanField(A, nodePot, edgePot, epoch);
+lnZ0 = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel);
+lnZ1 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
+maxdiff(lnZ0, lnZ(end))
+maxdiff(lnZ0, lnZ1)
+
+subplot(2,3,3);
+imagesc(reshape(nodeBel(1,:),size(img)));
+title('MF');
+axis image;
+colormap gray;
+%% Belief Propagation
+% [nodeBel,edgeBel] = belProp(A, nodePot, edgePot, epoch);
+% 
+% [nodeBel0,edgeBel0] = belProp0(A, nodePot, edgePot, epoch);
+% maxdiff(nodeBel,nodeBel0)
+% maxdiff(edgeBel,edgeBel0)
+% 
+% subplot(2,3,4);
+% imagesc(reshape(nodeBel(1,:),size(img)));
+% title('BP');
+% axis image;
+% colormap gray;
+% %% Expectation Propagation
+% [nodeBel,edgeBel] = expProp(A, nodePot, edgePot, epoch);
+% 
+% lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
+% 
+% [nodeBel0,edgeBel0] = expProp0(A, nodePot, edgePot, epoch);
+% maxdiff(nodeBel,nodeBel0)
+% maxdiff(edgeBel,edgeBel0)
+% 
+% subplot(2,3,5);
+% imagesc(reshape(nodeBel(1,:),size(img)));
+% title('EP');
+% axis image;
+% colormap gray;
+% %% EP-BP
+% [nodeBel,edgeBel] = expBelProp(A, nodePot, edgePot, epoch);
+% 
+% [nodeBel0,edgeBel0] = expBelProp0(A, nodePot, edgePot, epoch);
+% maxdiff(nodeBel,nodeBel0)
+% maxdiff(edgeBel,edgeBel0)
+% 
+% subplot(2,3,6);
+% imagesc(reshape(nodeBel(1,:),size(img)));
+% title('EBP');
+% axis image;
+% colormap gray;
diff --git a/chapter08/gibbsEnergy.m b/chapter08/gibbsEnergy.m
new file mode 100644
index 0000000..b4c0aec
--- /dev/null
+++ b/chapter08/gibbsEnergy.m
@@ -0,0 +1,9 @@
+function lnZ = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel)
+% Gibbs estimate of lnZ: minus the expected energy plus the sum of node entropies.
+% Entries with zero belief contribute 0 to the entropy (0*log(0) is taken as 0).
+edgePot = reshape(edgePot,[],size(edgePot,3));
+edgeBel = reshape(edgeBel,[],size(edgeBel,3));
+Ex = dot(nodeBel,nodePot,1);
+Exy = dot(edgeBel,edgePot,1);
+Hx = dot(nodeBel,log(max(nodeBel,realmin)),1);      % negative entropy per node; clamp avoids NaN
+lnZ = -(sum(Ex)+sum(Exy)+sum(Hx));
\ No newline at end of file
diff --git a/chapter08/im2mrf.m b/chapter08/im2mrf.m
new file mode 100644
index 0000000..b960381
--- /dev/null
+++ b/chapter08/im2mrf.m
@@ -0,0 +1,20 @@
+function [A, nodePot, edgePot] = im2mrf(im, sigma, J)
+% Convert a image to Ising MRF with distribution p(x)=exp(-sum(nodePot)-sum(edgePot)-lnZ)
+% Input:
+%   im: row x col image
+%   sigma: standard deviation of the Gaussian node potential (used as sigma^2 below)
+%   J: parameter of Ising edge
+% Output:
+%   A: sparse symmetric matrix; nonzero A(i,j) is the index (1..m) of the edge between pixels i and j
+%   nodePot: 2 x n node potential
+%   edgePot: 2 x 2 x m edge potential
+A = lattice(size(im));
+[s,t,e] = find(tril(A));                    % keep each undirected edge once
+nEdge = numel(e);
+e(:) = 1:nEdge;                             % number the edges 1..m
+A = sparse([s;t],[t;s],[e;e]);              % store the edge index in both directions
+
+z = [1;-1];                                 % the two pixel states: row 1 is +1, row 2 is -1
+y = reshape(im,1,[]);
+nodePot = (y-z).^2/(2*sigma^2);             % 1 x n minus 2 x 1: relies on implicit expansion
+edgePot = repmat(-J*(z*z'),[1, 1, nEdge]);  % every slice is [-J J; J -J]
\ No newline at end of file
diff --git a/chapter08/letterX.mat b/chapter08/letterX.mat
new file mode 100644
index 0000000000000000000000000000000000000000..eab4464282f232af265fc013705b0b4b1a2cf802
GIT binary patch
literal 273
zcmeZu4DoSvQZUssQ1EpO(M`+DN!3vZ$Vn_o%P-2cQV4Jk_w>_Ia4t$sEJ;mK$j`G<
z2q{ff@J}vLFfvduGO{u;w=yvTvJEU1NCpgyp1%AH3=Ew>Trp?w<h#5L3Op_Ri;t<N
z7qA>J{O_p~^mMuI18&nL5BIIUCQ`=moM+7x^OiTs5C0xKeX%%jo{h?WX~7ls&oj3K
zDORp?kl<Qq($N$%;r(e9*5!-37Uw(=pJ8FT!a|DY&94oq8C#AYF{;^>=Mj=TEAaC2
e%=zp$zpO88pL=cnpVN8wb_pGR#HMhVg%to3Ltq8~

literal 0
HcmV?d00001

diff --git a/chapter08/meanField.m b/chapter08/meanField.m
new file mode 100644
index 0000000..2176f53
--- /dev/null
+++ b/chapter08/meanField.m
@@ -0,0 +1,38 @@
+function [nodeBel, edgeBel, lnZ] = meanField(A, nodePot, edgePot, epoch)
+% Mean field for MRF
+% Assuming egdePot is symmetric
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: variational lower bound
+% Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
+if nargin < 4
+    epoch = 10;
+    tol = 1e-4;
+end
+lnZ = -inf(1,epoch+1);
+[nodeBel,L] = softmax(-nodePot,1);    % init nodeBel    
+for iter = 1:epoch
+    for i = 1:numel(L)
+        [~,j,e] = find(A(i,:));             % neighbors
+        np = nodePot(:,i);
+        [lnp ,lnz] = lognormexp(-np-reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
+        p = exp(lnp);
+        L(i) = -dot(p,lnp+np)+lnz; %
+        nodeBel(:,i) = p;
+    end
+    lnZ(iter+1) = sum(L)/2;
+    if abs(lnZ(iter+1)-lnZ(iter))/abs(lnZ(iter)) < tol; break; end
+end
+lnZ = lnZ(2:iter);
+
+[s,t,e] = find(tril(A));
+edgeBel = zeros(size(edgePot));
+for l = 1:numel(e)
+    edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
+end
\ No newline at end of file

From 8ddf99a44058642de4284838cd75fef488fca1c3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 13:21:10 +0800
Subject: [PATCH 055/119] add discrete MRF BP and EP

---
 chapter08/belProp.m         | 63 +++++++++++++++++++++++++++++++++++++
 chapter08/belProp0.m        | 63 +++++++++++++++++++++++++++++++++++++
 chapter08/expProp.m         | 59 ++++++++++++++++++++++++++++++++++
 chapter08/expProp0.m        | 60 +++++++++++++++++++++++++++++++++++
 chapter08/imageMeanField.m  | 18 +++++++++++
 chapter08/isingMeanField.m  | 18 +++++++++++
 chapter08/isingMeanField0.m | 18 +++++++++++
 7 files changed, 299 insertions(+)
 create mode 100644 chapter08/belProp.m
 create mode 100644 chapter08/belProp0.m
 create mode 100644 chapter08/expProp.m
 create mode 100644 chapter08/expProp0.m
 create mode 100644 chapter08/imageMeanField.m
 create mode 100644 chapter08/isingMeanField.m
 create mode 100644 chapter08/isingMeanField0.m

diff --git a/chapter08/belProp.m b/chapter08/belProp.m
new file mode 100644
index 0000000..c9a73da
--- /dev/null
+++ b/chapter08/belProp.m
@@ -0,0 +1,63 @@
+function [nodeBel, edgeBel] = belProp(A, nodePot, edgePot, epoch)
+% Belief propagation for MRF
+% Assuming egdePot is symmetric
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: variational lower bound (Bethe energy)
+% Written by Mo Chen (sth4nth@gmail.com)
+nodePot = exp(-nodePot);  
+edgePot = exp(-edgePot);
+
+tol = 0;
+if nargin < 4
+    epoch = 10;
+    tol = 1e-4;
+end
+[k,n] = size(nodePot);
+m = size(edgePot,3);
+
+[s,t,e] = find(tril(A));               % one (s,t,e) triple per undirected edge
+A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacent matrix, where value is message index
+mu = ones(k,2*m)/k;                     % message
+for iter = 1:epoch
+    mu0 = mu;                           % snapshot for the convergence test below
+    for i = 1:n
+        in = nonzeros(A(:,i));                      % incoming message index
+        nb = nodePot(:,i).*prod(mu(:,in),2);                       % product of incoming message
+        for l = in'
+            ep = edgePot(:,:,ud(l,m));              % potential of the undirected edge behind message l
+            mu(:,rd(l,m)) = normalize(ep*(nb./mu(:,l)));   % reverse-direction message, with message l itself divided out
+        end
+    end
+    if max(abs(mu(:)-mu0(:))) < tol; break; end     % stop once no message entry moved by tol or more
+end
+
+nodeBel = zeros(k,n);
+for i = 1:n
+    nodeBel(:,i) = nodePot(:,i).*prod(mu(:,nonzeros(A(:,i))),2);   % node potential times all incoming messages
+end
+nodeBel = normalize(nodeBel,1);         % NOTE(review): presumably the project's sum-to-one helper along dim 1, not MATLAB's z-score builtin -- confirm
+
+edgeBel = zeros(k,k,m);
+for l = 1:m
+    eij = e(l);
+    eji = eij+m;                        % index of the opposite-direction message
+    ep = edgePot(:,:,eij);
+    nbt = nodeBel(:,t(l))./mu(:,eij);   % belief at t(l) with message eij divided out
+    nbs = nodeBel(:,s(l))./mu(:,eji);   % belief at s(l) with message eji divided out
+    eb = (nbt*nbs').*ep;
+    edgeBel(:,:,eij) = eb./sum(eb(:));  % scale so the k x k table sums to 1
+end
+
+function i = rd(i, m)
+% reverse direction edge index
+i = mod(i+m-1,2*m)+1;                   % swaps l and l+m within 1..2*m
+
+function i = ud(i, m)
+% undirected edge index
+i = mod(i-1,m)+1;                       % sends both l and l+m to l in 1..m
\ No newline at end of file
diff --git a/chapter08/belProp0.m b/chapter08/belProp0.m
new file mode 100644
index 0000000..e59ef62
--- /dev/null
+++ b/chapter08/belProp0.m
@@ -0,0 +1,63 @@
+function [nodeBel, edgeBel] = belProp0(A, nodePot, edgePot, epoch)
+% Belief propagation for MRF, calculation in log scale
+% Assuming egdePot is symmetric
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: variational lower bound (Bethe energy)
+% Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
+if nargin < 4
+    epoch = 10;
+    tol = 1e-4;
+end
+[k,n] = size(nodePot);
+m = size(edgePot,3);
+
+[s,t,e] = find(tril(A));
+A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacent matrix, where value is message index
+mu = zeros(k,2*m)-log(k);              % message
+for iter = 1:epoch
+    mu0 = mu;
+    for i = 1:n
+        in = nonzeros(A(:,i));                      % incoming message index
+        nb = -nodePot(:,i)+sum(mu(:,in),2);                       % product of incoming message
+        for l = in'
+            ep = edgePot(:,:,ud(l,m));
+            mut = logsumexp(-ep+(nb-mu(:,l)),1);
+            mu(:,rd(l,m)) = mut-logsumexp(mut);
+        end
+    end
+    if max(abs(mu(:)-mu0(:))) < tol; break; end
+end
+
+nodeBel = zeros(k,n);
+for i = 1:n
+    nb = -nodePot(:,i)+sum(mu(:,nonzeros(A(:,i))),2);
+    nodeBel(:,i) = nb-logsumexp(nb);
+end
+
+edgeBel = zeros(k,k,m);
+for l = 1:m
+    eij = e(l);
+    eji = eij+m;
+    ep = edgePot(:,:,eij);
+    nbt = nodeBel(:,t(l))-mu(:,eij);
+    nbs = nodeBel(:,s(l))-mu(:,eji);
+    eb = (nbt+nbs')-ep;
+    edgeBel(:,:,eij) = eb-logsumexp(eb(:));
+end
+nodeBel = exp(nodeBel);
+edgeBel = exp(edgeBel);
+
+function i = rd(i, m)
+% reverse direction edge index
+i = mod(i+m-1,2*m)+1;
+
+function i = ud(i, m)
+% undirected edge index
+i = mod(i-1,m)+1;
\ No newline at end of file
diff --git a/chapter08/expProp.m b/chapter08/expProp.m
new file mode 100644
index 0000000..a8f42b3
--- /dev/null
+++ b/chapter08/expProp.m
@@ -0,0 +1,59 @@
+function [nodeBel, edgeBel] = expProp(A, nodePot, edgePot, epoch)
+% Expectation propagation for MRF
+% Assuming egdePot is symmetric
+% Another implementation with precompute nodeBel and update during iterations
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: variational lower bound (Bethe energy)
+% Written by Mo Chen (sth4nth@gmail.com)
+
+% working in exp domain
+nodePot = exp(-nodePot);  
+edgePot = exp(-edgePot);
+
+tol = 0;
+if nargin < 4
+    epoch = 10;
+    tol = 1e-4;
+end
+k = size(nodePot,1);
+m = size(edgePot,3);
+
+[s,t,e] = find(tril(A));    % one (s,t,e) triple per undirected edge
+mu = ones(k,2*m)/k;         % message
+nodeBel = normalize(nodePot,1);     % NOTE(review): presumably the project's sum-to-one helper along dim 1 -- confirm
+for iter = 1:epoch
+    mu0 = mu;               % snapshot for the convergence test below
+    for l = 1:m
+        i = s(l);
+        j = t(l);
+        eij = e(l);
+        eji = eij+m;        % index of the opposite-direction message
+        ep = edgePot(:,:,eij);
+
+        nodeBel(:,j) = nodeBel(:,j)./mu(:,eij);                  % divide the old message eij out of j's belief
+        mu(:,eij) = normalize(ep*(nodeBel(:,i)./mu(:,eji)));     % new message from i's belief without message eji
+        nodeBel(:,j) = normalize(nodeBel(:,j).*mu(:,eij));       % fold the new message back in
+        
+        nodeBel(:,i) = nodeBel(:,i)./mu(:,eji);                  % same three steps in the other direction
+        mu(:,eji) = normalize(ep*(nodeBel(:,j)./mu(:,eij)));
+        nodeBel(:,i) = normalize(nodeBel(:,i).*mu(:,eji));
+    end
+    if max(abs(mu(:)-mu0(:))) < tol; break; end   % stop once no message entry moved by tol or more
+end
+
+edgeBel = zeros(k,k,m);
+for l = 1:m
+    eij = e(l);
+    eji = eij+m;
+    ep = edgePot(:,:,eij);
+    nbt = nodeBel(:,t(l))./mu(:,eij);   % belief at t(l) with message eij divided out
+    nbs = nodeBel(:,s(l))./mu(:,eji);   % belief at s(l) with message eji divided out
+    eb = (nbt*nbs').*ep;
+    edgeBel(:,:,eij) = eb./sum(eb(:));  % scale so the k x k table sums to 1
+end
diff --git a/chapter08/expProp0.m b/chapter08/expProp0.m
new file mode 100644
index 0000000..d6f2eb1
--- /dev/null
+++ b/chapter08/expProp0.m
@@ -0,0 +1,60 @@
+function [nodeBel, edgeBel] = expProp0(A, nodePot, edgePot, epoch)
+% Expectation propagation for MRF, calculation in log scale
+% Assuming egdePot is symmetric
+% Another implementation with precompute nodeBel and update during iterations
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: variational lower bound (Bethe energy)
+% Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
+if nargin < 4
+    epoch = 10;
+    tol = 1e-4;
+end
+k = size(nodePot,1);
+m = size(edgePot,3);
+
+[s,t,e] = find(tril(A));
+mu = zeros(k,2*m)-log(k);    
+nodeBel = -nodePot-logsumexp(-nodePot,1);
+for iter = 1:epoch
+    mu0 = mu;
+    for l = 1:m
+        i = s(l);
+        j = t(l);
+        eij = e(l);
+        eji = eij+m;
+        ep = edgePot(:,:,eij);
+
+        nodeBel(:,j) = nodeBel(:,j)-mu(:,eij);
+        mut = logsumexp(-ep+(nodeBel(:,i)-mu(:,eji)),1);
+        mu(:,eij) = mut-logsumexp(mut);
+        nb = nodeBel(:,j)+mu(:,eij);
+        nodeBel(:,j) = nb-logsumexp(nb);
+        
+        nodeBel(:,i) = nodeBel(:,i)-mu(:,eji);
+        mut = logsumexp(-ep+(nodeBel(:,j)-mu(:,eij)),1);
+        mu(:,eji) = mut-logsumexp(mut);
+        nb = nodeBel(:,i)+mu(:,eji);
+        nodeBel(:,i) = nb-logsumexp(nb);
+    end
+    if max(abs(mu(:)-mu0(:))) < tol; break; end
+end
+
+edgeBel = zeros(k,k,m);
+for l = 1:m
+    eij = e(l);
+    eji = eij+m;
+    ep = edgePot(:,:,eij);
+    nbt = nodeBel(:,t(l))-mu(:,eij);
+    nbs = nodeBel(:,s(l))-mu(:,eji);
+    eb = (nbt+nbs')-ep;
+    edgeBel(:,:,eij) = eb-logsumexp(eb(:));
+end
+nodeBel = exp(nodeBel);
+edgeBel = exp(edgeBel);
\ No newline at end of file
diff --git a/chapter08/imageMeanField.m b/chapter08/imageMeanField.m
new file mode 100644
index 0000000..a747f75
--- /dev/null
+++ b/chapter08/imageMeanField.m
@@ -0,0 +1,18 @@
+function nodeBel = imageMeanField(M, N, nodePot, edgePot, epoch)
+if nargin < 5
+    epoch = 10;
+end
+stride = [-1,1,-M,M];
+nodeBel = softmax(-nodePot,1);
+for t = 1:epoch
+    for j = 1:N
+        for i = 1:M
+            pos = i + M*(j-1);
+            ne = pos + stride;
+            ne([i,i,j,j] == [1,M,1,N]) = [];
+            nodeBel(:,pos) = softmax(-edgePot*sum(nodeBel(:,ne),2)-nodePot(:,pos));
+        end
+    end
+end 
+
+
diff --git a/chapter08/isingMeanField.m b/chapter08/isingMeanField.m
new file mode 100644
index 0000000..81a9887
--- /dev/null
+++ b/chapter08/isingMeanField.m
@@ -0,0 +1,18 @@
+function mu = isingMeanField(J, h, epoch)
+if nargin < 3
+    epoch = 10;
+end
+[M,N] = size(h);
+mu =  tanh(h);
+stride = [-1,1,-M,M];
+for t = 1:epoch
+    for j = 1:N
+        for i = 1:M
+            pos = i + M*(j-1);
+            ne = pos + stride;
+            ne([i,i,j,j] == [1,M,1,N]) = [];
+            mu(i,j) = tanh(J*sum(mu(ne)) + h(i,j));
+        end
+    end
+end 
+
diff --git a/chapter08/isingMeanField0.m b/chapter08/isingMeanField0.m
new file mode 100644
index 0000000..f68cba0
--- /dev/null
+++ b/chapter08/isingMeanField0.m
@@ -0,0 +1,18 @@
+function mu = isingMeanField0(J, h, epoch)
+% use padding trick
+if nargin < 3
+    epoch = 10;
+end
+mu = zeros(size(h)+2);                        % padding
+[m,n] = size(mu);
+mu(2:m-1,2:n-1) = tanh(h);               % init
+stride = [-1,1,-m,m];
+for t = 1:epoch
+    for j = 2:n-1
+        for i = 2:m-1
+            ne = i + m*(j-1) + stride;
+            mu(i,j) = tanh(J*sum(mu(ne))+h(i-1,j-1));
+        end
+    end
+end
+mu = mu(2:m-1,2:n-1);
\ No newline at end of file

From d7cdb1af170f9f85c0bfd1b7eae3f5e1ed144b60 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 14:16:39 +0800
Subject: [PATCH 056/119] refine MRF mean field

---
 chapter08/demo.m         | 13 ++++++-------
 chapter08/im2mrf.m       |  2 +-
 chapter08/meanField.m    | 38 --------------------------------------
 chapter08/mrfMeanField.m | 28 ++++++++++++++++++++++++++++
 4 files changed, 35 insertions(+), 46 deletions(-)
 delete mode 100644 chapter08/meanField.m
 create mode 100644 chapter08/mrfMeanField.m

diff --git a/chapter08/demo.m b/chapter08/demo.m
index 8005d9a..04a10fc 100644
--- a/chapter08/demo.m
+++ b/chapter08/demo.m
@@ -24,16 +24,15 @@
 axis image;
 colormap gray;
 %% Mean Field
-[A, nodePot, edgePot] = im2mrf(y, sigma, J);
-[nodeBel, edgeBel, lnZ] = meanField(A, nodePot, edgePot, epoch);
-lnZ0 = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel);
-lnZ1 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
-maxdiff(lnZ0, lnZ(end))
-maxdiff(lnZ0, lnZ1)
+[A, nodePot, edgePot] = im2mrf(y, J, sigma);
+[nodeBel, edgeBel] = mrfMeanField(A, nodePot, edgePot, epoch);
+lnZ = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel);
+lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
+maxdiff(lnZ, lnZ0)
 
 subplot(2,3,3);
 imagesc(reshape(nodeBel(1,:),size(img)));
-title('MF');
+title('Mean Field');
 axis image;
 colormap gray;
 %% Belief Propagation
diff --git a/chapter08/im2mrf.m b/chapter08/im2mrf.m
index b960381..3d9e173 100644
--- a/chapter08/im2mrf.m
+++ b/chapter08/im2mrf.m
@@ -1,4 +1,4 @@
-function [A, nodePot, edgePot] = im2mrf(im, sigma, J)
+function [A, nodePot, edgePot] = im2mrf(im, J, sigma)
 % Convert a image to Ising MRF with distribution p(x)=exp(-sum(nodePot)-sum(edgePot)-lnZ)
 % Input:
 %   im: row x col image
diff --git a/chapter08/meanField.m b/chapter08/meanField.m
deleted file mode 100644
index 2176f53..0000000
--- a/chapter08/meanField.m
+++ /dev/null
@@ -1,38 +0,0 @@
-function [nodeBel, edgeBel, lnZ] = meanField(A, nodePot, edgePot, epoch)
-% Mean field for MRF
-% Assuming egdePot is symmetric
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential
-%   edgePot: k x k x m edge potential
-% Output:
-%   nodeBel: k x n node belief
-%   edgeBel: k x k x m edge belief
-%   L: variational lower bound
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 10;
-    tol = 1e-4;
-end
-lnZ = -inf(1,epoch+1);
-[nodeBel,L] = softmax(-nodePot,1);    % init nodeBel    
-for iter = 1:epoch
-    for i = 1:numel(L)
-        [~,j,e] = find(A(i,:));             % neighbors
-        np = nodePot(:,i);
-        [lnp ,lnz] = lognormexp(-np-reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
-        p = exp(lnp);
-        L(i) = -dot(p,lnp+np)+lnz; %
-        nodeBel(:,i) = p;
-    end
-    lnZ(iter+1) = sum(L)/2;
-    if abs(lnZ(iter+1)-lnZ(iter))/abs(lnZ(iter)) < tol; break; end
-end
-lnZ = lnZ(2:iter);
-
-[s,t,e] = find(tril(A));
-edgeBel = zeros(size(edgePot));
-for l = 1:numel(e)
-    edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
-end
\ No newline at end of file
diff --git a/chapter08/mrfMeanField.m b/chapter08/mrfMeanField.m
new file mode 100644
index 0000000..091c964
--- /dev/null
+++ b/chapter08/mrfMeanField.m
@@ -0,0 +1,28 @@
+function [nodeBel, edgeBel, lnZ] = mrfMeanField(A, nodePot, edgePot, epoch)
+% Mean field for MRF (Assuming that egdePot is symmetric)
+% p(x)=exp(-E(x))/Z, E(x)=\sum(edgePot)+sum(nodePot)
+% Input: 
+%   A: n x n adjacent matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential 
+%   edgePot: k x k x m edge potential 
+% Output:
+%   nodeBel: k x n node belief q(x_i)
+%   edgeBel: k x k x m edge belief q(x_i,x_j)
+% Written by Mo Chen (sth4nth@gmail.com)
+if nargin < 4
+    epoch = 50;
+end
+lnZ = -inf(1,epoch+1);
+[nodeBel,L] = softmax(-nodePot,1);    % init nodeBel    
+for iter = 1:epoch
+    for i = 1:numel(L)
+        [~,j,e] = find(A(i,:));             % neighbors
+        nodeBel(:,i) = softmax(-nodePot(:,i)-reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
+    end
+end
+
+[s,t,e] = find(tril(A));
+edgeBel = zeros(size(edgePot));
+for l = 1:numel(e)
+    edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
+end
\ No newline at end of file

From a24ec5a3353dd073b52ad74414a4f7b030bf9b8e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 14:51:02 +0800
Subject: [PATCH 057/119] refine MRF

---
 chapter08/belProp0.m                  | 63 ---------------------------
 chapter08/demo.m                      | 57 ++++++++----------------
 chapter08/expProp0.m                  | 60 -------------------------
 chapter08/{belProp.m => mrfBelProp.m} | 17 ++++----
 chapter08/{expProp.m => mrfExpProp.m} | 18 +++-----
 chapter08/mrfMeanField.m              |  7 ++-
 6 files changed, 39 insertions(+), 183 deletions(-)
 delete mode 100644 chapter08/belProp0.m
 delete mode 100644 chapter08/expProp0.m
 rename chapter08/{belProp.m => mrfBelProp.m} (88%)
 rename chapter08/{expProp.m => mrfExpProp.m} (80%)

diff --git a/chapter08/belProp0.m b/chapter08/belProp0.m
deleted file mode 100644
index e59ef62..0000000
--- a/chapter08/belProp0.m
+++ /dev/null
@@ -1,63 +0,0 @@
-function [nodeBel, edgeBel] = belProp0(A, nodePot, edgePot, epoch)
-% Belief propagation for MRF, calculation in log scale
-% Assuming egdePot is symmetric
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential
-%   edgePot: k x k x m edge potential
-% Output:
-%   nodeBel: k x n node belief
-%   edgeBel: k x k x m edge belief
-%   L: variational lower bound (Bethe energy)
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 10;
-    tol = 1e-4;
-end
-[k,n] = size(nodePot);
-m = size(edgePot,3);
-
-[s,t,e] = find(tril(A));
-A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacent matrix, where value is message index
-mu = zeros(k,2*m)-log(k);              % message
-for iter = 1:epoch
-    mu0 = mu;
-    for i = 1:n
-        in = nonzeros(A(:,i));                      % incoming message index
-        nb = -nodePot(:,i)+sum(mu(:,in),2);                       % product of incoming message
-        for l = in'
-            ep = edgePot(:,:,ud(l,m));
-            mut = logsumexp(-ep+(nb-mu(:,l)),1);
-            mu(:,rd(l,m)) = mut-logsumexp(mut);
-        end
-    end
-    if max(abs(mu(:)-mu0(:))) < tol; break; end
-end
-
-nodeBel = zeros(k,n);
-for i = 1:n
-    nb = -nodePot(:,i)+sum(mu(:,nonzeros(A(:,i))),2);
-    nodeBel(:,i) = nb-logsumexp(nb);
-end
-
-edgeBel = zeros(k,k,m);
-for l = 1:m
-    eij = e(l);
-    eji = eij+m;
-    ep = edgePot(:,:,eij);
-    nbt = nodeBel(:,t(l))-mu(:,eij);
-    nbs = nodeBel(:,s(l))-mu(:,eji);
-    eb = (nbt+nbs')-ep;
-    edgeBel(:,:,eij) = eb-logsumexp(eb(:));
-end
-nodeBel = exp(nodeBel);
-edgeBel = exp(edgeBel);
-
-function i = rd(i, m)
-% reverse direction edge index
-i = mod(i+m-1,2*m)+1;
-
-function i = ud(i, m)
-% undirected edge index
-i = mod(i-1,m)+1;
\ No newline at end of file
diff --git a/chapter08/demo.m b/chapter08/demo.m
index 04a10fc..fd24aab 100644
--- a/chapter08/demo.m
+++ b/chapter08/demo.m
@@ -30,46 +30,27 @@
 lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
 maxdiff(lnZ, lnZ0)
 
-subplot(2,3,3);
+subplot(2,3,4);
 imagesc(reshape(nodeBel(1,:),size(img)));
 title('Mean Field');
 axis image;
 colormap gray;
 %% Belief Propagation
-% [nodeBel,edgeBel] = belProp(A, nodePot, edgePot, epoch);
-% 
-% [nodeBel0,edgeBel0] = belProp0(A, nodePot, edgePot, epoch);
-% maxdiff(nodeBel,nodeBel0)
-% maxdiff(edgeBel,edgeBel0)
-% 
-% subplot(2,3,4);
-% imagesc(reshape(nodeBel(1,:),size(img)));
-% title('BP');
-% axis image;
-% colormap gray;
-% %% Expectation Propagation
-% [nodeBel,edgeBel] = expProp(A, nodePot, edgePot, epoch);
-% 
-% lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
-% 
-% [nodeBel0,edgeBel0] = expProp0(A, nodePot, edgePot, epoch);
-% maxdiff(nodeBel,nodeBel0)
-% maxdiff(edgeBel,edgeBel0)
-% 
-% subplot(2,3,5);
-% imagesc(reshape(nodeBel(1,:),size(img)));
-% title('EP');
-% axis image;
-% colormap gray;
-% %% EP-BP
-% [nodeBel,edgeBel] = expBelProp(A, nodePot, edgePot, epoch);
-% 
-% [nodeBel0,edgeBel0] = expBelProp0(A, nodePot, edgePot, epoch);
-% maxdiff(nodeBel,nodeBel0)
-% maxdiff(edgeBel,edgeBel0)
-% 
-% subplot(2,3,6);
-% imagesc(reshape(nodeBel(1,:),size(img)));
-% title('EBP');
-% axis image;
-% colormap gray;
+[nodeBel,edgeBel] = mrfBelProp(A, nodePot, edgePot, epoch);
+lnZ = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
+
+subplot(2,3,5);
+imagesc(reshape(nodeBel(1,:),size(img)));
+title('Belief propagation');
+axis image;
+colormap gray;
+%% Expectation Propagation
+[nodeBel,edgeBel] = mrfExpProp(A, nodePot, edgePot, epoch);
+lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
+maxdiff(lnZ, lnZ0)
+
+subplot(2,3,6);
+imagesc(reshape(nodeBel(1,:),size(img)));
+title('Expectation Propagation');
+axis image;
+colormap gray;
diff --git a/chapter08/expProp0.m b/chapter08/expProp0.m
deleted file mode 100644
index d6f2eb1..0000000
--- a/chapter08/expProp0.m
+++ /dev/null
@@ -1,60 +0,0 @@
-function [nodeBel, edgeBel] = expProp0(A, nodePot, edgePot, epoch)
-% Expectation propagation for MRF, calculation in log scale
-% Assuming egdePot is symmetric
-% Another implementation with precompute nodeBel and update during iterations
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential
-%   edgePot: k x k x m edge potential
-% Output:
-%   nodeBel: k x n node belief
-%   edgeBel: k x k x m edge belief
-%   L: variational lower bound (Bethe energy)
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 10;
-    tol = 1e-4;
-end
-k = size(nodePot,1);
-m = size(edgePot,3);
-
-[s,t,e] = find(tril(A));
-mu = zeros(k,2*m)-log(k);    
-nodeBel = -nodePot-logsumexp(-nodePot,1);
-for iter = 1:epoch
-    mu0 = mu;
-    for l = 1:m
-        i = s(l);
-        j = t(l);
-        eij = e(l);
-        eji = eij+m;
-        ep = edgePot(:,:,eij);
-
-        nodeBel(:,j) = nodeBel(:,j)-mu(:,eij);
-        mut = logsumexp(-ep+(nodeBel(:,i)-mu(:,eji)),1);
-        mu(:,eij) = mut-logsumexp(mut);
-        nb = nodeBel(:,j)+mu(:,eij);
-        nodeBel(:,j) = nb-logsumexp(nb);
-        
-        nodeBel(:,i) = nodeBel(:,i)-mu(:,eji);
-        mut = logsumexp(-ep+(nodeBel(:,j)-mu(:,eij)),1);
-        mu(:,eji) = mut-logsumexp(mut);
-        nb = nodeBel(:,i)+mu(:,eji);
-        nodeBel(:,i) = nb-logsumexp(nb);
-    end
-    if max(abs(mu(:)-mu0(:))) < tol; break; end
-end
-
-edgeBel = zeros(k,k,m);
-for l = 1:m
-    eij = e(l);
-    eji = eij+m;
-    ep = edgePot(:,:,eij);
-    nbt = nodeBel(:,t(l))-mu(:,eij);
-    nbs = nodeBel(:,s(l))-mu(:,eji);
-    eb = (nbt+nbs')-ep;
-    edgeBel(:,:,eij) = eb-logsumexp(eb(:));
-end
-nodeBel = exp(nodeBel);
-edgeBel = exp(edgeBel);
\ No newline at end of file
diff --git a/chapter08/belProp.m b/chapter08/mrfBelProp.m
similarity index 88%
rename from chapter08/belProp.m
rename to chapter08/mrfBelProp.m
index c9a73da..a556d9c 100644
--- a/chapter08/belProp.m
+++ b/chapter08/mrfBelProp.m
@@ -1,6 +1,5 @@
-function [nodeBel, edgeBel] = belProp(A, nodePot, edgePot, epoch)
-% Belief propagation for MRF
-% Assuming egdePot is symmetric
+function [nodeBel, edgeBel] = mrfBelProp(A, nodePot, edgePot, epoch)
+% Belief propagation for MRF (Assuming that egdePot is symmetric)
 % Input: 
 %   A: n x n adjacent matrix of undirected graph, where value is edge index
 %   nodePot: k x n node potential
@@ -8,16 +7,16 @@
 % Output:
 %   nodeBel: k x n node belief
 %   edgeBel: k x k x m edge belief
-%   L: variational lower bound (Bethe energy)
 % Written by Mo Chen (sth4nth@gmail.com)
-nodePot = exp(-nodePot);  
-edgePot = exp(-edgePot);
-
 tol = 0;
 if nargin < 4
-    epoch = 10;
-    tol = 1e-4;
+    epoch = 50;
+    tol = 1e-8;
 end
+
+nodePot = exp(-nodePot);  
+edgePot = exp(-edgePot);
+
 [k,n] = size(nodePot);
 m = size(edgePot,3);
 
diff --git a/chapter08/expProp.m b/chapter08/mrfExpProp.m
similarity index 80%
rename from chapter08/expProp.m
rename to chapter08/mrfExpProp.m
index a8f42b3..26969f2 100644
--- a/chapter08/expProp.m
+++ b/chapter08/mrfExpProp.m
@@ -1,7 +1,5 @@
-function [nodeBel, edgeBel] = expProp(A, nodePot, edgePot, epoch)
-% Expectation propagation for MRF
-% Assuming egdePot is symmetric
-% Another implementation with precompute nodeBel and update during iterations
+function [nodeBel, edgeBel] = mrfExpProp(A, nodePot, edgePot, epoch)
+% Expectation propagation for MRF (Assuming that egdePot is symmetric)
 % Input: 
 %   A: n x n adjacent matrix of undirected graph, where value is edge index
 %   nodePot: k x n node potential
@@ -9,18 +7,16 @@
 % Output:
 %   nodeBel: k x n node belief
 %   edgeBel: k x k x m edge belief
-%   L: variational lower bound (Bethe energy)
 % Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
+if nargin < 4
+    epoch = 50;
+    tol = 1e-8;
+end
 
-% working in exp domain
 nodePot = exp(-nodePot);  
 edgePot = exp(-edgePot);
 
-tol = 0;
-if nargin < 4
-    epoch = 10;
-    tol = 1e-4;
-end
 k = size(nodePot,1);
 m = size(edgePot,3);
 
diff --git a/chapter08/mrfMeanField.m b/chapter08/mrfMeanField.m
index 091c964..2f767cd 100644
--- a/chapter08/mrfMeanField.m
+++ b/chapter08/mrfMeanField.m
@@ -1,4 +1,4 @@
-function [nodeBel, edgeBel, lnZ] = mrfMeanField(A, nodePot, edgePot, epoch)
+function [nodeBel, edgeBel] = mrfMeanField(A, nodePot, edgePot, epoch)
 % Mean field for MRF (Assuming that egdePot is symmetric)
 % p(x)=exp(-E(x))/Z, E(x)=\sum(edgePot)+sum(nodePot)
 % Input: 
@@ -9,16 +9,19 @@
 %   nodeBel: k x n node belief q(x_i)
 %   edgeBel: k x k x m edge belief q(x_i,x_j)
 % Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
 if nargin < 4
     epoch = 50;
+    tol = 1e-8;
 end
-lnZ = -inf(1,epoch+1);
 [nodeBel,L] = softmax(-nodePot,1);    % init nodeBel    
 for iter = 1:epoch
+    nodeBel0 = nodeBel;
     for i = 1:numel(L)
         [~,j,e] = find(A(i,:));             % neighbors
         nodeBel(:,i) = softmax(-nodePot(:,i)-reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
     end
+    if max(abs(nodeBel(:)-nodeBel0(:))) < tol; break; end
 end
 
 [s,t,e] = find(tril(A));

From cc83a35fad2cdd358fbcbc79d573e3e97ac1eed8 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 15:13:31 +0800
Subject: [PATCH 058/119] add Ising mean field

---
 chapter08/demo.m            | 23 ++++++++++++++++-------
 chapter08/imageMeanField.m  | 18 ------------------
 chapter08/isingMeanField.m  | 14 +++++++++++++-
 chapter08/isingMeanField0.m | 18 ------------------
 4 files changed, 29 insertions(+), 44 deletions(-)
 delete mode 100644 chapter08/imageMeanField.m
 delete mode 100644 chapter08/isingMeanField0.m

diff --git a/chapter08/demo.m b/chapter08/demo.m
index fd24aab..5cede61 100644
--- a/chapter08/demo.m
+++ b/chapter08/demo.m
@@ -1,12 +1,6 @@
 clear; close all;
-% load letterA.mat;
-% X = A;
-load letterX.mat
 %% Original image
-epoch = 50;
-J = 1;   % ising parameter
-sigma = 1; % noise level
-
+load letterX.mat
 img = double(X);
 img = sign(img-mean(img(:)));
 
@@ -18,11 +12,16 @@
 colormap gray;
 %% Noisy image
 y = img + sigma*randn(size(img)); % noisy signal
+
 subplot(2,3,2);
 imagesc(y);
 title('Noisy image');
 axis image;
 colormap gray;
+%% Parameters
+epoch = 50;
+J = 1;   % Ising parameter
+sigma = 1; % noise level
 %% Mean Field
 [A, nodePot, edgePot] = im2mrf(y, J, sigma);
 [nodeBel, edgeBel] = mrfMeanField(A, nodePot, edgePot, epoch);
@@ -35,6 +34,16 @@
 title('Mean Field');
 axis image;
 colormap gray;
+%% Ising Mean Field 
+h = reshape(0.5*diff(nodePot),size(img));
+mu = isingMeanField(J, h, epoch);
+maxdiff(reshape(mu,1,[]), [1,-1]*nodeBel)
+
+subplot(2,3,3);
+imagesc(mu)
+title('Ising Mean Field');
+axis image;
+colormap gray;
 %% Belief Propagation
 [nodeBel,edgeBel] = mrfBelProp(A, nodePot, edgePot, epoch);
 lnZ = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
diff --git a/chapter08/imageMeanField.m b/chapter08/imageMeanField.m
deleted file mode 100644
index a747f75..0000000
--- a/chapter08/imageMeanField.m
+++ /dev/null
@@ -1,18 +0,0 @@
-function nodeBel = imageMeanField(M, N, nodePot, edgePot, epoch)
-if nargin < 5
-    epoch = 10;
-end
-stride = [-1,1,-M,M];
-nodeBel = softmax(-nodePot,1);
-for t = 1:epoch
-    for j = 1:N
-        for i = 1:M
-            pos = i + M*(j-1);
-            ne = pos + stride;
-            ne([i,i,j,j] == [1,M,1,N]) = [];
-            nodeBel(:,pos) = softmax(-edgePot*sum(nodeBel(:,ne),2)-nodePot(:,pos));
-        end
-    end
-end 
-
-
diff --git a/chapter08/isingMeanField.m b/chapter08/isingMeanField.m
index 81a9887..ad7d286 100644
--- a/chapter08/isingMeanField.m
+++ b/chapter08/isingMeanField.m
@@ -1,11 +1,22 @@
 function mu = isingMeanField(J, h, epoch)
+% Mean field for 2d Ising model
+% Input: 
+%   J: scalar edge potential
+%   h: M X N image size node potential
+%   edgePot: k x k x m edge potential 
+% Output:
+%   mu: M x N image size expectation
+% Written by Mo Chen (sth4nth@gmail.com)
+tol = 0;
 if nargin < 3
-    epoch = 10;
+    epoch = 50;
+    tol = 1e-8;
 end
 [M,N] = size(h);
 mu =  tanh(h);
 stride = [-1,1,-M,M];
 for t = 1:epoch
+    mu0 = mu;
     for j = 1:N
         for i = 1:M
             pos = i + M*(j-1);
@@ -14,5 +25,6 @@
             mu(i,j) = tanh(J*sum(mu(ne)) + h(i,j));
         end
     end
+    if max(abs(mu(:)-mu0(:))) < tol; break; end
 end 
 
diff --git a/chapter08/isingMeanField0.m b/chapter08/isingMeanField0.m
deleted file mode 100644
index f68cba0..0000000
--- a/chapter08/isingMeanField0.m
+++ /dev/null
@@ -1,18 +0,0 @@
-function mu = isingMeanField0(J, h, epoch)
-% use padding trick
-if nargin < 3
-    epoch = 10;
-end
-mu = zeros(size(h)+2);                        % padding
-[m,n] = size(mu);
-mu(2:m-1,2:n-1) = tanh(h);               % init
-stride = [-1,1,-m,m];
-for t = 1:epoch
-    for j = 2:n-1
-        for i = 2:m-1
-            ne = i + m*(j-1) + stride;
-            mu(i,j) = tanh(J*sum(mu(ne))+h(i-1,j-1));
-        end
-    end
-end
-mu = mu(2:m-1,2:n-1);
\ No newline at end of file

From 1a9cfa8c1a46fe99ffbd879496ce72c460c95fe3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 15:16:03 +0800
Subject: [PATCH 059/119] move demo

---
 chapter08/demo.m => demo/ch08/mrf_demo.m | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename chapter08/demo.m => demo/ch08/mrf_demo.m (100%)

diff --git a/chapter08/demo.m b/demo/ch08/mrf_demo.m
similarity index 100%
rename from chapter08/demo.m
rename to demo/ch08/mrf_demo.m

From 81c8932c7d4df0db8dc9fc443257544906ce6295 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 28 May 2017 15:16:41 +0800
Subject: [PATCH 060/119] move data

---
 {chapter08 => demo/ch08}/letterX.mat | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {chapter08 => demo/ch08}/letterX.mat (100%)

diff --git a/chapter08/letterX.mat b/demo/ch08/letterX.mat
similarity index 100%
rename from chapter08/letterX.mat
rename to demo/ch08/letterX.mat

From f3c82fbc54d68f305acb36fd45d4d07fb8fd1956 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 31 May 2017 00:40:18 +0800
Subject: [PATCH 061/119] Update README.md

---
 README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 5f53e50..191a1f3 100644
--- a/README.md
+++ b/README.md
@@ -3,29 +3,29 @@ Introduction
 This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-Note: this package requires Matlab R2016b or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
+Note: this package requires Matlab **R2016b** or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
 
 Description
 -------
 The design goal of the code are as follows:
 
-1. Succinct: Code is extremely terse. Minimizing the number of line of code is one of the primal target. As a result, the core of the algorithms can be easily spot.
-2. Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementation. Usually, functions in this package are orders faster than Matlab builtin functions which provide the same functionality (eg. kmeans). If anyone found any Matlab implementation that is faster than mine, I am happy to further optimize.
-3. Robust: Many numerical stability techniques are applied, such as probability computation in log scale to avoid numerical underflow and overflow, square root form update of symmetric matrix, etc.
-4. Easy to learn: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
-5. Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
+* Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
+* Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.
+* Robust: Many tricks for numerical stability are applied, such as probability computation in log scale and square root matrix update to enforce matrix symmetry, etc.
+* Learnable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
+* Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
 
 Installation
 -------
-1. Download the package by running: `git clone https://github.com/PRML/PRMLT.git`.
+1. Download the package to your local path (e.g. PRMLT/) by running: `git clone https://github.com/PRML/PRMLT.git`.
 
-2. Run Matlab and navigate to package location as working directory, then run the init.m script.
+2. Run Matlab and navigate to PRMLT/, then run the init.m script.
 
-3. Run some demos in the your_location/demo directory. Enjoy!
+3. Run some demos in PRMLT/demo directory. Enjoy!
 
 FeedBack
 -------
-If you found any bug or have any suggestion, please do fire issues. I am graceful for any feedback and will do my best to improve this package.
+If you find any bug or have any suggestion, please do file issues. I am grateful for any feedback and will do my best to improve this package.
 
 License
 -------

From dc1acefcb8f2f0ff070dbe486d18f13371833e14 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 31 May 2017 00:45:27 +0800
Subject: [PATCH 062/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 191a1f3..73bb290 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Note: this package requires Matlab **R2016b** or later, since it utilizes a new
 
 Description
 -------
-The design goal of the code are as follows:
+While developing this package, I stick to following prinples
 
 * Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
 * Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.

From b72f9a680c5bafe71d3a616270849e5d2c7867b3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 31 May 2017 00:45:49 +0800
Subject: [PATCH 063/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 73bb290..1cd2db2 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Note: this package requires Matlab **R2016b** or later, since it utilizes a new
 
 Description
 -------
-While developing this package, I stick to following prinples
+While developing this package, I stick to following principles
 
 * Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
 * Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.

From 75f3cdf5ecd224ca06d73ea69379e8dcc1850143 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 31 May 2017 15:00:26 +0800
Subject: [PATCH 064/119] fix a minor bug

---
 chapter05/mlp.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
index e19105c..baf78a1 100644
--- a/chapter05/mlp.m
+++ b/chapter05/mlp.m
@@ -17,7 +17,7 @@
 for l = 1:L-1
     W{l} = randn(h(l),h(l+1));
 end
-Z = cell(L);
+Z = cell(1,L);
 Z{1} = X;
 maxiter = 200;
 mse = zeros(1,maxiter);

From 66a59ca44cd437f39d6e4e74084b6586633e7cc6 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 31 May 2017 18:39:20 +0800
Subject: [PATCH 065/119] refine mlp

---
 chapter05/mlp.m | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
index baf78a1..19e8b76 100644
--- a/chapter05/mlp.m
+++ b/chapter05/mlp.m
@@ -1,17 +1,15 @@
-function [model, mse] = mlp(X, Y, h, eta)
+function [model, mse] = mlp(X, T, h)
 % Train a multilayer perceptron neural network
 % Input:
 %   X: d x n data matrix
-%   Y: p x n response matrix
+%   T: p x n response matrix
 %   h: L x 1 vector specify number of hidden nodes in each layer l
 % Ouput:
 %   model: model structure
 %   mse: mean square error
 % Written by Mo Chen (sth4nth@gmail.com).
-if nargin < 4
-    eta = 1/size(X,2);
-end
-h = [size(X,1);h(:);size(Y,1)];
+eta = 1/size(X,2);
+h = [size(X,1);h(:);size(T,1)];
 L = numel(h);
 W = cell(L-1);
 for l = 1:L-1
@@ -24,10 +22,10 @@
 for iter = 1:maxiter
 %     forward
     for l = 2:L
-        Z{l} = sigmoid(W{l-1}'*Z{l-1});
+        Z{l} = sigmoid(W{l-1}'*Z{l-1});   % 5.10, 5.49
     end
 %     backward
-    E = Y-Z{L};
+    E = T-Z{L};
     mse(iter) =  mean(dot(E,E),1);
     for l = L-1:-1:1
         df = Z{l+1}.*(1-Z{l+1});

From 92c822088cf56b934c8c2afa3b344f57923ab325 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 2 Jun 2017 04:57:20 +0800
Subject: [PATCH 066/119] fix mlp

---
 chapter05/mlp.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
index 19e8b76..df987b1 100644
--- a/chapter05/mlp.m
+++ b/chapter05/mlp.m
@@ -11,11 +11,11 @@
 eta = 1/size(X,2);
 h = [size(X,1);h(:);size(T,1)];
 L = numel(h);
-W = cell(L-1);
+W = cell(L-1,1);
 for l = 1:L-1
     W{l} = randn(h(l),h(l+1));
 end
-Z = cell(1,L);
+Z = cell(L,1);
 Z{1} = X;
 maxiter = 200;
 mse = zeros(1,maxiter);

From dd161e46c87cb9baa61dd469f115241eb60ef453 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 2 Jun 2017 05:06:04 +0800
Subject: [PATCH 067/119] refine mlpPred.m

---
 chapter05/mlpPred.m | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/chapter05/mlpPred.m b/chapter05/mlpPred.m
index 0b64ca1..e6fc280 100644
--- a/chapter05/mlpPred.m
+++ b/chapter05/mlpPred.m
@@ -1,4 +1,4 @@
-function y = mlpPred(model, X)
+function Y = mlpPred(model, X)
 % Multilayer perceptron prediction
 % Input:
 %   model: model structure
@@ -8,9 +8,7 @@
 % Written by Mo Chen (sth4nth@gmail.com).
 W = model.W;
 L = length(W)+1;
-Z = cell(L);
-Z{1} = X;
+Y = X;
 for l = 2:L
-    Z{l} = sigmoid(W{l-1}'*Z{l-1});
-end
-y = Z{L};
+    Y = sigmoid(W{l-1}'*Y);
+end
\ No newline at end of file

From e1c19a5a13949b00e6a1ebaaad64d69fa673bd2b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 2 Jun 2017 05:11:07 +0800
Subject: [PATCH 068/119] refine mlpPred.m

---
 chapter05/mlpPred.m | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/chapter05/mlpPred.m b/chapter05/mlpPred.m
index e6fc280..0ce5fb1 100644
--- a/chapter05/mlpPred.m
+++ b/chapter05/mlpPred.m
@@ -7,8 +7,7 @@
 %   Y: p x n response matrix
 % Written by Mo Chen (sth4nth@gmail.com).
 W = model.W;
-L = length(W)+1;
 Y = X;
-for l = 2:L
-    Y = sigmoid(W{l-1}'*Y);
+for l = 1:length(W)
+    Y = sigmoid(W{l}'*Y);
 end
\ No newline at end of file

From 272477eccc2dfc95d3bd1ba4c4962db8ced2df58 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 26 Aug 2017 13:55:17 +0800
Subject: [PATCH 069/119] add log1mexp.m

---
 common/log1mexp.m | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 common/log1mexp.m

diff --git a/common/log1mexp.m b/common/log1mexp.m
new file mode 100644
index 0000000..3892c70
--- /dev/null
+++ b/common/log1mexp.m
@@ -0,0 +1,7 @@
+function y = log1mexp(x)
+% Accurately compute y = log(1-exp(-x))
+% reference: Accurately Computing log(1-exp(-|a|)) Martin Machler
+y = x;
+i = x > log(2);
+y(i) = log1p(-exp(-x(i)));
+y(~i) = log(-expm1(-x(~i)));

From 159d5cd4ab2ae4ce873f14901fcd81706940e86d Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 26 Aug 2017 19:35:50 +0800
Subject: [PATCH 070/119] update log1pexp log1mexp

---
 common/log1mexp.m | 8 ++++----
 common/log1pexp.m | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/common/log1mexp.m b/common/log1mexp.m
index 3892c70..51918e1 100644
--- a/common/log1mexp.m
+++ b/common/log1mexp.m
@@ -1,7 +1,7 @@
 function y = log1mexp(x)
-% Accurately compute y = log(1-exp(-x))
+% Accurately compute y = log(1-exp(x))
 % reference: Accurately Computing log(1-exp(-|a|)) Martin Machler
 y = x;
-i = x > log(2);
-y(i) = log1p(-exp(-x(i)));
-y(~i) = log(-expm1(-x(~i)));
+i = x < -log(2);
+y(i) = log1p(-exp(x(i)));
+y(~i) = log(-expm1(x(~i)));
diff --git a/common/log1pexp.m b/common/log1pexp.m
index 7ad0b9d..10096e5 100644
--- a/common/log1pexp.m
+++ b/common/log1pexp.m
@@ -1,7 +1,8 @@
 function y = log1pexp(x)
 % Accurately compute y = log(1+exp(x))
-% reference: Accurately Computing log(1-exp(|a|)) Martin Machler
-seed = 33.3;
+% reference: Accurately Computing log(1-exp(-|a|)) Martin Machler
 y = x;
-idx = x<seed;
-y(idx) = log1p(exp(x(idx)));
\ No newline at end of file
+i = x > 18;
+j = i & (x <= 33.3);
+y(~i) = log1p(exp(x(~i)));
+y(j) = x(j)+exp(-x(j));

From c4df199480fa844ffbb6f4f2687395fc1ddbf3e2 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sun, 22 Oct 2017 17:09:35 +0800
Subject: [PATCH 071/119] improve entropy

---
 chapter01/entropy.m | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/chapter01/entropy.m b/chapter01/entropy.m
index 8cfd5a0..c59c7bd 100644
--- a/chapter01/entropy.m
+++ b/chapter01/entropy.m
@@ -6,10 +6,7 @@
 %   z: entropy z=H(x)
 % Written by Mo Chen (sth4nth@gmail.com).
 n = numel(x);
-[u,~,x] = unique(x);
-k = numel(u);
-idx = 1:n;
-Mx = sparse(idx,x,1,n,k,n);
-Px = nonzeros(mean(Mx,1));
+[~,~,x] = unique(x);
+Px = accumarray(x, 1)/n;
 Hx = -dot(Px,log2(Px));
 z = max(0,Hx);
\ No newline at end of file

From 0af5717978b5b2017c643fd1f3293c09cc497259 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 16 Nov 2017 05:21:08 +0800
Subject: [PATCH 072/119] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1cd2db2..cd1debf 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Introduction
 This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-Note: this package requires Matlab **R2016b** or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
+Note: this package requires Matlab **R2016b** or later, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
 
 Description
 -------
@@ -12,7 +12,7 @@ While developing this package, I stick to following principles
 * Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
 * Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.
 * Robust: Many tricks for numerical stability are applied, such as probability computation in log scale and square root matrix update to enforce matrix symmetry, etc.
-* Learnable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
+* Readable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
 * Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
 
 Installation
@@ -21,7 +21,7 @@ Installation
 
 2. Run Matlab and navigate to PRMLT/, then run the init.m script.
 
-3. Run some demos in PRMLT/demo directory. Enjoy!
+3. Try demos in PRMLT/demo directory to verify installation correctness. Enjoy!
 
 FeedBack
 -------

From 2e96df71b77ac5a4615ab8f0755ad25c95f33220 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 16 Nov 2017 05:23:04 +0800
Subject: [PATCH 073/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cd1debf..73840b2 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Description
 -------
 While developing this package, I stick to following principles
 
-* Succinct: The code is extremely terse. Minimizing the number of lines is a primal target. As a result, the core of the algorithms can be easily spot.
+* Succinct: The code is extremely terse. Minimizing the number of lines is one of the primary goals. As a result, the core of the algorithms can be easily spotted.
 * Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.
 * Robust: Many tricks for numerical stability are applied, such as probability computation in log scale and square root matrix update to enforce matrix symmetry, etc.
 * Readable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.

From a71624867610c09fc9c76072faf4bc3abb20fdfa Mon Sep 17 00:00:00 2001
From: txingml <cheerconi@163.com>
Date: Sat, 10 Mar 2018 17:29:20 +0800
Subject: [PATCH 074/119] fix linRegVb and rvmRegVb

---
 chapter10/linRegVb.m | 2 +-
 chapter10/rvmRegVb.m | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter10/linRegVb.m b/chapter10/linRegVb.m
index e3fe01f..0017e31 100644
--- a/chapter10/linRegVb.m
+++ b/chapter10/linRegVb.m
@@ -53,7 +53,7 @@
     KLalpha = -a*log(b);
 %     q(beta)
     e2 = sum((t-Ew'*X).^2);    
-    invUX = U\X;
+    invUX = U'\X;
     trXSX = dot(invUX(:),invUX(:));
     d = d0+0.5*(e2+trXSX);
     Ebeta = c/d; 
diff --git a/chapter10/rvmRegVb.m b/chapter10/rvmRegVb.m
index 4616cba..2430662 100644
--- a/chapter10/rvmRegVb.m
+++ b/chapter10/rvmRegVb.m
@@ -56,7 +56,7 @@
     KLalpha = -sum(a*log(b));
 %     q(beta)
     e2 = sum((t-Ew'*X).^2);    
-    invUX = U\X;
+    invUX = U'\X;
     trXSX = dot(invUX(:),invUX(:));
     d = d0+0.5*(e2+trXSX);
     Ebeta = c/d; 

From 7d7c3aab0585210fc2cf1b4298b4d92d902dd686 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 12 Mar 2018 02:55:19 +0800
Subject: [PATCH 075/119] make code consistent for linRegVb and rvmRegVb

---
 chapter10/linRegVb.m | 2 +-
 chapter10/rvmRegVb.m | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter10/linRegVb.m b/chapter10/linRegVb.m
index 0017e31..49ba81e 100644
--- a/chapter10/linRegVb.m
+++ b/chapter10/linRegVb.m
@@ -46,7 +46,7 @@
     KLw = -sum(log(diag(U)));        
 %     q(alpha)
     w2 = dot(Ew,Ew);
-    invU = U\I;   
+    invU = U'\I;   
     trS = dot(invU(:),invU(:));
     b = b0+0.5*(w2+trS);                      % 10.95
     Ealpha = a/b;                              % 10.102
diff --git a/chapter10/rvmRegVb.m b/chapter10/rvmRegVb.m
index 2430662..2ced2d7 100644
--- a/chapter10/rvmRegVb.m
+++ b/chapter10/rvmRegVb.m
@@ -49,7 +49,7 @@
     KLw = -sum(log(diag(U)));        
 %     q(alpha)
     w2 = Ew.*Ew;
-    invU = U\I;
+    invU = U'\I;
     dgS = dot(invU,invU,2);
     b = b0+0.5*(w2+dgS);
     Ealpha = a./b;

From 7349e4a2548d5a22bd6f472cb76a7a8434167fc5 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Mon, 12 Mar 2018 03:11:38 +0800
Subject: [PATCH 076/119] minor improvements

---
 chapter09/linRegEm.m  | 10 +++++-----
 chapter10/linRegVb.m  |  2 +-
 chapter10/rvmRegVb.m  |  2 +-
 chapter14/mixLinReg.m |  1 -
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/chapter09/linRegEm.m b/chapter09/linRegEm.m
index 00280fe..5534bfa 100644
--- a/chapter09/linRegEm.m
+++ b/chapter09/linRegEm.m
@@ -14,7 +14,7 @@
     beta = 0.5;
 end
 [d,n] = size(X);
-
+I = eye(d);
 xbar = mean(X,2);
 tbar = mean(t,2);
 
@@ -39,12 +39,12 @@
     llh(iter) = 0.5*(d*log(alpha)+n*log(beta)-alpha*m2-beta*e2-logdetA-n*log(2*pi));  % 3.86
     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end
     
-    V = inv(U);
-    trS = dot(V(:),V(:));    % A=inv(S)
+    invU = U'\I;
+    trS = dot(invU(:),invU(:));    % A=inv(S)
     alpha = d/(m2+trS);   % 9.63
     
-    UX = U'\X;
-    trXSX = dot(UX(:),UX(:));
+    invUX = U'\X;
+    trXSX = dot(invUX(:),invUX(:));
     beta = n/(e2+trXSX);  % 9.68 is wrong
 end
 w0 = tbar-dot(m,xbar);
diff --git a/chapter10/linRegVb.m b/chapter10/linRegVb.m
index 49ba81e..809c3fa 100644
--- a/chapter10/linRegVb.m
+++ b/chapter10/linRegVb.m
@@ -8,7 +8,6 @@
 %   model: trained model structure
 %   energy: variational lower bound
 % Written by Mo Chen (sth4nth@gmail.com).
-[m,n] = size(X);
 if nargin < 3
     a0 = 1e-4;
     b0 = 1e-4;
@@ -20,6 +19,7 @@
     c0 = prior.c;
     d0 = prior.d;
 end
+[m,n] = size(X);
 I = eye(m);
 xbar = mean(X,2);
 tbar = mean(t,2);
diff --git a/chapter10/rvmRegVb.m b/chapter10/rvmRegVb.m
index 2ced2d7..91d073b 100644
--- a/chapter10/rvmRegVb.m
+++ b/chapter10/rvmRegVb.m
@@ -8,7 +8,6 @@
 %   model: trained model structure
 %   energy: variational lower bound
 % Written by Mo Chen (sth4nth@gmail.com).
-[m,n] = size(X);
 if nargin < 3
     a0 = 1e-4;
     b0 = 1e-4;
@@ -20,6 +19,7 @@
     c0 = prior.c;
     d0 = prior.d;
 end
+[m,n] = size(X);
 idx = (1:m)';
 dg = sub2ind([m,m],idx,idx);
 I = eye(m);
diff --git a/chapter14/mixLinReg.m b/chapter14/mixLinReg.m
index aa530f5..7bf90cb 100644
--- a/chapter14/mixLinReg.m
+++ b/chapter14/mixLinReg.m
@@ -45,7 +45,6 @@
     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter)); break; end
 end
 llh = llh(2:iter);
-label = max(R,[],1);
 model.alpha = alpha; % mixing coefficient
 model.beta = beta; % mixture component precision
 model.W = W;  % linear model coefficent

From 3f9d968e1ab90f879b9a5b4035fc72aecaad3dd9 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 19 Apr 2018 11:02:48 +0800
Subject: [PATCH 077/119] update MRF

---
 chapter08/MRF/mrfBethe.m                 | 12 ++++
 chapter08/MRF/mrfBp.m                    | 56 +++++++++++++++++
 chapter08/MRF/mrfGibbs.m                 | 11 ++++
 chapter08/MRF/mrfIsGa.m                  | 21 +++++++
 chapter08/MRF/mrfMf.m                    | 34 +++++++++++
 chapter08/{ => NaiveBayes}/nbBern.m      |  0
 chapter08/{ => NaiveBayes}/nbBernPred.m  |  0
 chapter08/{ => NaiveBayes}/nbGauss.m     |  0
 chapter08/{ => NaiveBayes}/nbGaussPred.m |  0
 chapter08/betheEnergy.m                  | 11 ----
 chapter08/gibbsEnergy.m                  |  9 ---
 chapter08/im2mrf.m                       | 20 -------
 chapter08/isingMeanField.m               | 30 ----------
 chapter08/mrfBelProp.m                   | 62 -------------------
 chapter08/mrfExpProp.m                   | 55 -----------------
 chapter08/mrfMeanField.m                 | 31 ----------
 demo/ch08/mrf_demo.m                     | 76 +++++++++++-------------
 17 files changed, 168 insertions(+), 260 deletions(-)
 create mode 100644 chapter08/MRF/mrfBethe.m
 create mode 100644 chapter08/MRF/mrfBp.m
 create mode 100644 chapter08/MRF/mrfGibbs.m
 create mode 100644 chapter08/MRF/mrfIsGa.m
 create mode 100644 chapter08/MRF/mrfMf.m
 rename chapter08/{ => NaiveBayes}/nbBern.m (100%)
 rename chapter08/{ => NaiveBayes}/nbBernPred.m (100%)
 rename chapter08/{ => NaiveBayes}/nbGauss.m (100%)
 rename chapter08/{ => NaiveBayes}/nbGaussPred.m (100%)
 delete mode 100644 chapter08/betheEnergy.m
 delete mode 100644 chapter08/gibbsEnergy.m
 delete mode 100644 chapter08/im2mrf.m
 delete mode 100644 chapter08/isingMeanField.m
 delete mode 100644 chapter08/mrfBelProp.m
 delete mode 100644 chapter08/mrfExpProp.m
 delete mode 100644 chapter08/mrfMeanField.m

diff --git a/chapter08/MRF/mrfBethe.m b/chapter08/MRF/mrfBethe.m
new file mode 100644
index 0000000..85b69d6
--- /dev/null
+++ b/chapter08/MRF/mrfBethe.m
@@ -0,0 +1,12 @@
+function lnZ = mrfBethe(A, nodePot, edgePot, nodeBel, edgeBel)
+% Compute Bethe energy
+[s,t,e] = find(triu(A));
+edgeCor = zeros(size(edgePot));
+for l = 1:numel(e)
+    edgeCor(:,:,e(l)) = edgeBel(:,:,e(l))./(nodeBel(:,s(l))*nodeBel(:,t(l))');
+end
+Ex = dot(nodeBel(:),nodePot(:));
+Exy = dot(edgeBel(:),edgePot(:));
+Hx = -dot(nodeBel(:),log(nodeBel(:)));
+Ixy = dot(edgeBel(:),log(edgeCor(:)));
+lnZ = Ex+Exy+Hx-Ixy;
\ No newline at end of file
diff --git a/chapter08/MRF/mrfBp.m b/chapter08/MRF/mrfBp.m
new file mode 100644
index 0000000..ff2746b
--- /dev/null
+++ b/chapter08/MRF/mrfBp.m
@@ -0,0 +1,56 @@
+function [nodeBel, edgeBel, L] = mrfBp(A, nodePot, edgePot, epoch)
+% Undirected graph belief propagation for MRF
+% Assuming edgePot is symmetric
+% Input: 
+%   A: n x n adjacency matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential (exponentiated below)
+%   edgePot: k x k x m edge potential (exponentiated below)
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief
+%   L: 1 x epoch value returned by mrfBethe after each sweep
+% Written by Mo Chen (sth4nth@gmail.com)
+if nargin < 4
+    epoch = 10;                        % default number of sweeps over all nodes
+end
+expNodePot = exp(nodePot);  
+expEdgePot = exp(edgePot);
+[k,n] = size(nodePot);
+m = size(edgePot,3);
+
+[s,t,e] = find(triu(A));               % each undirected edge once: endpoints s,t and edge index e
+A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacency matrix, where value is message index: A(s,t)=e, A(t,s)=e+m
+mu = ones(k,2*m)/k;                     % one message per directed edge, initialised uniform
+
+nodeBel = zeros(k,n);
+edgeBel = zeros(k,k,m);
+L = -inf(1,epoch+1);                    % first slot is a placeholder that is dropped before returning
+for iter = 1:epoch
+    for i = 1:n
+        in = nonzeros(A(:,i));                      % incoming message index
+        nb = expNodePot(:,i).*prod(mu(:,in),2);                       % node factor times product of incoming messages
+        for l = in'                                 % row vector so the loop visits one message index at a time
+            ep = expEdgePot(:,:,ud(l,m));           % potential of the undirected edge carrying message l
+            mu(:,rd(l,m)) = normalize(ep*(nb./mu(:,l)));   % reverse-direction message; dividing by mu(:,l) removes that incoming message from nb
+        end
+        nodeBel(:,i) = nb/sum(nb);                  % uses nb computed before this node's outgoing messages were updated
+    end
+    
+    for l = 1:m                                     % indexes s, t, e directly, so presumably numel(e) == m
+        st = e(l);
+        nut = nodeBel(:,t(l))./mu(:,st);            % belief at t with message st divided out
+        nus = nodeBel(:,s(l))./mu(:,st+m);          % belief at s with message st+m divided out
+        eb = expEdgePot(:,:,st).*(nus*nut');
+        edgeBel(:,:,st) = eb./sum(eb(:));           % normalise so the k x k slice sums to 1
+    end
+    L(iter+1) = mrfBethe(A,nodePot,edgePot,nodeBel,edgeBel);   % A here is the rebuilt matrix; its upper triangle still holds e
+end
+L = L(1,2:iter+1);
+
+function r = rd(i, m)
+% message index of the same edge in the opposite direction (shift by m, wrapping within 1..2*m)
+r = 1+mod(m+i-1,2*m);
+
+function u = ud(i, m)
+% map a message index in 1..2*m to its undirected edge index in 1..m
+u = 1+mod(i-1,m);
\ No newline at end of file
diff --git a/chapter08/MRF/mrfGibbs.m b/chapter08/MRF/mrfGibbs.m
new file mode 100644
index 0000000..60f75cd
--- /dev/null
+++ b/chapter08/MRF/mrfGibbs.m
@@ -0,0 +1,11 @@
+function lnZ = mrfGibbs(A, nodePot, edgePot, nodeBel)
+% Gibbs energy of factorized node beliefs: expected node and edge potentials plus node entropy
+[src,dst,idx] = find(triu(A));                  % every nonzero of the upper triangle: endpoints and stored index
+pairBel = zeros(size(edgePot));
+for c = 1:numel(idx)
+    pairBel(:,:,idx(c)) = nodeBel(:,src(c))*nodeBel(:,dst(c))';   % edge belief is the outer product of endpoint beliefs
+end
+nodeTerm = dot(nodeBel(:),nodePot(:));
+pairTerm = dot(pairBel(:),edgePot(:));
+entropy = -dot(nodeBel(:),log(nodeBel(:)));
+lnZ = nodeTerm+pairTerm+entropy;
\ No newline at end of file
diff --git a/chapter08/MRF/mrfIsGa.m b/chapter08/MRF/mrfIsGa.m
new file mode 100644
index 0000000..a303119
--- /dev/null
+++ b/chapter08/MRF/mrfIsGa.m
@@ -0,0 +1,21 @@
+function [A, nodePot, edgePot] = mrfIsGa(im, sigma, J)
+% Construct a latent Ising MRF with Gaussian observation
+% Input:
+%   im: row x col image
+%   sigma: standard deviation of Gaussian node potential (it is squared below)
+%   J: parameter of Ising edge
+% Output:
+%   A: n x n adjacency matrix, where value is edge index
+%   nodePot: 2 x n node potential
+%   edgePot: 2 x 2 x m edge potential
+% Written by Mo Chen (sth4nth@gmail.com)
+A = lattice(size(im));               % helper defined elsewhere; presumably grid adjacency over the pixels (confirm)
+[s,t,e] = find(triu(A));             % one entry per undirected edge
+m = numel(e);
+e(:) = 1:m;                          % renumber the edges 1..m
+A = sparse([s;t],[t;s],[e;e]);       % symmetric: both directions store the same edge index
+
+z = [1;-1];                          % the two latent states
+x = reshape(im,1,[]);                % image flattened to a 1 x n row
+nodePot = -(x-z).^2/(2*sigma^2);     % row x minus column z expands to 2 x n; negative squared distance scaled by 2*sigma^2
+edgePot = repmat(J*(z*z'),[1, 1, m]);   % same 2 x 2 table J*z*z' on every edge
\ No newline at end of file
diff --git a/chapter08/MRF/mrfMf.m b/chapter08/MRF/mrfMf.m
new file mode 100644
index 0000000..366164b
--- /dev/null
+++ b/chapter08/MRF/mrfMf.m
@@ -0,0 +1,34 @@
+function [nodeBel, edgeBel, L] = mrfMf(A, nodePot, edgePot, epoch)
+% Mean field for MRF
+% Assuming edgePot is symmetric
+% Input: 
+%   A: n x n adjacency matrix of undirected graph, where value is edge index
+%   nodePot: k x n node potential
+%   edgePot: k x k x m edge potential
+%   epoch: number of sweeps over all nodes (default 10)
+% Output:
+%   nodeBel: k x n node belief
+%   edgeBel: k x k x m edge belief (outer product of the endpoint node beliefs)
+%   L: 1 x epoch value returned by mrfGibbs after each sweep
+% Written by Mo Chen (sth4nth@gmail.com)
+if nargin < 4
+    epoch = 10;
+end
+k = size(nodePot,1);                   % number of states per node (was hard-coded as 2)
+L = -inf(1,epoch+1);                   % first slot is a placeholder that is dropped before returning
+[nodeBel,lnZ] = softmax(nodePot,1);    % initialization    
+for iter = 1:epoch
+    for i = 1:size(nodePot,2)
+        [~,j,e] = find(A(i,:));             % neighbors j and the index e of the edge to each
+        % k x (k*d) times (k*d) x 1 sums edgePot(:,:,e)*nodeBel(:,j) over the d neighbors
+        [nodeBel(:,i),lnZ(i)] = softmax(nodePot(:,i)+reshape(edgePot(:,:,e),k,[])*reshape(nodeBel(:,j),[],1));
+    end
+    L(iter+1) = mrfGibbs(A,nodePot,edgePot,nodeBel);
+end
+L = L(1,2:iter+1);
+
+[s,t,e] = find(triu(A));               % each undirected edge once
+edgeBel = zeros(size(edgePot));
+for l = 1:numel(e)
+    edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
+end
\ No newline at end of file
diff --git a/chapter08/nbBern.m b/chapter08/NaiveBayes/nbBern.m
similarity index 100%
rename from chapter08/nbBern.m
rename to chapter08/NaiveBayes/nbBern.m
diff --git a/chapter08/nbBernPred.m b/chapter08/NaiveBayes/nbBernPred.m
similarity index 100%
rename from chapter08/nbBernPred.m
rename to chapter08/NaiveBayes/nbBernPred.m
diff --git a/chapter08/nbGauss.m b/chapter08/NaiveBayes/nbGauss.m
similarity index 100%
rename from chapter08/nbGauss.m
rename to chapter08/NaiveBayes/nbGauss.m
diff --git a/chapter08/nbGaussPred.m b/chapter08/NaiveBayes/nbGaussPred.m
similarity index 100%
rename from chapter08/nbGaussPred.m
rename to chapter08/NaiveBayes/nbGaussPred.m
diff --git a/chapter08/betheEnergy.m b/chapter08/betheEnergy.m
deleted file mode 100644
index d663e8b..0000000
--- a/chapter08/betheEnergy.m
+++ /dev/null
@@ -1,11 +0,0 @@
-function lnZ = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel)
-% Compute Bethe free energy
-% TBD: deal with log(0) for entropy
-edgePot = reshape(edgePot,[],size(edgePot,3));
-edgeBel = reshape(edgeBel,[],size(edgeBel,3));
-Ex = dot(nodeBel,nodePot,1);
-Exy = dot(edgeBel,edgePot,1);
-Hx = -dot(nodeBel,log(nodeBel),1);
-Hxy = -dot(edgeBel,log(edgeBel),1);
-d = full(sum(logical(A),1));
-lnZ = -sum(Ex)-sum(Exy)-sum((d-1).*Hx)+sum(Hxy);
diff --git a/chapter08/gibbsEnergy.m b/chapter08/gibbsEnergy.m
deleted file mode 100644
index b4c0aec..0000000
--- a/chapter08/gibbsEnergy.m
+++ /dev/null
@@ -1,9 +0,0 @@
-function lnZ = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel)
-% Compute Gibbs free energy
-% TBD: deal with log(0) for entropy
-edgePot = reshape(edgePot,[],size(edgePot,3));
-edgeBel = reshape(edgeBel,[],size(edgeBel,3));
-Ex = dot(nodeBel,nodePot,1);
-Exy = dot(edgeBel,edgePot,1);
-Hx = dot(nodeBel,log(nodeBel),1);
-lnZ = -(sum(Ex)+sum(Exy)+sum(Hx));
\ No newline at end of file
diff --git a/chapter08/im2mrf.m b/chapter08/im2mrf.m
deleted file mode 100644
index 3d9e173..0000000
--- a/chapter08/im2mrf.m
+++ /dev/null
@@ -1,20 +0,0 @@
-function [A, nodePot, edgePot] = im2mrf(im, J, sigma)
-% Convert a image to Ising MRF with distribution p(x)=exp(-sum(nodePot)-sum(edgePot)-lnZ)
-% Input:
-%   im: row x col image
-%   sigma: variance of Gaussian node potential
-%   J: parameter of Ising edge
-% Output:
-%   nodePot: 2 x n node potential
-%   edgePot: 2 x 2 x m edge potential
-
-A = lattice(size(im));
-[s,t,e] = find(tril(A));
-nEdge = numel(e);
-e(:) = 1:nEdge;
-A = sparse([s;t],[t;s],[e;e]);
-
-z = [1;-1];
-y = reshape(im,1,[]);
-nodePot = (y-z).^2/(2*sigma^2);
-edgePot = repmat(-J*(z*z'),[1, 1, nEdge]);
\ No newline at end of file
diff --git a/chapter08/isingMeanField.m b/chapter08/isingMeanField.m
deleted file mode 100644
index ad7d286..0000000
--- a/chapter08/isingMeanField.m
+++ /dev/null
@@ -1,30 +0,0 @@
-function mu = isingMeanField(J, h, epoch)
-% Mean field for 2d Ising model
-% Input: 
-%   J: scalar edge potential
-%   h: M X N image size node potential
-%   edgePot: k x k x m edge potential 
-% Output:
-%   mu: M x N image size expectation
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 3
-    epoch = 50;
-    tol = 1e-8;
-end
-[M,N] = size(h);
-mu =  tanh(h);
-stride = [-1,1,-M,M];
-for t = 1:epoch
-    mu0 = mu;
-    for j = 1:N
-        for i = 1:M
-            pos = i + M*(j-1);
-            ne = pos + stride;
-            ne([i,i,j,j] == [1,M,1,N]) = [];
-            mu(i,j) = tanh(J*sum(mu(ne)) + h(i,j));
-        end
-    end
-    if max(abs(mu(:)-mu0(:))) < tol; break; end
-end 
-
diff --git a/chapter08/mrfBelProp.m b/chapter08/mrfBelProp.m
deleted file mode 100644
index a556d9c..0000000
--- a/chapter08/mrfBelProp.m
+++ /dev/null
@@ -1,62 +0,0 @@
-function [nodeBel, edgeBel] = mrfBelProp(A, nodePot, edgePot, epoch)
-% Belief propagation for MRF (Assuming that egdePot is symmetric)
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential
-%   edgePot: k x k x m edge potential
-% Output:
-%   nodeBel: k x n node belief
-%   edgeBel: k x k x m edge belief
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 50;
-    tol = 1e-8;
-end
-
-nodePot = exp(-nodePot);  
-edgePot = exp(-edgePot);
-
-[k,n] = size(nodePot);
-m = size(edgePot,3);
-
-[s,t,e] = find(tril(A));
-A = sparse([s;t],[t;s],[e;e+m]);       % digraph adjacent matrix, where value is message index
-mu = ones(k,2*m)/k;                     % message
-for iter = 1:epoch
-    mu0 = mu;
-    for i = 1:n
-        in = nonzeros(A(:,i));                      % incoming message index
-        nb = nodePot(:,i).*prod(mu(:,in),2);                       % product of incoming message
-        for l = in'
-            ep = edgePot(:,:,ud(l,m));
-            mu(:,rd(l,m)) = normalize(ep*(nb./mu(:,l)));
-        end
-    end
-    if max(abs(mu(:)-mu0(:))) < tol; break; end
-end
-
-nodeBel = zeros(k,n);
-for i = 1:n
-    nodeBel(:,i) = nodePot(:,i).*prod(mu(:,nonzeros(A(:,i))),2);
-end
-nodeBel = normalize(nodeBel,1);
-
-edgeBel = zeros(k,k,m);
-for l = 1:m
-    eij = e(l);
-    eji = eij+m;
-    ep = edgePot(:,:,eij);
-    nbt = nodeBel(:,t(l))./mu(:,eij);
-    nbs = nodeBel(:,s(l))./mu(:,eji);
-    eb = (nbt*nbs').*ep;
-    edgeBel(:,:,eij) = eb./sum(eb(:));
-end
-
-function i = rd(i, m)
-% reverse direction edge index
-i = mod(i+m-1,2*m)+1;
-
-function i = ud(i, m)
-% undirected edge index
-i = mod(i-1,m)+1;
\ No newline at end of file
diff --git a/chapter08/mrfExpProp.m b/chapter08/mrfExpProp.m
deleted file mode 100644
index 26969f2..0000000
--- a/chapter08/mrfExpProp.m
+++ /dev/null
@@ -1,55 +0,0 @@
-function [nodeBel, edgeBel] = mrfExpProp(A, nodePot, edgePot, epoch)
-% Expectation propagation for MRF (Assuming that egdePot is symmetric)
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential
-%   edgePot: k x k x m edge potential
-% Output:
-%   nodeBel: k x n node belief
-%   edgeBel: k x k x m edge belief
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 50;
-    tol = 1e-8;
-end
-
-nodePot = exp(-nodePot);  
-edgePot = exp(-edgePot);
-
-k = size(nodePot,1);
-m = size(edgePot,3);
-
-[s,t,e] = find(tril(A));
-mu = ones(k,2*m)/k;         % message
-nodeBel = normalize(nodePot,1);
-for iter = 1:epoch
-    mu0 = mu;
-    for l = 1:m
-        i = s(l);
-        j = t(l);
-        eij = e(l);
-        eji = eij+m;
-        ep = edgePot(:,:,eij);
-
-        nodeBel(:,j) = nodeBel(:,j)./mu(:,eij);
-        mu(:,eij) = normalize(ep*(nodeBel(:,i)./mu(:,eji)));
-        nodeBel(:,j) = normalize(nodeBel(:,j).*mu(:,eij));
-        
-        nodeBel(:,i) = nodeBel(:,i)./mu(:,eji);
-        mu(:,eji) = normalize(ep*(nodeBel(:,j)./mu(:,eij)));
-        nodeBel(:,i) = normalize(nodeBel(:,i).*mu(:,eji));
-    end
-    if max(abs(mu(:)-mu0(:))) < tol; break; end
-end
-
-edgeBel = zeros(k,k,m);
-for l = 1:m
-    eij = e(l);
-    eji = eij+m;
-    ep = edgePot(:,:,eij);
-    nbt = nodeBel(:,t(l))./mu(:,eij);
-    nbs = nodeBel(:,s(l))./mu(:,eji);
-    eb = (nbt*nbs').*ep;
-    edgeBel(:,:,eij) = eb./sum(eb(:));
-end
diff --git a/chapter08/mrfMeanField.m b/chapter08/mrfMeanField.m
deleted file mode 100644
index 2f767cd..0000000
--- a/chapter08/mrfMeanField.m
+++ /dev/null
@@ -1,31 +0,0 @@
-function [nodeBel, edgeBel] = mrfMeanField(A, nodePot, edgePot, epoch)
-% Mean field for MRF (Assuming that egdePot is symmetric)
-% p(x)=exp(-E(x))/Z, E(x)=\sum(edgePot)+sum(nodePot)
-% Input: 
-%   A: n x n adjacent matrix of undirected graph, where value is edge index
-%   nodePot: k x n node potential 
-%   edgePot: k x k x m edge potential 
-% Output:
-%   nodeBel: k x n node belief q(x_i)
-%   edgeBel: k x k x m edge belief q(x_i,x_j)
-% Written by Mo Chen (sth4nth@gmail.com)
-tol = 0;
-if nargin < 4
-    epoch = 50;
-    tol = 1e-8;
-end
-[nodeBel,L] = softmax(-nodePot,1);    % init nodeBel    
-for iter = 1:epoch
-    nodeBel0 = nodeBel;
-    for i = 1:numel(L)
-        [~,j,e] = find(A(i,:));             % neighbors
-        nodeBel(:,i) = softmax(-nodePot(:,i)-reshape(edgePot(:,:,e),2,[])*reshape(nodeBel(:,j),[],1));
-    end
-    if max(abs(nodeBel(:)-nodeBel0(:))) < tol; break; end
-end
-
-[s,t,e] = find(tril(A));
-edgeBel = zeros(size(edgePot));
-for l = 1:numel(e)
-    edgeBel(:,:,e(l)) = nodeBel(:,s(l))*nodeBel(:,t(l))';
-end
\ No newline at end of file
diff --git a/demo/ch08/mrf_demo.m b/demo/ch08/mrf_demo.m
index 5cede61..b15942f 100644
--- a/demo/ch08/mrf_demo.m
+++ b/demo/ch08/mrf_demo.m
@@ -1,65 +1,57 @@
+% Done!
 clear; close all;
-%% Original image
+% load letterA.mat;
+% X = A;
 load letterX.mat
+%% Original image
 img = double(X);
 img = sign(img-mean(img(:)));
 
 figure;
-subplot(2,3,1);
+subplot(2,2,1);
 imagesc(img);
 title('Original image');
 axis image;
 colormap gray;
 %% Noisy image
-y = img + sigma*randn(size(img)); % noisy signal
-
-subplot(2,3,2);
-imagesc(y);
+sigma = 1; % noise level
+x = img + sigma*randn(size(img)); % noisy signal
+subplot(2,2,2);
+imagesc(x);
 title('Noisy image');
 axis image;
 colormap gray;
-%% Parameters
-epoch = 50;
-J = 1;   % Ising parameter
-sigma = 1; % noise level
+%% Construct MRF data
+epoch = 20;
+J = 1;   % ising parameter
+[A,nodePot,edgePot] = mrfIsGa(x,sigma,J);
 %% Mean Field
-[A, nodePot, edgePot] = im2mrf(y, J, sigma);
-[nodeBel, edgeBel] = mrfMeanField(A, nodePot, edgePot, epoch);
-lnZ = gibbsEnergy(nodePot, edgePot, nodeBel, edgeBel);
-lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
-maxdiff(lnZ, lnZ0)
+[nodeBel0,edgeBel0,lnZ0] = mrfMf(A,nodePot,edgePot,epoch);
 
-subplot(2,3,4);
-imagesc(reshape(nodeBel(1,:),size(img)));
-title('Mean Field');
-axis image;
-colormap gray;
-%% Ising Mean Field 
-h = reshape(0.5*diff(nodePot),size(img));
-mu = isingMeanField(J, h, epoch);
-maxdiff(reshape(mu,1,[]), [1,-1]*nodeBel)
+L0 = mrfGibbs(A,nodePot,edgePot,nodeBel0);
+L1 = mrfBethe(A,nodePot,edgePot,nodeBel0,edgeBel0);
+maxdiff(L0,lnZ0(end))
+maxdiff(L0,L1)
 
-subplot(2,3,3);
-imagesc(mu)
-title('Ising Mean Field');
+subplot(2,2,3);
+imagesc(reshape(nodeBel0(1,:),size(img)));
+title('Mean Field');
 axis image;
 colormap gray;
 %% Belief Propagation
-[nodeBel,edgeBel] = mrfBelProp(A, nodePot, edgePot, epoch);
-lnZ = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
-
-subplot(2,3,5);
-imagesc(reshape(nodeBel(1,:),size(img)));
-title('Belief propagation');
-axis image;
-colormap gray;
-%% Expectation Propagation
-[nodeBel,edgeBel] = mrfExpProp(A, nodePot, edgePot, epoch);
-lnZ0 = betheEnergy(A, nodePot, edgePot, nodeBel, edgeBel);
-maxdiff(lnZ, lnZ0)
+[nodeBel1,edgeBel1,lnZ1] = mrfBp(A,nodePot,edgePot,epoch);
 
-subplot(2,3,6);
-imagesc(reshape(nodeBel(1,:),size(img)));
-title('Expectation Propagation');
+subplot(2,2,4);
+imagesc(reshape(nodeBel1(1,:),size(img)));
+title('Belief Propagation');
 axis image;
 colormap gray;
+%% Energy comparation
+figure
+epochs = 1:epoch;
+plot( epochs,lnZ0,'-', ...
+      epochs,lnZ1,'-');
+xlabel('epoch');       %  add axis labels and plot title
+ylabel('energy');
+title('Energy Comparation');
+legend('MF','BP');
\ No newline at end of file

From d858d5d692091a31f25f786200513bec4a506fe0 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 15 Nov 2018 19:58:52 +0800
Subject: [PATCH 078/119] fix some demos

---
 demo/ch04/logitBin_demo.m |  6 +++---
 demo/ch07/rvmBinEm_demo.m |  2 +-
 demo/ch07/rvmBinFp_demo.m |  2 +-
 demo/ch09/rvmBinEm_demo.m | 13 -------------
 4 files changed, 5 insertions(+), 18 deletions(-)
 delete mode 100644 demo/ch09/rvmBinEm_demo.m

diff --git a/demo/ch04/logitBin_demo.m b/demo/ch04/logitBin_demo.m
index dd2c020..502aedf 100644
--- a/demo/ch04/logitBin_demo.m
+++ b/demo/ch04/logitBin_demo.m
@@ -6,9 +6,9 @@
 d = 2;
 k = 2;
 n = 1000;
-[X,y] = kmeansRnd(d,k,n);
-[model, llh] = logitBin(X,y-1);
+[X,t] = kmeansRnd(d,k,n);
+[model, llh] = logitBin(X,t-1);
 plot(llh);
-t = logitBinPred(model,X)+1;
+y = logitBinPred(model,X)+1;
 figure
 binPlot(model,X,y)
\ No newline at end of file
diff --git a/demo/ch07/rvmBinEm_demo.m b/demo/ch07/rvmBinEm_demo.m
index 039e856..54ed1bb 100644
--- a/demo/ch07/rvmBinEm_demo.m
+++ b/demo/ch07/rvmBinEm_demo.m
@@ -9,4 +9,4 @@
 plot(llh);
 y = rvmBinPred(model,X)+1;
 figure;
-binPlot(model,X,y);
+plotClass(X,y);
diff --git a/demo/ch07/rvmBinFp_demo.m b/demo/ch07/rvmBinFp_demo.m
index 2dcb2ae..ff1f823 100644
--- a/demo/ch07/rvmBinFp_demo.m
+++ b/demo/ch07/rvmBinFp_demo.m
@@ -9,4 +9,4 @@
 plot(llh);
 y = rvmBinPred(model,X)+1;
 figure;
-binPlot(model,X,y);
+plotClass(X,y);
diff --git a/demo/ch09/rvmBinEm_demo.m b/demo/ch09/rvmBinEm_demo.m
deleted file mode 100644
index f15ae6e..0000000
--- a/demo/ch09/rvmBinEm_demo.m
+++ /dev/null
@@ -1,13 +0,0 @@
-%% RVM classification via EM
-clear; close all
-k = 2;
-d = 2;
-n = 1000;
-[X,t] = kmeansRnd(d,k,n);
-[x1,x2] = meshgrid(linspace(min(X(1,:)),max(X(1,:)),n), linspace(min(X(2,:)),max(X(2,:)),n));
-
-[model, llh] = rvmBinEm(X,t-1);
-plot(llh);
-y = rvmBinPred(model,X)+1;
-figure;
-binPlot(model,X,y);
\ No newline at end of file

From 75933f7a1dfab3a31f50e46ec900c78fbcab3cc8 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 15 Nov 2018 22:33:56 +0800
Subject: [PATCH 079/119] minor tweak

---
 chapter12/ppcaVb.m      | 10 +++++-----
 demo/ch12/ppcaVb_demo.m |  6 ++----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/chapter12/ppcaVb.m b/chapter12/ppcaVb.m
index 2c6b249..a07b4e8 100644
--- a/chapter12/ppcaVb.m
+++ b/chapter12/ppcaVb.m
@@ -1,4 +1,4 @@
-function [model, energy] = ppcaVb(X, q, prior)
+function [model, L] = ppcaVb(X, q, prior)
 % Perform variatioanl Bayeisan inference for probabilistic PCA model. 
 % Input:
 %   X: d x n data matrix
@@ -27,7 +27,7 @@
 end
 tol = 1e-6;
 maxIter = 500;
-energy = -inf(1,maxIter);
+L = -inf(1,maxIter);
 
 mu = mean(X,2);
 Xo = bsxfun(@minus, X, mu);
@@ -67,10 +67,10 @@
 %     Emu = Ebeta/(lambda+n*Ebeta)*sum(X-WZ,2);
 
 %     lower bound
-    energy(iter) = KLalpha+KLbeta+KLW+KLZ;
-    if energy(iter)-energy(iter-1) < tol*abs(energy(iter-1)); break; end  
+    L(iter) = KLalpha+KLbeta+KLW+KLZ;
+    if L(iter)-L(iter-1) < tol*abs(L(iter-1)); break; end  
 end
-energy = energy(2:iter);
+L = L(2:iter);
 
 model.Z = EZ;
 model.W = EW;
diff --git a/demo/ch12/ppcaVb_demo.m b/demo/ch12/ppcaVb_demo.m
index 074fce3..ac43da3 100644
--- a/demo/ch12/ppcaVb_demo.m
+++ b/demo/ch12/ppcaVb_demo.m
@@ -1,5 +1,4 @@
 % demos for ch12
-
 clear; close all;
 d = 3;
 m = 2;
@@ -7,7 +6,6 @@
 
 X = ppcaRnd(m,d,n);
 plotClass(X);
-
 %% Variational Bayesian probabilistic PCA
-[model, energy] = ppcaVb(X);
-plot(energy);
+[model, L] = ppcaVb(X);
+plot(L);

From fcf7e0d285260eddf0dcec1a62e39d387c15234f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 15 Nov 2018 22:35:49 +0800
Subject: [PATCH 080/119] fix typo

---
 chapter12/ppcaVb.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter12/ppcaVb.m b/chapter12/ppcaVb.m
index a07b4e8..e60d523 100644
--- a/chapter12/ppcaVb.m
+++ b/chapter12/ppcaVb.m
@@ -5,7 +5,7 @@
 %   q: dimension of target space
 % Output:
 %   model: trained model structure
-%   ernergy: variantional lower bound
+%   L: variational lower bound
 % Reference: 
 %   Pattern Recognition and Machine Learning by Christopher M. Bishop 
 % Written by Mo Chen (sth4nth@gmail.com).

From 2fb33395dd6061b87955a855ef799ad88d359d0d Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 20 Nov 2018 19:11:42 +0800
Subject: [PATCH 081/119] rewrite mlp and backpropagation for regression

---
 chapter05/mlp.m                       | 39 ------------------
 chapter05/mlpReg.m                    | 59 +++++++++++++++++++++++++++
 chapter05/{mlpPred.m => mlpRegPred.m} | 14 ++++---
 demo/ch05/mlp_demo.m                  | 22 ++++++----
 4 files changed, 82 insertions(+), 52 deletions(-)
 delete mode 100644 chapter05/mlp.m
 create mode 100644 chapter05/mlpReg.m
 rename chapter05/{mlpPred.m => mlpRegPred.m} (53%)

diff --git a/chapter05/mlp.m b/chapter05/mlp.m
deleted file mode 100644
index df987b1..0000000
--- a/chapter05/mlp.m
+++ /dev/null
@@ -1,39 +0,0 @@
-function [model, mse] = mlp(X, T, h)
-% Train a multilayer perceptron neural network
-% Input:
-%   X: d x n data matrix
-%   T: p x n response matrix
-%   h: L x 1 vector specify number of hidden nodes in each layer l
-% Ouput:
-%   model: model structure
-%   mse: mean square error
-% Written by Mo Chen (sth4nth@gmail.com).
-eta = 1/size(X,2);
-h = [size(X,1);h(:);size(T,1)];
-L = numel(h);
-W = cell(L-1,1);
-for l = 1:L-1
-    W{l} = randn(h(l),h(l+1));
-end
-Z = cell(L,1);
-Z{1} = X;
-maxiter = 200;
-mse = zeros(1,maxiter);
-for iter = 1:maxiter
-%     forward
-    for l = 2:L
-        Z{l} = sigmoid(W{l-1}'*Z{l-1});   % 5.10, 5.49
-    end
-%     backward
-    E = T-Z{L};
-    mse(iter) =  mean(dot(E,E),1);
-    for l = L-1:-1:1
-        df = Z{l+1}.*(1-Z{l+1});
-        dG = df.*E;
-        dW = Z{l}*dG';
-        W{l} = W{l}+eta*dW;
-        E = W{l}*dG;
-    end
-end
-mse = mse(1:iter);
-model.W = W;
\ No newline at end of file
diff --git a/chapter05/mlpReg.m b/chapter05/mlpReg.m
new file mode 100644
index 0000000..caf42d1
--- /dev/null
+++ b/chapter05/mlpReg.m
@@ -0,0 +1,59 @@
+function [model, L] = mlpReg(X,Y,k,lambda)
+% Train a multilayer perceptron for regression (tanh hidden units, linear output) by batch gradient descent
+% Input:
+%   X: d x n data matrix
+%   Y: p x n response matrix
+%   k: vector specifying the number of hidden nodes in each hidden layer
+%   lambda: regularization parameter (default 1e-2)
+% Output:
+%   model: model structure with fields W and b (one cell per layer)
+%   L: loss recorded at each iteration
+% Written by Mo Chen (sth4nth@gmail.com).
+if nargin < 4
+    lambda = 1e-2;
+end
+eta = 1e-3;                  % fixed step size
+maxiter = 50000;             % the loop always runs to maxiter; there is no convergence test
+L = inf(1,maxiter);
+
+k = [size(X,1);k(:);size(Y,1)];   % layer sizes including input and output
+T = numel(k)-1;                   % number of weight layers
+W = cell(T,1);
+b = cell(T,1);
+for t = 1:T
+    W{t} = randn(k(t),k(t+1));    % weights from layer t to t+1, applied as W{t}'
+    b{t} = randn(k(t+1),1);
+end
+R = cell(T,1);               % backpropagated error of each layer
+Z = cell(T+1,1);             % activations; Z{1} is the input
+Z{1} = X;
+for iter = 2:maxiter
+%     forward
+    for t = 1:T-1
+        Z{t+1} = tanh(W{t}'*Z{t}+b{t});
+    end
+    Z{T+1} = W{T}'*Z{T}+b{T};     % output layer has no nonlinearity
+
+%     loss
+    E = Z{T+1}-Y;                 % residual, p x n
+    Wn = cellfun(@(x) dot(x(:),x(:)),W);            % |W|^2
+    L(iter) = dot(E(:),E(:))+lambda*sum(Wn);       % squared error plus weight penalty (biases are not penalised)
+
+%     backward
+    R{T} = E;                % delta
+    for t = T-1:-1:1
+        df = 1-Z{t+1}.^2;    % h'(a)
+        R{t} = df.*(W{t+1}*R{t+1});    % delta
+    end
+    
+%     gradient descent
+    for t=1:T
+        dW = Z{t}*R{t}'+lambda*W{t};   % gradient of half the recorded loss w.r.t. W{t}
+        db = sum(R{t},2);              % bias gradient: delta summed over the n samples
+        W{t} = W{t}-eta*dW;
+        b{t} = b{t}-eta*db;
+    end
+end
+L = L(1,2:iter);             % drop the unused first slot
+model.W = W;
+model.b = b;
diff --git a/chapter05/mlpPred.m b/chapter05/mlpRegPred.m
similarity index 53%
rename from chapter05/mlpPred.m
rename to chapter05/mlpRegPred.m
index 0ce5fb1..ce71bc5 100644
--- a/chapter05/mlpPred.m
+++ b/chapter05/mlpRegPred.m
@@ -1,4 +1,4 @@
-function Y = mlpPred(model, X)
+function Y = mlpRegPred(model, X)
 % Multilayer perceptron prediction
 % Input:
 %   model: model structure
@@ -7,7 +7,11 @@
 %   Y: p x n response matrix
 % Written by Mo Chen (sth4nth@gmail.com).
 W = model.W;
-Y = X;
-for l = 1:length(W)
-    Y = sigmoid(W{l}'*Y);
-end
\ No newline at end of file
+b = model.b;
+T = length(W);
+Z = cell(T+1,1);
+Z{1} = X;
+for t = 1:T-1
+    Z{t+1} = tanh(W{t}'*Z{t}+b{t});
+end
+Y = W{T}'*Z{T}+b{T};
\ No newline at end of file
diff --git a/demo/ch05/mlp_demo.m b/demo/ch05/mlp_demo.m
index 9e55c26..75c170a 100644
--- a/demo/ch05/mlp_demo.m
+++ b/demo/ch05/mlp_demo.m
@@ -1,9 +1,15 @@
 clear; close all;
-h = [4,5];
-X = [0 0 1 1;0 1 0 1];
-T = [0 1 1 0];
-[model,mse] = mlp(X,T,h);
-plot(mse);
-disp(['T = [' num2str(T) ']']);
-Y = mlpPred(model,X);
-disp(['Y = [' num2str(Y) ']']);
\ No newline at end of file
+n = 200;
+x = linspace(0,2*pi,n);
+y = sin(x);
+
+k = [3,4];            % two hidden layers with 3 and 4 hidden nodes
+lambda = 1e-2;        % regularization parameter, now actually passed to mlpReg
+[model, L] = mlpReg(x,y,k,lambda);
+t = mlpRegPred(model,x);
+plot(L);              % loss returned by mlpReg
+figure;
+hold on
+plot(x,y,'.');        % targets
+plot(x,t);            % network prediction on the same inputs
+hold off
\ No newline at end of file

From b63676b131d4b672f08a6b29e3cb59ebc9a7e28a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 20 Nov 2018 19:13:27 +0800
Subject: [PATCH 082/119] tweak mlpRegPred.m

---
 chapter05/mlpRegPred.m | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/chapter05/mlpRegPred.m b/chapter05/mlpRegPred.m
index ce71bc5..e3bba3f 100644
--- a/chapter05/mlpRegPred.m
+++ b/chapter05/mlpRegPred.m
@@ -9,9 +9,8 @@
 W = model.W;
 b = model.b;
 T = length(W);
-Z = cell(T+1,1);
-Z{1} = X;
+Y = X;
 for t = 1:T-1
-    Z{t+1} = tanh(W{t}'*Z{t}+b{t});
+    Y = tanh(W{t}'*Y+b{t});
 end
-Y = W{T}'*Z{T}+b{T};
\ No newline at end of file
+Y = W{T}'*Y+b{T};
\ No newline at end of file

From 48cf34b2f014453bf5643a6192850ba64438cb87 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 01:30:19 +0800
Subject: [PATCH 083/119] Create LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e37e360
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Mo Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From bd8c51c0ae01c10c8894087cb743c9002de4e7cc Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 01:31:12 +0800
Subject: [PATCH 084/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 73840b2..8f71920 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ If you found any bug or have any suggestion, please do file issues. I am gracefu
 
 License
 -------
-Currently Released Under GPLv3
+Released Under MIT License
 
 
 Contact

From e21c48947c2ac369716c34744abeca3d97ccd9e6 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 01:51:50 +0800
Subject: [PATCH 085/119] Update README.md

---
 README.md | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 8f71920..9a8ea43 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,33 @@
 Introduction
 -------
-This package is a Matlab implementation of the algorithms described in the classical machine learning textbook:
+This package is a Matlab implementation of the algorithms described in the machine learning textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-Note: this package requires Matlab **R2016b** or latter, since it utilizes a new syntax of Matlab called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting in Python).
+Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting).
 
-Description
+Design Goal
 -------
-While developing this package, I stick to following principles
-
-* Succinct: The code is extremely terse. Minimizing the number of lines is one of the primal goals. As a result, the core of the algorithms can be easily spot.
-* Efficient: Many tricks for making Matlab scripts fast were applied (eg. vectorization and matrix factorization). Many functions are even comparable with C implementations. Usually, functions in this package are orders faster than Matlab builtin ones which provide the same functionality (eg. kmeans). If anyone have found any Matlab implementation that is faster than mine, I am happy to further optimize.
-* Robust: Many tricks for numerical stability are applied, such as probability computation in log scale and square root matrix update to enforce matrix symmetry, etc.
-* Readable: The code is heavily commented. Reference formulas in PRML book are indicated for corresponding code lines. Symbols are in sync with the book.
-* Practical: The package is designed not only to be easily read, but also to be easily used to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
+* Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted.
+* Efficient: Many tricks to speed up Matlab code are applied (e.g. vectorization, matrix factorization, etc.). Usually, functions in this package are orders of magnitude faster than Matlab builtin ones (e.g. kmeans).
+* Robust: Many tricks for numerical stability are applied, such as computing probabilities in the log domain, square root matrix updates to enforce matrix symmetry and positive definiteness, etc.
+* Readable: The code is heavily commented. Corresponding formulas in PRML are annotated. Symbols are in sync with the book.
+* Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
 
 Installation
 -------
-1. Download the package to your local path (e.g. PRMLT/) by running: `git clone https://github.com/PRML/PRMLT.git`.
+1. Download the package to a local folder (e.g. ~/PRMLT/) by running: `git clone https://github.com/PRML/PRMLT.git`.
 
-2. Run Matlab and navigate to PRMLT/, then run the init.m script.
+2. Run Matlab and navigate to the folder (~/PRMLT/), then run the init.m script.
 
-3. Try demos in PRMLT/demo directory to verify installation correctness. Enjoy!
+3. Run some demos in ~/PRMLT/demo folder. Enjoy!
 
 FeedBack
 -------
-If you found any bug or have any suggestion, please do file issues. I am graceful for any feedback and will do my best to improve this package.
+If you find any bug or have any suggestion, please do file issues. I am grateful for any feedback and will do my best to improve this package.
 
 License
 -------
-Released Under MIT License
-
+Released under MIT license
 
 Contact
 -------

From 83dbea0ad3e39b074d0cf54977fe4496d20e0c1e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 01:59:54 +0800
Subject: [PATCH 086/119] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9a8ea43..703c0f4 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,11 @@
 Introduction
 -------
-This package is a Matlab implementation of the algorithms described in the machine learning textbook:
+This Matlab package implementes machine learning algorithms described in the great textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting).
+It is written purely in Matlab language. It is self-contained. There is no outside denpency.
+
+Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires statistical toolbox (for some simple random number generator) and image processing box (for reading image data).
 
 Design Goal
 -------

From 0c2a768b32262f62a839afb61ee9d7459ddc3ff0 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 02:00:55 +0800
Subject: [PATCH 087/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 703c0f4..a4645c5 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.mi
 
 It is written purely in Matlab language. It is self-contained. There is no outside denpency.
 
-Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires statistical toolbox (for some simple random number generator) and image processing box (for reading image data).
+Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires statistical toolbox (for some simple random number generator) and image processing toolbox (for reading image data).
 
 Design Goal
 -------

From 364201b5f2cdbfe81fd9df2272f7bd457cbdac5f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 02:02:47 +0800
Subject: [PATCH 088/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a4645c5..c618080 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.mi
 
 It is written purely in Matlab language. It is self-contained. There is no outside denpency.
 
-Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires statistical toolbox (for some simple random number generator) and image processing toolbox (for reading image data).
+Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires Statistics Toolbox (for some simple random number generator) and Image Processing Toolbox (for reading image data).
 
 Design Goal
 -------

From bb05b4c5e523131dd01fad85baf2692cf1c9b95e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 02:03:57 +0800
Subject: [PATCH 089/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c618080..7223bd0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 Introduction
 -------
-This Matlab package implementes machine learning algorithms described in the great textbook:
+This Matlab package implements machine learning algorithms described in the great textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
 It is written purely in Matlab language. It is self-contained. There is no outside denpency.

From 86686b86be7cd8c499780d087fc5aaaaa950150d Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Sat, 24 Nov 2018 02:08:59 +0800
Subject: [PATCH 090/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7223bd0..1817d05 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Introduction
 This Matlab package implements machine learning algorithms described in the great textbook:
 Pattern Recognition and Machine Learning by C. Bishop ([PRML](http://research.microsoft.com/en-us/um/people/cmbishop/prml/)).
 
-It is written purely in Matlab language. It is self-contained. There is no outside denpency.
+It is written purely in Matlab language. It is self-contained. There is no external dependency.
 
 Note: this package requires Matlab **R2016b** or latter, since it utilizes a new Matlab syntax called [Implicit expansion](https://cn.mathworks.com/help/matlab/release-notes.html?rntext=implicit+expansion&startrelease=R2016b&endrelease=R2016b&groupby=release&sortby=descending) (a.k.a. broadcasting). It also requires Statistics Toolbox (for some simple random number generator) and Image Processing Toolbox (for reading image data).
 

From 25a456ed4c20db65863532cf63f55f82c18e934a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 27 Nov 2018 15:54:22 +0800
Subject: [PATCH 091/119] Set theme jekyll-theme-midnight

---
 _config.yml | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 _config.yml

diff --git a/_config.yml b/_config.yml
new file mode 100644
index 0000000..1885487
--- /dev/null
+++ b/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight
\ No newline at end of file

From 700df90820b1239f4e2739a778dbc59bcb697cfc Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 27 Nov 2018 15:55:58 +0800
Subject: [PATCH 092/119] Set theme jekyll-theme-modernist

---
 _config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_config.yml b/_config.yml
index 1885487..cc35c1d 100644
--- a/_config.yml
+++ b/_config.yml
@@ -1 +1 @@
-theme: jekyll-theme-midnight
\ No newline at end of file
+theme: jekyll-theme-modernist
\ No newline at end of file

From 96db6fb5049a234df0bca71192a1e719a588fd02 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 27 Nov 2018 15:57:54 +0800
Subject: [PATCH 093/119] Set theme jekyll-theme-midnight

---
 _config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_config.yml b/_config.yml
index cc35c1d..1885487 100644
--- a/_config.yml
+++ b/_config.yml
@@ -1 +1 @@
-theme: jekyll-theme-modernist
\ No newline at end of file
+theme: jekyll-theme-midnight
\ No newline at end of file

From 0763001c9491b6efd2ffb1f9990f6f5d61b7f84b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 27 Nov 2018 17:00:57 +0800
Subject: [PATCH 094/119] Delete _config.yml

---
 _config.yml | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 _config.yml

diff --git a/_config.yml b/_config.yml
deleted file mode 100644
index 1885487..0000000
--- a/_config.yml
+++ /dev/null
@@ -1 +0,0 @@
-theme: jekyll-theme-midnight
\ No newline at end of file

From 2025472ae9eac4d7a386c9b9e385b5acd5da3ed3 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 27 Nov 2018 20:02:09 +0800
Subject: [PATCH 095/119] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1817d05..ddd02c6 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,10 @@ Design Goal
 
 Installation
 -------
-1. Download the package to a local folder (e.g. ~/PRMLT/) by running: `git clone https://github.com/PRML/PRMLT.git`.
-
+1. Download the package to a local folder (e.g. ~/PRMLT/) by running: 
+```console
+git clone https://github.com/PRML/PRMLT.git
+```
 2. Run Matlab and navigate to the folder (~/PRMLT/), then run the init.m script.
 
 3. Run some demos in ~/PRMLT/demo folder. Enjoy!

From 0523c2c8ab7ebf3a22d7616b91643e147c235071 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 02:55:51 +0800
Subject: [PATCH 096/119] refactor kalmanFilter and fix kalmanSmoother

---
 chapter13/LDS/kalmanFilter.m   |  4 ++--
 chapter13/LDS/kalmanSmoother.m | 28 ++++++++++++++--------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/chapter13/LDS/kalmanFilter.m b/chapter13/LDS/kalmanFilter.m
index 0005ee6..19a4137 100644
--- a/chapter13/LDS/kalmanFilter.m
+++ b/chapter13/LDS/kalmanFilter.m
@@ -32,11 +32,11 @@
 llh(1) = logGauss(X(:,1),C*mu0,R);
 for i = 2:n
     [mu(:,i), V(:,:,i), llh(i)] = ...
-        forwardStep(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S, I);
+        forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S, I);
 end
 llh = sum(llh);
 
-function [mu, V, llh] = forwardStep(x, mu, V, A, G, C, S, I)
+function [mu, V, llh] = forwardUpdate(x, mu, V, A, G, C, S, I)
 P = A*V*A'+G;                                               % 13.88
 PC = P*C';                                                      
 R = C*PC+S;
diff --git a/chapter13/LDS/kalmanSmoother.m b/chapter13/LDS/kalmanSmoother.m
index 8254230..f6ed2e9 100644
--- a/chapter13/LDS/kalmanSmoother.m
+++ b/chapter13/LDS/kalmanSmoother.m
@@ -1,4 +1,4 @@
-function [nu, U, Ezz, Ezy, llh] = kalmanSmoother(model, X)
+function [nu, U, llh, Ezz, Ezy] = kalmanSmoother(model, X)
 % Kalman smoother (forward-backward algorithm for linear dynamic system)
 % NOTE: This is the exact implementation of the Kalman smoother algorithm in PRML.
 % However, this algorithm is not practical. It is numerical unstable. 
@@ -26,20 +26,19 @@
 P = zeros(q,q,n); % C_{t+1|t}
 Amu = zeros(q,n); % u_{t+1|t}
 llh = zeros(1,n);
-I = eye(q);
 
 % forward
 PC = P0*C';
 R = C*PC+S;
 K = PC/R;
 mu(:,1) = mu0+K*(X(:,1)-C*mu0);
-V(:,:,1) = (I-K*C)*P0;
+V(:,:,1) = (eye(q)-K*C)*P0;
 P(:,:,1) = P0;  % useless, just make a point
 Amu(:,1) = mu0; % useless, just make a point
 llh(1) = logGauss(X(:,1),C*mu0,R);
 for i = 2:n    
     [mu(:,i), V(:,:,i), Amu(:,i), P(:,:,i), llh(i)] = ...
-        forwardStep(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S, I);
+        forwardUpdate(X(:,i), mu(:,i-1), V(:,:,i-1), A, G, C, S);
 end
 llh = sum(llh);
 % backward
@@ -53,24 +52,25 @@
 Ezz(:,:,n) = U(:,:,n)+nu(:,n)*nu(:,n)';
 for i = n-1:-1:1  
     [nu(:,i), U(:,:,i), Ezz(:,:,i), Ezy(:,:,i)] = ...
-        backwardStep(nu(:,i+1), U(:,:,i+1), mu(:,i), V(:,:,i), Amu(:,i+1), P(:,:,i+1), A);
+        backwardUpdate(nu(:,i+1), U(:,:,i+1), mu(:,i), V(:,:,i), Amu(:,i+1), P(:,:,i+1), A);
 end
 
-function [mu, V, Amu, P, llh] = forwardStep(x, mu0, V0, A, G, C, S, I)
+function [mu1, V1, Amu, P, llh] = forwardUpdate(x, mu0, V0, A, G, C, S)
+k = numel(mu0);
 P = A*V0*A'+G;                                              % 13.88
 PC = P*C';
 R = C*PC+S;
 K = PC/R;                                                   % 13.92
 Amu = A*mu0;
 CAmu = C*Amu;
-mu = Amu+K*(x-CAmu);                                        % 13.89
-V = (I-K*C)*P;                                              % 13.90
+mu1 = Amu+K*(x-CAmu);                                        % 13.89
+V1 = (eye(k)-K*C)*P;                                              % 13.90
 llh = logGauss(x,CAmu,R);                                   % 13.91
 
 
-function [nu, U, Ezz, Ezy] = backwardStep(nu0, U0, mu, V, Amu, P, A)
-J = V*A'/P;                                                 % 13.102
-nu = mu+J*(nu0-Amu);                                        % 13.100
-U = V+J*(U0-P)*J';                                          % 13.101
-Ezy = J*U0+nu0*nu';                                         % 13.106 
-Ezz = U+nu*nu';                                             % 13.107
\ No newline at end of file
+function [nu0, U0, E00, E10] = backwardUpdate(nu1, U1, mu, V, Amu, P, A)
+J = V*A'/P;                                                  % 13.102
+nu0 = mu+J*(nu1-Amu);                                        % 13.100
+U0 = V+J*(U1-P)*J';                                          % 13.101
+E00 = U0+nu0*nu0';                                           % 13.107
+E10 = U1*J'+nu1*nu0';                                        % 13.106 

From 469aa06bce9c89c50c55dceb6b901810cc1aa181 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 03:01:30 +0800
Subject: [PATCH 097/119] fix ldsEm

---
 chapter13/LDS/kalmanSmoother.m |  8 ++--
 chapter13/LDS/ldsEm.m          | 76 +++++++++++++++++-----------------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/chapter13/LDS/kalmanSmoother.m b/chapter13/LDS/kalmanSmoother.m
index f6ed2e9..5a8ee7b 100644
--- a/chapter13/LDS/kalmanSmoother.m
+++ b/chapter13/LDS/kalmanSmoother.m
@@ -57,15 +57,15 @@
 
 function [mu1, V1, Amu, P, llh] = forwardUpdate(x, mu0, V0, A, G, C, S)
 k = numel(mu0);
-P = A*V0*A'+G;                                              % 13.88
+P = A*V0*A'+G;                                               % 13.88
 PC = P*C';
 R = C*PC+S;
-K = PC/R;                                                   % 13.92
+K = PC/R;                                                    % 13.92
 Amu = A*mu0;
 CAmu = C*Amu;
 mu1 = Amu+K*(x-CAmu);                                        % 13.89
-V1 = (eye(k)-K*C)*P;                                              % 13.90
-llh = logGauss(x,CAmu,R);                                   % 13.91
+V1 = (eye(k)-K*C)*P;                                         % 13.90
+llh = logGauss(x,CAmu,R);                                    % 13.91
 
 
 function [nu0, U0, E00, E10] = backwardUpdate(nu1, U1, mu, V, Amu, P, A)
diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index 7f283e4..0187a54 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -1,60 +1,60 @@
-function [model, llh] = ldsEm(X, init)
+function [model, llh] = ldsEm(X, m)
 % EM algorithm for parameter estimation of linear dynamic system.
-% NOTE: This is the exact implementation of the EM algorithm in PRML.
-% However, this algorithm is not practical. It is numerical unstable and 
-% there is too much redundant degree of freedom. 
+% NOTE: This is an exact implementation of the algorithm in PRML.
+% However, this algorithm is numerically unstable and has many redundant degrees of freedom.
 % Input:
 %   X: d x n data matrix
-%   model: prior model structure
+%   m: initialization parameter, either an integer giving the dimension of z or
+%   an initial model structure.
 % Output:
 %   model: trained model structure
 %   llh: loglikelihood
+% reference: Bayesian Reasoning and Machine Learning (BRML)
 % Written by Mo Chen (sth4nth@gmail.com).
-d = size(X,1);
-if isstruct(init)   % init with a model
-    model = init;
-elseif numel(init) == 1  % random init with latent k
-    k = init;
-    model.A = randn(k,k);
-    model.G = iwishrnd(eye(k),k);
-    model.C = randn(d,k);
-    model.S = iwishrnd(eye(d),d);
-    model.mu0 = randn(k,1);
-    model.P0 = iwishrnd(eye(k),k);
+if isstruct(m)   % init with a model
+    model = m;
+elseif numel(m) == 1  % random init with latent dimension m
+    model = init(X,m);
 end
-tol = 1e-2;
-maxIter = 100;
+tol = 1e-4;
+maxIter = 1000;
 llh = -inf(1,maxIter);
 for iter = 2:maxIter
 %     E-step
-    [nu, U, Ezz, Ezy, llh(iter)] = kalmanSmoother(model,X);
-    if llh(iter)-llh(iter-1) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
+    [nu, U, llh(iter),Ezz, Ezy] = kalmanSmoother(model,X);
+    if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
 %     M-step 
-    model = maximization(X, nu, U, Ezz, Ezy);
+    model = maximization(model, X, nu, U, Ezz, Ezy);
 end
 llh = llh(2:iter);
 
-function model = maximization(X ,nu, U, Ezz, Ezy)
+function model = init(X, k)
+d = size(X,1);
+model.mu0 = randn(k,1);
+model.P0 = iwishrnd(eye(k),k);
+model.A = randn(k,k);
+model.G = iwishrnd(eye(k),k);
+model.C = randn(d,k);
+model.S = iwishrnd(eye(d),d);
+
+
+function model = maximization(model, X ,nu, U, Ezz, Ezy)
 n = size(X,2);
-mu0 = nu(:,1);
-P0 = U(:,:,1);
+mu0 = nu(:,1);                                    % 13.110
+P0 = U(:,:,1);                                    % 13.111, 13.107
 
-Ezzn = sum(Ezz,3);
-Ezz1 = Ezzn-Ezz(:,:,n);
-Ezz2 = Ezzn-Ezz(:,:,1);
-Ezy = sum(Ezy,3);
+EZZ = sum(Ezz,3);
+EZY = sum(Ezy,3);
+A = EZY/(EZZ-Ezz(:,:,n));                         % 13.113
+G = (EZZ-Ezz(:,:,1)-EZY*A')/(n-1);                % 13.114, BRML 24.5.12
 
-A = Ezy/Ezz1;                                           % 13.113
-EzyA = Ezy*A';
-G = (Ezz2-(EzyA+EzyA')+A*Ezz1*A')/(n-1);                % 13.114
 Xnu = X*nu';
-C = Xnu/Ezzn;                                           % 13.115
-XnuC = Xnu*C';
-S = (X*X'-(XnuC+XnuC')+C*Ezzn*C')/n;                    % 13.116
+C = Xnu/EZZ;                                      % 13.115
+S = (X*X'-Xnu*C')/n;                              % 13.116, BRML 24.5.11
 
-model.A = A;
-model.G = G;
-model.C = C;
-model.S = S;
 model.mu0 = mu0;
 model.P0 = P0;
+model.A = A;
+model.G = (G+G')/2;
+model.C = C;
+model.S = (S+S')/2;
\ No newline at end of file

From ccbf6eaca546c8de9cf6d6e2579388973ceaadf1 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 03:08:05 +0800
Subject: [PATCH 098/119] rewrite ldsRnd and lds_demo

---
 chapter13/LDS/ldsRnd.m | 28 ++++++++--------
 demo/ch13/lds_demo.m   | 76 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/chapter13/LDS/ldsRnd.m b/chapter13/LDS/ldsRnd.m
index 1f4cb12..db73531 100644
--- a/chapter13/LDS/ldsRnd.m
+++ b/chapter13/LDS/ldsRnd.m
@@ -1,4 +1,4 @@
-function [X, Z, model] = ldsRnd(d, k, n)
+function [Z, X] = ldsRnd(model, n)
 % Generate a data sequence from linear dynamic system.
 % Input:
 %   d: dimension of data
@@ -8,25 +8,27 @@
 %   X: d x n data matrix
 %   model: model structure
 % Written by Mo Chen (sth4nth@gmail.com).
-A = randn(k,k);
-G = iwishrnd(eye(k),k);
-C = randn(d,k);
-S = iwishrnd(eye(d),d);
-mu0 = randn(k,1);
-P0 = iwishrnd(eye(k),k);
+mu0 = model.mu0;
+P0 = model.P0;
+A = model.A;
+G = model.G;
+C = model.C;
+S = model.S;
+
+k = size(G,1);
+d = size(S,1);
 
 X = zeros(d,n);
 Z = zeros(k,n);
-Z(:,1) = gaussRnd(mu0,P0);              % 13.80
+Z(:,1) = gaussRnd(mu0,P0);                  % 13.80
 X(:,1) = gaussRnd(C*Z(:,1),S);
 for i = 2:n
-    Z(:,i) = gaussRnd(A*Z(:,i-1),G);           % 13.75, 13.78
-    X(:,i) = gaussRnd(C*Z(:,i),S);      % 13.76, 13.79
+    Z(:,i) = gaussRnd(A*Z(:,i-1),G);        % 13.75, 13.78
+    X(:,i) = gaussRnd(C*Z(:,i),S);          % 13.76, 13.79
 end
-
+model.mu0 = mu0; % prior mean
+model.P0 = P0;  % prior covairance
 model.A = A; % transition matrix 
 model.G = G; % transition covariance
 model.C = C; % emission matrix
 model.S = S;  % emision covariance
-model.mu0 = mu0; % prior mean
-model.P0 = P0;  % prior covairance
diff --git a/demo/ch13/lds_demo.m b/demo/ch13/lds_demo.m
index 8c0b30e..ba9bc50 100644
--- a/demo/ch13/lds_demo.m
+++ b/demo/ch13/lds_demo.m
@@ -1,14 +1,68 @@
-% demos for LDS in ch13
+close all;
+%% generate data
+clear; 
+d = 2;
+k = 4;
+n = 50;
 
-clear; close all;
-d = 3;
-k = 2;
-n = 100;
+A = [1 0 1 0; 
+     0 1 0 1;
+     0 0 1 0;
+     0 0 0 1]; 
+G = 0.001*eye(k);
  
-[X,Z,model] = ldsRnd(d,k,n);
-[mu, V, llh] = kalmanFilter(model, X);
+C = [1 0 0 0;
+     0 1 0 0];
+S = eye(d);
 
-[nu, U, Ezz, Ezy, llh] = kalmanSmoother(model, X);
-% [model, llh] = ldsEm(X,k);
-% plot(llh);
-% 
+mu0 = [8; 10; 1; 0];
+P0 = eye(k);
+
+model.A = A;
+model.G = G;
+model.C = C;
+model.S = S;
+model.mu0 = mu0;
+model.P0 = P0;
+
+[z,x] = ldsRnd(model, n);
+figure;
+hold on
+plot(x(1,:), x(2,:), 'ro');
+plot(z(1,:), z(2,:), 'b*-');
+legend('observed', 'latent')
+axis equal
+hold off
+
+%% filter
+[mu, V, llh] = kalmanFilter(model, x);
+figure
+hold on
+plot(x(1,:), x(2,:), 'ro');
+plot(mu(1,:), mu(2,:), 'b*-');
+legend('observed', 'filtered')
+axis equal
+hold off
+
+%% smoother
+[nu, U, llh] = kalmanSmoother(model, x);
+figure
+hold on
+plot(x(1,:), x(2,:), 'ro');
+plot(nu(1,:), nu(2,:), 'b*-');
+legend('observed', 'smoothed')
+axis equal
+hold off
+
+%% EM
+[model, llh] = ldsEm(x,model);
+nu = kalmanSmoother(model, x);
+figure
+hold on
+plot(x(1,:), x(2,:), 'ro');
+plot(nu(1,:), nu(2,:), 'b*-');
+legend('observed', 'smoothed with fitted model')
+axis equal
+hold off
+figure;
+plot(llh);

From 11687b3a261504a3a738d099b07b92c61db86ddb Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 03:12:11 +0800
Subject: [PATCH 099/119] improve ldsEm

---
 chapter13/LDS/ldsEm.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index 0187a54..5d5b421 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -24,7 +24,7 @@
     [nu, U, llh(iter),Ezz, Ezy] = kalmanSmoother(model,X);
     if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
 %     M-step 
-    model = maximization(model, X, nu, U, Ezz, Ezy);
+    model = maximization(X, nu, U, Ezz, Ezy);
 end
 llh = llh(2:iter);
 
@@ -38,7 +38,7 @@
 model.S = iwishrnd(eye(d),d);
 
 
-function model = maximization(model, X ,nu, U, Ezz, Ezy)
+function model = maximization(X ,nu, U, Ezz, Ezy)
 n = size(X,2);
 mu0 = nu(:,1);                                    % 13.110
 P0 = U(:,:,1);                                    % 13.111, 13.107

From 9a248dab2b872cedbaa6e8a5234828e16298a10b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 03:37:54 +0800
Subject: [PATCH 100/119] tweak hmmEm a little

---
 chapter13/HMM/hmmEm.m | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapter13/HMM/hmmEm.m b/chapter13/HMM/hmmEm.m
index 455106b..691409c 100644
--- a/chapter13/HMM/hmmEm.m
+++ b/chapter13/HMM/hmmEm.m
@@ -21,14 +21,14 @@
     E = normalize(rand(k,d),2);
 end
 tol = 1e-4;
-maxIter = 100;
+maxIter = 1000;
 llh = -inf(1,maxIter);
 for iter = 2:maxIter
     M = E*X;
 %     E-step
     [gamma,alpha,beta,c] = hmmSmoother(M,A,s);
     llh(iter) = mean(log(c));
-    if llh(iter)-llh(iter-1) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
+    if abs(llh(iter)-llh(iter-1)) < tol*abs(llh(iter-1)); break; end   % check likelihood for convergence
 %     M-step 
     s = gamma(:,1);                                                                             % 13.18
     A = normalize(A.*(alpha(:,1:n-1)*(beta(:,2:n).*M(:,2:n)./c(2:n))'),2);      % 13.19 13.43 13.65

From fb04ac210add4ca4d958d58bcdde921c2bd84444 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 03:38:27 +0800
Subject: [PATCH 101/119] add empty ldsPca

---
 chapter13/LDS/ldsPca.m | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 chapter13/LDS/ldsPca.m

diff --git a/chapter13/LDS/ldsPca.m b/chapter13/LDS/ldsPca.m
new file mode 100644
index 0000000..8ed035a
--- /dev/null
+++ b/chapter13/LDS/ldsPca.m
@@ -0,0 +1,12 @@
+function [A, C, Z] = ldsPca(X, k, m)
+% Subspace method for learning linear dynamic system.
+% Input:
+%   X: d x n data matrix
+%   k: dimension of hidden variable
+%   m: 
+% Output:
+%   A: 
+%   C:
+%   Z:
+% reference: Bayesian Reasoning and Machine Learning (BRML) chapter 24.5.3 p.507
+% Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file

From d1a3ae66bd6fe8d97141ff09b454ab155f1a240e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 29 Nov 2018 04:26:10 +0800
Subject: [PATCH 102/119] tweak ldsEm a little

---
 chapter13/LDS/ldsEm.m | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index 5d5b421..0ce4d3e 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -40,8 +40,6 @@
 
 function model = maximization(X ,nu, U, Ezz, Ezy)
 n = size(X,2);
-mu0 = nu(:,1);                                    % 13.110
-P0 = U(:,:,1);                                    % 13.111, 13.107
 
 EZZ = sum(Ezz,3);
 EZY = sum(Ezy,3);
@@ -52,8 +50,8 @@
 C = Xnu/EZZ;                                      % 13.115
 S = (X*X'-Xnu*C')/n;                              % 13.116, BRML 24.5.11
 
-model.mu0 = mu0;
-model.P0 = P0;
+model.mu0 = nu(:,1);                              % 13.110
+model.P0 = U(:,:,1);                              % 13.111, 13.107 
 model.A = A;
 model.G = (G+G')/2;
 model.C = C;

From 91cefe8c9643a1ff5faa41dd9e3262b9878cd023 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 30 Nov 2018 00:51:37 +0800
Subject: [PATCH 103/119] add ldsPca

---
 chapter13/LDS/ldsPca.m | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/chapter13/LDS/ldsPca.m b/chapter13/LDS/ldsPca.m
index 8ed035a..d484a8c 100644
--- a/chapter13/LDS/ldsPca.m
+++ b/chapter13/LDS/ldsPca.m
@@ -3,10 +3,18 @@
 % Input:
 %   X: d x n data matrix
 %   k: dimension of hidden variable
-%   m: 
+%   m: stacking order for the Hankel matrix
 % Output:
-%   A: 
-%   C:
-%   Z:
+%   A: k x k transition matrix
+%   C: d x k emission matrix
+%   Z: k x (n-m+1) latent variable
+%   Y: d x n reconstructed data
 % reference: Bayesian Reasoning and Machine Learning (BRML) chapter 24.5.3 p.507
-% Written by Mo Chen (sth4nth@gmail.com).
\ No newline at end of file
+% Written by Mo Chen (sth4nth@gmail.com).
+[d,n] = size(X);
+H = reshape(X(:,hankel(1:m,m:n)),d*m,[]);
+[U,S,V] = svd(H,'econ');
+C = U(1:d,1:k);
+Z = S(1:k,1:k)*V(:,1:k)';
+A = Z(:,2:end)/Z(:,1:end-1); % estimated transition
+% Y = C*Z; % reconstructions
\ No newline at end of file

From ca599be9687287c0e0a3db20aeeacc9025f82515 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 30 Nov 2018 01:22:41 +0800
Subject: [PATCH 104/119] update lds_demo. TODO: init with ldsPCA in ldsEM

---
 chapter13/LDS/ldsEm.m |  8 +++++-
 demo/ch13/lds_demo.m  | 61 ++++++++++++++++++++++++++-----------------
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index 0ce4d3e..7611595 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -36,7 +36,13 @@
 model.G = iwishrnd(eye(k),k);
 model.C = randn(d,k);
 model.S = iwishrnd(eye(d),d);
-
+% [A,C,Z] = ldsPca(X,k,3*k);
+% model.mu0 = Z(:,1);
+% model.P0 = ;
+% model.A = A;
+% model.C = C;
+% model.G = ;
+% model.S = ;
 
 function model = maximization(X ,nu, U, Ezz, Ezy)
 n = size(X,2);
diff --git a/demo/ch13/lds_demo.m b/demo/ch13/lds_demo.m
index ba9bc50..fe9e421 100644
--- a/demo/ch13/lds_demo.m
+++ b/demo/ch13/lds_demo.m
@@ -1,21 +1,19 @@
 close all;
-%% generate data
+%% Parameter
 clear; 
 d = 2;
-k = 4;
+k = 2;
 n = 50;
 
-A = [1 0 1 0; 
-     0 1 0 1;
-     0 0 1 0;
-     0 0 0 1]; 
-G = 0.001*eye(k);
+A = [1,1; 
+     0 1];
+G = eye(k)*1e-3;
  
-C = [1 0 0 0;
-     0 1 0 0];
-S = eye(d);
+C = [1 0;
+     0 1];
+S = eye(d)*1e-1;
 
-mu0 = [8; 10; 1; 0];
+mu0 = [0; 0];
 P0 = eye(k);
 
 model.A = A;
@@ -25,43 +23,58 @@
 model.mu0 = mu0;
 model.P0 = P0;
 
-[z,x] = ldsRnd(model, n);
+%% Generate data
+[z,x] = ldsRnd(model,n);
 figure;
 hold on
 plot(x(1,:), x(2,:), 'ro');
 plot(z(1,:), z(2,:), 'b*-');
 legend('observed', 'latent')
+title('Generated Data')
 axis equal
 hold off
-
-%% filter
-[mu, V, llh] = kalmanFilter(model, x);
+%% Kalman filter
+[mu, V, llh] = kalmanFilter(model,x);
 figure
 hold on
 plot(x(1,:), x(2,:), 'ro');
 plot(mu(1,:), mu(2,:), 'b*-');
 legend('observed', 'filtered')
+title('Kalman filter')
 axis equal
 hold off
-
-%% smoother
-[nu, U, llh] = kalmanSmoother(model, x);
+%% Kalman smoother
+[nu, U, llh] = kalmanSmoother(model,x);
 figure
 hold on
 plot(x(1,:), x(2,:), 'ro');
 plot(nu(1,:), nu(2,:), 'b*-');
 legend('observed', 'smoothed')
+title('Kalman smoother')
 axis equal
 hold off
-
-%% EM
-[model, llh] = ldsEm(x,model);
-nu = kalmanSmoother(model, x);
+%% LDS Subspace
+[A,C,z] = ldsPca(x,k,3*k);
+y = C*z;
+t = size(z,2);
+figure;
+hold on
+plot(x(1,1:t), x(2,1:t), 'ro');
+plot(y(1,1:t), y(2,1:t), 'b*-');
+legend('observed', 'projected')
+title('LDS subspace learning')
+axis equal
+hold off
+%% LDS EM
+[model, llh] = ldsEm(x,k);
+nu = kalmanSmoother(model,x);
+y = model.C*nu;
 figure
 hold on
 plot(x(1,:), x(2,:), 'ro');
-plot(nu(1,:), nu(2,:), 'b*-');
-legend('observed', 'smoothed with fitted model')
+plot(y(1,:), y(2,:), 'b*-');
+legend('observed', 'learned')
+title('LDS EM learning')
 axis equal
 hold off
 figure;

From 62279de8275618a79ccabec77cd90a484021bd61 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 30 Nov 2018 05:55:49 +0800
Subject: [PATCH 105/119] modify ldsEm to use ldsPca as initialization

---
 chapter13/LDS/ldsEm.m | 33 ++++++++++++++++++---------------
 demo/ch13/lds_demo.m  | 29 +++++++++++++++--------------
 2 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/chapter13/LDS/ldsEm.m b/chapter13/LDS/ldsEm.m
index 7611595..d07620a 100644
--- a/chapter13/LDS/ldsEm.m
+++ b/chapter13/LDS/ldsEm.m
@@ -17,7 +17,7 @@
     model = init(X,m);
 end
 tol = 1e-4;
-maxIter = 1000;
+maxIter = 2000;
 llh = -inf(1,maxIter);
 for iter = 2:maxIter
 %     E-step
@@ -29,20 +29,23 @@
 llh = llh(2:iter);
 
 function model = init(X, k)
-d = size(X,1);
-model.mu0 = randn(k,1);
-model.P0 = iwishrnd(eye(k),k);
-model.A = randn(k,k);
-model.G = iwishrnd(eye(k),k);
-model.C = randn(d,k);
-model.S = iwishrnd(eye(d),d);
-% [A,C,Z] = ldsPca(X,k,3*k);
-% model.mu0 = Z(:,1);
-% model.P0 = ;
-% model.A = A;
-% model.C = C;
-% model.G = ;
-% model.S = ;
+% d = size(X,1);
+% model.mu0 = randn(k,1);
+% model.P0 = iwishrnd(eye(k),k);
+% model.A = randn(k,k);
+% model.G = iwishrnd(eye(k),k);
+% model.C = randn(d,k);
+% model.S = iwishrnd(eye(d),d);
+[A,C,Z] = ldsPca(X,k,3*k);
+model.mu0 = Z(:,1);
+E = Z(:,1:end-1)-Z(:,2:end);
+model.P0 = (dot(E(:),E(:))/(k*size(E,2)))*eye(k);
+model.A = A;
+E = A*Z(:,1:end-1)-Z(:,2:end);
+model.G = E*E'/size(E,2);
+model.C = C;
+E = C*Z-X(:,1:size(Z,2));
+model.S = E*E'/size(E,2);
 
 function model = maximization(X ,nu, U, Ezz, Ezy)
 n = size(X,2);
diff --git a/demo/ch13/lds_demo.m b/demo/ch13/lds_demo.m
index fe9e421..42742ae 100644
--- a/demo/ch13/lds_demo.m
+++ b/demo/ch13/lds_demo.m
@@ -1,19 +1,20 @@
 close all;
-%% Parameter
+% Parameter
 clear; 
 d = 2;
-k = 2;
-n = 50;
+k = 3;
+n = 100;
 
-A = [1,1; 
-     0 1];
+A = [1,0,1; 
+     0 1,0;
+     0,0,1];
 G = eye(k)*1e-3;
  
-C = [1 0;
-     0 1];
+C = [1,0,0;
+     0 1,0];
 S = eye(d)*1e-1;
 
-mu0 = [0; 0];
+mu0 = [0;0;0];
 P0 = eye(k);
 
 model.A = A;
@@ -54,9 +55,9 @@
 axis equal
 hold off
 %% LDS Subspace
-[A,C,z] = ldsPca(x,k,3*k);
-y = C*z;
-t = size(z,2);
+[A,C,nu] = ldsPca(x,k,3*k);
+y = C*nu;
+t = size(y,2);
 figure;
 hold on
 plot(x(1,1:t), x(2,1:t), 'ro');
@@ -66,9 +67,9 @@
 axis equal
 hold off
 %% LDS EM
-[model, llh] = ldsEm(x,k);
-nu = kalmanSmoother(model,x);
-y = model.C*nu;
+[tmodel, llh] = ldsEm(x,k);
+nu = kalmanSmoother(tmodel,x);
+y = tmodel.C*nu;
 figure
 hold on
 plot(x(1,:), x(2,:), 'ro');

From 0e1acae939e9ea5554cff3319287199e7bbd892f Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 30 Nov 2018 19:13:22 +0800
Subject: [PATCH 106/119] add Contents.m

---
 Contents.m                     | 124 +++++++++++++++++++++++++++++++++
 chapter07/rvmRegSeq.m          |   1 -
 chapter08/NaiveBayes/nbGauss.m |   4 +-
 3 files changed, 126 insertions(+), 3 deletions(-)
 create mode 100644 Contents.m

diff --git a/Contents.m b/Contents.m
new file mode 100644
index 0000000..f3c492c
--- /dev/null
+++ b/Contents.m
@@ -0,0 +1,124 @@
+% CHAPTER01
+%   condEntropy      - Compute conditional entropy z=H(x|y) of two discrete variables x and y.
+%   entropy          - Compute entropy z=H(x) of a discrete variable x.
+%   jointEntropy     - Compute joint entropy z=H(x,y) of two discrete variables x and y.
+%   mutInfo          - Compute mutual information I(x,y) of two discrete variables x and y.
+%   nmi              - Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y.
+%   nvi              - Compute normalized variation information z=(1-I(x,y)/H(x,y)) of two discrete variables x and y.
+%   relatEntropy     - Compute relative entropy (a.k.a KL divergence) z=KL(p(x)||p(y)) of two discrete variables x and y.
+% CHAPTER02    
+%   logDirichlet     - Compute log pdf of a Dirichlet distribution.
+%   logGauss         - Compute log pdf of a Gaussian distribution.
+%   logKde           - Compute log pdf of kernel density estimator.
+%   logMn            - Compute log pdf of a multinomial distribution.
+%   logMvGamma       - Compute logarithm multivariate Gamma function 
+%   logSt            - Compute log pdf of a Student's t distribution.
+%   logVmf           - Compute log pdf of a von Mises-Fisher distribution.
+%   logWishart       - Compute log pdf of a Wishart distribution.
+% CHAPTER03    
+%   linReg           - Fit linear regression model y=w'x+w0  
+%   linRegFp         - Fit empirical Bayesian linear model with Mackay fixed point method (p.168)
+%   linRegPred       - Compute linear regression model response y = w'*X+w0 and likelihood
+%   linRnd           - Generate data from a linear model p(t|w,x)=G(w'x+w0,sigma), sigma=sqrt(1/beta) 
+% CHAPTER04    
+%   binPlot          - Plot binary classification result for 2d data
+%   fda              - Fisher (linear) discriminant analysis
+%   logitBin         - Logistic regression for binary classification optimized by Newton-Raphson method.
+%   logitBinPred     - Prediction of binary logistic regression model
+%   logitMn          - Multinomial regression for multiclass problem (Multinomial likelihood)
+%   logitMnPred      - Prediction of multiclass (multinomial) logistic regression model
+%   sigmoid          - Sigmod function
+%   softmax          - Softmax function
+% CHAPTER05    
+%   mlpReg           - Train a multilayer perceptron neural network
+%   mlpRegPred       - Multilayer perceptron prediction
+% CHAPTER06    
+%   kn2sd            - Transform a kernel matrix (or inner product matrix) to a squared distance matrix
+%   knCenter         - Centerize the data in the kernel space
+%   knGauss          - Gaussian (RBF) kernel K = exp(-|x-y|/(2s));
+%   knKmeans         - Perform kernel kmeans clustering.
+%   knKmeansPred     - Prediction for kernel kmeans clustering
+%   knLin            - Linear kernel (inner product)
+%   knPca            - Kernel PCA
+%   knPcaPred        - Prediction for kernel PCA
+%   knPoly           - Polynomial kernel k(x,y)=(x'y+c)^o
+%   knReg            - Gaussian process (kernel) regression
+%   knRegPred        - Prediction for Gaussian Process (kernel) regression model
+%   sd2kn            - Transform a squared distance matrix to a kernel matrix. 
+% CHAPTER07    
+%   rvmBinFp         - Relevance Vector Machine (ARD sparse prior) for binary classification.
+%   rvmBinPred       - Predict the label for binary logistic regression model
+%   rvmRegFp         - Relevance Vector Machine (ARD sparse prior) for regression
+%   rvmRegPred       - Compute RVM regression model response y = w'*X+w0 and likelihood 
+%   rvmRegSeq        - Sparse Bayesian Regression (RVM) using sequential algorithm
+% CHAPTER08    
+%  MRF    
+%   mrfBethe         - Compute Bethe energy
+%   mrfBp            - Undirected graph belief propagation for MRF
+%   mrfGibbs         - Compute Gibbs energy
+%   mrfIsGa          - Construct a latent Ising MRF with Gaussian observation
+%   mrfMf            - Mean field for MRF
+%  NaiveBayes    
+%   nbBern           - Naive bayes classifier with independent Bernoulli.
+%   nbBernPred       - Prediction of naive Bayes classifier with independent Bernoulli.
+%   nbGauss          - Naive bayes classifier with independent Gaussian
+%   nbGaussPred      - Prediction of naive Bayes classifier with independent Gaussian.
+% CHAPTER09    
+%   kmeans           - Perform kmeans clustering.
+%   kmeansPred       - Prediction for kmeans clustering
+%   kmeansRnd        - Generate samples from a Gaussian mixture distribution with common variances (kmeans model).
+%   kmedoids         - Perform k-medoids clustering.
+%   kseeds           - Perform kmeans++ seeding
+%   linRegEm         - Fit empirical Bayesian linear regression model with EM (p.448 chapter 9.3.4)
+%   mixBernEm        - Perform EM algorithm for fitting the Bernoulli mixture model.
+%   mixBernRnd       - Generate samples from a Bernoulli mixture distribution.
+%   mixGaussEm       - Perform EM algorithm for fitting the Gaussian mixture model.
+%   mixGaussPred     - Predict label and responsibility for Gaussian mixture model.
+%   mixGaussRnd      - Generate samples from a Gaussian mixture model.
+%   rvmBinEm         - Relevance Vector Machine (ARD sparse prior) for binary classification.
+%   rvmRegEm         - Relevance Vector Machine (ARD sparse prior) for regression
+% CHAPTER10
+%   linRegVb         - Variational Bayesian inference for linear regression.
+%   mixGaussEvidence - Variational lower bound of the model evidence (log of marginal likelihood)
+%   mixGaussVb       - Variational Bayesian inference for Gaussian mixture.
+%   mixGaussVbPred   - Predict label and responsibility for Gaussian mixture model trained by VB.
+%   rvmRegVb         - Variational Bayesian inference for RVM regression.
+% CHAPTER11
+%   dirichletRnd     - Generate samples from a Dirichlet distribution.
+%   discreteRnd      - Generate samples from a discrete distribution (multinomial).
+%   Gauss            - Class for Gaussian distribution used by Dirichlet process
+%   gaussRnd         - Generate samples from a Gaussian distribution.
+%   GaussWishart     - Class for Gaussian-Wishart distribution used by Dirichlet process
+%   mixDpGb          - Collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
+%   mixDpGbOl        - Online collapsed Gibbs sampling for Dirichlet process (infinite) mixture model. 
+%   mixGaussGb       - Collapsed Gibbs sampling for Dirichlet process (infinite) Gaussian mixture model (a.k.a. DPGM). 
+%   mixGaussSample   - Generate samples from a Gaussian mixture model with GaussianWishart prior.
+% CHAPTER12 
+%   fa               - Perform EM algorithm for factor analysis model
+%   pca              - Principal component analysis
+%   pcaEm            - Perform EM-like algorithm for PCA (by Sam Roweis).
+%   pcaEmC           - Perform Constrained EM like algorithm for PCA.
+%   ppcaEm           - Perform EM algorithm to maximize likelihood of probabilistic PCA model.
+%   ppcaRnd          - Generate data from probabilistic PCA model
+%   ppcaVb           - Perform variational Bayesian inference for probabilistic PCA model. 
+% CHAPTER13 
+%  HMM 
+%   hmmEm            - EM algorithm to fit the parameters of HMM model (a.k.a Baum-Welch algorithm)
+%   hmmFilter        - HMM forward filtering algorithm. 
+%   hmmRnd           - Generate a data sequence from a hidden Markov model.
+%   hmmSmoother      - HMM smoothing algorithm (normalized forward-backward or normalized alpha-beta algorithm).
+%   hmmViterbi       - Viterbi algorithm (calculated in log scale to improve numerical stability).
+%  LDS 
+%   kalmanFilter     - Kalman filter (forward algorithm for linear dynamic system)
+%   kalmanSmoother   - Kalman smoother (forward-backward algorithm for linear dynamic system)
+%   ldsEm            - EM algorithm for parameter estimation of linear dynamic system.
+%   ldsPca           - Subspace method for learning linear dynamic system.
+%   ldsRnd           - Generate a data sequence from linear dynamic system.
+% CHAPTER14 
+%   adaboostBin      - Adaboost for binary classification (weak learner: kmeans)
+%   adaboostBinPred  - Prediction of binary Adaboost
+%   mixLinPred       - Prediction function for mixture of linear regression
+%   mixLinReg        - Mixture of linear regression
+%   mixLinRnd        - Generate data from mixture of linear model
+%   mixLogitBin      - Mixture of logistic regression model for binary classification optimized by Newton-Raphson method
+%   mixLogitBinPred  - Prediction function for mixture of logistic regression
diff --git a/chapter07/rvmRegSeq.m b/chapter07/rvmRegSeq.m
index 97b93db..7fcad31 100644
--- a/chapter07/rvmRegSeq.m
+++ b/chapter07/rvmRegSeq.m
@@ -1,5 +1,4 @@
 function [model, llh] = rvmRegSeq(X, t)
-% TODO: beta is not updated.
 % Sparse Bayesian Regression (RVM) using sequential algorithm
 % Input:
 %   X: d x n data
diff --git a/chapter08/NaiveBayes/nbGauss.m b/chapter08/NaiveBayes/nbGauss.m
index 6b4e8b2..c8061e4 100644
--- a/chapter08/NaiveBayes/nbGauss.m
+++ b/chapter08/NaiveBayes/nbGauss.m
@@ -1,6 +1,6 @@
 function model = nbGauss(X, t)
-% Naive bayes classifier with indepenet Gaussian, each dimension of data is
-% assumed from a 1d Gaussian distribution with independent mean and variance.
+% Naive bayes classifier with independent Gaussian
+% Each dimension of data is assumed from a 1d Gaussian distribution with independent mean and variance.
 % Input:
 %   X: d x n data matrix
 %   t: 1 x n label (1~k)

From a117a4364fef63d24e3e34403e69a522771ee25a Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 4 Dec 2018 00:56:24 +0800
Subject: [PATCH 107/119] add MLP classification

---
 chapter05/mlpClass.m     | 63 ++++++++++++++++++++++++++++++++++++++++
 chapter05/mlpClassPred.m | 19 ++++++++++++
 chapter05/mlpReg.m       | 33 +++++++++++----------
 chapter05/mlpRegPred.m   |  3 +-
 demo/ch05/mlp_demo.m     | 25 +++++++++++++---
 5 files changed, 123 insertions(+), 20 deletions(-)
 create mode 100644 chapter05/mlpClass.m
 create mode 100644 chapter05/mlpClassPred.m

diff --git a/chapter05/mlpClass.m b/chapter05/mlpClass.m
new file mode 100644
index 0000000..0a5d645
--- /dev/null
+++ b/chapter05/mlpClass.m
@@ -0,0 +1,63 @@
+function [model, L] = mlpClass(X,y,k,lambda)
+% Train a multilayer perceptron neural network for classification with backpropagation
+% logistic activation function is used.
+% Input:
+%   X: d x n data matrix
+%   Y: p x n response matrix
+%   k: T x 1 vector to specify number of hidden nodes in each layer
+%   lambda: regularization parameter
+% Ouput:
+%   model: model structure with fields W (weights) and b (biases), one cell per layer
+%   L: (regularized cross entropy) loss recorded at each iteration
+% Written by Mo Chen (sth4nth@gmail.com).
+if nargin < 4
+    lambda = 1e-2;          % default regularization strength
+end
+eta = 1e-3;                 % gradient descent step size
+tol = 1e-4;                 % relative tolerance on the loss change used for stopping
+maxiter = 50000;            % upper bound on the number of iterations
+L = inf(1,maxiter);         % L(1) stays inf, so the stopping test cannot fire at iter = 2
+
+Y = sparse(y,1:numel(y),1); % one-hot targets: Y(y(i),i) = 1
+k = [size(X,1);k(:);size(Y,1)]; % layer sizes: input, hidden layers, output
+T = numel(k)-1;             % number of weight layers
+W = cell(T,1);
+b = cell(T,1);
+for t = 1:T
+    W{t} = randn(k(t),k(t+1));  % random initial weights, k(t) x k(t+1)
+    b{t} = randn(k(t+1),1);     % random initial biases
+end
+R = cell(T,1);              % backpropagated errors (deltas), one per weight layer
+Z = cell(T+1,1);            % activations per layer; Z{1} is the input
+Z{1} = X;
+for iter = 2:maxiter
+%     forward
+    for t = 1:T-1
+        Z{t+1} = sigmoid(W{t}'*Z{t}+b{t});         % 5.10 5.113
+    end
+    Z{T+1} = softmax(W{T}'*Z{T}+b{T});   % output layer uses softmax instead of sigmoid
+    
+%     loss
+    E = Z{T+1};
+    Wn = cellfun(@(x) dot(x(:),x(:)),W);            % |W|^2
+    L(iter) = -dot(Y(:),log(E(:)))+0.5*lambda*sum(Wn); % cross entropy plus L2 penalty on weights
+    if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end % stop on small relative change
+
+%     backward
+    R{T} = Z{T+1}-Y;                % output error: prediction minus one-hot target
+    for t = T-1:-1:1
+        df = Z{t+1}.*(1-Z{t+1});    % h'(a)
+        R{t} = df.*(W{t+1}*R{t+1});     % 5.66
+    end
+    
+%     gradient descent
+    for t=1:T
+        dW = Z{t}*R{t}'+lambda*W{t};      % 5.67
+        db = sum(R{t},2);                 % bias gradient: deltas summed over columns (samples)
+        W{t} = W{t}-eta*dW;               % 5.43
+        b{t} = b{t}-eta*db;
+    end
+end
+L = L(2:iter);              % drop the inf placeholder and any unused trailing entries
+model.W = W;
+model.b = b;
diff --git a/chapter05/mlpClassPred.m b/chapter05/mlpClassPred.m
new file mode 100644
index 0000000..0c94742
--- /dev/null
+++ b/chapter05/mlpClassPred.m
@@ -0,0 +1,19 @@
+function [y, P] = mlpClassPred(model, X)
+% Multilayer perceptron classification prediction
+% logistic activation function is used in hidden layers, softmax at the output.
+% Input:
+%   model: model structure with cell arrays W (weights) and b (biases)
+%   X: d x n data matrix
+% Output:
+%   y: 1 x n label vector (row index of the largest entry in each column of P)
+%   P: k x n probability matrix
+% Written by Mo Chen (sth4nth@gmail.com).
+W = model.W;
+b = model.b;
+T = length(W);              % number of weight layers
+Z = X;
+for t = 1:T-1
+    Z = sigmoid(W{t}'*Z+b{t});   % forward pass through hidden layers
+end
+P = softmax(W{T}'*Z+b{T});   % output layer
+[~,y] = max(P,[],1);  % label = index of the maximum along each column
\ No newline at end of file
diff --git a/chapter05/mlpReg.m b/chapter05/mlpReg.m
index caf42d1..d3759eb 100644
--- a/chapter05/mlpReg.m
+++ b/chapter05/mlpReg.m
@@ -1,22 +1,24 @@
-function [model, L] = mlpReg(X,Y,k,lambda)
-% Train a multilayer perceptron neural network
+function [model, L] = mlpReg(X,y,k,lambda)
+% Train a multilayer perceptron neural network for regression with backpropagation
+% tanh activation function is used
 % Input:
 %   X: d x n data matrix
-%   Y: p x n response matrix
+%   y: p x n response matrix
 %   k: T x 1 vector to specify number of hidden nodes in each layer
 %   lambda: regularization parameter
 % Ouput:
 %   model: model structure
-%   L: loss
+%   L: (regularized least square) loss
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin < 4
     lambda = 1e-2;
 end
-eta = 1e-3;
+eta = 1e-5;
+tol = 1e-5;
 maxiter = 50000;
 L = inf(1,maxiter);
 
-k = [size(X,1);k(:);size(Y,1)];
+k = [size(X,1);k(:);size(y,1)];
 T = numel(k)-1;
 W = cell(T,1);
 b = cell(T,1);
@@ -30,30 +32,31 @@
 for iter = 2:maxiter
 %     forward
     for t = 1:T-1
-        Z{t+1} = tanh(W{t}'*Z{t}+b{t});
+        Z{t+1} = tanh(W{t}'*Z{t}+b{t});             % 5.10 5.113
     end
-    Z{T+1} = W{T}'*Z{T}+b{T};
+    Z{T+1} = W{T}'*Z{T}+b{T};                       % 5.114
 
 %     loss
-    E = Z{T+1}-Y;     
+    E = Z{T+1}-y;     
     Wn = cellfun(@(x) dot(x(:),x(:)),W);            % |W|^2
     L(iter) = dot(E(:),E(:))+lambda*sum(Wn);
-
+    if abs(L(iter)-L(iter-1)) < tol*L(iter-1); break; end
+    
 %     backward
-    R{T} = E;                % delta
+    R{T} = E;                
     for t = T-1:-1:1
         df = 1-Z{t+1}.^2;    % h'(a)
-        R{t} = df.*(W{t+1}*R{t+1});    % delta
+        R{t} = df.*(W{t+1}*R{t+1});    % 5.66
     end
     
 %     gradient descent
     for t=1:T
-        dW = Z{t}*R{t}'+lambda*W{t};
+        dW = Z{t}*R{t}'+lambda*W{t};    % 5.67
         db = sum(R{t},2);
-        W{t} = W{t}-eta*dW;
+        W{t} = W{t}-eta*dW;             % 5.43
         b{t} = b{t}-eta*db;
     end
 end
-L = L(1,2:iter);
+L = L(2:iter);
 model.W = W;
 model.b = b;
diff --git a/chapter05/mlpRegPred.m b/chapter05/mlpRegPred.m
index e3bba3f..d2e67f9 100644
--- a/chapter05/mlpRegPred.m
+++ b/chapter05/mlpRegPred.m
@@ -1,5 +1,6 @@
 function Y = mlpRegPred(model, X)
-% Multilayer perceptron prediction
+% Multilayer perceptron regression prediction
+% tanh activation function is used.
 % Input:
 %   model: model structure
 %   X: d x n data matrix
diff --git a/demo/ch05/mlp_demo.m b/demo/ch05/mlp_demo.m
index 75c170a..70b57b3 100644
--- a/demo/ch05/mlp_demo.m
+++ b/demo/ch05/mlp_demo.m
@@ -1,15 +1,32 @@
-clear; close all;
+clear; close all
+%% Regression
 n = 200;
 x = linspace(0,2*pi,n);
 y = sin(x);
 
-k = [3,4];            % two hidden layers with 3 and 4 hidden nodes
+h = [10,6];            % two hidden layers with 10 and 6 neurons
 lambda = 1e-2;
-[model, L] = mlpReg(x,y,k);
+[model, L] = mlpReg(x,y,h,lambda);
 t = mlpRegPred(model,x);
 plot(L);
 figure;
 hold on
 plot(x,y,'.');
 plot(x,t);
-hold off
\ No newline at end of file
+hold off
+%% Classification
+clear;
+k = 2;
+n = 200;
+[X,y] = kmeansRnd(2,k,n);
+figure;
+plotClass(X,y);
+
+h = 3;
+lambda = 1e-2;
+[model, llh] = mlpClass(X,y,h,lambda);
+[t,p] = mlpClassPred(model,X);
+figure;
+plotClass(X,t);
+figure;
+plot(llh);
\ No newline at end of file

From 36900bd74681dc529e02974010ace70f7625f04b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 4 Dec 2018 00:58:48 +0800
Subject: [PATCH 108/119] update Contents.m

---
 Contents.m | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Contents.m b/Contents.m
index f3c492c..bd96bba 100644
--- a/Contents.m
+++ b/Contents.m
@@ -29,9 +29,11 @@
 %   logitMnPred      - Prediction of multiclass (multinomial) logistic regression model
 %   sigmoid          - Sigmod function
 %   softmax          - Softmax function
-% CHAPTER05    
-%   mlpReg           - Train a multilayer perceptron neural network
-%   mlpRegPred       - Multilayer perceptron prediction
+% CHAPTER05
+%   mlpClass         - Train a multilayer perceptron neural network for classification with backpropagation
+%   mlpClassPred     - Multilayer perceptron classification prediction
+%   mlpReg           - Train a multilayer perceptron neural network for regression with backpropagation
+%   mlpRegPred       - Multilayer perceptron regression prediction
 % CHAPTER06    
 %   kn2sd            - Transform a kernel matrix (or inner product matrix) to a squared distance matrix
 %   knCenter         - Centerize the data in the kernel space

From f2d8141d0fa749af5c186921cd0281a9fd529309 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Fri, 7 Dec 2018 00:03:18 +0800
Subject: [PATCH 109/119] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ddd02c6..ec97ac6 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Note: this package requires Matlab **R2016b** or latter, since it utilizes a new
 Design Goal
 -------
 * Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted.
-* Efficient: Many tricks to speedup Matlab code were applied (eg. vectorization, matrix factorization, etc.). Usually, functions in this package are orders faster than Matlab builtin ones (eg. kmeans).
+* Efficient: Many tricks to speedup Matlab code are applied (eg. vectorization, matrix factorization, etc.). Usually, functions in this package are orders faster than Matlab builtin ones (e.g. kmeans).
 * Robust: Many tricks for numerical stability are applied, such as computing probability in log domain, square root matrix update to enforce matrix symmetry\PD, etc.
 * Readable: The code is heavily commented. Corresponding formulas in PRML are annoted. Symbols are in sync with the book.
 * Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).

From 87260d58048a656a270207a9f7f21050df72efce Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 24 Jan 2019 16:05:57 +0800
Subject: [PATCH 110/119] fix doc

---
 chapter05/mlpClass.m | 6 +++---
 chapter05/mlpReg.m   | 4 ++--
 demo/ch05/mlp_demo.m | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/chapter05/mlpClass.m b/chapter05/mlpClass.m
index 0a5d645..6e626de 100644
--- a/chapter05/mlpClass.m
+++ b/chapter05/mlpClass.m
@@ -1,9 +1,9 @@
-function [model, L] = mlpClass(X,y,k,lambda)
-% Train a multilayer perceptron neural network for classification with backpropagation
+function [model, L] = mlpClass(X, y, k, lambda)
+% Train a multilayer perceptron neural network for multiclass classification with backpropagation
 % logistic activation function is used.
 % Input:
 %   X: d x n data matrix
-%   Y: p x n response matrix
+%   y: 1 x n label vector
 %   k: T x 1 vector to specify number of hidden nodes in each layer
 %   lambda: regularization parameter
 % Ouput:
diff --git a/chapter05/mlpReg.m b/chapter05/mlpReg.m
index d3759eb..b6b3378 100644
--- a/chapter05/mlpReg.m
+++ b/chapter05/mlpReg.m
@@ -1,9 +1,9 @@
-function [model, L] = mlpReg(X,y,k,lambda)
+function [model, L] = mlpReg(X, y, k, lambda)
 % Train a multilayer perceptron neural network for regression with backpropagation
 % tanh activation function is used
 % Input:
 %   X: d x n data matrix
-%   y: p x n response matrix
+%   y: 1 x n real value response vector
 %   k: T x 1 vector to specify number of hidden nodes in each layer
 %   lambda: regularization parameter
 % Ouput:
diff --git a/demo/ch05/mlp_demo.m b/demo/ch05/mlp_demo.m
index 70b57b3..33f77b3 100644
--- a/demo/ch05/mlp_demo.m
+++ b/demo/ch05/mlp_demo.m
@@ -26,7 +26,7 @@
 lambda = 1e-2;
 [model, llh] = mlpClass(X,y,h,lambda);
 [t,p] = mlpClassPred(model,X);
+plot(llh);
 figure;
 plotClass(X,t);
 figure;
-plot(llh);
\ No newline at end of file

From 0635e51e906ad5a6fbe14a44f1c711333c7a8903 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 24 Jan 2019 16:06:07 +0800
Subject: [PATCH 111/119] fix kmedoids

---
 chapter09/kmedoids.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter09/kmedoids.m b/chapter09/kmedoids.m
index ff94a60..2dcc0d8 100644
--- a/chapter09/kmedoids.m
+++ b/chapter09/kmedoids.m
@@ -18,7 +18,7 @@
 X = X-mean(X,2);             % reduce chance of numerical problems
 v = dot(X,X,1);
 D = v+v'-2*(X'*X);            % Euclidean distance matrix
-D(sub2ind([d,d],1:d,1:d)) = 0;              % reduce chance of numerical problems
+D(sub2ind([n,n],1:n,1:n)) = 0;              % reduce chance of numerical problems
 last = zeros(1,n);
 while any(label ~= last)
     [~,~,last(:)] = unique(label);   % remove empty clusters

From 314f4756103c28d362d10946aa820a78d216e007 Mon Sep 17 00:00:00 2001
From: sth4nth <sth4nth@gmail.com>
Date: Wed, 30 Jan 2019 14:17:00 +0800
Subject: [PATCH 112/119] tweak logistic regression

---
 chapter04/logitBin.m  | 15 +++++++--------
 chapter09/kmeansRnd.m |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/chapter04/logitBin.m b/chapter04/logitBin.m
index 80584a9..aa3eed7 100644
--- a/chapter04/logitBin.m
+++ b/chapter04/logitBin.m
@@ -1,16 +1,16 @@
-function [model, llh] = logitBin(X, y, lambda, eta)
+function [model, llh] = logitBin(X, y, lambda)
 % Logistic regression for binary classification optimized by Newton-Raphson method.
 % Input:
 %   X: d x n data matrix
-%   z: 1 x n label (0/1)
+%   y: 1 x n label (0/1)
 %   lambda: regularization parameter
-%   eta: step size
+%   (alpha: Newton step size, not an input; always set to 1e-1 below)
 % Output:
 %   model: trained model structure
 %   llh: loglikelihood
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin < 4
-    eta = 1e-1;
+    alpha = 1e-1;
 end
 if nargin < 3
     lambda = 1e-4;
@@ -20,18 +20,17 @@
 tol = 1e-4;
 epoch = 200;
 llh = -inf(1,epoch);
-h = 2*y-1;
 w = rand(d,1);
 for t = 2:epoch
     a = w'*X;
-    llh(t) = -(sum(log1pexp(-h.*a))+0.5*lambda*dot(w,w))/n; % 4.89
-    if llh(t)-llh(t-1) < tol; break; end
+    llh(t) = (dot(a,y)-sum(log1pexp(a))-0.5*lambda*dot(w,w))/n; % 4.90
+    if abs(llh(t)-llh(t-1)) < tol; break; end
     z = sigmoid(a);                     % 4.87
     g = X*(z-y)'+lambda*w;              % 4.96
     r = z.*(1-z);                       % 4.98
     Xw = bsxfun(@times, X, sqrt(r));
     H = Xw*Xw'+lambda*eye(d);           % 4.97
-    w = w-eta*(H\g); 
+    w = w-alpha*(H\g);                  % 4.92
 end
 llh = llh(2:t);
 model.w = w;
diff --git a/chapter09/kmeansRnd.m b/chapter09/kmeansRnd.m
index d48013f..b02f98f 100644
--- a/chapter09/kmeansRnd.m
+++ b/chapter09/kmeansRnd.m
@@ -10,7 +10,7 @@
 %   mu: d x k centers of clusters
 % Written by Mo Chen (sth4nth@gmail.com).
 alpha = 1;
-beta = nthroot(k,d); % in volume x^d there is k points: x^d=k
+beta = nthroot(k,d); % k points in volume x^d : x^d=k
 
 X = randn(d,n);
 w = dirichletRnd(alpha,ones(1,k)/k);

From 11e81d33dfcb26540df18fc7ecd661a420393533 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 25 Jul 2019 16:01:42 +0800
Subject: [PATCH 113/119] add

add demo for kernel kmeans
---
 chapter06/knKmeans.m      | 10 +++-------
 demo/ch06/knKmeans_demo.m | 23 +++++++++++++++++++++++
 2 files changed, 26 insertions(+), 7 deletions(-)
 create mode 100644 demo/ch06/knKmeans_demo.m

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 49c6c15..2265e83 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -1,7 +1,7 @@
-function [label, model, energy] = knKmeans(X, init, kn)
+function [label, model, energy] = knKmeans(K, init)
 % Perform kernel kmeans clustering.
 % Input:
-%   K: n x n kernel matrix
+%   K: n x n data matrix
 %   init: either number of clusters (k) or initial label (1xn)
 % Output:
 %   label: 1 x n sample labels
@@ -10,17 +10,13 @@
 % Reference: Kernel Methods for Pattern Analysis
 % by John Shawe-Taylor, Nello Cristianini
 % Written by Mo Chen (sth4nth@gmail.com).
-n = size(X,2);
+n = size(K,2);
 if numel(init)==1
     k = init;
     label = ceil(k*rand(1,n));
 elseif numel(init)==n
     label = init;
 end
-if nargin < 3
-    kn = @knGauss;
-end
-K = kn(X,X);
 last = zeros(1,n);
 while any(label ~= last)
     [~,~,last(:)] = unique(label);   % remove empty clusters
diff --git a/demo/ch06/knKmeans_demo.m b/demo/ch06/knKmeans_demo.m
new file mode 100644
index 0000000..4d1882a
--- /dev/null
+++ b/demo/ch06/knKmeans_demo.m
@@ -0,0 +1,23 @@
+%% Kernel kmeans with linear kernel is equivalent to kmeans
+close all; clear;
+d = 2;
+k = 3;
+n = 200;
+[X, y] = kmeansRnd(d,k,n);
+init = ceil(k*rand(1,n));
+K = knLin(X,X);
+label = knKmeans(K,init);
+
+label0 = kmeans(X,init);
+maxdiff(label,label0)
+plotClass(X,label);
+%% Kernel kmeans with Gaussian Kernel for nonlinear data
+x1 = linspace(0,pi,n/2);
+x2 = sin(x1);
+X = [x1,x1+pi/2;
+    x2,-x2];
+
+K = knGauss(X,X,0.4);
+label = knKmeans(K,2);
+figure;
+plotClass(X,label);
\ No newline at end of file

From b9eef951ccb81ca63bf255dc4aef2623bb2d037c Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 25 Jul 2019 16:01:42 +0800
Subject: [PATCH 114/119] add demo for kernel kmeans

---
 chapter06/knKmeans.m      | 10 +++-------
 demo/ch06/knKmeans_demo.m | 23 +++++++++++++++++++++++
 2 files changed, 26 insertions(+), 7 deletions(-)
 create mode 100644 demo/ch06/knKmeans_demo.m

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 49c6c15..2265e83 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -1,7 +1,7 @@
-function [label, model, energy] = knKmeans(X, init, kn)
+function [label, model, energy] = knKmeans(K, init)
 % Perform kernel kmeans clustering.
 % Input:
-%   K: n x n kernel matrix
+%   K: n x n data matrix
 %   init: either number of clusters (k) or initial label (1xn)
 % Output:
 %   label: 1 x n sample labels
@@ -10,17 +10,13 @@
 % Reference: Kernel Methods for Pattern Analysis
 % by John Shawe-Taylor, Nello Cristianini
 % Written by Mo Chen (sth4nth@gmail.com).
-n = size(X,2);
+n = size(K,2);
 if numel(init)==1
     k = init;
     label = ceil(k*rand(1,n));
 elseif numel(init)==n
     label = init;
 end
-if nargin < 3
-    kn = @knGauss;
-end
-K = kn(X,X);
 last = zeros(1,n);
 while any(label ~= last)
     [~,~,last(:)] = unique(label);   % remove empty clusters
diff --git a/demo/ch06/knKmeans_demo.m b/demo/ch06/knKmeans_demo.m
new file mode 100644
index 0000000..4d1882a
--- /dev/null
+++ b/demo/ch06/knKmeans_demo.m
@@ -0,0 +1,23 @@
+%% Kernel kmeans with linear kernel is equivalent to kmeans
+close all; clear;
+d = 2;
+k = 3;
+n = 200;
+[X, y] = kmeansRnd(d,k,n);
+init = ceil(k*rand(1,n));
+K = knLin(X,X);
+label = knKmeans(K,init);
+
+label0 = kmeans(X,init);
+maxdiff(label,label0)
+plotClass(X,label);
+%% Kernel kmeans with Gaussian Kernel for nonlinear data
+x1 = linspace(0,pi,n/2);
+x2 = sin(x1);
+X = [x1,x1+pi/2;
+    x2,-x2];
+
+K = knGauss(X,X,0.4);
+label = knKmeans(K,2);
+figure;
+plotClass(X,label);
\ No newline at end of file

From 50a654cd556d4f33a34132a1163bbaf47b1ebc2b Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 25 Jul 2019 16:12:35 +0800
Subject: [PATCH 115/119] tweak knkmeans

---
 chapter06/knKmeans.m | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 2265e83..60e9032 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -1,4 +1,4 @@
-function [label, model, energy] = knKmeans(K, init)
+function [label, energy] = knKmeans(K, init)
 % Perform kernel kmeans clustering.
 % Input:
 %   K: n x n data matrix
@@ -26,8 +26,4 @@
     [val, label] = max(T-dot(T,E,2)/2,[],1);
 end
 energy = trace(K)-2*sum(val); 
-if nargout == 3
-    model.X = X;
-    model.label = label;
-    model.kn = kn;
-end
+

From f20e50ebb86ffcf144a7acfdc39bb1377ac6bba8 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 17 Sep 2019 19:54:02 +0800
Subject: [PATCH 116/119] tweak discreteRnd

---
 chapter11/discreteRnd.m | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chapter11/discreteRnd.m b/chapter11/discreteRnd.m
index d942783..5b811c2 100644
--- a/chapter11/discreteRnd.m
+++ b/chapter11/discreteRnd.m
@@ -11,4 +11,4 @@
 end
 r = rand(1,n);
 p = cumsum(p(:));
-[~,x] = histc(r,[0;p/p(end)]);
+[~,~,x] = histcounts(r,[0;p/p(end)]);

From 1501973c6d7632da70b424cbd28a7ec3b9529c23 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Tue, 17 Sep 2019 20:04:12 +0800
Subject: [PATCH 117/119] tweak discreteRnd

---
 chapter11/discreteRnd.m | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/chapter11/discreteRnd.m b/chapter11/discreteRnd.m
index 5b811c2..93bcc63 100644
--- a/chapter11/discreteRnd.m
+++ b/chapter11/discreteRnd.m
@@ -9,6 +9,4 @@
 if nargin == 1
     n = 1;
 end
-r = rand(1,n);
-p = cumsum(p(:));
-[~,~,x] = histcounts(r,[0;p/p(end)]);
+[~,~,x] = histcounts(rand(1,n),[0;cumsum(p(:))]);

From d86f1a92e53ded0d8b73b518eaf8c0dde6ecca2e Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Thu, 19 Dec 2019 13:40:21 +0800
Subject: [PATCH 118/119] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ec97ac6..9043b3c 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ Note: this package requires Matlab **R2016b** or latter, since it utilizes a new
 Design Goal
 -------
 * Succinct: The code is extremely compact. Minimizing code length is a major goal. As a result, the core of the algorithms can be easily spotted.
-* Efficient: Many tricks to speedup Matlab code are applied (eg. vectorization, matrix factorization, etc.). Usually, functions in this package are orders faster than Matlab builtin ones (e.g. kmeans).
-* Robust: Many tricks for numerical stability are applied, such as computing probability in log domain, square root matrix update to enforce matrix symmetry\PD, etc.
+* Efficient: Many tricks for speeding up Matlab code are applied (e.g. vectorization, matrix factorization, etc.). Usually, functions in this package are orders of magnitude faster than Matlab builtin ones (e.g. kmeans).
+* Robust: Many tricks for numerical stability are applied, such as computing probability in logarithm domain, square root matrix update to enforce matrix symmetry\PD, etc.
 * Readable: The code is heavily commented. Corresponding formulas in PRML are annoted. Symbols are in sync with the book.
 * Practical: The package is not only readable, but also meant to be easily used and modified to facilitate ML research. Many functions in this package are already widely used (see [Matlab file exchange](http://www.mathworks.com/matlabcentral/fileexchange/?term=authorid%3A49739)).
 

From baac49f643db6b39e75307d3b21307b32b29a7a9 Mon Sep 17 00:00:00 2001
From: Mo Chen <sth4nth@gmail.com>
Date: Wed, 4 Mar 2020 21:42:10 +0800
Subject: [PATCH 119/119] fix kernel kmeans

---
 chapter06/knGauss.m       | 2 +-
 chapter06/knKmeans.m      | 7 ++++++-
 demo/ch06/knKmeans_demo.m | 7 +++----
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/chapter06/knGauss.m b/chapter06/knGauss.m
index d19820b..4832e8d 100755
--- a/chapter06/knGauss.m
+++ b/chapter06/knGauss.m
@@ -8,7 +8,7 @@
 %   K: nx x ny kernel matrix
 % Written by Mo Chen (sth4nth@gmail.com).
 if nargin < 3
-    s = 1;
+    s = 0.4;
 end
 
 if nargin < 2 || isempty(Y)  
diff --git a/chapter06/knKmeans.m b/chapter06/knKmeans.m
index 60e9032..c3af5a1 100755
--- a/chapter06/knKmeans.m
+++ b/chapter06/knKmeans.m
@@ -1,4 +1,4 @@
-function [label, energy] = knKmeans(K, init)
+function [label, model, energy] = knKmeans(X, init, kn)
 % Perform kernel kmeans clustering.
 % Input:
 %   K: n x n data matrix
@@ -10,6 +10,7 @@
 % Reference: Kernel Methods for Pattern Analysis
 % by John Shawe-Taylor, Nello Cristianini
 % Written by Mo Chen (sth4nth@gmail.com).
+K = kn(X,X);
 n = size(K,2);
 if numel(init)==1
     k = init;
@@ -26,4 +27,8 @@
     [val, label] = max(T-dot(T,E,2)/2,[],1);
 end
 energy = trace(K)-2*sum(val); 
+model.kn = kn;
+model.label = label;
+model.X = X;
+
 
diff --git a/demo/ch06/knKmeans_demo.m b/demo/ch06/knKmeans_demo.m
index 4d1882a..50e7bfc 100644
--- a/demo/ch06/knKmeans_demo.m
+++ b/demo/ch06/knKmeans_demo.m
@@ -5,8 +5,8 @@
 n = 200;
 [X, y] = kmeansRnd(d,k,n);
 init = ceil(k*rand(1,n));
-K = knLin(X,X);
-label = knKmeans(K,init);
+
+label = knKmeans(X,init,@knLin);
 
 label0 = kmeans(X,init);
 maxdiff(label,label0)
@@ -17,7 +17,6 @@
 X = [x1,x1+pi/2;
     x2,-x2];
 
-K = knGauss(X,X,0.4);
-label = knKmeans(K,2);
+label = knKmeans(X,2,@knGauss);
 figure;
 plotClass(X,label);
\ No newline at end of file