diff --git a/LICENSE b/LICENSE
index ca895b7..9309bf6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,6 @@
-MIT License
-
 Copyright (c) 2019 Shusen Wang
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+Permission is granted to only nonprofit organizations, including schools and 
+research institutes. Employees of nonprofit organizations are granted, free 
+of charge, the rights to use, copy, modify, merge, publish, and distribute 
+the slides and lecture notes in this repo.
diff --git a/LectureNotes/BP/bp.tex b/LectureNotes/BP/bp.tex
new file mode 100644
index 0000000..6d69dda
--- /dev/null
+++ b/LectureNotes/BP/bp.tex
@@ -0,0 +1,773 @@
+\documentclass[11pt]{article}
+\usepackage{amsmath,amssymb,amsmath,amsthm,amsfonts}
+\usepackage{latexsym,graphicx}
+\usepackage{fullpage,color}
+\usepackage{url}
+\usepackage[pdftex,bookmarks,colorlinks=true,citecolor=blue]{hyperref}
+\usepackage{natbib}
+\usepackage{graphicx,subfigure}
+\usepackage{algorithm}
+\usepackage{algorithmic}
+\usepackage{listings}
+\usepackage[dvipsnames]{xcolor}
+\usepackage{color}
+\usepackage{wrapfig}
+
+\numberwithin{equation}{section}
+
+\pagestyle{plain}
+
+\setlength{\oddsidemargin}{0in}
+\setlength{\topmargin}{0in}
+\setlength{\textwidth}{6.5in}
+\setlength{\textheight}{8.5in}
+
+\newtheorem{fact}{Fact}[section]
+\newtheorem{question}{Question}[section]
+\newtheorem{lemma}{Lemma}[section]
+\newtheorem{theorem}[lemma]{Theorem}
+\newtheorem{assumption}[lemma]{Assumption}
+\newtheorem{corollary}[lemma]{Corollary}
+\newtheorem{prop}[lemma]{Proposition}
+\newtheorem{claim}{Claim}[section]
+\newtheorem{remark}{Remark}[section]
+\newtheorem{definition}{Definition}[section]
+\newtheorem{prob}{Problem}[section]
+\newtheorem{conjecture}{Conjecture}[section]
+\newtheorem{property}{Property}[section]
+
+\def\A{{\bf A}}
+\def\a{{\bf a}}
+\def\B{{\bf B}}
+\def\bb{{\bf b}}
+\def\C{{\bf C}}
+\def\c{{\bf c}}
+\def\D{{\bf D}}
+\def\d{{\bf d}}
+\def\E{{\bf E}}
+\def\e{{\bf e}}
+\def\F{{\bf F}}
+\def\f{{\bf f}}
+\def\g{{\bf g}}
+\def\h{{\bf h}}
+\def\G{{\bf G}}
+\def\H{{\bf H}}
+\def\I{{\bf I}}
+\def\K{{\bf K}}
+\def\k{{\bf k}}
+\def\LL{{\bf L}}
+\def\M{{\bf M}}
+\def\m{{\bf m}}
+\def\N{{\bf N}}
+\def\n{{\bf n}}
+\def\PP{{\bf P}}
+\def\pp{{\bf p}}
+\def\Q{{\bf Q}}
+\def\q{{\bf q}}
+\def\R{{\bf R}}
+\def\rr{{\bf r}}
+\def\S{{\bf S}}
+\def\s{{\bf s}}
+\def\T{{\bf T}}
+\def\tt{{\bf t}}
+\def\U{{\bf U}}
+\def\u{{\bf u}}
+\def\V{{\bf V}}
+\def\v{{\bf v}}
+\def\W{{\bf W}}
+\def\w{{\bf w}}
+\def\X{{\bf X}}
+\def\x{{\bf x}}
+\def\Y{{\bf Y}}
+\def\y{{\bf y}}
+\def\Z{{\bf Z}}
+\def\z{{\bf z}}
+\def\0{{\bf 0}}
+\def\1{{\bf 1}}
+
+
+
+\def\AM{{\mathcal A}}
+\def\CM{{\mathcal C}}
+\def\DM{{\mathcal D}}
+\def\EM{{\mathcal E}}
+\def\GM{{\mathcal G}}
+\def\FM{{\mathcal F}}
+\def\IM{{\mathcal I}}
+\def\JM{{\mathcal J}}
+\def\KM{{\mathcal K}}
+\def\LM{{\mathcal L}}
+\def\NM{{\mathcal N}}
+\def\OM{{\mathcal O}}
+\def\PM{{\mathcal P}}
+\def\SM{{\mathcal S}}
+\def\TM{{\mathcal T}}
+\def\UM{{\mathcal U}}
+\def\VM{{\mathcal V}}
+\def\WM{{\mathcal W}}
+\def\XM{{\mathcal X}}
+\def\YM{{\mathcal Y}}
+\def\RB{{\mathbb R}}
+\def\RBmn{{\RB^{m\times n}}}
+\def\EB{{\mathbb E}}
+\def\PB{{\mathbb P}}
+
+\def\TX{\tilde{\bf X}}
+\def\TA{\tilde{\bf A}}
+\def\tx{\tilde{\bf x}}
+\def\ty{\tilde{\bf y}}
+\def\TZ{\tilde{\bf Z}}
+\def\tz{\tilde{\bf z}}
+\def\hd{\hat{d}}
+\def\HD{\hat{\bf D}}
+\def\hx{\hat{\bf x}}
+\def\nysA{{\tilde{\A}_c^{\textrm{nys}}}}
+
+\def\alp{\mbox{\boldmath$\alpha$\unboldmath}}
+\def\bet{\mbox{\boldmath$\beta$\unboldmath}}
+\def\epsi{\mbox{\boldmath$\epsilon$\unboldmath}}
+\def\etab{\mbox{\boldmath$\eta$\unboldmath}}
+\def\ph{\mbox{\boldmath$\phi$\unboldmath}}
+\def\pii{\mbox{\boldmath$\pi$\unboldmath}}
+\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}}
+\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}}
+\def\ps{\mbox{\boldmath$\psi$\unboldmath}}
+\def\tha{\mbox{\boldmath$\theta$\unboldmath}}
+\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}}
+\def\muu{\mbox{\boldmath$\mu$\unboldmath}}
+\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}}
+\def\si{\mbox{\boldmath$\sigma$\unboldmath}}
+\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}}
+\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}}
+\def\De{\mbox{\boldmath$\Delta$\unboldmath}}
+\def\Ome{\mbox{\boldmath$\Omega$\unboldmath}}
+\def\Pii{\mbox{\boldmath$\Pi$\unboldmath}}
+\def\varepsi{\mbox{\boldmath$\varepsilon$\unboldmath}}
+\newcommand{\ti}[1]{\tilde{#1}}
+\def\Ncal{\mathcal{N}}
+\def\argmax{\mathop{\rm argmax}}
+\def\argmin{\mathop{\rm argmin}}
+
+\def\ALG{{\AM_{\textrm{col}}}}
+
+\def\mean{\mathsf{mean}}
+\def\std{\mathsf{std}}
+\def\orth{\mathsf{orth}}
+\def\var{\mathsf{var}}
+\def\sgn{\mathsf{sgn}}
+\def\tr{\mathsf{tr}}
+\def\rk{\mathrm{rank}}
+\def\nnz{\mathsf{nnz}}
+\def\st{\mathsf{s.t.}}
+\def\vect{\mathsf{vec}}
+\def\sech{\mathrm{sech}}
+\def\sigmoid{\mathsf{sigmoid}}
+\def\din{{d_{\textrm{in}}}}
+\def\dout{{d_{\textrm{out}}}}
+
+
+\newcommand{\red}[1]{{\color{red}#1}}
+\newcommand{\blue}[1]{{\color{blue}#1}}
+\newcommand{\green}[1]{{\color{green}#1}}
+
+
+
+\def\argmax{\mathop{\rm argmax}}
+\def\argmin{\mathop{\rm argmin}}
+
+\newenvironment{note}[1]{\medskip\noindent \textbf{#1:}}%
+        {\medskip}
+
+
+\newcommand{\etal}{{\em et al.}\ }
+\newcommand{\assign}{\leftarrow}
+\newcommand{\eps}{\epsilon}
+
+
+
+
+\lstset{ %
+extendedchars=false,            % Shutdown no-ASCII compatible
+language=Python,                % choose the language of the code
+xleftmargin=1em,
+xrightmargin=1em,
+basicstyle=\footnotesize,    % the size of the fonts that are used for the code
+tabsize=3,                            % sets default tabsize to 3 spaces
+numbers=left,                   % where to put the line-numbers
+numberstyle=\tiny,              % the size of the fonts that are used for the line-numbers
+stepnumber=1,                   % the step between two line-numbers. If it's 1 each line
+                                % will be numbered
+numbersep=5pt,                  % how far the line-numbers are from the code   %
+keywordstyle=\color[rgb]{0,0,1},                % keywords
+commentstyle=\color[rgb]{0.133,0.545,0.133},    % comments
+stringstyle=\color[rgb]{0.627,0.126,0.941},      % strings
+backgroundcolor=\color{white}, % choose the background color. You must add \usepackage{color}
+showspaces=false,               % show spaces adding particular underscores
+showstringspaces=false,         % underline spaces within strings
+showtabs=false,                 % show tabs within strings adding particular underscores
+frame=single,                 % adds a frame around the code
+%captionpos=b,                   % sets the caption-position to bottom
+breaklines=true,                % sets automatic line breaking
+breakatwhitespace=false,        % sets if automatic breaks should only happen at whitespace
+%title=\lstname,                 % show the filename of files included with \lstinputlisting;
+%                                % also try caption instead of title
+mathescape=true,escapechar=?,   % escape to latex with ?..?
+escapeinside={\%*}{*)},         % if you want to add a comment within your code
+%columns=fixed,                  % nice spacing
+%morestring=[m]',                % strings
+%morekeywords={%,...},%          % if you want to add more keywords to the set
+%    break,case,catch,continue,elseif,else,end,for,function,global,%
+%    if,otherwise,persistent,return,switch,try,while,...},%
+}
+
+
+\begin{document}
+
+%\setlength{\fboxrule}{.5mm}\setlength{\fboxsep}{1.2mm}
+%\newlength{\boxlength}\setlength{\boxlength}{\textwidth}
+%\addtolength{\boxlength}{-4mm}
+
+
+\title{BackPropagation for Fully-Connected and \\Convolutional Neural Networks}
+
+\author{\textbf{Shusen Wang} \\ Stevens Institute of Technology}
+
+%\date{ }
+
+\maketitle
+
+\begin{abstract}
+First, define a fully-connected (FC) layer.
+Second, derive the gradients using chain rule for one FC layer.
+Third, connect multiple FC layers to build an FC neural network.
+Fourth, perform backpropagation using the chain rule.
+Fifth, express convolution as a matrix multiplication.
+Last, derive gradients for convolutional layer.
+\end{abstract}
+
+
+\section{Fully-Connected (FC) Layer}
+
+
+
+We consider one fully-connected (FC) layer and follow the convention of PyTorch.
+Let $\din$ be the input shape, $\dout$ be the output shape, and $b$ be the batch size.
+Let $\X \in \RB^{b\times \din}$ be a batch of input vectors, $\W \in \RB^{\dout \times \din}$ be the weight matrix, and $\Z = \X \W^T \in \RB^{b\times \dout}$.
+The output of this FC layer is $\X' = \sigma (\Z) \in \RB^{b\times \dout}$ where $\sigma$ is an activation function that is applied elementwise.
+For example, if the activation function is ReLU, then the $(i,j)$-th entry of $\X'$ is
+\begin{equation*}
+    x_{ij}'
+    \: = \:
+    \left\{
+    \begin{array}{cc}
+         z_{ij}, & \textrm{if } z_{ij} > 0;  \\
+         0, & \textrm{otherwise.} \\
+    \end{array}
+    \right.
+\end{equation*}
+The structure of an FC layer is illustrated in Figure~\ref{fig:differential}(left).
+
+
+
+\section{Differentiation for FC Layer} \label{sec:differential}
+
+Let $Q$ be the loss function that depends on $\X'$.
+Suppose we know $\frac{\partial \, Q}{ \partial \, \X'} \in \RB^{b\times \dout}$ (the derivative of $Q$ w.r.t.\ $\X'$).
+Since $\X$ and $\W$ influence $Q$ via $\X'$:
+\begin{equation*}
+    \left.
+    \begin{array}{c c}
+         \cdots \: \longrightarrow \: \cdots \: \longrightarrow \: & \X   \\
+         & \W
+    \end{array}
+    \right\}
+    \: \xrightarrow{\textsf{~multiply~}}  \: 
+    \Z 
+    \: \xrightarrow{\textsf{activation}}  \: 
+    \X'
+    \: \longrightarrow \: 
+    \cdots
+    \: \longrightarrow \: 
+    Q ,
+\end{equation*}
+we can let the gradient flow to $\X$ and $\W$ in the opposite direction:
+\begin{equation*}
+    \left.
+    \begin{array}{c c}
+         \cdots \: \longleftarrow \: \cdots \: \longleftarrow \: & \frac{\partial \, Q }{\partial \, \X}    \\
+         & \frac{\partial \, Q }{\partial \, \W} 
+    \end{array}
+    \right\}
+    \: \longleftarrow \: 
+    \frac{\partial \, Q }{\partial \, \Z} 
+    \: \longleftarrow \: 
+    \frac{\partial \, Q }{\partial \, \X'} .
+\end{equation*}
+In the following, we compute $\frac{\partial \, Q}{ \partial \, \Z}$ and then $\frac{\partial \, Q}{ \partial \, \X}$ and $\frac{\partial \, Q}{ \partial \, \W}$. 
+
+\paragraph{From $\X'$ to $\Z$.}
+First, compute $\frac{\partial \, Q}{ \partial \, \Z}  \in \RB^{b\times \dout}$.
+If $\X' = \textsf{ReLU} (\Z)$,\footnote{$[\textsf{ReLU} (\Z)]_{ij} = \max \{z_{ij} , \, 0 \}$.}
+then the $(i,j)$-th entry of $\frac{\partial \, Q}{ \partial \, \Z} $ is
+\begin{equation*}
+    \Big[\frac{\partial \, Q}{ \partial \, \Z} \Big]_{ij}
+    \: = \: \frac{\partial \, Q}{ \partial \, z_{ij}}
+    \: = \:  \frac{\partial \, x_{ij}' }{ \partial \, z_{ij}} \,  \frac{\partial \, Q}{ \partial \, x_{ij}'}
+    \: = \: 
+    \left\{
+    \begin{array}{cc}
+         \tfrac{\partial \, Q}{ \partial \, x_{ij}'}, & \textrm{if } z_{ij} > 0;  \\
+         0, & \textrm{otherwise.} \\
+    \end{array}
+    \right.
+\end{equation*}
+Let $\A \in \RB^{b\times \dout}$ be the matrix such that
+\begin{equation*}
+    a_{ij} 
+    \: = \: 
+    \left\{
+    \begin{array}{cc}
+         1, & \textrm{if } z_{ij} > 0;  \\
+         0, & \textrm{otherwise.} \\
+    \end{array}
+    \right.
+\end{equation*}
+Let ``$\circ $'' denote the Hadamard product (also known as the elementwise product).
+Then
+\begin{equation} \label{eq:grad_q_z}
+    \frac{\partial \, Q}{ \partial \, \Z}
+    \: = \: \A \circ  \frac{\partial \, Q}{ \partial \, \X'}
+    \: \in \: \RB^{b\times \dout} .
+\end{equation}
+
+\paragraph{From $\Z$ to $\X$.}
+Second, compute $\frac{\partial \, Q}{ \partial \, \X}  \in \RB^{b\times \din}$.
+Let $\x_{i:} \in \RB^{1\times \din}$ and $\z_{i:} \in \RB^{1\times \dout}$ be the $i$-th rows of $\X$ and $\Z$, respectively, for $i = 1$ to $b$.\footnote{To calculate gradient in the standard way, we must use column vectors.}
+Thus $\x_{i:}^T \in \RB^{\din \times 1}$ and $\z_{i:}^T \in \RB^{ \dout \times 1}$ are the $i$-th columns of $\X^T \in \RB^{\din \times b}$ and $\Z^T \in \RB^{\dout \times b}$, respectively.
+It follows from the chain rule that
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \x_{i:}^T }
+    \: = \: \sum_{j=1}^{b} \frac{ \partial \, \z_{j:}^T  }{ \partial \, \x_{i:}^T } \frac{ \partial \, Q }{ \partial \, \z_{j:}^T } 
+    \: = \: \frac{ \partial \, \z_{i:}^T  }{ \partial \, \x_{i:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{i:}^T } 
+    + \sum_{j\neq i} \frac{ \partial \, \z_{j:}^T  }{ \partial \, \x_{i:}^T } \frac{ \partial \, Q }{ \partial \, \z_{j:}^T } .
+\end{equation*}
+If $i \neq j$, $\z_{j:}$ will not depend on $\x_{i:}$, and thus $\frac{ \partial \, \z_{j:}^T  }{ \partial \, \x_{i:}^T } $ is the all-zero matrix.
+It follows that
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \x_{i:}^T }
+    \: = \: \underbrace{\frac{ \partial \, \z_{i:}^T  }{ \partial \, \x_{i:}^T }}_{\din \times \dout} \, 
+    \underbrace{\frac{ \partial \, Q }{ \partial \, \z_{i:}^T } }_{\dout\times 1}
+    \: \in \: \RB^{\din \times 1}.
+\end{equation*}
+Since $\z_{i:} = \x_{i:} \W^T$, we have $\z_{i:}^T = \W \x_{i:}^T$, and thus $\frac{ \partial \, \z_{i:}^T }{ \partial \, \x_{i:}^T } = \W^T  \in  \RB^{\din \times \dout }$.
+It follows that
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \x_{i:}^T }
+    \: = \: \W^T \, \cdot \, \frac{ \partial \, Q }{ \partial \, \z_{i:}^T } 
+    \: \in \: \RB^{\din \times 1}.
+\end{equation*}
+Since $\x_{i:}^T$ is the $i$-th column of $\X^T \in \RB^{\din \times b}$,
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \X^T }
+    \: = \: \Big[ \frac{ \partial \, Q }{ \partial \, \x_{1:}^T } , \; \cdots , \; \frac{ \partial \, Q }{ \partial \, \x_{b:}^T } \Big]
+    \: = \: \W^T \, \cdot \, \Big[ \frac{ \partial \, Q }{ \partial \, \z_{1:}^T } , \; \cdots , \; \frac{ \partial \, Q }{ \partial \, \z_{b:}^T } \Big]
+    \: = \: \underbrace{\W^T}_{\din\times \dout} \, \cdot \,  
+    \underbrace{\frac{ \partial \, Q }{ \partial \, \Z^T } }_{\dout \times b}
+    \: \in \: \RB^{\din \times b}.
+\end{equation*}
+Hence
+\begin{equation} \label{eq:grad_q_x}
+    \frac{ \partial \, Q }{ \partial \, \X }
+    \: = \: \bigg( \frac{ \partial \, Q }{ \partial \, \X^T } \bigg)^T
+    \: = \: \frac{ \partial \, Q }{ \partial \, \Z } \, \cdot \, \W
+    \: \in \: \RB^{b \times \din}.
+\end{equation}
+
+
+\paragraph{From $\Z$ to $\W$.}
+Third, compute $\frac{\partial \, Q}{ \partial \, \W}  \in \RB^{\dout\times \din}$.
+Let $\z_{:j} \in \RB^{b\times 1}$ be the $j$-th column of $\Z$
+and $\w_{j:} \in \RB^{1\times \din}$ be the $j$-th row of $\W$, for $j = 1$ to $\dout$.
+Thus $\w_{j:}^T \in \RB^{\din \times 1}$ is the $j$-th column of $\W^T \in \RB^{\din \times \dout}$.
+It follows from the chain rule that
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \w_{j:}^T }
+    \: = \: \sum_{i=1}^{\dout} \frac{ \partial \, \z_{:i}  }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:i} } 
+    \: = \: \frac{ \partial \, \z_{:j}  }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:j} } 
+    + \sum_{i\neq j} \frac{ \partial \, \z_{:i}  }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:i} } 
+    \: = \: \underbrace{ \frac{ \partial \, \z_{:j}  }{ \partial \, \w_{j:}^T } }_{\din \times b} 
+    \,  \underbrace{ \frac{ \partial \, Q }{ \partial \, \z_{:j} } }_{b\times 1}
+    \: \in \: \RB^{\din \times 1}.
+\end{equation*}
+Since $\z_{:j} = \X \w_{j:}^T \in \RB^{b\times 1}$, we can show that $\frac{ \partial \, \z_{:j} }{ \partial \, \w_{j:}^T } = \X^T \in  \RB^{\din \times b}$.
+Thus
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \w_{j:}^T }
+    \: = \: \X^T  \, \cdot \,  \frac{ \partial \, Q }{ \partial \, \z_{:j} }
+    \: \in \: \RB^{\din \times 1}.
+\end{equation*}
+Note that $\w_{j:}^T $ is the $j$-th column of $\W^T \in \RB^{\din \times \dout}$.
+It follows that
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \W^T }
+    \: = \: \Big[ \frac{ \partial \, Q }{ \partial \, \w_{1:}^T } , \; \cdots , \;  \frac{ \partial \, Q }{ \partial \, \w_{\dout:}^T } \Big]
+    \: = \:  \X^T  \Big[\frac{ \partial \, Q }{ \partial \, \z_{:1} } , \; \cdots , \; 
+    \frac{ \partial \, Q }{ \partial \, \z_{:\dout} } \Big]
+    \: = \: \underbrace{\X^T}_{\din \times b} \, \underbrace{ \frac{\partial \, Q }{ \partial \, \Z }}_{b\times \dout}
+    \: \in \: \RB^{\din \times \dout} .
+\end{equation*}
+Hence,
+\begin{equation} \label{eq:grad_q_w}
+    \frac{ \partial \, Q }{ \partial \, \W }
+    \: = \: \bigg( \frac{ \partial \, Q }{ \partial \, \W^T } \bigg)^T
+    \: = \: \bigg( \frac{\partial \, Q }{ \partial \, \Z } \bigg)^T  \X 
+    \: \in \: \RB^{\dout \times \din}.
+\end{equation}
+
+
+\begin{figure}[!h]
+	\centering
+	\includegraphics[width=0.85\linewidth]{figures/differential.pdf}
+	\caption{Differential for one FC Layer.}
+	\label{fig:differential}
+\end{figure}
+
+
+
+
+We summarize the gradient flow in Figure~\ref{fig:differential}.
+Given $ \frac{\partial \, Q }{ \partial \, \X' }$, we can compute first $ \frac{\partial \, Q }{ \partial \, \Z }$ according to \eqref{eq:grad_q_z} and then $ \frac{\partial \, Q }{ \partial \, \X }$ according to \eqref{eq:grad_q_x} and $ \frac{\partial \, Q }{ \partial \, \W }$ according to \eqref{eq:grad_q_w}.
+
+
+
+\section{Fully-Connected (FC) Neural Network}
+
+An FC neural network is composed of multiple FC layers.
+Suppose the FC network has $L$ ($> 1$) layers; the $l$-th layer is parameterized by weight matrix $\W^{(l)}$.
+The $l$-th layer takes matrix $\X^{(l)}$ as input,
+computes $\Z^{(l)} = \X^{(l)} {\W^{(l)}}^T$,
+and outputs $\X^{(l+1)} = \sigma (\Z^{(l)})$.
+The dependence can be depicted as
+\begin{small}
+\begin{equation*}
+    \begin{array}{c}
+         \textsf{input} \: \longrightarrow \:  \cdots \: \longrightarrow   \\
+         ~
+    \end{array}
+    \underbrace{
+    \left.
+    \begin{array}{c}
+         \X^{(l)}   \\
+         \W^{(l)} \\
+    \end{array}
+    \right\}
+    \: \longrightarrow \: 
+    \Z^{(l)}
+    \: \longrightarrow \: 
+    \X^{(l+1)} }_{\textsf{the } l\textsf{-th layer}} 
+    \: \longrightarrow \: 
+    \cdots
+    \: \longrightarrow \: 
+    \X^{(L+1)} (\textsf{i.e.\ output})
+    \: \longrightarrow \: 
+    \textsf{loss}.
+\end{equation*}
+\end{small}%
+Note that $\X^{(1)} , \cdots , \X^{(L+1)}$ all have $b$ rows, where $b$ is the batch size;
+but they can have different numbers of columns.\footnote{In the case of MNIST hand-written digit classification, the inputs are $784$ ($=28\times 28$) dimensional vectors, and the outputs are $10$-dimensional vectors (for there are $10$ classes).
+So $\X^{(1)}$ has $784$ columns, and $\X^{(L+1)}$ has $10$ columns.}
+
+The $L$-th layer is called the output layer.
+The output of the $L$-th layer, denoted $\X^{(L+1)} \in \RB^{b\times m}$, is the prediction the neural network makes for the input $\X^{(1)}$.
+Let $\Y \in \RB^{b\times m}$ be the labels of this batch of samples.\footnote{In the case of housing price prediction, the labels (housing price) are scalars, so $m=1$. In the case of hand-written digit classification, there are ten classes, and the labels are one-hot encoded ($10$-dimensional vectors); thus $m=10$.}
+We need to define a loss function $Q$ that measures the difference between the prediction and the ground truth (labels).
+For example, the loss function can be
+\begin{equation*}
+    Q \left( \X^{(1)} , \Y ; \, \W^{(1)}, \cdots , \W^{(L)} \right)
+    \; = \; \frac{1}{2} \Big\| \X^{(L+1)} \, - \, \Y  \Big\|_F^2 .
+\end{equation*}
+See Figure~\ref{fig:bp}(left) for the structure of the FC neural network (with the $\Z$'s omitted for brevity).
+
+
+
+
+% \begin{wrapfigure}{r}{0.65\textwidth}
+% 	\centering
+% 	\includegraphics[width=0.3\textwidth]{figures/bp1.pdf}~~~
+% 	\includegraphics[width=0.3\textwidth]{figures/bp2.pdf}
+% 	\caption{A}
+% 	\label{fig:bp}
+% \end{wrapfigure}
+
+
+\begin{figure}[!h]
+	\centering
+	\includegraphics[width=0.35\linewidth]{figures/bp1.pdf}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+	\includegraphics[width=0.35\linewidth]{figures/bp2.pdf}
+	\caption{The forward and backward pass for computing gradients.}
+	\label{fig:bp}
+\end{figure}
+
+
+
+
+\section{Computing Gradients via BackPropagation}
+
+Let $Q$ be the loss function parameterized by $\W^{(1)}, \W^{(2)}, \cdots , \W^{(L)}$, where $\W^{(l)}$ is the weight of the $l$-th layer.
+We seek to minimize $Q$ w.r.t.\ the weights, so we need the gradients:
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \W^{(1)} } , \; 
+    \frac{ \partial \, Q }{ \partial \, \W^{(2)} } , \; \cdots , \;
+    \frac{ \partial \, Q }{ \partial \, \W^{(L)} } .
+\end{equation*}
+With the gradients, we can update the weights $\W^{(1)}, \W^{(2)}, \cdots , \W^{(L)}$, e.g., by (stochastic) gradient descent:
+\begin{equation} \label{eq:sgd}
+    \W^{(l)} \: \longleftarrow \: \W^{(l)} - \alpha \, \frac{ \partial \, Q }{ \partial \, \W^{(l)} },
+    \qquad \textrm{for } l=1, \cdots , L,
+\end{equation}
+where $\alpha$ ($>0$) is called step size or learning rate.
+See my lecture note {\it Logistic Regression} for gradient-based algorithms.
+
+
+Since $Q$ is complicated, directly computing $\frac{ \partial \, Q }{ \partial \, \W^{(l)} } $ for any layer is difficult.
+The best practice is BackPropagation, that is, using the chain rule to let gradients flow from the top to the bottom.
+See the illustration in Figure~\ref{fig:bp}(right).
+\begin{itemize}
+    \item 
+    The loss function is a simple function of the output, $\X^{(L+1)}$.
+    So it is easy to compute the derivative $\frac{\partial \, Q}{ \partial \, \X^{(L+1)}}$.
+    \item
+    After knowing $\frac{\partial \, Q}{ \partial \, \X^{(L+1)}}$, we can use the equations (chain rule) in Section~\ref{sec:differential} to compute 
+    $\frac{\partial \, Q}{ \partial \, \W^{(L)}}$ and $\frac{\partial \, Q}{ \partial \, \X^{(L)}}$.
+    We record $\frac{\partial \, Q}{ \partial \, \W^{(L)}}$ and pass $\frac{\partial \, Q}{ \partial \, \X^{(L)}}$ to the $(L-1)$-th layer.
+    \item
+    We repeat the above step from the $(L-1)$-th layer all the way down to the first (bottom) layer.
+\end{itemize}
+In this way, we obtained $\frac{ \partial \, Q }{ \partial \, \W^{(L)} } $, $\cdots$, $\frac{ \partial \, Q }{ \partial \, \W^{(2)} } $, $\frac{ \partial \, Q }{ \partial \, \W^{(1)} } $ one by one.
+We will use them to update $\W^{(1)}, \cdots , \W^{(L)}$, e.g., according to \eqref{eq:sgd}.
+
+
+
+
+\section{Expressing Convolution as Matrix Multiplication}
+
+The rest of this paper extends what we have done for FC layers to convolutional layers.
+In this section, we express convolution as matrix multiplication to reveal the connection between FC layers and convolutional layers.
+In the next section, we will derive the gradients for convolution.
+Differentiation for convolutional layer is the same as FC layer except for the \textsf{unfold} and \textsf{fold} operations;
+understanding \textsf{unfold} and \textsf{fold} will be the key to comprehending this and the next section.
+
+
+\paragraph{Tensor convolution.}
+Let $\T $ be a $d_1 \times d_2 \times d_3$ input tensor and $\K$ be a $k_1 \times k_2 \times d_3$ kernel (aka filter) tensor.
+The convolution $\T * \K $ outputs a $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ matrix, denoted $\C$.
+\begin{itemize}
+    \item 
+    What decides the output shape of convolution? It is the number of $k_1 \times k_2 \times d_3$ patches in $\T$.
+    Tensor $\T$ has $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ such patches.
+    In Figure~\ref{fig:unfold}, since $d_1=4$, $d_2=3$, and $k_1=k_2=2$, there are $3\times 2 = 6$ patches in total.
+    \item
+    What are the entries of matrix $\C$?
+    Let scalar $c_{ij} \in \RB$ be the $(i,j)$-th entry of $\C$ and tensor $\PP_{ij} \in \RB^{k_1 \times k_2 \times d_3}$ be the $(i,j)$-th patch of $\T$.
+    Then
+    \begin{equation}\label{eq:conv1}
+        c_{ij} \: = \: \big\langle \K , \, \PP_{ij} \big\rangle
+        \: = \:  \big\langle  \vect (\K) , \, \vect (\PP_{ij} )  \big\rangle .
+    \end{equation}
+    Here, $\vect (\K) $ means reshaping tensor $\K$ to a $k_1  k_2  d_3 \times 1$ vector, and $\langle \cdot , \cdot \rangle$ denotes vector/matrix/tensor inner product.
+\end{itemize}
+
+
+\paragraph{Unfolding.}
+Based on the above discussions, we know that the $d_1 \times d_2 \times d_3$ tensor $\T $ can be converted to $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ patches;
+each patch is a $k_1 \times k_2 \times d_3$ tensor (the same shape as the kernel $\K$).
+Converting the order-3 tensor $\T$ to the order-5 tensor $\overline{\X}$ is called \textsf{unfolding}.
+The shape of $\overline{\X}$ is
+\begin{equation*}
+    (d_1 - k_1 + 1) \times (d_2 - k_2 + 1) \times k_1 \times k_2 \times d_3;
+\end{equation*}
+$\overline{\X}$ consists of the patches $\{\PP_{ij}\}$:
+\begin{equation*}
+    \overline{\X} \, [i, \, j, \, :, \, :, \, :] \: = \: \PP_{ij} \: \in \: \RB^{k_1 \times k_2 \times d_3}.
+\end{equation*}
+Note that $\textsf{unfold}$ is supported by software systems like PyTorch.
+Then, we \textsf{reshape} the order-5 tensor $\overline{\X}$ to the
+\begin{equation*}
+     (d_1 - k_1 + 1)  (d_2 - k_2 + 1)  \; \times \; (k_1 k_2 d_3 )
+\end{equation*}
+matrix, denoted $\X$.
+In sum, the procedure is
+\begin{equation*}
+    \T \textsf{ (order-3 tensor)}
+    \: \xrightarrow{\textsf{unfold}} \:
+    \overline{\X} \textsf{ (order-5 tensor)}
+    \: \xrightarrow{\textsf{reshape}} \:
+    \X \textsf{ (matrix)} .
+\end{equation*}
+Figure~\ref{fig:unfold} illustrates this procedure.
+
+\paragraph{Convolution as matrix-vector multiplication.}
+Let matrix $\X$ be the outcome after unfolding and reshaping.
+Let $\w = \vect (\K) \in \RB^{k_1 k_2 d_3 \times 1}$ be the vectorization of $\K \in \RB^{k_1 \times k_2 \times d_3}$.
+Then, compute the vector
+\begin{equation} \label{eq:conv_multiply}
+    \z \: = \: \X \, \cdot \, \vect (\K) 
+    \: \in \: \RB^{ (d_1 - k_1 + 1)  (d_2 - k_2 + 1)  \times 1} .
+\end{equation}
+Recall that the $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ matrix $\C$ is the outcome of convolution.
+By comparing \eqref{eq:conv1} and \eqref{eq:conv_multiply}, we find that
+\begin{equation*}
+    \z \: = \: \vect (\C )
+    \qquad \textrm{and} \qquad
+    \C \: = \: \textsf{reshape} \Big( \z , \, \big( (d_1 - k_1 + 1), (d_2 - k_2 + 1) \big) \Big) .
+\end{equation*}
+To summarize, the forward pass of tensor convolution can be expressed in the following two equivalent forms:
+\begin{equation*}
+    \left.
+    \begin{array}{c}
+         \T \textsf{ (input tensor)} \\
+         \K \textsf{ (kernel tensor)} \\
+    \end{array}
+    \right\}
+    \: \xrightarrow{\textsf{convolution}}  \: 
+    \C  \textsf{ (output matrix)} 
+\end{equation*}
+and
+\begin{equation} \label{eq:conv2}
+    \left.
+    \begin{array}{r c}
+         \T \: \xrightarrow{\textsf{unfold}} \: \overline{\X} \: \xrightarrow{\textsf{reshape}}  & \X \\
+         \K \: \xrightarrow{\textsf{vectorize}} & \w  \\
+    \end{array}
+    \right\}
+    \: \xrightarrow{\textsf{multiply}}  \: 
+    \z
+    \: \xrightarrow{\textsf{reshape}}  \: 
+    \C .
+\end{equation}
+In the next section, we will use the latter to perform backpropagation.
+
+
+
+\begin{figure}[!h]
+	\centering
+	\includegraphics[width=0.7\linewidth]{figures/unfold.pdf}
+	\caption{Illustrating patching and unfolding. Here, $d_1=4$, $d_2=3$, $d_3=1$, and $k_1=k_2=2$.
+	Thus, there are $(d_1-k_1+1)\times (d_2-k_2+1) = 6$ patches, and each patch is $k_1\times k_2 = 2\times 2$.}
+	\label{fig:unfold}
+\end{figure}
+
+\section{Differentiation for Convolution}
+
+
+By representing convolution as the procedure \eqref{eq:conv2}, the backpropagation will be easier to derive.
+During the backpropagation, we receive $\frac{\partial \, Q}{\partial \, \C}$ and propagate it to $\K$ and $\T$.
+The backpropagation has the following steps:
+\begin{equation} \label{eq:conv_backward}
+    \left.
+    \begin{array}{r c}
+         \frac{\partial \, Q}{\partial \, \T} \: \xleftarrow{\textsf{fold}} \: \frac{\partial \, Q}{\partial \, \overline{\X}} \: \xleftarrow{\textsf{reshape}}  & \frac{\partial \, Q}{\partial \, \X} \\
+         \frac{\partial \, Q}{\partial \, \K} \: \xleftarrow{\textsf{reshape}} & \frac{\partial \, Q}{\partial \, \w}  \\
+    \end{array}
+    \right\}
+    \: \xleftarrow{\textsf{multiply}}  \: 
+    \frac{\partial \, Q}{\partial \, \z}
+    \: \xleftarrow{\textsf{vectorize}}  \: 
+    \frac{\partial \, Q}{\partial \, \C} .
+\end{equation}
+We describe the steps one by one.
+
+
+\paragraph{From $\C$ to $\z$.}
+This step is almost trivial.
+Since $\z = \vect (\C)$, we have
+\begin{equation*}
+    \frac{\partial \, Q}{\partial \, \z}
+    \: = \: \vect \left( \frac{\partial \, Q}{\partial \, \C} \right) 
+    \: \in \: \RB^{(d_1-k_1+1)(d_2-k_2+1) \times 1} .
+\end{equation*}
+It just performs a vectorization.
+
+
+
+\paragraph{From $\z$ to $\X$ and $\w$.}
+Recall from \eqref{eq:conv_multiply} that $\z$ is computed by the matrix-vector multiplication: $\z= \X \w$.
+Once we have $\frac{ \partial \, Q}{\partial \, \z}$, we can propagate it to $\X$ and $\w$ by
+\begin{align*}
+    & \underbrace{\frac{ \partial \, Q }{ \partial \, \X } }_{(d_1 - k_1 + 1)  (d_2 - k_2 + 1)  \times (k_1 k_2 d_3)}
+    \: = \: \underbrace{\frac{ \partial \, Q }{ \partial \, \z } }_{(d_1 - k_1 + 1)  (d_2 - k_2 + 1)  \times 1}
+    \underbrace{\w^T }_{ 1 \times (k_1 k_2 d_3)} ,\\
+    & \underbrace{ \frac{ \partial \, Q }{ \partial \, \w } }_{ (k_1 k_2 d_3) \times 1} 
+    \: = \:\underbrace{\X^T }_{(k_1 k_2 d_3) \times (d_1 - k_1 + 1)  (d_2 - k_2 + 1)}
+    \underbrace{\frac{ \partial \, Q }{ \partial \, \z } }_{(d_1 - k_1 + 1)  (d_2 - k_2 + 1)  \times 1} .
+\end{align*}
+The above equations follow from \eqref{eq:grad_q_x} and \eqref{eq:grad_q_w}.
+
+
+\paragraph{From $\w$ to $\K$.}
+Since $\w = \vect (\K) $, the $k_1\times k_2 \times d_3$ tensor $\K$ can be obtained by \textsf{reshaping} $\w$ to an order-3 tensor.
+Thus,
+\begin{equation*}
+    \frac{ \partial \, Q }{ \partial \, \K } \:= \: 
+    \textsf{reshape} \left( \frac{\partial \, Q}{\partial \, \w}  \; , \Big( k_1, \,  \, k_2 , \, d_3 \Big)  \right) .
+\end{equation*}
+
+\paragraph{From $\X$ to $\T$.}
+As illustrated in Figure~\ref{fig:unfold}, $\X$ is obtained by \textsf{unfolding} $\T$,
+and one entry of $\T$ is copied to multiple (at most $k_1k_2$) entries of $\X$.
+For example, in Figure~\ref{fig:unfold}, the \textcolor{OliveGreen}{green ``A''} entry is copied from $t_{2,3,1}$ to both $x_{4,4}$ and $x_{5,3}$.\footnote{We let $x_{ij} = \X[i,j]$ denote the $(i,j)$-th entry of $\X$.}
+Thus
+\begin{equation} \label{eq:t_x_1}
+     \textcolor{OliveGreen}{t_{2,3,1}} 
+     \: = \: 
+     \textcolor{OliveGreen}{x_{4,4}} 
+     \: = \: 
+     \textcolor{OliveGreen}{x_{5,3}} .
+\end{equation}
+So the $(2,3,1)$-th entry of $\T$ influences $Q$ via only two entries of $\X$, and thus
+\begin{equation*}
+    \frac{\partial \, x_{ij}}{\partial \, t_{2,3,1}} 
+    \: = \:
+    \left\{
+    \begin{array}{c l}
+         1 & \textrm{if } (i,j)=(4,4) \textrm{ or } (5,3);  \\
+         0 & \textrm{otherwise}. \\ 
+    \end{array}
+    \right.
+\end{equation*}
+It follows that
+\begin{equation} \label{eq:t_x_2}
+    \bigg[\frac{\partial \, Q}{\partial \, \T} \bigg]_{2, 3, 1}
+    \: = \: \sum_{i, j}  \frac{\partial \, x_{ij}}{\partial \, t_{2,3,1}} \, \frac{\partial \, Q}{\partial \, x_{ij}} 
+    \: = \: \frac{\partial \, Q}{\partial \, x_{4,4}} + \frac{\partial \, Q}{\partial \, x_{5,3}}  
+    \: = \: \bigg[\frac{\partial \, Q}{\partial \, \X} \bigg]_{4, 4} \, + \,\bigg[ \frac{\partial \, Q}{\partial \, \X} \bigg]_{5, 3} . 
+\end{equation}
+Deep learning platforms like PyTorch provide the ``\textsf{fold}'' for aggregating the entries of $\frac{\partial \, Q}{\partial \, \X} $ to get $\frac{\partial \, Q}{\partial \, \T}$.\footnote{The aggregation is according to the way the entries of $\T$ are copied to $\X$, e.g., $t_{2,3,1}$ is copied to $x_{4,4}$ and $x_{5,3}$.}
+First, \textsf{reshape} $\frac{\partial \, Q}{\partial \, \X} $ to the $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1) \times k_1 \times k_2 \times d_3$ order-5 tensor:
+\begin{equation*}
+    \frac{\partial \, Q}{\partial \, \overline{\X}}  \:= \: \textsf{reshape} \left( \frac{\partial \, Q}{\partial \, \X}  \; , \Big( (d_1 - k_1 + 1) , \, (d_2 - k_2 + 1) , \, k_1 , \, k_2 , \, d_3 \Big)  \right) .
+\end{equation*}
+Then, apply ``\textsf{fold}'' to get the $d_1\times d_2 \times d_3$ order-3 tensor  $\frac{\partial \, Q}{\partial \, \T}$:
+\begin{equation*}
+    \frac{\partial \, Q}{\partial \, \T}
+    \: = \: \textsf{fold} \left( \frac{\partial \, Q}{\partial \, \overline{\X}}   \right).
+\end{equation*}
+To summarize, the forward pass from $\T$ to $\X$ and the backward pass from $\X$ to $\T$ are
+\begin{equation*}
+    \T \: \xrightarrow{\textsf{unfold}} \: \overline{\X} \: \xrightarrow{\textsf{reshape}}  \: \X
+    \qquad \textrm{and} \qquad
+    \frac{\partial \, Q}{\partial \, \T} 
+    \: \xleftarrow{\textsf{~fold~}} \: 
+    \frac{\partial \, Q}{\partial \, \overline{\X}} 
+    \: \xleftarrow{\textsf{reshape}}  \: 
+    \frac{\partial \, Q}{\partial \, \X} .
+\end{equation*}
+
+
+
+
+
+
+
+
+\end{document}
diff --git a/LectureNotes/BP/figures/bp.pptx b/LectureNotes/BP/figures/bp.pptx
new file mode 100644
index 0000000..60008dd
Binary files /dev/null and b/LectureNotes/BP/figures/bp.pptx differ
diff --git a/LectureNotes/BP/figures/bp1.pdf b/LectureNotes/BP/figures/bp1.pdf
new file mode 100644
index 0000000..1eade8f
Binary files /dev/null and b/LectureNotes/BP/figures/bp1.pdf differ
diff --git a/LectureNotes/BP/figures/bp2.pdf b/LectureNotes/BP/figures/bp2.pdf
new file mode 100644
index 0000000..b8462ca
Binary files /dev/null and b/LectureNotes/BP/figures/bp2.pdf differ
diff --git a/LectureNotes/BP/figures/differential.pdf b/LectureNotes/BP/figures/differential.pdf
new file mode 100644
index 0000000..81d1d49
Binary files /dev/null and b/LectureNotes/BP/figures/differential.pdf differ
diff --git a/LectureNotes/BP/figures/differential.pptx b/LectureNotes/BP/figures/differential.pptx
new file mode 100644
index 0000000..93bfaa4
Binary files /dev/null and b/LectureNotes/BP/figures/differential.pptx differ
diff --git a/LectureNotes/BP/figures/unfold.pdf b/LectureNotes/BP/figures/unfold.pdf
new file mode 100644
index 0000000..5cb5554
Binary files /dev/null and b/LectureNotes/BP/figures/unfold.pdf differ
diff --git a/LectureNotes/BP/figures/unfold.xlsx b/LectureNotes/BP/figures/unfold.xlsx
new file mode 100644
index 0000000..59e3481
Binary files /dev/null and b/LectureNotes/BP/figures/unfold.xlsx differ
diff --git a/LectureNotes/DRL/DRL.pdf b/LectureNotes/DRL/DRL.pdf
index b24e209..ad4a1f4 100644
Binary files a/LectureNotes/DRL/DRL.pdf and b/LectureNotes/DRL/DRL.pdf differ
diff --git a/LectureNotes/DRL/DRL.tex b/LectureNotes/DRL/DRL.tex
new file mode 100644
index 0000000..acb156c
--- /dev/null
+++ b/LectureNotes/DRL/DRL.tex
@@ -0,0 +1,700 @@
+\documentclass[11pt]{article}
+\usepackage{amsmath,amssymb,amsmath,amsthm,amsfonts}
+\usepackage{latexsym,graphicx}
+\usepackage{fullpage,color}
+\usepackage{url}
+\usepackage[pdftex,bookmarks,colorlinks=true,citecolor=blue]{hyperref}
+\usepackage[numbers]{natbib}
+\usepackage{graphicx,subfigure}
+\usepackage{algorithm}
+\usepackage{algorithmic}
+\usepackage{listings}
+\usepackage{xcolor}
+\usepackage{color}
+
+\numberwithin{equation}{section}
+
+\pagestyle{plain}
+
+\setlength{\oddsidemargin}{0in}
+\setlength{\topmargin}{0in}
+\setlength{\textwidth}{6.5in}
+\setlength{\textheight}{8.5in}
+
+\newtheorem{fact}{Fact}[section]
+\newtheorem{question}{Question}[section]
+\newtheorem{lemma}{Lemma}[section]
+\newtheorem{theorem}[lemma]{Theorem}
+\newtheorem{assumption}[lemma]{Assumption}
+\newtheorem{corollary}[lemma]{Corollary}
+\newtheorem{prop}[lemma]{Proposition}
+\newtheorem{claim}{Claim}[section]
+\newtheorem{remark}{Remark}[section]
+\newtheorem{definition}{Definition}[section]
+\newtheorem{prob}{Problem}[section]
+\newtheorem{conjecture}{Conjecture}[section]
+\newtheorem{property}{Property}[section]
+
+\def\A{{\bf A}}
+\def\a{{\bf a}}
+\def\B{{\bf B}}
+\def\bb{{\bf b}}
+\def\C{{\bf C}}
+\def\c{{\bf c}}
+\def\D{{\bf D}}
+\def\d{{\bf d}}
+\def\E{{\bf E}}
+\def\e{{\bf e}}
+\def\F{{\bf F}}
+\def\f{{\bf f}}
+\def\g{{\bf g}}
+\def\h{{\bf h}}
+\def\G{{\bf G}}
+\def\H{{\bf H}}
+\def\I{{\bf I}}
+\def\K{{\bf K}}
+\def\k{{\bf k}}
+\def\LL{{\bf L}}
+\def\M{{\bf M}}
+\def\m{{\bf m}}
+\def\N{{\bf N}}
+\def\n{{\bf n}}
+\def\PP{{\bf P}}
+\def\pp{{\bf p}}
+\def\Q{{\bf Q}}
+\def\q{{\bf q}}
+\def\R{{\bf R}}
+\def\rr{{\bf r}}
+\def\S{{\bf S}}
+\def\s{{\bf s}}
+\def\T{{\bf T}}
+\def\tt{{\bf t}}
+\def\U{{\bf U}}
+\def\u{{\bf u}}
+\def\V{{\bf V}}
+\def\v{{\bf v}}
+\def\W{{\bf W}}
+\def\w{{\bf w}}
+\def\X{{\bf X}}
+\def\x{{\bf x}}
+\def\Y{{\bf Y}}
+\def\y{{\bf y}}
+\def\Z{{\bf Z}}
+\def\z{{\bf z}}
+\def\0{{\bf 0}}
+\def\1{{\bf 1}}
+
+
+
+\def\AM{{\mathcal A}}
+\def\CM{{\mathcal C}}
+\def\DM{{\mathcal D}}
+\def\EM{{\mathcal E}}
+\def\GM{{\mathcal G}}
+\def\FM{{\mathcal F}}
+\def\IM{{\mathcal I}}
+\def\JM{{\mathcal J}}
+\def\KM{{\mathcal K}}
+\def\LM{{\mathcal L}}
+\def\NM{{\mathcal N}}
+\def\OM{{\mathcal O}}
+\def\PM{{\mathcal P}}
+\def\SM{{\mathcal S}}
+\def\TM{{\mathcal T}}
+\def\UM{{\mathcal U}}
+\def\VM{{\mathcal V}}
+\def\WM{{\mathcal W}}
+\def\XM{{\mathcal X}}
+\def\YM{{\mathcal Y}}
+\def\RB{{\mathbb R}}
+\def\RBmn{{\RB^{m\times n}}}
+\def\EB{{\mathbb E}}
+\def\PB{{\mathbb P}}
+
+\def\TX{\tilde{\bf X}}
+\def\TA{\tilde{\bf A}}
+\def\tx{\tilde{\bf x}}
+\def\ty{\tilde{\bf y}}
+\def\TZ{\tilde{\bf Z}}
+\def\tz{\tilde{\bf z}}
+\def\hd{\hat{d}}
+\def\HD{\hat{\bf D}}
+\def\hx{\hat{\bf x}}
+\def\nysA{{\tilde{\A}_c^{\textrm{nys}}}}
+
+\def\alp{\mbox{\boldmath$\alpha$\unboldmath}}
+\def\bet{\mbox{\boldmath$\beta$\unboldmath}}
+\def\epsi{\mbox{\boldmath$\epsilon$\unboldmath}}
+\def\etab{\mbox{\boldmath$\eta$\unboldmath}}
+\def\ph{\mbox{\boldmath$\phi$\unboldmath}}
+\def\pii{\mbox{\boldmath$\pi$\unboldmath}}
+\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}}
+\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}}
+\def\ps{\mbox{\boldmath$\psi$\unboldmath}}
+\def\tha{\mbox{\boldmath$\theta$\unboldmath}}
+\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}}
+\def\muu{\mbox{\boldmath$\mu$\unboldmath}}
+\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}}
+\def\si{\mbox{\boldmath$\sigma$\unboldmath}}
+\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}}
+\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}}
+\def\De{\mbox{\boldmath$\Delta$\unboldmath}}
+\def\de{\mbox{\boldmath$\delta$\unboldmath}}
+\def\Ome{\mbox{\boldmath$\Omega$\unboldmath}}
+\def\Pii{\mbox{\boldmath$\Pi$\unboldmath}}
+\def\varepsi{\mbox{\boldmath$\varepsilon$\unboldmath}}
+\newcommand{\ti}[1]{\tilde{#1}}
+\def\Ncal{\mathcal{N}}
+\def\argmax{\mathop{\rm argmax}}
+\def\argmin{\mathop{\rm argmin}}
+
+\def\ALG{{\AM_{\textrm{col}}}}
+
+\def\mean{\mathsf{mean}}
+\def\std{\mathsf{std}}
+\def\bias{\mathsf{bias}}
+\def\var{\mathsf{var}}
+\def\sgn{\mathsf{sgn}}
+\def\tr{\mathsf{tr}}
+\def\rk{\mathrm{rank}}
+\def\nnz{\mathsf{nnz}}
+\def\poly{\mathrm{poly}}
+\def\diag{\mathsf{diag}}
+\def\Diag{\mathsf{Diag}}
+\def\const{\mathrm{Const}}
+\def\st{\mathsf{s.t.}}
+\def\vect{\mathsf{vec}}
+\def\sech{\mathrm{sech}}
+\def\sigmoid{\mathsf{sigmoid}}
+
+\newcommand{\red}[1]{{\color{red}#1}}
+
+
+
+\def\argmax{\mathop{\rm argmax}}
+\def\argmin{\mathop{\rm argmin}}
+
+\newenvironment{note}[1]{\medskip\noindent \textbf{#1:}}%
+        {\medskip}
+
+
+\newcommand{\etal}{{\em et al.}\ }
+\newcommand{\assign}{\leftarrow}
+\newcommand{\eps}{\epsilon}
+
+\newcommand{\opt}{\textrm{\sc OPT}}
+\newcommand{\script}[1]{\mathcal{#1}}
+\newcommand{\ceil}[1]{\lceil #1 \rceil}
+\newcommand{\floor}[1]{\lfloor #1 \rfloor}
+
+
+
+\lstset{ %
+extendedchars=false,            % Shutdown no-ASCII compatible
+language=Python,                % choose the language of the code
+xleftmargin=1em,
+xrightmargin=1em,
+basicstyle=\footnotesize,    % the size of the fonts that are used for the code
+tabsize=3,                            % sets default tabsize to 3 spaces
+numbers=left,                   % where to put the line-numbers
+numberstyle=\tiny,              % the size of the fonts that are used for the line-numbers
+stepnumber=1,                   % the step between two line-numbers. If it's 1 each line
+                                % will be numbered
+numbersep=5pt,                  % how far the line-numbers are from the code   %
+keywordstyle=\color[rgb]{0,0,1},                % keywords
+commentstyle=\color[rgb]{0.133,0.545,0.133},    % comments
+stringstyle=\color[rgb]{0.627,0.126,0.941},      % strings
+backgroundcolor=\color{white}, % choose the background color. You must add \usepackage{color}
+showspaces=false,               % show spaces adding particular underscores
+showstringspaces=false,         % underline spaces within strings
+showtabs=false,                 % show tabs within strings adding particular underscores
+frame=single,                 % adds a frame around the code
+%captionpos=b,                   % sets the caption-position to bottom
+breaklines=true,                % sets automatic line breaking
+breakatwhitespace=false,        % sets if automatic breaks should only happen at whitespace
+%title=\lstname,                 % show the filename of files included with \lstinputlisting;
+%                                % also try caption instead of title
+mathescape=true,escapechar=?,   % escape to latex with ?..?
+escapeinside={\%*}{*)},         % if you want to add a comment within your code
+%columns=fixed,                  % nice spacing
+%morestring=[m]',                % strings
+%morekeywords={%,...},%          % if you want to add more keywords to the set
+%    break,case,catch,continue,elseif,else,end,for,function,global,%
+%    if,otherwise,persistent,return,switch,try,while,...},%
+}
+
+
+\begin{document}
+
+%\setlength{\fboxrule}{.5mm}\setlength{\fboxsep}{1.2mm}
+%\newlength{\boxlength}\setlength{\boxlength}{\textwidth}
+%\addtolength{\boxlength}{-4mm}
+
+
+\title{Deep Reinforcement Learning}
+
+\author{\textbf{Shusen Wang} \\ Stevens Institute of Technology}
+
+%\date{ }
+
+\maketitle
+
+\begin{abstract}
+This lecture note briefly summarizes three kinds of deep reinforcement learning approaches: value-based methods, policy-based methods, and actor-critic methods.
+This note is structured as follows.
+First, reinforcement learning terminologies are defined.
+Second, we study Deep Q Network (DQN), a family of value-based methods, and train DQN using temporal difference (TD) learning.
+Third, we study policy-based learning and derive policy gradient algorithms.
+Last, we study standard (random) actor-critic method and deterministic actor-critic method.
+\end{abstract}
+
+
+
+\section{Notation}
+
+Throughout, we use uppercase letters, e.g., $X$, to denote random variables and lowercase letters, e.g., $x$, to denote their observations.
+Let $\PB (X = x)$ be the probability of the event ``$X = x$''.
+Let $\PB (Y=y | X=x)$ be the probability of the event ``$Y=y$'' under the condition ``$X=x$''.
+
+\paragraph{Agent:}
+A system that is embedded in an environment and takes actions to change the state of the environment. Examples include robots, industrial controllers, and Mario in the game Super Mario.
+
+
+\paragraph{State ($S$):}
+State can be viewed as a summary of the history of the system that determines its future evolution.
+State space $\SM$ is the set that contains all the possible states.
+At time step $t$, the past states are observed and we thus know their values: $s_1, \cdots , s_t$;
+however, the future states $S_{t+1}, S_{t+2}, \cdots $ are unobserved random variables.
+
+
+\paragraph{Action ($A$):}
+The agent's decision based on the state and other considerations.
+Action space $\AM$ is the set that contains all the actions.
+Action space can be a discrete set such as $\{\textrm{``left''}, \textrm{``right''}, \textrm{``up''} \}$ or a continuous set such as $[0, 1] \times [-90, 90]$.
+At time step $t$, the past actions are observed: $a_1, \cdots , a_t$, but the future actions $A_{t+1}, A_{t+2}, \cdots$ are unobserved random variables.
+
+
+\paragraph{Reward ($R$):}
+Reward is a value received by the agent from the environment as a direct response to the agent's actions.
+At time step $t$, all the past rewards are observed: $r_1, r_2, \cdots , r_t$.
+However, the future reward $R_i$ (for $i > t$) is unobserved, and it depends on the random variables $S_{i}$ and $A_{i}$.
+Thus, at time step $t$, the future rewards $R_{t+1}, R_{t+2} , \cdots$ are random variables.
+
+
+
+
+\paragraph{Policy function ($\pi $):}
+The decision-making function of the agent. 
+Policy is the probability density function (PDF): $\pi (a | s) = \PB ( A = a | S  = s )$.
+The policy function maps the observed state $S=s$ to a probability distribution over all the actions in set $\AM$.
+Since $\pi$ is a PDF, $\sum_{a \in \AM } \pi (a | s) = 1$.
+The agent will perform action $a$ with probability $\pi (a | s)$, for all $a \in \AM$.
+See the illustration in Figure~\ref{fig:random}.
+
+
+
+\paragraph{State transition ($p $):}
+Given the current state $S=s$, the agent's action $A=a$ will lead to the new state $S'$ given by the environment.
+State-transition function is the probability density function (PDF) $p (s' | s, a) = \PB ( S'  = s' | S = s , A = a )$.
+The environment makes $s'$ the new state with probability $p (s' | s, a)$, for all $s' \in \SM$.
+
+
+
+
+
+
+\paragraph{Trajectory:}
+The agent's interaction with the environment results in a sequence of (state, action, reward) triplets:
+$s_1, a_1, r_1, s_2, a_2, r_2, s_3, a_3, r_3, \cdots$
+
+\begin{figure}[!t]
+    \centering
+    \includegraphics[width=0.5\linewidth]{figures/randomness.pdf}
+    \caption{Illustration of the randomness.
+    The action $A$ is randomly sampled according to the policy function.
+    The new state $S'$ is randomly sampled according to the state-transition function.
+    }
+    \label{fig:random}
+\end{figure}
+
+
+
+
+\paragraph{Return ($U$):}
+Return (aka cumulative future reward) is defined as 
+\begin{equation*}
+    U_t = R_t + R_{t+1} + R_{t+2} + R_{t+3} + \cdots 
+\end{equation*}
+Discounted return (aka cumulative discounted future reward) is defined as 
+\begin{equation*}
+    U_t = R_t + \gamma \cdot R_{t+1} + \gamma^2 \cdot  R_{t+2} + \gamma^3 \cdot  R_{t+3} + \cdots 
+\end{equation*}
+Here, $\gamma \in (0, 1)$ is the discount rate.
+The return $U_t$ is random because the future rewards $R_t, R_{t+1}, R_{t+2}, \cdots $ are unobserved random variables.
+Recall that the randomness in the $R_i$ ($i \geq t$) comes from the future states $S_i$ and action $A_{i} $.
+
+
+
+\paragraph{Action-value function ($Q_{\pi}$):}
+Action-value function $Q_{\pi} (s_t, a_t)$ measures how good the action $a_t$ is, given the state $s_t$ and the policy $\pi$.
+Formally speaking, 
+\begin{equation*}
+    Q_{\pi} (s_t, a_t)
+    \: = \: \EB \big[ U_t \, \big| \, S_t = s_t , A_t = a_t  \big] .
+\end{equation*}
+The expectation is taken w.r.t.\ the future actions $A_{t+1}, A_{t+2}, \cdots $ and future states $S_{t+1}, S_{t+2}, \cdots $ which are random variables.
+Note that $Q_{\pi} (s_t, a_t)$ depends on the policy function $\pi$ and the state-transition function $p$.
+
+
+\paragraph{Optimal action-value function ($Q^\star $):}
+The optimal action-value function $Q^\star (s_t, a_t)$ measures how good the action $a_t$ is at state $s_t$.
+Formally speaking, 
+\begin{equation*}
+    Q^\star (s, a)
+    \: = \: \max_{\pi } Q_{\pi} (s, a).
+\end{equation*}
+Note that $Q^\star (s, a)$ is independent of the policy function $\pi$.
+
+
+
+\paragraph{State-value function ($V_{\pi} $):}
+State-value function $V_{\pi} (s_t)$ measures how good the current situation $s_t$ is, given the policy $\pi$.
+Formally speaking, 
+\begin{equation*}
+    V_{\pi} (s_t )
+    \: = \: \EB_{A\sim \pi (\cdot | s_t )} \big[ Q_{\pi} (s_t, A)  \big] 
+    \: = \: \int_{\AM } \pi (a | s_t ) \cdot Q_{\pi} (s_t, a) \: d \, a .
+\end{equation*}
+Here, the action $A$ is treated as a random variable and integrated out.
+
+
+
+
+\paragraph{Optimal state-value function ($V^\star $):}
+The optimal state-value function $V^\star (s_t)$ measures how good the current situation $s_t$ is.
+Formally speaking, 
+\begin{equation*}
+    V^\star (s )
+    \: = \: \max_{\pi } V_{\pi} (s).
+\end{equation*}
+Note that $V^\star (s)$ is independent of the policy function $\pi$.
+
+
+
+\section{Value-Based Deep Reinforcement Learning}
+
+
+The optimal action-value function $Q^\star  (s, a)$ can be used to control the agent: observing state $s_t$, the agent performs 
+\begin{equation*}
+    a_t  \: = \:  \argmax_{a \in \AM } Q^\star (s_t , a ) .
+\end{equation*}
+The optimal action-value function can be approximated by the neural network $Q (s, a; \w )$ where $\w$ captures the model parameters.
+The neural network is called \textbf{Deep Q Network (DQN)}.
+
+There are different designs of network architecture.
+Here, we consider the game Super Mario, in which the action space is discrete: $\AM = \{ \textrm{``left''} , \textrm{``right''}, \textrm{``up''}\}$.
+DQN takes state $s_t$ (which can be a screenshot or several most recent screenshots) as input.
+The architecture can be 
+\begin{equation*}
+    \texttt{State} 
+    \: \Rightarrow \: 
+    \texttt{Conv}
+    \: \Rightarrow \: 
+    \texttt{Flatten}
+    \: \Rightarrow \: 
+    \texttt{Dense} 
+    \: \Rightarrow \: 
+    \texttt{Values} .
+\end{equation*}
+In the Super Mario example, DQN outputs a 3-dimensional vector, e.g., $ [200, 100, 250]$, whose entries correspond to the three actions.
+Then the action should be 
+\begin{equation*}
+    a_t \: = \: \argmax_{a} Q (s_t , a ; \w ).
+\end{equation*}
+Since $Q (s_t , \textrm{``up''} ; \w ) = 250$ is the biggest value among the three,
+$a_t=$``up'' will be the selected action.
+
+
+
+DQN is typically trained using \textbf{temporal difference (TD) learning} \cite{sutton2008convergent,sutton2009fast} which allows for updating the model parameters every time a reward $R_t=r_t$ is observed.
+By definition, $U_t = \sum_{i=t}^{\infty} \gamma^{i-t} \cdot R_i$.
+Thus 
+\begin{equation*}
+    U_{t} \: = \: R_t + \gamma \cdot U_{t+1}.
+\end{equation*}
+TD learning makes use of the fact:
+\begin{equation*}
+    Q_\pi (s_t, a_t)
+    \: = \:
+    \EB \big[U_{t} \, \big| \, s_t , a_t \big] 
+    \: = \: \EB \big[ R_t + \gamma \cdot U_{t+1} \, \big| \, s_t , a_t \big]
+    \: = \: \EB \big[ R_t + \gamma \cdot Q_\pi (S_{t+1}, A_{t+1}) \, \big| \, s_t , a_t \big].
+\end{equation*}
+Since $Q (s_t, a_t; \w) \approx \max_{\pi} \EB [U_t | s_t, a_t]$, we have
+\begin{equation*}
+    Q (s_t, a_t; \w)  \: \approx \:  r_t + \gamma \cdot Q (s_{t+1}, a_{t+1}; \w) . 
+\end{equation*}
+Before observing $R_t$, the expected return was
+\[
+q_t \: = \: Q(s_t , a_t; \w)
+\]
+After observing $R_t=r_t$, the expected return is updated to
+\[
+y_t \: = \: r_t + \gamma \cdot Q(s_{t+1} , a_{t+1}; \w),
+\]
+which is called \textbf{TD target}.
+The \textbf{TD error} is $\delta_t = q_t - y_t$.
+We seek to encourage a small TD error and thus define the loss:
+\begin{equation*}
+    L_t \: = \: \frac{1}{2} \delta_t^2 \: = \: \frac{1}{2} \big[ Q (s_t , a_t ; \w ) - y_t \big]^2 .
+\end{equation*}
+Pretend $y_t$ is not a function of $\w$.
+Then the gradient is
+\begin{equation*}
+    \g_t
+    \: \triangleq \: \frac{\partial \, L_t }{\partial \, \w } \Big|_{\w=\w_t }
+    \: = \: \delta_t \cdot \frac{\partial \, Q (s_t , a_t ; \w ) }{\partial \, \w } \Big|_{\w=\w_t } .
+\end{equation*}
+The DQN can be updated by performing a gradient descent step: $\w_{t+1} \longleftarrow \w_{t} - \alpha \cdot \g_t $ where $\alpha$ is the learning rate.
+
+
+\section{Policy-Based Deep Reinforcement Learning} \label{sec:policy}
+
+
+
+The policy function $\pi  (a | s)$ can be used to control the agent: observing the state $S_t = s_t$, the agent randomly samples an action: 
+\begin{equation*}
+    a_t  \: \sim \:  \pi (\cdot | s_t ) .
+\end{equation*}
+The policy function can be approximated by the neural network $\pi (a | s; \tha )$ where $\tha$ captures the model parameters.
+The neural network is called \textbf{policy network}.
+
+
+
+
+There are different designs of network architecture.
+Here, we also consider the game Super Mario, in which the action space is discrete: $\AM = \{ \textrm{``left''} , \textrm{``right''}, \textrm{``up''}\}$.
+The policy network takes the observed state $s$ (which can be a screenshot) as input.
+The architecture can be 
+\begin{equation*}
+    \texttt{State} 
+    \: \Rightarrow \: 
+    \texttt{Conv}
+    \: \Rightarrow \: 
+    \texttt{Flatten}
+    \: \Rightarrow \: 
+    \texttt{Dense} 
+    \: \Rightarrow \: 
+    \texttt{Softmax}
+    \: \Rightarrow \: 
+    \texttt{Probabilities} .
+\end{equation*}
+In the Super Mario example, the policy network outputs a 3-dimensional vector, e.g., $\pp = [0.2, 0.1, 0.7]$, whose entries correspond to the three actions.
+Then the action will be randomly sampled:
+\begin{equation*}
+    \PB \big( A = \textrm{``left''} \big) = 0.2, \qquad
+    \PB \big( A = \textrm{``right''} \big) = 0.1, \qquad
+    \PB \big( A = \textrm{``up''} \big) = 0.7.
+\end{equation*}
+All of the three actions may be selected.
+If the random sampling is independently repeated 1000 times, then around 200 observations of $A$ are ``left'', around 100 are ``right'', and around 700 are ``up''.
+
+
+
+
+The policy network can be learned using \textbf{policy gradient} algorithms.
+If the actions are discrete, then the state-value function can be written as:
+\begin{equation} \label{eq:state_value}
+    V_\pi (s ) \: = \: \sum_{a \in \AM} \pi (a | s) \cdot Q_\pi  (s, a)  .
+\end{equation}
+Policy-based learning uses the policy network $\pi (a | s; \tha )$ to approximate the policy function $\pi (a | s)$.
+With the approximation of policy function, $V_\pi (s )$ is approximated by
+\begin{equation*}
+    V (s ; \tha ) \: = \: \sum_{a \in \AM} \pi (a | s; \tha ) \cdot Q_\pi  (s, a)  .
+\end{equation*}
+Policy gradient is the derivative of $ V (s ; \tha )$ w.r.t.\ $\tha $ \cite{sutton2000policy}:
+\begin{eqnarray*}
+    \frac{\partial \, V (s ; \tha )}{\partial \, \tha }
+    & = & \frac{\partial \, \sum_{a \in \AM} \pi (a | s; \tha ) \cdot Q_\pi  (s, a)  }{\partial \, \tha } \\
+    & = & \sum_{a \in \AM} \frac{\partial \,  \pi (a | s; \tha ) \cdot Q_\pi  (s, a)  }{\partial \, \tha } \\
+    & = & \sum_{a \in \AM}  Q_\pi  (s, a)  \cdot \frac{\partial \,  \pi (a | s; \tha )  }{\partial \, \tha } \\
+    & = & \sum_{a \in \AM}  Q_\pi  (s, a) \cdot \pi (a | s; \tha )  \cdot \frac{\partial \,  \log \pi (a | s; \tha )  }{\partial \, \tha } .
+\end{eqnarray*}
+Here, the third identity follows from that $Q_\pi  (s, a)$ does not depend on $\tha$;\footnote{This assumption is too strong. Since $Q_\pi$ depends on the policy function $\pi$, $Q_{\pi}$ can depend on $\tha$. Here, the assumption is used to simplify the derivation.}
+the last identity follows from that $\frac{\partial \, \log f (x)}{\partial \, x} = \frac{1}{f (x)}\cdot \frac{\partial f(x)}{\partial \, x}$.
+The above equation can be equivalently written as
+\begin{equation} \label{eqn:policy_grad}
+    \frac{\partial \, V (s ; \tha )}{\partial \, \tha }
+    \: = \: \EB_{A \sim \pi (\cdot | s , \theta )} \bigg[  Q_\pi  (s, A)  \cdot \frac{\partial \,  \log \pi (A | s; \tha )  }{\partial \, \tha }  \bigg] .
+\end{equation}
+Recall that the approximate state-value function $V (s ; \tha )$ indicates how good the situation $s$ is if policy $\pi (a | s; \tha )$ is used.
+We thereby have the motivation to update $\tha$ so that $V (s ; \tha )$ will increase (which means the situation is better.)
+Thus, the policy network can be updated by policy gradient ascent:
+\begin{equation*}
+    \tha_{t+1} \: \longleftarrow \: \tha_t + \beta \cdot \frac{\partial \, V (s ; \tha )}{\partial \, \tha } \bigg|_{\theta = \theta_{t}} ,
+\end{equation*}
+where $\beta$ is the learning rate.
+
+\begin{remark}
+The derivation of policy gradient written in the above is not rigorous!
+It is a simplified version to make the policy gradient easy to understand.
+To be rigorous, we must take into account that $Q_\pi$ depends on the policy $\pi$ and is thereby a function of $\tha$.
+However, even if $Q_\pi$'s dependence on $\tha$ is taken into account, the resulting policy gradient is the same as \eqref{eqn:policy_grad}.
+\end{remark}
+
+
+So far, we have defined the policy network and derived the policy gradient in \eqref{eqn:policy_grad}.
+However, there are two unsolved problems.
+First, the expectation in \eqref{eqn:policy_grad} may be intractable; this is typically the case when the action space $\AM$ is continuous, e.g., $\AM=[0, 1]$.
+Second, the action-value $Q_\pi  (s, a) $ is unknown.
+We answer the two questions one by one.
+
+
+\textbf{What if the expectation in \eqref{eqn:policy_grad} is intractable?}
+If the action space $\AM$ is continuous, then the expectation (which is an integration) is typically intractable.
+Given state $S_t=s_t$, if the action $A_t = a_t$ is randomly sampled according to the PDF $\pi (\cdot | s_t ; \tha )$, then 
+\begin{equation*}
+    \tilde{\g}_t
+    \: = \: Q_\pi  (s_t, a_t)  \cdot \frac{\partial \,  \log \pi (a_t | s_t; \tha )}{\partial \, \tha }
+\end{equation*}
+is an unbiased estimate of $\frac{\partial \, V (s_t ; \tha )}{\partial \, \tha }$.
+We can think of $\tilde{\g}_t$ as a stochastic gradient and update $\tha$ using stochastic gradient ascent.
+
+
+\textbf{How do we know the action-value $Q_\pi  (s, a) $?}
+There can be two solutions: first, use the observed return $u_t$ instead of $Q_\pi  (s, a) $; second, approximate $Q_\pi  (s, a) $ using a neural network.
+The two solutions are described in the following:
+\begin{itemize}
+\item
+    Play a game to the end, obtain all the rewards $r_1, r_2, \cdots , r_T$, and compute the returns $u_1, u_2, \cdots , u_T$ using the equation $u_t = \sum_{i=t}^T \gamma^{i-t} \cdot r_i$.
+    Since $Q_\pi  (s_t, a_t) = \EB [U_t | s_t, a_t , \pi ]$, we can use $u_t$ to replace $Q_\pi  (s_t, a_t) $.
+    In this way, the policy gradient \eqref{eqn:policy_grad} at time step $t$ becomes
+    \begin{equation*}
+        \frac{\partial \, V (s_t ; \tha )}{\partial \, \tha }
+        \: = \: \EB_{A \sim \pi (\cdot | s_t , \mathbf{\theta} )} \bigg[  u_t  \cdot \frac{\partial \,  \log \pi (A | s_t; \tha )  }{\partial \, \tha }  \bigg] .
+    \end{equation*}
+    AlphaGo \cite{silver2016mastering} uses this approach.
+\item
+    Use a value network to approximate $Q_\pi  (s, a) $.
+    The value network provides supervision to the policy network.
+    The value network can be learned by temporal difference (TD).
+    This leads to the actor-critic method which is elaborated on in Section~\ref{sec:actor_critic_rand}.
+\end{itemize}
+
+
+
+
+\section{Actor-Critic Methods}  \label{sec:actor_critic}
+
+Section~\ref{sec:actor_critic_rand} follows Section~\ref{sec:policy} and derives the standard (random) actor-critic method.
+This approach is suitable for problems with discrete action space.\footnote{For example, Super Mario's action space $\{\textrm{``left''}, \textrm{``right''}, \textrm{``up''} \}$ is a discrete set.}
+Section~\ref{sec:actor_critic_det} studies the deterministic actor-critic method and learns it using the deterministic policy gradient algorithm.
+This method is very useful when the actions are continuous.\footnote{For example, a self-driving car's action can be two-dimensional vectors. The first dimension is the steering angle, and the second dimension is acceleration/deceleration. The action space is obviously continuous.}
+
+
+\subsection{Random Actor-Critic Method} \label{sec:actor_critic_rand}
+
+
+The actor-critic method has two neural networks.
+Policy network $\pi (a | s; \tha ) $, which is called actor, approximates the policy function $\pi (a | s)$.
+Value network $q (s, a; \w )$, which is called critic, approximates the action-value function $Q_\pi (s, a)$.
+In this way, the state-value function $V_\pi (s )$ is approximated by
+\begin{equation*}
+    V (s ; \w , \tha )
+    \: = \: \EB_{A \sim \pi (\cdot | s; \theta )} \big[ q (s, A ; \w ) \big]
+    \: = \: \sum_{a\in \AM} \pi (a | s; \tha ) \cdot  q (s, a ; \w ) .
+\end{equation*}
+It is not hard to show the policy gradient is
+\begin{equation*}
+    \frac{\partial \, V (s ; \w , \tha ) }{ \partial \, \tha }
+    \: = \: \EB_{A \sim \pi (\cdot | s , \theta )} \bigg[  q  (s, A; \w)  \cdot \frac{\partial \,  \log \pi (A | s; \tha )  }{\partial \, \tha }  \bigg] .
+\end{equation*}
+The policy network will be updated using (stochastic) policy gradient ascent.
+The value network can be updated using temporal difference (TD) learning.
+The following summarizes one iteration of the algorithm.
+\begin{enumerate}
+    \item 
+    Observe state $s_t$, and then randomly sample action $a_t \sim \pi (\cdot | s_t ; \tha_t )$.
+    \item
+    Agent performs action $a_t$ and observes reward $r_t$ and new state $s_{t+1}$.
+    \item
+    Randomly sample action $a_{t+1} \sim \pi (\cdot | s_{t+1} ; \tha_t )$. (Agent does not perform action $a_{t+1}$.)
+    \item
+    Evaluate the value network and get $q_t = q (s_t , a_t ; \w_t )$ and $q_{t+1} = q (s_{t+1} , a_{t+1} ; \w_t )$.
+    \item
+    Compute the TD error: $\delta_t = q_t - (r_t + \gamma \cdot q_{t+1})$.
+    \item
+    Update the value network: $\w_{t+1} \longleftarrow \w_{t} - \alpha \cdot \delta_t \cdot \frac{ \partial \, q (s_t , a_t ; \w )}{\partial \, \w} \big|_{\w=\w_t }$.
+    \item
+    Update the policy network: $\tha_{t+1} \longleftarrow \tha_{t} + \beta \cdot q_t \cdot \frac{ \partial \, \log \pi ( a_t | s_t ; \theta )}{\partial \, \theta} \big|_{\theta =\theta_t }$.\footnote{In most papers and books, the update of the policy network is $\tha_{t+1} \longleftarrow \tha_{t} + \beta \cdot \delta_t \cdot \frac{ \partial \, \log \pi ( a_t | s_t ; \theta )}{\partial \, \theta} \big|_{\theta =\theta_t }$.
+The difference is that $q_t$ is replaced by $\delta_t$.
+Both approaches are correct.
+The use of $\delta_t$ is the result of using a baseline which can reduce variance.}
+\end{enumerate}
+When learning the policy network (actor), the supervision is not from the rewards; instead, the supervision is from the critic's output $q_t = q (s_t , a_t ; \w_t )$.
+The actor uses the critic's judgments to improve her performance.
+When training the critic, the supervision is from the rewards.
+The critic uses ground truth from the environment to make his judgment more accurate.
+
+
+
+\subsection{Deterministic Actor-Critic Method}  \label{sec:actor_critic_det}
+
+
+So far, the policy function has been defined as the probability density function $\pi (a | s)$, and the action is randomly sampled according to $\pi$.
+\textbf{Deterministic policy} is a function that maps state to actions: $\pi: \SM \mapsto \AM$,
+where $\SM $ is the state space and $\AM$ is the action space.
+Given the state $s$, the policy function deterministically outputs action $a = \pi (s)$.
+Deterministic policy is very useful when the actions are continuous.
+
+
+
+Deterministic actor-critic method \cite{silver2014deterministic} has two networks: policy network $\pi (s; \tha )$ and value network $q (s, a ; \w)$; see Figure~\ref{fig:dpg}.
+The agent is controlled by the policy network which deterministically maps state $s$ to action $a$.
+The value network is used for providing the policy network with supervision.
+The two networks can be trained in the following way.
+
+\begin{figure}[!h]
+    \centering
+    \includegraphics[width=0.8\linewidth]{figures/DPG.pdf}
+    \caption{Deterministic actor-critic method.
+    The deterministic policy network maps state $s \in \SM$ to action $a \in \AM \subset \RB^2$.
+    The two dimensions of $a$ are, for example, the steering angle and acceleration of a self-driving car.
+    The value network maps the pair $(s, a)$ to a scalar.
+    }
+    \label{fig:dpg}
+\end{figure}
+
+
+
+\paragraph{The value network can be trained by temporal difference (TD) learning.}
+Let $q_t = q (s_t , a_t ; \w_t )$ be the prediction and $y_t = r_t + \gamma \cdot q (s_{t+1} , a_{t+1} ; \w_t )$ be the TD target.
+The TD error is $\delta_t = q_t - y_t$.
+The model parameters $\w$ can be updated by $\w_{t+1}\longleftarrow \w_t - \alpha \cdot \delta_t \cdot \frac{\partial \, q (s_t , a_t ; \w )}{\partial \, \w}\big|_{\w=\w_t}$.
+
+
+
+
+
+\paragraph{Train the policy network by deterministic policy gradient (DPG)} which is totally different from the policy gradient we studied previously.
+Note that the value network $q (s_t, a_t; \w)$ evaluates how good it is for the agent to perform action $a_t$ at state $s_t$.
+The policy network has motivation to update its parameters $\tha$ so that the action $a_t = \pi (s_t ; \tha)$ will get a higher evaluation.
+Intuitively speaking, the policy network (actor) wants to change herself so that the evaluation given by the value network (critic) will increase.
+The derivative of the objective, i.e., $q (s_t, a_t; \w)$, w.r.t.\ the policy network's parameters $\tha$ is
+\begin{equation*}
+    \g (\tha) 
+    \: = \: \frac{\partial \, q (s_t , \pi (s_t ; \theta) ; \w )}{\partial \, \theta}
+    \: = \: \frac{\partial \, \pi (s_t ; \theta) }{\partial \, \theta} 
+    \cdot  \frac{\partial \, q (s_t , a; \w )}{\partial \, a} \bigg|_{a=\pi (s_t ; \theta) } ,
+\end{equation*}
+where the second identity follows from the chain rule.
+The policy network is updated by performing gradient ascent: $\tha_{t+1} \longleftarrow \tha_t + \beta \cdot \g (\tha_t )$.
+
+
+
+
+\bibliographystyle{plainnat}
+\bibliography{bib/rl}
+
+
+\end{document}
diff --git a/LectureNotes/DRL/bib/rl.bib b/LectureNotes/DRL/bib/rl.bib
new file mode 100644
index 0000000..41fe04b
--- /dev/null
+++ b/LectureNotes/DRL/bib/rl.bib
@@ -0,0 +1,47 @@
+@inproceedings{silver2014deterministic,
+  title={Deterministic Policy Gradient Algorithms},
+  author={Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin},
+  booktitle={International Conference on Machine Learning (ICML)},
+  pages={387--395},
+  year={2014}
+}
+
+
+@article{silver2016mastering,
+  title={Mastering the game of {Go} with deep neural networks and tree search},
+  author={Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
+  journal={Nature},
+  volume={529},
+  number={7587},
+  pages={484--489},
+  year={2016}
+}
+
+
+@inproceedings{sutton2000policy,
+  title={Policy gradient methods for reinforcement learning with function approximation},
+  author={Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay},
+  booktitle={Advances in Neural Information Processing Systems (NIPS)},
+  pages={1057--1063},
+  year={2000}
+}
+
+
+@article{sutton2008convergent,
+  title={A convergent {$O(n)$} algorithm for off-policy temporal-difference learning with linear function approximation},
+  author={Sutton, Richard S and Szepesv{\'a}ri, Csaba and Maei, Hamid Reza},
+  journal={Advances in Neural Information Processing Systems (NIPS)},
+  volume={21},
+  number={21},
+  pages={1609--1616},
+  year={2008}
+}
+
+
+@inproceedings{sutton2009fast,
+  title={Fast gradient-descent methods for temporal-difference learning with linear function approximation},
+  author={Sutton, Richard S and Maei, Hamid Reza and Precup, Doina and Bhatnagar, Shalabh and Silver, David and Szepesv{\'a}ri, Csaba and Wiewiora, Eric},
+  booktitle={International Conference on Machine Learning (ICML)},
+  pages={993--1000},
+  year={2009}
+}
\ No newline at end of file
diff --git a/LectureNotes/DRL/figures/DPG.pdf b/LectureNotes/DRL/figures/DPG.pdf
new file mode 100644
index 0000000..a64f75f
Binary files /dev/null and b/LectureNotes/DRL/figures/DPG.pdf differ
diff --git a/LectureNotes/DRL/figures/DPG.pptx b/LectureNotes/DRL/figures/DPG.pptx
new file mode 100644
index 0000000..98cfff3
Binary files /dev/null and b/LectureNotes/DRL/figures/DPG.pptx differ
diff --git a/LectureNotes/DRL/figures/randomness.pdf b/LectureNotes/DRL/figures/randomness.pdf
new file mode 100644
index 0000000..c3bb95c
Binary files /dev/null and b/LectureNotes/DRL/figures/randomness.pdf differ
diff --git a/LectureNotes/DRL/figures/randomness.pptx b/LectureNotes/DRL/figures/randomness.pptx
new file mode 100644
index 0000000..22dfe16
Binary files /dev/null and b/LectureNotes/DRL/figures/randomness.pptx differ
diff --git a/README.md b/README.md
index df9f29e..079459d 100644
--- a/README.md
+++ b/README.md
@@ -1,56 +1,61 @@
 # CS583: Deep Learning
 
 
+
 1. **Machine learning basics.**
 This part briefly introduces the fundamental ML problems-- regression, classification, dimensionality reduction, and clustering-- and the traditional ML models and numerical algorithms for solving the problems.
 
-    * ML basics. 
+    * ML basics
     [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_ML_Basics.pdf)]
-    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_Models.pdf)]
+    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_Models.pdf)].
 
     
-    * Regression. 
+    * Regression
     [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_1.pdf)] 
-    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_2.pdf)]
+    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_2.pdf)].
     
     
     * Classification. 
     
-        - Logistic regression: 
+        - Logistic regression
         [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_1.pdf)] 
-        [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Logistic/paper/logistic.pdf)]
+        [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Logistic/paper/logistic.pdf)].
     
-        - SVM: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_2.pdf)] 
+        - SVM 
+        [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_2.pdf)].
     
-        - Softmax classifier: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_3.pdf)] 
+        - Softmax classifier 
+        [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_3.pdf)].
     
-        - KNN classifier: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_4.pdf)]
+        - KNN classifier
+        [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_4.pdf)].
     
-    * Regularizations. 
+    * Regularizations 
     [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Optimization.pdf)]
-    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Regularizations.pdf)]
+    [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Regularizations.pdf)].
     
-    * Clustering. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/4_Clustering.pdf)] 
+    * Clustering 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Clustering.pdf)].
     
-    * Dimensionality reduction. 
+    * Dimensionality reduction
     [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_1.pdf)] 
     [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_2.pdf)] 
-    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/SVD/svd.pdf)]
+    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/SVD/svd.pdf)].
     
     * Scientific computing libraries.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_3.pdf)]
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_3.pdf)].
+    
     
     
 2. **Neural network basics.**
 This part covers the multilayer perceptron, backpropagation, and deep learning libraries, with focus on Keras.
 
-    * Multilayer perceptron and backpropagation. 
+    * Multilayer perceptron and backpropagation
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_1.pdf)]
-    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/BP/bp.pdf)]
+    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/BP/bp.pdf)].
     
-    * Keras. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_2.pdf)]
+    * Keras
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_2.pdf)].
     
     * Further reading:
     
@@ -64,23 +69,21 @@ This part covers the multilayer perceptron, backpropagation, and deep learning l
 3. **Convolutional neural networks (CNNs).**
 This part is focused on CNNs and its application to computer vision problems.
 
-    * CNN basics.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_1.pdf)]
+    * CNN basics
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_1.pdf)].
     
-    * Tricks for improving test accuracy.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_2.pdf)]
+    * Tricks for improving test accuracy
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_2.pdf)].
     
-    * Feature scaling and batch normalization.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_3.pdf)]
+    * Feature scaling and batch normalization
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_3.pdf)].
     
-    * Advanced topics on CNNs. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_4.pdf)]
+    * Advanced topics on CNNs
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_4.pdf)].
     
-    * Popular CNN architectures.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_5.pdf)]
+    * Popular CNN architectures
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_5.pdf)].
     
-    * Face recognition.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_6.pdf)]
     
     * Further reading: 
     
@@ -93,95 +96,193 @@ This part is focused on CNNs and its application to computer vision problems.
 4. **Recurrent neural networks (RNNs).**
 This part introduces RNNs and its applications in natural language processing (NLP).
 
-    * Text processing.
+    * Categorical feature processing
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_0.pdf)] 
+	[[video (Chinese)](https://youtu.be/NWcShtqr8kc)].
+
+    * Text processing and word embedding
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_1.pdf)] 
+	[[video (Chinese)](https://youtu.be/6_2_2CPB97s)].
        
-    * RNN basics and LSTM.
+    * RNN basics
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_2.pdf)]
-    [[reference](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)]
-   
-    * Text generation.
+	[[video (Chinese)](https://youtu.be/Cc4ENs6BHQw)].
+       
+    * LSTM
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_3.pdf)]
-    
-    * Machine translation. 
+    [[reference](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)]
+	[[video (Chinese)](https://youtu.be/vTouAvxlphc)].
+       
+    * Making RNNs more effective
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_4.pdf)]
-    
-    * Image caption generation. 
+	[[video (Chinese)](https://youtu.be/pzWHk_M23a0)].
+   
+    * Text generation
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_5.pdf)]
-    [[reference](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)]
+	[[video (Chinese)](https://youtu.be/10cjvcrU_ZU)].
     
-    * Attention. 
+    * Machine translation
     [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_6.pdf)]
-    [[reference-1](https://distill.pub/2016/augmented-rnns/)]
-    [[reference-2](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html)]
+	[[video (Chinese)](https://youtu.be/gxXJ58LR684)].
+        
+    * Attention
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_8.pdf)]
+	[[video (English)](https://youtu.be/B3uws4cLcFw)]
+	[[video (Chinese)](https://youtu.be/XhWdv7ghmQQ)]
+    [[reference](https://distill.pub/2016/augmented-rnns/)].
+        
+    * Self-attention
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_9.pdf)]
+	[[video (English)](https://youtu.be/06r6kp7ujCA)]
+	[[video (Chinese)](https://youtu.be/Vr4UNt7X6Gw)].
+
+    
+    * Image caption generation 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_7.pdf)]
+    [[reference](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)].
+
     
     
-5. **Language Models beyond RNNs.**
+5. **Transformer Models.**
 
-    * Transformer model: beyond RNNs. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_7.pdf)]
-    [[reference](https://arxiv.org/pdf/1706.03762.pdf)]
+
+    * Transformer (1/2): attention without RNN
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_1.pdf)]
+	[[video (English)](https://youtu.be/FC8PziPmxnQ)]
+	[[video (Chinese)](https://youtu.be/aButdUV0dxI)].
+    
+    * Transformer (2/2): from shallow to deep
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_2.pdf)]
+	[[video (English)](https://youtu.be/J4H6A4-dvhE)]
+	[[video (Chinese)](https://youtu.be/aJRsr39F4dI)]
+    [[reference](https://arxiv.org/pdf/1706.03762.pdf)].
     
-    * Pre-train Transformer using BERT. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_8.pdf)]
-    [[reference](https://arxiv.org/pdf/1810.04805.pdf)]
+    * BERT: pre-training Transformer 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_BERT.pdf)]
+	[[video (English)](https://youtu.be/EOmd5sUUA_A)]
+	[[video (Chinese)](https://youtu.be/UlC6AjQWao8)]
+    [[reference](https://arxiv.org/pdf/1810.04805.pdf)].
+    
+    * Vision Transformer (ViT)
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_ViT.pdf)]
+	[[video (English)](https://youtu.be/HZ4j_U3FC94)]
+	[[video (Chinese)](https://youtu.be/BbzOZ9THriY)].
 
 
 6. **Autoencoders.**
 This part introduces autoencoders for dimensionality reduction and image generation.
 
-    * Autoencoder for dimensionality reduction.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_1.pdf)]
+    * Autoencoder for dimensionality reduction
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_1.pdf)].
     
-    * Variational Autoencoders (VAEs) for image generation. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_2.pdf)]
+    * Variational Autoencoders (VAEs) for image generation
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_2.pdf)].
 
     
 7. **Generative Adversarial Networks (GANs).** 
 
-    * DC-GAN [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/12_GAN.pdf)]
+    * DC-GAN [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/12_GAN.pdf)].
 
 
+    
+8. **Deep Reinforcement Learning.** 
 
-8. **Recommender system.** 
-This part is focused on the collaborative filtering approach to recommendation based on the user-item rating data.
-This part covers matrix completion methods and neural network approaches. 
+    * Reinforcement learning basics 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_1.pdf)] 
+    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/DRL/DRL.pdf)] 
+    [[video (Chinese)](https://youtu.be/vmkRMvhCW5c)].
 
-    * Collaborative filtering. 
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Recommender.pdf)]
+    * Value-based learning 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_2.pdf)] 
+    [[video (Chinese)](https://youtu.be/jflq6vNcZyA)].
 
-    
-9. **Deep Reinforcement Learning.** 
+    * Policy-based learning 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_3.pdf)] 
+    [[video (Chinese)](https://youtu.be/qI0vyfR2_Rc)].
 
-    * Reinforcement learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_1.pdf)] [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/DRL/DRL.pdf)]
+    * Actor-critic methods 
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_4.pdf)] 
+    [[video (Chinese)](https://youtu.be/xjd7Jq9wPQY)].
 
-    * Value-based learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_2.pdf)]
+    * AlphaGo and Monte Carlo tree search
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_5.pdf)] 
+    [[video (Chinese)](https://youtu.be/zHojAp5vkRE)].
 
-    * Policy-based learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_3.pdf)]
 
-    * Actor-critic methods [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_4.pdf)]
 
-    * AlphaGo [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_5.pdf)]
+9. **Parallel Computing.** 
 
+	* Basics and MapReduce 
+	[[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_1.pdf)] 
+	[[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Parallel/Parallel.pdf)] 
+	[[video (Chinese)](https://youtu.be/gVcnOe6_c6Q)].
+	
+	* Parameter server and decentralized network
+	[[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_2.pdf)] 
+	[[video (Chinese)](https://youtu.be/Aga2Lxp3G7M)].
+	
+	* TensorFlow's mirrored strategy and ring all-reduce
+	[[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_3.pdf)] 
+	[[video (Chinese)](https://youtu.be/rj-hjS5L8Bw)].
 
-10. **Parallel Computing.** 
-
-	* Basics and MapReduce. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_1.pdf)] [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Parallel/Parallel.pdf)] [[Video (in Chinese)](https://youtu.be/gVcnOe6_c6Q)]
 	
-	* Parameter Server and Decentralized Network. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_2.pdf)] [[Video (in Chinese)](https://youtu.be/Aga2Lxp3G7M)]
 	
-	* Federated Learning. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_3.pdf)] [[Video (in Chinese)](https://youtu.be/STxtRucv_zo)]
+	* Federated learning
+	[[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_4.pdf)] 
+	[[video (Chinese)](https://youtu.be/STxtRucv_zo)].
 
 
-11. **Adversarial Robustness.**
+10. **Adversarial Robustness.**
 This part introduces how to attack neural networks using adversarial examples and how to defend from the attack.
 
-	* Data evasion attack and defense.
-    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Adversarial.pdf)]
-    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Adversarial/DataAttacks.pdf)]
+	* Data evasion attack and defense
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Evasion.pdf)]
+    [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Adversarial/DataAttacks.pdf)].
+
+	* Data poisoning attack
+	 [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Poisoning.pdf)]
+	[[video (Chinese)](https://youtu.be/_K0nZcqdu5w)].
+	 
         
     * Further reading:
-    [[Adversarial Robustness - Theory and Practice](https://adversarial-ml-tutorial.org/)]
+    [[Adversarial Robustness - Theory and Practice](https://adversarial-ml-tutorial.org/)].
+    
+
+11. **Meta Learning.** 
+
+    * Few-shot learning: basic concepts
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_1.pdf)]
+	[[video (English)](https://youtu.be/hE7eGew4eeg)]
+	[[video (Chinese)](https://youtu.be/UkQ2FVpDxHg)].
+
+    * Siamese network
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_2.pdf)]
+	[[video (English)](https://youtu.be/4S-XDefSjTM)]
+	[[video (Chinese)](https://youtu.be/Er8xH_k0Vj4)].
+
+    * Pretraining + fine tuning
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_3.pdf)]
+	[[video (English)](https://youtu.be/U6uFOIURcD0)]
+	[[video (Chinese)](https://youtu.be/3zSYMuDm6RU)].
+
     
 
+12. **Neural Architecture Search (NAS).** 
+
+
+    * Basics
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_1.pdf)]
+	[[video (Chinese)](https://youtu.be/voWgnMpFaW8)].
+
+    * RNN + Reinforcement Learning
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_2.pdf)]
+	[[video (Chinese)](https://youtu.be/AmitvRzmvv0)].
+
+    * Differentiable NAS
+    [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_3.pdf)]
+	[[video (Chinese)](https://youtu.be/D9m9-CXw_HY)].
+
+
+
 
 
diff --git a/Slides/10_BERT.pdf b/Slides/10_BERT.pdf
new file mode 100644
index 0000000..34f4a7d
Binary files /dev/null and b/Slides/10_BERT.pdf differ
diff --git a/Slides/10_Recommender.pdf b/Slides/10_Recommender.pdf
deleted file mode 100644
index dd3c98b..0000000
Binary files a/Slides/10_Recommender.pdf and /dev/null differ
diff --git a/Slides/10_Transformer_1.pdf b/Slides/10_Transformer_1.pdf
new file mode 100644
index 0000000..5e528ec
Binary files /dev/null and b/Slides/10_Transformer_1.pdf differ
diff --git a/Slides/10_Transformer_2.pdf b/Slides/10_Transformer_2.pdf
new file mode 100644
index 0000000..90dd015
Binary files /dev/null and b/Slides/10_Transformer_2.pdf differ
diff --git a/Slides/10_ViT.pdf b/Slides/10_ViT.pdf
new file mode 100644
index 0000000..0295ec9
Binary files /dev/null and b/Slides/10_ViT.pdf differ
diff --git a/Slides/11_Adversarial.pdf b/Slides/11_Evasion.pdf
similarity index 100%
rename from Slides/11_Adversarial.pdf
rename to Slides/11_Evasion.pdf
diff --git a/Slides/11_Poisoning.pdf b/Slides/11_Poisoning.pdf
new file mode 100644
index 0000000..98c7d56
Binary files /dev/null and b/Slides/11_Poisoning.pdf differ
diff --git a/Slides/12_GAN.pdf b/Slides/12_GAN.pdf
index a9f0645..66c86b1 100644
Binary files a/Slides/12_GAN.pdf and b/Slides/12_GAN.pdf differ
diff --git a/Slides/13_RL_1.pdf b/Slides/13_RL_1.pdf
index 3443444..f8eb258 100644
Binary files a/Slides/13_RL_1.pdf and b/Slides/13_RL_1.pdf differ
diff --git a/Slides/13_RL_2.pdf b/Slides/13_RL_2.pdf
index 57bde51..5a21cc8 100644
Binary files a/Slides/13_RL_2.pdf and b/Slides/13_RL_2.pdf differ
diff --git a/Slides/13_RL_3.pdf b/Slides/13_RL_3.pdf
index 2c63054..010ab8f 100644
Binary files a/Slides/13_RL_3.pdf and b/Slides/13_RL_3.pdf differ
diff --git a/Slides/13_RL_4.pdf b/Slides/13_RL_4.pdf
index c5d6d78..54cc4b6 100644
Binary files a/Slides/13_RL_4.pdf and b/Slides/13_RL_4.pdf differ
diff --git a/Slides/13_RL_5.pdf b/Slides/13_RL_5.pdf
index 7240019..cb28222 100644
Binary files a/Slides/13_RL_5.pdf and b/Slides/13_RL_5.pdf differ
diff --git a/Slides/14_Parallel_1.pdf b/Slides/14_Parallel_1.pdf
index b484324..4d02138 100644
Binary files a/Slides/14_Parallel_1.pdf and b/Slides/14_Parallel_1.pdf differ
diff --git a/Slides/14_Parallel_2.pdf b/Slides/14_Parallel_2.pdf
index 56a2d64..478c68d 100644
Binary files a/Slides/14_Parallel_2.pdf and b/Slides/14_Parallel_2.pdf differ
diff --git a/Slides/14_Parallel_3.pdf b/Slides/14_Parallel_3.pdf
index fa81ea0..682ce8c 100644
Binary files a/Slides/14_Parallel_3.pdf and b/Slides/14_Parallel_3.pdf differ
diff --git a/Slides/14_Parallel_4.pdf b/Slides/14_Parallel_4.pdf
new file mode 100644
index 0000000..afc6c03
Binary files /dev/null and b/Slides/14_Parallel_4.pdf differ
diff --git a/Slides/15_NAS_1.pdf b/Slides/15_NAS_1.pdf
new file mode 100644
index 0000000..3527aa4
Binary files /dev/null and b/Slides/15_NAS_1.pdf differ
diff --git a/Slides/15_NAS_2.pdf b/Slides/15_NAS_2.pdf
new file mode 100644
index 0000000..bd195c4
Binary files /dev/null and b/Slides/15_NAS_2.pdf differ
diff --git a/Slides/15_NAS_3.pdf b/Slides/15_NAS_3.pdf
new file mode 100644
index 0000000..6cb6532
Binary files /dev/null and b/Slides/15_NAS_3.pdf differ
diff --git a/Slides/16_Meta_1.pdf b/Slides/16_Meta_1.pdf
new file mode 100644
index 0000000..72f40cb
Binary files /dev/null and b/Slides/16_Meta_1.pdf differ
diff --git a/Slides/16_Meta_2.pdf b/Slides/16_Meta_2.pdf
new file mode 100644
index 0000000..43b5ed2
Binary files /dev/null and b/Slides/16_Meta_2.pdf differ
diff --git a/Slides/16_Meta_3.pdf b/Slides/16_Meta_3.pdf
new file mode 100644
index 0000000..b069f44
Binary files /dev/null and b/Slides/16_Meta_3.pdf differ
diff --git a/Slides/2_Regression_1.pdf b/Slides/2_Regression_1.pdf
index f0da1c9..8136026 100644
Binary files a/Slides/2_Regression_1.pdf and b/Slides/2_Regression_1.pdf differ
diff --git a/Slides/2_Regression_2.pdf b/Slides/2_Regression_2.pdf
index da43078..6a99162 100644
Binary files a/Slides/2_Regression_2.pdf and b/Slides/2_Regression_2.pdf differ
diff --git a/Slides/3_Classification_1.pdf b/Slides/3_Classification_1.pdf
index 913deca..7445d72 100644
Binary files a/Slides/3_Classification_1.pdf and b/Slides/3_Classification_1.pdf differ
diff --git a/Slides/3_Classification_2.pdf b/Slides/3_Classification_2.pdf
index 1e44260..414cb40 100644
Binary files a/Slides/3_Classification_2.pdf and b/Slides/3_Classification_2.pdf differ
diff --git a/Slides/3_Classification_3.pdf b/Slides/3_Classification_3.pdf
index f14b1eb..26a7dbc 100644
Binary files a/Slides/3_Classification_3.pdf and b/Slides/3_Classification_3.pdf differ
diff --git a/Slides/3_Classification_4.pdf b/Slides/3_Classification_4.pdf
index 2334699..1174850 100644
Binary files a/Slides/3_Classification_4.pdf and b/Slides/3_Classification_4.pdf differ
diff --git a/Slides/4_Clustering.pdf b/Slides/3_Clustering.pdf
similarity index 100%
rename from Slides/4_Clustering.pdf
rename to Slides/3_Clustering.pdf
diff --git a/Slides/3_Optimization.pdf b/Slides/3_Optimization.pdf
index 95f4435..f0ee580 100644
Binary files a/Slides/3_Optimization.pdf and b/Slides/3_Optimization.pdf differ
diff --git a/Slides/3_Regularizations.pdf b/Slides/3_Regularizations.pdf
index 6dfa7de..12d2887 100644
Binary files a/Slides/3_Regularizations.pdf and b/Slides/3_Regularizations.pdf differ
diff --git a/Slides/4_MC_1.pdf b/Slides/4_MC_1.pdf
new file mode 100644
index 0000000..9b14730
Binary files /dev/null and b/Slides/4_MC_1.pdf differ
diff --git a/Slides/5_DR_3.pdf b/Slides/5_DR_3.pdf
index a4eae3e..6d06060 100644
Binary files a/Slides/5_DR_3.pdf and b/Slides/5_DR_3.pdf differ
diff --git a/Slides/6_NeuralNet_1.pdf b/Slides/6_NeuralNet_1.pdf
index fe0d5e0..a886e57 100644
Binary files a/Slides/6_NeuralNet_1.pdf and b/Slides/6_NeuralNet_1.pdf differ
diff --git a/Slides/7_CNN_1.pdf b/Slides/7_CNN_1.pdf
index 0de1116..2162da6 100644
Binary files a/Slides/7_CNN_1.pdf and b/Slides/7_CNN_1.pdf differ
diff --git a/Slides/7_CNN_3.pdf b/Slides/7_CNN_3.pdf
index eb61ea0..67e6726 100644
Binary files a/Slides/7_CNN_3.pdf and b/Slides/7_CNN_3.pdf differ
diff --git a/Slides/7_CNN_5.pdf b/Slides/7_CNN_5.pdf
index 40fbbc5..51dd3e1 100644
Binary files a/Slides/7_CNN_5.pdf and b/Slides/7_CNN_5.pdf differ
diff --git a/Slides/8_AE_2.pdf b/Slides/8_AE_2.pdf
index 459b203..a26f7a3 100644
Binary files a/Slides/8_AE_2.pdf and b/Slides/8_AE_2.pdf differ
diff --git a/Slides/9_RNN_0.pdf b/Slides/9_RNN_0.pdf
new file mode 100644
index 0000000..6254ef3
Binary files /dev/null and b/Slides/9_RNN_0.pdf differ
diff --git a/Slides/9_RNN_1.pdf b/Slides/9_RNN_1.pdf
index e3bd5ae..7660614 100644
Binary files a/Slides/9_RNN_1.pdf and b/Slides/9_RNN_1.pdf differ
diff --git a/Slides/9_RNN_2.pdf b/Slides/9_RNN_2.pdf
index 9671464..9a0fccc 100644
Binary files a/Slides/9_RNN_2.pdf and b/Slides/9_RNN_2.pdf differ
diff --git a/Slides/9_RNN_3.pdf b/Slides/9_RNN_3.pdf
index 941d2f0..8d35e83 100644
Binary files a/Slides/9_RNN_3.pdf and b/Slides/9_RNN_3.pdf differ
diff --git a/Slides/9_RNN_4.pdf b/Slides/9_RNN_4.pdf
index 2a840ce..41d6c5e 100644
Binary files a/Slides/9_RNN_4.pdf and b/Slides/9_RNN_4.pdf differ
diff --git a/Slides/9_RNN_5.pdf b/Slides/9_RNN_5.pdf
index 91aa93d..dc0f678 100644
Binary files a/Slides/9_RNN_5.pdf and b/Slides/9_RNN_5.pdf differ
diff --git a/Slides/9_RNN_6.pdf b/Slides/9_RNN_6.pdf
index f366ed7..e0e010e 100644
Binary files a/Slides/9_RNN_6.pdf and b/Slides/9_RNN_6.pdf differ
diff --git a/Slides/9_RNN_7.pdf b/Slides/9_RNN_7.pdf
index 0aec9bf..91aa93d 100644
Binary files a/Slides/9_RNN_7.pdf and b/Slides/9_RNN_7.pdf differ
diff --git a/Slides/9_RNN_8.pdf b/Slides/9_RNN_8.pdf
index 613c80f..297466e 100644
Binary files a/Slides/9_RNN_8.pdf and b/Slides/9_RNN_8.pdf differ
diff --git a/Slides/9_RNN_9.pdf b/Slides/9_RNN_9.pdf
new file mode 100644
index 0000000..7ec1845
Binary files /dev/null and b/Slides/9_RNN_9.pdf differ