diff --git a/LICENSE b/LICENSE index ca895b7..9309bf6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,6 @@ -MIT License - Copyright (c) 2019 Shusen Wang -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Permission is granted to only nonprofit organizations, including schools and +research institutes. Employees of nonprofit organizations are granted, free +of charge, the rights to use, copy, modify, merge, publish, and distribute +the slides and lecture notes in this repo. 
diff --git a/LectureNotes/BP/bp.tex b/LectureNotes/BP/bp.tex new file mode 100644 index 0000000..6d69dda --- /dev/null +++ b/LectureNotes/BP/bp.tex @@ -0,0 +1,773 @@ +\documentclass[11pt]{article} +\usepackage{amsmath,amssymb,amsmath,amsthm,amsfonts} +\usepackage{latexsym,graphicx} +\usepackage{fullpage,color} +\usepackage{url} +\usepackage[pdftex,bookmarks,colorlinks=true,citecolor=blue]{hyperref} +\usepackage{natbib} +\usepackage{graphicx,subfigure} +\usepackage{algorithm} +\usepackage{algorithmic} +\usepackage{listings} +\usepackage[dvipsnames]{xcolor} +\usepackage{color} +\usepackage{wrapfig} + +\numberwithin{equation}{section} + +\pagestyle{plain} + +\setlength{\oddsidemargin}{0in} +\setlength{\topmargin}{0in} +\setlength{\textwidth}{6.5in} +\setlength{\textheight}{8.5in} + +\newtheorem{fact}{Fact}[section] +\newtheorem{question}{Question}[section] +\newtheorem{lemma}{Lemma}[section] +\newtheorem{theorem}[lemma]{Theorem} +\newtheorem{assumption}[lemma]{Assumption} +\newtheorem{corollary}[lemma]{Corollary} +\newtheorem{prop}[lemma]{Proposition} +\newtheorem{claim}{Claim}[section] +\newtheorem{remark}{Remark}[section] +\newtheorem{definition}{Definition}[section] +\newtheorem{prob}{Problem}[section] +\newtheorem{conjecture}{Conjecture}[section] +\newtheorem{property}{Property}[section] + +\def\A{{\bf A}} +\def\a{{\bf a}} +\def\B{{\bf B}} +\def\bb{{\bf b}} +\def\C{{\bf C}} +\def\c{{\bf c}} +\def\D{{\bf D}} +\def\d{{\bf d}} +\def\E{{\bf E}} +\def\e{{\bf e}} +\def\F{{\bf F}} +\def\f{{\bf f}} +\def\g{{\bf g}} +\def\h{{\bf h}} +\def\G{{\bf G}} +\def\H{{\bf H}} +\def\I{{\bf I}} +\def\K{{\bf K}} +\def\k{{\bf k}} +\def\LL{{\bf L}} +\def\M{{\bf M}} +\def\m{{\bf m}} +\def\N{{\bf N}} +\def\n{{\bf n}} +\def\PP{{\bf P}} +\def\pp{{\bf p}} +\def\Q{{\bf Q}} +\def\q{{\bf q}} +\def\R{{\bf R}} +\def\rr{{\bf r}} +\def\S{{\bf S}} +\def\s{{\bf s}} +\def\T{{\bf T}} +\def\tt{{\bf t}} +\def\U{{\bf U}} +\def\u{{\bf u}} +\def\V{{\bf V}} +\def\v{{\bf v}} +\def\W{{\bf W}} +\def\w{{\bf w}} 
+\def\X{{\bf X}} +\def\x{{\bf x}} +\def\Y{{\bf Y}} +\def\y{{\bf y}} +\def\Z{{\bf Z}} +\def\z{{\bf z}} +\def\0{{\bf 0}} +\def\1{{\bf 1}} + + + +\def\AM{{\mathcal A}} +\def\CM{{\mathcal C}} +\def\DM{{\mathcal D}} +\def\EM{{\mathcal E}} +\def\GM{{\mathcal G}} +\def\FM{{\mathcal F}} +\def\IM{{\mathcal I}} +\def\JM{{\mathcal J}} +\def\KM{{\mathcal K}} +\def\LM{{\mathcal L}} +\def\NM{{\mathcal N}} +\def\OM{{\mathcal O}} +\def\PM{{\mathcal P}} +\def\SM{{\mathcal S}} +\def\TM{{\mathcal T}} +\def\UM{{\mathcal U}} +\def\VM{{\mathcal V}} +\def\WM{{\mathcal W}} +\def\XM{{\mathcal X}} +\def\YM{{\mathcal Y}} +\def\RB{{\mathbb R}} +\def\RBmn{{\RB^{m\times n}}} +\def\EB{{\mathbb E}} +\def\PB{{\mathbb P}} + +\def\TX{\tilde{\bf X}} +\def\TA{\tilde{\bf A}} +\def\tx{\tilde{\bf x}} +\def\ty{\tilde{\bf y}} +\def\TZ{\tilde{\bf Z}} +\def\tz{\tilde{\bf z}} +\def\hd{\hat{d}} +\def\HD{\hat{\bf D}} +\def\hx{\hat{\bf x}} +\def\nysA{{\tilde{\A}_c^{\textrm{nys}}}} + +\def\alp{\mbox{\boldmath$\alpha$\unboldmath}} +\def\bet{\mbox{\boldmath$\beta$\unboldmath}} +\def\epsi{\mbox{\boldmath$\epsilon$\unboldmath}} +\def\etab{\mbox{\boldmath$\eta$\unboldmath}} +\def\ph{\mbox{\boldmath$\phi$\unboldmath}} +\def\pii{\mbox{\boldmath$\pi$\unboldmath}} +\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}} +\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}} +\def\ps{\mbox{\boldmath$\psi$\unboldmath}} +\def\tha{\mbox{\boldmath$\theta$\unboldmath}} +\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}} +\def\muu{\mbox{\boldmath$\mu$\unboldmath}} +\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}} +\def\si{\mbox{\boldmath$\sigma$\unboldmath}} +\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}} +\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}} +\def\De{\mbox{\boldmath$\Delta$\unboldmath}} +\def\Ome{\mbox{\boldmath$\Omega$\unboldmath}} +\def\Pii{\mbox{\boldmath$\Pi$\unboldmath}} +\def\varepsi{\mbox{\boldmath$\varepsilon$\unboldmath}} +\newcommand{\ti}[1]{\tilde{#1}} +\def\Ncal{\mathcal{N}} +\def\argmax{\mathop{\rm argmax}} +\def\argmin{\mathop{\rm 
argmin}} + +\def\ALG{{\AM_{\textrm{col}}}} + +\def\mean{\mathsf{mean}} +\def\std{\mathsf{std}} +\def\orth{\mathsf{orth}} +\def\var{\mathsf{var}} +\def\sgn{\mathsf{sgn}} +\def\tr{\mathsf{tr}} +\def\rk{\mathrm{rank}} +\def\nnz{\mathsf{nnz}} +\def\st{\mathsf{s.t.}} +\def\vect{\mathsf{vec}} +\def\sech{\mathrm{sech}} +\def\sigmoid{\mathsf{sigmoid}} +\def\din{{d_{\textrm{in}}}} +\def\dout{{d_{\textrm{out}}}} + + +\newcommand{\red}[1]{{\color{red}#1}} +\newcommand{\blue}[1]{{\color{blue}#1}} +\newcommand{\green}[1]{{\color{green}#1}} + + + +\def\argmax{\mathop{\rm argmax}} +\def\argmin{\mathop{\rm argmin}} + +\newenvironment{note}[1]{\medskip\noindent \textbf{#1:}}% + {\medskip} + + +\newcommand{\etal}{{\em et al.}\ } +\newcommand{\assign}{\leftarrow} +\newcommand{\eps}{\epsilon} + + + + +\lstset{ % +extendedchars=false, % Shutdown no-ASCII compatible +language=Python, % choose the language of the code +xleftmargin=1em, +xrightmargin=1em, +basicstyle=\footnotesize, % the size of the fonts that are used for the code +tabsize=3, % sets default tabsize to 3 spaces +numbers=left, % where to put the line-numbers +numberstyle=\tiny, % the size of the fonts that are used for the line-numbers +stepnumber=1, % the step between two line-numbers. If it's 1 each line + % will be numbered +numbersep=5pt, % how far the line-numbers are from the code % +keywordstyle=\color[rgb]{0,0,1}, % keywords +commentstyle=\color[rgb]{0.133,0.545,0.133}, % comments +stringstyle=\color[rgb]{0.627,0.126,0.941}, % strings +backgroundcolor=\color{white}, % choose the background color. 
You must add \usepackage{color} +showspaces=false, % show spaces adding particular underscores +showstringspaces=false, % underline spaces within strings +showtabs=false, % show tabs within strings adding particular underscores +frame=single, % adds a frame around the code +%captionpos=b, % sets the caption-position to bottom +breaklines=true, % sets automatic line breaking +breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace +%title=\lstname, % show the filename of files included with \lstinputlisting; +% % also try caption instead of title +mathescape=true,escapechar=? % escape to latex with ?..? +escapeinside={\%*}{*)}, % if you want to add a comment within your code +%columns=fixed, % nice spacing +%morestring=[m]', % strings +%morekeywords={%,...},% % if you want to add more keywords to the set +% break,case,catch,continue,elseif,else,end,for,function,global,% +% if,otherwise,persistent,return,switch,try,while,...},% +} + + +\begin{document} + +%\setlength{\fboxrule}{.5mm}\setlength{\fboxsep}{1.2mm} +%\newlength{\boxlength}\setlength{\boxlength}{\textwidth} +%\addtolength{\boxlength}{-4mm} + + +\title{BackPropagation for Fully-Connected and \\Convolutional Neural Networks} + +\author{\textbf{Shusen Wang} \\ Stevens Institute of Technology} + +%\date{ } + +\maketitle + +\begin{abstract} +First, define a fully-connected (FC) layer. +Second, derive the gradients using chain rule for one FC layer. +Third, connect multiple FC layers to build a FC neural network. +Fourth, perform backpropagation using the chain rule. +Fifth, express convolution as a matrix multiplication. +Last, derive gradients for convolutional layer. +\end{abstract} + + +\section{Fully-Connected (FC) Layer} + + + +We consider one fully-connected (FC) layer and follow the convention of PyTorch. +Let $\din$ be the input shape, $\dout$ be the output shape, and $b$ be the batch size. 
+Let $\X \in \RB^{b\times \din}$ be a batch of input vectors, $\W \in \RB^{\dout \times \din}$ be the weight matrix, and $\Z = \X \W^T \in \RB^{b\times \dout}$. +The output of this FC layer is $\X' = \sigma (\Z) \in \RB^{b\times \dout}$ where $\sigma$ is an activation function that applies elementwisely. +For example, if the activation function is ReLU, then the $(i,j)$-th entry of $\X'$ is +\begin{equation*} + x_{ij}' + \: = \: + \left\{ + \begin{array}{cc} + z_{ij}, & \textrm{if } z_{ij} > 0; \\ + 0, & \textrm{otherwise.} \\ + \end{array} + \right. +\end{equation*} +The structure of a FC layer is illustrated in Figure~\ref{fig:differential}(left). + + + +\section{Differentiation for FC Layer} \label{sec:differential} + +Let $Q$ be the loss function that depends on $\X'$. +Suppose we know $\frac{\partial \, Q}{ \partial \, \X'} \in \RB^{b\times \dout}$ (the derivative of $Q$ w.r.t.\ $\X'$). +Since $\X$ and $\W$ influence $Q$ via $\X'$: +\begin{equation*} + \left. + \begin{array}{c c} + \cdots \: \longrightarrow \: \cdots \: \longrightarrow \: & \X \\ + & \W + \end{array} + \right\} + \: \xrightarrow{\textsf{~multiply~}} \: + \Z + \: \xrightarrow{\textsf{activation}} \: + \X' + \: \longrightarrow \: + \cdots + \: \longrightarrow \: + Q , +\end{equation*} +we can let the gradient flow to $\X$ and $\W$ in the opposite direction: +\begin{equation*} + \left. + \begin{array}{c c} + \cdots \: \longleftarrow \: \cdots \: \longleftarrow \: & \frac{\partial \, Q }{\partial \, \X} \\ + & \frac{\partial \, Q }{\partial \, \W} + \end{array} + \right\} + \: \longleftarrow \: + \frac{\partial \, Q }{\partial \, \Z} + \: \longleftarrow \: + \frac{\partial \, Q }{\partial \, \X'} . +\end{equation*} +In the following, we compute $\frac{\partial \, Q}{ \partial \, \Z}$ and then $\frac{\partial \, Q}{ \partial \, \X}$ and $\frac{\partial \, Q}{ \partial \, \W}$. + +\paragraph{From $\X'$ to $\Z$.} +First, compute $\frac{\partial \, Q}{ \partial \, \Z} \in \RB^{b\times \dout}$. 
+If $\X' = \textsf{ReLU} (\Z)$,\footnote{$[\textsf{ReLU} (\Z)]_{ij} = \max \{z_{ij} , \, 0 \}$.} +then the $(i,j)$-th entry of $\frac{\partial \, Q}{ \partial \, \Z} $ is +\begin{equation*} + \Big[\frac{\partial \, Q}{ \partial \, \Z} \Big]_{ij} + \: = \: \frac{\partial \, Q}{ \partial \, z_{ij}} + \: = \: \frac{\partial \, x_{ij}' }{ \partial \, z_{ij}} \, \frac{\partial \, Q}{ \partial \, x_{ij}'} + \: = \: + \left\{ + \begin{array}{cc} + \tfrac{\partial \, Q}{ \partial \, x_{ij}'}, & \textrm{if } z_{ij} > 0; \\ + 0, & \textrm{otherwise.} \\ + \end{array} + \right. +\end{equation*} +Let $\A \in \RB^{b\times \dout}$ be the matrix such that +\begin{equation*} + a_{ij} + \: = \: + \left\{ + \begin{array}{cc} + 1, & \textrm{if } z_{ij} > 0; \\ + 0, & \textrm{otherwise.} \\ + \end{array} + \right. +\end{equation*} +Let ``$\circ $'' denote the Hadamard product (also known as the elementwise product). +Then +\begin{equation} \label{eq:grad_q_z} + \frac{\partial \, Q}{ \partial \, \Z} + \: = \: \A \circ \frac{\partial \, Q}{ \partial \, \X'} + \: \in \: \RB^{b\times \dout} . +\end{equation} + +\paragraph{From $\Z$ to $\X$.} +Second, compute $\frac{\partial \, Q}{ \partial \, \X} \in \RB^{b\times \din}$. +Let $\x_{i:} \in \RB^{1\times \din}$ and $\z_{i:} \in \RB^{1\times \dout}$ be the $i$-th rows of $\X$ and $\Z$, respectively, for $i = 1$ to $b$.\footnote{To calculate gradient in the standard way, we must use column vectors.} +Thus $\x_{i:}^T \in \RB^{\din \times 1}$ and $\z_{i:}^T \in \RB^{ \dout \times 1}$ are the $i$-th columns of $\X^T \in \RB^{\din \times b}$ and $\Z^T \in \RB^{\dout \times b}$, respectively. 
+It follows from the chain rule that +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \x_{i:}^T } + \: = \: \sum_{j=1}^{b} \frac{ \partial \, \z_{j:}^T }{ \partial \, \x_{i:}^T } \frac{ \partial \, Q }{ \partial \, \z_{j:}^T } + \: = \: \frac{ \partial \, \z_{i:}^T }{ \partial \, \x_{i:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{i:}^T } + + \sum_{j\neq i} \frac{ \partial \, \z_{j:}^T }{ \partial \, \x_{i:}^T } \frac{ \partial \, Q }{ \partial \, \z_{j:}^T } . +\end{equation*} +If $i \neq j$, $\z_{j:}$ will not depend on $\x_{i:}$, and thus $\frac{ \partial \, \z_{j:}^T }{ \partial \, \x_{i:}^T } $ is the all-zero matrix. +It follows that +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \x_{i:}^T } + \: = \: \underbrace{\frac{ \partial \, \z_{i:}^T }{ \partial \, \x_{i:}^T }}_{\din \times \dout} \, + \underbrace{\frac{ \partial \, Q }{ \partial \, \z_{i:}^T } }_{\dout\times 1} + \: \in \: \RB^{\din \times 1}. +\end{equation*} +Since $\z_{i:} = \x_{i:} \W^T$, we have $\z_{i:}^T = \W \x_{i:}^T$, and thus $\frac{ \partial \, \z_{i:}^T }{ \partial \, \x_{i:}^T } = \W^T \in \RB^{\din \times \dout }$. +It follows that +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \x_{i:}^T } + \: = \: \W^T \, \cdot \, \frac{ \partial \, Q }{ \partial \, \z_{i:}^T } + \: \in \: \RB^{\din \times 1}. +\end{equation*} +Since $\x_{i:}^T$ is the $i$-th column of $\X^T \in \RB^{\din \times b}$, +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \X^T } + \: = \: \Big[ \frac{ \partial \, Q }{ \partial \, \x_{1:}^T } , \; \cdots , \; \frac{ \partial \, Q }{ \partial \, \x_{b:}^T } \Big] + \: = \: \W^T \, \cdot \, \Big[ \frac{ \partial \, Q }{ \partial \, \z_{1:}^T } , \; \cdots , \; \frac{ \partial \, Q }{ \partial \, \z_{b:}^T } \Big] + \: = \: \underbrace{\W^T}_{\din\times \dout} \, \cdot \, + \underbrace{\frac{ \partial \, Q }{ \partial \, \Z^T } }_{\dout \times b} + \: \in \: \RB^{\din \times b}. 
+\end{equation*} +Hence +\begin{equation} \label{eq:grad_q_x} + \frac{ \partial \, Q }{ \partial \, \X } + \: = \: \bigg( \frac{ \partial \, Q }{ \partial \, \X^T } \bigg)^T + \: = \: \frac{ \partial \, Q }{ \partial \, \Z } \, \cdot \, \W + \: \in \: \RB^{b \times \din}. +\end{equation} + + +\paragraph{From $\Z$ to $\W$.} +Third, compute $\frac{\partial \, Q}{ \partial \, \W} \in \RB^{\dout\times \din}$. +Let $\z_{:j} \in \RB^{b\times 1}$ be the $j$-th column of $\Z$ +and $\w_{j:} \in \RB^{1\times \din}$ be the $j$-th row of $\W$, for $j = 1$ to $\dout$. +Thus $\w_{j:}^T \in \RB^{\din \times 1}$ is the $j$-th column of $\W^T \in \RB^{\din \times \dout}$. +It follows from the chain rule that +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \w_{j:}^T } + \: = \: \sum_{i=1}^{\dout} \frac{ \partial \, \z_{:i} }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:i} } + \: = \: \frac{ \partial \, \z_{:j} }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:j} } + + \sum_{i\neq j} \frac{ \partial \, \z_{:i} }{ \partial \, \w_{j:}^T } \, \frac{ \partial \, Q }{ \partial \, \z_{:i} } + \: = \: \underbrace{ \frac{ \partial \, \z_{:j} }{ \partial \, \w_{j:}^T } }_{\din \times b} + \, \underbrace{ \frac{ \partial \, Q }{ \partial \, \z_{:j} } }_{b\times 1} + \: \in \: \RB^{\din \times 1}. +\end{equation*} +Since $\z_{:j} = \X \w_{j:}^T \in \RB^{b\times 1}$, we can show that $\frac{ \partial \, \z_{:j} }{ \partial \, \w_{j:}^T } = \X^T \in \RB^{\din \times b}$. +Thus +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \w_{j:}^T } + \: = \: \X^T \, \cdot \, \frac{ \partial \, Q }{ \partial \, \z_{:j} } + \: \in \: \RB^{\din \times 1}. +\end{equation*} +Note that $\w_{j:}^T $ is the $j$-th column of $\W^T \in \RB^{\din \times \dout}$. 
+It follows that +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \W^T } + \: = \: \Big[ \frac{ \partial \, Q }{ \partial \, \w_{1:}^T } , \; \cdots , \; \frac{ \partial \, Q }{ \partial \, \w_{\dout:}^T } \Big] + \: = \: \X^T \Big[\frac{ \partial \, Q }{ \partial \, \z_{:1} } , \; \cdots , \; + \frac{ \partial \, Q }{ \partial \, \z_{:\dout} } \Big] + \: = \: \underbrace{\X^T}_{\din \times b} \, \underbrace{ \frac{\partial \, Q }{ \partial \, \Z }}_{b\times \dout} + \: \in \: \RB^{\din \times \dout} . +\end{equation*} +Hence, +\begin{equation} \label{eq:grad_q_w} + \frac{ \partial \, Q }{ \partial \, \W } + \: = \: \bigg( \frac{ \partial \, Q }{ \partial \, \W^T } \bigg)^T + \: = \: \bigg( \frac{\partial \, Q }{ \partial \, \Z } \bigg)^T \X + \: \in \: \RB^{\dout \times \din}. +\end{equation} + + +\begin{figure}[!h] + \centering + \includegraphics[width=0.85\linewidth]{figures/differential.pdf} + \caption{Differential for one FC Layer.} + \label{fig:differential} +\end{figure} + + + + +We summarize the gradients flow in Figure~\ref{fig:differential}. +Given $ \frac{\partial \, Q }{ \partial \, \X' }$, we can compute first $ \frac{\partial \, Q }{ \partial \, \Z }$ according to \eqref{eq:grad_q_z} and then $ \frac{\partial \, Q }{ \partial \, \X }$ according to \eqref{eq:grad_q_x} and $ \frac{\partial \, Q }{ \partial \, \W }$ according to \eqref{eq:grad_q_w}. + + + +\section{Fully-Connected (FC) Neural Network} + +An FC neural network is composed of multiple FC layers. +Suppose the FC network has $L$ ($> 1$) layers; the $l$-th layer is parameterized by weight matrix $\W^{(l)}$. +The $l$-th layer takes matrix $\X^{(l)}$ as input, +computes $\Z^{(l)} = \X^{(l)} {\W^{(l)}}^T$, +and outputs $\X^{(l+1)} = \sigma (\Z^{(l)})$. +The dependence can be depicted as +\begin{small} +\begin{equation*} + \begin{array}{c} + \textsf{input} \: \longrightarrow \: \cdots \: \longrightarrow \\ + ~ + \end{array} + \underbrace{ + \left. 
+ \begin{array}{c} + \X^{(l)} \\ + \W^{(l)} \\ + \end{array} + \right\} + \: \longrightarrow \: + \Z^{(l)} + \: \longrightarrow \: + \X^{(l+1)} }_{\textsf{the } l\textsf{-th layer}} + \: \longrightarrow \: + \cdots + \: \longrightarrow \: + \X^{(L+1)} (\textsf{i.e.\ output}) + \: \longrightarrow \: + \textsf{loss}. +\end{equation*} +\end{small}% +Note that $\X^{(1)} , \cdots , \X^{(L+1)}$ all have $b$ rows, where $b$ is the batch size; +but they can have different numbers of columns.\footnote{In the case of MNIST hand-written digit classification, the inputs are $784$ ($=28\times 28$) dimensional vectors, and the outputs are $10$-dimensional vectors (for there are $10$ classes). +So $\X^{(1)}$ has $784$ columns, and $\X^{(L+1)}$ has $10$ columns.} + +The $L$-th layer is called the output layer. +The output of the $L$-th layer, denoted $\X^{(L+1)} \in \RB^{b\times m}$, is the prediction the neural network makes for the input $\X^{(1)}$. +Let $\Y \in \RB^{b\times m}$ be the labels of this batch of samples.\footnote{In the case of housing price prediction, the labels (housing price) are scalars, so $m=1$. In the case of hand-written digit classification, there are ten classes, and the labels are one-hot encoded ($10$-dimensional vectors); thus $m=10$.} +We need to define a loss function $Q$ that measures the difference between the prediction and the ground truth (labels). +For example, the loss function can be +\begin{equation*} + Q \left( \X^{(1)} , \Y ; \, \W^{(1)}, \cdots , \W^{(L)} \right) + \; = \; \frac{1}{2} \Big\| \X^{(L+1)} \, - \, \Y \Big\|_F^2 . +\end{equation*} +See Figure~\ref{fig:bp}(left) for the structure of the FC neural network (with $\Z$'s abbreviated). 
+ + + + +% \begin{wrapfigure}{r}{0.65\textwidth} +% \centering +% \includegraphics[width=0.3\textwidth]{figures/bp1.pdf}~~~ +% \includegraphics[width=0.3\textwidth]{figures/bp2.pdf} +% \caption{A} +% \label{fig:bp} +% \end{wrapfigure} + + +\begin{figure}[!h] + \centering + \includegraphics[width=0.35\linewidth]{figures/bp1.pdf}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + \includegraphics[width=0.35\linewidth]{figures/bp2.pdf} + \caption{The forward and backward pass for computing gradients.} + \label{fig:bp} +\end{figure} + + + + +\section{Computing Gradients via BackPropagation} + +Let $Q$ be the loss function parameterized by $\W^{(1)}, \W^{(2)}, \cdots , \W^{(L)}$, where $\W^{(l)}$ is the weight of the $l$-th layer. +We seek to minimize $Q$ w.r.t.\ to the weights, so we need the gradients: +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \W^{(1)} } , \; + \frac{ \partial \, Q }{ \partial \, \W^{(2)} } , \; \cdots , \; + \frac{ \partial \, Q }{ \partial \, \W^{(L)} } . +\end{equation*} +With the gradients, we can update the weights $\W^{(1)}, \W^{(2)}, \cdots , \W^{(L)}$, e.g., by (stochastic) gradient descent: +\begin{equation} \label{eq:sgd} + \W^{(l)} \: \longleftarrow \: \W^{(l)} - \alpha \, \frac{ \partial \, Q }{ \partial \, \W^{(l)} }, + \qquad \textrm{for } l=1, \cdots , L, +\end{equation} +where $\alpha$ ($>0$) is called step size or learning rate. +See my lecture note {\it Logistic Regression} for gradient-based algorithms. + + +Since $Q$ is complicated, directly computing $\frac{ \partial \, Q }{ \partial \, \W^{(l)} } $ for any layer is difficult. +The best practice is BackPropagation, that is, using the chain rule to let gradients flow from the top to the bottom. +See the illustration in Figure~\ref{fig:bp}(right). +\begin{itemize} + \item + The loss function is a simple function of the output, $\X^{(L+1)}$. + So it is easy to compute the derivative $\frac{\partial \, Q}{ \partial \, \X^{(L+1)}}$. 
+ \item + After knowing $\frac{\partial \, Q}{ \partial \, \X^{(L+1)}}$, we can use the equations (chain rule) in Section~\ref{sec:differential} to compute + $\frac{\partial \, Q}{ \partial \, \W^{(L)}}$ and $\frac{\partial \, Q}{ \partial \, \X^{(L)}}$. + We record $\frac{\partial \, Q}{ \partial \, \W^{(L)}}$ and pass $\frac{\partial \, Q}{ \partial \, \X^{(L)}}$ to the $(L-1)$-th layer. + \item + We repeat the above step from the $(L-1)$-th layer all the way down to the first (bottom) layer. +\end{itemize} +In this way, we obtain $\frac{ \partial \, Q }{ \partial \, \W^{(L)} } $, $\cdots$, $\frac{ \partial \, Q }{ \partial \, \W^{(2)} } $, $\frac{ \partial \, Q }{ \partial \, \W^{(1)} } $ one by one. +We will use them to update $\W^{(1)}, \cdots , \W^{(L)}$, e.g., according to \eqref{eq:sgd}. + + + + +\section{Expressing Convolution as Matrix Multiplication} + +The rest of this paper extends what we have done for FC layers to convolutional layers. +In this section, we express convolution as matrix multiplication to reveal the connection between FC layers and convolutional layers. +In the next section, we will derive the gradients for convolution. +Differentiation for a convolutional layer is the same as for an FC layer except for the \textsf{unfold} and \textsf{fold} operations; +understanding \textsf{unfold} and \textsf{fold} will be the key to comprehending this and the next section. + + +\paragraph{Tensor convolution.} +Let $\T $ be a $d_1 \times d_2 \times d_3$ input tensor and $\K$ be a $k_1 \times k_2 \times d_3$ kernel (aka filter) tensor. +The convolution $\T * \K $ outputs a $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ matrix, denoted $\C$. +\begin{itemize} + \item + What decides the output shape of convolution? It is the number of $k_1 \times k_2 \times d_3$ patches in $\T$. + Tensor $\T$ has $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ such patches. + In Figure~\ref{fig:unfold}, since $d_1=4$, $d_2=3$, and $k_1=k_2=2$, there are in total $3\times 2 = 6$ patches. 
+ \item + What are the entries of matrix $\C$? + Let scalar $c_{ij} \in \RB$ be the $(i,j)$-th entry of $\C$ and tensor $\PP_{ij} \in \RB^{k_1 \times k_2 \times d_3}$ be the $(i,j)$-th patch of $\T$. + Then + \begin{equation}\label{eq:conv1} + c_{ij} \: = \: \big\langle \K , \, \PP_{ij} \big\rangle + \: = \: \big\langle \vect (\K) , \, \vect (\PP_{ij} ) \big\rangle . + \end{equation} + Here, $\vect (\K) $ means reshaping tensor $\K$ to a $k_1 k_2 d_3 \times 1$ vector, and $\langle \cdot , \cdot \rangle$ denotes vector/matrix/tensor inner product. +\end{itemize} + + +\paragraph{Unfolding.} +Based on the above discussions, we know that the $d_1 \times d_2 \times d_3$ tensor $\T $ can be converted to $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ patches; +each patch is a $k_1 \times k_2 \times d_3$ tensor (the same to the kernel $\K$). +Converting the order-3 tensor $\T$ to the order-5 tensor $\overline{\X}$ is called \textsf{unfolding}. +The shape of $\overline{\X}$ is +\begin{equation*} + (d_1 - k_1 + 1) \times (d_2 - k_2 + 1) \times k_1 \times k_2 \times d_3; +\end{equation*} +$\overline{\X}$ consists of the patches $\{\PP_{ij}\}$: +\begin{equation*} + \overline{\X} \, [i, \, j, \, :, \, :, \, :] \: = \: \PP_{ij} \: \in \: \RB^{k_1 \times k_2 \times d_3}. +\end{equation*} +Note that $\textsf{unfold}$ is supported by software systems like PyTorch. +Then, we \textsf{reshape} the order-5 tensor $\overline{\X}$ to the +\begin{equation*} + (d_1 - k_1 + 1) (d_2 - k_2 + 1) \; \times \; (k_1 k_2 d_3 ) +\end{equation*} +matrix, denote $\X$. +In sum, the procedure is +\begin{equation*} + \T \textsf{ (order-3 tensor)} + \: \xrightarrow{\textsf{unfold}} \: + \overline{\X} \textsf{ (order-5 tensor)} + \: \xrightarrow{\textsf{reshape}} \: + \X \textsf{ (matrix)} . +\end{equation*} +Figure~\ref{fig:unfold} illustrates this procedure. + +\paragraph{Convolution as matrix-vector multiplication.} +Let matrix $\X$ be the outcome after unfolding and reshaping. 
+Let $\w = \vect (\K) \in \RB^{k_1 k_2 d_3 \times 1}$ be the vectorization of $\K \in \RB^{k_1 \times k_2 \times d_3}$. +Then, compute the vector +\begin{equation} \label{eq:conv_multiply} + \z \: = \: \X \, \cdot \, \vect (\K) + \: \in \: \RB^{ (d_1 - k_1 + 1) (d_2 - k_2 + 1) \times 1} . +\end{equation} +Recall that the $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1)$ matrix $\C$ is the outcome of convolution. +By comparing \eqref{eq:conv1} and \eqref{eq:conv_multiply}, we find that +\begin{equation*} + \z \: = \: \vect (\C ) + \qquad \textrm{and} \qquad + \C \: = \: \textsf{reshape} \Big( \z , \, \big( (d_1 - k_1 + 1), (d_2 - k_2 + 1) \big) \Big) . +\end{equation*} +To summarize, the forward pass of tensor convolution can be expressed in the following two equivalent forms: +\begin{equation*} + \left. + \begin{array}{c} + \T \textsf{ (input tensor)} \\ + \K \textsf{ (kernel tensor)} \\ + \end{array} + \right\} + \: \xrightarrow{\textsf{convolution}} \: + \C \textsf{ (output matrix)} +\end{equation*} +and +\begin{equation} \label{eq:conv2} + \left. + \begin{array}{r c} + \T \: \xrightarrow{\textsf{unfold}} \: \overline{\X} \: \xrightarrow{\textsf{reshape}} & \X \\ + \K \: \xrightarrow{\textsf{vectorize}} & \w \\ + \end{array} + \right\} + \: \xrightarrow{\textsf{multiply}} \: + \z + \: \xrightarrow{\textsf{reshape}} \: + \C . +\end{equation} +In the next section, we will use the latter to perform backpropagation. + + + +\begin{figure}[!h] + \centering + \includegraphics[width=0.7\linewidth]{figures/unfold.pdf} + \caption{Illustrating patching and unfolding. Here, $d_1=4$, $d_2=3$, $d_3=1$, and $k_1=k_2=2$. + Thus, there are $(d_1-k_1+1)\times (d_2-k_2+1) = 6$ patches, and each patch is $k_1\times k_2 = 2\times 2$.} + \label{fig:unfold} +\end{figure} + +\section{Differentiation for Convolution} + + +By representing convolution as the procedure \eqref{eq:conv2}, the backpropagation will be easier to derive. 
+During the backpropagation, we receive $\frac{\partial \, Q}{\partial \, \C}$ and propagate it to $\K$ and $\T$. +The backpropagation has the following steps: +\begin{equation} \label{eq:conv_bp} + \left. + \begin{array}{r c} + \frac{\partial \, Q}{\partial \, \T} \: \xleftarrow{\textsf{fold}} \: \frac{\partial \, Q}{\partial \, \overline{\X}} \: \xleftarrow{\textsf{reshape}} & \frac{\partial \, Q}{\partial \, \X} \\ + \frac{\partial \, Q}{\partial \, \K} \: \xleftarrow{\textsf{reshape}} & \frac{\partial \, Q}{\partial \, \w} \\ + \end{array} + \right\} + \: \xleftarrow{\textsf{multiply}} \: + \frac{\partial \, Q}{\partial \, \z} + \: \xleftarrow{\textsf{vectorize}} \: + \frac{\partial \, Q}{\partial \, \C} . +\end{equation} +We describe the steps one by one. + + +\paragraph{From $\C$ to $\z$.} +This step is almost trivial. +Since $\z = \vect (\C)$, we have +\begin{equation*} + \frac{\partial \, Q}{\partial \, \z} + \: = \: \vect \left( \frac{\partial \, Q}{\partial \, \C} \right) + \: \in \: \RB^{(d_1-k_1+1)(d_2-k_2+1) \times 1} . +\end{equation*} +It just performs a vectorization. + + + +\paragraph{From $\z$ to $\X$ and $\w$.} +Recall from \eqref{eq:conv_multiply} that $\z$ is computed by the matrix-vector multiplication: $\z= \X \w$. +Once we have $\frac{ \partial \, Q}{\partial \, \z}$, we can propagate it to $\X$ and $\w$ by +\begin{align*} + & \underbrace{\frac{ \partial \, Q }{ \partial \, \X } }_{(d_1 - k_1 + 1) (d_2 - k_2 + 1) \times (k_1 k_2 d_3)} + \: = \: \underbrace{\frac{ \partial \, Q }{ \partial \, \z } }_{(d_1 - k_1 + 1) (d_2 - k_2 + 1) \times 1} + \underbrace{\w^T }_{ 1 \times (k_1 k_2 d_3)} ,\\ + & \underbrace{ \frac{ \partial \, Q }{ \partial \, \w } }_{ (k_1 k_2 d_3) \times 1} + \: = \:\underbrace{\X^T }_{(k_1 k_2 d_3) \times (d_1 - k_1 + 1) (d_2 - k_2 + 1)} + \underbrace{\frac{ \partial \, Q }{ \partial \, \z } }_{(d_1 - k_1 + 1) (d_2 - k_2 + 1) \times 1} . 
+\end{align*} +The above equations follow from \eqref{eq:grad_q_x} and \eqref{eq:grad_q_w}. + + +\paragraph{From $\w$ to $\K$.} +Since $\w = \vect (\K) $, the $k_1\times k_2 \times d_3$ tensor $\K$ can be obtained by \textsf{reshaping} $\w$ to an order-3 tensor. +Thus, +\begin{equation*} + \frac{ \partial \, Q }{ \partial \, \K } \:= \: + \textsf{reshape} \left( \frac{\partial \, Q}{\partial \, \w} \; , \Big( k_1, \, \, k_2 , \, d_3 \Big) \right) . +\end{equation*} + +\paragraph{From $\X$ to $\T$.} +As illustrated in Figure~\ref{fig:unfold}, $\X$ is obtained by \textsf{unfolding} $\T$, +and one entry of $\T$ is copied to multiple (at most $k_1k_2$) entries of $\X$. +For example, in Figure~\ref{fig:unfold}, the \textcolor{OliveGreen}{green ``A''} entry is copied from $t_{2,3,1}$ to both $x_{4,4}$ and $x_{5,3}$.\footnote{We let $x_{ij} = \X[i,j]$ denote the $(i,j)$-th entry of $\X$.} +Thus +\begin{equation} \label{eq:t_x_1} + \textcolor{OliveGreen}{t_{2,3,1}} + \: = \: + \textcolor{OliveGreen}{x_{4,4}} + \: = \: + \textcolor{OliveGreen}{x_{5,3}} . +\end{equation} +So the $(2,3,1)$-th entry of $\T$ influences $Q$ via only two entries of $\X$, and thus +\begin{equation*} + \frac{\partial \, x_{ij}}{\partial \, t_{2,3,1}} + \: = \: + \left\{ + \begin{array}{c l} + 1 & \textrm{if } (i,j)=(4,4) \textrm{ or } (5,3); \\ + 0 & \textrm{otherwise}. \\ + \end{array} + \right. +\end{equation*} +It follows that +\begin{equation} \label{eq:t_x_2} + \bigg[\frac{\partial \, Q}{\partial \, \T} \bigg]_{2, 3, 1} + \: = \: \sum_{i, j} \frac{\partial \, x_{ij}}{\partial \, t_{2,3,1}} \, \frac{\partial \, Q}{\partial \, x_{ij}} + \: = \: \frac{\partial \, Q}{\partial \, x_{4,4}} + \frac{\partial \, Q}{\partial \, x_{5,3}} + \: = \: \bigg[\frac{\partial \, Q}{\partial \, \X} \bigg]_{4, 4} \, + \,\bigg[ \frac{\partial \, Q}{\partial \, \X} \bigg]_{5, 3} . 
+\end{equation} +Deep learning platforms like PyTorch provide the ``\textsf{fold}'' operation for aggregating the entries of $\frac{\partial \, Q}{\partial \, \X} $ to get $\frac{\partial \, Q}{\partial \, \T}$.\footnote{The aggregation is according to the way the entries of $\T$ are copied to $\X$, e.g., $t_{2,3,1}$ is copied to $x_{4,4}$ and $x_{5,3}$.} +First, \textsf{reshape} $\frac{\partial \, Q}{\partial \, \X} $ to the $(d_1 - k_1 + 1) \times (d_2 - k_2 + 1) \times k_1 \times k_2 \times d_3$ order-5 tensor: +\begin{equation*} + \frac{\partial \, Q}{\partial \, \overline{\X}} \:= \: \textsf{reshape} \left( \frac{\partial \, Q}{\partial \, \X} \; , \Big( (d_1 - k_1 + 1) , \, (d_2 - k_2 + 1) , \, k_1 , \, k_2 , \, d_3 \Big) \right) . +\end{equation*} +Then, apply ``\textsf{fold}'' to get the $d_1\times d_2 \times d_3$ order-3 tensor $\frac{\partial \, Q}{\partial \, \T}$: +\begin{equation*} + \frac{\partial \, Q}{\partial \, \T} + \: = \: \textsf{fold} \left( \frac{\partial \, Q}{\partial \, \overline{\X}} \right). +\end{equation*} +To summarize, the forward pass from $\T$ to $\X$ and the backward pass from $\X$ to $\T$ are +\begin{equation*} + \T \: \xrightarrow{\textsf{unfold}} \: \overline{\X} \: \xrightarrow{\textsf{reshape}} \: \X + \qquad \textrm{and} \qquad + \frac{\partial \, Q}{\partial \, \T} + \: \xleftarrow{\textsf{~fold~}} \: + \frac{\partial \, Q}{\partial \, \overline{\X}} + \: \xleftarrow{\textsf{reshape}} \: + \frac{\partial \, Q}{\partial \, \X} . 
+\end{equation*} + + + + + + + + +\end{document} diff --git a/LectureNotes/BP/figures/bp.pptx b/LectureNotes/BP/figures/bp.pptx new file mode 100644 index 0000000..60008dd Binary files /dev/null and b/LectureNotes/BP/figures/bp.pptx differ diff --git a/LectureNotes/BP/figures/bp1.pdf b/LectureNotes/BP/figures/bp1.pdf new file mode 100644 index 0000000..1eade8f Binary files /dev/null and b/LectureNotes/BP/figures/bp1.pdf differ diff --git a/LectureNotes/BP/figures/bp2.pdf b/LectureNotes/BP/figures/bp2.pdf new file mode 100644 index 0000000..b8462ca Binary files /dev/null and b/LectureNotes/BP/figures/bp2.pdf differ diff --git a/LectureNotes/BP/figures/differential.pdf b/LectureNotes/BP/figures/differential.pdf new file mode 100644 index 0000000..81d1d49 Binary files /dev/null and b/LectureNotes/BP/figures/differential.pdf differ diff --git a/LectureNotes/BP/figures/differential.pptx b/LectureNotes/BP/figures/differential.pptx new file mode 100644 index 0000000..93bfaa4 Binary files /dev/null and b/LectureNotes/BP/figures/differential.pptx differ diff --git a/LectureNotes/BP/figures/unfold.pdf b/LectureNotes/BP/figures/unfold.pdf new file mode 100644 index 0000000..5cb5554 Binary files /dev/null and b/LectureNotes/BP/figures/unfold.pdf differ diff --git a/LectureNotes/BP/figures/unfold.xlsx b/LectureNotes/BP/figures/unfold.xlsx new file mode 100644 index 0000000..59e3481 Binary files /dev/null and b/LectureNotes/BP/figures/unfold.xlsx differ diff --git a/LectureNotes/DRL/DRL.pdf b/LectureNotes/DRL/DRL.pdf index b24e209..ad4a1f4 100644 Binary files a/LectureNotes/DRL/DRL.pdf and b/LectureNotes/DRL/DRL.pdf differ diff --git a/LectureNotes/DRL/DRL.tex b/LectureNotes/DRL/DRL.tex new file mode 100644 index 0000000..acb156c --- /dev/null +++ b/LectureNotes/DRL/DRL.tex @@ -0,0 +1,700 @@ +\documentclass[11pt]{article} +\usepackage{amsmath,amssymb,amsmath,amsthm,amsfonts} +\usepackage{latexsym,graphicx} +\usepackage{fullpage,color} +\usepackage{url} 
+\usepackage[pdftex,bookmarks,colorlinks=true,citecolor=blue]{hyperref} +\usepackage[numbers]{natbib} +\usepackage{graphicx,subfigure} +\usepackage{algorithm} +\usepackage{algorithmic} +\usepackage{listings} +\usepackage{xcolor} +\usepackage{color} + +\numberwithin{equation}{section} + +\pagestyle{plain} + +\setlength{\oddsidemargin}{0in} +\setlength{\topmargin}{0in} +\setlength{\textwidth}{6.5in} +\setlength{\textheight}{8.5in} + +\newtheorem{fact}{Fact}[section] +\newtheorem{question}{Question}[section] +\newtheorem{lemma}{Lemma}[section] +\newtheorem{theorem}[lemma]{Theorem} +\newtheorem{assumption}[lemma]{Assumption} +\newtheorem{corollary}[lemma]{Corollary} +\newtheorem{prop}[lemma]{Proposition} +\newtheorem{claim}{Claim}[section] +\newtheorem{remark}{Remark}[section] +\newtheorem{definition}{Definition}[section] +\newtheorem{prob}{Problem}[section] +\newtheorem{conjecture}{Conjecture}[section] +\newtheorem{property}{Property}[section] + +\def\A{{\bf A}} +\def\a{{\bf a}} +\def\B{{\bf B}} +\def\bb{{\bf b}} +\def\C{{\bf C}} +\def\c{{\bf c}} +\def\D{{\bf D}} +\def\d{{\bf d}} +\def\E{{\bf E}} +\def\e{{\bf e}} +\def\F{{\bf F}} +\def\f{{\bf f}} +\def\g{{\bf g}} +\def\h{{\bf h}} +\def\G{{\bf G}} +\def\H{{\bf H}} +\def\I{{\bf I}} +\def\K{{\bf K}} +\def\k{{\bf k}} +\def\LL{{\bf L}} +\def\M{{\bf M}} +\def\m{{\bf m}} +\def\N{{\bf N}} +\def\n{{\bf n}} +\def\PP{{\bf P}} +\def\pp{{\bf p}} +\def\Q{{\bf Q}} +\def\q{{\bf q}} +\def\R{{\bf R}} +\def\rr{{\bf r}} +\def\S{{\bf S}} +\def\s{{\bf s}} +\def\T{{\bf T}} +\def\tt{{\bf t}} +\def\U{{\bf U}} +\def\u{{\bf u}} +\def\V{{\bf V}} +\def\v{{\bf v}} +\def\W{{\bf W}} +\def\w{{\bf w}} +\def\X{{\bf X}} +\def\x{{\bf x}} +\def\Y{{\bf Y}} +\def\y{{\bf y}} +\def\Z{{\bf Z}} +\def\z{{\bf z}} +\def\0{{\bf 0}} +\def\1{{\bf 1}} + + + +\def\AM{{\mathcal A}} +\def\CM{{\mathcal C}} +\def\DM{{\mathcal D}} +\def\EM{{\mathcal E}} +\def\GM{{\mathcal G}} +\def\FM{{\mathcal F}} +\def\IM{{\mathcal I}} +\def\JM{{\mathcal J}} +\def\KM{{\mathcal K}} 
+\def\LM{{\mathcal L}} +\def\NM{{\mathcal N}} +\def\OM{{\mathcal O}} +\def\PM{{\mathcal P}} +\def\SM{{\mathcal S}} +\def\TM{{\mathcal T}} +\def\UM{{\mathcal U}} +\def\VM{{\mathcal V}} +\def\WM{{\mathcal W}} +\def\XM{{\mathcal X}} +\def\YM{{\mathcal Y}} +\def\RB{{\mathbb R}} +\def\RBmn{{\RB^{m\times n}}} +\def\EB{{\mathbb E}} +\def\PB{{\mathbb P}} + +\def\TX{\tilde{\bf X}} +\def\TA{\tilde{\bf A}} +\def\tx{\tilde{\bf x}} +\def\ty{\tilde{\bf y}} +\def\TZ{\tilde{\bf Z}} +\def\tz{\tilde{\bf z}} +\def\hd{\hat{d}} +\def\HD{\hat{\bf D}} +\def\hx{\hat{\bf x}} +\def\nysA{{\tilde{\A}_c^{\textrm{nys}}}} + +\def\alp{\mbox{\boldmath$\alpha$\unboldmath}} +\def\bet{\mbox{\boldmath$\beta$\unboldmath}} +\def\epsi{\mbox{\boldmath$\epsilon$\unboldmath}} +\def\etab{\mbox{\boldmath$\eta$\unboldmath}} +\def\ph{\mbox{\boldmath$\phi$\unboldmath}} +\def\pii{\mbox{\boldmath$\pi$\unboldmath}} +\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}} +\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}} +\def\ps{\mbox{\boldmath$\psi$\unboldmath}} +\def\tha{\mbox{\boldmath$\theta$\unboldmath}} +\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}} +\def\muu{\mbox{\boldmath$\mu$\unboldmath}} +\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}} +\def\si{\mbox{\boldmath$\sigma$\unboldmath}} +\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}} +\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}} +\def\De{\mbox{\boldmath$\Delta$\unboldmath}} +\def\de{\mbox{\boldmath$\delta$\unboldmath}} +\def\Ome{\mbox{\boldmath$\Omega$\unboldmath}} +\def\Pii{\mbox{\boldmath$\Pi$\unboldmath}} +\def\varepsi{\mbox{\boldmath$\varepsilon$\unboldmath}} +\newcommand{\ti}[1]{\tilde{#1}} +\def\Ncal{\mathcal{N}} +\def\argmax{\mathop{\rm argmax}} +\def\argmin{\mathop{\rm argmin}} + +\def\ALG{{\AM_{\textrm{col}}}} + +\def\mean{\mathsf{mean}} +\def\std{\mathsf{std}} +\def\bias{\mathsf{bias}} +\def\var{\mathsf{var}} +\def\sgn{\mathsf{sgn}} +\def\tr{\mathsf{tr}} +\def\rk{\mathrm{rank}} +\def\nnz{\mathsf{nnz}} +\def\poly{\mathrm{poly}} +\def\diag{\mathsf{diag}} 
+\def\Diag{\mathsf{Diag}} +\def\const{\mathrm{Const}} +\def\st{\mathsf{s.t.}} +\def\vect{\mathsf{vec}} +\def\sech{\mathrm{sech}} +\def\sigmoid{\mathsf{sigmoid}} + +\newcommand{\red}[1]{{\color{red}#1}} + + + +\def\argmax{\mathop{\rm argmax}} +\def\argmin{\mathop{\rm argmin}} + +\newenvironment{note}[1]{\medskip\noindent \textbf{#1:}}% + {\medskip} + + +\newcommand{\etal}{{\em et al.}\ } +\newcommand{\assign}{\leftarrow} +\newcommand{\eps}{\epsilon} + +\newcommand{\opt}{\textrm{\sc OPT}} +\newcommand{\script}[1]{\mathcal{#1}} +\newcommand{\ceil}[1]{\lceil #1 \rceil} +\newcommand{\floor}[1]{\lfloor #1 \rfloor} + + + +\lstset{ % +extendedchars=false, % Shutdown no-ASCII compatible +language=Python, % choose the language of the code +xleftmargin=1em, +xrightmargin=1em, +basicstyle=\footnotesize, % the size of the fonts that are used for the code +tabsize=3, % sets default tabsize to 3 spaces +numbers=left, % where to put the line-numbers +numberstyle=\tiny, % the size of the fonts that are used for the line-numbers +stepnumber=1, % the step between two line-numbers. If it's 1 each line + % will be numbered +numbersep=5pt, % how far the line-numbers are from the code % +keywordstyle=\color[rgb]{0,0,1}, % keywords +commentstyle=\color[rgb]{0.133,0.545,0.133}, % comments +stringstyle=\color[rgb]{0.627,0.126,0.941}, % strings +backgroundcolor=\color{white}, % choose the background color. 
You must add \usepackage{color} +showspaces=false, % show spaces adding particular underscores +showstringspaces=false, % underline spaces within strings +showtabs=false, % show tabs within strings adding particular underscores +frame=single, % adds a frame around the code +%captionpos=b, % sets the caption-position to bottom +breaklines=true, % sets automatic line breaking +breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace +%title=\lstname, % show the filename of files included with \lstinputlisting; +% % also try caption instead of title +mathescape=true,escapechar=? % escape to latex with ?..? +escapeinside={\%*}{*)}, % if you want to add a comment within your code +%columns=fixed, % nice spacing +%morestring=[m]', % strings +%morekeywords={%,...},% % if you want to add more keywords to the set +% break,case,catch,continue,elseif,else,end,for,function,global,% +% if,otherwise,persistent,return,switch,try,while,...},% +} + + +\begin{document} + +%\setlength{\fboxrule}{.5mm}\setlength{\fboxsep}{1.2mm} +%\newlength{\boxlength}\setlength{\boxlength}{\textwidth} +%\addtolength{\boxlength}{-4mm} + + +\title{Deep Reinforcement Learning} + +\author{\textbf{Shusen Wang} \\ Stevens Institute of Technology} + +%\date{ } + +\maketitle + +\begin{abstract} +This lecture note briefly summarizes three kinds of deep reinforcement learning approaches: value-based methods, policy-based methods, and actor-critic methods. +This note is structured as follows. +First, reinforcement learning terminologies are defined. +Second, we study Deep Q Network (DQN), a family of value-based methods, and train DQN using temporal difference (TD) learning. +Third, we study policy-based learning and derive policy gradient algorithms. +Last, we study standard (random) actor-critic method and deterministic actor-critic method. 
+\end{abstract} + + + +\section{Notation} + +Throughout, we use uppercase letters, e.g., $X$, to denote random variables and lowercase letters, e.g., $x$, to denote their observations. +Let $\PB (X = x)$ be the probability of the event ``$X = x$''. +Let $\PB (Y=y | X=x)$ be the probability of the event ``$Y=y$'' under the condition ``$X=x$''. + +\paragraph{Agent:} +A system that is embedded in an environment and takes actions to change the state of the environment. Examples include robots, industrial controllers, and Mario in the game Super Mario. + + +\paragraph{State ($S$):} +State can be viewed as a summary of the history of the system that determines its future evolution. +State space $\SM$ is the set that contains all the possible states. +At time step $t$, the past states are observed and we thus know their values: $s_1, \cdots , s_t$; +however, the future states $S_{t+1}, S_{t+2}, \cdots $ are unobserved random variables. + + +\paragraph{Action ($A$):} +The agent's decision based on the state and other considerations. +Action space $\AM$ is the set that contains all the actions. +Action space can be a discrete set such as $\{\textrm{``left''}, \textrm{``right''}, \textrm{``up''} \}$ or a continuous set such as $[0, 1] \times [-90, 90]$. +At time step $t$, the past actions are observed: $a_1, \cdots , a_t$, but the future actions $A_{t+1}, A_{t+2}, \cdots$ are unobserved random variables. + + +\paragraph{Reward ($R$):} +Reward is a value received by the agent from the environment as a direct response to the agent's actions. +At time step $t$, all the past rewards are observed: $r_1, r_2, \cdots , r_t$. +However, the future reward $R_i$ (for $i > t$) is unobserved, and it depends on the random variables $S_i$ and $A_i$. +Thus, at time step $t$, the future rewards $R_{t+1}, R_{t+2} , \cdots$ are random variables. + + + + +\paragraph{Policy function ($\pi $):} +The decision-making function of the agent. 
+Policy is the probability density function (PDF): $\pi (a | s) = \PB ( A = a | S = s )$. +The policy function maps the observed state $S=s$ to a probability distribution over all the actions in set $\AM$. +Since $\pi$ is a PDF, $\sum_{a \in \AM } \pi (a | s) = 1$. +The agent will perform action $a$ with probability $\pi (a | s)$, for all $a \in \AM$. +See the illustration in Figure~\ref{fig:random}. + + + +\paragraph{State transition ($p $):} +Given the current state $S=s$, the agent's action $A=a$ will lead to the new state $S'$ given by the environment. +State-transition function is the probability density function (PDF) $p (s' | s, a) = \PB ( S' = s' | S = s , A = a )$. +The environment makes $s'$ the new state with probability $p (s' | s, a)$, for all $s' \in \SM$. + + + + + + +\paragraph{Trajectory:} +The agent's interaction with the environment results in a sequence of (state, action, reward) triplets: +$s_1, a_1, r_1, s_2, a_2, r_2, s_3, a_3, r_3, \cdots$ + +\begin{figure}[!t] + \centering + \includegraphics[width=0.5\linewidth]{figures/randomness.pdf} + \caption{Illustration of the randomness. + The action $A$ is randomly sampled according to the policy function. + The new state $S'$ is randomly sampled according to the state-transition function. + } + \label{fig:random} +\end{figure} + + + + +\paragraph{Return ($U$):} +Return (aka cumulative future reward) is defined as +\begin{equation*} + U_t = R_t + R_{t+1} + R_{t+2} + R_{t+3} + \cdots +\end{equation*} +Discounted return (aka cumulative discounted future reward) is defined as +\begin{equation*} + U_t = R_t + \gamma \cdot R_{t+1} + \gamma^2 \cdot R_{t+2} + \gamma^3 \cdot R_{t+3} + \cdots +\end{equation*} +Here, $\gamma \in (0, 1)$ is the discount rate. +The return $U_t$ is random because the future rewards $R_t, R_{t+1}, R_{t+2}, \cdots $ are unobserved random variables. +Recall that the randomness in the $R_i$ ($i \geq t$) comes from the future states $S_i$ and action $A_{i} $. 
+ + + +\paragraph{Action-value function ($Q_{\pi}$):} +Action-value function $Q_{\pi} (s_t, a_t)$ measures, given state $s_t$ and policy $\pi$, how good the action $a_t$ is. +Formally speaking, +\begin{equation*} + Q_{\pi} (s_t, a_t) + \: = \: \EB \big[ U_t \, \big| \, S_t = s_t , A_t = a_t \big] . +\end{equation*} +The expectation is taken w.r.t.\ the future actions $A_{t+1}, A_{t+2}, \cdots $ and future states $S_{t+1}, S_{t+2}, \cdots $ which are random variables. +Note that $Q_{\pi} (s_t, a_t)$ depends on the policy function $\pi$ and the state-transition function $p$. + + +\paragraph{Optimal action-value function ($Q^\star $):} +The optimal action-value function $Q^\star (s_t, a_t)$ measures how good the action $a_t$ is at state $s_t$. +Formally speaking, +\begin{equation*} + Q^\star (s, a) + \: = \: \max_{\pi } Q_{\pi} (s, a). +\end{equation*} +Note that $Q^\star (s, a)$ is independent of the policy function $\pi$. + + + +\paragraph{State-value function ($V_{\pi} $):} +State-value function $V_{\pi} (s_t)$ measures, given $\pi$, how good the current situation $s_t$ is. +Formally speaking, +\begin{equation*} + V_{\pi} (s_t ) + \: = \: \EB_{A\sim \pi (\cdot | s_t )} \big[ Q_{\pi} (s_t, A) \big] + \: = \: \int_{\AM } \pi (a | s_t ) \cdot Q_{\pi} (s_t, a) \: d \, a . +\end{equation*} +Here, the action $A$ is treated as a random variable and integrated out. + + + + +\paragraph{Optimal state-value function ($V^\star $):} +The optimal state-value function $V^\star (s_t)$ measures how good the current situation $s_t$ is. +Formally speaking, +\begin{equation*} + V^\star (s ) + \: = \: \max_{\pi } V_{\pi} (s). +\end{equation*} +Note that $V^\star (s)$ is independent of the policy function $\pi$. + + + +\section{Value-Based Deep Reinforcement Learning} + + +The optimal action-value function $Q^\star (s, a)$ can be used to control the agent: observing state $s_t$, the agent performs +\begin{equation*} + a_t \: = \: \argmax_{a \in \AM } Q^\star (s_t , a ) . 
+\end{equation*} +The optimal action-value function can be approximated by the neural network $Q (s, a; \w )$ where $\w$ captures the model parameters. +The neural network is called \textbf{Deep Q Network (DQN)}. + +There are different designs of network architecture. +Here, we consider the game Super Mario, in which the action space is discrete: $\AM = \{ \textrm{``left''} , \textrm{``right''}, \textrm{``up''}\}$. +DQN takes state $s_t$ (which can be a screenshot or several most recent screenshots) as input. +The architecture can be +\begin{equation*} + \texttt{State} + \: \Rightarrow \: + \texttt{Conv} + \: \Rightarrow \: + \texttt{Flatten} + \: \Rightarrow \: + \texttt{Dense} + \: \Rightarrow \: + \texttt{Values} . +\end{equation*} +In the Super Mario example, DQN outputs a 3-dimensional vector, e.g., $ [200, 100, 250]$, whose entries correspond to the three actions. +Then the action should be +\begin{equation*} + a_t \: = \: \argmax_{a} Q (s_t , a ; \w ). +\end{equation*} +Since $Q (s_t , \textrm{``up''} ; \w ) = 250$ is the biggest value among the three, +$a_t=$``up'' will be the selected action. + + + +DQN is typically trained using \textbf{temporal difference (TD) learning} \cite{sutton2008convergent,sutton2009fast} which allows for updating the model parameters every time a reward $R_t=r_t$ is observed. +By definition, $U_t = \sum_{i=t}^{\infty} \gamma^{i-t} \cdot R_i$. +Thus +\begin{equation*} + U_{t} \: = \: R_t + \gamma \cdot U_{t+1}. +\end{equation*} +TD learning makes use of the fact: +\begin{equation*} + Q_\pi (s_t, a_t) + \: = \: + \EB \big[U_{t} \, \big| \, s_t , a_t \big] + \: = \: \EB \big[ R_t + \gamma \cdot U_{t+1} \, \big| \, s_t , a_t \big] + \: = \: \EB \big[ R_t + \gamma \cdot Q_\pi (S_{t+1}, A_{t+1}) \, \big| \, s_t , a_t \big]. +\end{equation*} +Since $Q (s_t, a_t; \w) \approx \max_{\pi} \EB [U_t | s_t, a_t]$, we have +\begin{equation*} + Q (s_t, a_t; \w) \: \approx \: r_t + \gamma \cdot Q (s_{t+1}, a_{t+1}; \w) . 
+\end{equation*} +Before observing $R_t$, the expected return was +\[ +q_t \: = \: Q(s_t , a_t; \w) +\] +After observing $R_t=r_t$, the expected return is updated to +\[ +y_t \: = \: r_t + \gamma \cdot Q(s_{t+1} , a_{t+1}; \w), +\] +which is called \textbf{TD target}. +The \textbf{TD error} is $\delta_t = q_t - y_t$. +We seek to encourage a small TD error and thus define the loss: +\begin{equation*} + L_t \: = \: \frac{1}{2} \delta_t^2 \: = \: \frac{1}{2} \big[ Q (s_t , a_t ; \w ) - y_t \big]^2 . +\end{equation*} +Pretend $y_t$ is not a function of $\w$. +Then the gradient is +\begin{equation*} + \g_t + \: \triangleq \: \frac{\partial \, L_t }{\partial \, \w } \Big|_{\w=\w_t } + \: = \: \delta_t \cdot \frac{\partial \, Q (s_t , a_t ; \w ) }{\partial \, \w } \Big|_{\w=\w_t } . +\end{equation*} +The DQN can be updated by performing a gradient descent step: $\w_{t+1} \longleftarrow \w_{t} - \alpha \cdot \g_t $ where $\alpha$ is the learning rate. + + +\section{Policy-Based Deep Reinforcement Learning} \label{sec:policy} + + + +The policy function $\pi (a | s)$ can be used to control the agent: observing the state $S_t = s_t$, the agent randomly samples an action: +\begin{equation*} + a_t \: \sim \: \pi (\cdot | s_t ) . +\end{equation*} +The policy function can be approximated by the neural network $\pi (a | s; \tha )$ where $\tha$ captures the model parameters. +The neural network is called \textbf{policy network}. + + + + +There are different designs of network architecture. +Here, we also consider the game Super Mario, in which the action space is discrete: $\AM = \{ \textrm{``left''} , \textrm{``right''}, \textrm{``up''}\}$. +The policy network takes observed state $s$ (which can be a screenshot) as input. +The architecture can be +\begin{equation*} + \texttt{State} + \: \Rightarrow \: + \texttt{Conv} + \: \Rightarrow \: + \texttt{Flatten} + \: \Rightarrow \: + \texttt{Dense} + \: \Rightarrow \: + \texttt{Softmax} + \: \Rightarrow \: + \texttt{Probabilities} . 
+\end{equation*} +In the Super Mario example, the policy network outputs a 3-dimensional vector, e.g., $\pp = [0.2, 0.1, 0.7]$, whose entries correspond to the three actions. +Then the action will be randomly sampled: +\begin{equation*} + \PB \big( A = \textrm{``left''} \big) = 0.2, \qquad + \PB \big( A = \textrm{``right''} \big) = 0.1, \qquad + \PB \big( A = \textrm{``up''} \big) = 0.7. +\end{equation*} +All of the three actions may be selected. +If the random sampling is independently repeated 1000 times, then around 200 observations of $A$ are ``left'', around 100 are ``right'', and around 700 are ``up''. + + + + +The policy network can be learned using \textbf{policy gradient} algorithms. +If the actions are discrete, then the state-value function can be written as: +\begin{equation} \label{eq:state_value} + V_\pi (s ) \: = \: \sum_{a \in \AM} \pi (a | s) \cdot Q_\pi (s, a) . +\end{equation} +Policy-based learning uses the policy network $\pi (a | s; \tha )$ to approximate the policy function $\pi (a | s)$. +With the approximation of policy function, $V_\pi (s )$ is approximated by +\begin{equation*} + V (s ; \tha ) \: = \: \sum_{a \in \AM} \pi (a | s; \tha ) \cdot Q_\pi (s, a) . +\end{equation*} +Policy gradient is the derivative of $ V (s ; \tha )$ w.r.t.\ $\tha $ \cite{sutton2000policy}: +\begin{eqnarray*} + \frac{\partial \, V (s ; \tha )}{\partial \, \tha } + & = & \frac{\partial \, \sum_{a \in \AM} \pi (a | s; \tha ) \cdot Q_\pi (s, a) }{\partial \, \tha } \\ + & = & \sum_{a \in \AM} \frac{\partial \, \pi (a | s; \tha ) \cdot Q_\pi (s, a) }{\partial \, \tha } \\ + & = & \sum_{a \in \AM} Q_\pi (s, a) \cdot \frac{\partial \, \pi (a | s; \tha ) }{\partial \, \tha } \\ + & = & \sum_{a \in \AM} Q_\pi (s, a) \cdot \pi (a | s; \tha ) \cdot \frac{\partial \, \log \pi (a | s; \tha ) }{\partial \, \tha } . +\end{eqnarray*} +Here, the third identity follows from that $Q_\pi (s, a)$ does not depend on $\tha$;\footnote{This assumption is too strong. 
Since $Q_\pi$ depends on the policy function $\pi$, $Q_{\pi}$ can depend on $\tha$. Here, the assumption is used to simplify the derivation.} +the last identity follows from that $\frac{\partial \, \log f (x)}{\partial \, x} = \frac{1}{f (x)}\cdot \frac{\partial f(x)}{\partial \, x}$. +The above equation can be equivalently written as +\begin{equation} \label{eqn:policy_grad} + \frac{\partial \, V (s ; \tha )}{\partial \, \tha } + \: = \: \EB_{A \sim \pi (\cdot | s , \theta )} \bigg[ Q_\pi (s, A) \cdot \frac{\partial \, \log \pi (A | s; \tha ) }{\partial \, \tha } \bigg] . +\end{equation} +Recall that the approximate state-value function $V (s ; \tha )$ indicates how good the situation $s$ is if policy $\pi (a | s; \tha )$ is used. +We thereby have the motivation to update $\tha$ so that $V (s ; \tha )$ will increase (which means the situation is better). +Thus, the policy network can be updated by policy gradient ascent: +\begin{equation*} + \tha_{t+1} \: \longleftarrow \: \tha_t + \beta \cdot \frac{\partial \, V (s ; \tha )}{\partial \, \tha } \bigg|_{\theta = \theta_{t}} , +\end{equation*} +where $\beta$ is the learning rate. + +\begin{remark} +The derivation of policy gradient written in the above is not rigorous! +It is a simplified version to make the policy gradient easy to understand. +To be rigorous, we must take into account that $Q_\pi$ depends on the policy $\pi$ and is thereby a function of $\tha$. +However, even if $Q_\pi$'s dependence on $\tha$ is taken into account, the resulting policy gradient is the same as \eqref{eqn:policy_grad}. +\end{remark} + + +So far, we have defined the policy network and derived the policy gradient in \eqref{eqn:policy_grad}. +However, there are two unsolved problems. +First, the expectation in \eqref{eqn:policy_grad} may be intractable; this is typically the case when the action space $\AM$ is continuous, e.g., $\AM=[0, 1]$. +Second, the action-value $Q_\pi (s, a) $ is unknown. +We answer the two questions one by one. 
+ + +\textbf{What if the expectation in \eqref{eqn:policy_grad} is intractable?} +If the action space $\AM$ is continuous, then the expectation (which is an integration) is typically intractable. +Given state $S_t=s_t$, if the action $A_t = a_t$ is randomly sampled according to the PDF $\pi (\cdot | s_t ; \tha )$, then +\begin{equation*} + \tilde{\g}_t + \: = \: Q_\pi (s_t, a_t) \cdot \frac{\partial \, \log \pi (a_t | s_t; \tha )}{\partial \, \tha } +\end{equation*} +is an unbiased estimate of $\frac{\partial \, V (s_t ; \mathbf{\theta} )}{\partial \,\mathbf{ \theta} }$. +We can think of $\tilde{\g}_t$ as a stochastic gradient and update $\tha$ using stochastic gradient ascent. + + +\textbf{How do we know the action-value $Q_\pi (s, a) $?} +There can be two solutions: first, use the observed return $u_t$ instead of $Q_\pi (s, a) $; second, approximate $Q_\pi (s, a) $ using a neural network. +The two solutions are described in the following: +\begin{itemize} +\item + Play a game to the end, obtain all the rewards $r_1, r_2, \cdots , r_T$, and compute the returns $u_1, u_2, \cdots , u_T$ using the equation $u_t = \sum_{i=t}^T \gamma^{i-t} \cdot r_i$. + Since $Q_\pi (s_t, a_t) = \EB [U_t | s_t, a_t , \pi ]$, we can use $u_t$ to replace $Q_\pi (s_t, a_t) $. + In this way, the policy gradient \eqref{eqn:policy_grad} at time step $t$ becomes + \begin{equation*} + \frac{\partial \, V (s_t ; \tha )}{\partial \, \tha } + \: = \: \EB_{A \sim \pi (\cdot | s_t , \mathbf{\theta} )} \bigg[ u_t \cdot \frac{\partial \, \log \pi (A | s_t; \tha ) }{\partial \, \tha } \bigg] . + \end{equation*} + AlphaGo \cite{silver2016mastering} uses this approach. +\item + Use a value network to approximate $Q_\pi (s, a) $. + The value network provides supervision to the policy network. + The value network can be learned by temporal difference (TD). + This leads to the actor-critic method which is elaborated on in Section~\ref{sec:actor_critic_rand}. 
+\end{itemize} + + + + +\section{Actor-Critic Methods} \label{sec:actor_critic} + +Section~\ref{sec:actor_critic_rand} follows Section~\ref{sec:policy} and derives the standard (random) actor-critic method. +This approach is suitable for problems with discrete action space.\footnote{For example, Super Mario's action space $\{\textrm{``left''}, \textrm{``right''}, \textrm{``up''} \}$ is a discrete set.} +Section~\ref{sec:actor_critic_det} studies deterministic actor-critic method and learns it using deterministic policy gradient algorithm. +This method is very useful when the actions are continuous.\footnote{For example, a self-driving car's action can be two-dimensional vectors. The first dimension is the steering angle, and the second dimension is acceleration/deceleration. The action space is obviously continuous.} + + +\subsection{Random Actor-Critic Method} \label{sec:actor_critic_rand} + + +The actor-critic method has two neural networks. +Policy network $\pi (a | s; \tha ) $, which is called actor, approximates the policy function $\pi (a | s)$. +Value network $q (s, a; \w )$, which is called critic, approximates the action-value function $Q_\pi (s, a)$. +In this way, the state-value function $V_\pi (s )$ is approximated by +\begin{equation*} + V (s ; \w , \tha ) + \: = \: \EB_{A \sim \pi (\cdot | s; \theta )} \big[ q (s, A ; \w ) \big] + \: = \: \sum_{a\in \AM} \pi (a | s; \tha ) \cdot q (s, a ; \w ) . +\end{equation*} +It is not hard to show the policy gradient is +\begin{equation*} + \frac{\partial \, V (s ; \w , \tha ) }{ \partial \, \tha } + \: = \: \EB_{A \sim \pi (\cdot | s , \theta )} \bigg[ q (s, A; \w) \cdot \frac{\partial \, \log \pi (A | s; \tha ) }{\partial \, \tha } \bigg] . +\end{equation*} +The policy network will be updated using (stochastic) policy gradient ascent. +The value network can be updated using temporal difference (TD) learning. +The following summarizes one iteration of the algorithm. 
+\begin{enumerate} + \item + Observe state $s_t$, and then randomly sample action $a_t \sim \pi (\cdot | s_t ; \tha_t )$. + \item + Agent performs action $a_t$ and observes reward $r_t$ and new state $s_{t+1}$. + \item + Randomly sample action $a_{t+1} \sim \pi (\cdot | s_{t+1} ; \tha_t )$. (Agent does not perform action $a_{t+1}$.) + \item + Evaluate the value network and get $q_t = q (s_t , a_t ; \w_t )$ and $q_{t+1} = q (s_{t+1} , a_{t+1} ; \w_t )$. + \item + Compute the TD error: $\delta_t = q_t - (r_t + \gamma \cdot q_{t+1})$. + \item + Update the value network: $\w_{t+1} \longleftarrow \w_{t} - \alpha \cdot \delta_t \cdot \frac{ \partial \, q (s_t , a_t ; \w )}{\partial \, \w} \big|_{\w=\w_t }$. + \item + Update the policy network: $\tha_{t+1} \longleftarrow \tha_{t} + \beta \cdot q_t \cdot \frac{ \partial \, \log \pi ( a_t | s_t ; \theta )}{\partial \, \theta} \big|_{\theta =\theta_t }$.\footnote{In most papers and books, the update of the policy network is $\tha_{t+1} \longleftarrow \tha_{t} + \beta \cdot \delta_t \cdot \frac{ \partial \, \log \pi ( a_t | s_t ; \theta )}{\partial \, \theta} \big|_{\theta =\theta_t }$. +The difference is that $q_t$ is replaced by $\delta_t$. +Both approaches are correct. +The use of $\delta_t$ is the result of using a baseline which can reduce variance.} +\end{enumerate} +When learning the policy network (actor), the supervision is not from the rewards; instead, the supervision is from the critic's output $q_t = q (s_t , a_t ; \w_t )$. +The actor uses the critic's judgments to improve her performance. +When training the critic, the supervision is from the rewards. +The critic uses ground truth from the environment to make his judgment more accurate. + + + +\subsection{Deterministic Actor-Critic Method} \label{sec:actor_critic_det} + + +Throughout, the policy function is defined as the probability density function $\pi (a | s)$, and the action is randomly sampled according to $\pi$. 
+\textbf{Deterministic policy} is a function that maps states to actions: $\pi: \SM \mapsto \AM$, +where $\SM $ is the state space and $\AM$ is the action space. +Given the state $s$, the policy function deterministically outputs action $a = \pi (s)$. +Deterministic policy is very useful when the actions are continuous. + + + +Deterministic actor-critic method \cite{silver2014deterministic} has two networks: policy network $\pi (s; \tha )$ and value network $q (s, a ; \w)$; see Figure~\ref{fig:dpg}. +The agent is controlled by the policy network which deterministically maps state $s$ to action $a$. +The value network is used for providing the policy network with supervision. +The two networks can be trained in the following way. + +\begin{figure}[!h] + \centering + \includegraphics[width=0.8\linewidth]{figures/DPG.pdf} + \caption{Deterministic actor-critic method. + The deterministic policy network maps state $s \in \SM$ to action $a \in \AM \subset \RB^2$. + The two dimensions of $a$ are, for example, the steering angle and acceleration of a self-driving car. + The value network maps the pair $(s, a)$ to a scalar. + } + \label{fig:dpg} +\end{figure} + + + +\paragraph{The value network can be trained by temporal difference (TD) learning.} +Let $q_t = q (s_t , a_t ; \w_t )$ be the prediction and $y_t = r_t + \gamma \cdot q (s_{t+1} , a_{t+1} ; \w_t )$ be the TD target. +The TD error is $\delta_t = q_t - y_t$. +The model parameters $\w$ can be updated by $\w_{t+1}\longleftarrow \w_t - \alpha \cdot \delta_t \cdot \frac{\partial \, q (s_t , a_t ; \w )}{\partial \, \w}\big|_{\w=\w_t}$. + + + + + +\paragraph{Train the policy network by deterministic policy gradient (DPG)} which is totally different from the policy gradient we studied previously. +Note that the value network $q (s_t, a_t; \w)$ evaluates how good it is for the agent to perform action $a_t$ at state $s_t$. 
+The policy network has motivation to update its parameters $\tha$ so that the action $a_t = \pi (s_t ; \tha)$ will get a higher evaluation. +Intuitively speaking, the policy network (actor) wants to change herself so that the evaluation given by the value network (critic) will increase. +The derivative of the objective, i.e., $q (s_t, a_t; \w)$, w.r.t.\ the policy network's parameters $\tha$ is +\begin{equation*} + \g (\tha) + \: = \: \frac{\partial \, q (s_t , \pi (s_t ; \theta) ; \w )}{\partial \, \theta} + \: = \: \frac{\partial \, \pi (s_t ; \theta) }{\partial \, \theta} + \cdot \frac{\partial \, q (s_t , a; \w )}{\partial \, a} \bigg|_{a=\pi (s_t ; \theta) } , +\end{equation*} +where the second identity follows from the chain rule. +The policy network is updated by performing gradient ascent: $\tha_{t+1} \longleftarrow \tha_t + \beta \cdot \g (\tha_t )$. + + + + +\bibliographystyle{plainnat} +\bibliography{bib/rl} + + +\end{document} diff --git a/LectureNotes/DRL/bib/rl.bib b/LectureNotes/DRL/bib/rl.bib new file mode 100644 index 0000000..41fe04b --- /dev/null +++ b/LectureNotes/DRL/bib/rl.bib @@ -0,0 +1,47 @@ +@inproceedings{silver2014deterministic, + title={Deterministic Policy Gradient Algorithms}, + author={Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin}, + booktitle={International Conference on Machine Learning (ICML)}, + pages={387--395}, + year={2014} +} + + +@article{silver2016mastering, + title={Mastering the game of {Go} with deep neural networks and tree search}, + author={Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc}, + journal={nature}, + volume={529}, + number={7587}, + pages={484}, + year={2016} +} + + +@inproceedings{sutton2000policy, + title={Policy gradient methods for reinforcement learning with function 
approximation}, + author={Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay}, + booktitle={Advances in Neural Information Processing Systems (NIPS)}, + pages={1057--1063}, + year={2000} +} + + +@article{sutton2008convergent, + title={A convergent O (n) algorithm for off-policy temporal-difference learning with linear function approximation}, + author={Sutton, Richard S and Szepesv{\'a}ri, Csaba and Maei, Hamid Reza}, + journal={Advances in Neural Information Processing Systems (NIPS)}, + volume={21}, + number={21}, + pages={1609--1616}, + year={2008} +} + + +@inproceedings{sutton2009fast, + title={Fast gradient-descent methods for temporal-difference learning with linear function approximation}, + author={Sutton, Richard S and Maei, Hamid Reza and Precup, Doina and Bhatnagar, Shalabh and Silver, David and Szepesv{\'a}ri, Csaba and Wiewiora, Eric}, + booktitle={International Conference on Machine Learning (ICML)}, + pages={993--1000}, + year={2009} +} \ No newline at end of file diff --git a/LectureNotes/DRL/figures/DPG.pdf b/LectureNotes/DRL/figures/DPG.pdf new file mode 100644 index 0000000..a64f75f Binary files /dev/null and b/LectureNotes/DRL/figures/DPG.pdf differ diff --git a/LectureNotes/DRL/figures/DPG.pptx b/LectureNotes/DRL/figures/DPG.pptx new file mode 100644 index 0000000..98cfff3 Binary files /dev/null and b/LectureNotes/DRL/figures/DPG.pptx differ diff --git a/LectureNotes/DRL/figures/randomness.pdf b/LectureNotes/DRL/figures/randomness.pdf new file mode 100644 index 0000000..c3bb95c Binary files /dev/null and b/LectureNotes/DRL/figures/randomness.pdf differ diff --git a/LectureNotes/DRL/figures/randomness.pptx b/LectureNotes/DRL/figures/randomness.pptx new file mode 100644 index 0000000..22dfe16 Binary files /dev/null and b/LectureNotes/DRL/figures/randomness.pptx differ diff --git a/README.md b/README.md index df9f29e..079459d 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,61 @@ # CS583: Deep Learning + 1. 
**Machine learning basics.** This part briefly introduces the fundamental ML problems-- regression, classification, dimensionality reduction, and clustering-- and the traditional ML models and numerical algorithms for solving the problems. - * ML basics. + * ML basics [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_ML_Basics.pdf)] - [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_Models.pdf)] + [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/1_Models.pdf)]. - * Regression. + * Regression [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_1.pdf)] - [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_2.pdf)] + [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/2_Regression_2.pdf)]. * Classification. - - Logistic regression: + - Logistic regression [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_1.pdf)] - [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Logistic/paper/logistic.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Logistic/paper/logistic.pdf)]. - - SVM: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_2.pdf)] + - SVM + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_2.pdf)]. - - Softmax classifier: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_3.pdf)] + - Softmax classifier + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_3.pdf)]. - - KNN classifier: [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_4.pdf)] + - KNN classifier + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Classification_4.pdf)]. - * Regularizations. 
+ * Regularizations [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Optimization.pdf)] - [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Regularizations.pdf)] + [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Regularizations.pdf)]. - * Clustering. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/4_Clustering.pdf)] + * Clustering + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/3_Clustering.pdf)]. - * Dimensionality reduction. + * Dimensionality reduction [[slides-1](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_1.pdf)] [[slides-2](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_2.pdf)] - [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/SVD/svd.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/SVD/svd.pdf)]. * Scientific computing libraries. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_3.pdf)] + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/5_DR_3.pdf)]. + 2. **Neural network basics.** This part covers the multilayer perceptron, backpropagation, and deep learning libraries, with focus on Keras. - * Multilayer perceptron and backpropagation. + * Multilayer perceptron and backpropagation [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_1.pdf)] - [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/BP/bp.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/BP/bp.pdf)]. - * Keras. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_2.pdf)] + * Keras + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/6_NeuralNet_2.pdf)]. 
* Further reading: @@ -64,23 +69,21 @@ This part covers the multilayer perceptron, backpropagation, and deep learning l 3. **Convolutional neural networks (CNNs).** This part is focused on CNNs and its application to computer vision problems. - * CNN basics. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_1.pdf)] + * CNN basics + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_1.pdf)]. - * Tricks for improving test accuracy. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_2.pdf)] + * Tricks for improving test accuracy + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_2.pdf)]. - * Feature scaling and batch normalization. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_3.pdf)] + * Feature scaling and batch normalization + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_3.pdf)]. - * Advanced topics on CNNs. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_4.pdf)] + * Advanced topics on CNNs + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_4.pdf)]. - * Popular CNN architectures. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_5.pdf)] + * Popular CNN architectures + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_5.pdf)]. - * Face recognition. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/7_CNN_6.pdf)] * Further reading: @@ -93,95 +96,193 @@ This part is focused on CNNs and its application to computer vision problems. 4. **Recurrent neural networks (RNNs).** This part introduces RNNs and its applications in natural language processing (NLP). - * Text processing. + * Categorical feature processing + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_0.pdf)] + [[video (Chinese)](https://youtu.be/NWcShtqr8kc)]. 
+ + * Text processing and word embedding [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_1.pdf)] + [[video (Chinese)](https://youtu.be/6_2_2CPB97s)]. - * RNN basics and LSTM. + * RNN basics [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_2.pdf)] - [[reference](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)] - - * Text generation. + [[video (Chinese)](https://youtu.be/Cc4ENs6BHQw)]. + + * LSTM [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_3.pdf)] - - * Machine translation. + [[reference](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)] + [[video (Chinese)](https://youtu.be/vTouAvxlphc)]. + + * Making RNNs more effective [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_4.pdf)] - - * Image caption generation. + [[video (Chinese)](https://youtu.be/pzWHk_M23a0)]. + + * Text generation [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_5.pdf)] - [[reference](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)] + [[video (Chinese)](https://youtu.be/10cjvcrU_ZU)]. - * Attention. + * Machine translation [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_6.pdf)] - [[reference-1](https://distill.pub/2016/augmented-rnns/)] - [[reference-2](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html)] + [[video (Chinese)](https://youtu.be/gxXJ58LR684)]. + + * Attention + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_8.pdf)] + [[video (English)](https://youtu.be/B3uws4cLcFw)] + [[video (Chinese)](https://youtu.be/XhWdv7ghmQQ)] + [[reference](https://distill.pub/2016/augmented-rnns/)]. + + * Self-attention + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_9.pdf)] + [[video (English)](https://youtu.be/06r6kp7ujCA)] + [[video (Chinese)](https://youtu.be/Vr4UNt7X6Gw)]. 
+ + + * Image caption generation + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_7.pdf)] + [[reference](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)]. + -5. **Language Models beyond RNNs.** +5. **Transformer Models.** - * Transformer model: beyond RNNs. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_7.pdf)] - [[reference](https://arxiv.org/pdf/1706.03762.pdf)] + + * Transformer (1/2): attention without RNN + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_1.pdf)] + [[video (English)](https://youtu.be/FC8PziPmxnQ)] + [[video (Chinese)](https://youtu.be/aButdUV0dxI)]. + + * Transformer (2/2): from shallow to deep + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Transformer_2.pdf)] + [[video (English)](https://youtu.be/J4H6A4-dvhE)] + [[video (Chinese)](https://youtu.be/aJRsr39F4dI)] + [[reference](https://arxiv.org/pdf/1706.03762.pdf)]. - * Pre-train Transformer using BERT. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/9_RNN_8.pdf)] - [[reference](https://arxiv.org/pdf/1810.04805.pdf)] + * BERT: pre-training Transformer + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_BERT.pdf)] + [[video (English)](https://youtu.be/EOmd5sUUA_A)] + [[video (Chinese)](https://youtu.be/UlC6AjQWao8)] + [[reference](https://arxiv.org/pdf/1810.04805.pdf)]. + + * Vision Transformer (ViT) + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_ViT.pdf)] + [[video (English)](https://youtu.be/HZ4j_U3FC94)] + [[video (Chinese)](https://youtu.be/BbzOZ9THriY)]. 6. **Autoencoders.** This part introduces autoencoders for dimensionality reduction and image generation. - * Autoencoder for dimensionality reduction. 
- [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_1.pdf)] + * Autoencoder for dimensionality reduction + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_1.pdf)]. - * Variational Autoencoders (VAEs) for image generation. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_2.pdf)] + * Variational Autoencoders (VAEs) for image generation + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/8_AE_2.pdf)]. 7. **Generative Adversarial Networks (GANs).** - * DC-GAN [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/12_GAN.pdf)] + * DC-GAN [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/12_GAN.pdf)]. + +8. **Deep Reinforcement Learning.** -8. **Recommender system.** -This part is focused on the collaborative filtering approach to recommendation based on the user-item rating data. -This part covers matrix completion methods and neural network approaches. + * Reinforcement learning basics + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_1.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/DRL/DRL.pdf)] + [[video (Chinese)](https://youtu.be/vmkRMvhCW5c)]. - * Collaborative filtering. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/10_Recommender.pdf)] + * Value-based learning + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_2.pdf)] + [[video (Chinese)](https://youtu.be/jflq6vNcZyA)]. - -9. **Deep Reinforcement Learning.** + * Policy-based learning + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_3.pdf)] + [[video (Chinese)](https://youtu.be/qI0vyfR2_Rc)]. 
- * Reinforcement learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_1.pdf)] [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/DRL/DRL.pdf)] + * Actor-critic methods + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_4.pdf)] + [[video (Chinese)](https://youtu.be/xjd7Jq9wPQY)]. - * Value-based learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_2.pdf)] + * AlphaGo and Monte Carlo tree search + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_5.pdf)] + [[video (Chinese)](https://youtu.be/zHojAp5vkRE)]. - * Policy-based learning [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_3.pdf)] - * Actor-critic methods [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_4.pdf)] - * AlphaGo [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/13_RL_5.pdf)] +9. **Parallel Computing.** + * Basics and MapReduce + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_1.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Parallel/Parallel.pdf)] + [[video (Chinese)](https://youtu.be/gVcnOe6_c6Q)]. + + * Parameter server and decentralized network + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_2.pdf)] + [[video (Chinese)](https://youtu.be/Aga2Lxp3G7M)]. + + * TensorFlow's mirrored strategy and ring all-reduce + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_3.pdf)] + [[video (Chinese)](https://youtu.be/rj-hjS5L8Bw)]. -10. **Parallel Computing.** - - * Basics and MapReduce. 
[[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_1.pdf)] [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Parallel/Parallel.pdf)] [[Video (in Chinese)](https://youtu.be/gVcnOe6_c6Q)] - * Parameter Server and Decentralized Network. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_2.pdf)] [[Video (in Chinese)](https://youtu.be/Aga2Lxp3G7M)] - * Federated Learning. [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_3.pdf)] [[Video (in Chinese)](https://youtu.be/STxtRucv_zo)] + * Federated learning + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/14_Parallel_4.pdf)] + [[video (Chinese)](https://youtu.be/STxtRucv_zo)]. -11. **Adversarial Robustness.** +10. **Adversarial Robustness.** This part introduces how to attack neural networks using adversarial examples and how to defend from the attack. - * Data evasion attack and defense. - [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Adversarial.pdf)] - [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Adversarial/DataAttacks.pdf)] + * Data evasion attack and defense + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Evasion.pdf)] + [[lecture note](https://github.com/wangshusen/DeepLearning/blob/master/LectureNotes/Adversarial/DataAttacks.pdf)]. + + * Data poisoning attack + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/11_Poisoning.pdf)] + [[video (Chinese)](https://youtu.be/_K0nZcqdu5w)]. + * Further reading: - [[Adversarial Robustness - Theory and Practice](https://adversarial-ml-tutorial.org/)] + [[Adversarial Robustness - Theory and Practice](https://adversarial-ml-tutorial.org/)]. + + +11. 
**Meta Learning.** + + * Few-shot learning: basic concepts + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_1.pdf)] + [[video (English)](https://youtu.be/hE7eGew4eeg)] + [[video (Chinese)](https://youtu.be/UkQ2FVpDxHg)]. + + * Siamese network + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_2.pdf)] + [[video (English)](https://youtu.be/4S-XDefSjTM)] + [[video (Chinese)](https://youtu.be/Er8xH_k0Vj4)]. + + * Pretraining + fine-tuning + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/16_Meta_3.pdf)] + [[video (English)](https://youtu.be/U6uFOIURcD0)] + [[video (Chinese)](https://youtu.be/3zSYMuDm6RU)]. + +12. **Neural Architecture Search (NAS).** + + + * Basics + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_1.pdf)] + [[video (Chinese)](https://youtu.be/voWgnMpFaW8)]. + + * RNN + reinforcement learning + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_2.pdf)] + [[video (Chinese)](https://youtu.be/AmitvRzmvv0)]. + + * Differentiable NAS + [[slides](https://github.com/wangshusen/DeepLearning/blob/master/Slides/15_NAS_3.pdf)] + [[video (Chinese)](https://youtu.be/D9m9-CXw_HY)]. 
+ + + diff --git a/Slides/10_BERT.pdf b/Slides/10_BERT.pdf new file mode 100644 index 0000000..34f4a7d Binary files /dev/null and b/Slides/10_BERT.pdf differ diff --git a/Slides/10_Recommender.pdf b/Slides/10_Recommender.pdf deleted file mode 100644 index dd3c98b..0000000 Binary files a/Slides/10_Recommender.pdf and /dev/null differ diff --git a/Slides/10_Transformer_1.pdf b/Slides/10_Transformer_1.pdf new file mode 100644 index 0000000..5e528ec Binary files /dev/null and b/Slides/10_Transformer_1.pdf differ diff --git a/Slides/10_Transformer_2.pdf b/Slides/10_Transformer_2.pdf new file mode 100644 index 0000000..90dd015 Binary files /dev/null and b/Slides/10_Transformer_2.pdf differ diff --git a/Slides/10_ViT.pdf b/Slides/10_ViT.pdf new file mode 100644 index 0000000..0295ec9 Binary files /dev/null and b/Slides/10_ViT.pdf differ diff --git a/Slides/11_Adversarial.pdf b/Slides/11_Evasion.pdf similarity index 100% rename from Slides/11_Adversarial.pdf rename to Slides/11_Evasion.pdf diff --git a/Slides/11_Poisoning.pdf b/Slides/11_Poisoning.pdf new file mode 100644 index 0000000..98c7d56 Binary files /dev/null and b/Slides/11_Poisoning.pdf differ diff --git a/Slides/12_GAN.pdf b/Slides/12_GAN.pdf index a9f0645..66c86b1 100644 Binary files a/Slides/12_GAN.pdf and b/Slides/12_GAN.pdf differ diff --git a/Slides/13_RL_1.pdf b/Slides/13_RL_1.pdf index 3443444..f8eb258 100644 Binary files a/Slides/13_RL_1.pdf and b/Slides/13_RL_1.pdf differ diff --git a/Slides/13_RL_2.pdf b/Slides/13_RL_2.pdf index 57bde51..5a21cc8 100644 Binary files a/Slides/13_RL_2.pdf and b/Slides/13_RL_2.pdf differ diff --git a/Slides/13_RL_3.pdf b/Slides/13_RL_3.pdf index 2c63054..010ab8f 100644 Binary files a/Slides/13_RL_3.pdf and b/Slides/13_RL_3.pdf differ diff --git a/Slides/13_RL_4.pdf b/Slides/13_RL_4.pdf index c5d6d78..54cc4b6 100644 Binary files a/Slides/13_RL_4.pdf and b/Slides/13_RL_4.pdf differ diff --git a/Slides/13_RL_5.pdf b/Slides/13_RL_5.pdf index 7240019..cb28222 100644 Binary 
files a/Slides/13_RL_5.pdf and b/Slides/13_RL_5.pdf differ diff --git a/Slides/14_Parallel_1.pdf b/Slides/14_Parallel_1.pdf index b484324..4d02138 100644 Binary files a/Slides/14_Parallel_1.pdf and b/Slides/14_Parallel_1.pdf differ diff --git a/Slides/14_Parallel_2.pdf b/Slides/14_Parallel_2.pdf index 56a2d64..478c68d 100644 Binary files a/Slides/14_Parallel_2.pdf and b/Slides/14_Parallel_2.pdf differ diff --git a/Slides/14_Parallel_3.pdf b/Slides/14_Parallel_3.pdf index fa81ea0..682ce8c 100644 Binary files a/Slides/14_Parallel_3.pdf and b/Slides/14_Parallel_3.pdf differ diff --git a/Slides/14_Parallel_4.pdf b/Slides/14_Parallel_4.pdf new file mode 100644 index 0000000..afc6c03 Binary files /dev/null and b/Slides/14_Parallel_4.pdf differ diff --git a/Slides/15_NAS_1.pdf b/Slides/15_NAS_1.pdf new file mode 100644 index 0000000..3527aa4 Binary files /dev/null and b/Slides/15_NAS_1.pdf differ diff --git a/Slides/15_NAS_2.pdf b/Slides/15_NAS_2.pdf new file mode 100644 index 0000000..bd195c4 Binary files /dev/null and b/Slides/15_NAS_2.pdf differ diff --git a/Slides/15_NAS_3.pdf b/Slides/15_NAS_3.pdf new file mode 100644 index 0000000..6cb6532 Binary files /dev/null and b/Slides/15_NAS_3.pdf differ diff --git a/Slides/16_Meta_1.pdf b/Slides/16_Meta_1.pdf new file mode 100644 index 0000000..72f40cb Binary files /dev/null and b/Slides/16_Meta_1.pdf differ diff --git a/Slides/16_Meta_2.pdf b/Slides/16_Meta_2.pdf new file mode 100644 index 0000000..43b5ed2 Binary files /dev/null and b/Slides/16_Meta_2.pdf differ diff --git a/Slides/16_Meta_3.pdf b/Slides/16_Meta_3.pdf new file mode 100644 index 0000000..b069f44 Binary files /dev/null and b/Slides/16_Meta_3.pdf differ diff --git a/Slides/2_Regression_1.pdf b/Slides/2_Regression_1.pdf index f0da1c9..8136026 100644 Binary files a/Slides/2_Regression_1.pdf and b/Slides/2_Regression_1.pdf differ diff --git a/Slides/2_Regression_2.pdf b/Slides/2_Regression_2.pdf index da43078..6a99162 100644 Binary files 
a/Slides/2_Regression_2.pdf and b/Slides/2_Regression_2.pdf differ diff --git a/Slides/3_Classification_1.pdf b/Slides/3_Classification_1.pdf index 913deca..7445d72 100644 Binary files a/Slides/3_Classification_1.pdf and b/Slides/3_Classification_1.pdf differ diff --git a/Slides/3_Classification_2.pdf b/Slides/3_Classification_2.pdf index 1e44260..414cb40 100644 Binary files a/Slides/3_Classification_2.pdf and b/Slides/3_Classification_2.pdf differ diff --git a/Slides/3_Classification_3.pdf b/Slides/3_Classification_3.pdf index f14b1eb..26a7dbc 100644 Binary files a/Slides/3_Classification_3.pdf and b/Slides/3_Classification_3.pdf differ diff --git a/Slides/3_Classification_4.pdf b/Slides/3_Classification_4.pdf index 2334699..1174850 100644 Binary files a/Slides/3_Classification_4.pdf and b/Slides/3_Classification_4.pdf differ diff --git a/Slides/4_Clustering.pdf b/Slides/3_Clustering.pdf similarity index 100% rename from Slides/4_Clustering.pdf rename to Slides/3_Clustering.pdf diff --git a/Slides/3_Optimization.pdf b/Slides/3_Optimization.pdf index 95f4435..f0ee580 100644 Binary files a/Slides/3_Optimization.pdf and b/Slides/3_Optimization.pdf differ diff --git a/Slides/3_Regularizations.pdf b/Slides/3_Regularizations.pdf index 6dfa7de..12d2887 100644 Binary files a/Slides/3_Regularizations.pdf and b/Slides/3_Regularizations.pdf differ diff --git a/Slides/4_MC_1.pdf b/Slides/4_MC_1.pdf new file mode 100644 index 0000000..9b14730 Binary files /dev/null and b/Slides/4_MC_1.pdf differ diff --git a/Slides/5_DR_3.pdf b/Slides/5_DR_3.pdf index a4eae3e..6d06060 100644 Binary files a/Slides/5_DR_3.pdf and b/Slides/5_DR_3.pdf differ diff --git a/Slides/6_NeuralNet_1.pdf b/Slides/6_NeuralNet_1.pdf index fe0d5e0..a886e57 100644 Binary files a/Slides/6_NeuralNet_1.pdf and b/Slides/6_NeuralNet_1.pdf differ diff --git a/Slides/7_CNN_1.pdf b/Slides/7_CNN_1.pdf index 0de1116..2162da6 100644 Binary files a/Slides/7_CNN_1.pdf and b/Slides/7_CNN_1.pdf differ diff --git 
a/Slides/7_CNN_3.pdf b/Slides/7_CNN_3.pdf index eb61ea0..67e6726 100644 Binary files a/Slides/7_CNN_3.pdf and b/Slides/7_CNN_3.pdf differ diff --git a/Slides/7_CNN_5.pdf b/Slides/7_CNN_5.pdf index 40fbbc5..51dd3e1 100644 Binary files a/Slides/7_CNN_5.pdf and b/Slides/7_CNN_5.pdf differ diff --git a/Slides/8_AE_2.pdf b/Slides/8_AE_2.pdf index 459b203..a26f7a3 100644 Binary files a/Slides/8_AE_2.pdf and b/Slides/8_AE_2.pdf differ diff --git a/Slides/9_RNN_0.pdf b/Slides/9_RNN_0.pdf new file mode 100644 index 0000000..6254ef3 Binary files /dev/null and b/Slides/9_RNN_0.pdf differ diff --git a/Slides/9_RNN_1.pdf b/Slides/9_RNN_1.pdf index e3bd5ae..7660614 100644 Binary files a/Slides/9_RNN_1.pdf and b/Slides/9_RNN_1.pdf differ diff --git a/Slides/9_RNN_2.pdf b/Slides/9_RNN_2.pdf index 9671464..9a0fccc 100644 Binary files a/Slides/9_RNN_2.pdf and b/Slides/9_RNN_2.pdf differ diff --git a/Slides/9_RNN_3.pdf b/Slides/9_RNN_3.pdf index 941d2f0..8d35e83 100644 Binary files a/Slides/9_RNN_3.pdf and b/Slides/9_RNN_3.pdf differ diff --git a/Slides/9_RNN_4.pdf b/Slides/9_RNN_4.pdf index 2a840ce..41d6c5e 100644 Binary files a/Slides/9_RNN_4.pdf and b/Slides/9_RNN_4.pdf differ diff --git a/Slides/9_RNN_5.pdf b/Slides/9_RNN_5.pdf index 91aa93d..dc0f678 100644 Binary files a/Slides/9_RNN_5.pdf and b/Slides/9_RNN_5.pdf differ diff --git a/Slides/9_RNN_6.pdf b/Slides/9_RNN_6.pdf index f366ed7..e0e010e 100644 Binary files a/Slides/9_RNN_6.pdf and b/Slides/9_RNN_6.pdf differ diff --git a/Slides/9_RNN_7.pdf b/Slides/9_RNN_7.pdf index 0aec9bf..91aa93d 100644 Binary files a/Slides/9_RNN_7.pdf and b/Slides/9_RNN_7.pdf differ diff --git a/Slides/9_RNN_8.pdf b/Slides/9_RNN_8.pdf index 613c80f..297466e 100644 Binary files a/Slides/9_RNN_8.pdf and b/Slides/9_RNN_8.pdf differ diff --git a/Slides/9_RNN_9.pdf b/Slides/9_RNN_9.pdf new file mode 100644 index 0000000..7ec1845 Binary files /dev/null and b/Slides/9_RNN_9.pdf differ