
Commit

mp: minor fixes to summary
cristianpjensen committed Jun 2, 2024
1 parent e42fb6b · commit 2be0f64
Showing 8 changed files with 25 additions and 21 deletions.
10 changes: 6 additions & 4 deletions machine_perception/summary/sections/01_neural_networks.tex
@@ -140,7 +140,7 @@ \subsection{Backpropagation}
\pdv{\mathcal{L}}{h_k} = \pdv{\mathcal{L}}{\hat{y}} \pdv{\hat{y}}{\transpose{\vec{w}} \vec{h}} \pdv{\transpose{\vec{w}} \vec{h}}{h_k} = (y - \hat{y}) \sigma' \lft( \transpose{\vec{w}} \vec{h} \rgt) w_k.
\]
This also matches our intuition, since the amount that we want to increase $h_k$ by is proportional
- to $w_k$. But, we cannot increase $h_k$ directly, but we can update the weights connected to $h_k$,
+ to $w_k$. We cannot increase $h_k$ directly, but we can update the weights connected to $h_k$,
which brings us back to the first case. This update will be proportional to
$\pdv{\mathcal{L}}{h_k}$ by the chain rule. In this way, we can recursively update all the weights
using gradient information.
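
As an illustration of this chain-rule bookkeeping, here is a minimal NumPy sketch (all names and sizes are illustrative; it assumes a squared-error loss $\frac{1}{2}(\hat{y} - y)^2$, so the error factor appears as $(\hat{y} - y)$ under that convention) that checks the analytic gradient with respect to a hidden activation against finite differences:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
h = rng.normal(size=5)              # hidden activations
w = rng.normal(size=5)              # hidden-to-output weights
y = 1.0                             # target

def loss(h_vec):
    y_hat = sigmoid(w @ h_vec)
    return 0.5 * (y_hat - y) ** 2   # assumed squared-error loss

# Chain rule: dL/dh_k = dL/dy_hat * sigma'(w^T h) * w_k.
s = w @ h
analytic = (sigmoid(s) - y) * sigmoid(s) * (1.0 - sigmoid(s)) * w

# Central finite differences of the same quantity.
eps = 1e-6
numeric = np.array([
    (loss(h + eps * np.eye(5)[k]) - loss(h - eps * np.eye(5)[k])) / (2 * eps)
    for k in range(5)
])
print(np.allclose(analytic, numeric, atol=1e-8))  # True
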
@@ -173,6 +173,8 @@ \subsection{Universal approximation theorem}
where $|f_{\vec{\theta}}(\vec{x}) - g(\vec{x})| < \epsilon$ for all $\vec{x}\in I_m$.
\end{theorem}
- The universal approximation theorem holds for any single hidden layer network. However, this hidden
- layer may need to have infinite width to approximate $f$. In practice, deeper networks work better
- than wider networks.
+ In words, this means that any continuous function can be approximated by a single hidden layer MLP
+ with a continuous non-linear activation function with arbitrary precision. The universal
+ approximation theorem holds for any single hidden layer network. However, this hidden layer may
+ need to have infinite width to approximate $f$. In practice, deeper networks work better than wider
+ networks.
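
A small sketch of the statement in practice: fix random hidden-layer weights, fit only the output weights by least squares, and watch the approximation error of a single hidden layer shrink as its width grows (the target function and all sizes below are arbitrary choices, not anything from the summary):

import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(-3, 3, 400)[:, None]
target = np.sin(2 * x) + 0.3 * x**2            # any continuous function on a compact interval

def sup_error(width):
    # Single hidden layer of sigmoid units; only the output weights are fit.
    W = rng.normal(scale=3.0, size=(1, width))
    b = rng.normal(scale=3.0, size=width)
    H = 1.0 / (1.0 + np.exp(-(x @ W + b)))     # hidden activations
    theta, *_ = np.linalg.lstsq(H, target, rcond=None)
    return np.max(np.abs(H @ theta - target))

for width in (5, 50, 500):
    print(width, sup_error(width))             # error typically shrinks as width grows
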
@@ -97,7 +97,7 @@ \subsection{Convolutional neural network}
& = \sum_{i'} \sum_{j'} \delta^{(\ell)}[i',j'] w^{(\ell)}[i'-i,j'-j].
\end{align*}
From this, we can see that we can compute all values of $\delta^{(\ell-1)}$ by a single convolution, \[
- \delta^{(\ell-1)} = \delta^{(\ell)} * \mathrm{Rot}_{180} \lft( \mat{W}^{(\ell)} \rgt) = \delta^{(\ell)} \star \mat{W}^{(\ell)}.
+ \delta^{(\ell-1)} = \delta^{(\ell)} * \mathrm{Flip}\lft( \mat{W}^{(\ell)} \rgt) = \delta^{(\ell)} \star \mat{W}^{(\ell)}.
\]

Using this value, we can compute the derivative \wrt the weights, which we need for the parameter
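
The identity in this hunk (backpropagating through a convolution is itself a convolution with the flipped, i.e. 180-degree-rotated, kernel, equivalently a cross-correlation with the original kernel) can be checked numerically; a sketch assuming SciPy and 'full' padding, with arbitrary shapes:

import numpy as np
from scipy.signal import convolve2d, correlate2d

rng = np.random.default_rng(0)
delta = rng.normal(size=(6, 6))   # upstream error signal delta^(l)
W = rng.normal(size=(3, 3))       # kernel W^(l)

# Convolution with the 180-degree-rotated kernel ...
conv_flipped = convolve2d(delta, np.rot90(W, 2), mode="full")
# ... equals cross-correlation with the original kernel.
corr = correlate2d(delta, W, mode="full")

print(np.allclose(conv_flipped, corr))  # True
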
@@ -113,7 +113,7 @@ \subsection{Long-short term memory}
forgetting completely and $1$ means remembering everything;
\item $g: \R^d \times \R^d \to [-1, 1]^d$ is the \textit{gate} that decides what to write in the
- cell state $\vec{c}^{(t)}$.
+ cell state $\vec{c}^{(t)}$;
\item $i: \R^d \times \R^d \to [0,1]^d$ is the \textit{input gate} and has the role of ``deciding
which values of the cell state $\vec{c}^{(t)}$ should be updated'' at the current time step.
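
A minimal NumPy sketch of one LSTM cell step, assuming the standard gating equations (weight names and sizes are illustrative; biases are omitted for brevity), which shows the ranges of the gates described in this list:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

d = 4
rng = np.random.default_rng(0)
# One weight matrix per gate, acting on the concatenation [h_prev; x].
Wf, Wi, Wg, Wo = (rng.normal(size=(d, 2 * d)) for _ in range(4))

def lstm_step(x, h_prev, c_prev):
    z = np.concatenate([h_prev, x])
    f = sigmoid(Wf @ z)         # forget gate, values in [0, 1]
    i = sigmoid(Wi @ z)         # input gate, values in [0, 1]
    g = np.tanh(Wg @ z)         # candidate write to the cell state, values in [-1, 1]
    o = sigmoid(Wo @ z)         # output gate, values in [0, 1]
    c = f * c_prev + i * g      # keep a fraction of the old cell state, add a gated write
    h = o * np.tanh(c)
    return h, c

h, c = lstm_step(rng.normal(size=d), np.zeros(d), np.zeros(d))
print(h.shape, c.shape)         # (4,) (4,)
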
3 changes: 2 additions & 1 deletion machine_perception/summary/sections/06_autoencoders.tex
@@ -95,7 +95,8 @@ \subsection{Variational autoencoders}
& = \E_{\vec{z}\mid \vec{x}}[\log p_{\vec{\psi}}(\vec{x}\mid \vec{z})] - D_{\mathrm{KL}}(q_{\vec{\phi}}(\vec{z}\mid \vec{x}) \rVert p(\vec{z})) + D_{\mathrm{KL}} (q_{\vec{\phi}}(\vec{z}\mid \vec{x}) \rVert p(\vec{z} \mid \vec{x})) \\
& \geq \E_{\vec{z}\mid \vec{x}}[\log p_{\vec{\psi}}(\vec{x}\mid \vec{z})] - D_{\mathrm{KL}}(q_{\vec{\phi}}(\vec{z}\mid \vec{x}) \rVert p(\vec{z})). \margintag{KL-divergence is non-negative.}
\end{align*}
- The first term of the ELBO encourages low reconstruction error, while the second term makes sure that
+ The first term of the ELBO encourages low reconstruction error, which encourages the latent space to
+ be structured such that similar data is clustered together. The second term makes sure that
the approximate posterior $q_{\vec{\phi}}$ does not deviate too far from the prior $p$. The second
term can be computed in a closed-form, since both arguments are Gaussian,
\begin{align*}
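
A minimal NumPy sketch of how the two ELBO terms are typically evaluated, assuming a diagonal-Gaussian approximate posterior, a standard-normal prior, and a Bernoulli decoder (all names, shapes, and the linear decoder are illustrative):

import numpy as np

rng = np.random.default_rng(0)
x = rng.integers(0, 2, size=10).astype(float)          # one binary data point
mu = rng.normal(size=2)                                # encoder mean of q(z|x)
log_var = rng.normal(size=2)                           # encoder log-variance of q(z|x)

# Reparameterised sample z ~ q(z|x) = N(mu, diag(exp(log_var))).
z = mu + np.exp(0.5 * log_var) * rng.normal(size=2)

# Illustrative decoder: linear map + sigmoid gives Bernoulli means.
W = rng.normal(size=(10, 2))
p_hat = 1.0 / (1.0 + np.exp(-(W @ z)))

# First ELBO term: single-sample estimate of E_q[log p(x|z)].
recon = np.sum(x * np.log(p_hat) + (1 - x) * np.log(1 - p_hat))

# Second term: KL(q(z|x) || N(0, I)), closed form for diagonal Gaussians.
kl = 0.5 * np.sum(np.exp(log_var) + mu**2 - 1.0 - log_var)

print(recon - kl)   # the (estimated) ELBO for this data point
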
@@ -98,7 +98,7 @@ \subsection{Masked autoencoder distribution estimation}
matrices,
\begin{align*}
\mat{M}^{\mat{W^{(\ell)}}}_{ij} & = \mathbb{1}\lft\{ m^{(\ell-1)}(j) \leq m^{(\ell)}(i) \rgt\} \\
- \mat{M}^{\mat{V}}_{ij} & = \mathbb{1}\lft\{ m^{(\ell-1)}(j) < m^{(\ell)}(i) \rgt\}.
+ \mat{M}^{\mat{V}}_{ij} & = \mathbb{1}\lft\{ m^{(n)}(j) < m^{(\mat{V})}(i) \rgt\}.
\end{align*}
Then, we alter the weight matrices by
\begin{align*}
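
A minimal NumPy sketch of these autoregressive masks, assuming the natural input ordering and random hidden degrees as in MADE; note the non-strict inequality for the hidden mask and the strict one for the output mask (sizes and names are illustrative):

import numpy as np

rng = np.random.default_rng(0)
d, hidden = 3, 5

m_in = np.arange(1, d + 1)                  # input/output degrees: the ordering 1..d
m_hidden = rng.integers(1, d, size=hidden)  # hidden degrees sampled from {1, ..., d-1}

# Hidden mask: hidden unit i may see input j iff m_hidden[i] >= m_in[j].
M_W = (m_in[None, :] <= m_hidden[:, None]).astype(float)   # (hidden, d)
# Output mask: strict inequality, so output i never sees input i or later.
M_V = (m_hidden[None, :] < m_in[:, None]).astype(float)    # (d, hidden)

# The composition is strictly lower triangular: output i depends only on inputs 1..i-1.
print(np.allclose(np.triu(M_V @ M_W), 0))  # True
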
3 changes: 2 additions & 1 deletion machine_perception/summary/sections/08_normalizing_flows.tex
@@ -37,7 +37,8 @@ \subsection{Change of variables}
From a computational perspective, we require the Jacobian of the transformation to be computed
efficiently. In general, computing the Jacobian takes $\bigo{d^3}$ to compute for a $d \times d$
matrix. However, this is not fast enough. A way to achieve linear complexity is to design $f$ such
- that its Jacobian is a triangular matrix, which takes $\bigo{d}$ to compute. This requirement
+ that its Jacobian is a triangular matrix, which takes $\bigo{d}$ to compute.\sidenote{The
+ determinant of a triangular matrix is the product of its diagonal entries.} This requirement
further reduces the number of modeling decisions we can make.

\subsection{Coupling layers}
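
A quick numerical check of the sidenote added in this hunk, using an arbitrary lower-triangular matrix as a stand-in for the Jacobian:

import numpy as np

rng = np.random.default_rng(0)
d = 6
J = np.tril(rng.normal(size=(d, d)))                      # lower-triangular Jacobian stand-in

sign, logdet_general = np.linalg.slogdet(J)               # generic O(d^3) route
logdet_triangular = np.sum(np.log(np.abs(np.diag(J))))    # O(d): product of the diagonal

print(np.allclose(logdet_general, logdet_triangular))     # True
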
@@ -33,7 +33,7 @@ \section{Generative adversarial networks}
samples generated by the generator.

This leads us to the following value function, \[
- V(D,G) = \log D(\vec{x}) + \log (1 - D(G(\vec{z}))), \quad \vec{x} \in \mathcal{D}, \vec{z} \sim \mathcal{N}(\vec{0}, \mat{I}).
+ V(D,G) = \E_{\vec{x} \sim p_{\mathrm{data}}} [\log D(\vec{x})] + \E_{\vec{z} \sim \mathcal{N}(\vec{0}, \mat{I})} [\log (1 - D(G(\vec{z})))].
\]
The discriminator aims to maximize it, while the generator aims to minimize it, which gives the
following optimization problem, \[
@@ -57,10 +57,10 @@ \subsection{Theoretical analysis}
\begin{proof}
Let $G$ be a generator, then $D$ is computed by
\begin{align*}
- D^\star & = \argmax_D V(G,D) \\
- & = \argmax_D \E_{\vec{x} \sim p_{\mathrm{data}}}[\log D(\vec{x})] + \E_{\vec{z} \sim p_{\mathrm{prior}}}[\log(1 - D(G(\vec{z})))] \\
- & = \argmax_D \int p_{\mathrm{data}}(\vec{x}) \log D(\vec{x}) \mathrm{d}\vec{x} + \int_{\vec{z}} p_{\mathrm{prior}}(\vec{z}) \log(1 - D(G(\vec{z}))) \mathrm{d}\vec{z} \\
- & = \argmax_D \int p_{\mathrm{data}}(\vec{x}) \log D(\vec{x}) \mathrm{d}\vec{x} + \int_{\vec{x}} p_{\mathrm{model}}(\vec{x}) \log(1 - D(\vec{x})) \mathrm{d}\vec{x} \margintag{Law of unconscious statistician.} \\
+ D^\star & = \argmax_D V(G,D) \\
+ & = \argmax_D \E_{\vec{x} \sim p_{\mathrm{data}}}[\log D(\vec{x})] + \E_{\vec{z} \sim p_{\mathrm{prior}}}[\log(1 - D(G(\vec{z})))] \\
+ & = \argmax_D \int p_{\mathrm{data}}(\vec{x}) \log D(\vec{x}) \mathrm{d}\vec{x} + \int p(\vec{z}) \log(1 - D(G(\vec{z}))) \mathrm{d}\vec{z} \\
+ & = \argmax_D \int p_{\mathrm{data}}(\vec{x}) \log D(\vec{x}) \mathrm{d}\vec{x} + \int p_{\mathrm{model}}(\vec{x}) \log(1 - D(\vec{x})) \mathrm{d}\vec{x} \margintag{Law of unconscious statistician.} \\
& = \argmax_D \int p_{\mathrm{data}}(\vec{x}) \log D(\vec{x}) + p_{\mathrm{model}}(\vec{x}) \log(1 - D(\vec{x})) \mathrm{d}\vec{x}.
\end{align*}
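
The maximization above is done pointwise in $\vec{x}$; spelled out (a standard argument, with the shorthand $a = p_{\mathrm{data}}(\vec{x})$, $b = p_{\mathrm{model}}(\vec{x})$, $t = D(\vec{x})$ introduced here only for illustration), the integrand $a \log t + b \log(1 - t)$ is concave in $t \in (0, 1)$ and
\[
    \frac{\mathrm{d}}{\mathrm{d}t} \lft( a \log t + b \log(1 - t) \rgt) = \frac{a}{t} - \frac{b}{1 - t} = 0 \quad \Longleftrightarrow \quad t^\star = \frac{a}{a + b},
\]
which gives $D^\star(\vec{x}) = \frac{p_{\mathrm{data}}(\vec{x})}{p_{\mathrm{data}}(\vec{x}) + p_{\mathrm{model}}(\vec{x})}$, the value substituted in the next hunk.
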

@@ -94,12 +94,12 @@ \subsection{Theoretical analysis}
\]
We substitute this into the value function,
\begin{align*}
- V(G, D^\star) & = \E_{\vec{x} \sim p} \lft[ \log \lft( \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_{\vec{z} \sim p_{\mathrm{prior}}} \lft[ \log \lft( 1 - \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] \\
- & = \E_{\vec{x} \sim p} \lft[ \log \lft( \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_{\vec{z} \sim p_{\mathrm{prior}}} \lft[ \log \lft( \frac{q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] \\
- & = \E_{\vec{x} \sim p} \lft[ \log \lft( \frac{2 p(\vec{x})}{2 (p(\vec{x}) + q(\vec{x}))} \rgt) \rgt] + \E_{\vec{z} \sim p_{\mathrm{prior}}} \lft[ \log \lft( \frac{2 q(\vec{x})}{2 (p(\vec{x}) + q(\vec{x}))} \rgt) \rgt] \\
- & = \E_{\vec{x} \sim p} \lft[ \log \lft( \frac{2 p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 2 + \E_{\vec{z} \sim p_{\mathrm{prior}}} \lft[ \log \lft( \frac{2 q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 2 \\
- & = \E_{\vec{x} \sim p} \lft[ \log \lft( \frac{2 p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_{\vec{z} \sim p_{\mathrm{prior}}} \lft[ \log \lft( \frac{2 q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 4 \\
- & = D_{\mathrm{KL}} \lft( p \middle\lVert \frac{p + q}{2} \rgt) + D_{\mathrm{KL}} \lft( q \middle\lVert \frac{p + q}{2} \rgt) - \log 4 \\
+ V(G, D^\star) & = \E_p \lft[ \log \lft( \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_q \lft[ \log \lft( 1 - \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] \\
+ & = \E_p \lft[ \log \lft( \frac{p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_q \lft[ \log \lft( \frac{q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] \\
+ & = \E_p \lft[ \log \lft( \frac{2 p(\vec{x})}{2 (p(\vec{x}) + q(\vec{x}))} \rgt) \rgt] + \E_q \lft[ \log \lft( \frac{2 q(\vec{x})}{2 (p(\vec{x}) + q(\vec{x}))} \rgt) \rgt] \\
+ & = \E_p \lft[ \log \lft( \frac{2 p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 2 + \E_q \lft[ \log \lft( \frac{2 q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 2 \\
+ & = \E_p \lft[ \log \lft( \frac{2 p(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] + \E_q \lft[ \log \lft( \frac{2 q(\vec{x})}{p(\vec{x}) + q(\vec{x})} \rgt) \rgt] - \log 4 \\
+ & = D_{\mathrm{KL}} \lft( p \middle\lVert \frac{p + q}{2} \rgt) + D_{\mathrm{KL}} \lft( q \middle\lVert \frac{p + q}{2} \rgt) - \log 4 \\
& = 2 D_{\mathrm{JS}}(p \lVert q) - \log 4,
\end{align*}
where the Jensen-Shannon divergence is a symmetric and smoothed version of the KL divergence, defined as \[
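
A small NumPy check of the resulting identity $V(G, D^\star) = 2 D_{\mathrm{JS}}(p \lVert q) - \log 4$ on two arbitrary discrete distributions, using the standard definition $D_{\mathrm{JS}}(p \lVert q) = \frac{1}{2} D_{\mathrm{KL}}(p \lVert m) + \frac{1}{2} D_{\mathrm{KL}}(q \lVert m)$ with $m = \frac{p + q}{2}$:

import numpy as np

rng = np.random.default_rng(0)
p = rng.random(8)
q = rng.random(8)
p, q = p / p.sum(), q / q.sum()      # two arbitrary discrete distributions

def kl(a, b):
    return np.sum(a * np.log(a / b))

m = 0.5 * (p + q)
js = 0.5 * kl(p, m) + 0.5 * kl(q, m)

# Value function at the optimal discriminator D*(x) = p(x) / (p(x) + q(x)).
v = np.sum(p * np.log(p / (p + q))) + np.sum(q * np.log(q / (p + q)))

print(np.allclose(v, 2 * js - np.log(4)))  # True
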
@@ -97,7 +97,7 @@ \subsection{3D poses}
\mat{T}_P(\vec{\beta}, \vec{\theta}) = \bar{\mat{T}} + \mat{B}_S(\vec{\beta}) + \mat{B}_P(\vec{\theta});
\]
\item Lastly, perform linear blend skinning on the resulting base mesh, \[
- \vec{t}_i' = \lft( \sum_{k} w_{ki} \mat{G}_k(\vec{\theta}, \mat{J}(\vec{\beta})) \rgt) (\vec{t}_i + \vec{s}_i(\vec{\beta}) + \vec{p}_i(\vec{\theta})).
+ \bar{\vec{t}}_i' = \lft( \sum_{k} w_{ki} \mat{G}_k(\vec{\theta}, \mat{J}(\vec{\beta})) \rgt) (\bar{\vec{t}}_i + \vec{b}_{S,i}(\vec{\beta}) + \vec{b}_{P,i}(\vec{\theta})).
\]
\end{enumerate}

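
A minimal NumPy sketch of this linear blend skinning step, assuming homogeneous coordinates and per-joint $4 \times 4$ rigid transforms $\mat{G}_k$; the random transforms, skinning weights, and blend-shape offsets below are purely illustrative:

import numpy as np

rng = np.random.default_rng(0)
V, K = 5, 3                                    # number of vertices and joints

t_bar = rng.normal(size=(V, 3))                # rest-pose template vertices
b_S = 0.01 * rng.normal(size=(V, 3))           # shape blend-shape offsets B_S(beta)
b_P = 0.01 * rng.normal(size=(V, 3))           # pose blend-shape offsets B_P(theta)
w = rng.random(size=(K, V))
w /= w.sum(axis=0)                             # per-vertex skinning weights sum to 1

def random_rigid():
    # A random rotation (via QR) and translation packed into a 4x4 matrix.
    Q, _ = np.linalg.qr(rng.normal(size=(3, 3)))
    G = np.eye(4)
    G[:3, :3] = Q * np.sign(np.linalg.det(Q))
    G[:3, 3] = rng.normal(size=3)
    return G

G = np.stack([random_rigid() for _ in range(K)])       # joint transforms, (K, 4, 4)

# Blend the joint transforms per vertex, then apply to the offset template vertex.
posed = np.empty((V, 3))
for i in range(V):
    G_blend = sum(w[k, i] * G[k] for k in range(K))    # sum_k w_ki G_k
    v_hom = np.append(t_bar[i] + b_S[i] + b_P[i], 1.0)
    posed[i] = (G_blend @ v_hom)[:3]

print(posed.shape)  # (5, 3)
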
