diff --git b/main/algorithm.sty a/main/algorithm.sty new file mode 100644 index 0000000..843e3d5 --- /dev/null +++ a/main/algorithm.sty @@ -0,0 +1,79 @@ +% ALGORITHM STYLE -- Released 8 April 1996 +% for LaTeX-2e +% Copyright -- 1994 Peter Williams +% E-mail Peter.Williams@dsto.defence.gov.au +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithm} +\typeout{Document Style `algorithm' - floating environment} + +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} + +% Declare Options +% first appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% then numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} + +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + 
\ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + \newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} + +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} + diff --git b/main/algorithmic.sty a/main/algorithmic.sty new file mode 100644 index 0000000..ad61478 --- /dev/null +++ a/main/algorithmic.sty @@ -0,0 +1,201 @@ +% ALGORITHMIC STYLE -- Released 8 APRIL 1996 +% for LaTeX version 2e +% Copyright -- 1994 Peter Williams +% E-mail PeterWilliams@dsto.defence.gov.au +% +% Modified by Alex Smola (08/2000) +% E-mail Alex.Smola@anu.edu.au +% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{algorithmic} +\typeout{Document Style `algorithmic' - environment} +% +\RequirePackage{ifthen} +\RequirePackage{calc} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@line} +\newcounter{ALC@rem} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} +\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} 
+\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} + +%changed by alex smola +\newcommand{\algorithmicinput}{\textbf{input}} +\newcommand{\algorithmicoutput}{\textbf{output}} +\newcommand{\algorithmicset}{\textbf{set}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\newcommand{\algorithmicand}{\textbf{and\ }} +\newcommand{\algorithmicor}{\textbf{or\ }} +\newcommand{\algorithmicfunction}{\textbf{function}} +\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction} +\newcommand{\algorithmicmain}{\textbf{main}} +\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain} +%end changed by alex smola + +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\let\@item\ALC@item + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\footnotesize \arabic{ALC@line}:}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi 
+ \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin 1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item} + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\STATE}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} +%changes by alex smola + \newcommand{\INPUT}{\item[\algorithmicinput]} + \newcommand{\OUTPUT}{\item[\algorithmicoutput]} + \newcommand{\SET}{\item[\algorithmicset]} +% \newcommand{\TRUE}{\algorithmictrue} +% \newcommand{\FALSE}{\algorithmicfalse} + \newcommand{\AND}{\algorithmicand} + \newcommand{\OR}{\algorithmicor} + \newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}} +%end changes by alex smola + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\ + \algorithmicthen\ {##2}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + 
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ % + \algorithmicdo\ {##2}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} +%changed by alex smola + \newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ % + \ALC@com{##1}\begin{ALC@func}} + \newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ % + \ALC@com{##1}\begin{ALC@main}} +%end changed by alex smola + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% + \ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + \newcommand{\ENDFUNCTION}{\end{ALC@func}} + \newcommand{\ENDMAIN}{\end{ALC@main}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + \newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction} + \newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% + \ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } + \leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } + } + {\end{list}} + + + + + + + + + + + + + + diff --git b/main/appendix.tex a/main/appendix.tex new file 
mode 100644 index 0000000..62ebedd --- /dev/null +++ a/main/appendix.tex @@ -0,0 +1,753 @@ +\section{Relevant proofs} +\subsection{Proof of Theorem \ref{theorem1}} +\label{proofth1} +\begin{proof} + \label{th1proof} + The proof is based on Borkar's Theorem for + general stochastic approximation recursions with two time scales + \cite{borkar1997stochastic}. + + % The new TD error for the linear setting is + % \begin{equation*} + % \delta_{\text{new}}=r+\gamma + % \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta]. + % \end{equation*} + A new one-step + linear TD solution is defined + as: + \begin{equation*} + 0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b. + \end{equation*} + Thus, the VMTD's solution is + $\theta_{\text{VMTD}}=A^{-1}b$. + + First, note that recursion (\ref{theta}) can be rewritten as + \begin{equation*} + \theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k), + \end{equation*} + where + \begin{equation*} + \xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k + \end{equation*} + Due to the settings of step-size schedule $\alpha_k = o(\beta_k)$, + $\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. + That is the increments in iteration (\ref{omega}) are uniformly larger than + those in (\ref{theta}), thus (\ref{omega}) is the faster recursion. + Along the faster time scale, iterations of (\ref{omega}) and (\ref{theta}) + are associated to ODEs system as follows: + \begin{equation} + \dot{\theta}(t) = 0, + \label{thetaFast} + \end{equation} + \begin{equation} + \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t). + \label{omegaFast} + \end{equation} + Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when + viewed from the faster timescale. + By the Hirsch lemma \cite{hirsch1989convergent}, it follows that + $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some + $\theta$ that depends on the initial condition $\theta_0$ of recursion + (\ref{theta}). 
+ Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as + \begin{equation} + \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t). + \label{omegaFastFinal} + \end{equation} + Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$, + i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). + It is easy to find that the function $h$ is Lipschitz with coefficient + $-1$. + Let $h_{\infty}(\cdot)$ be the function defined by + $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. + Then $h_{\infty}(\omega)= -\omega$, is well-defined. + For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$ + is the unique globally asymptotically stable equilibrium. + For the ODE + \begin{equation} + \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), + \label{omegaInfty} + \end{equation} + apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its + associated strict Liapunov function. Then, + the origin of (\ref{omegaInfty}) is a globally asymptotically stable + equilibrium. + + + Consider now the recursion (\ref{omega}). + Let + $M_{k+1}=(\delta_k-\omega_k) + -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, + where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, + \begin{equation*} + \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq + c_1(1+||\omega_k||^2+||\theta_k||^2). + \end{equation*} + + + Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. + Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our + conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, + by Theorem 2.2 of \cite{borkar2000ode} we obtain that + $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. + + Consider now the slower time scale recursion (\ref{theta}). + Based on the above analysis, (\ref{theta}) can be rewritten as + \begin{equation*} + \theta_{k+1}\leftarrow + \theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k. 
+ \end{equation*} + + Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, + \begin{equation*} + \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq + c_2(1+||\theta_k||^2). + \end{equation*} + + Consider now the following ODE associated with (\ref{theta}): + \begin{equation} + \begin{array}{ccl} + \dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\ + &=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\ + &=&-A\theta(t)+b. + \end{array} + \label{odetheta} + \end{equation} + Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE + (\ref{odetheta}). + \begin{equation*} + \vec{h}(\theta(t))=-A\theta(t)+b. + \end{equation*} + Consider the cross-covariance matrix, + \begin{equation} + \begin{array}{ccl} + A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\ + &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\ + \end{array} + \label{covariance} + \end{equation} + where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ + \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to + the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or + initialized randomly in a Markov chain for on-policy update.}. + Note that the covariance matrix $\mathrm{Cov}(\phi,\phi)$ and + $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive + definite. 
Then, the matrix $A$ is semi-positive definite because $A$ is
+ a positive-weighted linear combination of two semi-positive definite matrices
+ (\ref{covariance}).
+ Furthermore, $A$ is nonsingular due to the assumption.
+ Hence, the cross-covariance matrix $A$ is positive definite.
+
+ Therefore,
+ $\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically
+ stable equilibrium for ODE (\ref{odetheta}).
+ Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow
+ \infty}\frac{\vec{h}(r\theta)}{r}$. Then
+ $\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined.
+ Consider now the ODE
+ \begin{equation}
+ \dot{\theta}(t)=-A\theta(t).
+ \label{odethetafinal}
+ \end{equation}
+ The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium.
+ Thus, assumptions (A1) and (A2) are verified.
+ \end{proof}
+
+\subsection{Proof of Corollary \ref{corollary4_2}}
+\label{proofcorollary4_2}
+The update formulas in linear two-timescale algorithms are as follows:
+\begin{equation}
+ \theta_{k+1}=\theta_{k} + \alpha_{k}[h_1(\theta_{k},\omega_{k})+M^{(1)}_{k+1}],
+\end{equation}
+\begin{equation}
+ \omega_{k+1}=\omega_{k} + \beta_{k}[h_2(\theta_{k},\omega_{k})+M^{(2)}_{k+1}],
+\end{equation}
+where $\alpha_k, \beta_k \in \mathbb{R} $ are stepsizes and $M^{(1)} \in \mathbb{R}^{d_1}, M^{(2)} \in \mathbb{R}^{d_2}$
+denote noise.
+$h_1 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{1}}$ and
+$h_2 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{2}}$ have the
+form, respectively,
+\begin{equation}
+ h_{1}(\theta,\omega)=v_1 - \Gamma_1 \theta - W_1\omega,
+\end{equation}
+\begin{equation}
+ h_{2}(\theta,\omega)=v_2 - \Gamma_2 \theta - W_2\omega,
+\end{equation}
+where $v_1 \in \mathbb{R}^{d_1}$, $v_2 \in \mathbb{R}^{d_2}$, $\Gamma_1 \in \mathbb{R}^{d_1 \times d_1}$
+, $\Gamma_2 \in \mathbb{R}^{d_2 \times d_1}$, $W_1 \in \mathbb{R}^{d_1 \times d_2}$ and
+$W_2 \in \mathbb{R}^{d_2 \times d_2}$.
$d_1$ and $d_2$ are the dimensions of vectors $\theta$ and $\omega$, respectively.
+
+For Theorem 3 in \cite{dalal2020tale}, the theorem still holds even when $d_1$ is not equal to $d_2$. For the VMTD algorithm, $d_2$ is equal to 1.
+% Before proving the Corollary \ref{corollary4_2},
+\cite{dalal2020tale} presents
+the matrix assumption, step size assumption, and
+defines sparse projection.
+
+\begin{assumption}
+\label{matrixassumption}
+(Matrix Assumption).
+$W_2$ and $X_1 = \Gamma_1 - W_1 W_{2}^{-1}\Gamma_2$ are positive definite (not necessarily symmetric).
+\end{assumption}
+
+\begin{assumption}
+ \label{stepsizeassumption}
+(Step Size Assumption).
+$\alpha_k = (k+1)^{-\alpha}$ and $\beta_k = (k+1)^{-\beta}$, where $1>\alpha > \beta > 0$.
+\end{assumption}
+
+\begin{definition}
+ \label{sparseprojection}
+(Sparse Projection).
+For $R>0$, let $\Pi_{R}(x)=\min \{1, R/||x||\}x$ be the projection of $x$ into the ball with radius
+$R$ around the origin. The sparse projection operator is
+\begin{equation*}
+ \Pi_{k, R} = \begin{cases}
+ \Pi_{R}, & \text{if } k = n^{n} - 1 \text{ for some } n \in \mathbb{Z}_{>0}, \\
+ I, & \text{otherwise}.
+ \end{cases}
+\end{equation*}
+We call it sparse as it projects only on specific indices that are exponentially far apart.
+
+Pick an arbitrary $p>1$. Fix some constants $R^{\theta}_{\text{proj}}>0$ and $R^{\omega}_{\text{proj}}>0$
+for the radius of the projection ball. Further, let
+\begin{equation*}
+ \theta^{*}=X^{-1}_{1}b_{1}, \omega^{*}=W^{-1}_{2}(v_2 - \Gamma_2 \theta^{*})
+\end{equation*}
+with $b_1=v_1 - W_1 W_2^{-1}v_2$.
+The formula for the sparse projection update in linear two-timescale algorithms is as follows: +\begin{equation} + \label{sparseprojectiontheta} + \theta'_{k+1}=\Pi_{k+1,R^{\theta}_{\text{proj}}}(\theta'_{k} + \alpha_{k}[h_1(\theta'_{k},\omega'_{k})+M^{(1')}_{k+1}]), +\end{equation} +\begin{equation} + \label{sparseprojectionomega} + \omega'_{k+1}=\Pi_{k+1,R^{\omega}_{\text{proj}}}(\omega'_{k} + \beta_{k}[h_2(\theta'_{k},\omega'_{k})+M^{(2')}_{k+1}]). +\end{equation} + +\end{definition} + +\begin{proof} + As long as the VMTD algorithm satisfies Assumption \ref{matrixassumption}, +the convergence speed of the VMTD algorithm can be +obtained. + +VMTD's update rule is +\begin{equation*} + \theta_{k+1}=\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k. +\end{equation*} +\begin{equation*} + \omega_{k+1}=\omega_{k}+\beta_k(\delta_k-\omega_k). +\end{equation*} +Thus, $h_1(\theta, \omega)=\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi - \gamma\phi')\theta$, +$h_2(\theta, \omega)=\mathbb{E}[r]+\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]\theta -\omega$, +$\Gamma_1 =\mathrm{Cov}(\phi,\phi - \gamma\phi')$, +$W_1 = 0$ and +$\Gamma_2 = -\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]$, +$W_2 = 1$, +$v_2 = \mathbb{E}[r]$. Additionally, +$X_1=\Gamma_1 - W_1 W^{-1}_2 \Gamma_2 = \mathrm{Cov}(\phi,\phi - \gamma\phi')$. +% By the Assumption \ref{matrixassumption}, +It can be deduced from the proof \ref{th1proof} that $X_1$ is a positive definite matrix. +The VMTD algorithm satisfies the Assumption \ref{matrixassumption}. +By the proof \ref{th1proof}, Definition 1 in \cite{dalal2020tale} is satisfied. +We can apply the Theorem 3 in \cite{dalal2020tale} to get the Corollary \ref{corollary4_2}. + + + + + +\end{proof} + + + + +\subsection{Proof of Theorem \ref{theorem2}} +\label{proofth2} +\begin{proof} +The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. 
+
+For the VMTDC algorithm, a new one-step linear TD solution is defined as:
+\begin{equation*}
+ 0=\mathbb{E}[(\phi - \gamma \phi' - \mathbb{E}[\phi - \gamma \phi'])\phi^\top]\mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\phi]=A^{\top}C^{-1}(-A\theta+b).
+\end{equation*}
+The matrix $A^{\top}C^{-1}A$ is positive definite. Thus, the VMTDC's solution is
+$\theta_{\text{VMTDC}}=\theta_{\text{VMTD}}=A^{-1}b$.
+
+First, note that recursions (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively,
+\begin{equation*}
+ \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k),
+\end{equation*}
+\begin{equation*}
+ u_{k+1}\leftarrow u_k+\beta_k y(k),
+\end{equation*}
+where
+\begin{equation*}
+ x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)],
+\end{equation*}
+\begin{equation*}
+ y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k.
+\end{equation*}
+
+Recursion (\ref{thetavmtdc}) can also be rewritten as
+\begin{equation*}
+ \theta_{k+1}\leftarrow \theta_k+\beta_k z(k),
+\end{equation*}
+where
+\begin{equation*}
+ z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)].
+\end{equation*}
+
+Due to the settings of step-size schedule
+$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow \infty$.
+That is, the increments in iteration (\ref{omegavmtdc}) are uniformly larger than
+those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than
+those in (\ref{thetavmtdc}); thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second-fastest recursion and (\ref{thetavmtdc}) is the slowest recursion.
+Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc})
+are associated with the following system of ODEs:
+\begin{equation}
+ \dot{\theta}(t) = 0,
+ \label{thetavmtdcFastest}
+\end{equation}
+\begin{equation}
+ \dot{u}(t) = 0,
+ \label{uvmtdcFastest}
+\end{equation}
+\begin{equation}
+ \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t).
+ \label{omegavmtdcFastest}
+\end{equation}
+
+Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$
+and $u(t)\equiv u$ when viewed from the fastest timescale.
+By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
+$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
+$\theta$ that depends on the initial condition $\theta_0$ of recursion
+(\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
+$u$ that depends on the initial condition $u_0$ of recursion
+(\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(\ref{omegavmtdcFastest})
+can be written as
+\begin{equation}
+ \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t).
+ \label{omegavmtdcFastestFinal}
+\end{equation}
+
+Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$,
+i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
+It is easy to find that the function $h$ is Lipschitz with coefficient
+$1$.
+Let $h_{\infty}(\cdot)$ be the function defined by
+ $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
+ Then $h_{\infty}(\omega)= -\omega$ is well-defined.
+ For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$
+is the unique globally asymptotically stable equilibrium.
+For the ODE
+\begin{equation}
+ \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
+ \label{omegavmtdcInfty}
+\end{equation}
+apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
+associated strict Liapunov function.
Then, +the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable +equilibrium. + +Consider now the recursion (\ref{omegavmtdc}). +Let +$M_{k+1}=(\delta_k-\omega_k) +-\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, +where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, +\begin{equation*} +\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq +c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). +\end{equation*} + + +Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. +Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our +conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, +by Theorem 2.2 of \cite{borkar2000ode} we obtain that +$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. + +Consider now the second time scale recursion (\ref{uvmtdc}). +Based on the above analysis, (\ref{uvmtdc}) can be rewritten as +% \begin{equation*} +% u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|u_k,\theta_k] - \phi^{\top} (s_k) u_k]\phi(s_k). +% \end{equation*} +\begin{equation} + \dot{\theta}(t) = 0, + \label{thetavmtdcFaster} +\end{equation} +\begin{equation} + \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|u(t),\theta(t)])\phi_t|\theta(t)] - Cu(t). + \label{uvmtdcFaster} +\end{equation} +The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter) +when viewed from the second fast timescale. +By the Hirsch lemma \cite{hirsch1989convergent}, it follows that +$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +$\theta$ that depends on the initial condition $\theta_0$ of recursion +(\ref{thetavmtdc}). + +Consider now the recursion (\ref{uvmtdc}). 
+Let +$N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k)|\mathcal{I} (k)]$, +where $\mathcal{I}(k)=\sigma(u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, +\begin{equation*} +\mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq +c_2(1+||u_k||^2+||\theta_k||^2). +\end{equation*} + +Because $\theta(t)\equiv \theta$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) +can be written as +\begin{equation} + \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\theta])\phi_t|\theta] - Cu(t). + \label{uvmtdcFasterFinal} +\end{equation} +Now consider the function $h(u)=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\theta]|\theta] -Cu$, i.e., the +driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}), +$u^* = C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta])\phi|\theta]$ is the unique globally asymptotically +stable equilibrium. Let $h_{\infty}(u)=-Cu$. +For the ODE +\begin{equation} + \dot{u}(t) = h_{\infty}(u(t))= -Cu(t), + \label{uvmtdcInfty} +\end{equation} +the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable +equilibrium because $C$ is a positive definite matrix (because it is nonnegative definite and nonsingular). +Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. +Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our +conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, +by Theorem 2.2 of \cite{borkar2000ode} we obtain that +$||u_k-u^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. + +Consider now the slower timescale recursion (\ref{thetavmtdc}). 
In the light of the above, +(\ref{thetavmtdc}) can be rewritten as +\begin{equation} + \theta_{k+1} \leftarrow \theta_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k\\ + - \alpha_k \gamma\phi'_{k}(\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k]). +\end{equation} +Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s](0,0) -- (1,1); +%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1); + %\fill (0.5,0.5) circle (0.5); + %\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7}; + + +\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06] +coordinates{(num7) (origin) (origin_above) (origin_aright)}; + +\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5] +coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)}; + +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7}; + + + +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6}; + +\draw[->,>=stealth,line width = 0.02cm] (a)--(g); +\draw[->,>=stealth,line width = 0.02cm] (b)--(g); +\draw[->,>=stealth,line width = 0.02cm] 
(c)--(g); +\draw[->,>=stealth,line width = 0.02cm] (d)--(g); +\draw[->,>=stealth,line width = 0.02cm] (e)--(g); +\draw[->,>=stealth,line width = 0.02cm] (f)--(g); +\end{tikzpicture} +} + diff --git b/main/pic/counterexample_quanju_new.pdf a/main/pic/counterexample_quanju_new.pdf new file mode 100644 index 0000000..7b39ec5 Binary files /dev/null and a/main/pic/counterexample_quanju_new.pdf differ diff --git b/main/pic/cw_complete.pdf a/main/pic/cw_complete.pdf new file mode 100644 index 0000000..b80dd74 Binary files /dev/null and a/main/pic/cw_complete.pdf differ diff --git b/main/pic/dependent_new.pdf a/main/pic/dependent_new.pdf new file mode 100644 index 0000000..f7f34ce Binary files /dev/null and a/main/pic/dependent_new.pdf differ diff --git b/main/pic/inverted_new.pdf a/main/pic/inverted_new.pdf new file mode 100644 index 0000000..b8e4bc2 Binary files /dev/null and a/main/pic/inverted_new.pdf differ diff --git b/main/pic/maze_13_13.pdf a/main/pic/maze_13_13.pdf new file mode 100644 index 0000000..cda62be Binary files /dev/null and a/main/pic/maze_13_13.pdf differ diff --git b/main/pic/maze_complete.pdf a/main/pic/maze_complete.pdf new file mode 100644 index 0000000..6757bdb Binary files /dev/null and a/main/pic/maze_complete.pdf differ diff --git b/main/pic/mt_complete.pdf a/main/pic/mt_complete.pdf new file mode 100644 index 0000000..aa554fb Binary files /dev/null and a/main/pic/mt_complete.pdf differ diff --git b/main/pic/randomwalk.tex a/main/pic/randomwalk.tex new file mode 100644 index 0000000..de9bf55 --- /dev/null +++ a/main/pic/randomwalk.tex @@ -0,0 +1,62 @@ + +% \tikzstyle{int}=[draw, fill=blue!20, minimum size=2em] +% \tikzstyle{block}=[draw, fill=gray, minimum size=1.5em] +% \tikzstyle{init} = [pin edge={to-,thin,black}] +% \resizebox{8cm}{1.2cm}{ +% \begin{tikzpicture}[node distance=1.5cm,auto,>=latex'] +% \node [block] (o) {}; +% \node (p) [left of=o,node distance=0.5cm, coordinate] {o}; +% \node [shape=circle,int] (a) [right of=o]{$A$}; +% \node 
(b) [left of=a,node distance=1.5cm, coordinate] {a}; +% \node [shape=circle,int] (c) [right of=a] {$B$}; +% \node (d) [left of=c,node distance=1.5cm, coordinate] {c}; +% \node [shape=circle,int, pin={[init]above:$$}] (e) [right of=c]{$C$}; +% \node (f) [left of=e,node distance=1.5cm, coordinate] {e}; +% \node [shape=circle,int] (g) [right of=e] {$D$}; +% \node (h) [left of=g,node distance=1.5cm, coordinate] {g}; +% \node [shape=circle,int] (i) [right of=g] {$E$}; +% \node (j) [left of=i,node distance=1.5cm, coordinate] {i}; +% \node [block] (k) [right of=i] {}; +% \node (l) [left of=k,node distance=0.5cm, coordinate] {k}; + +% \path[<-] (o) edge node {$0$} (a); +% \path[<->] (a) edge node {$0$} (c); +% \path[<->] (c) edge node {$0$} (e); +% \path[<->] (e) edge node {$0$} (g); +% \path[<->] (g) edge node {$0$} (i); +% \draw[->] (i) edge node {$1$} (k); +% \end{tikzpicture} +% } +\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em] +\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em] +\tikzstyle{init} = [pin edge={to-,thin,black}] + +\resizebox{6cm}{1cm}{ + \begin{tikzpicture}[node distance=1.5cm, auto, >=latex] + \node [block] (o) {}; + \node (p) [left of=o, node distance=0.5cm, coordinate] {o}; + \node [shape=circle, int] (a) [right of=o] {$A$}; + \node (b) [left of=a, node distance=1.5cm, coordinate] {a}; + \node [shape=circle, int] (c) [right of=a] {$B$}; + \node (d) [left of=c, node distance=1.5cm, coordinate] {c}; + \node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$}; + \node (f) [left of=e, node distance=1.5cm, coordinate] {e}; + \node [shape=circle, int] (g) [right of=e] {$D$}; + \node (h) [left of=g, node distance=1.5cm, coordinate] {g}; + \node [shape=circle, int] (i) [right of=g] {$E$}; + \node (j) [left of=i, node distance=1.5cm, coordinate] {i}; + \node [block] (k) [right of=i] {}; + \node (l) [left of=k, node distance=0.5cm, coordinate] {k}; + + \path[->] (o) edge node {$0$} (a); + \path[<->] (a) edge node {$0$} (c); + 
\path[<->] (c) edge node {$0$} (e); + \path[<->] (e) edge node {$0$} (g); + \path[<->] (g) edge node {$0$} (i); + \draw[->] (i) edge node {$1$} (k); + \end{tikzpicture} +} + + + + \ No newline at end of file diff --git b/main/pic/tabular_new.pdf a/main/pic/tabular_new.pdf new file mode 100644 index 0000000..32bc90b Binary files /dev/null and a/main/pic/tabular_new.pdf differ diff --git b/main/preliminaries.tex a/main/preliminaries.tex new file mode 100644 index 0000000..4015346 --- /dev/null +++ a/main/preliminaries.tex @@ -0,0 +1,55 @@ +\section{Background} +\label{preliminaries} +Reinforcement learning agent interacts with environment, observes state, + takes sequential decision makings to influence environment, and obtains + rewards. + Consider an infinite-horizon discounted + Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma + \rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$ + is a finite set of actions of the agent; + $R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward + function; $P:S\times A\times S \rightarrow [0,1]$ is the transition + probability distribution; and $\gamma\in (0,1)$ + is the discount factor \cite{Sutton2018book}. + Due to the requirements of online learning, value iteration based on sampling + is considered in this paper. + In each sampling, an experience (or transition) $\langle s, a, s', r\rangle$ is + obtained. + + A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the + agent is to find an optimal policy $\pi^*$ to maximize the expectation of a + discounted cumulative rewards in a long period. + State value function $V^{\pi}(s)$ for a stationary policy $\pi$ is + defined as: + \begin{equation*} + V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s]. 
+ \label{valuefunction} + \end{equation*} + Linear value function for state $s\in S$ is defined as: + \begin{equation} + V_{{\theta}}(s):= {\theta}^{\top}{\phi}(s) = \sum_{i=1}^{m} + \theta_i \phi_i(s), + \label{linearvaluefunction} + \end{equation} + where ${\theta}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in + \mathbb{R}^m$ is a parameter vector, + ${\phi}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature + function defined on state space $S$, and $m$ is the feature size. + + Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems. + To deal with the well-known curse of dimensionality of large scale MDPs, value + function is usually approximated by a linear model, kernel methods, decision + trees, or neural networks, etc. This paper focuses on the linear model, where + features are usually hand coded by domain experts. + +TD learning can also be used to find optimal strategies. The problem of finding an optimal policy is +often called the control problem. Two popular TD methods are Sarsa and Q-learning. The former is an on-policy +TD control, while the latter is an off-policy control. + +It is well known that TDC algorithm \cite{sutton2009fast} guarantees +convergence under off-policy conditions while the off-policy TD algorithm may diverge. The +objective function of TDC is MSPBE. +TDC is essentially an adjustment or correction of the TD update so that it +follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm +is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted +as GQ(0). 
\ No newline at end of file diff --git b/main/relatedwork.tex a/main/relatedwork.tex new file mode 100644 index 0000000..99b39f6 --- /dev/null +++ a/main/relatedwork.tex @@ -0,0 +1,95 @@ +\begin{figure*}[htb] + \vskip 0.2in + \begin{center} + \subfigure[Maze]{ + \includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/maze_complete.pdf} + \label{MazeFull} + } + \subfigure[Cliff Walking]{ + \includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/cw_complete.pdf} + \label{CliffWalkingFull} + } + \\ + \subfigure[Mountain Car]{ + \includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/mt_complete.pdf} + \label{MountainCarFull} + } + \subfigure[Acrobot]{ + \includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/Acrobot_complete.pdf} + \label{AcrobotFull} + } + \caption{Learning curves of four control environments.} + \label{Complete_full} + \end{center} + \vskip -0.2in +\end{figure*} + +\section{Related Work} +\subsection{Difference between VMQ and R-learning} +\begin{table*}[htb] + \centering + \caption{Difference between R-learning and tabular VMQ.} + \vskip 0.15in + \begin{tabular}{c|cc} + \hline + algorithms&update formula \\ + \hline + R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\ + &$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\ + tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\ + &$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\ + \hline + \end{tabular} + \label{differenceRandVMQ} + \vskip -0.1in +\end{table*} +Tabular VMQ's update formula bears some resemblance +to R-learning's update formula. 
As shown in Table \ref{differenceRandVMQ}, the update formulas of the two algorithms have the following differences: +\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average +reward, rather than the cumulative reward, by learning an estimate +of the average reward. This estimate $m$ is then used to update the Q-values. +On the contrary, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$. +\\(2) When $\gamma=1$ in the tabular VMQ update formula, the +R-learning update formula is formally +the same as the tabular VMQ update formula. +Therefore, R-learning algorithm can be +considered as a special case of VMQ algorithm in form. + +\subsection{Variance Reduction for TD Learning} + The TD with centering algorithm (CTD) \cite{korda2015td} +was proposed, which directly applies variance reduction techniques to +the TD algorithm. The CTD algorithm updates its parameters using the +average gradient of a batch of Markovian samples and a projection operator. +Unfortunately, the authors’ analysis of the CTD algorithm contains technical +errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm that updates +its parameters using the average gradient of a batch of i.i.d. samples. The +authors of VRTD provide a technically sound analysis to demonstrate the +advantages of variance reduction. + +\subsection{Variance Reduction for Policy Gradient Algorithms} +Policy gradient algorithms are a class of reinforcement +learning algorithms that directly optimize cumulative rewards. +REINFORCE is a Monte Carlo algorithm that estimates +gradients through sampling, but may have a high variance. +Baselines are introduced to reduce variance and to +accelerate learning \cite{Sutton2018book}. In Actor-Critic, +value function as a baseline and bootstrapping + are used to reduce variance, also accelerating convergence \cite{Sutton2018book}. 
+ TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal} + use generalized advantage +estimation, which combines multi-step bootstrapping and Monte Carlo +estimation to reduce variance, making gradient estimation more stable and +accelerating convergence. + +In Variance Minimization, +the incorporation of $\omega \doteq \mathbb{E}[\delta]$ +bears a striking resemblance to the use of a baseline +in policy gradient methods. The introduction of a baseline +in policy gradient techniques does not alter +the expected value of the update; +rather, it significantly impacts the variance of gradient estimation. +The addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization + preserves the invariance of the optimal +policy while stabilizing gradient estimation, +reducing the variance of gradient estimation, +and hastening convergence. \ No newline at end of file diff --git b/main/theory.tex a/main/theory.tex new file mode 100644 index 0000000..0454189 --- /dev/null +++ a/main/theory.tex @@ -0,0 +1,85 @@ +\section{Theoretical Analysis} +The purpose of this section is to establish the stabilities of the VMTD algorithm +and the VMTDC algorithm, and also presents a corollary on the convergence rate of VMTD. + +\begin{theorem} + \label{theorem1}(Convergence of VMTD). + In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD. + Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, + $ + \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, + $ + $ + \sum_{k=0}^{\infty}\alpha_k^2<\infty, + $ + $ + \sum_{k=0}^{\infty}\beta_k^2<\infty, + $ + and + $ + \alpha_k = o(\beta_k). + $ + Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with + uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain. + Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, + $b=\mathrm{Cov}(r,\phi)$. 
+ Assume that matrix $A$ is non-singular. + Then the parameter vector $\theta_k$ converges with probability one + to $A^{-1}b$. +\end{theorem} + +Please refer to the appendix \ref{proofth1} for detailed proof process. + +Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale +algorithms. VMTD satisfies the assumptions of this theorem, leading +to the following corollary. +\begin{corollary} + \label{corollary4_2} +Consider the Sparsely Projected variant of VMTD. Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$, +$0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have +\begin{equation} + ||\theta'_{k} - \theta^{*}|| \le C_{3,\theta} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}} +\end{equation} +\begin{equation} + ||\omega'_{n} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\beta / 2}}, +\end{equation} +\end{corollary} + +where $d_1$ and $d_2$ represent the dimensions of $\theta$ and $\omega$, respectively. For VMTD, $d_2 =1$. +The meanings of $N_3$, $C_{3,\theta}$ and $C_{3,\omega}$ are explained in \cite{dalal2020tale}. +The formulas for $\theta'_{k}$ and $\omega'_{n}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}). + +Please refer to the appendix \ref{proofcorollary4_2} for detailed proof process. + +\begin{theorem} + \label{theorem2}(Convergence of VMTDC). + In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC. 
+ Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\zeta_k,\beta_k>0$, for all $k$, + $ + \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty, + $ + $ + \sum_{k=0}^{\infty}\alpha_k^2<\infty, + $ + $ + \sum_{k=0}^{\infty}\zeta_k^2<\infty, + $ + $ + \sum_{k=0}^{\infty}\beta_k^2<\infty, + $ + and + $ + \alpha_k = o(\zeta_k), + $ + $ + \zeta_k = o(\beta_k). + $ + Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with + uniformly bounded second moments. + Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, + $b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$. + Assume that $A$ and $C$ are non-singular matrices. + Then the parameter vector $\theta_k$ converges with probability one + to $A^{-1}b$. +\end{theorem} +Please refer to the appendix \ref{proofth2} for detailed proof process. \ No newline at end of file diff --git b/named.bst a/named.bst new file mode 100644 index 0000000..78b6bb6 --- /dev/null +++ a/named.bst @@ -0,0 +1,1287 @@ +%NAME: named.bst +% BibTeX `named' style file for BibTeX version 0.99c, LaTeX version 2.09 +% Place it in a file called named.bst in the BibTeX search path. (Placing it +% in the same directory as the LaTeX document should also work.) +% Support for named citations is provided by named.sty + +% This version was made by modifying the master file made by +% Oren Patashnik (PATASHNIK@SCORE.STANFORD.EDU) + +% Copyright (C) 1985, all rights reserved. +% Modifications Copyright 1988, Peter F. Patel-Schneider +% Copying of this file is authorized only if either +% (1) you make absolutely no changes to your copy, including name, or +% (2) if you do make changes, you name it something other than +% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. +% This restriction helps ensure that all standard styles are identical. + +% There are undoubtably bugs in this style. If you make bug fixes, +% improvements, etc. please let me know. 
My e-mail address is: +% pfps@research.att.com + +% Citation format: [author-last-name, year] +% [author-last-name and author-last-name, year] +% [author-last-name {\em et al.}, year] +% +% Reference list ordering: alphabetical by author or whatever passes +% for author in the absence of one. +% +% This BibTeX style has support for short (year only) citations. This +% is done by having the citations actually look like +% \citeauthoryear{author-info}{year} +% The LaTeX style has to have the following (or similar) +% \let\@internalcite\cite +% \def\cite{\def\citeauthoryear##1##2{##1, ##2}\@internalcite} +% \def\shortcite{\def\citeauthoryear##1{##2}\@internalcite} +% \def\@biblabel#1{\def\citeauthoryear##1##2{##1, ##2}[#1]\hfill} +% which makes \shortcite the macro for short citations. + +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label extra.label sort.label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + + "\bibitem[" write$ + label write$ + "]{" write$ + + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION 
{fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + + { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := + + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." 
* } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + + { title "t" change.case$ } + + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" 
change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "In " booktitle emphasize * } + { "In " format.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + + key empty$ not and + + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to 
crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \shortcite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \shortcite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \shortcite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor 
either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + 
format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" 
output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions 
on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +INTEGERS { et.al.char.used } + +FUNCTION {initialize.et.al.char.used} +{ #0 'et.al.char.used := +} + +EXECUTE {initialize.et.al.char.used} + +FUNCTION {format.lab.names} +{ 's := + s num.names$ 'numnames := + + numnames #1 = + { s #1 "{vv }{ll}" format.name$ } + { numnames #2 = + { s #1 "{vv }{ll }and " format.name$ s #2 "{vv }{ll}" format.name$ * + } + { s #1 "{vv }{ll }\bgroup \em et al.\egroup " format.name$ } + if$ + } + if$ + +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + + { cite$ #1 #3 substring$ } + + { key } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + + { cite$ #1 #3 substring$ } + + { key } + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.key.organization.label} +{ author empty$ + { key empty$ + { organization empty$ + + { cite$ #1 #3 substring$ } + + { "The " #4 organization chop.word #3 text.prefix$ } + if$ + } + { key } + if$ + } + { author format.lab.names } + if$ +} + 
+FUNCTION {editor.key.organization.label} +{ editor empty$ + { key empty$ + { organization empty$ + + { cite$ #1 #3 substring$ } + + { "The " #4 organization chop.word #3 text.prefix$ } + if$ + } + { key } + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.label} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.organization.label + { type$ "manual" = + 'author.key.organization.label + 'author.key.label + if$ + } + if$ + } + if$ + duplicate$ + + "\protect\citeauthoryear{" swap$ * "}{" * + year field.or.null purify$ * % CHANGED - pfps - 15 Feb 1989 + 'label := + year field.or.null purify$ * + + sortify 'sort.label := +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + + s nameptr "{vv{ } }{ll{ }}{ ff{ }}{ jj{ }}" format.name$ 't := + + nameptr numnames = t "others" = and + { "et al" * } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } 
+ if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} + +{ calc.label + sort.label + " " + * + type$ "book" = + + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + + * + + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label last.sort.label next.extra } + +INTEGERS { longest.label.width last.extra.num } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #0 int.to.chr$ 'last.sort.label := + "" 'next.extra := + #0 'longest.label.width := + #0 'last.extra.num := +} + +FUNCTION {forward.pass} +{ last.sort.label sort.label = + { last.extra.num #1 + 'last.extra.num := + last.extra.num int.to.chr$ 'extra.label := + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + sort.label 'last.sort.label := + } + if$ +} + +FUNCTION {reverse.pass} +{ next.extra "b" = + { "a" 'extra.label := } + 'skip$ + if$ + label extra.label * "}" * 'label := % CHANGED - pfps 15 Feb 1989 + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ + extra.label 'next.extra := +} + +EXECUTE {initialize.longest.label} + +ITERATE {forward.pass} + +REVERSE {reverse.pass} + +FUNCTION {begin.bib} + +{ et.al.char.used + { "\newcommand{\etalchar}[1]{$^{#1}$}" write$ newline$ } + 'skip$ + if$ + preamble$ empty$ + + 'skip$ + { preamble$ write$ newline$ } + if$ + + "\begin{thebibliography}{}" write$ newline$ + +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + 
+ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} diff --git b/neurips_2024.aux a/neurips_2024.aux new file mode 100644 index 0000000..14357af --- /dev/null +++ a/neurips_2024.aux @@ -0,0 +1,219 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} +\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined +\global\let\oldnewlabel\newlabel +\gdef\newlabel#1#2{\newlabelxx{#1}#2} +\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} +\AtEndDocument{\ifx\hyper@anchor\@undefined +\let\newlabel\oldnewlabel +\fi} +\fi} +\global\let\hyper@last\relax +\gdef\HyperFirstAtBeginDocument#1{#1} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\citation{sutton1988learning} +\citation{tsitsiklis1997analysis} +\citation{Sutton2018book} +\citation{baird1995residual} +\citation{sutton2008convergent} +\citation{sutton2009fast} +\citation{sutton2016emphatic} +\citation{chen2023modified} +\citation{hackman2012faster} +\citation{liu2015finite,liu2016proximal,liu2018proximal} +\citation{givchi2015quasi} +\citation{pan2017accelerated} +\citation{hallak2016generalized} +\citation{zhang2022truncated} +\citation{johnson2013accelerating} +\citation{korda2015td} +\citation{xu2019reanalysis} +\citation{Sutton2018book} +\citation{baird1995residual} +\citation{sutton2009fast} +\citation{sutton2009fast} +\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent } +\newlabel{introduction}{{1}{1}{Introduction}{section.1}{}} +\citation{feng2019kernel} +\citation{basserrano2021logistic} +\citation{Sutton2018book} +\citation{Sutton2018book} +\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}{section.2}\protected@file@percent } +\newlabel{preliminaries}{{2}{2}{Background}{section.2}{}} +\newlabel{valuefunction}{{2}{2}{Background}{section.2}{}} 
+\newlabel{linearvaluefunction}{{1}{2}{Background}{equation.2.1}{}} +\citation{sutton2009fast} +\citation{sutton2009fast} +\citation{ng1999policy} +\citation{devlin2012dynamic} +\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Classification accuracies for naive Bayes and flexible Bayes on various data sets.}}{3}{table.1}\protected@file@percent } +\newlabel{example_bias}{{1}{3}{Classification accuracies for naive Bayes and flexible Bayes on various data sets}{table.1}{}} +\@writefile{toc}{\contentsline {section}{\numberline {3}Variance Minimization Algorithms}{3}{section.3}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Motivation}{3}{subsection.3.1}\protected@file@percent } +\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces VMTD algorithm with linear function approximation in the on-policy setting}}{4}{algorithm.1}\protected@file@percent } +\newlabel{alg:algorithm 1}{{1}{4}{Variance Minimization TD Learning: VMTD}{algorithm.1}{}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Variance Minimization TD Learning: VMTD}{4}{subsection.3.2}\protected@file@percent } +\newlabel{omega}{{3}{4}{Variance Minimization TD Learning: VMTD}{equation.3.3}{}} +\newlabel{delta}{{4}{4}{Variance Minimization TD Learning: VMTD}{equation.3.4}{}} +\newlabel{theta}{{5}{4}{Variance Minimization TD Learning: VMTD}{equation.3.5}{}} +\newlabel{deltaSarsa}{{8}{4}{Variance Minimization TD Learning: VMTD}{equation.3.8}{}} +\newlabel{deltaQ}{{9}{4}{Variance Minimization TD Learning: VMTD}{equation.3.9}{}} +\citation{dalal2020tale} +\citation{dalal2020tale} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Variance Minimization TDC Learning: VMTDC}{5}{subsection.3.3}\protected@file@percent } +\newlabel{thetavmtdc}{{11}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.11}{}} +\newlabel{uvmtdc}{{12}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.12}{}} 
+\newlabel{omegavmtdc}{{13}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.13}{}} +\@writefile{toc}{\contentsline {section}{\numberline {4}Theoretical Analysis}{5}{section.4}\protected@file@percent } +\newlabel{theorem1}{{4.1}{5}{}{theorem.4.1}{}} +\newlabel{corollary4_2}{{4.2}{5}{}{theorem.4.2}{}} +\citation{Sutton2018book} +\citation{sutton2009fast} +\citation{baird1995residual,sutton2009fast} +\citation{baird1995residual,sutton2009fast,maei2011gradient} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Random walk.}}{6}{figure.1}\protected@file@percent } +\newlabel{randomwalk}{{1}{6}{Random walk}{figure.1}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces 7-state version of Baird's off-policy counterexample.}}{6}{figure.2}\protected@file@percent } +\newlabel{bairdexample}{{2}{6}{7-state version of Baird's off-policy counterexample}{figure.2}{}} +\newlabel{theorem2}{{4.3}{6}{}{theorem.4.3}{}} +\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Studies}{6}{section.5}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Testing Tasks}{6}{subsection.5.1}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Experimental Results and Analysis}{7}{subsection.5.2}\protected@file@percent } +\newlabel{DependentFull}{{3(a)}{7}{Subfigure 3(a)}{subfigure.3.1}{}} +\newlabel{sub@DependentFull}{{(a)}{7}{Subfigure 3(a)\relax }{subfigure.3.1}{}} +\newlabel{TabularFull}{{3(b)}{7}{Subfigure 3(b)}{subfigure.3.2}{}} +\newlabel{sub@TabularFull}{{(b)}{7}{Subfigure 3(b)\relax }{subfigure.3.2}{}} +\newlabel{InvertedFull}{{3(c)}{7}{Subfigure 3(c)}{subfigure.3.3}{}} +\newlabel{sub@InvertedFull}{{(c)}{7}{Subfigure 3(c)\relax }{subfigure.3.3}{}} +\newlabel{CounterExampleFull}{{3(d)}{7}{Subfigure 3(d)}{subfigure.3.4}{}} +\newlabel{sub@CounterExampleFull}{{(d)}{7}{Subfigure 3(d)\relax }{subfigure.3.4}{}} +\@writefile{lof}{\contentsline {figure}{\numberline 
{3}{\ignorespaces Learning curves of four evaluation environments.}}{7}{figure.3}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Dependent}}}{7}{figure.3}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Tabular}}}{7}{figure.3}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Inverted}}}{7}{figure.3}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {counterexample}}}{7}{figure.3}\protected@file@percent }
+\newlabel{Evaluation_full}{{3}{7}{Learning curves of four evaluation environments}{figure.3}{}}
+\citation{schwartz1993reinforcement}
+\newlabel{MazeFull}{{4(a)}{8}{Subfigure 4(a)}{subfigure.4.1}{}}
+\newlabel{sub@MazeFull}{{(a)}{8}{Subfigure 4(a)\relax }{subfigure.4.1}{}}
+\newlabel{CliffWalkingFull}{{4(b)}{8}{Subfigure 4(b)}{subfigure.4.2}{}}
+\newlabel{sub@CliffWalkingFull}{{(b)}{8}{Subfigure 4(b)\relax }{subfigure.4.2}{}}
+\newlabel{MountainCarFull}{{4(c)}{8}{Subfigure 4(c)}{subfigure.4.3}{}}
+\newlabel{sub@MountainCarFull}{{(c)}{8}{Subfigure 4(c)\relax }{subfigure.4.3}{}}
+\newlabel{AcrobotFull}{{4(d)}{8}{Subfigure 4(d)}{subfigure.4.4}{}}
+\newlabel{sub@AcrobotFull}{{(d)}{8}{Subfigure 4(d)\relax }{subfigure.4.4}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Learning curves of four control environments.}}{8}{figure.4}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Maze}}}{8}{figure.4}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Cliff Walking}}}{8}{figure.4}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Mountain Car}}}{8}{figure.4}\protected@file@percent }
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces 
{Acrobot}}}{8}{figure.4}\protected@file@percent }
+\newlabel{Complete_full}{{4}{8}{Learning curves of four control environments}{figure.4}{}}
+\@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{8}{section.6}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Difference between VMQ and R-learning}{8}{subsection.6.1}\protected@file@percent }
+\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Difference between R-learning and tabular VMQ.}}{8}{table.2}\protected@file@percent }
+\newlabel{differenceRandVMQ}{{2}{8}{Difference between R-learning and tabular VMQ}{table.2}{}}
+\citation{korda2015td}
+\citation{xu2020reanalysis}
+\citation{Sutton2018book}
+\citation{Sutton2018book}
+\citation{schulman2015trust}
+\citation{schulman2017proximal}
+\citation{borkar1997stochastic}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Variance Reduction for TD Learning}{9}{subsection.6.2}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Variance Reduction for Policy Gradient Algorithms}{9}{subsection.6.3}\protected@file@percent }
+\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion and Future Work}{9}{section.7}\protected@file@percent }
+\@writefile{toc}{\contentsline {section}{\numberline {A}Relevant proofs}{9}{appendix.A}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {A.1}Proof of Theorem \ref {theorem1}}{9}{subsection.A.1}\protected@file@percent }
+\newlabel{proofth1}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}}
+\newlabel{th1proof}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}}
+\citation{hirsch1989convergent}
+\citation{borkar2000ode}
+\citation{borkar2000ode}
+\citation{borkar2000ode}
+\newlabel{thetaFast}{{19}{10}{Proof of Theorem \ref {theorem1}}{equation.A.19}{}}
+\newlabel{omegaFast}{{20}{10}{Proof of Theorem \ref {theorem1}}{equation.A.20}{}} 
+\newlabel{omegaFastFinal}{{21}{10}{Proof of Theorem \ref {theorem1}}{equation.A.21}{}} +\newlabel{omegaInfty}{{22}{10}{Proof of Theorem \ref {theorem1}}{equation.A.22}{}} +\newlabel{odetheta}{{23}{10}{Proof of Theorem \ref {theorem1}}{equation.A.23}{}} +\citation{dalal2020tale} +\citation{dalal2020tale} +\newlabel{covariance}{{24}{11}{Proof of Theorem \ref {theorem1}}{equation.A.24}{}} +\newlabel{odethetafinal}{{25}{11}{Proof of Theorem \ref {theorem1}}{equation.A.25}{}} +\@writefile{toc}{\contentsline {subsection}{\numberline {A.2}Proof of Corollary \ref {corollary4_2}}{11}{subsection.A.2}\protected@file@percent } +\newlabel{proofcorollary4_2}{{A.2}{11}{Proof of Corollary \ref {corollary4_2}}{subsection.A.2}{}} +\newlabel{matrixassumption}{{A.1}{11}{}{theorem.A.1}{}} +\newlabel{stepsizeassumption}{{A.2}{11}{}{theorem.A.2}{}} +\newlabel{sparseprojection}{{A.3}{11}{}{theorem.A.3}{}} +\citation{dalal2020tale} +\citation{dalal2020tale} +\citation{sutton2009fast} +\citation{hirsch1989convergent} +\newlabel{sparseprojectiontheta}{{30}{12}{}{equation.A.30}{}} +\newlabel{sparseprojectionomega}{{31}{12}{}{equation.A.31}{}} +\@writefile{toc}{\contentsline {subsection}{\numberline {A.3}Proof of Theorem \ref {theorem2}}{12}{subsection.A.3}\protected@file@percent } +\newlabel{proofth2}{{A.3}{12}{Proof of Theorem \ref {theorem2}}{subsection.A.3}{}} +\newlabel{thetavmtdcFastest}{{32}{12}{Proof of Theorem \ref {theorem2}}{equation.A.32}{}} +\newlabel{uvmtdcFastest}{{33}{12}{Proof of Theorem \ref {theorem2}}{equation.A.33}{}} +\newlabel{omegavmtdcFastest}{{34}{12}{Proof of Theorem \ref {theorem2}}{equation.A.34}{}} +\citation{borkar2000ode} +\citation{borkar2000ode} +\citation{borkar2000ode} +\citation{hirsch1989convergent} +\citation{borkar2000ode} +\citation{borkar2000ode} +\citation{borkar2000ode} +\newlabel{omegavmtdcFastestFinal}{{35}{13}{Proof of Theorem \ref {theorem2}}{equation.A.35}{}} +\newlabel{omegavmtdcInfty}{{36}{13}{Proof of Theorem \ref 
{theorem2}}{equation.A.36}{}} +\newlabel{thetavmtdcFaster}{{37}{13}{Proof of Theorem \ref {theorem2}}{equation.A.37}{}} +\newlabel{uvmtdcFaster}{{38}{13}{Proof of Theorem \ref {theorem2}}{equation.A.38}{}} +\newlabel{uvmtdcFasterFinal}{{39}{13}{Proof of Theorem \ref {theorem2}}{equation.A.39}{}} +\newlabel{uvmtdcInfty}{{40}{13}{Proof of Theorem \ref {theorem2}}{equation.A.40}{}} +\newlabel{thetavmtdcSlowerFinal}{{42}{14}{Proof of Theorem \ref {theorem2}}{equation.A.42}{}} +\newlabel{odethetavmtdcfinal}{{43}{14}{Proof of Theorem \ref {theorem2}}{equation.A.43}{}} +\@writefile{toc}{\contentsline {section}{\numberline {B}Experimental details}{14}{appendix.B}\protected@file@percent } +\newlabel{experimentaldetails}{{B}{14}{Experimental details}{appendix.B}{}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces VMTDC algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.2}\protected@file@percent } +\newlabel{alg:algorithm 2}{{2}{15}{Proof of Theorem \ref {theorem2}}{algorithm.2}{}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces VMGTD algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.3}\protected@file@percent } +\newlabel{alg:algorithm 3}{{3}{15}{Proof of Theorem \ref {theorem2}}{algorithm.3}{}} +\bibstyle{named} +\bibdata{neurips_2024} +\bibcite{baird1995residual}{{1}{1995}{{Baird and others}}{{}}} +\bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano \bgroup \em et al.\egroup }}{{}}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces VMGTD2 algorithm with linear function approximation in the off-policy setting}}{16}{algorithm.4}\protected@file@percent } +\newlabel{alg:algorithm 4}{{4}{16}{Proof of Theorem \ref {theorem2}}{algorithm.4}{}} +\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Learning rates ($lr$) of four control experiments.}}{16}{table.3}\protected@file@percent } 
+\newlabel{lrofways}{{3}{16}{Learning rates ($lr$) of four control experiments}{table.3}{}} +\bibcite{borkar2000ode}{{3}{2000}{{Borkar and Meyn}}{{}}} +\bibcite{borkar1997stochastic}{{4}{1997}{{Borkar}}{{}}} +\bibcite{chen2023modified}{{5}{2023}{{Chen \bgroup \em et al.\egroup }}{{}}} +\bibcite{dalal2020tale}{{6}{2020}{{Dalal \bgroup \em et al.\egroup }}{{}}} +\bibcite{devlin2012dynamic}{{7}{2012}{{Devlin and Kudenko}}{{}}} +\bibcite{feng2019kernel}{{8}{2019}{{Feng \bgroup \em et al.\egroup }}{{}}} +\bibcite{givchi2015quasi}{{9}{2015}{{Givchi and Palhang}}{{}}} +\bibcite{hackman2012faster}{{10}{2012}{{Hackman}}{{}}} +\bibcite{hallak2016generalized}{{11}{2016}{{Hallak \bgroup \em et al.\egroup }}{{}}} +\bibcite{hirsch1989convergent}{{12}{1989}{{Hirsch}}{{}}} +\bibcite{johnson2013accelerating}{{13}{2013}{{Johnson and Zhang}}{{}}} +\bibcite{korda2015td}{{14}{2015}{{Korda and La}}{{}}} +\bibcite{liu2015finite}{{15}{2015}{{Liu \bgroup \em et al.\egroup }}{{}}} +\bibcite{liu2016proximal}{{16}{2016}{{Liu \bgroup \em et al.\egroup }}{{}}} +\bibcite{liu2018proximal}{{17}{2018}{{Liu \bgroup \em et al.\egroup }}{{}}} +\bibcite{maei2011gradient}{{18}{2011}{{Maei}}{{}}} +\bibcite{ng1999policy}{{19}{1999}{{Ng \bgroup \em et al.\egroup }}{{}}} +\bibcite{pan2017accelerated}{{20}{2017}{{Pan \bgroup \em et al.\egroup }}{{}}} +\bibcite{schulman2015trust}{{21}{2015}{{Schulman \bgroup \em et al.\egroup }}{{}}} +\bibcite{schulman2017proximal}{{22}{2017}{{Schulman \bgroup \em et al.\egroup }}{{}}} +\bibcite{schwartz1993reinforcement}{{23}{1993}{{Schwartz}}{{}}} +\bibcite{Sutton2018book}{{24}{2018}{{Sutton and Barto}}{{}}} +\bibcite{sutton2008convergent}{{25}{2008}{{Sutton \bgroup \em et al.\egroup }}{{}}} +\bibcite{sutton2009fast}{{26}{2009}{{Sutton \bgroup \em et al.\egroup }}{{}}} +\bibcite{sutton2016emphatic}{{27}{2016}{{Sutton \bgroup \em et al.\egroup }}{{}}} +\bibcite{sutton1988learning}{{28}{1988}{{Sutton}}{{}}} +\bibcite{tsitsiklis1997analysis}{{29}{1997}{{Tsitsiklis and 
Van~Roy}}{{}}} +\bibcite{xu2019reanalysis}{{30}{2019}{{Xu \bgroup \em et al.\egroup }}{{}}} +\bibcite{xu2020reanalysis}{{31}{2020}{{Xu \bgroup \em et al.\egroup }}{{}}} +\bibcite{zhang2022truncated}{{32}{2022}{{Zhang and Whiteson}}{{}}} +\gdef \@abspage@last{18} diff --git b/neurips_2024.bbl a/neurips_2024.bbl new file mode 100644 index 0000000..a64724d --- /dev/null +++ a/neurips_2024.bbl @@ -0,0 +1,163 @@ +\begin{thebibliography}{} + +\bibitem[\protect\citeauthoryear{Baird and others}{1995}]{baird1995residual} +Leemon Baird et~al. +\newblock Residual algorithms: Reinforcement learning with function approximation. +\newblock In {\em Proc. 12th Int. Conf. Mach. Learn.}, pages 30--37, 1995. + +\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic} +Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu. +\newblock Logistic q-learning. +\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021. + +\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode} +Vivek~S Borkar and Sean~P Meyn. +\newblock The ode method for convergence of stochastic approximation and reinforcement learning. +\newblock {\em SIAM J. Control Optim.}, 38(2):447--469, 2000. + +\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic} +Vivek~S Borkar. +\newblock Stochastic approximation with two time scales. +\newblock {\em Syst. \& Control Letters}, 29(5):291--294, 1997. + +\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified} +Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao. +\newblock Modified retrace for off-policy temporal difference learning. +\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023. + +\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale} +Gal Dalal, Balazs Szorenyi, and Gugan Thoppe. 
+\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound. +\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020. + +\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic} +Sam Devlin and Daniel Kudenko. +\newblock Dynamic potential-based reward shaping. +\newblock In {\em Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, pages 433--440, 2012. + +\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel} +Yihao Feng, Lihong Li, and Qiang Liu. +\newblock A kernel loss for solving the bellman equation. +\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019. + +\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi} +Arash Givchi and Maziar Palhang. +\newblock Quasi newton temporal difference learning. +\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015. + +\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster} +Leah Hackman. +\newblock {\em Faster Gradient-TD Algorithms}. +\newblock PhD thesis, University of Alberta, 2012. + +\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized} +Assaf Hallak, Aviv Tamar, Remi Munos, and Shie Mannor. +\newblock Generalized emphatic temporal difference learning: bias-variance analysis. +\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016. + +\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent} +Morris~W Hirsch. +\newblock Convergent activation dynamics in continuous time networks. +\newblock {\em Neural Netw.}, 2(5):331--349, 1989. + +\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating} +R.~Johnson and T.~Zhang. +\newblock Accelerating stochastic gradient descent using predictive variance reduction. 
+\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013. + +\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td} +Nathaniel Korda and Prashanth La. +\newblock On td (0) with function approximation: Concentration bounds and a centered variant with exponential convergence. +\newblock In {\em International conference on machine learning}, pages 626--634. PMLR, 2015. + +\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite} +Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik. +\newblock Finite-sample analysis of proximal gradient td algorithms. +\newblock In {\em Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015. + +\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal} +Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik. +\newblock Proximal gradient temporal difference learning algorithms. +\newblock In {\em Proceedings of the International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016. + +\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal} +Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik. +\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity. +\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018. + +\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient} +Hamid~Reza Maei. +\newblock {\em Gradient temporal-difference learning algorithms}. +\newblock PhD thesis, University of Alberta, 2011. + +\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy} +Andrew~Y Ng, Daishi Harada, and Stuart Russell. +\newblock Policy invariance under reward transformations: Theory and application to reward shaping. +\newblock In {\em Proc. 16th Int. Conf. 
Mach. Learn.}, pages 278--287, 1999. + +\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated} +Yangchen Pan, Adam White, and Martha White. +\newblock Accelerated gradient temporal difference learning. +\newblock In {\em Proceedings of the 21st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017. + +\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust} +J.~Schulman, S.~Levine, P.~Abbeel, M.~Jordan, and P.~Moritz. +\newblock Trust region policy optimization. +\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015. + +\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal} +J.~Schulman, F.~Wolski, P.~Dhariwal, A.~Radford, and O.~Klimov. +\newblock Proximal policy optimization algorithms. +\newblock {\em arXiv preprint arXiv:1707.06347}, 2017. + +\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement} +Anton Schwartz. +\newblock A reinforcement learning method for maximizing undiscounted rewards. +\newblock In {\em Proc. 10th Int. Conf. Mach. Learn.}, volume 298, pages 298--305, 1993. + +\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book} +Richard~S. Sutton and Andrew~G. Barto. +\newblock {\em Reinforcement Learning: An Introduction}. +\newblock The MIT Press, second edition, 2018. + +\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent} +Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri. +\newblock A convergent $ o (n) $ temporal-difference algorithm for off-policy learning with linear function approximation. +\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616. Cambridge, MA: MIT Press, 2008. + +\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast} +R.S. Sutton, H.R. 
Maei, D.~Precup, S.~Bhatnagar, D.~Silver, C.~Szepesv{\'a}ri, and E.~Wiewiora.
+\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
+\newblock In {\em Proc. 26th Int. Conf. Mach. Learn.}, pages 993--1000, 2009.
+
+\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
+Richard~S Sutton, A~Rupam Mahmood, and Martha White.
+\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
+\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.
+
+\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
+Richard~S Sutton.
+\newblock Learning to predict by the methods of temporal differences.
+\newblock {\em Machine learning}, 3(1):9--44, 1988.
+
+\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
+John~N Tsitsiklis and Benjamin Van~Roy.
+\newblock Analysis of temporal-difference learning with function approximation.
+\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.
+
+\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
+Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
+\newblock Reanalysis of variance reduced temporal difference learning.
+\newblock In {\em International Conference on Learning Representations}, 2019.
+
+\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
+T.~Xu, Z.~Wang, Y.~Zhou, and Y.~Liang.
+\newblock Reanalysis of variance reduced temporal difference learning.
+\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.
+
+\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
+Shangtong Zhang and Shimon Whiteson.
+\newblock Truncated emphatic temporal difference methods for prediction and control.
+\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022. 
+ +\end{thebibliography} diff --git b/neurips_2024.bib a/neurips_2024.bib new file mode 100644 index 0000000..08e4dae --- /dev/null +++ a/neurips_2024.bib @@ -0,0 +1,1138 @@ +@inproceedings{langley00, + author = {P. Langley}, + title = {Crafting Papers on Machine Learning}, + year = {2000}, + pages = {1207--1216}, + editor = {Pat Langley}, + booktitle = {Proceedings of the 17th International Conference + on Machine Learning (ICML 2000)}, + address = {Stanford, CA}, + publisher = {Morgan Kaufmann} +} + +@TechReport{mitchell80, + author = "T. M. Mitchell", + title = "The Need for Biases in Learning Generalizations", + institution = "Computer Science Department, Rutgers University", + year = "1980", + address = "New Brunswick, MA", +} + +@phdthesis{kearns89, + author = {M. J. Kearns}, + title = {Computational Complexity of Machine Learning}, + school = {Department of Computer Science, Harvard University}, + year = {1989} +} + +@Book{MachineLearningI, + editor = "R. S. Michalski and J. G. Carbonell and T. + M. Mitchell", + title = "Machine Learning: An Artificial Intelligence + Approach, Vol. I", + publisher = "Tioga", + year = "1983", + address = "Palo Alto, CA" +} + +@Book{DudaHart2nd, + author = "R. O. Duda and P. E. Hart and D. G. Stork", + title = "Pattern Classification", + publisher = "John Wiley and Sons", + edition = "2nd", + year = "2000" +} + +@misc{anonymous, + title= {Suppressed for Anonymity}, + author= {Author, N. N.}, + year= {2021} +} + +@InCollection{Newell81, + author = "A. Newell and P. S. Rosenbloom", + title = "Mechanisms of Skill Acquisition and the Law of + Practice", + booktitle = "Cognitive Skills and Their Acquisition", + pages = "1--51", + publisher = "Lawrence Erlbaum Associates, Inc.", + year = "1981", + editor = "J. R. Anderson", + chapter = "1", + address = "Hillsdale, NJ" +} + + +@Article{Samuel59, + author = "A. L. 
Samuel", + title = "Some Studies in Machine Learning Using the Game of + Checkers", + journal = "IBM Journal of Research and Development", + year = "1959", + volume = "3", + number = "3", + pages = "211--229" +} +@inproceedings{langley00, + author = {P. Langley}, + title = {Crafting Papers on Machine Learning}, + year = {2000}, + pages = {1207--1216}, + editor = {Pat Langley}, + booktitle = {Proceedings of the 17th International Conference + on Machine Learning (ICML 2000)}, + address = {Stanford, CA}, + publisher = {Morgan Kaufmann} +} + +@TechReport{mitchell80, + author = "T. M. Mitchell", + title = "The Need for Biases in Learning Generalizations", + institution = "Computer Science Department, Rutgers University", + year = "1980", + address = "New Brunswick, MA", +} + +@phdthesis{kearns89, + author = {M. J. Kearns}, + title = {Computational Complexity of Machine Learning}, + school = {Department of Computer Science, Harvard University}, + year = {1989} +} + +@Book{MachineLearningI, + editor = "R. S. Michalski and J. G. Carbonell and T. + M. Mitchell", + title = "Machine Learning: An Artificial Intelligence + Approach, Vol. I", + publisher = "Tioga", + year = "1983", + address = "Palo Alto, CA" +} + +@Book{DudaHart2nd, + author = "R. O. Duda and P. E. Hart and D. G. Stork", + title = "Pattern Classification", + publisher = "John Wiley and Sons", + edition = "2nd", + year = "2000" +} + +@misc{anonymous, + title= {Suppressed for Anonymity}, + author= {Author, N. N.}, + year= {2021} +} + +@InCollection{Newell81, + author = "A. Newell and P. S. Rosenbloom", + title = "Mechanisms of Skill Acquisition and the Law of + Practice", + booktitle = "Cognitive Skills and Their Acquisition", + pages = "1--51", + publisher = "Lawrence Erlbaum Associates, Inc.", + year = "1981", + editor = "J. R. Anderson", + chapter = "1", + address = "Hillsdale, NJ" +} + + +@Article{Samuel59, + author = "A. L. 
Samuel", + title = "Some Studies in Machine Learning Using the Game of + Checkers", + journal = "IBM Journal of Research and Development", + year = "1959", + volume = "3", + number = "3", + pages = "211--229" +} + +@book{em:86, + editor = "Engelmore, Robert and Morgan, Anthony", + title = "Blackboard Systems", + year = 1986, + address = "Reading, Mass.", + publisher = "Addison-Wesley", +} +@inproceedings{dalal2018finite, + title={Finite sample analyses for TD (0) with function approximation}, + author={Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan and Mannor, Shie}, + booktitle={Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence}, + pages={6144--6160}, + year={2018} +} +@inproceedings{xu2019reanalysis, + title={Reanalysis of Variance Reduced Temporal Difference Learning}, + author={Xu, Tengyu and Wang, Zhe and Zhou, Yi and Liang, Yingbin}, + booktitle={International Conference on Learning Representations}, + year={2019} +} +@inproceedings{c:83, + author = "Clancey, William J.", + year = 1983, + title = "{Communication, Simulation, and Intelligent +Agents: Implications of Personal Intelligent Machines +for Medical Education}", + booktitle="Proceedings of the Eighth International Joint Conference on Artificial Intelligence {(IJCAI-83)}", + pages = "556-560", + address = "Menlo Park, Calif", + publisher = "{IJCAI Organization}", +} +@inproceedings{c:84, + author = "Clancey, William J.", + year = 1984, + title = "{Classification Problem Solving}", + booktitle = "Proceedings of the Fourth National + Conference on Artificial Intelligence", + pages = "45-54", + address = "Menlo Park, Calif.", + publisher="AAAI Press", +} +@article{r:80, + author = {Robinson, Arthur L.}, + title = {New Ways to Make Microcircuits Smaller}, + volume = {208}, + number = {4447}, + pages = {1019--1022}, + year = 
{1980}, + doi = {10.1126/science.208.4447.1019}, + publisher = {American Association for the Advancement of Science}, + issn = {0036-8075}, + URL = {https://science.sciencemag.org/content/208/4447/1019}, + eprint = {https://science.sciencemag.org/content/208/4447/1019.full.pdf}, + journal = {Science}, +} +@article{r:80x, + author = "Robinson, Arthur L.", + year = 1980, + title = "{New Ways to Make Microcircuits Smaller---Duplicate Entry}", + journal = "Science", + volume = 208, + pages = "1019-1026", +} +@article{hcr:83, +title = {Strategic explanations for a diagnostic consultation system}, +journal = {International Journal of Man-Machine Studies}, +volume = {20}, +number = {1}, +pages = {3-19}, +year = {1984}, +issn = {0020-7373}, +doi = {https://doi.org/10.1016/S0020-7373(84)80003-6}, +url = {https://www.sciencedirect.com/science/article/pii/S0020737384800036}, +author = {Diane Warner Hasling and William J. Clancey and Glenn Rennels}, +abstract = {This article examines the problem of automatte explanation of reasoning, especially as it relates to expert systems. By explanation we mean the ability of a program to discuss what it is doing in some understandable way. We first present a general framework in which to view explanation and review some of the research done in this area. We then focus on the explanation system for NEOMYCIN, a medical consultation program. A consultation program interactively helps a user to solve a problem. Our goal is to have NEOMYCIN explain its problem-solving strategies. An explanation of strategy describes the plan the program is using to reach a solution. Such an explanation is usually concrete, referring to aspects of the current problem situation. Abstract explanations articulate a general principle, which can be applied in different situations; such explanations are useful in teaching and in explaining by analogy. 
We describe the aspects of NEOMYCIN that make abstract strategic explanations possible—the representation of strategic knowledge explicitly and separately from domain knowledge— and demonstrate how this representation can be used to generate explanations.} +} +@article{hcrt:83, + author = "Hasling, Diane Warner and Clancey, William J. and Rennels, Glenn R. and Test, Thomas", + year = 1983, + title = "{Strategic Explanations in Consultation---Duplicate}", + journal = "The International Journal of Man-Machine Studies", + volume = 20, + number = 1, + pages = "3-19", +} +@techreport{r:86, + author = "Rice, James", + year = 1986, + title = "{Poligon: A System for Parallel Problem Solving}", + type = "Technical Report", + number = "KSL-86-19", + institution = "Dept.\ of Computer Science, Stanford Univ.", +} +@phdthesis{c:79, + author = "Clancey, William J.", + year = 1979, + title = "{Transfer of Rule-Based Expertise +through a Tutorial Dialogue}", + type = "{Ph.D.} diss.", + school = "Dept.\ of Computer Science, Stanford Univ.", + address = "Stanford, Calif.", +} +@unpublished{c:21, + author = "Clancey, William J.", + title = "{The Engineering of Qualitative Models}", + year = 2021, + note = "Forthcoming", +} +@misc{c:22, + title={Attention Is All You Need}, + author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. 
Gomez and Lukasz Kaiser and Illia Polosukhin}, + year={2017}, + eprint={1706.03762}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{c:23, + title = "Pluto: The 'Other' Red Planet", + author = "{NASA}", + howpublished = "\url{https://www.nasa.gov/nh/pluto-the-other-red-planet}", + year = 2015, + note = "Accessed: 2018-12-06" +} +@article{r:80x, + author = "Robinson, Arthur L.", + year = 1980, + title = "{New Ways to Make Microcircuits Smaller---Duplicate Entry}", + journal = "Science", + volume = 208, + pages = "1019-1026", +} +@article{hcrt:83, + author = "Hasling, Diane Warner and Clancey, William J. and Rennels, Glenn R. and Test, Thomas", + year = 1983, + title = "{Strategic Explanations in Consultation---Duplicate}", + journal = "The International Journal of Man-Machine Studies", + volume = 20, + number = 1, + pages = "3-19", +} +@article{xu2013online, + title={Online learning control using adaptive critic designs with sparse kernel machines}, + author={Xu, Xin and Hou, Zhongsheng and Lian, Chuanqiang and He, Haibo}, + journal={IEEE Trans. Neural Netw. Learn. Syst.}, + volume={24}, + number={5}, + pages={762--775}, + year={2013}, + publisher={IEEE} +} +@article{bertsekas2017value, + title={Value and policy iterations in optimal control and adaptive dynamic programming}, + author={Bertsekas, Dimitri P}, + journal={IEEE Trans. Neural Netw. Learn. Syst.}, + year={2017}, + volume={28}, + number={3}, + pages={500 - 509}, + publisher={IEEE} +} +@phdthesis{hackman2012faster, + title={Faster Gradient-TD Algorithms}, + author={Hackman, Leah}, + year={2012}, + school={University of Alberta} +} +@inproceedings{harutyunyan2015multi, + title={Multi-scale reward shaping via an off-policy ensemble}, + author={Harutyunyan, Anna and Brys, Tim and Vrancx, Peter and Now{\'e}, Ann}, + booktitle={Proc. 2015 Int. Conf. 
Autonomous Agents and Multiagent Systems}, + pages={1641--1642}, + year={2015}, + organization={International Foundation for Autonomous Agents and Multiagent Systems} +} +@inproceedings{harutyunyan2015expressing, + title={Expressing Arbitrary Reward Functions as Potential-Based Advice.}, + author={Harutyunyan, Anna and Devlin, Sam and Vrancx, Peter and Now{\'e}, Ann}, + booktitle={AAAI}, + pages={2652--2658}, + year={2015} +} +@article{wiewiora2003potential, + title={Potential-based shaping and Q-value initialization are equivalent}, + author={Wiewiora, Eric}, + journal={J. Artif. Intell. Res.}, + volume={19}, + pages={205--208}, + year={2003} +} +@article{grzes2010online, + title={Online learning of shaping rewards in reinforcement learning}, + author={Grze{\'s}, Marek and Kudenko, Daniel}, + journal={Neural Netw.}, + volume={23}, + number={4}, + pages={541--550}, + year={2010}, + publisher={Elsevier} +} +@inproceedings{marthi2007automatic, + title={Automatic shaping and decomposition of reward functions}, + author={Marthi, Bhaskara}, + booktitle={Proc. 24th Int. Conf. Mach. Learn.}, + pages={601--608}, + year={2007} +} +@inproceedings{laud2003influence, + title={The Influence of Reward on the Speed of Reinforcement Learning: An Analysis of Shaping}, + author={Laud, Adam and Dejong, Gerald}, + booktitle={Proc. 20th Int. Conf. Mach. Learn.}, + pages={440--447}, + year={2003} +} +@phdthesis{laud2004theory, + title={Theory and application of reward shaping in reinforcement learning}, + author={Laud, Adam Daniel}, + year={2004}, + school={University of Illinois at Urbana-Champaign} +} +@article{geist2013algorithmic, + title={Algorithmic survey of parametric value function approximation}, + author={Geist, Matthieu and Pietquin, Olivier}, + journal={IEEE Trans. Neural Netw. Learn. 
Syst.}, + volume={24}, + number={6}, + pages={845--867}, + year={2013}, + publisher={IEEE} +} +@article{furmston2016approximate, + title={Approximate Newton Methods for Policy Search in Markov Decision Processes}, + author={Furmston, Thomas and Lever, Guy and Barber, David}, + journal={J. Mach. Learn. Res.}, + volume={17}, + number={227}, + pages={1--51}, + year={2016} +} +@article{silver2016mastering, + title={Mastering the game of Go with deep neural networks and tree search}, + author={Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others}, + journal={Nature}, + volume={529}, + number={7587}, + pages={484--489}, + year={2016}, + publisher={Nature Publishing Group} +} + +@article{mnih2015human, + title={Human-level control through deep reinforcement learning}, + author={Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and others}, + journal={Nature}, + volume={518}, + number={7540}, + pages={529--533}, + year={2015}, + publisher={Nature Publishing Group} +} +@inproceedings{guo2014deep, + title={Deep learning for real-time Atari game play using offline Monte-Carlo tree search planning}, + author={Guo, Xiaoxiao and Singh, Satinder and Lee, Honglak and Lewis, Richard L and Wang, Xiaoshi}, + booktitle={Advances in Neural Information Processing Systems}, + pages={3338--3346}, + publisher={Cambridge, MA: MIT Press}, + year={2014} +} +@inproceedings{scherrer2010should, + title={Should one compute the Temporal Difference fix point or minimize the Bellman Residual? The unified oblique projection view}, + author={Scherrer, Bruno}, + booktitle={Proc. 27th Int. Conf. Mach. 
Learn.}, + pages={959--966}, + year={2010} +} +@article{hirsch1989convergent, + title={Convergent activation dynamics in continuous time networks}, + author={Hirsch, Morris W}, + journal={Neural Netw.}, + volume={2}, + number={5}, + pages={331--349}, + year={1989}, + publisher={Elsevier} +} +@article{borkar1997stochastic, + title={Stochastic approximation with two time scales}, + author={Borkar, Vivek S}, + journal={Syst. \& Control Letters}, + volume={29}, + number={5}, + pages={291--294}, + year={1997}, + publisher={Elsevier} +} +@article{ortner2013adaptive, + title={Adaptive aggregation for reinforcement learning in average reward Markov decision processes}, + author={Ortner, Ronald}, + journal={Annals Oper. Res.}, + volume={208}, + number={1}, + pages={321--336}, + year={2013}, + publisher={Springer} +} +@article{jaksch2010near, + title={Near-optimal regret bounds for reinforcement learning}, + author={Jaksch, Thomas and Ortner, Ronald and Auer, Peter}, + journal={Journal of Machine Learning Research}, + number={Apr}, + volume={11}, + pages={1563--1600}, + year={2010} +} +@article{ortner2007logarithmic, + title={Logarithmic online regret bounds for undiscounted reinforcement learning}, + author={Ortner, P and Auer, R}, + journal={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + volume={19}, + pages={49}, + year={2007} +} +@article{das1999solving, + title={Solving semi-Markov decision problems using average reward reinforcement learning}, + author={Das, Tapas K and Gosavi, Abhijit and Mahadevan, Sridhar and Marchalleck, Nicholas}, + journal={Management Science}, + volume={45}, + number={4}, + pages={560--574}, + year={1999}, + publisher={INFORMS} +} +@article{abounadi2001learning, + title={Learning algorithms for Markov decision processes with average cost}, + author={Abounadi, Jinane and Bertsekas, D and Borkar, Vivek S}, + journal={SIAM J. 
Control Optim.}, + volume={40}, + number={3}, + pages={681--698}, + year={2001}, + publisher={SIAM} +} +@inproceedings{singh1994reinforcement, + title={Reinforcement learning algorithms for average-payoff Markovian decision processes}, + author={Singh, Satinder P}, + booktitle={AAAI}, + volume={94}, + pages={700--705}, + year={1994} +} +@inproceedings{schwartz1993reinforcement, + title={A reinforcement learning method for maximizing undiscounted rewards}, + author={Schwartz, Anton}, + booktitle={Proc. 10th Int. Conf. Mach. Learn.}, + volume={298}, + pages={298--305}, + year={1993} +} + +@inproceedings{yang2016efficient, + title={Efficient Average Reward Reinforcement Learning Using Constant Shifting Values}, + author={Yang, Shangdong and Gao, Yang and An, Bo and Wang, Hao and Chen, Xingguo}, + booktitle={Thirtieth AAAI Conference on Artificial Intelligence}, + pages={2258-2264}, + year={2016} +} +@inproceedings{devlin2012dynamic, + title={Dynamic potential-based reward shaping}, + author={Devlin, Sam and Kudenko, Daniel}, + booktitle={Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, + pages={433--440}, + year={2012} +} + +@inproceedings{ng1999policy, + title={Policy invariance under reward transformations: Theory and application to reward shaping}, + author={Ng, Andrew Y and Harada, Daishi and Russell, Stuart}, + booktitle={Proc. 16th Int. Conf. Mach. Learn.}, + pages={278--287}, + year={1999} +} +@article{borkar2000ode, + title={The ODE method for convergence of stochastic approximation and reinforcement learning}, + author={Borkar, Vivek S and Meyn, Sean P}, + journal={SIAM J. 
Control Optim.}, + volume={38}, + number={2}, + pages={447--469}, + year={2000}, + publisher={SIAM} +} +@phdthesis{maei2011gradient, + title={Gradient temporal-difference learning algorithms}, + author={Maei, Hamid Reza}, + year={2011}, + school={University of Alberta} +} +@phdthesis{baird1999reinforcement, + title={Reinforcement learning through gradient descent}, + author={Baird III, Leemon C}, + year={1999}, + school={US Air Force Academy, US} +} +@PHDTHESIS{Driessens2004, + AUTHOR ="Kurt Driessens", + TITLE ="Relational Reinforcement Learning", + SCHOOL ="Catholic University of Leuven", + YEAR ="2004", +} +@article{tsitsiklis1996feature, + title={Feature-based methods for large scale dynamic programming}, + author={Tsitsiklis, John N and Van Roy, Benjamin}, + journal={Mach. Learn.}, + volume={22}, + number={1-3}, + pages={59--94}, + year={1996}, + publisher={Springer} +} +@inproceedings{chen2009apply, + title={Apply ant colony optimization to Tetris}, + author={Chen, X. and Wang, H. and Wang, W. and Shi, Y. and Gao, Y.}, + booktitle={Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation (GECCO)}, + pages={1741--1742}, + year={2009}, + organization={ACM} +} +@incollection{farias2006tetris, + title={Tetris: A study of randomized constraint sampling}, + author={Farias, Vivek F and Van Roy, Benjamin}, + booktitle={Probabilistic and Randomized Methods for Design Under Uncertainty}, + pages={189--201}, + year={2006}, + publisher={Springer} +} +@article{bertsekas1996temporal, + title={Temporal differences-based policy iteration and applications in neuro-dynamic programming}, + author={Bertsekas, Dimitri P and Ioffe, Sergey}, + journal={Lab. for Info. 
and Decision Systems Report LIDS-P-2349, MIT, Cambridge, MA}, + year={1996}, + publisher={Citeseer} +} +@inproceedings{kakade2001natural, + title={A Natural Policy Gradient.}, + author={Kakade, Sham}, + booktitle={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + volume={14}, + pages={1531--1538}, + year={2001} +} +@article{peters2008natural, + title={Natural actor-critic}, + author={Peters, Jan and Schaal, Stefan}, + journal={Neurocomputing}, + volume={71}, + number={7}, + pages={1180--1190}, + year={2008}, + publisher={Elsevier} +} +@article{baxter2001infinite, + title={Infinite-horizon policy-gradient estimation}, + author={Baxter, Jonathan and Bartlett, Peter L.}, + journal={J. Artif. Intell. Res.}, + pages={319--350}, + year={2001} +} +@inproceedings{sutton1999policy, + title={Policy Gradient Methods for Reinforcement Learning with Function Approximation.}, + author={Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay and others}, + booktitle={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + pages={1057--1063}, + year={1999} +} +@inproceedings{bohm2005evolutionary, + title={An evolutionary approach to tetris}, + author={B{\"o}hm, Niko and K{\'o}kai, Gabriella and Mandl, Stefan}, + booktitle={Proc. 6th Metaheuristics Int. Conf.}, + pages={137-148}, + year={2005} +} +@article{szita2006learning, + title={Learning Tetris using the noisy cross-entropy method}, + author={Szita, Istv{\'a}n and L{\"o}rincz, Andr{\'a}s}, + journal={Neural Comput.}, + volume={18}, + number={12}, + pages={2936--2941}, + year={2006}, + publisher={MIT Press} +} +@inproceedings{thiery2010least, + title={Least-Squares $\lambda$ Policy Iteration: Bias-Variance Trade-off in Control Problems}, + author={Thiery, Christophe and Scherrer, Bruno}, + booktitle={Proc. 27th Int. Conf. Mach. 
Learn.}, + pages={1071--1078}, + year={2010} +} + +@inproceedings{gabillon2013approximate, + title={Approximate dynamic programming finally performs well in the game of Tetris}, + author={Gabillon, Victor and Ghavamzadeh, Mohammad and Scherrer, Bruno}, + booktitle={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + pages={1754--1762}, + year={2013} +} +@article{scherrer2013performance, + title={Performance bounds for $\lambda$ policy iteration and application to the game of Tetris}, + author={Scherrer, Bruno}, + journal={J. Mach. Learn. Res.}, + volume={14}, + number={1}, + pages={1181--1227}, + year={2013}, + publisher={JMLR. org} +} +@article{thiery2009improvements, + title={Improvements on Learning Tetris with Cross Entropy}, + author={Thiery, Christophe and Scherrer, Bruno}, + journal={Int. Computer Games Assoc. J.}, + volume={32}, + number={1}, + pages={23--33}, + year={2009} +} +@article{scherrer2015approximate, + title={Approximate Modified Policy Iteration and its Application to the Game of Tetris}, + author={Scherrer, Bruno and Ghavamzadeh, Mohammad and Gabillon, Victor and Lesner, Boris and Geist, Matthieu}, + journal={J. Mach. Learn. Res.}, + volume={16}, + pages={1629--1676}, + year={2015} +} + +@article{efron2004least, + title={Least angle regression}, + author={Efron, Bradley and Hastie, Trevor and Johnstone, Iain and Tibshirani, Robert and others}, + journal={The Annals of statistics}, + volume={32}, + number={2}, + pages={407--499}, + year={2004}, + publisher={Institute of Mathematical Statistics} +} +@MASTERSTHESIS{Brzustowski1992, + author ={John Brzustowski}, + title ={Can you win at tetris?}, + school = {University of British Columbia}, + year ={1992} +} +@Article{Breukelaar04, + author = {Ron Breukelaar and Erik D. Demaine and Susan + Hohenberger and Hendrik Jan Hoogeboom and Walter + A. 
Kosters and David Liben-Nowell}, + title = {Tetris is Hard, Even to Approximate}, + journal = {International Journal of Computational Geometry and + Applications}, + year = {2004}, + volume = {14}, + number = {1--2}, + pages = {41--68}, + month = {April}, +} +@book{Bertsekas1996, + author = {Bertsekas, D. and Tsitsiklis, J. N.}, + title = {Neuro-Dynamic Programming}, + year = {1996}, + publisher = {Athena Scientific}, +} +@inproceedings{maei2010gq, + title={GQ ($\lambda$): A general gradient algorithm for temporal-difference prediction learning with eligibility traces}, + author={Maei, Hamid Reza and Sutton, Richard S}, + booktitle={Proceedings of the Third Conference on Artificial General Intelligence}, + volume={1}, + pages={91--96}, + year={2010} +} +@inproceedings{maei2010toward, + title={Toward off-policy learning control with function approximation}, + author={Maei, Hamid R and Szepesv{\'a}ri, Csaba and Bhatnagar, Shalabh and Sutton, Richard S}, + booktitle={Proc. 27th Int. Conf. Mach. Learn.}, + pages={719--726}, + year={2010} +} +@inproceedings{phua2007tracking, + title={Tracking value function dynamics to improve reinforcement learning with piecewise linear function approximation}, + author={Phua, Chee Wee and Fitch, Robert}, + booktitle={Proc. 24th Int. Conf. Mach. Learn.}, + pages={751--758}, + year={2007}, + organization={ACM} +} +@inproceedings{szubert2014temporal, + title={Temporal difference learning of N-tuple networks for the game 2048}, + author={Szubert, Marcin and Jaskowski, Wojciech}, + booktitle={2014 IEEE Conference on Computational Intelligence and Games (CIG)}, + pages={1--8}, + year={2014}, + organization={IEEE} +} +@article{chen2013online, + title={Online Selective Kernel-based Temporal Differece Learning}, + author={Chen, Xingguo and Gao, Yang and Wang, Ruili}, + journal={IEEE Trans. Neural Netw. Learn. 
Syst.}, + year={2013}, + volume={24}, + number={12}, + pages={1944--1956}, + publisher={IEEE} +} + +@article{xu2007kernel, + title={Kernel-based least squares policy iteration for reinforcement learning}, + author={Xu, Xin and Hu, Dewen and Lu, Xicheng}, + journal={IEEE Trans. Neural Netw.}, + volume={18}, + number={4}, + pages={973--992}, + year={2007}, + publisher={IEEE} +} +@INPROCEEDINGS{Engel03bayesmeets, + author = {Yaakov Engel and Shie Mannor and Ron Meir}, + title = {Bayes meets {B}ellman: the {G}aussian process approach to temporal difference learning}, + booktitle = {Proc. 20th Int. Conf. Mach. Learn.}, + year = {2003}, + pages = {154--161}, + address={Washington, DC}, + month={Aug.}, +} +@inproceedings{robards2011sparse, + title={Sparse Kernel-SARSA ($\lambda$) with an eligibility trace}, + author={Robards, M. and Sunehag, P. and Sanner, S. and Marthi, B.}, + booktitle = {Proc. 22nd Eur. Conf. Mach. Learn.}, + pages={1--17}, + year={2011}, + month={Sept.}, + address = {Athens, Greece}, +} +@conference{reisinger2008online, + title={{Online kernel selection for {B}ayesian reinforcement learning}}, + author={Reisinger, J. and Stone, P. and Miikkulainen, R.}, + booktitle={Proc. 25th Int. Conf. Mach. Learn.}, + pages={816--823}, + year={2008}, + month={July}, + address={ Helsinki, Finland}, +} +@book{Sutton1998, + title={{Reinforcement learning: an introduction}}, + author={Sutton, R.S. and Barto, A.G.}, + year={1998}, + publisher={MIT Press}, + address={Cambridge, MA} +} +@book{Sutton2018book, + author = {Sutton, Richard S. 
and Barto, Andrew G.}, + edition = {Second}, + publisher = {The MIT Press}, + title = {Reinforcement Learning: An Introduction}, + year = {2018 } +} +@phdthesis{Bradtke1994phd, + title={Incremental Dynamic Programming for On-line Adaptive Optimal Control}, + author={Bradtke, Steven J}, + year={1994}, + school={University of Massachusetts}, + month={Sept.}, + address={Amherst}, +} +@inproceedings{baird1995residual, + title={Residual algorithms: Reinforcement learning with function approximation}, + author={Baird, Leemon and others}, + booktitle={Proc. 12th Int. Conf. Mach. Learn.}, + pages={30--37}, + year={1995} +} +@article{bradtke1996linear, + title={Linear least-squares algorithms for temporal difference learning}, + author={Bradtke, S.J. and Barto, A.G.}, + journal={Mach. Learn.}, + volume={22}, + number={1}, + pages={33--57}, + year={1996}, + publisher={Springer} +} +@article{lagoudakis2003least, + title={Least-squares policy iteration}, + author={Lagoudakis, M.G. and Parr, R.}, + journal={J. Mach. Learn. Res.}, + volume={4}, + pages={1107--1149}, + year={2003}, + publisher={JMLR. org} +} +@article{boyan2002technical, + title={Technical update: Least-squares temporal difference learning}, + author={Boyan, J.A.}, + journal={Mach. Learn.}, + volume={49}, + number={2}, + pages={233--246}, + year={2002}, + publisher={Springer} +} +@inproceedings{geramifard2006incremental, + title={Incremental least-squares temporal difference learning}, + author={Geramifard, A. and Bowling, M. and Sutton, R.S.}, + booktitle={Proc. 21st AAAI Conf. Artif. Intell.}, + pages={356--361}, + year={2006}, + month={July}, + address={Boston, Massachusetts}, +} +@inproceedings{sutton2009fast, + title={Fast gradient-descent methods for temporal-difference learning with linear function approximation}, + author={Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, C. and Wiewiora, E.}, + booktitle={Proc. 26th Int. Conf. Mach. 
Learn.}, + pages={993--1000}, + year={2009} +} +@inproceedings{sutton2008convergent, + title={A Convergent $ O (n) $ Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation}, + author={Sutton, Richard S and Maei, Hamid R and Szepesv{\'a}ri, Csaba}, + booktitle={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + pages={1609--1616}, + year={2008} +} +@inproceedings{dabney2014natural, + title={Natural Temporal Difference Learning}, + author={Dabney, William and Thomas, Philip}, + booktitle={Twenty-Eighth AAAI Conference on Artificial Intelligence}, + year={2014} +} +@inproceedings{mahmood2014weighted, + title={Weighted importance sampling for off-policy learning with linear function approximation}, + author={Mahmood, A Rupam and van Hasselt, Hado P and Sutton, Richard S}, + booktitle={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + pages={3014--3022}, + year={2014} +} +@inproceedings{seijen2014true, + title={True Online TD ($\lambda$)}, + author={Seijen, Harm V and Sutton, Rich}, + booktitle={Proc. 31st Int. Conf. Mach. Learn.}, + pages={692--700}, + year={2014} +} +@article{ormoneit2002kernel, + title={{Kernel-based reinforcement learning}}, + author={Ormoneit, D. and Sen, {\'S}.}, + journal={Mach. Learn.}, + volume={49}, + number={2-3}, + pages={161--178}, + issn={0885-6125}, + year={2002}, + publisher={Springer-Verlag }, + address = {Hingham, MA, USA}, +} +@inproceedings{Ghavamzadeh2010lstd, + author = {M. Ghavamzadeh and A. Lazaric and O. A. Maillard and R. Munos}, + title = {{LSTD} with Random Projections}, + BOOKTITLE={Advances in Neural Information Processing Systems}, + publisher={Cambridge, MA: MIT Press}, + volume = {23}, + pages = {721--729}, + Address = {Lake Tahoe, Nevada, USA}, + year = {2010} +} +@inproceedings{loth2007sparse, + title={Sparse temporal difference learning using LASSO}, + author={Loth, M. and Davy, M. 
and Preux, P.}, + booktitle={Proc. IEEE Symp. Approx. Dynamic Program. Reinforce. Learn.}, + pages={352--359}, + year={2007}, + organization={IEEE} +} +@inproceedings{kolter2009regularization, + title={Regularization and feature selection in least-squares temporal difference learning}, + author={Kolter, J.Z. and Ng, A.Y.}, + booktitle={Proc. 26th Int. Conf. Mach. Learn.}, + pages={521--528}, + year={2009}, + organization={ACM} +} +@inproceedings{hoffman2011regularized, + title={Regularized least squares temporal difference learning with nested l2 and l1 penalization}, + author={Hoffman, M.W. and Lazaric, A. and Ghavamzadeh, M. and Munos, R.}, + booktitle={Proc. Eur. Workshop Reinforce. Learn.}, + year={2011} +} +@inproceedings{Ghavamzadeh2011finite, + author = {M. Ghavamzadeh and A. Lazaric and R. Munos and M. Hoffman}, + title = {Finite-Sample Analysis of {Lasso-TD}}, + booktitle = {Proc. 28th Int. Conf. Mach. Learn.}, + year = {2011}, + month= {June}, + address={Bellevue, Washington, USA}, + pages={1177--1184}, +} +@inproceedings{johnson2013accelerating, + title={Accelerating stochastic gradient descent using predictive variance reduction}, + author={Johnson, R. and Zhang, T.}, + booktitle={Advances in Neural Information Processing Systems}, + pages={315--323}, + year={2013} +} +@article{xu2020reanalysis, + title={Reanalysis of variance reduced temporal difference learning}, + author={Xu, T. and Wang, Z. and Zhou, Y. and Liang, Y.}, + journal={arXiv preprint arXiv:2001.01898}, + year={2020} +} +@inproceedings{schulman2015trust, + title={Trust region policy optimization}, + author={Schulman, J. and Levine, S. and Abbeel, P. and Jordan, M. and Moritz, P.}, + booktitle={International Conference on Machine Learning}, + pages={1889--1897}, + year={2015} +} +@article{schulman2017proximal, + title={Proximal policy optimization algorithms}, + author={Schulman, J. and Wolski, F. and Dhariwal, P. and Radford, A. 
and Klimov, O.}, + journal={arXiv preprint arXiv:1707.06347}, + year={2017} +} +@inproceedings{defazio2014saga, + title={SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives}, + author={Defazio, A. and Bach, F. and Lacoste-Julien, S.}, + booktitle={Advances in Neural Information Processing Systems}, + pages={1646--1654}, + year={2014} +} +@inproceedings{du2017stochastic, + title={Stochastic variance reduction methods for policy evaluation}, + author={Du, S. S. and Chen, J. and Li, L. and Xiao, L. and Zhou, D.}, + booktitle={Proceedings of the 34th International Conference on Machine Learning}, + pages={1049--1058}, + year={2017} +} +@inproceedings{chen2023modified, + title={Modified Retrace for Off-Policy Temporal Difference Learning}, + author={Chen, Xingguo and Ma, Xingzhou and Li, Yang and Yang, Guang and Yang, Shangdong and Gao, Yang}, + booktitle={Uncertainty in Artificial Intelligence}, + pages={303--312}, + year={2023}, + organization={PMLR} +} +@article{dalal2017finite, + title={Finite Sample Analyses for TD(0) with Function Approximation}, + author={Dalal, Gal and SzƶrĆ©nyi, BalĆ”zs and Thoppe, Gugan and Mannor, Shie}, + journal={arXiv preprint arXiv:1704.01161}, + year={2017} +} +@article{sutton1988learning, + title={Learning to predict by the methods of temporal differences}, + author={Sutton, Richard S}, + journal={Machine learning}, + volume={3}, + number={1}, + pages={9--44}, + year={1988}, + publisher={Springer} +} +@inproceedings{tsitsiklis1997analysis, + title={Analysis of temporal-diffference learning with function approximation}, + author={Tsitsiklis, John N and Van Roy, Benjamin}, + booktitle={Advances in Neural Information Processing Systems}, + pages={1075--1081}, + year={1997} +} +@article{sutton2016emphatic, + title={An emphatic approach to the problem of off-policy temporal-difference learning}, + author={Sutton, Richard S and Mahmood, A Rupam and White, Martha}, + journal={The Journal of 
Machine Learning Research}, + volume={17}, + number={1}, + pages={2603--2631}, + year={2016}, + publisher={JMLR. org} +} +@inproceedings{liu2015finite, + title={Finite-sample analysis of proximal gradient TD algorithms}, + author={Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek}, + booktitle={Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence}, + pages={504--513}, + year={2015} +} +@inproceedings{liu2016proximal, + title={Proximal Gradient Temporal Difference Learning Algorithms.}, + author={Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek}, + booktitle={Proceedings of the International Joint Conference on Artificial Intelligence}, + pages={4195--4199}, + year={2016} +} +@article{liu2018proximal, + title={Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity}, + author={Liu, Bo and Gemp, Ian and Ghavamzadeh, Mohammad and Liu, Ji and Mahadevan, Sridhar and Petrik, Marek}, + journal={Journal of Artificial Intelligence Research}, + volume={63}, + pages={461--494}, + year={2018} +} +@inproceedings{givchi2015quasi, + title={Quasi newton temporal difference learning}, + author={Givchi, Arash and Palhang, Maziar}, + booktitle={Asian Conference on Machine Learning}, + pages={159--172}, + year={2015} +} +@inproceedings{pan2017accelerated, + title={Accelerated gradient temporal difference learning}, + author={Pan, Yangchen and White, Adam and White, Martha}, + booktitle={Proceedings of the 21st AAAI Conference on Artificial Intelligence}, + pages={2464--2470}, + year={2017} +} +@inproceedings{hallak2016generalized, + title={Generalized emphatic temporal difference learning: bias-variance analysis}, + author={Hallak, Assaf and Tamar, Aviv and Munos, Remi and Mannor, Shie}, + booktitle={Proceedings of the 30th AAAI Conference on Artificial Intelligence}, + pages={1631--1637}, + year={2016} +} 
+@article{zhang2022truncated,
+ title={Truncated emphatic temporal difference methods for prediction and control},
+ author={Zhang, Shangtong and Whiteson, Shimon},
+ journal={The Journal of Machine Learning Research},
+ volume={23},
+ number={1},
+ pages={6859--6917},
+ year={2022},
+ publisher={JMLR.org}
+}
+@inproceedings{korda2015td,
+ title={On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence},
+ author={Korda, Nathaniel and La, Prashanth},
+ booktitle={International Conference on Machine Learning},
+ pages={626--634},
+ year={2015},
+ organization={PMLR}
+}
+@book{zhou2021machine,
+ title={Machine learning},
+ author={Zhou, Zhi-Hua},
+ year={2021},
+ publisher={Springer Nature}
+}
+@inproceedings{dalal2020tale,
+ title={A tale of two-timescale reinforcement learning with the tightest finite-time bound},
+ author={Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan},
+ booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
+ volume={34},
+ number={04},
+ pages={3701--3708},
+ year={2020}
+}
+@inproceedings{feng2019kernel,
+ title={A kernel loss for solving the Bellman equation},
+ author={Feng, Yihao and Li, Lihong and Liu, Qiang},
+ booktitle={Advances in Neural Information Processing Systems},
+ pages={15430--15441},
+ year={2019}
+}
+@inproceedings{basserrano2021logistic,
+ title={Logistic Q-Learning},
+ author={Bas-Serrano, Joan and Curi, Sebastian and Krause, Andreas and Neu, Gergely},
+ booktitle={International Conference on Artificial Intelligence and Statistics},
+ pages={3610--3618},
+ year={2021}
+}
+
+
+
+
+
+
+
+
+
diff --git b/neurips_2024.blg a/neurips_2024.blg
new file mode 100644
index 0000000..b65e352
--- /dev/null
+++ a/neurips_2024.blg
@@ -0,0 +1,48 @@
+This is BibTeX, Version 0.99d (TeX Live 2023)
+Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
+The top-level auxiliary file: neurips_2024.aux
+The style file: named.bst
+Database file #1: 
neurips_2024.bib +Warning--can't use both volume and number fields in dalal2020tale +You've used 32 entries, + 2439 wiz_defined-function locations, + 737 strings with 10053 characters, +and the built_in function-call counts, 15617 in all, are: += -- 1648 +> -- 575 +< -- 21 ++ -- 200 +- -- 194 +* -- 1156 +:= -- 2297 +add.period$ -- 99 +call.type$ -- 32 +change.case$ -- 222 +chr.to.int$ -- 32 +cite$ -- 33 +duplicate$ -- 692 +empty$ -- 1205 +format.name$ -- 235 +if$ -- 3401 +int.to.chr$ -- 1 +int.to.str$ -- 0 +missing$ -- 31 +newline$ -- 163 +num.names$ -- 96 +pop$ -- 236 +preamble$ -- 1 +purify$ -- 256 +quote$ -- 0 +skip$ -- 627 +stack$ -- 0 +substring$ -- 1023 +swap$ -- 276 +text.length$ -- 21 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 252 +warning$ -- 1 +while$ -- 134 +width$ -- 37 +write$ -- 420 +(There was 1 warning) diff --git b/neurips_2024.log a/neurips_2024.log new file mode 100644 index 0000000..ea7ebef --- /dev/null +++ a/neurips_2024.log @@ -0,0 +1,966 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 19 MAY 2024 17:37 +entering extended mode + restricted \write18 enabled. + file:line:error style messages enabled. + %&-line parsing enabled. 
+**neurips_2024 +(./neurips_2024.tex +LaTeX2e <2022-11-01> patch level 1 +L3 programming layer <2023-02-22> (d:/software/texlive/2023/texmf-dist/tex/latex/base/article.cls +Document Class: article 2022/07/02 v1.4n Standard LaTeX document class +(d:/software/texlive/2023/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2022/07/02 v1.4n Standard LaTeX file (size option) +) +\c@part=\count185 +\c@section=\count186 +\c@subsection=\count187 +\c@subsubsection=\count188 +\c@paragraph=\count189 +\c@subparagraph=\count190 +\c@figure=\count191 +\c@table=\count192 +\abovecaptionskip=\skip48 +\belowcaptionskip=\skip49 +\bibindent=\dimen140 +) (./neurips_2024.sty +Package: neurips_2024 2024/03/31 NeurIPS 2024 submission/camera-ready style file + (d:/software/texlive/2023/texmf-dist/tex/latex/environ/environ.sty +Package: environ 2014/05/04 v0.3 A new way to define environments + (d:/software/texlive/2023/texmf-dist/tex/latex/trimspaces/trimspaces.sty +Package: trimspaces 2009/09/17 v1.1 Trim spaces around a token list +) +\@envbody=\toks16 +) (d:/software/texlive/2023/texmf-dist/tex/latex/natbib/natbib.sty +Package: natbib 2010/09/13 8.31b (PWD, AO) +\bibhang=\skip50 +\bibsep=\skip51 +LaTeX Info: Redefining \cite on input line 694. +\c@NAT@ctr=\count193 +) (d:/software/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty +Package: geometry 2020/01/02 v5.9 Page Geometry + (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2022/05/29 v1.15 key=value parser (DPC) +\KV@toks@=\toks17 +) (d:/software/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty +Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. 
+ (d:/software/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2022/02/03 v1.0f TeX engine tests +)) +\Gm@cnth=\count194 +\Gm@cntv=\count195 +\c@Gm@tempcnt=\count196 +\Gm@bindingoffset=\dimen141 +\Gm@wd@mp=\dimen142 +\Gm@odd@mp=\dimen143 +\Gm@even@mp=\dimen144 +\Gm@layoutwidth=\dimen145 +\Gm@layoutheight=\dimen146 +\Gm@layouthoffset=\dimen147 +\Gm@layoutvoffset=\dimen148 +\Gm@dimlist=\toks18 +) +\@neuripsabovecaptionskip=\skip52 +\@neuripsbelowcaptionskip=\skip53 + (d:/software/texlive/2023/texmf-dist/tex/latex/lineno/lineno.sty +Package: lineno 2023/01/19 line numbers on paragraphs v5.1 +\linenopenalty=\count197 +\output=\toks19 +\linenoprevgraf=\count198 +\linenumbersep=\dimen149 +\linenumberwidth=\dimen150 +\c@linenumber=\count199 +\c@pagewiselinenumber=\count266 +\c@LN@truepage=\count267 +\c@internallinenumber=\count268 +\c@internallinenumbers=\count269 +\quotelinenumbersep=\dimen151 +\bframerule=\dimen152 +\bframesep=\dimen153 +\bframebox=\box51 + (d:/software/texlive/2023/texmf-dist/tex/latex/etoolbox/etoolbox.sty +Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) +\etb@tempcnta=\count270 +) +LaTeX Info: Redefining \\ on input line 3131. +)) (d:/software/texlive/2023/texmf-dist/tex/latex/base/inputenc.sty +Package: inputenc 2021/02/14 v1.3d Input encoding file +\inpenc@prehook=\toks20 +\inpenc@posthook=\toks21 +) (d:/software/texlive/2023/texmf-dist/tex/latex/base/fontenc.sty +Package: fontenc 2021/04/29 v2.0v Standard LaTeX package +LaTeX Font Info: Trying to load font information for T1+ptm on input line 112. + (d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/t1ptm.fd +File: t1ptm.fd 2001/06/04 font definitions for T1/ptm. 
+)) (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2023-02-07 v7.00v Hypertext links for LaTeX + (d:/software/texlive/2023/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty +Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty +Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO) + (d:/software/texlive/2023/texmf-dist/tex/generic/infwarerr/infwarerr.sty +Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) +) +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode found. +) (d:/software/texlive/2023/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty +Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO) +) (d:/software/texlive/2023/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty +Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pdfescape/pdfescape.sty +Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) +) (d:/software/texlive/2023/texmf-dist/tex/latex/hycolor/hycolor.sty +Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) +) (d:/software/texlive/2023/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty +Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO) +) (d:/software/texlive/2023/texmf-dist/tex/latex/auxhook/auxhook.sty +Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) +) (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section + (d:/software/texlive/2023/texmf-dist/tex/latex/refcount/refcount.sty +Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) +) (d:/software/texlive/2023/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty 
+Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) + (d:/software/texlive/2023/texmf-dist/tex/latex/kvoptions/kvoptions.sty +Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) +)) +\c@section@level=\count271 +) +\@linkdim=\dimen154 +\Hy@linkcounter=\count272 +\Hy@pagecounter=\count273 + (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2023-02-07 v7.00v Hyperref: PDFDocEncoding definition (HO) +Now handling font encoding PD1 ... +... no UTF-8 mapping file for font encoding PD1 +) (d:/software/texlive/2023/texmf-dist/tex/generic/intcalc/intcalc.sty +Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) +) (d:/software/texlive/2023/texmf-dist/tex/generic/etexcmds/etexcmds.sty +Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO) +) +\Hy@SavedSpaceFactor=\count274 + (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/puenc.def +File: puenc.def 2023-02-07 v7.00v Hyperref: PDF Unicode definition (HO) +Now handling font encoding PU ... +... no UTF-8 mapping file for font encoding PU +) +Package hyperref Info: Hyper figures OFF on input line 4177. +Package hyperref Info: Link nesting OFF on input line 4182. +Package hyperref Info: Hyper index ON on input line 4185. +Package hyperref Info: Plain pages OFF on input line 4192. +Package hyperref Info: Backreferencing OFF on input line 4197. +Package hyperref Info: Implicit mode ON; LaTeX internals redefined. +Package hyperref Info: Bookmarks ON on input line 4425. +\c@Hy@tempcnt=\count275 + (d:/software/texlive/2023/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip16 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4763. 
+\XeTeXLinkMargin=\dimen155 + (d:/software/texlive/2023/texmf-dist/tex/generic/bitset/bitset.sty +Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) + (d:/software/texlive/2023/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty +Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO) +)) +\Fld@menulength=\count276 +\Field@Width=\dimen156 +\Fld@charsize=\dimen157 +Package hyperref Info: Hyper figures OFF on input line 6042. +Package hyperref Info: Link nesting OFF on input line 6047. +Package hyperref Info: Hyper index ON on input line 6050. +Package hyperref Info: backreferencing OFF on input line 6057. +Package hyperref Info: Link coloring OFF on input line 6062. +Package hyperref Info: Link coloring with OCG OFF on input line 6067. +Package hyperref Info: PDF/A mode OFF on input line 6072. + (d:/software/texlive/2023/texmf-dist/tex/latex/base/atbegshi-ltx.sty +Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi +package with kernel methods +) +\Hy@abspage=\count277 +\c@Item=\count278 +\c@Hfootnote=\count279 +) +Package hyperref Info: Driver (autodetected): hpdftex. + (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/hpdftex.def +File: hpdftex.def 2023-02-07 v7.00v Hyperref driver for pdfTeX + (d:/software/texlive/2023/texmf-dist/tex/latex/base/atveryend-ltx.sty +Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend package +with kernel methods +) +\Fld@listcount=\count280 +\c@bookmark@seq@number=\count281 + (d:/software/texlive/2023/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty +Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) + (d:/software/texlive/2023/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty +Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) +) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 285. 
+) +\Hy@SectionHShift=\skip54 +) (d:/software/texlive/2023/texmf-dist/tex/latex/booktabs/booktabs.sty +Package: booktabs 2020/01/12 v1.61803398 Publication quality tables +\heavyrulewidth=\dimen158 +\lightrulewidth=\dimen159 +\cmidrulewidth=\dimen160 +\belowrulesep=\dimen161 +\belowbottomsep=\dimen162 +\aboverulesep=\dimen163 +\abovetopsep=\dimen164 +\cmidrulesep=\dimen165 +\cmidrulekern=\dimen166 +\defaultaddspace=\dimen167 +\@cmidla=\count282 +\@cmidlb=\count283 +\@aboverulesep=\dimen168 +\@belowrulesep=\dimen169 +\@thisruleclass=\count284 +\@lastruleclass=\count285 +\@thisrulewidth=\dimen170 +) (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks22 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. +) (d:/software/texlive/2023/texmf-dist/tex/latex/units/nicefrac.sty +Package: nicefrac 1998/08/04 v0.9b Nice fractions +\L@UnitsRaiseDisplaystyle=\skip55 +\L@UnitsRaiseTextstyle=\skip56 +\L@UnitsRaiseScriptstyle=\skip57 + (d:/software/texlive/2023/texmf-dist/tex/latex/base/ifthen.sty +Package: ifthen 2022/04/13 v1.1d Standard LaTeX ifthen package (DPC) +)) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype.sty +Package: microtype 2023/03/13 v3.1a Micro-typographical refinements (RS) +\MT@toks=\toks23 +\MT@tempbox=\box52 +\MT@count=\count286 +LaTeX Info: Redefining \noprotrusionifhmode on input line 1059. +LaTeX Info: Redefining \leftprotrusion on input line 1060. +\MT@prot@toks=\toks24 +LaTeX Info: Redefining \rightprotrusion on input line 1078. +LaTeX Info: Redefining \textls on input line 1368. +\MT@outer@kern=\dimen171 +LaTeX Info: Redefining \textmicrotypecontext on input line 1988. 
+\MT@listname@count=\count287 + (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype-pdftex.def +File: microtype-pdftex.def 2023/03/13 v3.1a Definitions specific to pdftex (RS) +LaTeX Info: Redefining \lsstyle on input line 902. +LaTeX Info: Redefining \lslig on input line 902. +\MT@outer@space=\skip58 +) +Package microtype Info: Loading configuration file microtype.cfg. + (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype.cfg +File: microtype.cfg 2023/03/13 v3.1a microtype main configuration file (RS) +)) (d:/software/texlive/2023/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK) + (d:/software/texlive/2023/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 227. + (d:/software/texlive/2023/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex +) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1357. +Package xcolor Info: Model `RGB' extended on input line 1369. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376. 
+) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) + (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) + (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 107. +) +\Gin@req@height=\dimen172 +\Gin@req@width=\dimen173 +) (d:/software/texlive/2023/texmf-dist/tex/latex/subfigure/subfigure.sty +Package: subfigure 2002/03/15 v2.1.5 subfigure package +\subfigtopskip=\skip59 +\subfigcapskip=\skip60 +\subfigcaptopadj=\dimen174 +\subfigbottomskip=\skip61 +\subfigcapmargin=\dimen175 +\subfiglabelskip=\skip62 +\c@subfigure=\count288 +\c@subtable=\count289 + +**************************************** +* Local config file subfigure.cfg used * +**************************************** +(d:/software/texlive/2023/texmf-dist/tex/latex/subfigure/subfigure.cfg) +\subfig@top=\skip63 +\subfig@bottom=\skip64 +) (d:/software/texlive/2023/texmf-dist/tex/latex/diagbox/diagbox.sty +Package: diagbox 2020/02/09 v2.3 Making table heads with diagonal lines + (d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/pict2e.sty +Package: pict2e 2020/09/30 v0.4b Improved picture commands (HjG,RN,JT) + (d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/pict2e.cfg +File: pict2e.cfg 2016/02/05 v0.1u pict2e configuration for teTeX/TeXLive +) +Package pict2e Info: Driver file: pdftex.def on input line 112. +Package pict2e Info: Driver file for pict2e: p2e-pdftex.def on input line 114. 
+ (d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/p2e-pdftex.def +File: p2e-pdftex.def 2016/02/05 v0.1u Driver-dependant file (RN,HjG,JT) +) +\pIIe@GRAPH=\toks25 +\@arclen=\dimen176 +\@arcrad=\dimen177 +\pIIe@tempdima=\dimen178 +\pIIe@tempdimb=\dimen179 +\pIIe@tempdimc=\dimen180 +\pIIe@tempdimd=\dimen181 +\pIIe@tempdime=\dimen182 +\pIIe@tempdimf=\dimen183 +) (d:/software/texlive/2023/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2017/05/25 v4.3 Infix arithmetic (KKT,FJ) +\calc@Acount=\count290 +\calc@Bcount=\count291 +\calc@Adimen=\dimen184 +\calc@Bdimen=\dimen185 +\calc@Askip=\skip65 +\calc@Bskip=\skip66 +LaTeX Info: Redefining \setlength on input line 80. +LaTeX Info: Redefining \addtolength on input line 81. +\calc@Ccount=\count292 +\calc@Cskip=\skip67 +) (d:/software/texlive/2023/texmf-dist/tex/latex/tools/array.sty +Package: array 2022/09/04 v2.5g Tabular extension package (FMi) +\col@sep=\dimen186 +\ar@mcellbox=\box53 +\extrarowheight=\dimen187 +\NC@list=\toks26 +\extratabsurround=\skip68 +\backup@length=\skip69 +\ar@cellbox=\box54 +) +\diagbox@boxa=\box55 +\diagbox@boxb=\box56 +\diagbox@boxm=\box57 +\diagbox@wd=\dimen188 +\diagbox@ht=\dimen189 +\diagbox@insepl=\dimen190 +\diagbox@insepr=\dimen191 +\diagbox@outsepl=\dimen192 +\diagbox@outsepr=\dimen193 +) (d:/software/texlive/2023/texmf-dist/tex/latex/wrapfig/wrapfig.sty +\wrapoverhang=\dimen194 +\WF@size=\dimen195 +\c@WF@wrappedlines=\count293 +\WF@box=\box58 +\WF@everypar=\toks27 +Package: wrapfig 2003/01/31 v 3.6 +) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2022/04/08 v2.17n AMS math features +\@mathmargin=\skip70 + +For additional information on amsmath, use the `?' option. 
+(d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2021/08/26 v2.01 AMS text + (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks28 +\ex@=\dimen196 +)) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen197 +) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2022/04/08 v2.04 operator names +) +\inf@bad=\count294 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count295 +\leftroot@=\count296 +LaTeX Info: Redefining \overline on input line 399. +LaTeX Info: Redefining \colon on input line 410. +\classnum@=\count297 +\DOTSCASE@=\count298 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box59 +\strutbox@=\box60 +LaTeX Info: Redefining \big on input line 722. +LaTeX Info: Redefining \Big on input line 723. +LaTeX Info: Redefining \bigg on input line 724. +LaTeX Info: Redefining \Bigg on input line 725. +\big@size=\dimen198 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count299 +LaTeX Info: Redefining \bmod on input line 905. +LaTeX Info: Redefining \pmod on input line 910. +LaTeX Info: Redefining \smash on input line 940. +LaTeX Info: Redefining \relbar on input line 970. +LaTeX Info: Redefining \Relbar on input line 971. 
+\c@MaxMatrixCols=\count300 +\dotsspace@=\muskip17 +\c@parentequation=\count301 +\dspbrk@lvl=\count302 +\tag@help=\toks29 +\row@=\count303 +\column@=\count304 +\maxfields@=\count305 +\andhelp@=\toks30 +\eqnshift@=\dimen199 +\alignsep@=\dimen256 +\tagshift@=\dimen257 +\tagwidth@=\dimen258 +\totwidth@=\dimen259 +\lineht@=\dimen260 +\@envbody=\toks31 +\multlinegap=\skip71 +\multlinetaggap=\skip72 +\mathdisplay@stack=\toks32 +LaTeX Info: Redefining \[ on input line 2953. +LaTeX Info: Redefining \] on input line 2954. +) +\linenoamsmath@ams@eqpen=\count306 + (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols +) (d:/software/texlive/2023/texmf-dist/tex/latex/mathtools/mathtools.sty +Package: mathtools 2022/06/29 v1.29 mathematical typesetting tools + (d:/software/texlive/2023/texmf-dist/tex/latex/mathtools/mhsetup.sty +Package: mhsetup 2021/03/18 v1.4 programming setup (MH) +) +\g_MT_multlinerow_int=\count307 +\l_MT_multwidth_dim=\dimen261 +\origjot=\skip73 +\l_MT_shortvdotswithinadjustabove_dim=\dimen262 +\l_MT_shortvdotswithinadjustbelow_dim=\dimen263 +\l_MT_above_intertext_sep=\dimen264 +\l_MT_below_intertext_sep=\dimen265 +\l_MT_above_shortintertext_sep=\dimen266 +\l_MT_below_shortintertext_sep=\dimen267 +\xmathstrut@box=\box61 +\xmathstrut@dim=\dimen268 +) (d:/software/texlive/2023/texmf-dist/tex/latex/amscls/amsthm.sty +Package: amsthm 2020/05/29 v2.20.6 +\thm@style=\toks33 +\thm@bodyfont=\toks34 +\thm@headfont=\toks35 +\thm@notefont=\toks36 +\thm@headpunct=\toks37 +\thm@preskip=\skip74 +\thm@postskip=\skip75 +\thm@headsep=\skip76 +\dth@everypar=\toks38 +) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks39 
+\pgfutil@tempdima=\dimen269 +\pgfutil@tempdimb=\dimen270 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box62 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +Package: pgf 2023-01-15 v3.1.10 (3.1.10) + (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks40 +\pgfkeys@temptoks=\toks41 + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex +\pgfkeys@tmptoks=\toks42 +)) +\pgf@x=\dimen271 +\pgf@y=\dimen272 +\pgf@xa=\dimen273 +\pgf@ya=\dimen274 +\pgf@xb=\dimen275 +\pgf@yb=\dimen276 +\pgf@xc=\dimen277 +\pgf@yc=\dimen278 +\pgf@xd=\dimen279 +\pgf@yd=\dimen280 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count308 +\c@pgf@countb=\count309 +\c@pgf@countc=\count310 +\c@pgf@countd=\count311 +\t@pgf@toka=\toks43 +\t@pgf@tokb=\toks44 +\t@pgf@tokc=\toks45 +\pgf@sys@id@count=\count312 + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-pdftex.def + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10) + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +))) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) 
+\pgfsyssoftpath@smallbuffer@items=\count313 +\pgfsyssoftpath@bigbuffer@items=\count314 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen281 +\pgfmath@count=\count315 +\pgfmath@box=\box63 +\pgfmath@toks=\toks46 +\pgfmath@stack@operand=\toks47 +\pgfmath@stack@operation=\toks48 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count316 +)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex +File: 
pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen282 +\pgf@picmaxx=\dimen283 +\pgf@picminy=\dimen284 +\pgf@picmaxy=\dimen285 +\pgf@pathminx=\dimen286 +\pgf@pathmaxx=\dimen287 +\pgf@pathminy=\dimen288 +\pgf@pathmaxy=\dimen289 +\pgf@xx=\dimen290 +\pgf@xy=\dimen291 +\pgf@yx=\dimen292 +\pgf@yy=\dimen293 +\pgf@zx=\dimen294 +\pgf@zy=\dimen295 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen296 +\pgf@path@lasty=\dimen297 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@shorten@end@additional=\dimen298 +\pgf@shorten@start@additional=\dimen299 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box64 +\pgf@hbox=\box65 +\pgf@layerbox@main=\box66 +\pgf@picture@serial@count=\count317 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen300 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen301 +\pgf@pt@y=\dimen302 +\pgf@pt@temp=\dimen303 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) 
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen304 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen305 +\pgf@sys@shading@range@num=\count318 +\pgf@shadingcount=\count319 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box67 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box68 +) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen306 +\pgf@nodesepend=\dimen307 +) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty +Package: pgfcomp-version-1-18 
2023-01-15 v3.1.10 (3.1.10) +)) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/math/pgfmath.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen308 +\pgffor@skip=\dimen309 +\pgffor@stack=\toks49 +\pgffor@toks=\toks50 +)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count320 +\pgfplotmarksize=\dimen310 +) +\tikz@lastx=\dimen311 +\tikz@lasty=\dimen312 +\tikz@lastxsaved=\dimen313 +\tikz@lastysaved=\dimen314 +\tikz@lastmovetox=\dimen315 +\tikz@lastmovetoy=\dimen316 +\tikzleveldistance=\dimen317 +\tikzsiblingdistance=\dimen318 +\tikz@figbox=\box69 +\tikz@figbox@bg=\box70 +\tikz@tempbox=\box71 +\tikz@tempbox@bg=\box72 +\tikztreelevel=\count321 +\tikznumberofchildren=\count322 +\tikznumberofcurrentchild=\count323 +\tikz@fig@count=\count324 + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count325 +\pgfmatrixcurrentcolumn=\count326 +\pgf@matrix@numberofcolumns=\count327 +) +\tikz@expandcount=\count328 + (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) +\c@theorem=\count329 + (d:/software/texlive/2023/texmf-dist/tex/latex/algorithms/algorithm.sty +Package: algorithm 
2009/08/24 v0.1 Document Style `algorithm' - floating environment + (d:/software/texlive/2023/texmf-dist/tex/latex/float/float.sty +Package: float 2001/11/08 v1.3d Float enhancements (AL) +\c@float@type=\count330 +\float@exts=\toks51 +\float@box=\box73 +\@float@everytoks=\toks52 +\@floatcapt=\box74 +) +\@float@every@algorithm=\toks53 +\c@algorithm=\count331 +) (d:/software/texlive/2023/texmf-dist/tex/latex/algorithms/algorithmic.sty +Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic' +\c@ALC@unique=\count332 +\c@ALC@line=\count333 +\c@ALC@rem=\count334 +\c@ALC@depth=\count335 +\ALC@tlm=\skip77 +\algorithmicindent=\skip78 +) (d:/software/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count336 +\l__pdf_internal_box=\box75 +) (./neurips_2024.aux) +\openout1 = `neurips_2024.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. +LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 106. +LaTeX Font Info: ... okay on input line 106. 
+ +*geometry* driver: auto-detecting +*geometry* detected driver: pdftex +*geometry* verbose mode - [ preamble ] result: +* driver: pdftex +* paper: letterpaper +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: +* h-part:(L,W,R)=(92.14519pt, 430.00462pt, 92.14519pt) +* v-part:(T,H,B)=(95.39737pt, 556.47656pt, 143.09605pt) +* \paperwidth=614.295pt +* \paperheight=794.96999pt +* \textwidth=430.00462pt +* \textheight=556.47656pt +* \oddsidemargin=19.8752pt +* \evensidemargin=19.8752pt +* \topmargin=-13.87262pt +* \headheight=12.0pt +* \headsep=25.0pt +* \topskip=10.0pt +* \footskip=30.0pt +* \marginparwidth=65.0pt +* \marginparsep=11.0pt +* \columnsep=10.0pt +* \skip\footins=9.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemarginfalse +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +*geometry* verbose mode - [ newgeometry ] result: +* driver: pdftex +* paper: letterpaper +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: +* h-part:(L,W,R)=(108.405pt, 397.48499pt, 108.40501pt) +* v-part:(T,H,B)=(72.26999pt, 650.43pt, 72.27pt) +* \paperwidth=614.295pt +* \paperheight=794.96999pt +* \textwidth=397.48499pt +* \textheight=650.43pt +* \oddsidemargin=36.13501pt +* \evensidemargin=36.13501pt +* \topmargin=-37.0pt +* \headheight=12.0pt +* \headsep=25.0pt +* \topskip=10.0pt +* \footskip=30.0pt +* \marginparwidth=65.0pt +* \marginparsep=11.0pt +* \columnsep=10.0pt +* \skip\footins=9.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemarginfalse +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +Package hyperref Info: Link coloring OFF on input line 106. +(./neurips_2024.out) (./neurips_2024.out) +\@outlinefile=\write4 +\openout4 = `neurips_2024.out'. + +LaTeX Info: Redefining \microtypecontext on input line 106. +Package microtype Info: Applying patch `item' on input line 106. 
+Package microtype Info: Applying patch `toc' on input line 106. +Package microtype Info: Applying patch `eqnum' on input line 106. +Package microtype Info: Applying patch `footnote' on input line 106. +Package microtype Info: Applying patch `verbatim' on input line 106. +Package microtype Info: Generating PDF output. +Package microtype Info: Character protrusion enabled (level 2). +Package microtype Info: Using default protrusion set `alltext'. +Package microtype Info: Automatic font expansion enabled (level 2), +(microtype) stretch: 20, shrink: 20, step: 1, non-selected. +Package microtype Info: Using default expansion set `alltext-nott'. +LaTeX Info: Redefining \showhyphens on input line 106. +Package microtype Info: No adjustment of tracking. +Package microtype Info: No adjustment of interword spacing. +Package microtype Info: No adjustment of character kerning. + (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-ptm.cfg +File: mt-ptm.cfg 2006/04/20 v1.7 microtype config. file: Times (RS) +) (d:/software/texlive/2023/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count337 +\scratchdimen=\dimen319 +\scratchbox=\box76 +\nofMPsegments=\count338 +\nofMParguments=\count339 +\everyMPshowfont=\toks54 +\MPscratchCnt=\count340 +\MPscratchDim=\dimen320 +\MPnumerator=\count341 +\makeMPintoPDFobject=\count342 +\everyMPtoPDFconversion=\toks55 +) (d:/software/texlive/2023/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 485. + (d:/software/texlive/2023/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Live +)) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-cmr.cfg +File: mt-cmr.cfg 2013/05/19 v2.2 microtype config. 
file: Computer Modern Roman (RS) +) +LaTeX Font Info: Trying to load font information for U+msa on input line 110. + (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-msa.cfg +File: mt-msa.cfg 2006/02/04 v1.1 microtype config. file: AMS symbols (a) (RS) +) +LaTeX Font Info: Trying to load font information for U+msb on input line 110. + (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-msb.cfg +File: mt-msb.cfg 2005/06/01 v1.0 microtype config. file: AMS symbols (b) (RS) +) +LaTeX Font Info: Trying to load font information for T1+cmtt on input line 110. + (d:/software/texlive/2023/texmf-dist/tex/latex/base/t1cmtt.fd +File: t1cmtt.fd 2022/07/10 v2.5l Standard LaTeX font definitions +) +Package microtype Info: Loading generic protrusion settings for font family +(microtype) `cmtt' (encoding: T1). +(microtype) For optimal results, create family-specific settings. +(microtype) See the microtype manual for details. +LaTeX Font Info: Trying to load font information for T1+phv on input line 126. + (d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/t1phv.fd +File: t1phv.fd 2020/03/25 scalable font definitions for T1/phv. +) +Package microtype Info: Loading generic protrusion settings for font family +(microtype) `phv' (encoding: T1). +(microtype) For optimal results, create family-specific settings. +(microtype) See the microtype manual for details. 
+ (./main/introduction.tex [1 + + +{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc}]) (./main/preliminaries.tex [2]) (./main/motivation.tex [3 +pdfTeX warning (ext4): destination with the same identifier (name{table.1}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.77 \end{equation*} + ] +Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 138. + [4]) (./main/theory.tex [5]) (./main/experiment.tex (./main/pic/randomwalk.tex) (./main/pic/BairdExample.tex) [6 +pdfTeX warning (ext4): destination with the same identifier (name{figure.1}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.46 + +pdfTeX warning (ext4): destination with the same identifier (name{figure.2}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.46 + ] +
+File: main/pic/maze_13_13.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/maze_13_13.pdf used on input line 53. +(pdftex.def) Requested size: 73.9715pt x 58.14139pt. +
+File: main/pic/dependent_new.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/dependent_new.pdf used on input line 78. +(pdftex.def) Requested size: 119.24675pt x 79.49658pt. +
+File: main/pic/tabular_new.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/tabular_new.pdf used on input line 82. +(pdftex.def) Requested size: 119.23904pt x 79.49194pt. +
+File: main/pic/inverted_new.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/inverted_new.pdf used on input line 87. +(pdftex.def) Requested size: 119.24063pt x 79.49458pt. +
+File: main/pic/counterexample_quanju_new.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/counterexample_quanju_new.pdf used on input line 91. +(pdftex.def) Requested size: 119.24184pt x 79.49428pt. + [7 +pdfTeX warning (ext4): destination with the same identifier (name{figure.3}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.131 + <./main/pic/maze_13_13.pdf> <./main/pic/dependent_new.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/dependent_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page +> <./main/pic/tabular_new.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/tabular_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page +> <./main/pic/inverted_new.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/inverted_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page +> <./main/pic/counterexample_quanju_new.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/counterexample_quanju_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page +>]) (./main/relatedwork.tex +
+File: main/pic/maze_complete.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/maze_complete.pdf used on input line 7. +(pdftex.def) Requested size: 119.24721pt x 79.4901pt. +
+File: main/pic/cw_complete.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/cw_complete.pdf used on input line 11. +(pdftex.def) Requested size: 119.24373pt x 79.49335pt. +
+File: main/pic/mt_complete.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/mt_complete.pdf used on input line 16. +(pdftex.def) Requested size: 119.24463pt x 79.49413pt. +
+File: main/pic/Acrobot_complete.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/Acrobot_complete.pdf used on input line 20. +(pdftex.def) Requested size: 119.23886pt x 79.49504pt. + [8 +pdfTeX warning (ext4): destination with the same identifier (name{figure.4}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.57 + +pdfTeX warning (ext4): destination with the same identifier (name{table.2}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... +l.57 + <./main/pic/maze_complete.pdf> <./main/pic/cw_complete.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/cw_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page +> <./main/pic/mt_complete.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/mt_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page +> <./main/pic/Acrobot_complete.pdf + +pdfTeX warning: pdflatex.exe (file ./main/pic/Acrobot_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page +>]) (./main/conclusion.tex) (./main/appendix.tex [9] [10] + +LaTeX Warning: Command \textemdash invalid in math mode on input line 229. + + +LaTeX Warning: Command \textemdash invalid in math mode on input line 229. + +[11] [12] [13] +Underfull \hbox (badness 1946) in paragraph at lines 683--696 +[]\T1/ptm/m/n/10 (+20) Three ran-dom walk ex-per-i-ments: the $\OML/cmm/m/it/10 $ \T1/ptm/m/n/10 (+20) val-ues for all al-go-rithms are in the range of + [] + +[14] [15] +Overfull \hbox (33.58313pt too wide) in paragraph at lines 738--752 + [][] + [] + +) (./neurips_2024.bbl [16 +pdfTeX warning (ext4): destination with the same identifier (name{table.3}) has been already used, duplicate ignored + ...shipout:D \box_use:N \l_shipout_box + \__shipout_drop_firstpage_... 
+l.12 + ] [17]) [18] (./neurips_2024.aux) +Package rerunfilecheck Info: File `neurips_2024.out' has not changed. +(rerunfilecheck) Checksum: E5788AEC1D4F936207967A17A6B3E0A1;3587. + ) +Here is how much of TeX's memory you used: + 26626 strings out of 476025 + 484842 string characters out of 5789524 + 1897382 words of memory out of 5000000 + 46086 multiletter control sequences out of 15000+600000 + 567455 words of font info for 255 fonts, out of 8000000 for 9000 + 1141 hyphenation exceptions out of 8191 + 84i,16n,80p,1005b,1065s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on neurips_2024.pdf (18 pages, 2290177 bytes). +PDF statistics: + 1011 PDF objects out of 1200 (max. 8388607) + 839 compressed objects within 9 object streams + 195 named destinations out of 1000 (max. 500000) + 52442 words of extra memory for PDF output out of 61914 (max. 10000000) + diff --git b/neurips_2024.out a/neurips_2024.out new file mode 100644 index 0000000..fe9cf21 --- /dev/null +++ a/neurips_2024.out @@ -0,0 +1,20 @@ +\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1 +\BOOKMARK [1][-]{section.2}{\376\377\000B\000a\000c\000k\000g\000r\000o\000u\000n\000d}{}% 2 +\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{}% 3 +\BOOKMARK [2][-]{subsection.3.1}{\376\377\000M\000o\000t\000i\000v\000a\000t\000i\000o\000n}{section.3}% 4 +\BOOKMARK [2][-]{subsection.3.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D}{section.3}% 5 +\BOOKMARK 
[2][-]{subsection.3.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000C\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D\000C}{section.3}% 6 +\BOOKMARK [1][-]{section.4}{\376\377\000T\000h\000e\000o\000r\000e\000t\000i\000c\000a\000l\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{}% 7 +\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000S\000t\000u\000d\000i\000e\000s}{}% 8 +\BOOKMARK [2][-]{subsection.5.1}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000T\000a\000s\000k\000s}{section.5}% 9 +\BOOKMARK [2][-]{subsection.5.2}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000R\000e\000s\000u\000l\000t\000s\000\040\000a\000n\000d\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section.5}% 10 +\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000l\000a\000t\000e\000d\000\040\000W\000o\000r\000k}{}% 11 +\BOOKMARK [2][-]{subsection.6.1}{\376\377\000D\000i\000f\000f\000e\000r\000e\000n\000c\000e\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000V\000M\000Q\000\040\000a\000n\000d\000\040\000R\000-\000l\000e\000a\000r\000n\000i\000n\000g}{section.6}% 12 +\BOOKMARK [2][-]{subsection.6.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g}{section.6}% 13 +\BOOKMARK [2][-]{subsection.6.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{section.6}% 14 +\BOOKMARK 
[1][-]{section.7}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000W\000o\000r\000k}{}% 15 +\BOOKMARK [1][-]{appendix.A}{\376\377\000R\000e\000l\000e\000v\000a\000n\000t\000\040\000p\000r\000o\000o\000f\000s}{}% 16 +\BOOKMARK [2][-]{subsection.A.1}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0001}{appendix.A}% 17 +\BOOKMARK [2][-]{subsection.A.2}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000C\000o\000r\000o\000l\000l\000a\000r\000y\000\040\0004\000.\0002}{appendix.A}% 18 +\BOOKMARK [2][-]{subsection.A.3}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0003}{appendix.A}% 19 +\BOOKMARK [1][-]{appendix.B}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000d\000e\000t\000a\000i\000l\000s}{}% 20 diff --git b/neurips_2024.pdf a/neurips_2024.pdf new file mode 100644 index 0000000..4c26ff0 Binary files /dev/null and a/neurips_2024.pdf differ diff --git b/neurips_2024.sty a/neurips_2024.sty new file mode 100644 index 0000000..feba361 --- /dev/null +++ a/neurips_2024.sty @@ -0,0 +1,382 @@ +% partial rewrite of the LaTeX2e package for submissions to the +% Conference on Neural Information Processing Systems (NeurIPS): +% +% - uses more LaTeX conventions +% - line numbers at submission time replaced with aligned numbers from +% lineno package +% - \nipsfinalcopy replaced with [final] package option +% - automatically loads times package for authors +% - loads natbib automatically; this can be suppressed with the +% [nonatbib] package option +% - adds foot line to first page identifying the conference +% - adds preprint option for submission to e.g. 
arXiv +% - conference acronym modified +% +% Roman Garnett (garnett@wustl.edu) and the many authors of +% nips15submit_e.sty, including MK and drstrip@sandia +% +% last revision: March 2024 + +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{neurips_2024}[2024/03/31 NeurIPS 2024 submission/camera-ready style file] + +% declare final option, which creates camera-ready copy +\newif\if@neuripsfinal\@neuripsfinalfalse +\DeclareOption{final}{ + \@neuripsfinaltrue +} + +% declare nonatbib option, which does not load natbib in case of +% package clash (users can pass options to natbib via +% \PassOptionsToPackage) +\newif\if@natbib\@natbibtrue +\DeclareOption{nonatbib}{ + \@natbibfalse +} + +% declare preprint option, which creates a preprint version ready for +% upload to, e.g., arXiv +\newif\if@preprint\@preprintfalse +\DeclareOption{preprint}{ + \@preprinttrue +} + +\ProcessOptions\relax + +% determine whether this is an anonymized submission +\newif\if@submission\@submissiontrue +\if@neuripsfinal\@submissionfalse\fi +\if@preprint\@submissionfalse\fi + +% fonts +\renewcommand{\rmdefault}{ptm} +\renewcommand{\sfdefault}{phv} + +% change this every year for notice string at bottom +\newcommand{\@neuripsordinal}{38th} +\newcommand{\@neuripsyear}{2024} +\newcommand{\@neuripslocation}{Vancouver} + +% acknowledgments +\usepackage{environ} +\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}} +\NewEnviron{ack}{% + \acksection + \BODY +} + + +% load natbib unless told otherwise +\if@natbib + \RequirePackage{natbib} +\fi + +% set page geometry +\usepackage[verbose=true,letterpaper]{geometry} +\AtBeginDocument{ + \newgeometry{ + textheight=9in, + textwidth=5.5in, + top=1in, + headheight=12pt, + headsep=25pt, + footskip=30pt + } + \@ifpackageloaded{fullpage} + {\PackageWarning{neurips_2024}{fullpage package not allowed! 
Overwriting formatting.}} + {} +} + +\widowpenalty=10000 +\clubpenalty=10000 +\flushbottom +\sloppy + + +% font sizes with reduced leading +\renewcommand{\normalsize}{% + \@setfontsize\normalsize\@xpt\@xipt + \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ + \abovedisplayshortskip \z@ \@plus 3\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ +} +\normalsize +\renewcommand{\small}{% + \@setfontsize\small\@ixpt\@xpt + \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ + \abovedisplayshortskip \z@ \@plus 2\p@ + \belowdisplayskip \abovedisplayskip + \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ +} +\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} +\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} +\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} +\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} +\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} +\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} +\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} +\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} + +% sections with less space +\providecommand{\section}{} +\renewcommand{\section}{% + \@startsection{section}{1}{\z@}% + {-2.0ex \@plus -0.5ex \@minus -0.2ex}% + { 1.5ex \@plus 0.3ex \@minus 0.2ex}% + {\large\bf\raggedright}% +} +\providecommand{\subsection}{} +\renewcommand{\subsection}{% + \@startsection{subsection}{2}{\z@}% + {-1.8ex \@plus -0.5ex \@minus -0.2ex}% + { 0.8ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\subsubsection}{} +\renewcommand{\subsubsection}{% + \@startsection{subsubsection}{3}{\z@}% + {-1.5ex \@plus -0.5ex \@minus -0.2ex}% + { 0.5ex \@plus 0.2ex}% + {\normalsize\bf\raggedright}% +} +\providecommand{\paragraph}{} +\renewcommand{\paragraph}{% + \@startsection{paragraph}{4}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subparagraph}{} +\renewcommand{\subparagraph}{% + 
\@startsection{subparagraph}{5}{\z@}% + {1.5ex \@plus 0.5ex \@minus 0.2ex}% + {-1em}% + {\normalsize\bf}% +} +\providecommand{\subsubsubsection}{} +\renewcommand{\subsubsubsection}{% + \vskip5pt{\noindent\normalsize\rm\raggedright}% +} + +% float placement +\renewcommand{\topfraction }{0.85} +\renewcommand{\bottomfraction }{0.4} +\renewcommand{\textfraction }{0.1} +\renewcommand{\floatpagefraction}{0.7} + +\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@} +\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@} + +\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip} +\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip} + +% swap above/belowcaptionskip lengths for tables +\renewenvironment{table} + {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}% + \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}% + \@float{table}} + {\end@float} + +% footnote formatting +\setlength{\footnotesep }{6.65\p@} +\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} +\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} +\setcounter{footnote}{0} + +% paragraph formatting +\setlength{\parindent}{\z@} +\setlength{\parskip }{5.5\p@} + +% list formatting +\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} +\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} +\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} +\setlength{\leftmargin }{3pc} +\setlength{\leftmargini }{\leftmargin} +\setlength{\leftmarginii }{2em} +\setlength{\leftmarginiii}{1.5em} +\setlength{\leftmarginiv }{1.0em} +\setlength{\leftmarginv }{0.5em} +\def\@listi {\leftmargin\leftmargini} +\def\@listii {\leftmargin\leftmarginii + \labelwidth\leftmarginii + \advance\labelwidth-\labelsep + \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ + \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii + 
\advance\labelwidth-\labelsep + \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ + \parsep \z@ + \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ + \itemsep \topsep} +\def\@listiv {\leftmargin\leftmarginiv + \labelwidth\leftmarginiv + \advance\labelwidth-\labelsep} +\def\@listv {\leftmargin\leftmarginv + \labelwidth\leftmarginv + \advance\labelwidth-\labelsep} +\def\@listvi {\leftmargin\leftmarginvi + \labelwidth\leftmarginvi + \advance\labelwidth-\labelsep} + +% create title +\providecommand{\maketitle}{} +\renewcommand{\maketitle}{% + \par + \begingroup + \renewcommand{\thefootnote}{\fnsymbol{footnote}} + % for perfect author name centering + \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} + % The footnote-mark was overlapping the footnote-text, + % added the following to fix this problem (MK) + \long\def\@makefntext##1{% + \parindent 1em\noindent + \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 + } + \thispagestyle{empty} + \@maketitle + \@thanks + \@notice + \endgroup + \let\maketitle\relax + \let\thanks\relax +} + +% rules for title box at top of first page +\newcommand{\@toptitlebar}{ + \hrule height 4\p@ + \vskip 0.25in + \vskip -\parskip% +} +\newcommand{\@bottomtitlebar}{ + \vskip 0.29in + \vskip -\parskip + \hrule height 1\p@ + \vskip 0.09in% +} + +% create title (includes both anonymized and non-anonymized versions) +\providecommand{\@maketitle}{} +\renewcommand{\@maketitle}{% + \vbox{% + \hsize\textwidth + \linewidth\hsize + \vskip 0.1in + \@toptitlebar + \centering + {\LARGE\bf \@title\par} + \@bottomtitlebar + \if@submission + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} + Anonymous Author(s) \\ + Affiliation \\ + Address \\ + \texttt{email} \\ + \end{tabular}% + \else + \def\And{% + \end{tabular}\hfil\linebreak[0]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + \def\AND{% + \end{tabular}\hfil\linebreak[4]\hfil% + \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% + } + 
\begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% + \fi + \vskip 0.3in \@minus 0.1in + } +} + +% add conference notice to bottom of first page +\newcommand{\ftype@noticebox}{8} +\newcommand{\@notice}{% + % give a bit of extra room back to authors on first page + \enlargethispage{2\baselineskip}% + \@float{noticebox}[b]% + \footnotesize\@noticestring% + \end@float% +} + +% abstract styling +\renewenvironment{abstract}% +{% + \vskip 0.075in% + \centerline% + {\large\bf Abstract}% + \vspace{0.5ex}% + \begin{quote}% +} +{ + \par% + \end{quote}% + \vskip 1ex% +} + +% For the paper checklist +\newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}} +\newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}} +\newcommand{\answerNA}[1][]{\textcolor{gray}{[NA] #1}} +\newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}} +\newcommand{\justificationTODO}[1][]{\textcolor{red}{\bf [TODO]}} + +% handle tweaks for camera-ready copy vs. submission copy +\if@preprint + \newcommand{\@noticestring}{% + Preprint. Under review.% + } +\else + \if@neuripsfinal + \newcommand{\@noticestring}{% + \@neuripsordinal\/ Conference on Neural Information Processing Systems + (NeurIPS \@neuripsyear).%, \@neuripslocation.% + } + \else + \newcommand{\@noticestring}{% + Submitted to \@neuripsordinal\/ Conference on Neural Information + Processing Systems (NeurIPS \@neuripsyear). 
Do not distribute.% + } + + % hide the acknowledgements + \NewEnviron{hide}{} + \let\ack\hide + \let\endack\endhide + + % line numbers for submission + \RequirePackage{lineno} + \linenumbers + + % fix incompatibilities between lineno and amsmath, if required, by + % transparently wrapping linenomath environments around amsmath + % environments + \AtBeginDocument{% + \@ifpackageloaded{amsmath}{% + \newcommand*\patchAmsMathEnvironmentForLineno[1]{% + \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname + \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname + \renewenvironment{#1}% + {\linenomath\csname old#1\endcsname}% + {\csname oldend#1\endcsname\endlinenomath}% + }% + \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% + \patchAmsMathEnvironmentForLineno{#1}% + \patchAmsMathEnvironmentForLineno{#1*}% + }% + \patchBothAmsMathEnvironmentsForLineno{equation}% + \patchBothAmsMathEnvironmentsForLineno{align}% + \patchBothAmsMathEnvironmentsForLineno{flalign}% + \patchBothAmsMathEnvironmentsForLineno{alignat}% + \patchBothAmsMathEnvironmentsForLineno{gather}% + \patchBothAmsMathEnvironmentsForLineno{multline}% + } + {} + } + \fi +\fi + + +\endinput diff --git b/neurips_2024.synctex.gz a/neurips_2024.synctex.gz new file mode 100644 index 0000000..8998458 Binary files /dev/null and a/neurips_2024.synctex.gz differ diff --git b/neurips_2024.tex a/neurips_2024.tex new file mode 100644 index 0000000..b73affa --- /dev/null +++ a/neurips_2024.tex @@ -0,0 +1,147 @@ +\documentclass{article} + + +% if you need to pass options to natbib, use, e.g.: +% \PassOptionsToPackage{numbers, compress}{natbib} +% before loading neurips_2024 + + +% ready for submission +\usepackage{neurips_2024} + + +% to compile a preprint version, e.g., for submission to arXiv, add add the +% [preprint] option: +% \usepackage[preprint]{neurips_2024} + + +% to compile a camera-ready version, add the [final] option, e.g.: +% \usepackage[final]{neurips_2024} + + 
+% to avoid loading the natbib package, add option nonatbib: +% \usepackage[nonatbib]{neurips_2024} + + +\usepackage[utf8]{inputenc} % allow utf-8 input +\usepackage[T1]{fontenc} % use 8-bit T1 fonts +\usepackage{hyperref} % hyperlinks +\usepackage{url} % simple URL typesetting +\usepackage{booktabs} % professional-quality tables +\usepackage{amsfonts} % blackboard math symbols +\usepackage{nicefrac} % compact symbols for 1/2, etc. +\usepackage{microtype} % microtypography +\usepackage{xcolor} % colors +\usepackage{graphicx} +\usepackage{subfigure} +\usepackage{diagbox} +\usepackage{wrapfig} +\usepackage{booktabs} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{mathtools} +\usepackage{amsthm} +\usepackage{tikz} + +\theoremstyle{plain} +\newtheorem{theorem}{Theorem}[section] +\newtheorem{proposition}[theorem]{Proposition} +\newtheorem{lemma}[theorem]{Lemma} +\newtheorem{corollary}[theorem]{Corollary} +\theoremstyle{definition} +\newtheorem{definition}[theorem]{Definition} +\newtheorem{assumption}[theorem]{Assumption} +\theoremstyle{remark} +\newtheorem{remark}[theorem]{Remark} + +\usepackage{algorithm} +\usepackage{algorithmic} + + + +\title{Is Minimizing Errors the Only Option for Value-based Reinforcement Learning?} + + +% The \author macro works with any number of authors. There are two commands +% used to separate the names and addresses of multiple authors: \And and \AND. +% +% Using \And between authors leaves it to LaTeX to determine where to break the +% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4 +% authors names on the first line, and the last on the second line, try using +% \AND instead of \And before the third author name. 
+ + +\author{% + David S.~Hippocampus\thanks{Use footnote for providing further information + about author (webpage, alternative address)---\emph{not} for acknowledging + funding agencies.} \\ + Department of Computer Science\\ + Cranberry-Lemon University\\ + Pittsburgh, PA 15213 \\ + \texttt{hippo@cs.cranberry-lemon.edu} \\ + % examples of more authors + % \And + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ + % \AND + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ + % \And + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ + % \And + % Coauthor \\ + % Affiliation \\ + % Address \\ + % \texttt{email} \\ +} + + +\begin{document} + + +\maketitle + + +\begin{abstract} + The existing research on + value-based reinforcement learning also minimizes the error. + However, is error minimization really the only option + for value-based reinforcement learning? + We can easily observe that the policy on action + choosing probabilities is often related to the relative values, + and has nothing to do with their absolute values. + Based on this observation, we propose the objective + of variance minimization instead of error minimization, + derive many new variance minimization algorithms, both including a traditional parameter $\omega$, + and conduct an analysis of the convergence rate and experiments. + The experimental results show that our proposed variance minimization algorithms + converge much faster. +\end{abstract} + +\input{main/introduction.tex} +\input{main/preliminaries.tex} +\input{main/motivation.tex} +\input{main/theory.tex} +\input{main/experiment.tex} +\input{main/relatedwork.tex} +\input{main/conclusion.tex} + + +\appendix + +\input{main/appendix.tex} + + +\bibliographystyle{named} +\bibliography{neurips_2024} +% \bibliographystyle{neurips_2024} + + +\end{document} \ No newline at end of file