% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}
\RequirePackage{float}
\RequirePackage{ifthen}
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}
% Declare Options
% first appearance
\DeclareOption{plain}{
\renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
\renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
\renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
\renewcommand{\ALG@within}{part}
\setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
\renewcommand{\ALG@within}{chapter}
\setboolean{ALG@within}{true}
}
\DeclareOption{section}{
\renewcommand{\ALG@within}{section}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
\renewcommand{\ALG@within}{subsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
\renewcommand{\ALG@within}{subsubsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{nothing}{
\renewcommand{\ALG@within}{nothing}
\setboolean{ALG@within}{true}
}
\DeclareOption*{\edef\ALG@name{\CurrentOption}}
% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
\ifthenelse{\boolean{ALG@within}}{
\ifthenelse{\equal{\ALG@within}{part}}
{\newfloat{algorithm}{htbp}{loa}[part]}{}
\ifthenelse{\equal{\ALG@within}{chapter}}
{\newfloat{algorithm}{htbp}{loa}[chapter]}{}
\ifthenelse{\equal{\ALG@within}{section}}
{\newfloat{algorithm}{htbp}{loa}[section]}{}
\ifthenelse{\equal{\ALG@within}{subsection}}
{\newfloat{algorithm}{htbp}{loa}[subsection]}{}
\ifthenelse{\equal{\ALG@within}{subsubsection}}
{\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
\ifthenelse{\equal{\ALG@within}{nothing}}
{\newfloat{algorithm}{htbp}{loa}}{}
}{
\newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}
\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
\newcounter{ALC@line}
\newcounter{ALC@rem}
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}
%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola
\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
\renewcommand{\@toodeep}{}
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}
\section{Relevant proofs}
\subsection{Proof of Theorem \ref{theorem1}}
\label{proofth1}
\begin{proof}
\label{th1proof}
The proof is based on Borkar's Theorem for
general stochastic approximation recursions with two time scales
\cite{borkar1997stochastic}.
% The new TD error for the linear setting is
% \begin{equation*}
% \delta_{\text{new}}=r+\gamma
% \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta].
% \end{equation*}
A new one-step
linear TD solution is defined
as:
\begin{equation*}
0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b.
\end{equation*}
Thus, the VMTD's solution is
$\theta_{\text{VMTD}}=A^{-1}b$.
First, note that recursion (\ref{theta}) can be rewritten as
\begin{equation*}
\theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k),
\end{equation*}
where
\begin{equation*}
\xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k.
\end{equation*}
Due to the step-size schedule $\alpha_k = o(\beta_k)$,
$\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$.
That is, the increments in iteration (\ref{omega}) are uniformly larger than
those in (\ref{theta}); thus (\ref{omega}) is the faster recursion.
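For concreteness, one admissible choice of step sizes (of the same polynomial form as the step-size assumption recalled in Appendix \ref{proofcorollary4_2}) is, for example,
\begin{equation*}
\alpha_k=(k+1)^{-0.9},\qquad \beta_k=(k+1)^{-0.6},\qquad
\text{so that}\quad \frac{\alpha_k}{\beta_k}=(k+1)^{-0.3}\rightarrow 0.
\end{equation*}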
Along the faster time scale, the iterations of (\ref{omega}) and (\ref{theta})
are associated with the following ODE system:
\begin{equation}
\dot{\theta}(t) = 0,
\label{thetaFast}
\end{equation}
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t).
\label{omegaFast}
\end{equation}
Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when
viewed from the faster timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\theta$ that depends on the initial condition $\theta_0$ of recursion
(\ref{theta}).
Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t).
\label{omegaFastFinal}
\end{equation}
Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$,
i.e., the driving vector field of the ODE (\ref{omegaFastFinal}).
It is easy to see that the function $h$ is Lipschitz with coefficient $1$.
Let $h_{\infty}(\cdot)$ be the function defined by
$h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$.
Then $h_{\infty}(\omega)= -\omega$ is well-defined.
For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$
is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
\dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
\label{omegaInfty}
\end{equation}
apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
associated strict Liapunov function. Then,
the origin of (\ref{omegaInfty}) is a globally asymptotically stable
equilibrium.
Consider now the recursion (\ref{omega}).
Let
$M_{k+1}=(\delta_k-\omega_k)
-\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$,
$k\geq 1$ are the sigma fields
generated by $\omega_0,\theta_0,\omega_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that
satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
Because $\phi_k$, $r_k$, and $\phi_k'$ have
uniformly bounded second moments, it can be seen that for some constant
$c_1>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
c_1(1+||\omega_k||^2+||\theta_k||^2).
\end{equation*}
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
Consider now the slower time scale recursion (\ref{theta}).
Based on the above analysis, (\ref{theta}) can be rewritten as
\begin{equation*}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k.
\end{equation*}
Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$,
$k\geq 1$ be the sigma fields
generated by $\theta_0,\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
Let
$
Z_{k+1} = Y_{k}-\mathbb{E}[Y_{k}|\mathcal{G}(k)],
$
where
\begin{equation*}
Y_{k}=(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k.
\end{equation*}
Consequently,
\begin{equation*}
\begin{array}{ccl}
\mathbb{E}[Y_k|\mathcal{G}(k)]&=&\mathbb{E}[(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k|\mathcal{G}(k)]\\
&=&\mathbb{E}[\delta_k\phi_k|\theta_k]
-\mathbb{E}[\mathbb{E}[\delta_k|\theta_k]\phi_k]\\
&=&\mathbb{E}[\delta_k\phi_k|\theta_k]
-\mathbb{E}[\delta_k|\theta_k]\mathbb{E}[\phi_k]\\
&=&\mathrm{Cov}(\delta_k|\theta_k,\phi_k),
\end{array}
\end{equation*}
where $\mathrm{Cov}(\cdot,\cdot)$ is a covariance operator.
Thus,
\begin{equation*}
\begin{array}{ccl}
Z_{k+1}&=&(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k-\mathrm{Cov}(\delta_k|\theta_k,\phi_k).
\end{array}
\end{equation*}
It is easy to verify that $Z_{k+1},k\geq0$ are integrable random variables that
satisfy $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$.
Also, because $\phi_k$, $r_k$, and $\phi_k'$ have
uniformly bounded second moments, it can be seen that for some constant
$c_2>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
c_2(1+||\theta_k||^2).
\end{equation*}
Consider now the following ODE associated with (\ref{theta}):
\begin{equation}
\begin{array}{ccl}
\dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\
&=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\
&=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\
&=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\
&=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\
&=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\
&=&-A\theta(t)+b.
\end{array}
\label{odetheta}
\end{equation}
Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE
(\ref{odetheta}).
\begin{equation*}
\vec{h}(\theta(t))=-A\theta(t)+b.
\end{equation*}
Consider the cross-covariance matrix,
\begin{equation}
\begin{array}{ccl}
A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\
&=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\
\end{array}
\label{covariance}
\end{equation}
where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$
\footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to
the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state of the Markov chain is reachable again or is
initialized randomly, under on-policy updates.}.
Note that the covariance matrices $\mathrm{Cov}(\phi,\phi)$ and
$\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are positive semi-definite.
Then, the matrix $A$ is positive semi-definite because, by (\ref{covariance}), it is
a positive-weighted linear combination of two positive semi-definite matrices.
Furthermore, $A$ is nonsingular by assumption.
Hence, the cross-covariance matrix $A$ is positive definite.
Therefore,
$\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically
stable equilibrium for ODE (\ref{odetheta}).
Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow
\infty}\frac{\vec{h}(r\theta)}{r}$. Then
$\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined.
Consider now the ODE
\begin{equation}
\dot{\theta}(t)=-A\theta(t).
\label{odethetafinal}
\end{equation}
The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium.
Thus, Assumptions (A1) and (A2) are verified.
\end{proof}
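As a purely illustrative companion to the argument above, the following minimal sketch simulates the coupled recursions for $\theta$ and $\omega$ on a small synthetic on-policy chain; the chain, features, noise level and step-size exponents are hypothetical choices and are not part of the formal proof.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n_states, d, gamma = 5, 3, 0.9
Phi = rng.normal(size=(n_states, d))                 # feature vectors phi(s) as rows
P = rng.dirichlet(np.ones(n_states), size=n_states)  # on-policy transition matrix
r_bar = rng.normal(size=n_states)                    # expected rewards

theta, omega, s = np.zeros(d), 0.0, 0
for k in range(1, 200001):
    alpha, beta = (k + 1) ** -0.9, (k + 1) ** -0.6   # alpha_k = o(beta_k)
    s_next = rng.choice(n_states, p=P[s])
    r = r_bar[s] + 0.1 * rng.standard_normal()
    delta = r + gamma * Phi[s_next] @ theta - Phi[s] @ theta
    theta = theta + alpha * (delta - omega) * Phi[s]  # slow recursion for theta
    omega = omega + beta * (delta - omega)            # fast recursion for omega
    s = s_next

print(theta, omega)  # omega should track E[delta]; theta should settle near A^{-1} b
\end{verbatim}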
\subsection{Proof of Corollary \ref{corollary4_2}}
\label{proofcorollary4_2}
The update formulas in linear two-timescale algorithms are as follows:
\begin{equation}
\theta_{k+1}=\theta_{k} + \alpha_{k}[h_1(\theta_{k},\omega_{k})+M^{(1)}_{k+1}],
\end{equation}
\begin{equation}
\omega_{k+1}=\omega_{k} + \beta_{k}[h_2(\theta_{k},\omega_{k})+M^{(2)}_{k+1}],
\end{equation}
where $\alpha_k, \beta_k \in \mathbb{R} $ are stepsizes and $M^{(1)} \in \mathbb{R}^{d_1}, M^{(2)} \in \mathbb{R}^{d_2}$
denote noise.
$h_1 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{1}}$ and
$h_2 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{2}}$ have the
form, respectively,
\begin{equation}
h_{1}(\theta,\omega)=v_1 - \Gamma_1 \theta - W_1\omega,
\end{equation}
\begin{equation}
h_{2}(\theta,\omega)=v_2 - \Gamma_2 \theta - W_2\omega,
\end{equation}
where $v_1 \in \mathbb{R}^{d_1}$, $v_2 \in \mathbb{R}^{d_2}$, $\Gamma_1 \in \mathbb{R}^{d_1 \times d_1}$
, $\Gamma_2 \in \mathbb{R}^{d_2 \times d_1}$, $W_1 \in \mathbb{R}^{d_1 \times d_2}$ and
$W_2 \in \mathbb{R}^{d_2 \times d_2}$. $d_1$ and $d_2$ are the dimensions of vectors $\theta$ and $\omega$, respectively.
Theorem 3 in \cite{dalal2020tale} still holds even when $d_1$ is not equal to $d_2$. For the VMTD algorithm, $d_2$ is equal to $1$.
% Before proving the Corollary \ref{corollary4_2},
\citet{dalal2020tale} presents
the matrix assumption and the step-size assumption, and
defines the sparse projection.
\begin{assumption}
\label{matrixassumption}
(Matrix Assumption).
$W_2$ and $X_1 = \Gamma_1 - W_1 W_{2}^{-1}\Gamma_2$ are positive definite (not necessarily symmetric).
\end{assumption}
\begin{assumption}
\label{stepsizeassumption}
(Step Size Assumption).
$\alpha_k = (k+1)^{-\alpha}$ and $\beta_k = (k+1)^{-\beta}$, where $1>\alpha > \beta > 0$.
\end{assumption}
\begin{definition}
\label{sparseprojection}
(Sparse Projection).
For $R>0$, let $\Pi_{R}(x)=\min \{1, R/||x||\}\, x$ be the projection onto the ball of radius
$R$ around the origin. The sparse projection operator
\begin{equation*}
\Pi_{k, R} = \begin{cases}
\Pi_{R}, & \text{if } k = n^{n} - 1 \text{ for some } n \in \mathbb{Z}_{>0}, \\
I, & \text{otherwise}.
\end{cases}
\end{equation*}
We call it sparse as it projects only on specific indices that are exponentially far apart.
Pick an arbitrary $p>1$. Fix constants $R^{\theta}_{\text{proj}}>0$ and $R^{\omega}_{\text{proj}}>0$
for the radii of the projection balls. Further, let
\begin{equation*}
\theta^{*}=X^{-1}_{1}b_{1}, \omega^{*}=W^{-1}_{2}(v_2 - \Gamma_2 \theta^{*})
\end{equation*}
with $b_1=v_1 - W_1 W_2^{-1}v_2$.
The formula for the sparse projection update in linear two-timescale algorithms is as follows:
\begin{equation}
\label{sparseprojectiontheta}
\theta'_{k+1}=\Pi_{k+1,R^{\theta}_{\text{proj}}}(\theta'_{k} + \alpha_{k}[h_1(\theta'_{k},\omega'_{k})+M^{(1')}_{k+1}]),
\end{equation}
\begin{equation}
\label{sparseprojectionomega}
\omega'_{k+1}=\Pi_{k+1,R^{\omega}_{\text{proj}}}(\omega'_{k} + \beta_{k}[h_2(\theta'_{k},\omega'_{k})+M^{(2')}_{k+1}]).
\end{equation}
\end{definition}
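A minimal code sketch of this operator is given below (the function names are ours; the search over $n$ is exhaustive since $n^n$ grows rapidly).
\begin{verbatim}
import numpy as np

def is_projection_index(k):
    # True iff k = n^n - 1 for some positive integer n.
    n = 1
    while n ** n <= k + 1:
        if n ** n == k + 1:
            return True
        n += 1
    return False

def sparse_project(x, k, R):
    # Pi_{k,R}: scale x back into the ball of radius R at the sparse
    # indices; identity otherwise.
    x = np.asarray(x, dtype=float)
    if is_projection_index(k):
        norm = np.linalg.norm(x)
        if norm > R:
            return (R / norm) * x
    return x
\end{verbatim}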
\begin{proof}
As long as the VMTD algorithm satisfies Assumption \ref{matrixassumption},
its convergence rate can be obtained.
VMTD's update rule is
\begin{equation*}
\theta_{k+1}=\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k.
\end{equation*}
\begin{equation*}
\omega_{k+1}=\omega_{k}+\beta_k(\delta_k-\omega_k).
\end{equation*}
Thus, $h_1(\theta, \omega)=\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi - \gamma\phi')\theta$,
$h_2(\theta, \omega)=\mathbb{E}[r]+\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]\theta -\omega$,
$\Gamma_1 =\mathrm{Cov}(\phi,\phi - \gamma\phi')$,
$W_1 = 0$ and
$\Gamma_2 = -\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]$,
$W_2 = 1$,
$v_2 = \mathbb{E}[r]$. Additionally,
$X_1=\Gamma_1 - W_1 W^{-1}_2 \Gamma_2 = \mathrm{Cov}(\phi,\phi - \gamma\phi')$.
% By the Assumption \ref{matrixassumption},
It can be deduced from the proof in Appendix \ref{proofth1} that $X_1$ is a positive definite matrix,
so the VMTD algorithm satisfies Assumption \ref{matrixassumption}.
By the proof in Appendix \ref{proofth1}, Definition 1 in \cite{dalal2020tale} is satisfied.
We can therefore apply Theorem 3 in \cite{dalal2020tale} to obtain Corollary \ref{corollary4_2}.
\end{proof}
\subsection{Proof of Theorem \ref{theorem2}}
\label{proofth2}
\begin{proof}
The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation.
For the VMTDC algorithm, a new one-step linear TD solution is defined as:
\begin{equation*}
0=\mathbb{E}[(\phi - \gamma \phi' - \mathbb{E}[\phi - \gamma \phi'])\phi^\top]\mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\phi]=A^{\top}C^{-1}(-A\theta+b).
\end{equation*}
The matrix $A^{\top}C^{-1}A$ is positive definite. Thus, VMTDC's solution is
$\theta_{\text{VMTDC}}=\theta_{\text{VMTD}}=A^{-1}b$.
First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively,
\begin{equation*}
\theta_{k+1}\leftarrow \theta_k+\zeta_k x(k),
\end{equation*}
\begin{equation*}
u_{k+1}\leftarrow u_k+\beta_k y(k),
\end{equation*}
where
\begin{equation*}
x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)],
\end{equation*}
\begin{equation*}
y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k.
\end{equation*}
Recursion (\ref{thetavmtdc}) can also be rewritten as
\begin{equation*}
\theta_{k+1}\leftarrow \theta_k+\beta_k z(k),
\end{equation*}
where
\begin{equation*}
z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)].
\end{equation*}
Due to the step-size schedule
$\alpha_k = o(\zeta_k)$ and $\zeta_k = o(\beta_k)$, we have $x(k)\rightarrow 0$, $y(k)\rightarrow 0$ and $z(k)\rightarrow 0$ almost surely as $k\rightarrow \infty$.
That is, the increments in iteration (\ref{omegavmtdc}) are uniformly larger than
those in (\ref{uvmtdc}), and the increments in iteration (\ref{uvmtdc}) are uniformly larger than
those in (\ref{thetavmtdc}); thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fastest recursion and (\ref{thetavmtdc}) is the slowest recursion.
Along the fastest time scale, the iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc})
are associated with the following ODE system:
\begin{equation}
\dot{\theta}(t) = 0,
\label{thetavmtdcFastest}
\end{equation}
\begin{equation}
\dot{u}(t) = 0,
\label{uvmtdcFastest}
\end{equation}
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t).
\label{omegavmtdcFastest}
\end{equation}
Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$
and $u(t)\equiv u$ when viewed from the fastest timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\theta$ that depends on the initial condition $\theta_0$ of recursion
(\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$u$ that depends on the initial condition $u_0$ of recursion
(\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(\ref{omegavmtdcFastest})
can be written as
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t).
\label{omegavmtdcFastestFinal}
\end{equation}
Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$,
i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
It is easy to see that the function $h$ is Lipschitz with coefficient $1$.
Let $h_{\infty}(\cdot)$ be the function defined by
$h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
Then $h_{\infty}(\omega)= -\omega$ is well-defined.
For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$
is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
\dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
\label{omegavmtdcInfty}
\end{equation}
apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
associated strict Liapunov function. Then,
the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable
equilibrium.
Consider now the recursion (\ref{omegavmtdc}).
Let
$M_{k+1}=(\delta_k-\omega_k)
-\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$,
$k\geq 1$ are the sigma fields
generated by $\omega_0,u_0,\theta_0,\omega_{l+1},u_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that
satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
Because $\phi_k$, $r_k$, and $\phi_k'$ have
uniformly bounded second moments, it can be seen that for some constant
$c_1>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2).
\end{equation*}
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$, $\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
Consider now the second fast time scale recursion (\ref{uvmtdc}).
Based on the above analysis, along this time scale, (\ref{thetavmtdc}) and (\ref{uvmtdc}) are associated with the following ODE system:
% \begin{equation*}
% u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|u_k,\theta_k] - \phi^{\top} (s_k) u_k]\phi(s_k).
% \end{equation*}
\begin{equation}
\dot{\theta}(t) = 0,
\label{thetavmtdcFaster}
\end{equation}
\begin{equation}
\dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|u(t),\theta(t)])\phi_t|\theta(t)] - Cu(t).
\label{uvmtdcFaster}
\end{equation}
The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter)
when viewed from the second fast timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\theta$ that depends on the initial condition $\theta_0$ of recursion
(\ref{thetavmtdc}).
Consider now the recursion (\ref{uvmtdc}).
Let
$N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k)|\mathcal{I} (k)]$,
where $\mathcal{I}(k)=\sigma(u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$,
$k\geq 1$ are the sigma fields
generated by $u_0,\theta_0,u_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
It is easy to verify that $N_{k+1},k\geq0$ are integrable random variables that
satisfy $\mathbb{E}[N_{k+1}|\mathcal{I}(k)]=0$, $\forall k\geq0$.
Because $\phi_k$, $r_k$, and $\phi_k'$ have
uniformly bounded second moments, it can be seen that for some constant
$c_2>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq
c_2(1+||u_k||^2+||\theta_k||^2).
\end{equation*}
Because $\theta(t)\equiv \theta$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster})
can be written as
\begin{equation}
\dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\theta])\phi_t|\theta] - Cu(t).
\label{uvmtdcFasterFinal}
\end{equation}
Now consider the function $h(u)=\mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\theta])\phi_t|\theta] -Cu$, i.e., the
driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}),
$u^* = C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta])\phi|\theta]$ is the unique globally asymptotically
stable equilibrium. Let $h_{\infty}(u)=-Cu$.
For the ODE
\begin{equation}
\dot{u}(t) = h_{\infty}(u(t))= -Cu(t),
\label{uvmtdcInfty}
\end{equation}
the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable
equilibrium because $C$ is a positive definite matrix (it is nonnegative definite and nonsingular).
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$, $\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||u_k-u^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
Consider now the slower timescale recursion (\ref{thetavmtdc}). In the light of the above,
(\ref{thetavmtdc}) can be rewritten as
\begin{equation}
\begin{array}{ccl}
\theta_{k+1} &\leftarrow& \theta_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k\\
& &- \alpha_k \gamma\phi'_{k}(\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k]).
\end{array}
\end{equation}
Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$,
$k\geq 1$ be the sigma fields
generated by $\theta_0,\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$. Let
\begin{equation*}
\begin{array}{ccl}
Z_{k+1}&=&((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])\\
& &-\mathbb{E}[((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])|\mathcal{G}(k)]\\
&=&((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])\\
& &-\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k|\theta_k] - \gamma\mathbb{E}[\phi' \phi^{\top}]C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k|\theta_k].
\end{array}
\end{equation*}
It is easy to see that $Z_k$, $k\geq 0$ are integrable random variables and $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$. Further,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
c_3(1+||\theta_k||^2), k\geq 0
\end{equation*}
for some constant $c_3 > 0$, again because $\phi_k$, $r_k$, and $\phi_k'$ have
uniformly bounded second moments.
Consider now the following ODE associated with (\ref{thetavmtdc}):
\begin{equation}
\dot{\theta}(t) = (I - \mathbb{E}[\gamma \phi' \phi^{\top}]C^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)].
\label{thetavmtdcSlowerFinal}
\end{equation}
Let
\begin{equation*}
\begin{array}{ccl}
\vec{h}(\theta(t))&=&(I - \mathbb{E}[\gamma \phi' \phi^{\top}]C^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
&=&(C - \mathbb{E}[\gamma \phi' \phi^{\top}])C^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
&=& (\mathbb{E}[\phi \phi^{\top}] - \mathbb{E}[\gamma \phi' \phi^{\top}])C^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
&=& A^{\top}C^{-1}(-A\theta(t)+b),
\end{array}
\end{equation*}
because $\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]=-A\theta(t)+b$, where
$A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$.
Therefore,
$\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically
stable equilibrium for ODE (\ref{thetavmtdcSlowerFinal}).
Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow
\infty}\frac{\vec{h}(r\theta)}{r}$. Then
$\vec{h}_{\infty}(\theta)=-A^{\top}C^{-1}A\theta$ is well-defined.
Consider now the ODE
\begin{equation}
\dot{\theta}(t)=-A^{\top}C^{-1}A\theta(t).
\label{odethetavmtdcfinal}
\end{equation}
Because $C^{-1}$ is positive definite and $A$ has full rank (as it
is nonsingular by assumption), the matrix $A^{\top} C^{-1}A$ is also
positive definite.
The ODE (\ref{odethetavmtdcfinal}) has the origin as its unique globally asymptotically stable equilibrium.
Thus, Assumptions (A1) and (A2) are verified.
To summarize: in the fastest time scale, the parameter $\omega$ converges to
$\mathbb{E}[\delta|u_k,\theta_k]$.
In the second fast time scale,
the parameter $u$ converges to $C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta_k])\phi|\theta_k]$.
In the slower time scale,
the parameter $\theta$ converges to $A^{-1}b$.
\end{proof}
\begin{algorithm}[t]
\caption{VMTDC algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 2}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_0$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
\textbf{Output}: $\theta^*$.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[ (\delta_t-\omega_t)\phi_t - \gamma \phi_{t+1}(\phi^{\top}_{t} u_t)]$
\STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) - \phi^{\top}_{t} u_t] \phi_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
\STATE $S_t\leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
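For reference, the inner loop of Algorithm \ref{alg:algorithm 2} can be transcribed directly into code. The sketch below performs a single time step with linear features; the function name and calling convention are ours, and the importance-sampling ratio $\rho_t$ is supplied by the caller.
\begin{verbatim}
import numpy as np

def vmtdc_step(theta, u, omega, phi_t, phi_tp1, reward, rho,
               alpha, zeta, beta, gamma):
    # One off-policy VMTDC update, mirroring the pseudocode:
    # TD error, then the theta, u and omega recursions.
    delta = reward + gamma * theta @ phi_tp1 - theta @ phi_t
    theta = theta + alpha * rho * ((delta - omega) * phi_t
                                   - gamma * phi_tp1 * (phi_t @ u))
    u = u + zeta * (rho * (delta - omega) - phi_t @ u) * phi_t
    omega = omega + beta * rho * (delta - omega)
    return theta, u, omega
\end{verbatim}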
\begin{algorithm}[t]
\caption{VMGTD algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 3}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_0$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
\textbf{Output}: $\theta^*$.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[\phi_t - \gamma \phi_{t+1}]\phi^{\top}_{t} u_t$
\STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) \phi_t - u_t]$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
\STATE $S_t\leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[t]
\caption{VMGTD2 algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 4}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_0$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
\textbf{Output}: $\theta^*$.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[\phi_t - \gamma \phi_{t+1}]\phi^{\top}_{t} u_t$
\STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) - \phi^{\top}_{t} u_t] \phi_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
\STATE $S_t\leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
% \begin{algorithm}[t]
% \caption{VMETD algorithm with linear function approximation in the off-policy setting}
% \label{alg:algorithm 5}
% \begin{algorithmic}
% \STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma
% $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
% \REPEAT
% \STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_t$ to $1$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
% \textbf{Output}: $\theta^*$.\\
% \FOR{$t=0$ {\bfseries to} $T-1$}
% \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
% \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
% \STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
% \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
% \STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t u_t \rho_t(\delta_t-\omega_t)\phi_t$
% \STATE $u_{t+1}\leftarrow \gamma \rho_t u_t +1$
% \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
% \STATE $S_t=S_{t+1}$
% \ENDFOR
% \UNTIL{terminal episode}
% \end{algorithmic}
% \end{algorithm}
\section{Experimental details}
\label{experimentaldetails}
The feature matrices corresponding to three random walks are shown below respectively:
\begin{equation*}
\Phi_{tabular}=\left[
\begin{array}{ccccc}
1 & 0& 0& 0& 0\\
0 & 1& 0& 0& 0\\
0 & 0& 1& 0& 0\\
0 & 0& 0& 1& 0\\
0 & 0& 0& 0& 1
\end{array}\right]
\end{equation*}
\begin{equation*}
\Phi_{inverted}=\left[
\begin{array}{ccccc}
0 & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
\frac{1}{2} & 0& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
\frac{1}{2} & \frac{1}{2}& 0& \frac{1}{2}& \frac{1}{2}\\
\frac{1}{2} & \frac{1}{2}& \frac{1}{2}& 0& \frac{1}{2}\\
\frac{1}{2} & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& 0
\end{array}\right]
\end{equation*}
\begin{equation*}
\Phi_{dependent}=\left[
\begin{array}{ccc}
1 & 0& 0\\
\frac{1}{\sqrt{2}} & \frac{1}{\sqrt{2}}& 0\\
\frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}& \frac{1}{\sqrt{3}}\\
0 & \frac{1}{\sqrt{2}}& \frac{1}{\sqrt{2}}\\
0 & 0& 1
\end{array}\right]
\end{equation*}
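Equivalently, in code (a convenience transcription of the three feature matrices above):
\begin{verbatim}
import numpy as np

Phi_tabular = np.eye(5)
Phi_inverted = 0.5 * (np.ones((5, 5)) - np.eye(5))
Phi_dependent = np.array([
    [1.0, 0.0, 0.0],
    [1.0 / np.sqrt(2), 1.0 / np.sqrt(2), 0.0],
    [1.0 / np.sqrt(3), 1.0 / np.sqrt(3), 1.0 / np.sqrt(3)],
    [0.0, 1.0 / np.sqrt(2), 1.0 / np.sqrt(2)],
    [0.0, 0.0, 1.0],
])
\end{verbatim}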
Three random walk experiments: the $\alpha$ values for
all algorithms are in the range of $\{0.008, 0.015, 0.03, 0.06, 0.12, 0.25, 0.5\}$. For the TDC algorithm,
the range of the ratio $\frac{\zeta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. For the VMTD algorithm,
the range of the ratio $\frac{\beta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. It can be observed from
the update formula of VMTDC that when $\zeta$ takes a very small value,
the VMTDC update tends to be similar to the VMTD update. Similarly,
when $\beta$ takes a very small value, the VMTDC update tends to be
similar to the TDC update. Through experiments, it was found that
setting $\zeta$ to a small value makes VMTDC updates approach VMTD
updates, resulting in better performance. Therefore, for the VMTDC
algorithm, the range of $\frac{\beta}{\alpha}$ ratio is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$, and the range of
$\zeta$ is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. The learning curves in Figure \ref{Evaluation_full} correspond to the optimal
parameters.
The feature matrix of the 7-state version of Baird's off-policy counterexample is
defined as follows:
\begin{equation*}
\Phi_{Counter}=\left[
\begin{array}{cccccccc}
1 & 2& 0& 0& 0& 0& 0& 0\\
1 & 0& 2& 0& 0& 0& 0& 0\\
1 & 0& 0& 2& 0& 0& 0& 0\\
1 & 0& 0& 0& 2& 0& 0& 0\\
1 & 0& 0& 0& 0& 2& 0& 0\\
1 & 0& 0& 0& 0& 0& 2& 0\\
2 & 0& 0& 0& 0& 0& 0& 1
\end{array}\right]
\end{equation*}
For the 7-state version of Baird's off-policy counterexample:
for the TD algorithm, $\alpha$ is set to $0.1$. For the TDC algorithm, the range of
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$,
and the range of
$\zeta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$.
For the VMTD algorithm, the range of
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$,
and the range of
$\beta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. Through experiments, it was found
that setting $\zeta$ to a small value makes VMTDC updates approach VMTD
updates, resulting in better performance. Therefore, for the VMTDC
algorithm, the range of values for $\alpha$ and $\beta$ is the same as that of VMTD,
and the range of $\zeta$
is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$.
The learning curves in Figure \ref{Complete_full} correspond to the optimal parameters.
For all policy evaluation experiments, each experiment
is independently run 100 times.
For the four control experiments, the learning rates for each
algorithm in all experiments are shown in Table \ref{lrofways}.
For all control experiments, each experiment is independently run 50 times.
\begin{table*}[htb]
\centering
\caption{Learning rates ($lr$) of four control experiments.}
\vskip 0.15in
\begin{tabular}{c|cccc}
\hline
\multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\
\hline
Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
GQ(0)($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\
VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
VMGQ(0)($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\
AC($lr_{\text{actor}},lr_{\text{critic}}$)&$0.01,0.1$ &$0.01,0.01$ &$0.01,0.05$ &$0.01,0.05$ \\
Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
\hline
\end{tabular}
\label{lrofways}
\vskip -0.1in
\end{table*}
\section{Conclusion and Future Work}
Value-based reinforcement learning typically aims
to minimize error as an optimization objective.
As an alternative, this study proposes new objective
functions: VBE, VPBE and VNEU, and derives many variance minimization algorithms, including VMTD,
VMTDC, VMGTD, VMGTD2 and VMETD.
% The VMTD algorithm
% is essentially an adjustment or correction to the traditional
% TD update.
% Both
% algorithms are capable of stabilizing gradient estimation, reducing
% the variance of gradient estimation and accelerating convergence.
All algorithms demonstrated superior performance in policy
evaluation and control experiments.
Future work may include, but is not limited to:
(1) analysis of the convergence rate of VMTDC;
(2) extensions of VBE and VPBE to multi-step returns; and
(3) extensions to nonlinear approximations, such as neural networks.
\section{Experimental Studies}
This section assesses algorithm performance through experiments,
which are divided into policy evaluation experiments and control experiments.
\subsection{Testing Tasks}
\textbf{Random-walk:} as shown in Figure \ref{randomwalk}, all episodes
start in the center state, $C$, and proceed either left or right by one state on each
step, with equal probability. Episodes terminate at either the extreme left or
the extreme right, with a reward of $+1$ upon terminating on the right and
$0$ otherwise. In this task, the true value for each state is the
probability of starting from that state and terminating on the right
\cite{Sutton2018book}.
Thus, the true values of states from $A$ to $E$ are
$\frac{1}{6},\frac{2}{6},\frac{3}{6},\frac{4}{6},\frac{5}{6}$, respectively.
The discount factor $\gamma=1.0$.
There are three standard kinds of features for random-walk problems: tabular
features, inverted features and dependent features \cite{sutton2009fast}.
The feature matrices corresponding to three random walks are shown in Appendix \ref{experimentaldetails}.
Experiments in the Random-walk environment are conducted
in an on-policy manner.
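A minimal simulator of this task is sketched below (illustrative only; the state indexing and function names are ours).
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
TRUE_VALUES = np.arange(1, 6) / 6.0       # V(A), ..., V(E) = 1/6, ..., 5/6

def random_walk_episode():
    # States 0..4 correspond to A..E; every episode starts at the center C.
    s, transitions = 2, []
    while True:
        s_next = s + rng.choice([-1, 1])
        if s_next == 5:                   # terminate on the right, reward +1
            transitions.append((s, 1.0, None))
            return transitions
        if s_next == -1:                  # terminate on the left, reward 0
            transitions.append((s, 0.0, None))
            return transitions
        transitions.append((s, 0.0, s_next))
        s = s_next
\end{verbatim}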
\begin{figure}
\begin{center}
\input{main/pic/randomwalk.tex}
\caption{Random walk.}
\label{randomwalk}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\input{main/pic/BairdExample.tex}
\caption{7-state version of Baird's off-policy counterexample.}
\label{bairdexample}
\end{center}
\end{figure}
\textbf{Baird's off-policy counterexample:} This task is well known as a
counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As
shown in Figure \ref{bairdexample}, the reward for each transition is zero; thus the true values are zero for all states and for any given policy. The behaviour policy
chooses actions represented by solid lines with a probability of $\frac{1}{7}$
and actions represented by dotted lines with a probability of $\frac{6}{7}$. The
target policy is expected to choose the solid-line action with probability greater than $\frac{1}{7}$;
in this paper it chooses the solid-line action with probability $1$.
The discount factor $\gamma =0.99$, and the feature matrix is
defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
\textbf{Maze}: The learning agent should find a shortest path from the upper
left corner to the lower right corner. In each state,
there are four alternative actions: $up$, $down$, $left$, and $right$, which
takes the agent deterministically to the corresponding neighbour state, except when
\begin{wrapfigure}{r}{3cm}
\centering
\includegraphics[scale=0.15]{main/pic/maze_13_13.pdf}
% \caption{The 2-state counterexample.}
\end{wrapfigure}
a movement is blocked by an obstacle or the edge
of the maze. Rewards are $-1$ in all transitions until the
agent reaches the goal state.
The discount factor $\gamma=0.99$, and states $s$ are represented by tabular
features. The maximum number of moves in the game is set to 1000.
\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are
selected from the gym official website and correspond to the following
versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''.
For specific details, please refer to the gym official website.
The maximum number of steps for the Mountain Car environment is set to 1000,
while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding (a minimal sketch is given below).
Please refer to Appendix \ref{experimentaldetails} for the selection of learning rates for all experiments.
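The sketch below shows a simple grid-style tile coder of the kind referred to above; the number of tilings, tiles per dimension, and offsets are hypothetical choices and do not reproduce the exact configuration used in the experiments.
\begin{verbatim}
import numpy as np

def tile_features(x, low, high, n_tilings=8, tiles_per_dim=8):
    # Binary tile-coding features for a point x inside the box [low, high].
    x = np.asarray(x, dtype=float)
    low = np.asarray(low, dtype=float)
    high = np.asarray(high, dtype=float)
    d = x.size
    tiles_per_tiling = tiles_per_dim ** d
    scaled = (x - low) / (high - low) * tiles_per_dim
    feats = np.zeros(n_tilings * tiles_per_tiling)
    for t in range(n_tilings):
        offset = t / n_tilings                        # shift each tiling
        idx = np.clip((scaled + offset).astype(int), 0, tiles_per_dim - 1)
        flat = int(np.ravel_multi_index(idx, (tiles_per_dim,) * d))
        feats[t * tiles_per_tiling + flat] = 1.0
    return feats
\end{verbatim}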
\subsection{Experimental Results and Analysis}
\begin{figure}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Dependent]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/dependent_new.pdf}
\label{DependentFull}
}
\subfigure[Tabular]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/tabular_new.pdf}
\label{TabularFull}
}
\\
\subfigure[Inverted]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/inverted_new.pdf}
\label{InvertedFull}
}
\subfigure[counterexample]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/counterexample_quanju_new.pdf}
\label{CounterExampleFull}
}
\caption{Learning curves of four evaluation environments.}
\label{Evaluation_full}
\end{center}
\vskip -0.2in
\end{figure}
% The learning rates of all algorithms in different environments are shown in Table \ref{lrofways}.
% Figure \ref{Complete_full} shows the experimental curves of different algorithms in four environments.
For the policy evaluation experiments, we compare the performance of the VMTD,
VMTDC, TD, and TDC algorithms; the vertical axis is unified as RVBE.
The criteria for evaluating algorithms in policy evaluation vary, because the
objective function minimized by our proposed algorithms differs from that of the
other algorithms. Therefore, to ensure fairness in comparisons, broader contrasts
between algorithms are restricted to the control experiments.
This study will compare the performance of Sarsa, Q-learning, GQ(0),
AC, VMSarsa, VMQ, and VMGQ(0) in four control environments.
% All experiments involved in this paper were run independently for 100 times.
The learning curves of the algorithms corresponding to
policy evaluation experiments and control experiments are
shown in Figures \ref{Evaluation_full} and \ref{Complete_full}, respectively.
The shaded areas in Figures \ref{Evaluation_full} and \ref{Complete_full} represent the standard deviation (std).
In the random-walk tasks, VMTD and VMTDC exhibit excellent performance,
outperforming TD and TDC in the case of dependent random-walk.
In the 7-state counterexample task, TD diverges,
while VMTDC converges and performs better than TDC.
From the update formula, it can be observed that the VMTD algorithm, like TDC,
is also an adjustment or correction of the TD update.
What is more surprising is that VMTD also maintains
convergence and demonstrates the best performance.
In Maze, Mountain Car, and Acrobot,
the convergence speed of VMSarsa, VMQ, and VMGQ(0) has
been significantly improved compared to Sarsa, Q-learning,
and GQ(0), respectively. The performance of the AC algorithm
is at an intermediate level. The performances of VMSarsa,
VMQ, and VMGQ(0) in these three experimental environments
have no significant differences.
In Cliff Walking, Sarsa and
VMSarsa converge to slightly worse solutions compared to
other algorithms. The convergence speed of VMSarsa is significantly
better than that of Sarsa. The convergence speed of VMGQ(0) and VMQ
is better than other algorithms, and the performance of VMGQ(0) is
slightly better than that of VMQ.
In summary, the performance of VMSarsa,
VMQ, and VMGQ(0) is better than that of other algorithms.
In the Cliff Walking environment,
the performance of VMGQ(0) is slightly better than that of
VMSarsa and VMQ. In the other three experimental environments,
the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\section{Introduction}
\label{introduction}
Reinforcement learning can be mainly divided into two
categories: value-based reinforcement learning
and policy gradient-based reinforcement learning. This
paper focuses on temporal difference learning based on
linearly approximated value functions. Research in this area is
usually divided into two steps: the first step is to establish the convergence of the algorithm, and the second
step is to accelerate the algorithm.
In terms of stability, \citet{sutton1988learning} established the
convergence of on-policy TD(0), and \citet{tsitsiklis1997analysis}
established the convergence of on-policy TD($\lambda$).
However, ``The deadly triad'' consisting of off-policy learning,
bootstrapping, and function approximation makes
the stability a difficult problem \citep{Sutton2018book}.
To solve this problem, convergent off-policy temporal difference
learning algorithms are proposed, e.g., BR \cite{baird1995residual},
GTD \cite{sutton2008convergent}, GTD2 and TDC \cite{sutton2009fast},
ETD \cite{sutton2016emphatic}, and MRetrace \cite{chen2023modified}.
In terms of acceleration, \citet{hackman2012faster}
proposed the Hybrid TD algorithm with an on-policy matrix.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
true stochastic algorithms, i.e., GTD-MP and GTD2-MP, from
a convex-concave saddle-point formulation.
Second-order methods are used to accelerate TD learning,
e.g., Quasi Newton TD \cite{givchi2015quasi} and
accelerated TD (ATD) \citep{pan2017accelerated}.
\citet{hallak2016generalized} introduced a new parameter
to reduce the variance of ETD.
\citet{zhang2022truncated} proposed truncated ETD with a lower variance.
Variance-reduced TD, using the direct variance reduction technique of \citet{johnson2013accelerating}, is proposed by \citet{korda2015td}
and analyzed by \citet{xu2019reanalysis}.
How to further improve the convergence rates of reinforcement learning
algorithms is currently still an open problem.
Algorithm stability is prominently reflected in the changes
to the objective function, transitioning from mean squared
errors (MSE) \citep{Sutton2018book} to mean squared Bellman errors (MSBE) \cite{baird1995residual}, then to
the norm of the expected TD update (NEU) \cite{sutton2009fast}, and further to
mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}. On the other hand, algorithm
acceleration is more centered around optimizing the iterative
update formula of the algorithm itself without altering the
objective function, thereby speeding up the convergence rate
of the algorithm. The emergence of new optimization objective
functions often leads to the development of novel algorithms.
The introduction of new algorithms, in turn, tends to inspire
researchers to explore methods for accelerating algorithms,
leading to the iterative creation of increasingly superior algorithms.
The kernel loss function can be optimized using standard
gradient-based methods, addressing the issue of double
sampling in the residual gradient algorithm \cite{feng2019kernel}. It ensures convergence
in both on-policy and off-policy scenarios. The logistic Bellman
error is convex and smooth in the action-value function parameters,
with bounded gradients \cite{basserrano2021logistic}. In contrast, the squared Bellman error is
not convex in the action-value function parameters, and RL algorithms
based on recursive optimization using it are known to be unstable.
% The value-based algorithms mentioned above aim to
% minimize some errors, e.g., mean squared errors \citep{Sutton2018book},
% mean squared Bellman errors \cite{baird1995residual}, norm
% of the expected TD update \cite{sutton2009fast},
% mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}, etc.
It is therefore natural to propose a new objective function, yet the objective functions mentioned above are all some form of error.
Is minimizing an error the only option for value-based reinforcement learning?
For policy evaluation experiments,
differences in objective functions may result
in inconsistent fixed points. This inconsistency
makes it difficult to uniformly compare the superiority
of algorithms derived from different objective functions.
However, for control experiments, since the choice of actions
depends on the relative values of the Q values rather than their
absolute values, the presence of solution bias is acceptable.
Based on this observation, we propose alternate objective functions
instead of minimizing errors. We minimize Variance of Bellman Error (VBE),
Variance of Projected Bellman Error (VPBE), and Variance of the norm of the expected TD update (VNEU)
and derive Variance Minimization (VM) algorithms.
These algorithms preserve the invariance of the optimal policy in control environments,
significantly reduce the variance of gradient estimation,
and thus hasten convergence.
The contributions of this paper are as follows:
(1) Introduction of novel objective functions based on
the invariance of the optimal policy.
(2) Derivation of several variance minimization algorithms, both on-policy and off-policy.
(3) Proof of their convergence.
(4) Analysis of the convergence rate of the on-policy algorithm.
(5) Experiments demonstrating the faster convergence speed of the proposed algorithms.
\section{Variance Minimization Algorithms}
\subsection{Motivation}
As shown
in Table \ref{example_bias}, although there is a bias between the
true value and the predicted value, action $a_3$ is
still chosen under the greedy policy.
In contrast, supervised learning is usually used to predict quantities such as temperature, humidity, or morbidity, where an excessively large bias could have serious consequences.
\begin{table}[t]
\caption{A constant bias in the $Q$ values does not change the action selected
by the greedy policy.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcccr}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1& 5 \\
$Q(s, a_1)$ & 2& 6 \\
$Q(s, a_2)$ & 3& 7 \\
$Q(s, a_3)$ & 4& 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$& $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
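The observation in Table \ref{example_bias} can be stated compactly: assuming the same constant $c$ is added to every action value, the greedy action is unchanged, since
\begin{equation*}
\arg \max_{a\in A}\left(Q(s,a)+c\right)=\arg \max_{a\in A}Q(s,a), \qquad \forall c\in \mathbb{R}.
\end{equation*}
Hence a uniform bias in the learned values does not affect the induced greedy policy.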
In addition, reward shaping can significantly speed up learning by adding a shaping
reward $F(s,s')$ to the original reward $r$,
where $F(s,s')$ is the general form of any state-based shaping reward.
Static potential-based reward shaping (static PBRS) maintains policy invariance if the
shaping reward follows from $F(s,s')=\gamma
f(s')-f(s)$ \cite{ng1999policy}.
This means that we can modify the TD error $\delta = r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi $ while still ensuring the invariance of the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi - \omega,
\end{equation*}
where $\omega$ is a constant, acting as a static PBRS.
This also means that algorithms whose optimization goal
is to minimize errors may, after introducing reward shaping,
end up with a larger or smaller bias. Fortunately,
as discussed above, such a bias is acceptable in reinforcement
learning.
However, the problem is that selecting an appropriate
$\omega$ requires expert knowledge. This forces us to learn
$\omega$ dynamically, i.e., $\omega=\omega_t$. Dynamic PBRS can also maintain policy
invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$,
where $t$ is the time step at which the agent reaches state $s$
\cite{devlin2012dynamic}.
However, this result requires a convergence guarantee for the dynamic potential
function $f(s,t)$: if $f(s,t)$ does not converge as the time step
$t\rightarrow\infty$, the Q-values under dynamic PBRS are not
guaranteed to converge.
Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$.
Then $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$
is a dynamic PBRS, and if $\omega_t$ eventually converges, the dynamic potential
function $f(s,t)$ converges as well.
Bias is the expected difference between the predicted value
and the true value. Therefore, under the premise of bootstrapping, a natural first choice is to
let $\omega \doteq \mathbb{E}[\mathbb{E}[\delta|s]]=\mathbb{E}[\delta]$.
The optimization objectives of linear TD(0) (semi-gradient) and linear TDC can be written as follows, respectively:
\begin{equation*}
\theta^{*}= \arg \min_{\theta} \mathbb{E}[(\mathbb{E}[\delta |s])^2],
\end{equation*}
and
\begin{equation*}
\theta^{*}=\arg \min_{\theta} \mathbb{E}[\delta \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1} \mathbb{E}[\delta\phi].
\end{equation*}
As a result, two novel objective functions and their corresponding algorithms are proposed,
where $\omega$ is subsequently proven to converge, meaning that both algorithms maintain the invariance of the optimal policy.
\subsection{Variance Minimization TD Learning: VMTD}
For on-policy learning,
a novel objective function, Variance of Bellman Error (VBE), is proposed as follows:
\begin{equation}
\begin{array}{ccl}
\arg \min_{\theta}\text{VBE}(\theta)&=&\arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\mathbb{E}[\mathbb{E}[\delta|s]])^2]\\
&=&\arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2].
\end{array}
\end{equation}
Clearly, the objective is no longer to minimize the Bellman error.
First, the update of the parameter $\omega$ is derived directly from
stochastic gradient descent:
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\label{omega}
\end{equation}
where $\delta_k$ is the TD error as follows:
\begin{equation}
\delta_k = r+\gamma
\theta_k^{\top}\phi_{k}'-\theta_k^{\top}\phi_k.
\label{delta}
\end{equation}
Then, based on stochastic semi-gradient descent, the update of
the parameter $\theta$ is as follows:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k.
\label{theta}
\end{equation}
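For intuition, here is a minimal sketch of how updates (\ref{omega}) and (\ref{theta}) arise from the VBE objective: replacing expectations by sampled values and, as in semi-gradient TD, treating the bootstrapped target $r+\gamma \theta^{\top}\phi'$ as a constant, we have
\begin{equation*}
-\frac{1}{2}\frac{\partial}{\partial \omega}(\delta-\omega)^2=\delta-\omega,
\qquad
-\frac{1}{2}\frac{\partial}{\partial \theta}(\delta-\omega)^2\approx(\delta-\omega)\phi,
\end{equation*}
which, scaled by the step-sizes $\beta_k$ and $\alpha_k$, yield (\ref{omega}) and (\ref{theta}), respectively.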
The pseudocode of the VMTD algorithm is shown in Algorithm \ref{alg:algorithm 1}.
For control tasks, two extensions of VMTD are named VMSarsa and VMQ respectively,
and the update formulas are shown below:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi(s_k,a_k).
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\end{equation}
where the TD error $\delta_k$ in VMSarsa is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \theta_{k}^{\top}\phi(s_{k+1},a_{k+1}) - \theta_{k}^{\top}\phi(s_{k},a_{k}),
\label{deltaSarsa}
\end{equation}
and the TD error $\delta_k$ in VMQ is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \max_{a\in A}\theta_{k}^{\top}\phi(s_{k+1},a) - \theta_{k}^{\top}\phi(s_{k},a_{k}).
\label{deltaQ}
\end{equation}
\begin{algorithm}[t]
\caption{VMTD algorithm with linear function approximation in the on-policy setting}
\label{alg:algorithm 1}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $\omega_{0}$, $\gamma
$, learning rates $\alpha_t$ and $\beta_t$
\REPEAT
\STATE For each episode, initialize $\theta_{0}$ arbitrarily, $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and keep the step-sizes $\alpha_t$ and $\beta_t$ constant.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to policy $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t}'-\theta_t^{\top}\phi_t$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t(\delta_t-\omega_t)\phi_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t)$
\STATE $S_t \leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we employ a projection operator.
The objective function is called the Variance of Projected Bellman Error (VPBE),
and the corresponding algorithm is called VMTDC.
\begin{equation}
\begin{array}{ccl}
\text{VPBE}(\theta)&=&\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta])\phi]\\
&=&\mathbb{E}[(\delta-\omega) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi],
\end{array}
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$.
The derivation of the VMTDC algorithm is the same
as that of the TDC algorithm; the only difference is that the original $\delta$ is replaced by $\delta-\omega$.
Therefore, we can easily obtain the update formulas of VMTDC, as follows:
\begin{equation}
\begin{array}{ccl}
\theta_{k+1}\leftarrow\theta_{k}&+&\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k)\\
&-& \gamma\phi(s_{k+1})(\phi^{\top} (s_k) u_k)],
\end{array}
\label{thetavmtdc}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\omega_k - \phi^{\top} (s_k) u_k]\phi(s_k),
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
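As a brief sketch (following the standard TDC derivation with $\delta$ replaced by $\delta-\omega$ and treating $\omega$ as fixed), the negative gradient of the VPBE can be written as
\begin{equation*}
-\frac{1}{2}\nabla_{\theta}\text{VPBE}(\theta)
=\mathbb{E}[(\delta-\omega)\phi]-\gamma\mathbb{E}[\phi'\phi^{\top}]w,
\qquad
w=\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi],
\end{equation*}
where $u_k$ in (\ref{uvmtdc}) is a stochastic estimate of $w$; sampling the two expectations yields the update (\ref{thetavmtdc}).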
The pseudocode of the VMTDC algorithm for the importance-sampling scenario is shown in Algorithm \ref{alg:algorithm 2} of Appendix \ref{proofth2}.
Now, we will introduce the improved version of the GQ(0) algorithm, named VMGQ(0):
\begin{equation}
\begin{array}{ccl}
\theta_{k+1}\leftarrow\theta_{k}&+&\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k,a_k)\\
&-& \gamma\phi(s_{k+1},A^{*}_{k+1})(\phi^{\top} (s_k,a_k) u_k)],
\end{array}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[(\delta_{k}-\omega_k) - \phi^{\top} (s_k,a_k) u_k]\phi(s_k,a_k),
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k- \omega_k),
\end{equation}
where $\delta_{k}$ is given by (\ref{deltaQ}) and $A^{*}_{k+1}={\arg \max}_{a}(\theta_{k}^{\top}\phi(s_{k+1},a))$.
This paper also introduces an additional parameter $\omega$ into the GTD and GTD2 algorithms. For details, please refer to the appendix.
\resizebox{6cm}{4cm}{
\begin{tikzpicture}[smooth]
\node[coordinate] (origin) at (0.3,0) {};
\node[coordinate] (num7) at (3,0) {};
\node[coordinate] (num1) at (1,2.5) {};
\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {};
\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {};
\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {};
\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {};
\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {};
\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {};
\path (num1) ++ (0:1cm) node (num2) [coordinate] {};
\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
%\draw[->](0,0) -- (1,1);
%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
%\fill (0.5,0.5) circle (0.5);
%\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
coordinates{(num7) (origin) (origin_above) (origin_aright)};
\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)};
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
\draw[->,>=stealth,line width = 0.02cm] (a)--(g);
\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
\end{tikzpicture}
}
\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
\tikzstyle{init} = [pin edge={to-,thin,black}]
\resizebox{6cm}{1cm}{
\begin{tikzpicture}[node distance=1.5cm, auto, >=latex]
\node [block] (o) {};
\node (p) [left of=o, node distance=0.5cm, coordinate] {o};
\node [shape=circle, int] (a) [right of=o] {$A$};
\node (b) [left of=a, node distance=1.5cm, coordinate] {a};
\node [shape=circle, int] (c) [right of=a] {$B$};
\node (d) [left of=c, node distance=1.5cm, coordinate] {c};
\node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$};
\node (f) [left of=e, node distance=1.5cm, coordinate] {e};
\node [shape=circle, int] (g) [right of=e] {$D$};
\node (h) [left of=g, node distance=1.5cm, coordinate] {g};
\node [shape=circle, int] (i) [right of=g] {$E$};
\node (j) [left of=i, node distance=1.5cm, coordinate] {i};
\node [block] (k) [right of=i] {};
\node (l) [left of=k, node distance=0.5cm, coordinate] {k};
\path[->] (o) edge node {$0$} (a);
\path[<->] (a) edge node {$0$} (c);
\path[<->] (c) edge node {$0$} (e);
\path[<->] (e) edge node {$0$} (g);
\path[<->] (g) edge node {$0$} (i);
\draw[->] (i) edge node {$1$} (k);
\end{tikzpicture}
}
\section{Background}
\label{preliminaries}
A reinforcement learning agent interacts with an environment, observes states,
makes sequential decisions that influence the environment, and obtains
rewards.
Consider an infinite-horizon discounted
Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma
\rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$
is a finite set of actions of the agent;
$R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward
function; $P:S\times A\times S \rightarrow [0,1]$ is the transition
probability distribution; and $\gamma\in (0,1)$
is the discount factor \cite{Sutton2018book}.
Due to the requirements of online learning, value iteration based on sampling
is considered in this paper.
At each step, an experience (or transition) $\langle s, a, s', r\rangle$ is
obtained.
A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the
agent is to find an optimal policy $\pi^*$ that maximizes the expected
discounted cumulative reward over the long run.
State value function $V^{\pi}(s)$ for a stationary policy $\pi$ is
defined as:
\begin{equation*}
V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s].
\label{valuefunction}
\end{equation*}
Linear value function for state $s\in S$ is defined as:
\begin{equation}
V_{{\theta}}(s):= {\theta}^{\top}{\phi}(s) = \sum_{i=1}^{m}
\theta_i \phi_i(s),
\label{linearvaluefunction}
\end{equation}
where ${\theta}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in
\mathbb{R}^m$ is a parameter vector,
${\phi}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature
function defined on state space $S$, and $m$ is the feature size.
Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems.
To deal with the well-known curse of dimensionality of large-scale MDPs, the value
function is usually approximated by a linear model, kernel methods, decision
trees, or neural networks, etc. This paper focuses on the linear model, where
features are usually hand-coded by domain experts.
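For reference, under the linear approximation (\ref{linearvaluefunction}) the standard semi-gradient TD(0) update is
\begin{equation*}
\theta_{k+1}\leftarrow\theta_{k}+\alpha_k\delta_k\phi(s_k),
\qquad
\delta_k=r_{k+1}+\gamma\theta_k^{\top}\phi(s_{k+1})-\theta_k^{\top}\phi(s_k),
\end{equation*}
where $\alpha_k$ is a step-size and $\delta_k$ is the TD error \cite{Sutton2018book}.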
TD learning can also be used to find optimal policies, a problem
often called the control problem. Two popular TD control methods are Sarsa and Q-learning: the former is an on-policy
TD control method, while the latter is an off-policy one.
It is well known that the TDC algorithm \cite{sutton2009fast} guarantees
convergence under off-policy conditions, while the off-policy TD algorithm may diverge. The
objective function of TDC is the MSPBE;
TDC is essentially an adjustment or correction of the TD update so that it
follows the gradient of the MSPBE objective. The control algorithm associated with TDC
is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}; when $\lambda$ is set to 0, it is denoted
as GQ(0).
\begin{figure*}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Maze]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/maze_complete.pdf}
\label{MazeFull}
}
\subfigure[Cliff Walking]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/cw_complete.pdf}
\label{CliffWalkingFull}
}
\\
\subfigure[Mountain Car]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/mt_complete.pdf}
\label{MountainCarFull}
}
\subfigure[Acrobot]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/Acrobot_complete.pdf}
\label{AcrobotFull}
}
\caption{Learning curves of four control environments.}
\label{Complete_full}
\end{center}
\vskip -0.2in
\end{figure*}
\section{Related Work}
\subsection{Difference between VMQ and R-learning}
\begin{table*}[htb]
\centering
\caption{Difference between R-learning and tabular VMQ.}
\vskip 0.15in
\begin{tabular}{c|cc}
\hline
algorithms&update formula \\
\hline
R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\
&$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\
tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\
&$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\
\hline
\end{tabular}
\label{differenceRandVMQ}
\vskip -0.1in
\end{table*}
Tabular VMQ's update formula bears some resemblance
to R-learning's update formula. As shown in Table \ref{differenceRandVMQ}, the update formulas of the two algorithms have the following differences:
\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average
reward, rather than the cumulative reward, by learning an estimate
of the average reward. This estimate $m$ is then used to update the Q-values.
In contrast, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$.
\\(2) When $\gamma=1$ in the tabular VMQ update formula, the
R-learning update formula is formally
the same as the tabular VMQ update formula.
Therefore, the R-learning algorithm can be
considered a special case of the VMQ algorithm in form.
\subsection{Variance Reduction for TD Learning}
The TD with centering algorithm (CTD) \cite{korda2015td}
was proposed, which directly applies variance reduction techniques to
the TD algorithm. The CTD algorithm updates its parameters using the
average gradient of a batch of Markovian samples and a projection operator.
Unfortunately, the authors’ analysis of the CTD algorithm contains technical
errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm that updates
its parameters using the average gradient of a batch of i.i.d. samples. The
authors of VRTD provide a technically sound analysis to demonstrate the
advantages of variance reduction.
\subsection{Variance Reduction for Policy Gradient Algorithms}
Policy gradient algorithms are a class of reinforcement
learning algorithms that directly optimize cumulative rewards.
REINFORCE is a Monte Carlo algorithm that estimates
gradients through sampling, but may have a high variance.
Baselines are introduced to reduce variance and to
accelerate learning \cite{Sutton2018book}. In Actor-Critic,
value function as a baseline and bootstrapping
are used to reduce variance, also accelerating convergence \cite{Sutton2018book}.
TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal}
use generalized advantage
estimation, which combines multi-step bootstrapping and Monte Carlo
estimation to reduce variance, making gradient estimation more stable and
accelerating convergence.
In Variance Minimization,
the incorporation of $\omega \doteq \mathbb{E}[\delta]$
bears a striking resemblance to the use of a baseline
in policy gradient methods. The introduction of a baseline
in policy gradient techniques does not alter
the expected value of the update;
rather, it significantly reduces the variance of gradient estimation.
Similarly, the addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization
preserves the invariance of the optimal
policy while stabilizing gradient estimation,
reducing its variance,
and hastening convergence.
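For reference, the variance-reduction effect of a baseline rests on the standard identity that, for any state-dependent baseline $b(s)$,
\begin{equation*}
\mathbb{E}_{a\sim\pi(\cdot|s)}\left[\nabla_{\theta}\log\pi(a|s)\,b(s)\right]
=b(s)\nabla_{\theta}\sum_{a}\pi(a|s)=0,
\end{equation*}
so subtracting a baseline leaves the expected policy gradient unchanged while it can substantially reduce its variance; the role of $\omega \doteq \mathbb{E}[\delta]$ above is analogous.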
\section{Theoretical Analysis}
The purpose of this section is to establish the stability of the VMTD
and VMTDC algorithms, and to present a corollary on the convergence rate of VMTD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $A$ is non-singular.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth1} for the detailed proof.
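As an informal consistency check of the limit point (not a substitute for the proof in Appendix \ref{proofth1}), setting the expected updates (\ref{omega}) and (\ref{theta}) to zero at a fixed point $(\theta^{*},\omega^{*})$ gives
\begin{equation*}
\omega^{*}=\mathbb{E}[\delta],\qquad
0=\mathbb{E}[(\delta-\omega^{*})\phi]=\mathrm{Cov}(\delta,\phi)
=\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\,\theta^{*}=b-A\theta^{*},
\end{equation*}
which agrees with the limit $\theta^{*}=A^{-1}b$ stated in Theorem \ref{theorem1}.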
Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale
algorithms. VMTD satisfies the assumptions of this theorem, leading
to the following corollary.
\begin{corollary}
\label{corollary4_2}
Consider the Sparsely Projected variant of VMTD. Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$,
$0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have
\begin{equation}
||\theta'_{k} - \theta^{*}|| \le C_{3,\theta} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}}
\end{equation}
\begin{equation}
||\omega'_{k} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\beta / 2}},
\end{equation}
\end{corollary}
where $d_1$ and $d_2$ represent the dimensions of $\theta$ and $\omega$, respectively; for VMTD, $d_2 =1$.
The meanings of $N_3$, $C_{3,\theta}$, and $C_{3,\omega}$ are explained in \cite{dalal2020tale}.
The formulas for $\theta'_{k}$ and $\omega'_{k}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}).
Please refer to Appendix \ref{proofcorollary4_2} for the detailed proof.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC.
Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\zeta_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\zeta_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\zeta_k),
$
$
\zeta_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$.
Assume that $A$ and $C$ are non-singular matrices.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth2} for the detailed proof.
%NAME: named.bst
% BibTeX `named' style file for BibTeX version 0.99c, LaTeX version 2.09
% Place it in a file called named.bst in the BibTeX search path. (Placing it
% in the same directory as the LaTeX document should also work.)
% Support for named citations is provided by named.sty
% This version was made by modifying the master file made by
% Oren Patashnik (PATASHNIK@SCORE.STANFORD.EDU)
% Copyright (C) 1985, all rights reserved.
% Modifications Copyright 1988, Peter F. Patel-Schneider
% Copying of this file is authorized only if either
% (1) you make absolutely no changes to your copy, including name, or
% (2) if you do make changes, you name it something other than
% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
% This restriction helps ensure that all standard styles are identical.
% There are undoubtably bugs in this style. If you make bug fixes,
% improvements, etc. please let me know. My e-mail address is:
% pfps@research.att.com
% Citation format: [author-last-name, year]
% [author-last-name and author-last-name, year]
% [author-last-name {\em et al.}, year]
%
% Reference list ordering: alphabetical by author or whatever passes
% for author in the absence of one.
%
% This BibTeX style has support for short (year only) citations. This
% is done by having the citations actually look like
% \citeauthoryear{author-info}{year}
% The LaTeX style has to have the following (or similar)
% \let\@internalcite\cite
% \def\cite{\def\citeauthoryear##1##2{##1, ##2}\@internalcite}
% \def\shortcite{\def\citeauthoryear##1{##2}\@internalcite}
% \def\@biblabel#1{\def\citeauthoryear##1##2{##1, ##2}[#1]\hfill}
% which makes \shortcite the macro for short citations.
ENTRY
{ address
author
booktitle
chapter
edition
editor
howpublished
institution
journal
key
month
note
number
organization
pages
publisher
school
series
title
type
volume
year
}
{}
{ label extra.label sort.label }
INTEGERS { output.state before.all mid.sentence after.sentence after.block }
FUNCTION {init.state.consts}
{ #0 'before.all :=
#1 'mid.sentence :=
#2 'after.sentence :=
#3 'after.block :=
}
STRINGS { s t }
FUNCTION {output.nonnull}
{ 's :=
output.state mid.sentence =
{ ", " * write$ }
{ output.state after.block =
{ add.period$ write$
newline$
"\newblock " write$
}
{ output.state before.all =
'write$
{ add.period$ " " * write$ }
if$
}
if$
mid.sentence 'output.state :=
}
if$
s
}
FUNCTION {output}
{ duplicate$ empty$
'pop$
'output.nonnull
if$
}
FUNCTION {output.check}
{ 't :=
duplicate$ empty$
{ pop$ "empty " t * " in " * cite$ * warning$ }
'output.nonnull
if$
}
FUNCTION {output.bibitem}
{ newline$
"\bibitem[" write$
label write$
"]{" write$
cite$ write$
"}" write$
newline$
""
before.all 'output.state :=
}
FUNCTION {fin.entry}
{ add.period$
write$
newline$
}
FUNCTION {new.block}
{ output.state before.all =
'skip$
{ after.block 'output.state := }
if$
}
FUNCTION {new.sentence}
{ output.state after.block =
'skip$
{ output.state before.all =
'skip$
{ after.sentence 'output.state := }
if$
}
if$
}
FUNCTION {not}
{ { #0 }
{ #1 }
if$
}
FUNCTION {and}
{ 'skip$
{ pop$ #0 }
if$
}
FUNCTION {or}
{ { pop$ #1 }
'skip$
if$
}
FUNCTION {new.block.checka}
{ empty$
'skip$
'new.block
if$
}
FUNCTION {new.block.checkb}
{ empty$
swap$ empty$
and
'skip$
'new.block
if$
}
FUNCTION {new.sentence.checka}
{ empty$
'skip$
'new.sentence
if$
}
FUNCTION {new.sentence.checkb}
{ empty$
swap$ empty$
and
'skip$
'new.sentence
if$
}
FUNCTION {field.or.null}
{ duplicate$ empty$
{ pop$ "" }
'skip$
if$
}
FUNCTION {emphasize}
{ duplicate$ empty$
{ pop$ "" }
{ "{\em " swap$ * "}" * }
if$
}
INTEGERS { nameptr namesleft numnames }
FUNCTION {format.names}
{ 's :=
#1 'nameptr :=
s num.names$ 'numnames :=
numnames 'namesleft :=
{ namesleft #0 > }
{ s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
nameptr #1 >
{ namesleft #1 >
{ ", " * t * }
{ numnames #2 >
{ "," * }
'skip$
if$
t "others" =
{ " et~al." * }
{ " and " * t * }
if$
}
if$
}
't
if$
nameptr #1 + 'nameptr :=
namesleft #1 - 'namesleft :=
}
while$
}
FUNCTION {format.authors}
{ author empty$
{ "" }
{ author format.names }
if$
}
FUNCTION {format.editors}
{ editor empty$
{ "" }
{ editor format.names
editor num.names$ #1 >
{ ", editors" * }
{ ", editor" * }
if$
}
if$
}
FUNCTION {format.title}
{ title empty$
{ "" }
{ title "t" change.case$ }
if$
}
FUNCTION {n.dashify}
{ 't :=
""
{ t empty$ not }
{ t #1 #1 substring$ "-" =
{ t #1 #2 substring$ "--" = not
{ "--" *
t #2 global.max$ substring$ 't :=
}
{ { t #1 #1 substring$ "-" = }
{ "-" *
t #2 global.max$ substring$ 't :=
}
while$
}
if$
}
{ t #1 #1 substring$ *
t #2 global.max$ substring$ 't :=
}
if$
}
while$
}
FUNCTION {format.date}
{ year empty$
{ month empty$
{ "" }
{ "there's a month but no year in " cite$ * warning$
month
}
if$
}
{ month empty$
'year
{ month " " * year * }
if$
}
if$
}
FUNCTION {format.btitle}
{ title emphasize
}
FUNCTION {tie.or.space.connect}
{ duplicate$ text.length$ #3 <
{ "~" }
{ " " }
if$
swap$ * *
}
FUNCTION {either.or.check}
{ empty$
'pop$
{ "can't use both " swap$ * " fields in " * cite$ * warning$ }
if$
}
FUNCTION {format.bvolume}
{ volume empty$
{ "" }
{ "volume" volume tie.or.space.connect
series empty$
'skip$
{ " of " * series emphasize * }
if$
"volume and number" number either.or.check
}
if$
}
FUNCTION {format.number.series}
{ volume empty$
{ number empty$
{ series field.or.null }
{ output.state mid.sentence =
{ "number" }
{ "Number" }
if$
number tie.or.space.connect
series empty$
{ "there's a number but no series in " cite$ * warning$ }
{ " in " * series * }
if$
}
if$
}
{ "" }
if$
}
FUNCTION {format.edition}
{ edition empty$
{ "" }
{ output.state mid.sentence =
{ edition "l" change.case$ " edition" * }
{ edition "t" change.case$ " edition" * }
if$
}
if$
}
INTEGERS { multiresult }
FUNCTION {multi.page.check}
{ 't :=
#0 'multiresult :=
{ multiresult not
t empty$ not
and
}
{ t #1 #1 substring$
duplicate$ "-" =
swap$ duplicate$ "," =
swap$ "+" =
or or
{ #1 'multiresult := }
{ t #2 global.max$ substring$ 't := }
if$
}
while$
multiresult
}
FUNCTION {format.pages}
{ pages empty$
{ "" }
{ pages multi.page.check
{ "pages" pages n.dashify tie.or.space.connect }
{ "page" pages tie.or.space.connect }
if$
}
if$
}
FUNCTION {format.vol.num.pages}
{ volume field.or.null
number empty$
'skip$
{ "(" number * ")" * *
volume empty$
{ "there's a number but no volume in " cite$ * warning$ }
'skip$
if$
}
if$
pages empty$
'skip$
{ duplicate$ empty$
{ pop$ format.pages }
{ ":" * pages n.dashify * }
if$
}
if$
}
FUNCTION {format.chapter.pages}
{ chapter empty$
'format.pages
{ type empty$
{ "chapter" }
{ type "l" change.case$ }
if$
chapter tie.or.space.connect
pages empty$
'skip$
{ ", " * format.pages * }
if$
}
if$
}
FUNCTION {format.in.ed.booktitle}
{ booktitle empty$
{ "" }
{ editor empty$
{ "In " booktitle emphasize * }
{ "In " format.editors * ", " * booktitle emphasize * }
if$
}
if$
}
FUNCTION {empty.misc.check}
{ author empty$ title empty$ howpublished empty$
month empty$ year empty$ note empty$
and and and and and
key empty$ not and
{ "all relevant fields are empty in " cite$ * warning$ }
'skip$
if$
}
FUNCTION {format.thesis.type}
{ type empty$
'skip$
{ pop$
type "t" change.case$
}
if$
}
FUNCTION {format.tr.number}
{ type empty$
{ "Technical Report" }
'type
if$
number empty$
{ "t" change.case$ }
{ number tie.or.space.connect }
if$
}
FUNCTION {format.article.crossref}
{ key empty$
{ journal empty$
{ "need key or journal for " cite$ * " to crossref " * crossref *
warning$
""
}
{ "In {\em " journal * "\/}" * }
if$
}
{ "In " key * }
if$
" \shortcite{" * crossref * "}" *
}
FUNCTION {format.crossref.editor}
{ editor #1 "{vv~}{ll}" format.name$
editor num.names$ duplicate$
#2 >
{ pop$ " et~al." * }
{ #2 <
'skip$
{ editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
{ " et~al." * }
{ " and " * editor #2 "{vv~}{ll}" format.name$ * }
if$
}
if$
}
if$
}
FUNCTION {format.book.crossref}
{ volume empty$
{ "empty volume in " cite$ * "'s crossref of " * crossref * warning$
"In "
}
{ "Volume" volume tie.or.space.connect
" of " *
}
if$
editor empty$
editor field.or.null author field.or.null =
or
{ key empty$
{ series empty$
{ "need editor, key, or series for " cite$ * " to crossref " *
crossref * warning$
"" *
}
{ "{\em " * series * "\/}" * }
if$
}
{ key * }
if$
}
{ format.crossref.editor * }
if$
" \shortcite{" * crossref * "}" *
}
FUNCTION {format.incoll.inproc.crossref}
{ editor empty$
editor field.or.null author field.or.null =
or
{ key empty$
{ booktitle empty$
{ "need editor, key, or booktitle for " cite$ * " to crossref " *
crossref * warning$
""
}
{ "In {\em " booktitle * "\/}" * }
if$
}
{ "In " key * }
if$
}
{ "In " format.crossref.editor * }
if$
" \shortcite{" * crossref * "}" *
}
FUNCTION {article}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
crossref missing$
{ journal emphasize "journal" output.check
format.vol.num.pages output
format.date "year" output.check
}
{ format.article.crossref output.nonnull
format.pages output
}
if$
new.block
note output
fin.entry
}
FUNCTION {book}
{ output.bibitem
author empty$
{ format.editors "author and editor" output.check }
{ format.authors output.nonnull
crossref missing$
{ "author and editor" editor either.or.check }
'skip$
if$
}
if$
new.block
format.btitle "title" output.check
crossref missing$
{ format.bvolume output
new.block
format.number.series output
new.sentence
publisher "publisher" output.check
address output
}
{ new.block
format.book.crossref output.nonnull
}
if$
format.edition output
format.date "year" output.check
new.block
note output
fin.entry
}
FUNCTION {booklet}
{ output.bibitem
format.authors output
new.block
format.title "title" output.check
howpublished address new.block.checkb
howpublished output
address output
format.date output
new.block
note output
fin.entry
}
FUNCTION {inbook}
{ output.bibitem
author empty$
{ format.editors "author and editor" output.check }
{ format.authors output.nonnull
crossref missing$
{ "author and editor" editor either.or.check }
'skip$
if$
}
if$
new.block
format.btitle "title" output.check
crossref missing$
{ format.bvolume output
format.chapter.pages "chapter and pages" output.check
new.block
format.number.series output
new.sentence
publisher "publisher" output.check
address output
}
{ format.chapter.pages "chapter and pages" output.check
new.block
format.book.crossref output.nonnull
}
if$
format.edition output
format.date "year" output.check
new.block
note output
fin.entry
}
FUNCTION {incollection}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
crossref missing$
{ format.in.ed.booktitle "booktitle" output.check
format.bvolume output
format.number.series output
format.chapter.pages output
new.sentence
publisher "publisher" output.check
address output
format.edition output
format.date "year" output.check
}
{ format.incoll.inproc.crossref output.nonnull
format.chapter.pages output
}
if$
new.block
note output
fin.entry
}
FUNCTION {inproceedings}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
crossref missing$
{ format.in.ed.booktitle "booktitle" output.check
format.bvolume output
format.number.series output
format.pages output
address empty$
{ organization publisher new.sentence.checkb
organization output
publisher output
format.date "year" output.check
}
{ address output.nonnull
format.date "year" output.check
new.sentence
organization output
publisher output
}
if$
}
{ format.incoll.inproc.crossref output.nonnull
format.pages output
}
if$
new.block
note output
fin.entry
}
FUNCTION {conference} { inproceedings }
FUNCTION {manual}
{ output.bibitem
author empty$
{ organization empty$
'skip$
{ organization output.nonnull
address output
}
if$
}
{ format.authors output.nonnull }
if$
new.block
format.btitle "title" output.check
author empty$
{ organization empty$
{ address new.block.checka
address output
}
'skip$
if$
}
{ organization address new.block.checkb
organization output
address output
}
if$
format.edition output
format.date output
new.block
note output
fin.entry
}
FUNCTION {mastersthesis}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
"Master's thesis" format.thesis.type output.nonnull
school "school" output.check
address output
format.date "year" output.check
new.block
note output
fin.entry
}
FUNCTION {misc}
{ output.bibitem
format.authors output
title howpublished new.block.checkb
format.title output
howpublished new.block.checka
howpublished output
format.date output
new.block
note output
fin.entry
empty.misc.check
}
FUNCTION {phdthesis}
{ output.bibitem
format.authors "author" output.check
new.block
format.btitle "title" output.check
new.block
"PhD thesis" format.thesis.type output.nonnull
school "school" output.check
address output
format.date "year" output.check
new.block
note output
fin.entry
}
FUNCTION {proceedings}
{ output.bibitem
editor empty$
{ organization output }
{ format.editors output.nonnull }
if$
new.block
format.btitle "title" output.check
format.bvolume output
format.number.series output
address empty$
{ editor empty$
{ publisher new.sentence.checka }
{ organization publisher new.sentence.checkb
organization output
}
if$
publisher output
format.date "year" output.check
}
{ address output.nonnull
format.date "year" output.check
new.sentence
editor empty$
'skip$
{ organization output }
if$
publisher output
}
if$
new.block
note output
fin.entry
}
FUNCTION {techreport}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
format.tr.number output.nonnull
institution "institution" output.check
address output
format.date "year" output.check
new.block
note output
fin.entry
}
FUNCTION {unpublished}
{ output.bibitem
format.authors "author" output.check
new.block
format.title "title" output.check
new.block
note "note" output.check
format.date output
fin.entry
}
FUNCTION {default.type} { misc }
MACRO {jan} {"January"}
MACRO {feb} {"February"}
MACRO {mar} {"March"}
MACRO {apr} {"April"}
MACRO {may} {"May"}
MACRO {jun} {"June"}
MACRO {jul} {"July"}
MACRO {aug} {"August"}
MACRO {sep} {"September"}
MACRO {oct} {"October"}
MACRO {nov} {"November"}
MACRO {dec} {"December"}
MACRO {acmcs} {"ACM Computing Surveys"}
MACRO {acta} {"Acta Informatica"}
MACRO {cacm} {"Communications of the ACM"}
MACRO {ibmjrd} {"IBM Journal of Research and Development"}
MACRO {ibmsj} {"IBM Systems Journal"}
MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
MACRO {ieeetc} {"IEEE Transactions on Computers"}
MACRO {ieeetcad}
{"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
MACRO {ipl} {"Information Processing Letters"}
MACRO {jacm} {"Journal of the ACM"}
MACRO {jcss} {"Journal of Computer and System Sciences"}
MACRO {scp} {"Science of Computer Programming"}
MACRO {sicomp} {"SIAM Journal on Computing"}
MACRO {tocs} {"ACM Transactions on Computer Systems"}
MACRO {tods} {"ACM Transactions on Database Systems"}
MACRO {tog} {"ACM Transactions on Graphics"}
MACRO {toms} {"ACM Transactions on Mathematical Software"}
MACRO {toois} {"ACM Transactions on Office Information Systems"}
MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
MACRO {tcs} {"Theoretical Computer Science"}
READ
FUNCTION {sortify}
{ purify$
"l" change.case$
}
INTEGERS { len }
FUNCTION {chop.word}
{ 's :=
'len :=
s #1 len substring$ =
{ s len #1 + global.max$ substring$ }
's
if$
}
INTEGERS { et.al.char.used }
FUNCTION {initialize.et.al.char.used}
{ #0 'et.al.char.used :=
}
EXECUTE {initialize.et.al.char.used}
FUNCTION {format.lab.names}
{ 's :=
s num.names$ 'numnames :=
numnames #1 =
{ s #1 "{vv }{ll}" format.name$ }
{ numnames #2 =
{ s #1 "{vv }{ll }and " format.name$ s #2 "{vv }{ll}" format.name$ *
}
{ s #1 "{vv }{ll }\bgroup \em et al.\egroup " format.name$ }
if$
}
if$
}
FUNCTION {author.key.label}
{ author empty$
{ key empty$
{ cite$ #1 #3 substring$ }
{ key }
if$
}
{ author format.lab.names }
if$
}
FUNCTION {author.editor.key.label}
{ author empty$
{ editor empty$
{ key empty$
{ cite$ #1 #3 substring$ }
{ key }
if$
}
{ editor format.lab.names }
if$
}
{ author format.lab.names }
if$
}
FUNCTION {author.key.organization.label}
{ author empty$
{ key empty$
{ organization empty$
{ cite$ #1 #3 substring$ }
{ "The " #4 organization chop.word #3 text.prefix$ }
if$
}
{ key }
if$
}
{ author format.lab.names }
if$
}
FUNCTION {editor.key.organization.label}
{ editor empty$
{ key empty$
{ organization empty$
{ cite$ #1 #3 substring$ }
{ "The " #4 organization chop.word #3 text.prefix$ }
if$
}
{ key }
if$
}
{ editor format.lab.names }
if$
}
FUNCTION {calc.label}
{ type$ "book" =
type$ "inbook" =
or
'author.editor.key.label
{ type$ "proceedings" =
'editor.key.organization.label
{ type$ "manual" =
'author.key.organization.label
'author.key.label
if$
}
if$
}
if$
duplicate$
"\protect\citeauthoryear{" swap$ * "}{" *
year field.or.null purify$ * % CHANGED - pfps - 15 Feb 1989
'label :=
year field.or.null purify$ *
sortify 'sort.label :=
}
FUNCTION {sort.format.names}
{ 's :=
#1 'nameptr :=
""
s num.names$ 'numnames :=
numnames 'namesleft :=
{ namesleft #0 > }
{ nameptr #1 >
{ " " * }
'skip$
if$
s nameptr "{vv{ } }{ll{ }}{ ff{ }}{ jj{ }}" format.name$ 't :=
nameptr numnames = t "others" = and
{ "et al" * }
{ t sortify * }
if$
nameptr #1 + 'nameptr :=
namesleft #1 - 'namesleft :=
}
while$
}
FUNCTION {sort.format.title}
{ 't :=
"A " #2
"An " #3
"The " #4 t chop.word
chop.word
chop.word
sortify
#1 global.max$ substring$
}
FUNCTION {author.sort}
{ author empty$
{ key empty$
{ "to sort, need author or key in " cite$ * warning$
""
}
{ key sortify }
if$
}
{ author sort.format.names }
if$
}
FUNCTION {author.editor.sort}
{ author empty$
{ editor empty$
{ key empty$
{ "to sort, need author, editor, or key in " cite$ * warning$
""
}
{ key sortify }
if$
}
{ editor sort.format.names }
if$
}
{ author sort.format.names }
if$
}
FUNCTION {author.organization.sort}
{ author empty$
{ organization empty$
{ key empty$
{ "to sort, need author, organization, or key in " cite$ * warning$
""
}
{ key sortify }
if$
}
{ "The " #4 organization chop.word sortify }
if$
}
{ author sort.format.names }
if$
}
FUNCTION {editor.organization.sort}
{ editor empty$
{ organization empty$
{ key empty$
{ "to sort, need editor, organization, or key in " cite$ * warning$
""
}
{ key sortify }
if$
}
{ "The " #4 organization chop.word sortify }
if$
}
{ editor sort.format.names }
if$
}
FUNCTION {presort}
{ calc.label
sort.label
" "
*
type$ "book" =
type$ "inbook" =
or
'author.editor.sort
{ type$ "proceedings" =
'editor.organization.sort
{ type$ "manual" =
'author.organization.sort
'author.sort
if$
}
if$
}
if$
*
" "
*
year field.or.null sortify
*
" "
*
title field.or.null
sort.format.title
*
#1 entry.max$ substring$
'sort.key$ :=
}
ITERATE {presort}
SORT
STRINGS { longest.label last.sort.label next.extra }
INTEGERS { longest.label.width last.extra.num }
FUNCTION {initialize.longest.label}
{ "" 'longest.label :=
#0 int.to.chr$ 'last.sort.label :=
"" 'next.extra :=
#0 'longest.label.width :=
#0 'last.extra.num :=
}
FUNCTION {forward.pass}
{ last.sort.label sort.label =
{ last.extra.num #1 + 'last.extra.num :=
last.extra.num int.to.chr$ 'extra.label :=
}
{ "a" chr.to.int$ 'last.extra.num :=
"" 'extra.label :=
sort.label 'last.sort.label :=
}
if$
}
FUNCTION {reverse.pass}
{ next.extra "b" =
{ "a" 'extra.label := }
'skip$
if$
label extra.label * "}" * 'label := % CHANGED - pfps 15 Feb 1989
label width$ longest.label.width >
{ label 'longest.label :=
label width$ 'longest.label.width :=
}
'skip$
if$
extra.label 'next.extra :=
}
EXECUTE {initialize.longest.label}
ITERATE {forward.pass}
REVERSE {reverse.pass}
FUNCTION {begin.bib}
{ et.al.char.used
{ "\newcommand{\etalchar}[1]{$^{#1}$}" write$ newline$ }
'skip$
if$
preamble$ empty$
'skip$
{ preamble$ write$ newline$ }
if$
"\begin{thebibliography}{}" write$ newline$
}
EXECUTE {begin.bib}
EXECUTE {init.state.consts}
ITERATE {call.type$}
FUNCTION {end.bib}
{ newline$
"\end{thebibliography}" write$ newline$
}
EXECUTE {end.bib}
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\citation{sutton1988learning}
\citation{tsitsiklis1997analysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2008convergent}
\citation{sutton2009fast}
\citation{sutton2016emphatic}
\citation{chen2023modified}
\citation{hackman2012faster}
\citation{liu2015finite,liu2016proximal,liu2018proximal}
\citation{givchi2015quasi}
\citation{pan2017accelerated}
\citation{hallak2016generalized}
\citation{zhang2022truncated}
\citation{johnson2013accelerating}
\citation{korda2015td}
\citation{xu2019reanalysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2009fast}
\citation{sutton2009fast}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\newlabel{introduction}{{1}{1}{Introduction}{section.1}{}}
\citation{feng2019kernel}
\citation{basserrano2021logistic}
\citation{Sutton2018book}
\citation{Sutton2018book}
\@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}{section.2}\protected@file@percent }
\newlabel{preliminaries}{{2}{2}{Background}{section.2}{}}
\newlabel{valuefunction}{{2}{2}{Background}{section.2}{}}
\newlabel{linearvaluefunction}{{1}{2}{Background}{equation.2.1}{}}
\citation{sutton2009fast}
\citation{sutton2009fast}
\citation{ng1999policy}
\citation{devlin2012dynamic}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Classification accuracies for naive Bayes and flexible Bayes on various data sets.}}{3}{table.1}\protected@file@percent }
\newlabel{example_bias}{{1}{3}{Classification accuracies for naive Bayes and flexible Bayes on various data sets}{table.1}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Variance Minimization Algorithms}{3}{section.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Motivation}{3}{subsection.3.1}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces VMTD algorithm with linear function approximation in the on-policy setting}}{4}{algorithm.1}\protected@file@percent }
\newlabel{alg:algorithm 1}{{1}{4}{Variance Minimization TD Learning: VMTD}{algorithm.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Variance Minimization TD Learning: VMTD}{4}{subsection.3.2}\protected@file@percent }
\newlabel{omega}{{3}{4}{Variance Minimization TD Learning: VMTD}{equation.3.3}{}}
\newlabel{delta}{{4}{4}{Variance Minimization TD Learning: VMTD}{equation.3.4}{}}
\newlabel{theta}{{5}{4}{Variance Minimization TD Learning: VMTD}{equation.3.5}{}}
\newlabel{deltaSarsa}{{8}{4}{Variance Minimization TD Learning: VMTD}{equation.3.8}{}}
\newlabel{deltaQ}{{9}{4}{Variance Minimization TD Learning: VMTD}{equation.3.9}{}}
\citation{dalal2020tale}
\citation{dalal2020tale}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Variance Minimization TDC Learning: VMTDC}{5}{subsection.3.3}\protected@file@percent }
\newlabel{thetavmtdc}{{11}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.11}{}}
\newlabel{uvmtdc}{{12}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.12}{}}
\newlabel{omegavmtdc}{{13}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.13}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Theoretical Analysis}{5}{section.4}\protected@file@percent }
\newlabel{theorem1}{{4.1}{5}{}{theorem.4.1}{}}
\newlabel{corollary4_2}{{4.2}{5}{}{theorem.4.2}{}}
\citation{Sutton2018book}
\citation{sutton2009fast}
\citation{baird1995residual,sutton2009fast}
\citation{baird1995residual,sutton2009fast,maei2011gradient}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Random walk.}}{6}{figure.1}\protected@file@percent }
\newlabel{randomwalk}{{1}{6}{Random walk}{figure.1}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces 7-state version of Baird's off-policy counterexample.}}{6}{figure.2}\protected@file@percent }
\newlabel{bairdexample}{{2}{6}{7-state version of Baird's off-policy counterexample}{figure.2}{}}
\newlabel{theorem2}{{4.3}{6}{}{theorem.4.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Studies}{6}{section.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Testing Tasks}{6}{subsection.5.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Experimental Results and Analysis}{7}{subsection.5.2}\protected@file@percent }
\newlabel{DependentFull}{{3(a)}{7}{Subfigure 3(a)}{subfigure.3.1}{}}
\newlabel{sub@DependentFull}{{(a)}{7}{Subfigure 3(a)\relax }{subfigure.3.1}{}}
\newlabel{TabularFull}{{3(b)}{7}{Subfigure 3(b)}{subfigure.3.2}{}}
\newlabel{sub@TabularFull}{{(b)}{7}{Subfigure 3(b)\relax }{subfigure.3.2}{}}
\newlabel{InvertedFull}{{3(c)}{7}{Subfigure 3(c)}{subfigure.3.3}{}}
\newlabel{sub@InvertedFull}{{(c)}{7}{Subfigure 3(c)\relax }{subfigure.3.3}{}}
\newlabel{CounterExampleFull}{{3(d)}{7}{Subfigure 3(d)}{subfigure.3.4}{}}
\newlabel{sub@CounterExampleFull}{{(d)}{7}{Subfigure 3(d)\relax }{subfigure.3.4}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Learning curses of four evaluation environments.}}{7}{figure.3}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Dependent}}}{7}{figure.3}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Tabular}}}{7}{figure.3}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Inverted}}}{7}{figure.3}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {counterexample}}}{7}{figure.3}\protected@file@percent }
\newlabel{Evaluation_full}{{3}{7}{Learning curses of four evaluation environments}{figure.3}{}}
\citation{schwartz1993reinforcement}
\newlabel{MazeFull}{{4(a)}{8}{Subfigure 4(a)}{subfigure.4.1}{}}
\newlabel{sub@MazeFull}{{(a)}{8}{Subfigure 4(a)\relax }{subfigure.4.1}{}}
\newlabel{CliffWalkingFull}{{4(b)}{8}{Subfigure 4(b)}{subfigure.4.2}{}}
\newlabel{sub@CliffWalkingFull}{{(b)}{8}{Subfigure 4(b)\relax }{subfigure.4.2}{}}
\newlabel{MountainCarFull}{{4(c)}{8}{Subfigure 4(c)}{subfigure.4.3}{}}
\newlabel{sub@MountainCarFull}{{(c)}{8}{Subfigure 4(c)\relax }{subfigure.4.3}{}}
\newlabel{AcrobotFull}{{4(d)}{8}{Subfigure 4(d)}{subfigure.4.4}{}}
\newlabel{sub@AcrobotFull}{{(d)}{8}{Subfigure 4(d)\relax }{subfigure.4.4}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Learning curses of four contral environments.}}{8}{figure.4}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Maze}}}{8}{figure.4}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Cliff Walking}}}{8}{figure.4}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Mountain Car}}}{8}{figure.4}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Acrobot}}}{8}{figure.4}\protected@file@percent }
\newlabel{Complete_full}{{4}{8}{Learning curses of four contral environments}{figure.4}{}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{8}{section.6}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Difference between VMQ and R-learning}{8}{subsection.6.1}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Difference between R-learning and tabular VMQ.}}{8}{table.2}\protected@file@percent }
\newlabel{differenceRandVMQ}{{2}{8}{Difference between R-learning and tabular VMQ}{table.2}{}}
\citation{korda2015td}
\citation{xu2020reanalysis}
\citation{Sutton2018book}
\citation{Sutton2018book}
\citation{schulman2015trust}
\citation{schulman2017proximal}
\citation{borkar1997stochastic}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Variance Reduction for TD Learning}{9}{subsection.6.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Variance Reduction for Policy Gradient Algorithms}{9}{subsection.6.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion and Future Work}{9}{section.7}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {A}Relevant proofs}{9}{appendix.A}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {A.1}Proof of Theorem \ref {theorem1}}{9}{subsection.A.1}\protected@file@percent }
\newlabel{proofth1}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}}
\newlabel{th1proof}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{thetaFast}{{19}{10}{Proof of Theorem \ref {theorem1}}{equation.A.19}{}}
\newlabel{omegaFast}{{20}{10}{Proof of Theorem \ref {theorem1}}{equation.A.20}{}}
\newlabel{omegaFastFinal}{{21}{10}{Proof of Theorem \ref {theorem1}}{equation.A.21}{}}
\newlabel{omegaInfty}{{22}{10}{Proof of Theorem \ref {theorem1}}{equation.A.22}{}}
\newlabel{odetheta}{{23}{10}{Proof of Theorem \ref {theorem1}}{equation.A.23}{}}
\citation{dalal2020tale}
\citation{dalal2020tale}
\newlabel{covariance}{{24}{11}{Proof of Theorem \ref {theorem1}}{equation.A.24}{}}
\newlabel{odethetafinal}{{25}{11}{Proof of Theorem \ref {theorem1}}{equation.A.25}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {A.2}Proof of Corollary \ref {corollary4_2}}{11}{subsection.A.2}\protected@file@percent }
\newlabel{proofcorollary4_2}{{A.2}{11}{Proof of Corollary \ref {corollary4_2}}{subsection.A.2}{}}
\newlabel{matrixassumption}{{A.1}{11}{}{theorem.A.1}{}}
\newlabel{stepsizeassumption}{{A.2}{11}{}{theorem.A.2}{}}
\newlabel{sparseprojection}{{A.3}{11}{}{theorem.A.3}{}}
\citation{dalal2020tale}
\citation{dalal2020tale}
\citation{sutton2009fast}
\citation{hirsch1989convergent}
\newlabel{sparseprojectiontheta}{{30}{12}{}{equation.A.30}{}}
\newlabel{sparseprojectionomega}{{31}{12}{}{equation.A.31}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {A.3}Proof of Theorem \ref {theorem2}}{12}{subsection.A.3}\protected@file@percent }
\newlabel{proofth2}{{A.3}{12}{Proof of Theorem \ref {theorem2}}{subsection.A.3}{}}
\newlabel{thetavmtdcFastest}{{32}{12}{Proof of Theorem \ref {theorem2}}{equation.A.32}{}}
\newlabel{uvmtdcFastest}{{33}{12}{Proof of Theorem \ref {theorem2}}{equation.A.33}{}}
\newlabel{omegavmtdcFastest}{{34}{12}{Proof of Theorem \ref {theorem2}}{equation.A.34}{}}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{omegavmtdcFastestFinal}{{35}{13}{Proof of Theorem \ref {theorem2}}{equation.A.35}{}}
\newlabel{omegavmtdcInfty}{{36}{13}{Proof of Theorem \ref {theorem2}}{equation.A.36}{}}
\newlabel{thetavmtdcFaster}{{37}{13}{Proof of Theorem \ref {theorem2}}{equation.A.37}{}}
\newlabel{uvmtdcFaster}{{38}{13}{Proof of Theorem \ref {theorem2}}{equation.A.38}{}}
\newlabel{uvmtdcFasterFinal}{{39}{13}{Proof of Theorem \ref {theorem2}}{equation.A.39}{}}
\newlabel{uvmtdcInfty}{{40}{13}{Proof of Theorem \ref {theorem2}}{equation.A.40}{}}
\newlabel{thetavmtdcSlowerFinal}{{42}{14}{Proof of Theorem \ref {theorem2}}{equation.A.42}{}}
\newlabel{odethetavmtdcfinal}{{43}{14}{Proof of Theorem \ref {theorem2}}{equation.A.43}{}}
\@writefile{toc}{\contentsline {section}{\numberline {B}Experimental details}{14}{appendix.B}\protected@file@percent }
\newlabel{experimentaldetails}{{B}{14}{Experimental details}{appendix.B}{}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces VMTDC algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.2}\protected@file@percent }
\newlabel{alg:algorithm 2}{{2}{15}{Proof of Theorem \ref {theorem2}}{algorithm.2}{}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces VMGTD algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.3}\protected@file@percent }
\newlabel{alg:algorithm 3}{{3}{15}{Proof of Theorem \ref {theorem2}}{algorithm.3}{}}
\bibstyle{named}
\bibdata{neurips_2024}
\bibcite{baird1995residual}{{1}{1995}{{Baird and others}}{{}}}
\bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano \bgroup \em et al.\egroup }}{{}}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces VMGTD2 algorithm with linear function approximation in the off-policy setting}}{16}{algorithm.4}\protected@file@percent }
\newlabel{alg:algorithm 4}{{4}{16}{Proof of Theorem \ref {theorem2}}{algorithm.4}{}}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Learning rates ($lr$) of four control experiments.}}{16}{table.3}\protected@file@percent }
\newlabel{lrofways}{{3}{16}{Learning rates ($lr$) of four control experiments}{table.3}{}}
\bibcite{borkar2000ode}{{3}{2000}{{Borkar and Meyn}}{{}}}
\bibcite{borkar1997stochastic}{{4}{1997}{{Borkar}}{{}}}
\bibcite{chen2023modified}{{5}{2023}{{Chen \bgroup \em et al.\egroup }}{{}}}
\bibcite{dalal2020tale}{{6}{2020}{{Dalal \bgroup \em et al.\egroup }}{{}}}
\bibcite{devlin2012dynamic}{{7}{2012}{{Devlin and Kudenko}}{{}}}
\bibcite{feng2019kernel}{{8}{2019}{{Feng \bgroup \em et al.\egroup }}{{}}}
\bibcite{givchi2015quasi}{{9}{2015}{{Givchi and Palhang}}{{}}}
\bibcite{hackman2012faster}{{10}{2012}{{Hackman}}{{}}}
\bibcite{hallak2016generalized}{{11}{2016}{{Hallak \bgroup \em et al.\egroup }}{{}}}
\bibcite{hirsch1989convergent}{{12}{1989}{{Hirsch}}{{}}}
\bibcite{johnson2013accelerating}{{13}{2013}{{Johnson and Zhang}}{{}}}
\bibcite{korda2015td}{{14}{2015}{{Korda and La}}{{}}}
\bibcite{liu2015finite}{{15}{2015}{{Liu \bgroup \em et al.\egroup }}{{}}}
\bibcite{liu2016proximal}{{16}{2016}{{Liu \bgroup \em et al.\egroup }}{{}}}
\bibcite{liu2018proximal}{{17}{2018}{{Liu \bgroup \em et al.\egroup }}{{}}}
\bibcite{maei2011gradient}{{18}{2011}{{Maei}}{{}}}
\bibcite{ng1999policy}{{19}{1999}{{Ng \bgroup \em et al.\egroup }}{{}}}
\bibcite{pan2017accelerated}{{20}{2017}{{Pan \bgroup \em et al.\egroup }}{{}}}
\bibcite{schulman2015trust}{{21}{2015}{{Schulman \bgroup \em et al.\egroup }}{{}}}
\bibcite{schulman2017proximal}{{22}{2017}{{Schulman \bgroup \em et al.\egroup }}{{}}}
\bibcite{schwartz1993reinforcement}{{23}{1993}{{Schwartz}}{{}}}
\bibcite{Sutton2018book}{{24}{2018}{{Sutton and Barto}}{{}}}
\bibcite{sutton2008convergent}{{25}{2008}{{Sutton \bgroup \em et al.\egroup }}{{}}}
\bibcite{sutton2009fast}{{26}{2009}{{Sutton \bgroup \em et al.\egroup }}{{}}}
\bibcite{sutton2016emphatic}{{27}{2016}{{Sutton \bgroup \em et al.\egroup }}{{}}}
\bibcite{sutton1988learning}{{28}{1988}{{Sutton}}{{}}}
\bibcite{tsitsiklis1997analysis}{{29}{1997}{{Tsitsiklis and Van~Roy}}{{}}}
\bibcite{xu2019reanalysis}{{30}{2019}{{Xu \bgroup \em et al.\egroup }}{{}}}
\bibcite{xu2020reanalysis}{{31}{2020}{{Xu \bgroup \em et al.\egroup }}{{}}}
\bibcite{zhang2022truncated}{{32}{2022}{{Zhang and Whiteson}}{{}}}
\gdef \@abspage@last{18}
\begin{thebibliography}{}
\bibitem[\protect\citeauthoryear{Baird and others}{1995}]{baird1995residual}
Leemon Baird et~al.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In {\em Proc. 12th Int. Conf. Mach. Learn.}, pages 30--37, 1995.
\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic}
Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu.
\newblock Logistic Q-learning.
\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021.
\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode}
Vivek~S Borkar and Sean~P Meyn.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock {\em SIAM J. Control Optim.}, 38(2):447--469, 2000.
\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic}
Vivek~S Borkar.
\newblock Stochastic approximation with two time scales.
\newblock {\em Syst. \& Control Letters}, 29(5):291--294, 1997.
\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified}
Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao.
\newblock Modified Retrace for off-policy temporal difference learning.
\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023.
\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale}
Gal Dalal, Balazs Szorenyi, and Gugan Thoppe.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020.
\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic}
Sam Devlin and Daniel Kudenko.
\newblock Dynamic potential-based reward shaping.
\newblock In {\em Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, pages 433--440, 2012.
\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel}
Yihao Feng, Lihong Li, and Qiang Liu.
\newblock A kernel loss for solving the Bellman equation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019.
\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi}
Arash Givchi and Maziar Palhang.
\newblock Quasi Newton temporal difference learning.
\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015.
\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster}
Leah Hackman.
\newblock {\em Faster Gradient-TD Algorithms}.
\newblock PhD thesis, University of Alberta, 2012.
\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized}
Assaf Hallak, Aviv Tamar, Remi Munos, and Shie Mannor.
\newblock Generalized emphatic temporal difference learning: bias-variance analysis.
\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016.
\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent}
Morris~W Hirsch.
\newblock Convergent activation dynamics in continuous time networks.
\newblock {\em Neural Netw.}, 2(5):331--349, 1989.
\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating}
R.~Johnson and T.~Zhang.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013.
\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td}
Nathaniel Korda and Prashanth La.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In {\em International conference on machine learning}, pages 626--634. PMLR, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In {\em Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In {\em Proceedings of the International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal}
Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018.
\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient}
Hamid~Reza Maei.
\newblock {\em Gradient temporal-difference learning algorithms}.
\newblock PhD thesis, University of Alberta, 2011.
\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy}
Andrew~Y Ng, Daishi Harada, and Stuart Russell.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In {\em Proc. 16th Int. Conf. Mach. Learn.}, pages 278--287, 1999.
\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated}
Yangchen Pan, Adam White, and Martha White.
\newblock Accelerated gradient temporal difference learning.
\newblock In {\em Proceedings of the 21st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust}
J.~Schulman, S.~Levine, P.~Abbeel, M.~Jordan, and P.~Moritz.
\newblock Trust region policy optimization.
\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal}
J.~Schulman, F.~Wolski, P.~Dhariwal, A.~Radford, and O.~Klimov.
\newblock Proximal policy optimization algorithms.
\newblock {\em arXiv preprint arXiv:1707.06347}, 2017.
\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement}
Anton Schwartz.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In {\em Proc. 10th Int. Conf. Mach. Learn.}, volume 298, pages 298--305, 1993.
\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book}
Richard~S. Sutton and Andrew~G. Barto.
\newblock {\em Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent}
Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616. Cambridge, MA: MIT Press, 2008.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast}
R.S. Sutton, H.R. Maei, D.~Precup, S.~Bhatnagar, D.~Silver, C.~Szepesv{\'a}ri, and E.~Wiewiora.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In {\em Proc. 26th Int. Conf. Mach. Learn.}, pages 993--1000, 2009.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
Richard~S Sutton, A~Rupam Mahmood, and Martha White.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.
\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
Richard~S Sutton.
\newblock Learning to predict by the methods of temporal differences.
\newblock {\em Machine learning}, 3(1):9--44, 1988.
\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
John~N Tsitsiklis and Benjamin Van~Roy.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In {\em International Conference on Learning Representations}, 2019.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
T.~Xu, Z.~Wang, Y.~Zhou, and Y.~Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.
\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
Shangtong Zhang and Shimon Whiteson.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022.
\end{thebibliography}
@inproceedings{langley00,
author = {P. Langley},
title = {Crafting Papers on Machine Learning},
year = {2000},
pages = {1207--1216},
editor = {Pat Langley},
booktitle = {Proceedings of the 17th International Conference
on Machine Learning (ICML 2000)},
address = {Stanford, CA},
publisher = {Morgan Kaufmann}
}
@TechReport{mitchell80,
author = "T. M. Mitchell",
title = "The Need for Biases in Learning Generalizations",
institution = "Computer Science Department, Rutgers University",
year = "1980",
address = "New Brunswick, MA",
}
@phdthesis{kearns89,
author = {M. J. Kearns},
title = {Computational Complexity of Machine Learning},
school = {Department of Computer Science, Harvard University},
year = {1989}
}
@Book{MachineLearningI,
editor = "R. S. Michalski and J. G. Carbonell and T.
M. Mitchell",
title = "Machine Learning: An Artificial Intelligence
Approach, Vol. I",
publisher = "Tioga",
year = "1983",
address = "Palo Alto, CA"
}
@Book{DudaHart2nd,
author = "R. O. Duda and P. E. Hart and D. G. Stork",
title = "Pattern Classification",
publisher = "John Wiley and Sons",
edition = "2nd",
year = "2000"
}
@misc{anonymous,
title= {Suppressed for Anonymity},
author= {Author, N. N.},
year= {2021}
}
@InCollection{Newell81,
author = "A. Newell and P. S. Rosenbloom",
title = "Mechanisms of Skill Acquisition and the Law of
Practice",
booktitle = "Cognitive Skills and Their Acquisition",
pages = "1--51",
publisher = "Lawrence Erlbaum Associates, Inc.",
year = "1981",
editor = "J. R. Anderson",
chapter = "1",
address = "Hillsdale, NJ"
}
@Article{Samuel59,
author = "A. L. Samuel",
title = "Some Studies in Machine Learning Using the Game of
Checkers",
journal = "IBM Journal of Research and Development",
year = "1959",
volume = "3",
number = "3",
pages = "211--229"
}
@book{em:86,
editor = "Engelmore, Robert and Morgan, Anthony",
title = "Blackboard Systems",
year = 1986,
address = "Reading, Mass.",
publisher = "Addison-Wesley",
}
@inproceedings{dalal2018finite,
title={Finite sample analyses for TD (0) with function approximation},
author={Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan and Mannor, Shie},
booktitle={Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence},
pages={6144--6160},
year={2018}
}
@inproceedings{xu2019reanalysis,
title={Reanalysis of Variance Reduced Temporal Difference Learning},
author={Xu, Tengyu and Wang, Zhe and Zhou, Yi and Liang, Yingbin},
booktitle={International Conference on Learning Representations},
year={2019}
}
@inproceedings{c:83,
author = "Clancey, William J.",
year = 1983,
title = "{Communication, Simulation, and Intelligent
Agents: Implications of Personal Intelligent Machines
for Medical Education}",
booktitle="Proceedings of the Eighth International Joint Conference on Artificial Intelligence {(IJCAI-83)}",
pages = "556-560",
address = "Menlo Park, Calif",
publisher = "{IJCAI Organization}",
}
@inproceedings{c:84,
author = "Clancey, William J.",
year = 1984,
title = "{Classification Problem Solving}",
booktitle = "Proceedings of the Fourth National
Conference on Artificial Intelligence",
pages = "45-54",
address = "Menlo Park, Calif.",
publisher="AAAI Press",
}
@article{r:80,
author = {Robinson, Arthur L.},
title = {New Ways to Make Microcircuits Smaller},
volume = {208},
number = {4447},
pages = {1019--1022},
year = {1980},
doi = {10.1126/science.208.4447.1019},
publisher = {American Association for the Advancement of Science},
issn = {0036-8075},
URL = {https://science.sciencemag.org/content/208/4447/1019},
eprint = {https://science.sciencemag.org/content/208/4447/1019.full.pdf},
journal = {Science},
}
@article{r:80x,
author = "Robinson, Arthur L.",
year = 1980,
title = "{New Ways to Make Microcircuits Smaller---Duplicate Entry}",
journal = "Science",
volume = 208,
pages = "1019-1026",
}
@article{hcr:83,
title = {Strategic explanations for a diagnostic consultation system},
journal = {International Journal of Man-Machine Studies},
volume = {20},
number = {1},
pages = {3-19},
year = {1984},
issn = {0020-7373},
doi = {https://doi.org/10.1016/S0020-7373(84)80003-6},
url = {https://www.sciencedirect.com/science/article/pii/S0020737384800036},
author = {Diane Warner Hasling and William J. Clancey and Glenn Rennels},
abstract = {This article examines the problem of automatic explanation of reasoning, especially as it relates to expert systems. By explanation we mean the ability of a program to discuss what it is doing in some understandable way. We first present a general framework in which to view explanation and review some of the research done in this area. We then focus on the explanation system for NEOMYCIN, a medical consultation program. A consultation program interactively helps a user to solve a problem. Our goal is to have NEOMYCIN explain its problem-solving strategies. An explanation of strategy describes the plan the program is using to reach a solution. Such an explanation is usually concrete, referring to aspects of the current problem situation. Abstract explanations articulate a general principle, which can be applied in different situations; such explanations are useful in teaching and in explaining by analogy. We describe the aspects of NEOMYCIN that make abstract strategic explanations possible—the representation of strategic knowledge explicitly and separately from domain knowledge—and demonstrate how this representation can be used to generate explanations.}
}
@article{hcrt:83,
author = "Hasling, Diane Warner and Clancey, William J. and Rennels, Glenn R. and Test, Thomas",
year = 1983,
title = "{Strategic Explanations in Consultation---Duplicate}",
journal = "The International Journal of Man-Machine Studies",
volume = 20,
number = 1,
pages = "3-19",
}
@techreport{r:86,
author = "Rice, James",
year = 1986,
title = "{Poligon: A System for Parallel Problem Solving}",
type = "Technical Report",
number = "KSL-86-19",
institution = "Dept.\ of Computer Science, Stanford Univ.",
}
@phdthesis{c:79,
author = "Clancey, William J.",
year = 1979,
title = "{Transfer of Rule-Based Expertise
through a Tutorial Dialogue}",
type = "{Ph.D.} diss.",
school = "Dept.\ of Computer Science, Stanford Univ.",
address = "Stanford, Calif.",
}
@unpublished{c:21,
author = "Clancey, William J.",
title = "{The Engineering of Qualitative Models}",
year = 2021,
note = "Forthcoming",
}
@misc{c:22,
title={Attention Is All You Need},
author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
year={2017},
eprint={1706.03762},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{c:23,
title = "Pluto: The 'Other' Red Planet",
author = "{NASA}",
howpublished = "\url{https://www.nasa.gov/nh/pluto-the-other-red-planet}",
year = 2015,
note = "Accessed: 2018-12-06"
}
@article{xu2013online,
title={Online learning control using adaptive critic designs with sparse kernel machines},
author={Xu, Xin and Hou, Zhongsheng and Lian, Chuanqiang and He, Haibo},
journal={IEEE Trans. Neural Netw. Learn. Syst.},
volume={24},
number={5},
pages={762--775},
year={2013},
publisher={IEEE}
}
@article{bertsekas2017value,
title={Value and policy iterations in optimal control and adaptive dynamic programming},
author={Bertsekas, Dimitri P},
journal={IEEE Trans. Neural Netw. Learn. Syst.},
year={2017},
volume={28},
number={3},
pages={500 - 509},
publisher={IEEE}
}
@phdthesis{hackman2012faster,
title={Faster Gradient-TD Algorithms},
author={Hackman, Leah},
year={2012},
school={University of Alberta}
}
@inproceedings{harutyunyan2015multi,
title={Multi-scale reward shaping via an off-policy ensemble},
author={Harutyunyan, Anna and Brys, Tim and Vrancx, Peter and Now{\'e}, Ann},
booktitle={Proc. 2015 Int. Conf. Autonomous Agents and Multiagent Systems},
pages={1641--1642},
year={2015},
organization={International Foundation for Autonomous Agents and Multiagent Systems}
}
@inproceedings{harutyunyan2015expressing,
title={Expressing Arbitrary Reward Functions as Potential-Based Advice.},
author={Harutyunyan, Anna and Devlin, Sam and Vrancx, Peter and Now{\'e}, Ann},
booktitle={AAAI},
pages={2652--2658},
year={2015}
}
@article{wiewiora2003potential,
title={Potential-based shaping and Q-value initialization are equivalent},
author={Wiewiora, Eric},
journal={J. Artif. Intell. Res.},
volume={19},
pages={205--208},
year={2003}
}
@article{grzes2010online,
title={Online learning of shaping rewards in reinforcement learning},
author={Grze{\'s}, Marek and Kudenko, Daniel},
journal={Neural Netw.},
volume={23},
number={4},
pages={541--550},
year={2010},
publisher={Elsevier}
}
@inproceedings{marthi2007automatic,
title={Automatic shaping and decomposition of reward functions},
author={Marthi, Bhaskara},
booktitle={Proc. 24th Int. Conf. Mach. Learn.},
pages={601--608},
year={2007}
}
@inproceedings{laud2003influence,
title={The Influence of Reward on the Speed of Reinforcement Learning: An Analysis of Shaping},
author={Laud, Adam and Dejong, Gerald},
booktitle={Proc. 20th Int. Conf. Mach. Learn.},
pages={440--447},
year={2003}
}
@phdthesis{laud2004theory,
title={Theory and application of reward shaping in reinforcement learning},
author={Laud, Adam Daniel},
year={2004},
school={University of Illinois at Urbana-Champaign}
}
@article{geist2013algorithmic,
title={Algorithmic survey of parametric value function approximation},
author={Geist, Matthieu and Pietquin, Olivier},
journal={IEEE Trans. Neural Netw. Learn. Syst.},
volume={24},
number={6},
pages={845--867},
year={2013},
publisher={IEEE}
}
@article{furmston2016approximate,
title={Approximate Newton Methods for Policy Search in Markov Decision Processes},
author={Furmston, Thomas and Lever, Guy and Barber, David},
journal={J. Mach. Learn. Res.},
volume={17},
number={227},
pages={1--51},
year={2016}
}
@article{silver2016mastering,
title={Mastering the game of Go with deep neural networks and tree search},
author={Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
journal={Nature},
volume={529},
number={7587},
pages={484--489},
year={2016},
publisher={Nature Publishing Group}
}
@article{mnih2015human,
title={Human-level control through deep reinforcement learning},
author={Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and others},
journal={Nature},
volume={518},
number={7540},
pages={529--533},
year={2015},
publisher={Nature Publishing Group}
}
@inproceedings{guo2014deep,
title={Deep learning for real-time Atari game play using offline Monte-Carlo tree search planning},
author={Guo, Xiaoxiao and Singh, Satinder and Lee, Honglak and Lewis, Richard L and Wang, Xiaoshi},
booktitle={Advances in Neural Information Processing Systems},
pages={3338--3346},
publisher={Cambridge, MA: MIT Press},
year={2014}
}
@inproceedings{scherrer2010should,
title={Should one compute the Temporal Difference fix point or minimize the Bellman Residual? The unified oblique projection view},
author={Scherrer, Bruno},
booktitle={Proc. 27th Int. Conf. Mach. Learn.},
pages={959--966},
year={2010}
}
@article{hirsch1989convergent,
title={Convergent activation dynamics in continuous time networks},
author={Hirsch, Morris W},
journal={Neural Netw.},
volume={2},
number={5},
pages={331--349},
year={1989},
publisher={Elsevier}
}
@article{borkar1997stochastic,
title={Stochastic approximation with two time scales},
author={Borkar, Vivek S},
journal={Syst. \& Control Letters},
volume={29},
number={5},
pages={291--294},
year={1997},
publisher={Elsevier}
}
@article{ortner2013adaptive,
title={Adaptive aggregation for reinforcement learning in average reward Markov decision processes},
author={Ortner, Ronald},
journal={Annals Oper. Res.},
volume={208},
number={1},
pages={321--336},
year={2013},
publisher={Springer}
}
@article{jaksch2010near,
title={Near-optimal regret bounds for reinforcement learning},
author={Jaksch, Thomas and Ortner, Ronald and Auer, Peter},
journal={Journal of Machine Learning Research},
number={Apr},
volume={11},
pages={1563--1600},
year={2010}
}
@article{ortner2007logarithmic,
title={Logarithmic online regret bounds for undiscounted reinforcement learning},
author={Ortner, Ronald and Auer, Peter},
journal={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
volume={19},
pages={49},
year={2007}
}
@article{das1999solving,
title={Solving semi-Markov decision problems using average reward reinforcement learning},
author={Das, Tapas K and Gosavi, Abhijit and Mahadevan, Sridhar and Marchalleck, Nicholas},
journal={Management Science},
volume={45},
number={4},
pages={560--574},
year={1999},
publisher={INFORMS}
}
@article{abounadi2001learning,
title={Learning algorithms for Markov decision processes with average cost},
author={Abounadi, Jinane and Bertsekas, D and Borkar, Vivek S},
journal={SIAM J. Control Optim.},
volume={40},
number={3},
pages={681--698},
year={2001},
publisher={SIAM}
}
@inproceedings{singh1994reinforcement,
title={Reinforcement learning algorithms for average-payoff Markovian decision processes},
author={Singh, Satinder P},
booktitle={AAAI},
volume={94},
pages={700--705},
year={1994}
}
@inproceedings{schwartz1993reinforcement,
title={A reinforcement learning method for maximizing undiscounted rewards},
author={Schwartz, Anton},
booktitle={Proc. 10th Int. Conf. Mach. Learn.},
volume={298},
pages={298--305},
year={1993}
}
@inproceedings{yang2016efficient,
title={Efficient Average Reward Reinforcement Learning Using Constant Shifting Values},
author={Yang, Shangdong and Gao, Yang and An, Bo and Wang, Hao and Chen, Xingguo},
booktitle={Thirtieth AAAI Conference on Artificial Intelligence},
pages={2258-2264},
year={2016}
}
@inproceedings{devlin2012dynamic,
title={Dynamic potential-based reward shaping},
author={Devlin, Sam and Kudenko, Daniel},
booktitle={Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems},
pages={433--440},
year={2012}
}
@inproceedings{ng1999policy,
title={Policy invariance under reward transformations: Theory and application to reward shaping},
author={Ng, Andrew Y and Harada, Daishi and Russell, Stuart},
booktitle={Proc. 16th Int. Conf. Mach. Learn.},
pages={278--287},
year={1999}
}
@article{borkar2000ode,
title={The ODE method for convergence of stochastic approximation and reinforcement learning},
author={Borkar, Vivek S and Meyn, Sean P},
journal={SIAM J. Control Optim.},
volume={38},
number={2},
pages={447--469},
year={2000},
publisher={SIAM}
}
@phdthesis{maei2011gradient,
title={Gradient temporal-difference learning algorithms},
author={Maei, Hamid Reza},
year={2011},
school={University of Alberta}
}
@phdthesis{baird1999reinforcement,
title={Reinforcement learning through gradient descent},
author={Baird III, Leemon C},
year={1999},
school={US Air Force Academy, US}
}
@PHDTHESIS{Driessens2004,
AUTHOR ="Kurt Driessens",
TITLE ="Relational Reinforcement Learning",
SCHOOL ="Catholic University of Leuven",
YEAR ="2004",
}
@article{tsitsiklis1996feature,
title={Feature-based methods for large scale dynamic programming},
author={Tsitsiklis, John N and Van Roy, Benjamin},
journal={Mach. Learn.},
volume={22},
number={1-3},
pages={59--94},
year={1996},
publisher={Springer}
}
@inproceedings{chen2009apply,
title={Apply ant colony optimization to Tetris},
author={Chen, X. and Wang, H. and Wang, W. and Shi, Y. and Gao, Y.},
booktitle={Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation (GECCO)},
pages={1741--1742},
year={2009},
organization={ACM}
}
@incollection{farias2006tetris,
title={Tetris: A study of randomized constraint sampling},
author={Farias, Vivek F and Van Roy, Benjamin},
booktitle={Probabilistic and Randomized Methods for Design Under Uncertainty},
pages={189--201},
year={2006},
publisher={Springer}
}
@article{bertsekas1996temporal,
title={Temporal differences-based policy iteration and applications in neuro-dynamic programming},
author={Bertsekas, Dimitri P and Ioffe, Sergey},
journal={Lab. for Info. and Decision Systems Report LIDS-P-2349, MIT, Cambridge, MA},
year={1996},
publisher={Citeseer}
}
@inproceedings{kakade2001natural,
title={A Natural Policy Gradient.},
author={Kakade, Sham},
booktitle={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
volume={14},
pages={1531--1538},
year={2001}
}
@article{peters2008natural,
title={Natural actor-critic},
author={Peters, Jan and Schaal, Stefan},
journal={Neurocomputing},
volume={71},
number={7},
pages={1180--1190},
year={2008},
publisher={Elsevier}
}
@article{baxter2001infinite,
title={Infinite-horizon policy-gradient estimation},
author={Baxter, Jonathan and Bartlett, Peter L.},
journal={J. Artif. Intell. Res.},
pages={319--350},
year={2001}
}
@inproceedings{sutton1999policy,
title={Policy Gradient Methods for Reinforcement Learning with Function Approximation.},
author={Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay and others},
booktitle={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
pages={1057--1063},
year={1999}
}
@inproceedings{bohm2005evolutionary,
title={An evolutionary approach to tetris},
author={B{\"o}hm, Niko and K{\'o}kai, Gabriella and Mandl, Stefan},
booktitle={Proc. 6th Metaheuristics Int. Conf.},
pages={137-148},
year={2005}
}
@article{szita2006learning,
title={Learning Tetris using the noisy cross-entropy method},
author={Szita, Istv{\'a}n and L{\"o}rincz, Andr{\'a}s},
journal={Neural Comput.},
volume={18},
number={12},
pages={2936--2941},
year={2006},
publisher={MIT Press}
}
@inproceedings{thiery2010least,
title={Least-Squares $\lambda$ Policy Iteration: Bias-Variance Trade-off in Control Problems},
author={Thiery, Christophe and Scherrer, Bruno},
booktitle={Proc. 27th Int. Conf. Mach. Learn.},
pages={1071--1078},
year={2010}
}
@inproceedings{gabillon2013approximate,
title={Approximate dynamic programming finally performs well in the game of Tetris},
author={Gabillon, Victor and Ghavamzadeh, Mohammad and Scherrer, Bruno},
booktitle={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
pages={1754--1762},
year={2013}
}
@article{scherrer2013performance,
title={Performance bounds for $\lambda$ policy iteration and application to the game of Tetris},
author={Scherrer, Bruno},
journal={J. Mach. Learn. Res.},
volume={14},
number={1},
pages={1181--1227},
year={2013},
publisher={JMLR. org}
}
@article{thiery2009improvements,
title={Improvements on Learning Tetris with Cross Entropy},
author={Thiery, Christophe and Scherrer, Bruno},
journal={Int. Computer Games Assoc. J.},
volume={32},
number={1},
pages={23--33},
year={2009}
}
@article{scherrer2015approximate,
title={Approximate Modified Policy Iteration and its Application to the Game of Tetris},
author={Scherrer, Bruno and Ghavamzadeh, Mohammad and Gabillon, Victor and Lesner, Boris and Geist, Matthieu},
journal={J. Mach. Learn. Res.},
volume={16},
pages={1629--1676},
year={2015}
}
@article{efron2004least,
title={Least angle regression},
author={Efron, Bradley and Hastie, Trevor and Johnstone, Iain and Tibshirani, Robert and others},
journal={The Annals of statistics},
volume={32},
number={2},
pages={407--499},
year={2004},
publisher={Institute of Mathematical Statistics}
}
@MASTERSTHESIS{Brzustowski1992,
author ={John Brzustowski},
title ={Can you win at tetris?},
school = {University of British Columbia},
year ={1992}
}
@Article{Breukelaar04,
author = {Ron Breukelaar and Erik D. Demaine and Susan
Hohenberger and Hendrik Jan Hoogeboom and Walter
A. Kosters and David Liben-Nowell},
title = {Tetris is Hard, Even to Approximate},
journal = {International Journal of Computational Geometry and
Applications},
year = {2004},
volume = {14},
number = {1--2},
pages = {41--68},
month = {April},
}
@book{Bertsekas1996,
author = {Bertsekas, D. and Tsitsiklis, J. N.},
title = {Neuro-Dynamic Programming},
year = {1996},
publisher = {Athena Scientific},
}
@inproceedings{maei2010gq,
title={GQ ($\lambda$): A general gradient algorithm for temporal-difference prediction learning with eligibility traces},
author={Maei, Hamid Reza and Sutton, Richard S},
booktitle={Proceedings of the Third Conference on Artificial General Intelligence},
volume={1},
pages={91--96},
year={2010}
}
@inproceedings{maei2010toward,
title={Toward off-policy learning control with function approximation},
author={Maei, Hamid R and Szepesv{\'a}ri, Csaba and Bhatnagar, Shalabh and Sutton, Richard S},
booktitle={Proc. 27th Int. Conf. Mach. Learn.},
pages={719--726},
year={2010}
}
@inproceedings{phua2007tracking,
title={Tracking value function dynamics to improve reinforcement learning with piecewise linear function approximation},
author={Phua, Chee Wee and Fitch, Robert},
booktitle={Proc. 24th Int. Conf. Mach. Learn.},
pages={751--758},
year={2007},
organization={ACM}
}
@inproceedings{szubert2014temporal,
title={Temporal difference learning of N-tuple networks for the game 2048},
author={Szubert, Marcin and Jaskowski, Wojciech},
booktitle={2014 IEEE Conference on Computational Intelligence and Games (CIG)},
pages={1--8},
year={2014},
organization={IEEE}
}
@article{chen2013online,
title={Online Selective Kernel-based Temporal Difference Learning},
author={Chen, Xingguo and Gao, Yang and Wang, Ruili},
journal={IEEE Trans. Neural Netw. Learn. Syst.},
year={2013},
volume={24},
number={12},
pages={1944--1956},
publisher={IEEE}
}
@article{xu2007kernel,
title={Kernel-based least squares policy iteration for reinforcement learning},
author={Xu, Xin and Hu, Dewen and Lu, Xicheng},
journal={IEEE Trans. Neural Netw.},
volume={18},
number={4},
pages={973--992},
year={2007},
publisher={IEEE}
}
@INPROCEEDINGS{Engel03bayesmeets,
author = {Yaakov Engel and Shie Mannor and Ron Meir},
title = {Bayes meets {B}ellman: the {G}aussian process approach to temporal difference learning},
booktitle = {Proc. 20th Int. Conf. Mach. Learn.},
year = {2003},
pages = {154--161},
address={Washington, DC},
month={Aug.},
}
@inproceedings{robards2011sparse,
title={Sparse Kernel-SARSA ($\lambda$) with an eligibility trace},
author={Robards, M. and Sunehag, P. and Sanner, S. and Marthi, B.},
booktitle = {Proc. 22nd Eur. Conf. Mach. Learn.},
pages={1--17},
year={2011},
month={Sept.},
address = {Athens, Greece},
}
@conference{reisinger2008online,
title={{Online kernel selection for {B}ayesian reinforcement learning}},
author={Reisinger, J. and Stone, P. and Miikkulainen, R.},
booktitle={Proc. 25th Int. Conf. Mach. Learn.},
pages={816--823},
year={2008},
month={July},
address={ Helsinki, Finland},
}
@book{Sutton1998,
title={{Reinforcement learning: an introduction}},
author={Sutton, R.S. and Barto, A.G.},
year={1998},
publisher={MIT Press},
address={Cambridge, MA}
}
@book{Sutton2018book,
author = {Sutton, Richard S. and Barto, Andrew G.},
edition = {Second},
publisher = {The MIT Press},
title = {Reinforcement Learning: An Introduction},
year = {2018 }
}
@phdthesis{Bradtke1994phd,
title={Incremental Dynamic Programming for On-line Adaptive Optimal Control},
author={Bradtke, Steven J},
year={1994},
school={University of Massachusetts},
month={Sept.},
address={Amherst},
}
@inproceedings{baird1995residual,
title={Residual algorithms: Reinforcement learning with function approximation},
author={Baird, Leemon and others},
booktitle={Proc. 12th Int. Conf. Mach. Learn.},
pages={30--37},
year={1995}
}
@article{bradtke1996linear,
title={Linear least-squares algorithms for temporal difference learning},
author={Bradtke, S.J. and Barto, A.G.},
journal={Mach. Learn.},
volume={22},
number={1},
pages={33--57},
year={1996},
publisher={Springer}
}
@article{lagoudakis2003least,
title={Least-squares policy iteration},
author={Lagoudakis, M.G. and Parr, R.},
journal={J. Mach. Learn. Res.},
volume={4},
pages={1107--1149},
year={2003},
publisher={JMLR. org}
}
@article{boyan2002technical,
title={Technical update: Least-squares temporal difference learning},
author={Boyan, J.A.},
journal={Mach. Learn.},
volume={49},
number={2},
pages={233--246},
year={2002},
publisher={Springer}
}
@inproceedings{geramifard2006incremental,
title={Incremental least-squares temporal difference learning},
author={Geramifard, A. and Bowling, M. and Sutton, R.S.},
booktitle={Proc. 21st AAAI Conf. Artif. Intell.},
pages={356--361},
year={2006},
month={July},
address={Boston, Massachusetts},
}
@inproceedings{sutton2009fast,
title={Fast gradient-descent methods for temporal-difference learning with linear function approximation},
author={Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, C. and Wiewiora, E.},
booktitle={Proc. 26th Int. Conf. Mach. Learn.},
pages={993--1000},
year={2009}
}
@inproceedings{sutton2008convergent,
title={A Convergent $ O (n) $ Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation},
author={Sutton, Richard S and Maei, Hamid R and Szepesv{\'a}ri, Csaba},
booktitle={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
pages={1609--1616},
year={2008}
}
@inproceedings{dabney2014natural,
title={Natural Temporal Difference Learning},
author={Dabney, William and Thomas, Philip},
booktitle={Twenty-Eighth AAAI Conference on Artificial Intelligence},
year={2014}
}
@inproceedings{mahmood2014weighted,
title={Weighted importance sampling for off-policy learning with linear function approximation},
author={Mahmood, A Rupam and van Hasselt, Hado P and Sutton, Richard S},
booktitle={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
pages={3014--3022},
year={2014}
}
@inproceedings{seijen2014true,
title={True Online TD ($\lambda$)},
author={Seijen, Harm V and Sutton, Rich},
booktitle={Proc. 31st Int. Conf. Mach. Learn.},
pages={692--700},
year={2014}
}
@article{ormoneit2002kernel,
title={{Kernel-based reinforcement learning}},
author={Ormoneit, D. and Sen, {\'S}.},
journal={Mach. Learn.},
volume={49},
number={2-3},
pages={161--178},
issn={0885-6125},
year={2002},
publisher={Springer-Verlag },
address = {Hingham, MA, USA},
}
@inproceedings{Ghavamzadeh2010lstd,
author = {M. Ghavamzadeh and A. Lazaric and O. A. Maillard and R. Munos},
title = {{LSTD} with Random Projections},
BOOKTITLE={Advances in Neural Information Processing Systems},
publisher={Cambridge, MA: MIT Press},
volume = {23},
pages = {721--729},
Address = {Lake Tahoe, Nevada, USA},
year = {2010}
}
@inproceedings{loth2007sparse,
title={Sparse temporal difference learning using LASSO},
author={Loth, M. and Davy, M. and Preux, P.},
booktitle={Proc. IEEE Symp. Approx. Dynamic Program. Reinforce. Learn.},
pages={352--359},
year={2007},
organization={IEEE}
}
@inproceedings{kolter2009regularization,
title={Regularization and feature selection in least-squares temporal difference learning},
author={Kolter, J.Z. and Ng, A.Y.},
booktitle={Proc. 26th Int. Conf. Mach. Learn.},
pages={521--528},
year={2009},
organization={ACM}
}
@inproceedings{hoffman2011regularized,
title={Regularized least squares temporal difference learning with nested l2 and l1 penalization},
author={Hoffman, M.W. and Lazaric, A. and Ghavamzadeh, M. and Munos, R.},
booktitle={Proc. Eur. Workshop Reinforce. Learn.},
year={2011}
}
@inproceedings{Ghavamzadeh2011finite,
author = {M. Ghavamzadeh and A. Lazaric and R. Munos and M. Hoffman},
title = {Finite-Sample Analysis of {Lasso-TD}},
booktitle = {Proc. 28th Int. Conf. Mach. Learn.},
year = {2011},
month= {June},
address={Bellevue, Washington, USA},
pages={1177--1184},
}
@inproceedings{johnson2013accelerating,
title={Accelerating stochastic gradient descent using predictive variance reduction},
author={Johnson, R. and Zhang, T.},
booktitle={Advances in Neural Information Processing Systems},
pages={315--323},
year={2013}
}
@article{xu2020reanalysis,
title={Reanalysis of variance reduced temporal difference learning},
author={Xu, T. and Wang, Z. and Zhou, Y. and Liang, Y.},
journal={arXiv preprint arXiv:2001.01898},
year={2020}
}
@inproceedings{schulman2015trust,
title={Trust region policy optimization},
author={Schulman, J. and Levine, S. and Abbeel, P. and Jordan, M. and Moritz, P.},
booktitle={International Conference on Machine Learning},
pages={1889--1897},
year={2015}
}
@article{schulman2017proximal,
title={Proximal policy optimization algorithms},
author={Schulman, J. and Wolski, F. and Dhariwal, P. and Radford, A. and Klimov, O.},
journal={arXiv preprint arXiv:1707.06347},
year={2017}
}
@inproceedings{defazio2014saga,
title={SAGA: A fast incremental gradient method with support for non-strongly convex composite objectives},
author={Defazio, A. and Bach, F. and Lacoste-Julien, S.},
booktitle={Advances in Neural Information Processing Systems},
pages={1646--1654},
year={2014}
}
@inproceedings{du2017stochastic,
title={Stochastic variance reduction methods for policy evaluation},
author={Du, S. S. and Chen, J. and Li, L. and Xiao, L. and Zhou, D.},
booktitle={Proceedings of the 34th International Conference on Machine Learning},
pages={1049--1058},
year={2017}
}
@inproceedings{chen2023modified,
title={Modified Retrace for Off-Policy Temporal Difference Learning},
author={Chen, Xingguo and Ma, Xingzhou and Li, Yang and Yang, Guang and Yang, Shangdong and Gao, Yang},
booktitle={Uncertainty in Artificial Intelligence},
pages={303--312},
year={2023},
organization={PMLR}
}
@article{dalal2017finite,
title={Finite Sample Analyses for TD(0) with Function Approximation},
author={Dalal, Gal and Szörényi, Balázs and Thoppe, Gugan and Mannor, Shie},
journal={arXiv preprint arXiv:1704.01161},
year={2017}
}
@article{sutton1988learning,
title={Learning to predict by the methods of temporal differences},
author={Sutton, Richard S},
journal={Machine learning},
volume={3},
number={1},
pages={9--44},
year={1988},
publisher={Springer}
}
@inproceedings{tsitsiklis1997analysis,
title={Analysis of temporal-difference learning with function approximation},
author={Tsitsiklis, John N and Van Roy, Benjamin},
booktitle={Advances in Neural Information Processing Systems},
pages={1075--1081},
year={1997}
}
@article{sutton2016emphatic,
title={An emphatic approach to the problem of off-policy temporal-difference learning},
author={Sutton, Richard S and Mahmood, A Rupam and White, Martha},
journal={The Journal of Machine Learning Research},
volume={17},
number={1},
pages={2603--2631},
year={2016},
publisher={JMLR. org}
}
@inproceedings{liu2015finite,
title={Finite-sample analysis of proximal gradient TD algorithms},
author={Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek},
booktitle={Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence},
pages={504--513},
year={2015}
}
@inproceedings{liu2016proximal,
title={Proximal Gradient Temporal Difference Learning Algorithms.},
author={Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek},
booktitle={Proceedings of the International Joint Conference on Artificial Intelligence},
pages={4195--4199},
year={2016}
}
@article{liu2018proximal,
title={Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity},
author={Liu, Bo and Gemp, Ian and Ghavamzadeh, Mohammad and Liu, Ji and Mahadevan, Sridhar and Petrik, Marek},
journal={Journal of Artificial Intelligence Research},
volume={63},
pages={461--494},
year={2018}
}
@inproceedings{givchi2015quasi,
title={Quasi newton temporal difference learning},
author={Givchi, Arash and Palhang, Maziar},
booktitle={Asian Conference on Machine Learning},
pages={159--172},
year={2015}
}
@inproceedings{pan2017accelerated,
title={Accelerated gradient temporal difference learning},
author={Pan, Yangchen and White, Adam and White, Martha},
booktitle={Proceedings of the 21st AAAI Conference on Artificial Intelligence},
pages={2464--2470},
year={2017}
}
@inproceedings{hallak2016generalized,
title={Generalized emphatic temporal difference learning: bias-variance analysis},
author={Hallak, Assaf and Tamar, Aviv and Munos, Remi and Mannor, Shie},
booktitle={Proceedings of the 30th AAAI Conference on Artificial Intelligence},
pages={1631--1637},
year={2016}
}
@article{zhang2022truncated,
title={Truncated emphatic temporal difference methods for prediction and control},
author={Zhang, Shangtong and Whiteson, Shimon},
journal={The Journal of Machine Learning Research},
volume={23},
number={1},
pages={6859--6917},
year={2022},
publisher={JMLRORG}
}
@inproceedings{korda2015td,
title={On TD (0) with function approximation: Concentration bounds and a centered variant with exponential convergence},
author={Korda, Nathaniel and La, Prashanth},
booktitle={International conference on machine learning},
pages={626--634},
year={2015},
organization={PMLR}
}
@book{zhou2021machine,
title={Machine learning},
author={Zhou, Zhi-Hua},
year={2021},
publisher={Springer Nature}
}
@inproceedings{dalal2020tale,
title={A tale of two-timescale reinforcement learning with the tightest finite-time bound},
author={Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan},
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
volume={34},
number={04},
pages={3701-3708},
year={2020}
}
@inproceedings{feng2019kernel,
title={A kernel loss for solving the Bellman equation},
author={Feng, Yihao and Li, Lihong and Liu, Qiang},
booktitle={Advances in Neural Information Processing Systems},
pages={15430--15441},
year={2019}
}
@inproceedings{basserrano2021logistic,
title={Logistic Q-Learning},
author={Bas-Serrano, Joan and Curi, Sebastian and Krause, Andreas and Neu, Gergely},
booktitle={International Conference on Artificial Intelligence and Statistics},
pages={3610--3618},
year={2021}
}
This is BibTeX, Version 0.99d (TeX Live 2023)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: neurips_2024.aux
The style file: named.bst
Database file #1: neurips_2024.bib
Warning--can't use both volume and number fields in dalal2020tale
You've used 32 entries,
2439 wiz_defined-function locations,
737 strings with 10053 characters,
and the built_in function-call counts, 15617 in all, are:
= -- 1648
> -- 575
< -- 21
+ -- 200
- -- 194
* -- 1156
:= -- 2297
add.period$ -- 99
call.type$ -- 32
change.case$ -- 222
chr.to.int$ -- 32
cite$ -- 33
duplicate$ -- 692
empty$ -- 1205
format.name$ -- 235
if$ -- 3401
int.to.chr$ -- 1
int.to.str$ -- 0
missing$ -- 31
newline$ -- 163
num.names$ -- 96
pop$ -- 236
preamble$ -- 1
purify$ -- 256
quote$ -- 0
skip$ -- 627
stack$ -- 0
substring$ -- 1023
swap$ -- 276
text.length$ -- 21
text.prefix$ -- 0
top$ -- 0
type$ -- 252
warning$ -- 1
while$ -- 134
width$ -- 37
write$ -- 420
(There was 1 warning)
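The single BibTeX warning above comes from the dalal2020tale entry in neurips_2024.bib, which gives both volume and number for an @inproceedings entry; named.bst formats only the volume in that case and ignores the number. A minimal sketch of a cleaned-up entry, assuming the number field can simply be dropped (the issue number adds no information beyond the AAAI volume):

@inproceedings{dalal2020tale,
  title={A tale of two-timescale reinforcement learning with the tightest finite-time bound},
  author={Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
  volume={34},
  pages={3701--3708},
  year={2020}
}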
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 19 MAY 2024 17:37
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
%&-line parsing enabled.
**neurips_2024
(./neurips_2024.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22> (d:/software/texlive/2023/texmf-dist/tex/latex/base/article.cls
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
(d:/software/texlive/2023/texmf-dist/tex/latex/base/size10.clo
File: size10.clo 2022/07/02 v1.4n Standard LaTeX file (size option)
)
\c@part=\count185
\c@section=\count186
\c@subsection=\count187
\c@subsubsection=\count188
\c@paragraph=\count189
\c@subparagraph=\count190
\c@figure=\count191
\c@table=\count192
\abovecaptionskip=\skip48
\belowcaptionskip=\skip49
\bibindent=\dimen140
) (./neurips_2024.sty
Package: neurips_2024 2024/03/31 NeurIPS 2024 submission/camera-ready style file
(d:/software/texlive/2023/texmf-dist/tex/latex/environ/environ.sty
Package: environ 2014/05/04 v0.3 A new way to define environments
(d:/software/texlive/2023/texmf-dist/tex/latex/trimspaces/trimspaces.sty
Package: trimspaces 2009/09/17 v1.1 Trim spaces around a token list
)
\@envbody=\toks16
) (d:/software/texlive/2023/texmf-dist/tex/latex/natbib/natbib.sty
Package: natbib 2010/09/13 8.31b (PWD, AO)
\bibhang=\skip50
\bibsep=\skip51
LaTeX Info: Redefining \cite on input line 694.
\c@NAT@ctr=\count193
) (d:/software/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
Package: geometry 2020/01/02 v5.9 Page Geometry
(d:/software/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2022/05/29 v1.15 key=value parser (DPC)
\KV@toks@=\toks17
) (d:/software/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
(d:/software/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty
Package: iftex 2022/02/03 v1.0f TeX engine tests
))
\Gm@cnth=\count194
\Gm@cntv=\count195
\c@Gm@tempcnt=\count196
\Gm@bindingoffset=\dimen141
\Gm@wd@mp=\dimen142
\Gm@odd@mp=\dimen143
\Gm@even@mp=\dimen144
\Gm@layoutwidth=\dimen145
\Gm@layoutheight=\dimen146
\Gm@layouthoffset=\dimen147
\Gm@layoutvoffset=\dimen148
\Gm@dimlist=\toks18
)
\@neuripsabovecaptionskip=\skip52
\@neuripsbelowcaptionskip=\skip53
(d:/software/texlive/2023/texmf-dist/tex/latex/lineno/lineno.sty
Package: lineno 2023/01/19 line numbers on paragraphs v5.1
\linenopenalty=\count197
\output=\toks19
\linenoprevgraf=\count198
\linenumbersep=\dimen149
\linenumberwidth=\dimen150
\c@linenumber=\count199
\c@pagewiselinenumber=\count266
\c@LN@truepage=\count267
\c@internallinenumber=\count268
\c@internallinenumbers=\count269
\quotelinenumbersep=\dimen151
\bframerule=\dimen152
\bframesep=\dimen153
\bframebox=\box51
(d:/software/texlive/2023/texmf-dist/tex/latex/etoolbox/etoolbox.sty
Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW)
\etb@tempcnta=\count270
)
LaTeX Info: Redefining \\ on input line 3131.
)) (d:/software/texlive/2023/texmf-dist/tex/latex/base/inputenc.sty
Package: inputenc 2021/02/14 v1.3d Input encoding file
\inpenc@prehook=\toks20
\inpenc@posthook=\toks21
) (d:/software/texlive/2023/texmf-dist/tex/latex/base/fontenc.sty
Package: fontenc 2021/04/29 v2.0v Standard LaTeX package
LaTeX Font Info: Trying to load font information for T1+ptm on input line 112.
(d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/t1ptm.fd
File: t1ptm.fd 2001/06/04 font definitions for T1/ptm.
)) (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/hyperref.sty
Package: hyperref 2023-02-07 v7.00v Hypertext links for LaTeX
(d:/software/texlive/2023/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO)
(d:/software/texlive/2023/texmf-dist/tex/generic/infwarerr/infwarerr.sty
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
)
Package pdftexcmds Info: \pdf@primitive is available.
Package pdftexcmds Info: \pdf@ifprimitive is available.
Package pdftexcmds Info: \pdfdraftmode found.
) (d:/software/texlive/2023/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty
Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO)
) (d:/software/texlive/2023/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pdfescape/pdfescape.sty
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
) (d:/software/texlive/2023/texmf-dist/tex/latex/hycolor/hycolor.sty
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
) (d:/software/texlive/2023/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
) (d:/software/texlive/2023/texmf-dist/tex/latex/auxhook/auxhook.sty
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
) (d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section
(d:/software/texlive/2023/texmf-dist/tex/latex/refcount/refcount.sty
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
) (d:/software/texlive/2023/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
(d:/software/texlive/2023/texmf-dist/tex/latex/kvoptions/kvoptions.sty
Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO)
))
\c@section@level=\count271
)
\@linkdim=\dimen154
\Hy@linkcounter=\count272
\Hy@pagecounter=\count273
(d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/pd1enc.def
File: pd1enc.def 2023-02-07 v7.00v Hyperref: PDFDocEncoding definition (HO)
Now handling font encoding PD1 ...
... no UTF-8 mapping file for font encoding PD1
) (d:/software/texlive/2023/texmf-dist/tex/generic/intcalc/intcalc.sty
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
) (d:/software/texlive/2023/texmf-dist/tex/generic/etexcmds/etexcmds.sty
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
)
\Hy@SavedSpaceFactor=\count274
(d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/puenc.def
File: puenc.def 2023-02-07 v7.00v Hyperref: PDF Unicode definition (HO)
Now handling font encoding PU ...
... no UTF-8 mapping file for font encoding PU
)
Package hyperref Info: Hyper figures OFF on input line 4177.
Package hyperref Info: Link nesting OFF on input line 4182.
Package hyperref Info: Hyper index ON on input line 4185.
Package hyperref Info: Plain pages OFF on input line 4192.
Package hyperref Info: Backreferencing OFF on input line 4197.
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
Package hyperref Info: Bookmarks ON on input line 4425.
\c@Hy@tempcnt=\count275
(d:/software/texlive/2023/texmf-dist/tex/latex/url/url.sty
\Urlmuskip=\muskip16
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
)
LaTeX Info: Redefining \url on input line 4763.
\XeTeXLinkMargin=\dimen155
(d:/software/texlive/2023/texmf-dist/tex/generic/bitset/bitset.sty
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
(d:/software/texlive/2023/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO)
))
\Fld@menulength=\count276
\Field@Width=\dimen156
\Fld@charsize=\dimen157
Package hyperref Info: Hyper figures OFF on input line 6042.
Package hyperref Info: Link nesting OFF on input line 6047.
Package hyperref Info: Hyper index ON on input line 6050.
Package hyperref Info: backreferencing OFF on input line 6057.
Package hyperref Info: Link coloring OFF on input line 6062.
Package hyperref Info: Link coloring with OCG OFF on input line 6067.
Package hyperref Info: PDF/A mode OFF on input line 6072.
(d:/software/texlive/2023/texmf-dist/tex/latex/base/atbegshi-ltx.sty
Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi
package with kernel methods
)
\Hy@abspage=\count277
\c@Item=\count278
\c@Hfootnote=\count279
)
Package hyperref Info: Driver (autodetected): hpdftex.
(d:/software/texlive/2023/texmf-dist/tex/latex/hyperref/hpdftex.def
File: hpdftex.def 2023-02-07 v7.00v Hyperref driver for pdfTeX
(d:/software/texlive/2023/texmf-dist/tex/latex/base/atveryend-ltx.sty
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend package
with kernel methods
)
\Fld@listcount=\count280
\c@bookmark@seq@number=\count281
(d:/software/texlive/2023/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO)
(d:/software/texlive/2023/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
)
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 285.
)
\Hy@SectionHShift=\skip54
) (d:/software/texlive/2023/texmf-dist/tex/latex/booktabs/booktabs.sty
Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
\heavyrulewidth=\dimen158
\lightrulewidth=\dimen159
\cmidrulewidth=\dimen160
\belowrulesep=\dimen161
\belowbottomsep=\dimen162
\aboverulesep=\dimen163
\abovetopsep=\dimen164
\cmidrulesep=\dimen165
\cmidrulekern=\dimen166
\defaultaddspace=\dimen167
\@cmidla=\count282
\@cmidlb=\count283
\@aboverulesep=\dimen168
\@belowrulesep=\dimen169
\@thisruleclass=\count284
\@lastruleclass=\count285
\@thisrulewidth=\dimen170
) (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/amsfonts.sty
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
\@emptytoks=\toks22
\symAMSa=\mathgroup4
\symAMSb=\mathgroup5
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
) (d:/software/texlive/2023/texmf-dist/tex/latex/units/nicefrac.sty
Package: nicefrac 1998/08/04 v0.9b Nice fractions
\L@UnitsRaiseDisplaystyle=\skip55
\L@UnitsRaiseTextstyle=\skip56
\L@UnitsRaiseScriptstyle=\skip57
(d:/software/texlive/2023/texmf-dist/tex/latex/base/ifthen.sty
Package: ifthen 2022/04/13 v1.1d Standard LaTeX ifthen package (DPC)
)) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype.sty
Package: microtype 2023/03/13 v3.1a Micro-typographical refinements (RS)
\MT@toks=\toks23
\MT@tempbox=\box52
\MT@count=\count286
LaTeX Info: Redefining \noprotrusionifhmode on input line 1059.
LaTeX Info: Redefining \leftprotrusion on input line 1060.
\MT@prot@toks=\toks24
LaTeX Info: Redefining \rightprotrusion on input line 1078.
LaTeX Info: Redefining \textls on input line 1368.
\MT@outer@kern=\dimen171
LaTeX Info: Redefining \textmicrotypecontext on input line 1988.
\MT@listname@count=\count287
(d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype-pdftex.def
File: microtype-pdftex.def 2023/03/13 v3.1a Definitions specific to pdftex (RS)
LaTeX Info: Redefining \lsstyle on input line 902.
LaTeX Info: Redefining \lslig on input line 902.
\MT@outer@space=\skip58
)
Package microtype Info: Loading configuration file microtype.cfg.
(d:/software/texlive/2023/texmf-dist/tex/latex/microtype/microtype.cfg
File: microtype.cfg 2023/03/13 v3.1a microtype main configuration file (RS)
)) (d:/software/texlive/2023/texmf-dist/tex/latex/xcolor/xcolor.sty
Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK)
(d:/software/texlive/2023/texmf-dist/tex/latex/graphics-cfg/color.cfg
File: color.cfg 2016/01/02 v1.6 sample color configuration
)
Package xcolor Info: Driver file: pdftex.def on input line 227.
(d:/software/texlive/2023/texmf-dist/tex/latex/graphics-def/pdftex.def
File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex
) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/mathcolor.ltx)
Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353.
Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1357.
Package xcolor Info: Model `RGB' extended on input line 1369.
Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371.
Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372.
Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373.
Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374.
Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375.
Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376.
) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics/graphicx.sty
Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
(d:/software/texlive/2023/texmf-dist/tex/latex/graphics/graphics.sty
Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR)
(d:/software/texlive/2023/texmf-dist/tex/latex/graphics/trig.sty
Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
) (d:/software/texlive/2023/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
)
Package graphics Info: Driver file: pdftex.def on input line 107.
)
\Gin@req@height=\dimen172
\Gin@req@width=\dimen173
) (d:/software/texlive/2023/texmf-dist/tex/latex/subfigure/subfigure.sty
Package: subfigure 2002/03/15 v2.1.5 subfigure package
\subfigtopskip=\skip59
\subfigcapskip=\skip60
\subfigcaptopadj=\dimen174
\subfigbottomskip=\skip61
\subfigcapmargin=\dimen175
\subfiglabelskip=\skip62
\c@subfigure=\count288
\c@subtable=\count289
****************************************
* Local config file subfigure.cfg used *
****************************************
(d:/software/texlive/2023/texmf-dist/tex/latex/subfigure/subfigure.cfg)
\subfig@top=\skip63
\subfig@bottom=\skip64
) (d:/software/texlive/2023/texmf-dist/tex/latex/diagbox/diagbox.sty
Package: diagbox 2020/02/09 v2.3 Making table heads with diagonal lines
(d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/pict2e.sty
Package: pict2e 2020/09/30 v0.4b Improved picture commands (HjG,RN,JT)
(d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/pict2e.cfg
File: pict2e.cfg 2016/02/05 v0.1u pict2e configuration for teTeX/TeXLive
)
Package pict2e Info: Driver file: pdftex.def on input line 112.
Package pict2e Info: Driver file for pict2e: p2e-pdftex.def on input line 114.
(d:/software/texlive/2023/texmf-dist/tex/latex/pict2e/p2e-pdftex.def
File: p2e-pdftex.def 2016/02/05 v0.1u Driver-dependant file (RN,HjG,JT)
)
\pIIe@GRAPH=\toks25
\@arclen=\dimen176
\@arcrad=\dimen177
\pIIe@tempdima=\dimen178
\pIIe@tempdimb=\dimen179
\pIIe@tempdimc=\dimen180
\pIIe@tempdimd=\dimen181
\pIIe@tempdime=\dimen182
\pIIe@tempdimf=\dimen183
) (d:/software/texlive/2023/texmf-dist/tex/latex/tools/calc.sty
Package: calc 2017/05/25 v4.3 Infix arithmetic (KKT,FJ)
\calc@Acount=\count290
\calc@Bcount=\count291
\calc@Adimen=\dimen184
\calc@Bdimen=\dimen185
\calc@Askip=\skip65
\calc@Bskip=\skip66
LaTeX Info: Redefining \setlength on input line 80.
LaTeX Info: Redefining \addtolength on input line 81.
\calc@Ccount=\count292
\calc@Cskip=\skip67
) (d:/software/texlive/2023/texmf-dist/tex/latex/tools/array.sty
Package: array 2022/09/04 v2.5g Tabular extension package (FMi)
\col@sep=\dimen186
\ar@mcellbox=\box53
\extrarowheight=\dimen187
\NC@list=\toks26
\extratabsurround=\skip68
\backup@length=\skip69
\ar@cellbox=\box54
)
\diagbox@boxa=\box55
\diagbox@boxb=\box56
\diagbox@boxm=\box57
\diagbox@wd=\dimen188
\diagbox@ht=\dimen189
\diagbox@insepl=\dimen190
\diagbox@insepr=\dimen191
\diagbox@outsepl=\dimen192
\diagbox@outsepr=\dimen193
) (d:/software/texlive/2023/texmf-dist/tex/latex/wrapfig/wrapfig.sty
\wrapoverhang=\dimen194
\WF@size=\dimen195
\c@WF@wrappedlines=\count293
\WF@box=\box58
\WF@everypar=\toks27
Package: wrapfig 2003/01/31 v 3.6
) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2022/04/08 v2.17n AMS math features
\@mathmargin=\skip70
For additional information on amsmath, use the `?' option.
(d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2021/08/26 v2.01 AMS text
(d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks28
\ex@=\dimen196
)) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen197
) (d:/software/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2022/04/08 v2.04 operator names
)
\inf@bad=\count294
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count295
\leftroot@=\count296
LaTeX Info: Redefining \overline on input line 399.
LaTeX Info: Redefining \colon on input line 410.
\classnum@=\count297
\DOTSCASE@=\count298
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box59
\strutbox@=\box60
LaTeX Info: Redefining \big on input line 722.
LaTeX Info: Redefining \Big on input line 723.
LaTeX Info: Redefining \bigg on input line 724.
LaTeX Info: Redefining \Bigg on input line 725.
\big@size=\dimen198
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
\macc@depth=\count299
LaTeX Info: Redefining \bmod on input line 905.
LaTeX Info: Redefining \pmod on input line 910.
LaTeX Info: Redefining \smash on input line 940.
LaTeX Info: Redefining \relbar on input line 970.
LaTeX Info: Redefining \Relbar on input line 971.
\c@MaxMatrixCols=\count300
\dotsspace@=\muskip17
\c@parentequation=\count301
\dspbrk@lvl=\count302
\tag@help=\toks29
\row@=\count303
\column@=\count304
\maxfields@=\count305
\andhelp@=\toks30
\eqnshift@=\dimen199
\alignsep@=\dimen256
\tagshift@=\dimen257
\tagwidth@=\dimen258
\totwidth@=\dimen259
\lineht@=\dimen260
\@envbody=\toks31
\multlinegap=\skip71
\multlinetaggap=\skip72
\mathdisplay@stack=\toks32
LaTeX Info: Redefining \[ on input line 2953.
LaTeX Info: Redefining \] on input line 2954.
)
\linenoamsmath@ams@eqpen=\count306
(d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/amssymb.sty
Package: amssymb 2013/01/14 v3.01 AMS font symbols
) (d:/software/texlive/2023/texmf-dist/tex/latex/mathtools/mathtools.sty
Package: mathtools 2022/06/29 v1.29 mathematical typesetting tools
(d:/software/texlive/2023/texmf-dist/tex/latex/mathtools/mhsetup.sty
Package: mhsetup 2021/03/18 v1.4 programming setup (MH)
)
\g_MT_multlinerow_int=\count307
\l_MT_multwidth_dim=\dimen261
\origjot=\skip73
\l_MT_shortvdotswithinadjustabove_dim=\dimen262
\l_MT_shortvdotswithinadjustbelow_dim=\dimen263
\l_MT_above_intertext_sep=\dimen264
\l_MT_below_intertext_sep=\dimen265
\l_MT_above_shortintertext_sep=\dimen266
\l_MT_below_shortintertext_sep=\dimen267
\xmathstrut@box=\box61
\xmathstrut@dim=\dimen268
) (d:/software/texlive/2023/texmf-dist/tex/latex/amscls/amsthm.sty
Package: amsthm 2020/05/29 v2.20.6
\thm@style=\toks33
\thm@bodyfont=\toks34
\thm@headfont=\toks35
\thm@notefont=\toks36
\thm@headpunct=\toks37
\thm@preskip=\skip74
\thm@postskip=\skip75
\thm@headsep=\skip76
\dth@everypar=\toks38
) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex
\pgfutil@everybye=\toks39
\pgfutil@tempdima=\dimen269
\pgfutil@tempdimb=\dimen270
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def
\pgfutil@abb=\box62
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/pgf.revision.tex)
Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10)
))
Package: pgf 2023-01-15 v3.1.10 (3.1.10)
(d:/software/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex
Package: pgfsys 2023-01-15 v3.1.10 (3.1.10)
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex
\pgfkeys@pathtoks=\toks40
\pgfkeys@temptoks=\toks41
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex
\pgfkeys@tmptoks=\toks42
))
\pgf@x=\dimen271
\pgf@y=\dimen272
\pgf@xa=\dimen273
\pgf@ya=\dimen274
\pgf@xb=\dimen275
\pgf@yb=\dimen276
\pgf@xc=\dimen277
\pgf@yc=\dimen278
\pgf@xd=\dimen279
\pgf@yd=\dimen280
\w@pgf@writea=\write3
\r@pgf@reada=\read2
\c@pgf@counta=\count308
\c@pgf@countb=\count309
\c@pgf@countc=\count310
\c@pgf@countd=\count311
\t@pgf@toka=\toks43
\t@pgf@tokb=\toks44
\t@pgf@tokc=\toks45
\pgf@sys@id@count=\count312
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg
File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10)
)
Driver file for pgf: pgfsys-pdftex.def
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def
File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10)
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def
File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10)
))) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex
File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfsyssoftpath@smallbuffer@items=\count313
\pgfsyssoftpath@bigbuffer@items=\count314
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex
File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10)
)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex
Package: pgfcore 2023-01-15 v3.1.10 (3.1.10)
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex
\pgfmath@dimen=\dimen281
\pgfmath@count=\count315
\pgfmath@box=\box63
\pgfmath@toks=\toks46
\pgfmath@stack@operand=\toks47
\pgfmath@stack@operation=\toks48
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex
\c@pgfmathroundto@lastzeros=\count316
)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex
File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@picminx=\dimen282
\pgf@picmaxx=\dimen283
\pgf@picminy=\dimen284
\pgf@picmaxy=\dimen285
\pgf@pathminx=\dimen286
\pgf@pathmaxx=\dimen287
\pgf@pathminy=\dimen288
\pgf@pathmaxy=\dimen289
\pgf@xx=\dimen290
\pgf@xy=\dimen291
\pgf@yx=\dimen292
\pgf@yy=\dimen293
\pgf@zx=\dimen294
\pgf@zy=\dimen295
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex
File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@path@lastx=\dimen296
\pgf@path@lasty=\dimen297
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex
File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@shorten@end@additional=\dimen298
\pgf@shorten@start@additional=\dimen299
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex
File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfpic=\box64
\pgf@hbox=\box65
\pgf@layerbox@main=\box66
\pgf@picture@serial@count=\count317
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex
File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgflinewidth=\dimen300
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex
File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@pt@x=\dimen301
\pgf@pt@y=\dimen302
\pgf@pt@temp=\dimen303
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex
File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex
File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex
File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex
File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfarrowsep=\dimen304
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex
File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@max=\dimen305
\pgf@sys@shading@range@num=\count318
\pgf@shadingcount=\count319
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex
File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex
File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfexternal@startupbox=\box67
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex
File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex
File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex
File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex
File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10)
))) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex
File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfnodeparttextbox=\box68
) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex
File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10)
) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty
Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10)
\pgf@nodesepstart=\dimen306
\pgf@nodesepend=\dimen307
) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty
Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10)
)) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (d:/software/texlive/2023/texmf-dist/tex/latex/pgf/math/pgfmath.sty (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex
Package: pgffor 2023-01-15 v3.1.10 (3.1.10)
\pgffor@iter=\dimen308
\pgffor@skip=\dimen309
\pgffor@stack=\toks49
\pgffor@toks=\toks50
)) (d:/software/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex
Package: tikz 2023-01-15 v3.1.10 (3.1.10)
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex
File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@plot@mark@count=\count320
\pgfplotmarksize=\dimen310
)
\tikz@lastx=\dimen311
\tikz@lasty=\dimen312
\tikz@lastxsaved=\dimen313
\tikz@lastysaved=\dimen314
\tikz@lastmovetox=\dimen315
\tikz@lastmovetoy=\dimen316
\tikzleveldistance=\dimen317
\tikzsiblingdistance=\dimen318
\tikz@figbox=\box69
\tikz@figbox@bg=\box70
\tikz@tempbox=\box71
\tikz@tempbox@bg=\box72
\tikztreelevel=\count321
\tikznumberofchildren=\count322
\tikznumberofcurrentchild=\count323
\tikz@fig@count=\count324
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex
File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfmatrixcurrentrow=\count325
\pgfmatrixcurrentcolumn=\count326
\pgf@matrix@numberofcolumns=\count327
)
\tikz@expandcount=\count328
(d:/software/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex
File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10)
)))
\c@theorem=\count329
(d:/software/texlive/2023/texmf-dist/tex/latex/algorithms/algorithm.sty
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
(d:/software/texlive/2023/texmf-dist/tex/latex/float/float.sty
Package: float 2001/11/08 v1.3d Float enhancements (AL)
\c@float@type=\count330
\float@exts=\toks51
\float@box=\box73
\@float@everytoks=\toks52
\@floatcapt=\box74
)
\@float@every@algorithm=\toks53
\c@algorithm=\count331
) (d:/software/texlive/2023/texmf-dist/tex/latex/algorithms/algorithmic.sty
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
\c@ALC@unique=\count332
\c@ALC@line=\count333
\c@ALC@rem=\count334
\c@ALC@depth=\count335
\ALC@tlm=\skip77
\algorithmicindent=\skip78
) (d:/software/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def
File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX)
\l__color_backend_stack_int=\count336
\l__pdf_internal_box=\box75
) (./neurips_2024.aux)
\openout1 = `neurips_2024.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 106.
LaTeX Font Info: ... okay on input line 106.
*geometry* driver: auto-detecting
*geometry* detected driver: pdftex
*geometry* verbose mode - [ preamble ] result:
* driver: pdftex
* paper: letterpaper
* layout: <same size as paper>
* layoutoffset:(h,v)=(0.0pt,0.0pt)
* modes:
* h-part:(L,W,R)=(92.14519pt, 430.00462pt, 92.14519pt)
* v-part:(T,H,B)=(95.39737pt, 556.47656pt, 143.09605pt)
* \paperwidth=614.295pt
* \paperheight=794.96999pt
* \textwidth=430.00462pt
* \textheight=556.47656pt
* \oddsidemargin=19.8752pt
* \evensidemargin=19.8752pt
* \topmargin=-13.87262pt
* \headheight=12.0pt
* \headsep=25.0pt
* \topskip=10.0pt
* \footskip=30.0pt
* \marginparwidth=65.0pt
* \marginparsep=11.0pt
* \columnsep=10.0pt
* \skip\footins=9.0pt plus 4.0pt minus 2.0pt
* \hoffset=0.0pt
* \voffset=0.0pt
* \mag=1000
* \@twocolumnfalse
* \@twosidefalse
* \@mparswitchfalse
* \@reversemarginfalse
* (1in=72.27pt=25.4mm, 1cm=28.453pt)
*geometry* verbose mode - [ newgeometry ] result:
* driver: pdftex
* paper: letterpaper
* layout: <same size as paper>
* layoutoffset:(h,v)=(0.0pt,0.0pt)
* modes:
* h-part:(L,W,R)=(108.405pt, 397.48499pt, 108.40501pt)
* v-part:(T,H,B)=(72.26999pt, 650.43pt, 72.27pt)
* \paperwidth=614.295pt
* \paperheight=794.96999pt
* \textwidth=397.48499pt
* \textheight=650.43pt
* \oddsidemargin=36.13501pt
* \evensidemargin=36.13501pt
* \topmargin=-37.0pt
* \headheight=12.0pt
* \headsep=25.0pt
* \topskip=10.0pt
* \footskip=30.0pt
* \marginparwidth=65.0pt
* \marginparsep=11.0pt
* \columnsep=10.0pt
* \skip\footins=9.0pt plus 4.0pt minus 2.0pt
* \hoffset=0.0pt
* \voffset=0.0pt
* \mag=1000
* \@twocolumnfalse
* \@twosidefalse
* \@mparswitchfalse
* \@reversemarginfalse
* (1in=72.27pt=25.4mm, 1cm=28.453pt)
Package hyperref Info: Link coloring OFF on input line 106.
(./neurips_2024.out) (./neurips_2024.out)
\@outlinefile=\write4
\openout4 = `neurips_2024.out'.
LaTeX Info: Redefining \microtypecontext on input line 106.
Package microtype Info: Applying patch `item' on input line 106.
Package microtype Info: Applying patch `toc' on input line 106.
Package microtype Info: Applying patch `eqnum' on input line 106.
Package microtype Info: Applying patch `footnote' on input line 106.
Package microtype Info: Applying patch `verbatim' on input line 106.
Package microtype Info: Generating PDF output.
Package microtype Info: Character protrusion enabled (level 2).
Package microtype Info: Using default protrusion set `alltext'.
Package microtype Info: Automatic font expansion enabled (level 2),
(microtype) stretch: 20, shrink: 20, step: 1, non-selected.
Package microtype Info: Using default expansion set `alltext-nott'.
LaTeX Info: Redefining \showhyphens on input line 106.
Package microtype Info: No adjustment of tracking.
Package microtype Info: No adjustment of interword spacing.
Package microtype Info: No adjustment of character kerning.
(d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-ptm.cfg
File: mt-ptm.cfg 2006/04/20 v1.7 microtype config. file: Times (RS)
) (d:/software/texlive/2023/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
[Loading MPS to PDF converter (version 2006.09.02).]
\scratchcounter=\count337
\scratchdimen=\dimen319
\scratchbox=\box76
\nofMPsegments=\count338
\nofMParguments=\count339
\everyMPshowfont=\toks54
\MPscratchCnt=\count340
\MPscratchDim=\dimen320
\MPnumerator=\count341
\makeMPintoPDFobject=\count342
\everyMPtoPDFconversion=\toks55
) (d:/software/texlive/2023/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty
Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf
Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 485.
(d:/software/texlive/2023/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg
File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Live
)) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-cmr.cfg
File: mt-cmr.cfg 2013/05/19 v2.2 microtype config. file: Computer Modern Roman (RS)
)
LaTeX Font Info: Trying to load font information for U+msa on input line 110.
(d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-msa.cfg
File: mt-msa.cfg 2006/02/04 v1.1 microtype config. file: AMS symbols (a) (RS)
)
LaTeX Font Info: Trying to load font information for U+msb on input line 110.
(d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
) (d:/software/texlive/2023/texmf-dist/tex/latex/microtype/mt-msb.cfg
File: mt-msb.cfg 2005/06/01 v1.0 microtype config. file: AMS symbols (b) (RS)
)
LaTeX Font Info: Trying to load font information for T1+cmtt on input line 110.
(d:/software/texlive/2023/texmf-dist/tex/latex/base/t1cmtt.fd
File: t1cmtt.fd 2022/07/10 v2.5l Standard LaTeX font definitions
)
Package microtype Info: Loading generic protrusion settings for font family
(microtype) `cmtt' (encoding: T1).
(microtype) For optimal results, create family-specific settings.
(microtype) See the microtype manual for details.
LaTeX Font Info: Trying to load font information for T1+phv on input line 126.
(d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/t1phv.fd
File: t1phv.fd 2020/03/25 scalable font definitions for T1/phv.
)
Package microtype Info: Loading generic protrusion settings for font family
(microtype) `phv' (encoding: T1).
(microtype) For optimal results, create family-specific settings.
(microtype) See the microtype manual for details.
(./main/introduction.tex [1
{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc}]) (./main/preliminaries.tex [2]) (./main/motivation.tex [3
pdfTeX warning (ext4): destination with the same identifier (name{table.1}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.77 \end{equation*}
]
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 138.
[4]) (./main/theory.tex [5]) (./main/experiment.tex (./main/pic/randomwalk.tex) (./main/pic/BairdExample.tex) [6
pdfTeX warning (ext4): destination with the same identifier (name{figure.1}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.46
pdfTeX warning (ext4): destination with the same identifier (name{figure.2}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.46
]
<main/pic/maze_13_13.pdf, id=300, 493.1646pt x 387.62602pt>
File: main/pic/maze_13_13.pdf Graphic file (type pdf)
<use main/pic/maze_13_13.pdf>
Package pdftex.def Info: main/pic/maze_13_13.pdf used on input line 53.
(pdftex.def) Requested size: 73.9715pt x 58.14139pt.
<main/pic/dependent_new.pdf, id=302, 557.01889pt x 394.59978pt>
File: main/pic/dependent_new.pdf Graphic file (type pdf)
<use main/pic/dependent_new.pdf>
Package pdftex.def Info: main/pic/dependent_new.pdf used on input line 78.
(pdftex.def) Requested size: 119.24675pt x 79.49658pt.
<main/pic/tabular_new.pdf, id=303, 566.51224pt x 401.1703pt>
File: main/pic/tabular_new.pdf Graphic file (type pdf)
<use main/pic/tabular_new.pdf>
Package pdftex.def Info: main/pic/tabular_new.pdf used on input line 82.
(pdftex.def) Requested size: 119.23904pt x 79.49194pt.
<main/pic/inverted_new.pdf, id=304, 565.61766pt x 402.45422pt>
File: main/pic/inverted_new.pdf Graphic file (type pdf)
<use main/pic/inverted_new.pdf>
Package pdftex.def Info: main/pic/inverted_new.pdf used on input line 87.
(pdftex.def) Requested size: 119.24063pt x 79.49458pt.
<main/pic/counterexample_quanju_new.pdf, id=305, 471.30164pt x 401.08943pt>
File: main/pic/counterexample_quanju_new.pdf Graphic file (type pdf)
<use main/pic/counterexample_quanju_new.pdf>
Package pdftex.def Info: main/pic/counterexample_quanju_new.pdf used on input line 91.
(pdftex.def) Requested size: 119.24184pt x 79.49428pt.
[7
pdfTeX warning (ext4): destination with the same identifier (name{figure.3}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.131
<./main/pic/maze_13_13.pdf> <./main/pic/dependent_new.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/dependent_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page
> <./main/pic/tabular_new.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/tabular_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page
> <./main/pic/inverted_new.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/inverted_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page
> <./main/pic/counterexample_quanju_new.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/counterexample_quanju_new.pdf): PDF inclusion: multiple pdfs with page group included in a single page
>]) (./main/relatedwork.tex
<main/pic/maze_complete.pdf, id=426, 595.42892pt x 465.38112pt>
File: main/pic/maze_complete.pdf Graphic file (type pdf)
<use main/pic/maze_complete.pdf>
Package pdftex.def Info: main/pic/maze_complete.pdf used on input line 7.
(pdftex.def) Requested size: 119.24721pt x 79.4901pt.
<main/pic/cw_complete.pdf, id=427, 570.46333pt x 465.10928pt>
File: main/pic/cw_complete.pdf Graphic file (type pdf)
<use main/pic/cw_complete.pdf>
Package pdftex.def Info: main/pic/cw_complete.pdf used on input line 11.
(pdftex.def) Requested size: 119.24373pt x 79.49335pt.
<main/pic/mt_complete.pdf, id=428, 569.92673pt x 468.75475pt>
File: main/pic/mt_complete.pdf Graphic file (type pdf)
<use main/pic/mt_complete.pdf>
Package pdftex.def Info: main/pic/mt_complete.pdf used on input line 16.
(pdftex.def) Requested size: 119.24463pt x 79.49413pt.
<main/pic/Acrobot_complete.pdf, id=429, 564.99583pt x 478.09494pt>
File: main/pic/Acrobot_complete.pdf Graphic file (type pdf)
<use main/pic/Acrobot_complete.pdf>
Package pdftex.def Info: main/pic/Acrobot_complete.pdf used on input line 20.
(pdftex.def) Requested size: 119.23886pt x 79.49504pt.
[8
pdfTeX warning (ext4): destination with the same identifier (name{figure.4}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.57
pdfTeX warning (ext4): destination with the same identifier (name{table.2}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.57
<./main/pic/maze_complete.pdf> <./main/pic/cw_complete.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/cw_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page
> <./main/pic/mt_complete.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/mt_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page
> <./main/pic/Acrobot_complete.pdf
pdfTeX warning: pdflatex.exe (file ./main/pic/Acrobot_complete.pdf): PDF inclusion: multiple pdfs with page group included in a single page
>]) (./main/conclusion.tex) (./main/appendix.tex [9] [10]
LaTeX Warning: Command \textemdash invalid in math mode on input line 229.
LaTeX Warning: Command \textemdash invalid in math mode on input line 229.
[11] [12] [13]
Underfull \hbox (badness 1946) in paragraph at lines 683--696
[]\T1/ptm/m/n/10 (+20) Three ran-dom walk ex-per-i-ments: the $\OML/cmm/m/it/10 $ \T1/ptm/m/n/10 (+20) val-ues for all al-go-rithms are in the range of
[]
[14] [15]
Overfull \hbox (33.58313pt too wide) in paragraph at lines 738--752
[][]
[]
) (./neurips_2024.bbl [16
pdfTeX warning (ext4): destination with the same identifier (name{table.3}) has been already used, duplicate ignored
<argument> ...shipout:D \box_use:N \l_shipout_box
\__shipout_drop_firstpage_...
l.12
] [17]) [18] (./neurips_2024.aux)
Package rerunfilecheck Info: File `neurips_2024.out' has not changed.
(rerunfilecheck) Checksum: E5788AEC1D4F936207967A17A6B3E0A1;3587.
)
Here is how much of TeX's memory you used:
26626 strings out of 476025
484842 string characters out of 5789524
1897382 words of memory out of 5000000
46086 multiletter control sequences out of 15000+600000
567455 words of font info for 255 fonts, out of 8000000 for 9000
1141 hyphenation exceptions out of 8191
84i,16n,80p,1005b,1065s stack positions out of 10000i,1000n,20000p,200000b,200000s
<d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi6.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi9.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr6.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr9.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy6.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/symbols/msbm10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/cm-super/sftt1000.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmb8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmr8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmri8a.pfb>
Output written on neurips_2024.pdf (18 pages, 2290177 bytes).
PDF statistics:
1011 PDF objects out of 1200 (max. 8388607)
839 compressed objects within 9 object streams
195 named destinations out of 1000 (max. 500000)
52442 words of extra memory for PDF output out of 61914 (max. 10000000)
\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1
\BOOKMARK [1][-]{section.2}{\376\377\000B\000a\000c\000k\000g\000r\000o\000u\000n\000d}{}% 2
\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{}% 3
\BOOKMARK [2][-]{subsection.3.1}{\376\377\000M\000o\000t\000i\000v\000a\000t\000i\000o\000n}{section.3}% 4
\BOOKMARK [2][-]{subsection.3.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D}{section.3}% 5
\BOOKMARK [2][-]{subsection.3.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000C\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D\000C}{section.3}% 6
\BOOKMARK [1][-]{section.4}{\376\377\000T\000h\000e\000o\000r\000e\000t\000i\000c\000a\000l\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{}% 7
\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000S\000t\000u\000d\000i\000e\000s}{}% 8
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000T\000a\000s\000k\000s}{section.5}% 9
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000R\000e\000s\000u\000l\000t\000s\000\040\000a\000n\000d\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section.5}% 10
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000l\000a\000t\000e\000d\000\040\000W\000o\000r\000k}{}% 11
\BOOKMARK [2][-]{subsection.6.1}{\376\377\000D\000i\000f\000f\000e\000r\000e\000n\000c\000e\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000V\000M\000Q\000\040\000a\000n\000d\000\040\000R\000-\000l\000e\000a\000r\000n\000i\000n\000g}{section.6}% 12
\BOOKMARK [2][-]{subsection.6.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g}{section.6}% 13
\BOOKMARK [2][-]{subsection.6.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{section.6}% 14
\BOOKMARK [1][-]{section.7}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000W\000o\000r\000k}{}% 15
\BOOKMARK [1][-]{appendix.A}{\376\377\000R\000e\000l\000e\000v\000a\000n\000t\000\040\000p\000r\000o\000o\000f\000s}{}% 16
\BOOKMARK [2][-]{subsection.A.1}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0001}{appendix.A}% 17
\BOOKMARK [2][-]{subsection.A.2}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000C\000o\000r\000o\000l\000l\000a\000r\000y\000\040\0004\000.\0002}{appendix.A}% 18
\BOOKMARK [2][-]{subsection.A.3}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0003}{appendix.A}% 19
\BOOKMARK [1][-]{appendix.B}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000d\000e\000t\000a\000i\000l\000s}{}% 20
% partial rewrite of the LaTeX2e package for submissions to the
% Conference on Neural Information Processing Systems (NeurIPS):
%
% - uses more LaTeX conventions
% - line numbers at submission time replaced with aligned numbers from
% lineno package
% - \nipsfinalcopy replaced with [final] package option
% - automatically loads times package for authors
% - loads natbib automatically; this can be suppressed with the
% [nonatbib] package option
% - adds foot line to first page identifying the conference
% - adds preprint option for submission to e.g. arXiv
% - conference acronym modified
%
% Roman Garnett (garnett@wustl.edu) and the many authors of
% nips15submit_e.sty, including MK and drstrip@sandia
%
% last revision: March 2024
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{neurips_2024}[2024/03/31 NeurIPS 2024 submission/camera-ready style file]
% declare final option, which creates camera-ready copy
\newif\if@neuripsfinal\@neuripsfinalfalse
\DeclareOption{final}{
\@neuripsfinaltrue
}
% declare nonatbib option, which does not load natbib in case of
% package clash (users can pass options to natbib via
% \PassOptionsToPackage)
\newif\if@natbib\@natbibtrue
\DeclareOption{nonatbib}{
\@natbibfalse
}
% declare preprint option, which creates a preprint version ready for
% upload to, e.g., arXiv
\newif\if@preprint\@preprintfalse
\DeclareOption{preprint}{
\@preprinttrue
}
\ProcessOptions\relax
% determine whether this is an anonymized submission
\newif\if@submission\@submissiontrue
\if@neuripsfinal\@submissionfalse\fi
\if@preprint\@submissionfalse\fi
% fonts
\renewcommand{\rmdefault}{ptm}
\renewcommand{\sfdefault}{phv}
% change this every year for notice string at bottom
\newcommand{\@neuripsordinal}{38th}
\newcommand{\@neuripsyear}{2024}
\newcommand{\@neuripslocation}{Vancouver}
% acknowledgments
\usepackage{environ}
\newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}}
\NewEnviron{ack}{%
\acksection
\BODY
}
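% a minimal usage sketch of the ack environment (placeholder funding text, not
% from this paper); in anonymized submissions the environment is hidden via the
% hide environment defined further below:
%   \begin{ack}
%     This work was supported by Grant XYZ (placeholder).
%   \end{ack}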
% load natbib unless told otherwise
\if@natbib
\RequirePackage{natbib}
\fi
% set page geometry
\usepackage[verbose=true,letterpaper]{geometry}
\AtBeginDocument{
\newgeometry{
textheight=9in,
textwidth=5.5in,
top=1in,
headheight=12pt,
headsep=25pt,
footskip=30pt
}
\@ifpackageloaded{fullpage}
{\PackageWarning{neurips_2024}{fullpage package not allowed! Overwriting formatting.}}
{}
}
\widowpenalty=10000
\clubpenalty=10000
\flushbottom
\sloppy
% font sizes with reduced leading
\renewcommand{\normalsize}{%
\@setfontsize\normalsize\@xpt\@xipt
\abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@
\abovedisplayshortskip \z@ \@plus 3\p@
\belowdisplayskip \abovedisplayskip
\belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@
}
\normalsize
\renewcommand{\small}{%
\@setfontsize\small\@ixpt\@xpt
\abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@
\abovedisplayshortskip \z@ \@plus 2\p@
\belowdisplayskip \abovedisplayskip
\belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@
}
\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt}
\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt}
\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt}
\renewcommand{\large}{\@setfontsize\large\@xiipt{14}}
\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}}
\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}}
\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}}
\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}}
% sections with less space
\providecommand{\section}{}
\renewcommand{\section}{%
\@startsection{section}{1}{\z@}%
{-2.0ex \@plus -0.5ex \@minus -0.2ex}%
{ 1.5ex \@plus 0.3ex \@minus 0.2ex}%
{\large\bf\raggedright}%
}
\providecommand{\subsection}{}
\renewcommand{\subsection}{%
\@startsection{subsection}{2}{\z@}%
{-1.8ex \@plus -0.5ex \@minus -0.2ex}%
{ 0.8ex \@plus 0.2ex}%
{\normalsize\bf\raggedright}%
}
\providecommand{\subsubsection}{}
\renewcommand{\subsubsection}{%
\@startsection{subsubsection}{3}{\z@}%
{-1.5ex \@plus -0.5ex \@minus -0.2ex}%
{ 0.5ex \@plus 0.2ex}%
{\normalsize\bf\raggedright}%
}
\providecommand{\paragraph}{}
\renewcommand{\paragraph}{%
\@startsection{paragraph}{4}{\z@}%
{1.5ex \@plus 0.5ex \@minus 0.2ex}%
{-1em}%
{\normalsize\bf}%
}
\providecommand{\subparagraph}{}
\renewcommand{\subparagraph}{%
\@startsection{subparagraph}{5}{\z@}%
{1.5ex \@plus 0.5ex \@minus 0.2ex}%
{-1em}%
{\normalsize\bf}%
}
\providecommand{\subsubsubsection}{}
\renewcommand{\subsubsubsection}{%
\vskip5pt{\noindent\normalsize\rm\raggedright}%
}
% float placement
\renewcommand{\topfraction }{0.85}
\renewcommand{\bottomfraction }{0.4}
\renewcommand{\textfraction }{0.1}
\renewcommand{\floatpagefraction}{0.7}
\newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@}
\newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@}
\setlength{\abovecaptionskip}{\@neuripsabovecaptionskip}
\setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip}
% swap above/belowcaptionskip lengths for tables
\renewenvironment{table}
{\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}%
\setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}%
\@float{table}}
{\end@float}
% footnote formatting
\setlength{\footnotesep }{6.65\p@}
\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@}
\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@}
\setcounter{footnote}{0}
% paragraph formatting
\setlength{\parindent}{\z@}
\setlength{\parskip }{5.5\p@}
% list formatting
\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@}
\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@}
\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
\setlength{\leftmargin }{3pc}
\setlength{\leftmargini }{\leftmargin}
\setlength{\leftmarginii }{2em}
\setlength{\leftmarginiii}{1.5em}
\setlength{\leftmarginiv }{1.0em}
\setlength{\leftmarginv }{0.5em}
\def\@listi {\leftmargin\leftmargini}
\def\@listii {\leftmargin\leftmarginii
\labelwidth\leftmarginii
\advance\labelwidth-\labelsep
\topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@
\parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
\itemsep \parsep}
\def\@listiii{\leftmargin\leftmarginiii
\labelwidth\leftmarginiii
\advance\labelwidth-\labelsep
\topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
\parsep \z@
\partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@
\itemsep \topsep}
\def\@listiv {\leftmargin\leftmarginiv
\labelwidth\leftmarginiv
\advance\labelwidth-\labelsep}
\def\@listv {\leftmargin\leftmarginv
\labelwidth\leftmarginv
\advance\labelwidth-\labelsep}
\def\@listvi {\leftmargin\leftmarginvi
\labelwidth\leftmarginvi
\advance\labelwidth-\labelsep}
% create title
\providecommand{\maketitle}{}
\renewcommand{\maketitle}{%
\par
\begingroup
\renewcommand{\thefootnote}{\fnsymbol{footnote}}
% for perfect author name centering
\renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}}
% The footnote mark was overlapping the footnote text;
% the following was added to fix this problem (MK)
\long\def\@makefntext##1{%
\parindent 1em\noindent
\hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1
}
\thispagestyle{empty}
\@maketitle
\@thanks
\@notice
\endgroup
\let\maketitle\relax
\let\thanks\relax
}
% rules for title box at top of first page
\newcommand{\@toptitlebar}{
\hrule height 4\p@
\vskip 0.25in
\vskip -\parskip%
}
\newcommand{\@bottomtitlebar}{
\vskip 0.29in
\vskip -\parskip
\hrule height 1\p@
\vskip 0.09in%
}
% create title (includes both anonymized and non-anonymized versions)
\providecommand{\@maketitle}{}
\renewcommand{\@maketitle}{%
\vbox{%
\hsize\textwidth
\linewidth\hsize
\vskip 0.1in
\@toptitlebar
\centering
{\LARGE\bf \@title\par}
\@bottomtitlebar
\if@submission
\begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}
Anonymous Author(s) \\
Affiliation \\
Address \\
\texttt{email} \\
\end{tabular}%
\else
\def\And{%
\end{tabular}\hfil\linebreak[0]\hfil%
\begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
}
\def\AND{%
\end{tabular}\hfil\linebreak[4]\hfil%
\begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
}
\begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}%
\fi
\vskip 0.3in \@minus 0.1in
}
}
% add conference notice to bottom of first page
\newcommand{\ftype@noticebox}{8}
\newcommand{\@notice}{%
% give a bit of extra room back to authors on first page
\enlargethispage{2\baselineskip}%
\@float{noticebox}[b]%
\footnotesize\@noticestring%
\end@float%
}
% abstract styling
\renewenvironment{abstract}%
{%
\vskip 0.075in%
\centerline%
{\large\bf Abstract}%
\vspace{0.5ex}%
\begin{quote}%
}
{
\par%
\end{quote}%
\vskip 1ex%
}
% For the paper checklist
\newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}}
\newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}}
\newcommand{\answerNA}[1][]{\textcolor{gray}{[NA] #1}}
\newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}}
\newcommand{\justificationTODO}[1][]{\textcolor{red}{\bf [TODO]}}
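% a minimal usage sketch of the checklist answer commands (hypothetical
% checklist item, not from this paper):
%   Question: Do the main claims reflect the paper's contributions and scope?
%   Answer: \answerYes{}
%   Justification: See Sections 4 and 5.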
% handle tweaks for camera-ready copy vs. submission copy
\if@preprint
\newcommand{\@noticestring}{%
Preprint. Under review.%
}
\else
\if@neuripsfinal
\newcommand{\@noticestring}{%
\@neuripsordinal\/ Conference on Neural Information Processing Systems
(NeurIPS \@neuripsyear).%, \@neuripslocation.%
}
\else
\newcommand{\@noticestring}{%
Submitted to \@neuripsordinal\/ Conference on Neural Information
Processing Systems (NeurIPS \@neuripsyear). Do not distribute.%
}
% hide the acknowledgements
\NewEnviron{hide}{}
\let\ack\hide
\let\endack\endhide
% line numbers for submission
\RequirePackage{lineno}
\linenumbers
% fix incompatibilities between lineno and amsmath, if required, by
% transparently wrapping linenomath environments around amsmath
% environments
\AtBeginDocument{%
\@ifpackageloaded{amsmath}{%
\newcommand*\patchAmsMathEnvironmentForLineno[1]{%
\expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname
\expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname
\renewenvironment{#1}%
{\linenomath\csname old#1\endcsname}%
{\csname oldend#1\endcsname\endlinenomath}%
}%
\newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{%
\patchAmsMathEnvironmentForLineno{#1}%
\patchAmsMathEnvironmentForLineno{#1*}%
}%
\patchBothAmsMathEnvironmentsForLineno{equation}%
\patchBothAmsMathEnvironmentsForLineno{align}%
\patchBothAmsMathEnvironmentsForLineno{flalign}%
\patchBothAmsMathEnvironmentsForLineno{alignat}%
\patchBothAmsMathEnvironmentsForLineno{gather}%
\patchBothAmsMathEnvironmentsForLineno{multline}%
}
{}
}
\fi
\fi
\endinput
\documentclass{article}
% if you need to pass options to natbib, use, e.g.:
% \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2024
% ready for submission
\usepackage{neurips_2024}
% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
% \usepackage[preprint]{neurips_2024}
% to compile a camera-ready version, add the [final] option, e.g.:
% \usepackage[final]{neurips_2024}
% to avoid loading the natbib package, add option nonatbib:
% \usepackage[nonatbib]{neurips_2024}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage{hyperref} % hyperlinks
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
\usepackage{xcolor} % colors
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{wrapfig}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
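% a minimal usage sketch of the theorem environments (hypothetical statement,
% not a result of this paper):
%   \begin{theorem}[Convergence]\label{thm:placeholder}
%     Under Assumption~1, the sequence $\{\theta_t\}$ converges with probability one.
%   \end{theorem}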
\usepackage{algorithm}
\usepackage{algorithmic}
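% a minimal usage sketch of the algorithm/algorithmic environments (placeholder
% pseudocode, not this paper's method):
%   \begin{algorithm}[tb]
%     \caption{Placeholder update rule}
%     \begin{algorithmic}
%       \STATE Initialize $\theta_0$
%       \FOR{$t = 0, 1, 2, \dots$}
%         \STATE $\theta_{t+1} \leftarrow \theta_t + \alpha_t \delta_t$
%       \ENDFOR
%     \end{algorithmic}
%   \end{algorithm}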
\title{Is Minimizing Errors the Only Option for Value-based Reinforcement Learning?}
% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% author names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.
\author{%
David S.~Hippocampus\thanks{Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213 \\
\texttt{hippo@cs.cranberry-lemon.edu} \\
% examples of more authors
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \AND
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
}
\begin{document}
\maketitle
\begin{abstract}
Existing research on value-based reinforcement learning
focuses on minimizing some form of error.
However, is error minimization really the only option
for value-based reinforcement learning?
It is easy to observe that a policy's action-selection
probabilities typically depend on the relative values of actions
and have nothing to do with their absolute values.
Based on this observation, we propose the objective
of variance minimization instead of error minimization,
derive new variance minimization algorithms, both of which include a traditional parameter $\omega$,
provide a convergence-rate analysis, and conduct experiments.
The experimental results show that our proposed variance minimization algorithms
converge much faster.
\end{abstract}
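% A hedged illustration of the contrast drawn in the abstract (generic assumed
% forms, not necessarily this paper's exact objectives): error minimization
% typically targets
%   \min_\theta \; \mathbb{E}\big[(v_\pi(S) - \hat{v}_\theta(S))^2\big],
% whereas a variance-style objective penalizes the spread of an error term
% $\delta$ rather than its magnitude, e.g.
%   \min_\theta \; \mathbb{E}\big[(\delta - \mathbb{E}[\delta])^2\big],
% which is unchanged when all value estimates are shifted by a constant.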
\input{main/introduction.tex}
\input{main/preliminaries.tex}
\input{main/motivation.tex}
\input{main/theory.tex}
\input{main/experiment.tex}
\input{main/relatedwork.tex}
\input{main/conclusion.tex}
\appendix
\input{main/appendix.tex}
\bibliographystyle{named}
\bibliography{neurips_2024}
% \bibliographystyle{neurips_2024}
\end{document}