Commit afbe69ae by GongYu

Running out of time

% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}
\RequirePackage{float}
\RequirePackage{ifthen}
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}
% Declare Options
% first appearance
\DeclareOption{plain}{
\renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
\renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
\renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
\renewcommand{\ALG@within}{part}
\setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
\renewcommand{\ALG@within}{chapter}
\setboolean{ALG@within}{true}
}
\DeclareOption{section}{
\renewcommand{\ALG@within}{section}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
\renewcommand{\ALG@within}{subsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
\renewcommand{\ALG@within}{subsubsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{nothing}{
\renewcommand{\ALG@within}{nothing}
\setboolean{ALG@within}{true}
}
\DeclareOption*{\edef\ALG@name{\CurrentOption}}
% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
\ifthenelse{\boolean{ALG@within}}{
\ifthenelse{\equal{\ALG@within}{part}}
{\newfloat{algorithm}{htbp}{loa}[part]}{}
\ifthenelse{\equal{\ALG@within}{chapter}}
{\newfloat{algorithm}{htbp}{loa}[chapter]}{}
\ifthenelse{\equal{\ALG@within}{section}}
{\newfloat{algorithm}{htbp}{loa}[section]}{}
\ifthenelse{\equal{\ALG@within}{subsection}}
{\newfloat{algorithm}{htbp}{loa}[subsection]}{}
\ifthenelse{\equal{\ALG@within}{subsubsection}}
{\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
\ifthenelse{\equal{\ALG@within}{nothing}}
{\newfloat{algorithm}{htbp}{loa}}{}
}{
\newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}
\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}
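%
% A minimal usage sketch (not part of the package). The option names below are
% those declared above; any other option string sets the float name instead:
%   \usepackage[ruled,section]{algorithm}  % ruled style, numbered within sections
%   ...
%   \begin{algorithm}[htbp]
%     \caption{Example algorithm}\label{alg:example}
%     ... body, e.g. an algorithmic environment ...
%   \end{algorithm}
%   \listofalgorithms   % optional list, titled by \listalgorithmname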
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
\newcounter{ALC@line}
\newcounter{ALC@rem}
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}
%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola
\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
\renewcommand{\@toodeep}{}
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}
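%
% A minimal usage sketch (not part of the package). The optional argument of the
% algorithmic environment controls line numbering (number every n-th line; the
% default 0 gives no numbers), and the noend option suppresses the "end ..." lines:
%   \usepackage[noend]{algorithmic}
%   ...
%   \begin{algorithmic}[1]
%     \REQUIRE step sizes $\alpha$, $\beta$
%     \STATE initialize $\theta$, $\omega$
%     \FOR{$t=0$ {\bfseries to} $T-1$}
%       \STATE update $\theta$ and $\omega$
%     \ENDFOR
%   \end{algorithmic}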
\section{Conclusion and Future Work}
Value-based reinforcement learning typically aims
to minimize error as an optimization objective.
As an alternative, this study proposes new objective
functions, VBE, VPBE, and VNEU, and derives several variance minimization algorithms, including VMTD,
VMTDC, VMGTD, VMGTD2, and VMETD.
% The VMTD algorithm
% is essentially an adjustment or correction to the traditional
% TD update.
% Both
% algorithms are capable of stabilizing gradient estimation, reducing
% the variance of gradient estimation and accelerating convergence.
All algorithms demonstrated superior performance in policy
evaluation and control experiments.
Future work may include, but is not limited
to, (1) analysis of the convergence rate of VMTDC;
(2) extensions of VBE and VPBE to multi-step returns;
and (3) extensions to nonlinear approximations, such as neural networks.
\section{Experimental Studies}
This section assesses algorithm performance through experiments,
which are divided into policy evaluation experiments and control experiments.
\subsection{Testing Tasks}
\textbf{Random-walk:} as shown in Figure \ref{randomwalk}, all episodes
start in the center state, $C$, and proceed either left or right by one state on each
step, equiprobably. Episodes terminate on either the extreme left or
the extreme right, with a reward of $+1$ for terminating on the right and
$0$ otherwise. In this task, the true value of each state is the
probability of starting from that state and terminating on the right
\cite{Sutton2018book}.
Thus, the true values of states from $A$ to $E$ are
$\frac{1}{6},\frac{2}{6},\frac{3}{6},\frac{4}{6},\frac{5}{6}$, respectively.
The discount factor $\gamma=1.0$.
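These values can be verified from the Bellman equation of the equiprobable policy: with $\gamma=1$, zero intermediate rewards, and terminal values taken as $0$ (left) and $1$ (right), every non-terminal state satisfies
\begin{equation*}
V(s)=\tfrac{1}{2}V(s_{\text{left}})+\tfrac{1}{2}V(s_{\text{right}}),
\end{equation*}
whose solution is linear in the state index, e.g., $V(C)=\frac{3}{6}$.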
There are three standard kinds of features for random-walk problems: tabular
features, inverted features, and dependent features \cite{sutton2009fast}.
The feature matrices corresponding to the three random walks are shown in Appendix \ref{experimentaldetails}.
Experiments in the Random-walk environment are conducted
in an on-policy manner.
\begin{figure}
\begin{center}
\input{main/pic/randomwalk.tex}
\caption{Random walk.}
\label{randomwalk}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\input{main/pic/BairdExample.tex}
\caption{7-state version of Baird's off-policy counterexample.}
\label{bairdexample}
\end{center}
\end{figure}
\textbf{Baird's off-policy counterexample:} This task is well known as a
counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As
shown in Figure \ref{bairdexample}, the reward for every transition is zero, so the true values are zero for all states under any policy. The behaviour policy
chooses the actions represented by solid lines with probability $\frac{1}{7}$
and the actions represented by dotted lines with probability $\frac{6}{7}$. The
target policy is expected to choose the solid-line action with probability greater than $\frac{1}{7}$;
in this paper, it chooses the solid-line action with probability $1$.
The discount factor $\gamma =0.99$, and the feature matrix is
defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
\textbf{Maze}: The learning agent must find a shortest path from the upper
left corner to the lower right corner. In each state,
there are four alternative actions: $up$, $down$, $left$, and $right$, each of which
moves the agent deterministically to the corresponding neighbouring state, except when
\begin{wrapfigure}{r}{3cm}
\centering
\includegraphics[scale=0.15]{main/pic/maze_13_13.pdf}
% \caption{The 2-state counterexample.}
\end{wrapfigure}
a movement is blocked by an obstacle or the edge
of the maze. Rewards are $-1$ on all transitions until the
agent reaches the goal state.
The discount factor $\gamma=0.99$, and states $s$ are represented by tabular
features. The maximum number of moves per episode is set to 1000.
\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are
selected from the official gym website and correspond to the following
versions: ``CliffWalking-v0'', ``MountainCar-v0'', and ``Acrobot-v1''.
For specific details, please refer to the official gym website.
The maximum number of steps for the Mountain Car environment is set to 1000,
while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding.
Please refer to Appendix \ref{experimentaldetails} for the selection of learning rates for all experiments.
\subsection{Experimental Results and Analysis}
\begin{figure}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Dependent]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/dependent_new.pdf}
\label{DependentFull}
}
\subfigure[Tabular]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/tabular_new.pdf}
\label{TabularFull}
}
\\
\subfigure[Inverted]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/inverted_new.pdf}
\label{InvertedFull}
}
\subfigure[Counterexample]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/counterexample_quanju_new.pdf}
\label{CounterExampleFull}
}
\caption{Learning curves of four evaluation environments.}
\label{Evaluation_full}
\end{center}
\vskip -0.2in
\end{figure}
% The learning rates of all algorithms in different environments are shown in Table \ref{lrofways}.
% Figure \ref{Complete_full} shows the experimental curves of different algorithms in four environments.
The policy evaluation experiments compare the performance of the VMTD,
VMTDC, TD, and TDC algorithms, with the vertical axis unified as RVBE.
For policy evaluation, however, the criteria for evaluating
algorithms vary: the objective function minimized by our proposed
algorithms differs from that of the other algorithms. Therefore, to
ensure fair comparisons, this study only contrasts algorithms
quantitatively in the control settings.
This study compares the performance of Sarsa, Q-learning, GQ(0),
AC, VMSarsa, VMQ, and VMGQ(0) in four control environments.
% All experiments involved in this paper were run independently for 100 times.
The learning curves of the algorithms in the
policy evaluation experiments and the control experiments are
shown in Figures \ref{Evaluation_full} and \ref{Complete_full}, respectively.
The shaded areas in Figures \ref{Evaluation_full} and \ref{Complete_full} represent the standard deviation (std).
In the random-walk tasks, VMTD and VMTDC exhibit excellent performance,
outperforming TD and TDC in the case of dependent random-walk.
In the 7-state counterexample task, TD diverges,
while VMTDC converges and performs better than TDC.
From the update formula, it can be observed that the VMTD algorithm, like TDC,
is also an adjustment or correction of the TD update.
What is more surprising is that VMTD also maintains
convergence and demonstrates the best performance.
In Maze, Mountain Car, and Acrobot,
the convergence speed of VMSarsa, VMQ, and VMGQ(0) has
been significantly improved compared to Sarsa, Q-learning,
and GQ(0), respectively. The performance of the AC algorithm
is at an intermediate level. The performances of VMSarsa,
VMQ, and VMGQ(0) in these three experimental environments
have no significant differences.
In Cliff Walking, Sarsa and
VMSarsa converge to slightly worse solutions compared to
other algorithms. The convergence speed of VMSarsa is significantly
better than that of Sarsa. The convergence speed of VMGQ(0) and VMQ
is better than other algorithms, and the performance of VMGQ(0) is
slightly better than that of VMQ.
In summary, the performance of VMSarsa,
VMQ, and VMGQ(0) is better than that of other algorithms.
In the Cliff Walking environment,
the performance of VMGQ(0) is slightly better than that of
VMSarsa and VMQ. In the other three experimental environments,
the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\section{Introduction}
\label{introduction}
Reinforcement learning can be mainly divided into two
categories: value-based reinforcement learning
and policy gradient-based reinforcement learning. This
paper focuses on temporal difference (TD) learning based on
linearly approximated value functions. Research in this area is
usually divided into two steps: the first step is to establish the convergence of an algorithm, and the second
step is to accelerate it.
In terms of stability, \citet{sutton1988learning} established the
convergence of on-policy TD(0), and \citet{tsitsiklis1997analysis}
established the convergence of on-policy TD($\lambda$).
However, ``The deadly triad'' consisting of off-policy learning,
bootstrapping, and function approximation makes
the stability a difficult problem \citep{Sutton2018book}.
To solve this problem, convergent off-policy temporal difference
learning algorithms are proposed, e.g., BR \cite{baird1995residual},
GTD \cite{sutton2008convergent}, GTD2 and TDC \cite{sutton2009fast},
ETD \cite{sutton2016emphatic}, and MRetrace \cite{chen2023modified}.
In terms of acceleration, \citet{hackman2012faster}
proposed the Hybrid TD algorithm with an on-policy matrix.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
true stochastic algorithms, i.e., GTD-MP and GTD2-MP, from
a convex-concave saddle-point formulation.
Second-order methods are used to accelerate TD learning,
e.g., Quasi Newton TD \cite{givchi2015quasi} and
accelerated TD (ATD) \citep{pan2017accelerated}.
\citet{hallak2016generalized} introduced a new parameter
to reduce the variance of ETD.
\citet{zhang2022truncated} proposed truncated ETD with a lower variance.
Variance reduced TD, which applies the direct variance reduction technique of \citet{johnson2013accelerating}, was proposed by \citet{korda2015td}
and analysed by \citet{xu2019reanalysis}.
How to further improve the convergence rates of reinforcement learning
algorithms is currently still an open problem.
Algorithm stability is prominently reflected in the changes
to the objective function, transitioning from mean squared
errors (MSE) \citep{Sutton2018book} to mean squared Bellman errors (MSBE) \cite{baird1995residual}, then to
the norm of the expected TD update (NEU) \cite{sutton2009fast}, and further to
mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}. On the other hand, algorithm
acceleration is more centered around optimizing the iterative
update formula of the algorithm itself without altering the
objective function, thereby speeding up the convergence rate
of the algorithm. The emergence of new optimization objective
functions often leads to the development of novel algorithms.
The introduction of new algorithms, in turn, tends to inspire
researchers to explore methods for accelerating algorithms,
leading to the iterative creation of increasingly superior algorithms.
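For reference, the error objectives mentioned above can be written, in terms of the TD error $\delta$ and feature vector $\phi$, in their standard forms as
\begin{equation*}
\text{MSBE}(\theta)=\mathbb{E}\big[(\mathbb{E}[\delta|s])^2\big],\quad
\text{NEU}(\theta)=\mathbb{E}[\delta\phi]^{\top}\mathbb{E}[\delta\phi],\quad
\text{MSPBE}(\theta)=\mathbb{E}[\delta\phi]^{\top}\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[\delta\phi],
\end{equation*}
which are exactly the quantities whose variance-based counterparts (VBE, VNEU, and VPBE) are introduced in this paper.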
The kernel loss function can be optimized using standard
gradient-based methods, addressing the issue of double
sampling in the residual gradient algorithm \cite{feng2019kernel}. It ensures convergence
in both on-policy and off-policy scenarios. The logistic Bellman
error is convex and smooth in the action-value function parameters,
with bounded gradients \cite{basserrano2021logistic}. In contrast, the squared Bellman error is
not convex in the action-value function parameters, and RL algorithms
based on recursive optimization using it are known to be unstable.
% The value-based algorithms mentioned above aim to
% minimize some errors, e.g., mean squared errors \citep{Sutton2018book},
% mean squared Bellman errors \cite{baird1995residual}, norm
% of the expected TD update \cite{sutton2009fast},
% mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}, etc.
A new objective function is therefore needed; yet the objective functions mentioned above are all some form of error.
Is minimizing an error the only option for value-based reinforcement learning?
For policy evaluation experiments,
differences in objective functions may result
in inconsistent fixed points. This inconsistency
makes it difficult to uniformly compare the superiority
of algorithms derived from different objective functions.
However, for control experiments, since the choice of actions
depends on the relative values of the Q values rather than their
absolute values, the presence of solution bias is acceptable.
Based on this observation, we propose alternate objective functions
instead of minimizing errors. We minimize Variance of Bellman Error (VBE),
Variance of Projected Bellman Error (VPBE), and Variance of the norm of the expected TD update (VNEU)
and derive Variance Minimization (VM) algorithms.
These algorithms preserve the invariance of the optimal policy in the control environments,
while significantly reducing the variance of gradient estimation,
and thus hastening convergence.
The contributions of this paper are as follows:
(1) Introduction of novel objective functions based on
the invariance of the optimal policy.
(2) Derivation of several variance minimization algorithms, including both on-policy and off-policy variants.
(3) Proof of their convergence.
(4) Analysis of the convergence rate of the on-policy algorithm.
(5) Experiments demonstrating the faster convergence speed of the proposed algorithms.
\section{Variance Minimization Algorithms}
\subsection{Motivation}
As shown
in Table \ref{example_bias}, although there is a bias between the
true values and the predicted values, action $a_3$ is
still chosen under the greedy policy.
In contrast, supervised learning is usually used to predict quantities such as temperature, humidity, or morbidity, where an overly large bias could have serious consequences.
\begin{table}[t]
\caption{$Q$ values with and without a constant bias: the greedy
action selection is unchanged.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcc}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1& 5 \\
$Q(s, a_1)$ & 2& 6 \\
$Q(s, a_2)$ & 3& 7 \\
$Q(s, a_3)$ & 4& 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$& $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
In addition, reward shaping can significantly speed up learning by adding a shaping
reward $F(s,s')$ to the original reward $r$,
where $F(s,s')$ is the general form of any state-based shaping reward.
Static potential-based reward shaping (Static PBRS) maintains the policy invariance if the
shaping reward follows from $F(s,s')=\gamma
f(s')-f(s)$ \cite{ng1999policy}.
This means that we can make changes to the TD error $\delta = r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi $ while still ensuring the invariance of the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi - \omega,
\end{equation*}
where $\omega$ is a constant, acting as a static PBRS.
This also means that, after introducing reward shaping, algorithms whose optimization goal
is to minimize errors may end up with a larger or smaller bias. Fortunately,
as discussed above, bias is acceptable in reinforcement
learning.
However, the problem is that selecting an appropriate
$\omega$ requires expert knowledge. This forces us to learn
$\omega$ dynamically, i.e., $\omega=\omega_t$. Dynamic PBRS can also maintain the policy
invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$,
where $t$ is the time step at which the agent reaches state $s$
\cite{devlin2012dynamic}.
However, this result requires the convergence guarantee of the dynamic potential
function $f(s,t)$. If $f(s,t)$ does not converge as the time-step
$t\rightarrow\infty$, the Q-values of dynamic PBRS are not
guaranteed to converge.
Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$.
Thus, $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$
is a dynamic PBRS. And if $\omega_t$ eventually converges, the dynamic potential
function $f(s,t)$ converges as well.
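Indeed, substituting this potential into the shaping reward gives
\begin{equation*}
F_{\omega_t}(s,s')=\gamma\frac{\omega_t}{\gamma-1}-\frac{\omega_t}{\gamma-1}=\frac{(\gamma-1)\,\omega_t}{\gamma-1}=\omega_t.
\end{equation*}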
Bias is the expected difference between the predicted value
and the true value. Therefore, under the premise of bootstrapping, a natural first choice is
to let $\omega \doteq \mathbb{E}[\mathbb{E}[\delta|s]]=\mathbb{E}[\delta]$.
It is well known that the optimization objectives of linear TD(0) (semi-gradient) and linear TDC are as follows, respectively:
\begin{equation*}
\theta^{*}= \arg \min_{\theta} \mathbb{E}[(\mathbb{E}[\delta |s])^2],
\end{equation*}
and
\begin{equation*}
\theta^{*}=\arg \min_{\theta} \mathbb{E}[\delta \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1} \mathbb{E}[\delta\phi].
\end{equation*}
As a result, two novel objective functions and their corresponding algorithms are proposed,
where $\omega$ is subsequently proven to converge, meaning that these two algorithms maintain the invariance of the optimal policy.
\subsection{Variance Minimization TD Learning: VMTD}
For on-policy learning,
a novel objective function, Variance of Bellman Error (VBE), is proposed as follows:
\begin{equation}
\begin{array}{ccl}
\arg \min_{\theta}\text{VBE}(\theta)&=&\arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\mathbb{E}[\mathbb{E}[\delta|s]])^2]\\
&=&\arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2].
\end{array}
\end{equation}
Clearly, the goal is no longer to minimize the Bellman error.
First, the update of the parameter $\omega$ is derived directly based on
stochastic gradient descent:
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\label{omega}
\end{equation}
where $\delta_k$ is the TD error as follows:
\begin{equation}
\delta_k = r+\gamma
\theta_k^{\top}\phi_{k}'-\theta_k^{\top}\phi_k.
\label{delta}
\end{equation}
Then, based on stochastic semi-gradient descent, the update of
the parameter $\theta$ is as follows:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k.
\label{theta}
\end{equation}
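These updates can be read off as stochastic (semi-)gradient steps on the VBE objective: holding the other variable fixed,
\begin{equation*}
-\frac{1}{2}\nabla_{\omega}\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2]=\mathbb{E}[\delta]-\omega,
\qquad
-\frac{1}{2}\nabla_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2]\approx\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)\phi],
\end{equation*}
where the semi-gradient with respect to $\theta$ ignores the dependence of the bootstrap target $\gamma\theta^{\top}\phi'$ on $\theta$; sampling these expectations yields (\ref{omega}) and (\ref{theta}).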
The pseudocode of the VMTD algorithm is shown in Algorithm \ref{alg:algorithm 1}.
For control tasks, two extensions of VMTD are named VMSarsa and VMQ respectively,
and the update formulas are shown below:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi(s_k,a_k).
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\end{equation}
where the TD error $\delta_k$ in VMSarsa is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \theta_{k}^{\top}\phi(s_{k+1},a_{k+1}) - \theta_{k}^{\top}\phi(s_{k},a_{k}),
\label{deltaSarsa}
\end{equation}
and the TD error $\delta_k$ in VMQ is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \max_{a\in A}\theta_{k}^{\top}\phi(s_{k+1},a) - \theta_{k}^{\top}\phi(s_{k},a_{k}).
\label{deltaQ}
\end{equation}
\begin{algorithm}[t]
\caption{VMTD algorithm with linear function approximation in the on-policy setting}
\label{alg:algorithm 1}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$ and $\beta_t$
\REPEAT
\STATE For any episode, initialize $\theta_{0}$ arbitrarily, $\omega_{0}$ to $0$, and $\gamma \in (0,1]$; $\alpha_t$ and $\beta_t$ are constant step sizes.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to policy $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t}'-\theta_t^{\top}\phi_t$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t(\delta_t-\omega_t)\phi_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t)$
\STATE $S_t \leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we employ a projection operator.
The objective function is called Variance of Projected Bellman Error (VPBE),
and the corresponding algorithm is called VMTDC.
\begin{equation}
\begin{array}{ccl}
\text{VPBE}(\theta)&=&\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta])\phi]\\
&=&\mathbb{E}[(\delta-\omega) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi],
\end{array}
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$.
The derivation of the VMTDC algorithm is the same
as that of the TDC algorithm; the only difference is that the original $\delta$ is replaced by $\delta-\omega$.
Therefore, the update formulas of VMTDC are easily obtained as follows:
\begin{equation}
\theta_{k+1}\leftarrow\theta_{k}+\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k)\\
- \gamma\phi(s_{k+1})(\phi^{\top} (s_k) u_k)],
\label{thetavmtdc}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\omega_k - \phi^{\top} (s_k) u_k]\phi(s_k),
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
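As with TDC, these updates correspond to a stochastic gradient step on the VPBE, in which the auxiliary weight vector $u_k$ estimates the slowly varying quantity $\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi]$, since (with $\omega$ treated as fixed)
\begin{equation*}
-\frac{1}{2}\nabla_{\theta}\text{VPBE}(\theta)=\mathbb{E}[(\delta-\omega)\phi]-\gamma\mathbb{E}[\phi'\phi^{\top}]\,\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi].
\end{equation*}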
The pseudocode of the VMTDC algorithm for the importance-sampling scenario is shown in Algorithm \ref{alg:algorithm 2} of Appendix \ref{proofth2}.
We now introduce an improved version of the GQ(0) algorithm, named VMGQ(0):
\begin{equation}
\begin{array}{ccl}
\theta_{k+1}\leftarrow\theta_{k}&+&\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k,a_k)\\
&-& \gamma\phi(s_{k+1},A^{*}_{k+1})(\phi^{\top} (s_k,a_k) u_k)],
\end{array}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[(\delta_{k}-\omega_k) - \phi^{\top} (s_k,a_k) u_k]\phi(s_k,a_k),
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k- \omega_k),
\end{equation}
where $\delta_{k}$ is given by (\ref{deltaQ}) and $A^{*}_{k+1}={\arg \max}_{a}(\theta_{k}^{\top}\phi(s_{k+1},a))$.
This paper also introduces an additional parameter $\omega$ into the GTD and GTD2 algorithms. For details, please refer to the appendix.
\resizebox{6cm}{4cm}{
\begin{tikzpicture}[smooth]
\node[coordinate] (origin) at (0.3,0) {};
\node[coordinate] (num7) at (3,0) {};
\node[coordinate] (num1) at (1,2.5) {};
\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {};
\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {};
\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {};
\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {};
\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {};
\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {};
\path (num1) ++ (0:1cm) node (num2) [coordinate] {};
\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
%\draw[->](0,0) -- (1,1);
%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
%\fill (0.5,0.5) circle (0.5);
%\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
coordinates{(num7) (origin) (origin_above) (origin_aright)};
\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)};
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
\draw[->,>=stealth,line width = 0.02cm] (a)--(g);
\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
\end{tikzpicture}
}
\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
\tikzstyle{init} = [pin edge={to-,thin,black}]
\resizebox{6cm}{1cm}{
\begin{tikzpicture}[node distance=1.5cm, auto, >=latex]
\node [block] (o) {};
\node (p) [left of=o, node distance=0.5cm, coordinate] {o};
\node [shape=circle, int] (a) [right of=o] {$A$};
\node (b) [left of=a, node distance=1.5cm, coordinate] {a};
\node [shape=circle, int] (c) [right of=a] {$B$};
\node (d) [left of=c, node distance=1.5cm, coordinate] {c};
\node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$};
\node (f) [left of=e, node distance=1.5cm, coordinate] {e};
\node [shape=circle, int] (g) [right of=e] {$D$};
\node (h) [left of=g, node distance=1.5cm, coordinate] {g};
\node [shape=circle, int] (i) [right of=g] {$E$};
\node (j) [left of=i, node distance=1.5cm, coordinate] {i};
\node [block] (k) [right of=i] {};
\node (l) [left of=k, node distance=0.5cm, coordinate] {k};
\path[->] (o) edge node {$0$} (a);
\path[<->] (a) edge node {$0$} (c);
\path[<->] (c) edge node {$0$} (e);
\path[<->] (e) edge node {$0$} (g);
\path[<->] (g) edge node {$0$} (i);
\draw[->] (i) edge node {$1$} (k);
\end{tikzpicture}
}
\section{Background}
\label{preliminaries}
A reinforcement learning agent interacts with an environment: it observes states,
makes sequential decisions that influence the environment, and obtains
rewards.
Consider an infinite-horizon discounted
Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma
\rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$
is a finite set of actions of the agent;
$R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward
function; $P:S\times A\times S \rightarrow [0,1]$ is the transition
probability distribution; and $\gamma\in (0,1)$
is the discount factor \cite{Sutton2018book}.
Due to the requirements of online learning, value iteration based on sampling
is considered in this paper.
In each sampling, an experience (or transition) $\langle s, a, s', r\rangle$ is
obtained.
A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the
agent is to find an optimal policy $\pi^*$ that maximizes the expected
discounted cumulative reward in the long run.
The state value function $V^{\pi}(s)$ for a stationary policy $\pi$ is
defined as:
\begin{equation*}
V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s].
\label{valuefunction}
\end{equation*}
The linear value function for state $s\in S$ is defined as:
\begin{equation}
V_{{\theta}}(s):= {\theta}^{\top}{\phi}(s) = \sum_{i=1}^{m}
\theta_i \phi_i(s),
\label{linearvaluefunction}
\end{equation}
where ${\theta}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in
\mathbb{R}^m$ is a parameter vector,
${\phi}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature
function defined on state space $S$, and $m$ is the feature size.
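For example, with tabular (one-hot) features, where $\phi_i(s)=1$ if $s=i$ and $0$ otherwise, (\ref{linearvaluefunction}) reduces to $V_{\theta}(s)=\theta_s$, i.e., one parameter per state.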
Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems.
To deal with the well-known curse of dimensionality of large-scale MDPs, the value
function is usually approximated by a linear model, kernel methods, decision
trees, neural networks, etc. This paper focuses on the linear model, where
features are usually hand-coded by domain experts.
TD learning can also be used to find optimal policies. The problem of finding an optimal policy is
often called the control problem. Two popular TD control methods are Sarsa and Q-learning: the former is an on-policy
method, while the latter is off-policy.
It is well known that the TDC algorithm \cite{sutton2009fast} guarantees
convergence under off-policy conditions, while the off-policy TD algorithm may diverge. The
objective function of TDC is the MSPBE.
TDC is essentially an adjustment or correction of the TD update so that it
follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm
is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted
as GQ(0).
\begin{figure*}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Maze]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/maze_complete.pdf}
\label{MazeFull}
}
\subfigure[Cliff Walking]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/cw_complete.pdf}
\label{CliffWalkingFull}
}
\\
\subfigure[Mountain Car]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/mt_complete.pdf}
\label{MountainCarFull}
}
\subfigure[Acrobot]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/Acrobot_complete.pdf}
\label{AcrobotFull}
}
\caption{Learning curves of four control environments.}
\label{Complete_full}
\end{center}
\vskip -0.2in
\end{figure*}
\section{Related Work}
\subsection{Difference between VMQ and R-learning}
\begin{table*}[htb]
\centering
\caption{Difference between R-learning and tabular VMQ.}
\vskip 0.15in
\begin{tabular}{c|c}
\hline
Algorithm & Update formula \\
\hline
R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\
&$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\
tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\
&$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\
\hline
\end{tabular}
\label{differenceRandVMQ}
\vskip -0.1in
\end{table*}
Tabular VMQ's update formula bears some resemblance
to R-learning's update formula. As shown in Table \ref{differenceRandVMQ}, the update formulas of the two algorithms have the following differences:
\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average
reward, rather than the cumulative reward, by learning an estimate
of the average reward. This estimate $m$ is then used to update the Q-values.
In contrast, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$.
\\(2) When $\gamma=1$ in the tabular VMQ update formula, the
R-learning update formula is formally
the same as the tabular VMQ update formula.
Therefore, the R-learning algorithm can formally be
considered a special case of the VMQ algorithm.
\subsection{Variance Reduction for TD Learning}
The TD with centering (CTD) algorithm \cite{korda2015td}
directly applies variance reduction techniques to
the TD algorithm. The CTD algorithm updates its parameters using the
average gradient of a batch of Markovian samples and a projection operator.
Unfortunately, the authors’ analysis of the CTD algorithm contains technical
errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm that updates
its parameters using the average gradient of a batch of i.i.d. samples. The
authors of VRTD provide a technically sound analysis to demonstrate the
advantages of variance reduction.
\subsection{Variance Reduction for Policy Gradient Algorithms}
Policy gradient algorithms are a class of reinforcement
learning algorithms that directly optimize cumulative rewards.
REINFORCE is a Monte Carlo algorithm that estimates
gradients through sampling, but it may suffer from high variance.
Baselines are introduced to reduce variance and to
accelerate learning \cite{Sutton2018book}. In Actor-Critic methods,
the value function is used as a baseline and bootstrapping
is employed to reduce variance, which also accelerates convergence \cite{Sutton2018book}.
TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal}
use generalized advantage
estimation, which combines multi-step bootstrapping and Monte Carlo
estimation to reduce variance, making gradient estimation more stable and
accelerating convergence.
In Variance Minimization,
the incorporation of $\omega \doteq \mathbb{E}[\delta]$
bears a striking resemblance to the use of a baseline
in policy gradient methods. The introduction of a baseline
in policy gradient techniques does not alter
the expected value of the update;
rather, it can significantly reduce the variance of gradient estimation.
The addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization
preserves the invariance of the optimal
policy while stabilizing gradient estimation,
reducing the variance of gradient estimation,
and hastening convergence.
\section{Theoretical Analysis}
The purpose of this section is to establish the stability of the VMTD
and VMTDC algorithms, and to present a corollary on the convergence rate of VMTD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $A$ is non-singular.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth1} for the detailed proof.
Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale
algorithms. VMTD satisfies the assumptions of this theorem, leading
to the following corollary.
\begin{corollary}
\label{corollary4_2}
Consider the Sparsely Projected variant of VMTD. Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$,
$0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have
\begin{equation}
||\theta'_{k} - \theta^{*}|| \le C_{3,\theta} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}}
\end{equation}
\begin{equation}
||\omega'_{k} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\beta / 2}},
\end{equation}
\end{corollary}
where $d_1$ and $d_2$ represent the dimensions of $\theta$ and $\omega$, respectively. For VMTD, $d_2 =1$.
The meanings of $N_3$, $C_{3,\theta}$, and $C_{3,\omega}$ are explained in \cite{dalal2020tale}.
The formulas for $\theta'_{k}$ and $\omega'_{k}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}).
Please refer to Appendix \ref{proofcorollary4_2} for the detailed proof.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC.
Let the step-size sequences $\alpha_k$, $\zeta_k$, and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\zeta_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\zeta_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\zeta_k),
$
$
\zeta_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$.
Assume that $A$ and $C$ are non-singular matrices.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth2} for the detailed proof.
\begin{thebibliography}{}
\bibitem[\protect\citeauthoryear{Baird}{1995}]{baird1995residual}
Leemon Baird.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In {\em Proc. 12th Int. Conf. Mach. Learn.}, pages 30--37, 1995.
\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic}
Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu.
\newblock Logistic q-learning.
\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021.
\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode}
Vivek~S Borkar and Sean~P Meyn.
\newblock The ode method for convergence of stochastic approximation and reinforcement learning.
\newblock {\em SIAM J. Control Optim.}, 38(2):447--469, 2000.
\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic}
Vivek~S Borkar.
\newblock Stochastic approximation with two time scales.
\newblock {\em Syst. \& Control Letters}, 29(5):291--294, 1997.
\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified}
Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao.
\newblock Modified retrace for off-policy temporal difference learning.
\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023.
\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale}
Gal Dalal, Balazs Szorenyi, and Gugan Thoppe.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020.
\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic}
Sam Devlin and Daniel Kudenko.
\newblock Dynamic potential-based reward shaping.
\newblock In {\em Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, pages 433--440, 2012.
\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel}
Yihao Feng, Lihong Li, and Qiang Liu.
\newblock A kernel loss for solving the bellman equation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019.
\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi}
Arash Givchi and Maziar Palhang.
\newblock Quasi newton temporal difference learning.
\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015.
\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster}
Leah Hackman.
\newblock {\em Faster Gradient-TD Algorithms}.
\newblock PhD thesis, University of Alberta, 2012.
\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized}
Assaf Hallak, Aviv Tamar, Remi Munos, and Shie Mannor.
\newblock Generalized emphatic temporal difference learning: bias-variance analysis.
\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016.
\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent}
Morris~W Hirsch.
\newblock Convergent activation dynamics in continuous time networks.
\newblock {\em Neural Netw.}, 2(5):331--349, 1989.
\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating}
R.~Johnson and T.~Zhang.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013.
\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td}
Nathaniel Korda and Prashanth La.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In {\em International conference on machine learning}, pages 626--634. PMLR, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Finite-sample analysis of proximal gradient td algorithms.
\newblock In {\em Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In {\em Proceedings of the International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal}
Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018.
\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient}
Hamid~Reza Maei.
\newblock {\em Gradient temporal-difference learning algorithms}.
\newblock PhD thesis, University of Alberta, 2011.
\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy}
Andrew~Y Ng, Daishi Harada, and Stuart Russell.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In {\em Proc. 16th Int. Conf. Mach. Learn.}, pages 278--287, 1999.
\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated}
Yangchen Pan, Adam White, and Martha White.
\newblock Accelerated gradient temporal difference learning.
\newblock In {\em Proceedings of the 21st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust}
J.~Schulman, S.~Levine, P.~Abbeel, M.~Jordan, and P.~Moritz.
\newblock Trust region policy optimization.
\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal}
J.~Schulman, F.~Wolski, P.~Dhariwal, A.~Radford, and O.~Klimov.
\newblock Proximal policy optimization algorithms.
\newblock {\em arXiv preprint arXiv:1707.06347}, 2017.
\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement}
Anton Schwartz.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In {\em Proc. 10th Int. Conf. Mach. Learn.}, volume 298, pages 298--305, 1993.
\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book}
Richard~S. Sutton and Andrew~G. Barto.
\newblock {\em Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent}
Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616. Cambridge, MA: MIT Press, 2008.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast}
R.S. Sutton, H.R. Maei, D.~Precup, S.~Bhatnagar, D.~Silver, C.~Szepesv{\'a}ri, and E.~Wiewiora.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In {\em Proc. 26th Int. Conf. Mach. Learn.}, pages 993--1000, 2009.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
Richard~S Sutton, A~Rupam Mahmood, and Martha White.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.
\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
Richard~S Sutton.
\newblock Learning to predict by the methods of temporal differences.
\newblock {\em Machine learning}, 3(1):9--44, 1988.
\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
John~N Tsitsiklis and Benjamin Van~Roy.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In {\em International Conference on Learning Representations}, 2019.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
T.~Xu, Z.~Wang, Y.~Zhou, and Y.~Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.
\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
Shangtong Zhang and Shimon Whiteson.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022.
\end{thebibliography}
\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1
\BOOKMARK [1][-]{section.2}{\376\377\000B\000a\000c\000k\000g\000r\000o\000u\000n\000d}{}% 2
\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{}% 3
\BOOKMARK [2][-]{subsection.3.1}{\376\377\000M\000o\000t\000i\000v\000a\000t\000i\000o\000n}{section.3}% 4
\BOOKMARK [2][-]{subsection.3.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D}{section.3}% 5
\BOOKMARK [2][-]{subsection.3.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000C\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D\000C}{section.3}% 6
\BOOKMARK [1][-]{section.4}{\376\377\000T\000h\000e\000o\000r\000e\000t\000i\000c\000a\000l\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{}% 7
\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000S\000t\000u\000d\000i\000e\000s}{}% 8
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000T\000a\000s\000k\000s}{section.5}% 9
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000R\000e\000s\000u\000l\000t\000s\000\040\000a\000n\000d\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section.5}% 10
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000l\000a\000t\000e\000d\000\040\000W\000o\000r\000k}{}% 11
\BOOKMARK [2][-]{subsection.6.1}{\376\377\000D\000i\000f\000f\000e\000r\000e\000n\000c\000e\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000V\000M\000Q\000\040\000a\000n\000d\000\040\000R\000-\000l\000e\000a\000r\000n\000i\000n\000g}{section.6}% 12
\BOOKMARK [2][-]{subsection.6.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g}{section.6}% 13
\BOOKMARK [2][-]{subsection.6.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{section.6}% 14
\BOOKMARK [1][-]{section.7}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000W\000o\000r\000k}{}% 15
\BOOKMARK [1][-]{appendix.A}{\376\377\000R\000e\000l\000e\000v\000a\000n\000t\000\040\000p\000r\000o\000o\000f\000s}{}% 16
\BOOKMARK [2][-]{subsection.A.1}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0001}{appendix.A}% 17
\BOOKMARK [2][-]{subsection.A.2}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000C\000o\000r\000o\000l\000l\000a\000r\000y\000\040\0004\000.\0002}{appendix.A}% 18
\BOOKMARK [2][-]{subsection.A.3}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0003}{appendix.A}% 19
\BOOKMARK [1][-]{appendix.B}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000d\000e\000t\000a\000i\000l\000s}{}% 20
\documentclass{article}
% if you need to pass options to natbib, use, e.g.:
% \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2024
% ready for submission
\usepackage{neurips_2024}
% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
% \usepackage[preprint]{neurips_2024}
% to compile a camera-ready version, add the [final] option, e.g.:
% \usepackage[final]{neurips_2024}
% to avoid loading the natbib package, add option nonatbib:
% \usepackage[nonatbib]{neurips_2024}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage{hyperref} % hyperlinks
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
\usepackage{xcolor} % colors
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{wrapfig}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\usepackage{algorithm}
\usepackage{algorithmic}
\title{Is Minimizing Errors the Only Option for Value-based Reinforcement Learning?}
% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.
\author{%
David S.~Hippocampus\thanks{Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213 \\
\texttt{hippo@cs.cranberry-lemon.edu} \\
% examples of more authors
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \AND
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
}
\begin{document}
\maketitle
\begin{abstract}
Existing research on value-based reinforcement learning
focuses on minimizing errors.
However, is error minimization really the only option
for value-based reinforcement learning?
We observe that a policy's action-selection probabilities
typically depend only on the relative values of the actions,
not on their absolute values
(a brief illustration follows this abstract).
Based on this observation, we propose the objective
of variance minimization instead of error minimization,
derive several new variance minimization algorithms, each of which introduces a parameter $\omega$,
and analyze their convergence rates and evaluate them experimentally.
The experimental results show that our proposed variance minimization algorithms
converge considerably faster.
\end{abstract}
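As a minimal illustration of the observation above (a sketch under the assumption of a softmax action-selection rule; the symbols $Q(s,a)$ for an estimated action value and $c$ for an arbitrary constant are introduced only for this example), shifting every action value by the same constant leaves the action-selection probabilities unchanged:
\begin{equation*}
\pi(a \mid s) \;=\; \frac{\exp\bigl(Q(s,a)+c\bigr)}{\sum_{b}\exp\bigl(Q(s,b)+c\bigr)}
\;=\; \frac{\exp\bigl(Q(s,a)\bigr)}{\sum_{b}\exp\bigl(Q(s,b)\bigr)}.
\end{equation*}
A greedy policy is likewise unaffected, since $\arg\max_{a}\bigl(Q(s,a)+c\bigr)=\arg\max_{a}Q(s,a)$.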
\input{main/introduction.tex}
\input{main/preliminaries.tex}
\input{main/motivation.tex}
\input{main/theory.tex}
\input{main/experiment.tex}
\input{main/relatedwork.tex}
\input{main/conclusion.tex}
\appendix
\input{main/appendix.tex}
\bibliographystyle{named}
\bibliography{neurips_2024}
% \bibliographystyle{neurips_2024}
\end{document}