Commit afbe69ae by GongYu

Running out of time

% ALGORITHM STYLE -- Released 8 April 1996
% for LaTeX-2e
% Copyright -- 1994 Peter Williams
% E-mail Peter.Williams@dsto.defence.gov.au
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithm}
\typeout{Document Style `algorithm' - floating environment}
\RequirePackage{float}
\RequirePackage{ifthen}
\newcommand{\ALG@within}{nothing}
\newboolean{ALG@within}
\setboolean{ALG@within}{false}
\newcommand{\ALG@floatstyle}{ruled}
\newcommand{\ALG@name}{Algorithm}
\newcommand{\listalgorithmname}{List of \ALG@name s}
% Declare Options
% first appearance
\DeclareOption{plain}{
\renewcommand{\ALG@floatstyle}{plain}
}
\DeclareOption{ruled}{
\renewcommand{\ALG@floatstyle}{ruled}
}
\DeclareOption{boxed}{
\renewcommand{\ALG@floatstyle}{boxed}
}
% then numbering convention
\DeclareOption{part}{
\renewcommand{\ALG@within}{part}
\setboolean{ALG@within}{true}
}
\DeclareOption{chapter}{
\renewcommand{\ALG@within}{chapter}
\setboolean{ALG@within}{true}
}
\DeclareOption{section}{
\renewcommand{\ALG@within}{section}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsection}{
\renewcommand{\ALG@within}{subsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{subsubsection}{
\renewcommand{\ALG@within}{subsubsection}
\setboolean{ALG@within}{true}
}
\DeclareOption{nothing}{
\renewcommand{\ALG@within}{nothing}
\setboolean{ALG@within}{true}
}
\DeclareOption*{\edef\ALG@name{\CurrentOption}}
% ALGORITHM
%
\ProcessOptions
\floatstyle{\ALG@floatstyle}
\ifthenelse{\boolean{ALG@within}}{
\ifthenelse{\equal{\ALG@within}{part}}
{\newfloat{algorithm}{htbp}{loa}[part]}{}
\ifthenelse{\equal{\ALG@within}{chapter}}
{\newfloat{algorithm}{htbp}{loa}[chapter]}{}
\ifthenelse{\equal{\ALG@within}{section}}
{\newfloat{algorithm}{htbp}{loa}[section]}{}
\ifthenelse{\equal{\ALG@within}{subsection}}
{\newfloat{algorithm}{htbp}{loa}[subsection]}{}
\ifthenelse{\equal{\ALG@within}{subsubsection}}
{\newfloat{algorithm}{htbp}{loa}[subsubsection]}{}
\ifthenelse{\equal{\ALG@within}{nothing}}
{\newfloat{algorithm}{htbp}{loa}}{}
}{
\newfloat{algorithm}{htbp}{loa}
}
\floatname{algorithm}{\ALG@name}
\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}}
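%
% A minimal usage sketch (not part of the package). The option names below are
% those declared above; any other option string sets the float name instead:
%   \usepackage[ruled,section]{algorithm}  % ruled style, numbered within sections
%   ...
%   \begin{algorithm}[htbp]
%     \caption{Example algorithm}\label{alg:example}
%     ... body, e.g. an algorithmic environment ...
%   \end{algorithm}
%   \listofalgorithms   % optional list, titled by \listalgorithmname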
% ALGORITHMIC STYLE -- Released 8 APRIL 1996
% for LaTeX version 2e
% Copyright -- 1994 Peter Williams
% E-mail PeterWilliams@dsto.defence.gov.au
%
% Modified by Alex Smola (08/2000)
% E-mail Alex.Smola@anu.edu.au
%
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{algorithmic}
\typeout{Document Style `algorithmic' - environment}
%
\RequirePackage{ifthen}
\RequirePackage{calc}
\newboolean{ALC@noend}
\setboolean{ALC@noend}{false}
\newcounter{ALC@line}
\newcounter{ALC@rem}
\newlength{\ALC@tlm}
%
\DeclareOption{noend}{\setboolean{ALC@noend}{true}}
%
\ProcessOptions
%
% ALGORITHMIC
\newcommand{\algorithmicrequire}{\textbf{Require:}}
\newcommand{\algorithmicensure}{\textbf{Ensure:}}
\newcommand{\algorithmiccomment}[1]{\{#1\}}
\newcommand{\algorithmicend}{\textbf{end}}
\newcommand{\algorithmicif}{\textbf{if}}
\newcommand{\algorithmicthen}{\textbf{then}}
\newcommand{\algorithmicelse}{\textbf{else}}
\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif}
\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif}
\newcommand{\algorithmicfor}{\textbf{for}}
\newcommand{\algorithmicforall}{\textbf{for all}}
\newcommand{\algorithmicdo}{\textbf{do}}
\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor}
\newcommand{\algorithmicwhile}{\textbf{while}}
\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile}
\newcommand{\algorithmicloop}{\textbf{loop}}
\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop}
\newcommand{\algorithmicrepeat}{\textbf{repeat}}
\newcommand{\algorithmicuntil}{\textbf{until}}
%changed by alex smola
\newcommand{\algorithmicinput}{\textbf{input}}
\newcommand{\algorithmicoutput}{\textbf{output}}
\newcommand{\algorithmicset}{\textbf{set}}
\newcommand{\algorithmictrue}{\textbf{true}}
\newcommand{\algorithmicfalse}{\textbf{false}}
\newcommand{\algorithmicand}{\textbf{and\ }}
\newcommand{\algorithmicor}{\textbf{or\ }}
\newcommand{\algorithmicfunction}{\textbf{function}}
\newcommand{\algorithmicendfunction}{\algorithmicend\ \algorithmicfunction}
\newcommand{\algorithmicmain}{\textbf{main}}
\newcommand{\algorithmicendmain}{\algorithmicend\ \algorithmicmain}
%end changed by alex smola
\def\ALC@item[#1]{%
\if@noparitem \@donoparitem
\else \if@inlabel \indent \par \fi
\ifhmode \unskip\unskip \par \fi
\if@newlist \if@nobreak \@nbitem \else
\addpenalty\@beginparpenalty
\addvspace\@topsep \addvspace{-\parskip}\fi
\else \addpenalty\@itempenalty \addvspace\itemsep
\fi
\global\@inlabeltrue
\fi
\everypar{\global\@minipagefalse\global\@newlistfalse
\if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels
\penalty\z@ \fi
\everypar{}}\global\@nobreakfalse
\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi
\sbox\@tempboxa{\makelabel{#1}}%
\global\setbox\@labels
\hbox{\unhbox\@labels \hskip \itemindent
\hskip -\labelwidth \hskip -\ALC@tlm
\ifdim \wd\@tempboxa >\labelwidth
\box\@tempboxa
\else \hbox to\labelwidth {\unhbox\@tempboxa}\fi
\hskip \ALC@tlm}\ignorespaces}
%
\newenvironment{algorithmic}[1][0]{
\let\@item\ALC@item
\newcommand{\ALC@lno}{%
\ifthenelse{\equal{\arabic{ALC@rem}}{0}}
{{\footnotesize \arabic{ALC@line}:}}{}%
}
\let\@listii\@listi
\let\@listiii\@listi
\let\@listiv\@listi
\let\@listv\@listi
\let\@listvi\@listi
\let\@listvii\@listi
\newenvironment{ALC@g}{
\begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@
\listparindent\z@ \rightmargin\z@
\topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@
\leftmargin 1em
\addtolength{\ALC@tlm}{\leftmargin}
}
}
{\end{list}}
\newcommand{\ALC@it}{\addtocounter{ALC@line}{1}\addtocounter{ALC@rem}{1}\ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}\item}
\newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}%
{}{\ \algorithmiccomment{##1}}}
\newcommand{\REQUIRE}{\item[\algorithmicrequire]}
\newcommand{\ENSURE}{\item[\algorithmicensure]}
\newcommand{\STATE}{\ALC@it}
\newcommand{\COMMENT}[1]{\algorithmiccomment{##1}}
%changes by alex smola
\newcommand{\INPUT}{\item[\algorithmicinput]}
\newcommand{\OUTPUT}{\item[\algorithmicoutput]}
\newcommand{\SET}{\item[\algorithmicset]}
% \newcommand{\TRUE}{\algorithmictrue}
% \newcommand{\FALSE}{\algorithmicfalse}
\newcommand{\AND}{\algorithmicand}
\newcommand{\OR}{\algorithmicor}
\newenvironment{ALC@func}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@main}{\begin{ALC@g}}{\end{ALC@g}}
%end changes by alex smola
\newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}}
\newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}}
\renewcommand{\\}{\@centercr}
\newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\SHORTIF}[2]{\ALC@it\algorithmicif\ ##1\
\algorithmicthen\ {##2}}
\newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\ELSIF}[2][default]%
{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen%
\ALC@com{##1}\begin{ALC@if}}
\newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@for}}
\newcommand{\SHORTFORALL}[2]{\ALC@it\algorithmicforall\ ##1\ %
\algorithmicdo\ {##2}}
\newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ %
\algorithmicdo%
\ALC@com{##1}\begin{ALC@whl}}
\newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop%
\ALC@com{##1}\begin{ALC@loop}}
%changed by alex smola
\newcommand{\FUNCTION}[2][default]{\ALC@it\algorithmicfunction\ ##2\ %
\ALC@com{##1}\begin{ALC@func}}
\newcommand{\MAIN}[2][default]{\ALC@it\algorithmicmain\ ##2\ %
\ALC@com{##1}\begin{ALC@main}}
%end changed by alex smola
\newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat%
\ALC@com{##1}\begin{ALC@rpt}}
\newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1}
\ifthenelse{\boolean{ALC@noend}}{
\newcommand{\ENDIF}{\end{ALC@if}}
\newcommand{\ENDFOR}{\end{ALC@for}}
\newcommand{\ENDWHILE}{\end{ALC@whl}}
\newcommand{\ENDLOOP}{\end{ALC@loop}}
\newcommand{\ENDFUNCTION}{\end{ALC@func}}
\newcommand{\ENDMAIN}{\end{ALC@main}}
}{
\newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif}
\newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor}
\newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile}
\newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop}
\newcommand{\ENDFUNCTION}{\end{ALC@func}\ALC@it\algorithmicendfunction}
\newcommand{\ENDMAIN}{\end{ALC@main}\ALC@it\algorithmicendmain}
}
\renewcommand{\@toodeep}{}
\begin{list}{\ALC@lno}{\setcounter{ALC@line}{0}\setcounter{ALC@rem}{0}%
\itemsep\z@ \itemindent\z@ \listparindent\z@%
\partopsep\z@ \parskip\z@ \parsep\z@%
\labelsep 0.5em \topsep 0.2em%
\ifthenelse{\equal{#1}{0}}
{\labelwidth 0.5em }
{\labelwidth 1.2em }
\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep}
\ALC@tlm\labelsep
}
}
{\end{list}}
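%
% A minimal usage sketch (not part of the package). The optional argument of the
% algorithmic environment controls line numbering (number every n-th line; the
% default 0 gives no numbers), and the noend option suppresses the "end ..." lines:
%   \usepackage[noend]{algorithmic}
%   ...
%   \begin{algorithmic}[1]
%     \REQUIRE step sizes $\alpha$, $\beta$
%     \STATE initialize $\theta$, $\omega$
%     \FOR{$t=0$ {\bfseries to} $T-1$}
%       \STATE update $\theta$ and $\omega$
%     \ENDFOR
%   \end{algorithmic}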
\section{Conclusion and Future Work}
Value-based reinforcement learning typically aims
to minimize error as an optimization objective.
As an alternative, this study proposes new objective
functions, VBE, VPBE, and VNEU, and derives several variance minimization algorithms, including VMTD,
VMTDC, VMGTD, VMGTD2, and VMETD.
% The VMTD algorithm
% is essentially an adjustment or correction to the traditional
% TD update.
% Both
% algorithms are capable of stabilizing gradient estimation, reducing
% the variance of gradient estimation and accelerating convergence.
All algorithms demonstrated superior performance in policy
evaluation and control experiments.
Future work may include, but is not limited
to, (1) analysis of the convergence rate of VMTDC;
(2) extensions of VBE and VPBE to multi-step returns;
and (3) extensions to nonlinear approximations, such as neural networks.
\section{Experimental Studies}
This section assesses algorithm performance through experiments,
which are divided into policy evaluation experiments and control experiments.
\subsection{Testing Tasks}
\textbf{Random-walk:} as shown in Figure \ref{randomwalk}, all episodes
start in the center state, $C$, and proceed either left or right by one state on each
step, equiprobably. Episodes terminate on either the extreme left or
the extreme right, with a reward of $+1$ for terminating on the right and
$0$ otherwise. In this task, the true value of each state is the
probability of starting from that state and terminating on the right
\cite{Sutton2018book}.
Thus, the true values of states from $A$ to $E$ are
$\frac{1}{6},\frac{2}{6},\frac{3}{6},\frac{4}{6},\frac{5}{6}$, respectively.
The discount factor $\gamma=1.0$.
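These values can be verified from the Bellman equation of the equiprobable policy: with $\gamma=1$, zero intermediate rewards, and terminal values taken as $0$ (left) and $1$ (right), every non-terminal state satisfies
\begin{equation*}
V(s)=\tfrac{1}{2}V(s_{\text{left}})+\tfrac{1}{2}V(s_{\text{right}}),
\end{equation*}
whose solution is linear in the state index, e.g., $V(C)=\frac{3}{6}$.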
There are three standard kinds of features for random-walk problems: tabular
features, inverted features, and dependent features \cite{sutton2009fast}.
The feature matrices corresponding to the three random walks are shown in Appendix \ref{experimentaldetails}.
Experiments in the Random-walk environment are conducted
in an on-policy manner.
\begin{figure}
\begin{center}
\input{main/pic/randomwalk.tex}
\caption{Random walk.}
\label{randomwalk}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\input{main/pic/BairdExample.tex}
\caption{7-state version of Baird's off-policy counterexample.}
\label{bairdexample}
\end{center}
\end{figure}
\textbf{Baird's off-policy counterexample:} This task is well known as a
counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As
shown in Figure \ref{bairdexample}, the reward for every transition is zero, so the true values are zero for all states under any policy. The behaviour policy
chooses the actions represented by solid lines with probability $\frac{1}{7}$
and the actions represented by dotted lines with probability $\frac{6}{7}$. The
target policy is expected to choose the solid-line action with probability greater than $\frac{1}{7}$;
in this paper, it chooses the solid-line action with probability $1$.
The discount factor $\gamma =0.99$, and the feature matrix is
defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
\textbf{Maze}: The learning agent must find a shortest path from the upper
left corner to the lower right corner. In each state,
there are four alternative actions: $up$, $down$, $left$, and $right$, each of which
moves the agent deterministically to the corresponding neighbouring state, except when
\begin{wrapfigure}{r}{3cm}
\centering
\includegraphics[scale=0.15]{main/pic/maze_13_13.pdf}
% \caption{The 2-state counterexample.}
\end{wrapfigure}
a movement is blocked by an obstacle or the edge
of the maze. Rewards are $-1$ on all transitions until the
agent reaches the goal state.
The discount factor $\gamma=0.99$, and states $s$ are represented by tabular
features. The maximum number of moves per episode is set to 1000.
\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are
selected from the official gym website and correspond to the following
versions: ``CliffWalking-v0'', ``MountainCar-v0'', and ``Acrobot-v1''.
For specific details, please refer to the official gym website.
The maximum number of steps for the Mountain Car environment is set to 1000,
while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding.
Please refer to Appendix \ref{experimentaldetails} for the selection of learning rates for all experiments.
\subsection{Experimental Results and Analysis}
\begin{figure}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Dependent]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/dependent_new.pdf}
\label{DependentFull}
}
\subfigure[Tabular]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/tabular_new.pdf}
\label{TabularFull}
}
\\
\subfigure[Inverted]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/inverted_new.pdf}
\label{InvertedFull}
}
\subfigure[Counterexample]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/counterexample_quanju_new.pdf}
\label{CounterExampleFull}
}
\caption{Learning curves of four evaluation environments.}
\label{Evaluation_full}
\end{center}
\vskip -0.2in
\end{figure}
% The learning rates of all algorithms in different environments are shown in Table \ref{lrofways}.
% Figure \ref{Complete_full} shows the experimental curves of different algorithms in four environments.
The policy evaluation experiments compare the performance of the VMTD,
VMTDC, TD, and TDC algorithms, with the vertical axis unified as RVBE.
For policy evaluation, however, the criteria for evaluating
algorithms vary: the objective function minimized by our proposed
algorithms differs from that of the other algorithms. Therefore, to
ensure fair comparisons, this study only contrasts algorithms
quantitatively in the control settings.
This study compares the performance of Sarsa, Q-learning, GQ(0),
AC, VMSarsa, VMQ, and VMGQ(0) in four control environments.
% All experiments involved in this paper were run independently for 100 times.
The learning curves of the algorithms in the
policy evaluation experiments and the control experiments are
shown in Figures \ref{Evaluation_full} and \ref{Complete_full}, respectively.
The shaded areas in Figures \ref{Evaluation_full} and \ref{Complete_full} represent the standard deviation (std).
In the random-walk tasks, VMTD and VMTDC exhibit excellent performance,
outperforming TD and TDC in the case of dependent random-walk.
In the 7-state counterexample task, TD diverges,
while VMTDC converges and performs better than TDC.
From the update formula, it can be observed that the VMTD algorithm, like TDC,
is also an adjustment or correction of the TD update.
What is more surprising is that VMTD also maintains
convergence and demonstrates the best performance.
In Maze, Mountain Car, and Acrobot,
the convergence speed of VMSarsa, VMQ, and VMGQ(0) has
been significantly improved compared to Sarsa, Q-learning,
and GQ(0), respectively. The performance of the AC algorithm
is at an intermediate level. The performances of VMSarsa,
VMQ, and VMGQ(0) in these three experimental environments
have no significant differences.
In Cliff Walking, Sarsa and
VMSarsa converge to slightly worse solutions compared to
other algorithms. The convergence speed of VMSarsa is significantly
better than that of Sarsa. The convergence speed of VMGQ(0) and VMQ
is better than other algorithms, and the performance of VMGQ(0) is
slightly better than that of VMQ.
In summary, the performance of VMSarsa,
VMQ, and VMGQ(0) is better than that of other algorithms.
In the Cliff Walking environment,
the performance of VMGQ(0) is slightly better than that of
VMSarsa and VMQ. In the other three experimental environments,
the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\section{Introduction}
\label{introduction}
Reinforcement learning can be mainly divided into two
categories: value-based reinforcement learning
and policy gradient-based reinforcement learning. This
paper focuses on temporal difference (TD) learning based on
linearly approximated value functions. Research in this area is
usually divided into two steps: the first step is to establish the convergence of an algorithm, and the second
step is to accelerate it.
In terms of stability, \citet{sutton1988learning} established the
convergence of on-policy TD(0), and \citet{tsitsiklis1997analysis}
established the convergence of on-policy TD($\lambda$).
However, ``The deadly triad'' consisting of off-policy learning,
bootstrapping, and function approximation makes
the stability a difficult problem \citep{Sutton2018book}.
To solve this problem, convergent off-policy temporal difference
learning algorithms are proposed, e.g., BR \cite{baird1995residual},
GTD \cite{sutton2008convergent}, GTD2 and TDC \cite{sutton2009fast},
ETD \cite{sutton2016emphatic}, and MRetrace \cite{chen2023modified}.
In terms of acceleration, \citet{hackman2012faster}
proposed the Hybrid TD algorithm with an on-policy matrix.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
true stochastic algorithms, i.e., GTD-MP and GTD2-MP, from
a convex-concave saddle-point formulation.
Second-order methods are used to accelerate TD learning,
e.g., Quasi Newton TD \cite{givchi2015quasi} and
accelerated TD (ATD) \citep{pan2017accelerated}.
\citet{hallak2016generalized} introduced a new parameter
to reduce the variance of ETD.
\citet{zhang2022truncated} proposed truncated ETD with a lower variance.
Variance reduced TD, which applies the direct variance reduction technique of \citet{johnson2013accelerating}, was proposed by \citet{korda2015td}
and analysed by \citet{xu2019reanalysis}.
How to further improve the convergence rates of reinforcement learning
algorithms is currently still an open problem.
Algorithm stability is prominently reflected in the changes
to the objective function, transitioning from mean squared
errors (MSE) \citep{Sutton2018book} to mean squared Bellman errors (MSBE) \cite{baird1995residual}, then to
the norm of the expected TD update (NEU) \cite{sutton2009fast}, and further to
mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}. On the other hand, algorithm
acceleration is more centered around optimizing the iterative
update formula of the algorithm itself without altering the
objective function, thereby speeding up the convergence rate
of the algorithm. The emergence of new optimization objective
functions often leads to the development of novel algorithms.
The introduction of new algorithms, in turn, tends to inspire
researchers to explore methods for accelerating algorithms,
leading to the iterative creation of increasingly superior algorithms.
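For reference, the error objectives mentioned above can be written, in terms of the TD error $\delta$ and feature vector $\phi$, in their standard forms as
\begin{equation*}
\text{MSBE}(\theta)=\mathbb{E}\big[(\mathbb{E}[\delta|s])^2\big],\quad
\text{NEU}(\theta)=\mathbb{E}[\delta\phi]^{\top}\mathbb{E}[\delta\phi],\quad
\text{MSPBE}(\theta)=\mathbb{E}[\delta\phi]^{\top}\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[\delta\phi],
\end{equation*}
which are exactly the quantities whose variance-based counterparts (VBE, VNEU, and VPBE) are introduced in this paper.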
The kernel loss function can be optimized using standard
gradient-based methods, addressing the issue of double
sampling in the residual gradient algorithm \cite{feng2019kernel}. It ensures convergence
in both on-policy and off-policy scenarios. The logistic Bellman
error is convex and smooth in the action-value function parameters,
with bounded gradients \cite{basserrano2021logistic}. In contrast, the squared Bellman error is
not convex in the action-value function parameters, and RL algorithms
based on recursive optimization using it are known to be unstable.
% The value-based algorithms mentioned above aim to
% minimize some errors, e.g., mean squared errors \citep{Sutton2018book},
% mean squared Bellman errors \cite{baird1995residual}, norm
% of the expected TD update \cite{sutton2009fast},
% mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}, etc.
A new objective function is therefore needed; yet the objective functions mentioned above are all some form of error.
Is minimizing an error the only option for value-based reinforcement learning?
For policy evaluation experiments,
differences in objective functions may result
in inconsistent fixed points. This inconsistency
makes it difficult to uniformly compare the superiority
of algorithms derived from different objective functions.
However, for control experiments, since the choice of actions
depends on the relative values of the Q values rather than their
absolute values, the presence of solution bias is acceptable.
Based on this observation, we propose alternate objective functions
instead of minimizing errors. We minimize Variance of Bellman Error (VBE),
Variance of Projected Bellman Error (VPBE), and Variance of the norm of the expected TD update (VNEU)
and derive Variance Minimization (VM) algorithms.
These algorithms preserve the invariance of the optimal policy in the control environments,
while significantly reducing the variance of gradient estimation,
and thus hastening convergence.
The contributions of this paper are as follows:
(1) Introduction of novel objective functions based on
the invariance of the optimal policy.
(2) Derivation of several variance minimization algorithms, including both on-policy and off-policy variants.
(3) Proof of their convergence.
(4) Analysis of the convergence rate of the on-policy algorithm.
(5) Experiments demonstrating the faster convergence speed of the proposed algorithms.
\section{Variance Minimization Algorithms}
\subsection{Motivation}
As shown
in Table \ref{example_bias}, although there is a bias between the
true values and the predicted values, action $a_3$ is
still chosen under the greedy policy.
In contrast, supervised learning is usually used to predict quantities such as temperature, humidity, or morbidity, where an overly large bias could have serious consequences.
\begin{table}[t]
\caption{$Q$ values with and without a constant bias: the greedy
action selection is unchanged.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcc}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1& 5 \\
$Q(s, a_1)$ & 2& 6 \\
$Q(s, a_2)$ & 3& 7 \\
$Q(s, a_3)$ & 4& 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$& $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
In addition, reward shaping can significantly speed up learning by adding a shaping
reward $F(s,s')$ to the original reward $r$,
where $F(s,s')$ is the general form of any state-based shaping reward.
Static potential-based reward shaping (Static PBRS) maintains the policy invariance if the
shaping reward follows from $F(s,s')=\gamma
f(s')-f(s)$ \cite{ng1999policy}.
This means that we can make changes to the TD error $\delta = r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi $ while still ensuring the invariance of the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \theta^{\top}\phi'-\theta^{\top}\phi - \omega,
\end{equation*}
where $\omega$ is a constant, acting as a static PBRS.
This also means that, after introducing reward shaping, algorithms whose optimization goal
is to minimize errors may end up with a larger or smaller bias. Fortunately,
as discussed above, bias is acceptable in reinforcement
learning.
However, the problem is that selecting an appropriate
$\omega$ requires expert knowledge. This forces us to learn
$\omega$ dynamically, i.e., $\omega=\omega_t$. Dynamic PBRS can also maintain the policy
invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$,
where $t$ is the time step at which the agent reaches state $s$
\cite{devlin2012dynamic}.
However, this result requires the convergence guarantee of the dynamic potential
function $f(s,t)$. If $f(s,t)$ does not converge as the time-step
$t\rightarrow\infty$, the Q-values of dynamic PBRS are not
guaranteed to converge.
Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$.
Thus, $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$
is a dynamic PBRS. And if $\omega_t$ eventually converges, the dynamic potential
function $f(s,t)$ converges as well.
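Indeed, substituting this potential into the shaping reward gives
\begin{equation*}
F_{\omega_t}(s,s')=\gamma\frac{\omega_t}{\gamma-1}-\frac{\omega_t}{\gamma-1}=\frac{(\gamma-1)\,\omega_t}{\gamma-1}=\omega_t.
\end{equation*}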
Bias is the expected difference between the predicted value
and the true value. Therefore, under the premise of bootstrapping, a natural first choice is
to let $\omega \doteq \mathbb{E}[\mathbb{E}[\delta|s]]=\mathbb{E}[\delta]$.
It is well known that the optimization objectives of linear TD(0) (semi-gradient) and linear TDC are as follows, respectively:
\begin{equation*}
\theta^{*}= \arg \min_{\theta} \mathbb{E}[(\mathbb{E}[\delta |s])^2],
\end{equation*}
and
\begin{equation*}
\theta^{*}=\arg \min_{\theta} \mathbb{E}[\delta \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1} \mathbb{E}[\delta\phi].
\end{equation*}
As a result, two novel objective functions and their corresponding algorithms are proposed,
where $\omega$ is subsequently proven to converge, meaning that these two algorithms maintain the invariance of the optimal policy.
\subsection{Variance Minimization TD Learning: VMTD}
For on-policy learning,
a novel objective function, Variance of Bellman Error (VBE), is proposed as follows:
\begin{equation}
\begin{array}{ccl}
\arg \min_{\theta}\text{VBE}(\theta)&=&\arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\mathbb{E}[\mathbb{E}[\delta|s]])^2]\\
&=&\arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2].
\end{array}
\end{equation}
Clearly, the goal is no longer to minimize the Bellman error.
First, the update of the parameter $\omega$ is derived directly based on
stochastic gradient descent:
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\label{omega}
\end{equation}
where $\delta_k$ is the TD error as follows:
\begin{equation}
\delta_k = r+\gamma
\theta_k^{\top}\phi_{k}'-\theta_k^{\top}\phi_k.
\label{delta}
\end{equation}
Then, based on stochastic semi-gradient descent, the update of
the parameter $\theta$ is as follows:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k.
\label{theta}
\end{equation}
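These updates can be read off as stochastic (semi-)gradient steps on the VBE objective: holding the other variable fixed,
\begin{equation*}
-\frac{1}{2}\nabla_{\omega}\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2]=\mathbb{E}[\delta]-\omega,
\qquad
-\frac{1}{2}\nabla_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2]\approx\mathbb{E}[(\mathbb{E}[\delta|s]-\omega)\phi],
\end{equation*}
where the semi-gradient with respect to $\theta$ ignores the dependence of the bootstrap target $\gamma\theta^{\top}\phi'$ on $\theta$; sampling these expectations yields (\ref{omega}) and (\ref{theta}).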
The pseudocode of the VMTD algorithm is shown in Algorithm \ref{alg:algorithm 1}.
For control tasks, two extensions of VMTD are named VMSarsa and VMQ respectively,
and the update formulas are shown below:
\begin{equation}
\theta_{k+1}\leftarrow
\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi(s_k,a_k).
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\end{equation}
where the TD error $\delta_k$ in VMSarsa is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \theta_{k}^{\top}\phi(s_{k+1},a_{k+1}) - \theta_{k}^{\top}\phi(s_{k},a_{k}),
\label{deltaSarsa}
\end{equation}
and the TD error $\delta_k$ in VMQ is:
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \max_{a\in A}\theta_{k}^{\top}\phi(s_{k+1},a) - \theta_{k}^{\top}\phi(s_{k},a_{k}).
\label{deltaQ}
\end{equation}
\begin{algorithm}[t]
\caption{VMTD algorithm with linear function approximation in the on-policy setting}
\label{alg:algorithm 1}
\begin{algorithmic}
\STATE {\bfseries Input:} $\theta_{0}$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$ and $\beta_t$
\REPEAT
\STATE For any episode, initialize $\theta_{0}$ arbitrarily, $\omega_{0}$ to $0$, and $\gamma \in (0,1]$; $\alpha_t$ and $\beta_t$ are constant step sizes.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to policy $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t}'-\theta_t^{\top}\phi_t$
\STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t(\delta_t-\omega_t)\phi_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t)$
\STATE $S_t \leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we employ a projection operator.
The objective function is called Variance of Projected Bellman Error (VPBE),
and the corresponding algorithm is called VMTDC.
\begin{equation}
\begin{array}{ccl}
\text{VPBE}(\theta)&=&\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta])\phi]\\
&=&\mathbb{E}[(\delta-\omega) \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi],
\end{array}
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$.
The derivation of the VMTDC algorithm is the same
as that of the TDC algorithm; the only difference is that the original $\delta$ is replaced by $\delta-\omega$.
Therefore, the update formulas of VMTDC are easily obtained as follows:
\begin{equation}
\theta_{k+1}\leftarrow\theta_{k}+\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k)\\
- \gamma\phi(s_{k+1})(\phi^{\top} (s_k) u_k)],
\label{thetavmtdc}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\omega_k - \phi^{\top} (s_k) u_k]\phi(s_k),
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
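As with TDC, these updates correspond to a stochastic gradient step on the VPBE, in which the auxiliary weight vector $u_k$ estimates the slowly varying quantity $\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi]$, since (with $\omega$ treated as fixed)
\begin{equation*}
-\frac{1}{2}\nabla_{\theta}\text{VPBE}(\theta)=\mathbb{E}[(\delta-\omega)\phi]-\gamma\mathbb{E}[\phi'\phi^{\top}]\,\mathbb{E}[\phi\phi^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\phi].
\end{equation*}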
The pseudocode of the VMTDC algorithm for the importance-sampling scenario is shown in Algorithm \ref{alg:algorithm 2} of Appendix \ref{proofth2}.
We now introduce an improved version of the GQ(0) algorithm, named VMGQ(0):
\begin{equation}
\begin{array}{ccl}
\theta_{k+1}\leftarrow\theta_{k}&+&\alpha_{k}[(\delta_{k}- \omega_k) \phi(s_k,a_k)\\
&-& \gamma\phi(s_{k+1},A^{*}_{k+1})(\phi^{\top} (s_k,a_k) u_k)],
\end{array}
\end{equation}
\begin{equation}
u_{k+1}\leftarrow u_{k}+\zeta_{k}[(\delta_{k}-\omega_k) - \phi^{\top} (s_k,a_k) u_k]\phi(s_k,a_k),
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k- \omega_k),
\end{equation}
where $\delta_{k}$ is given by (\ref{deltaQ}) and $A^{*}_{k+1}={\arg \max}_{a}(\theta_{k}^{\top}\phi(s_{k+1},a))$.
This paper also introduces an additional parameter $\omega$ into the GTD and GTD2 algorithms. For details, please refer to the appendix.
\resizebox{6cm}{4cm}{
\begin{tikzpicture}[smooth]
\node[coordinate] (origin) at (0.3,0) {};
\node[coordinate] (num7) at (3,0) {};
\node[coordinate] (num1) at (1,2.5) {};
\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {};
\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {};
\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {};
\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {};
\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {};
\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {};
\path (num1) ++ (0:1cm) node (num2) [coordinate] {};
\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
%\draw[->](0,0) -- (1,1);
%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
%\fill (0.5,0.5) circle (0.5);
%\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
coordinates{(num7) (origin) (origin_above) (origin_aright)};
\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)};
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
\draw[->,>=stealth,line width = 0.02cm] (a)--(g);
\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
\end{tikzpicture}
}
\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
\tikzstyle{init} = [pin edge={to-,thin,black}]
\resizebox{6cm}{1cm}{
\begin{tikzpicture}[node distance=1.5cm, auto, >=latex]
\node [block] (o) {};
\node (p) [left of=o, node distance=0.5cm, coordinate] {o};
\node [shape=circle, int] (a) [right of=o] {$A$};
\node (b) [left of=a, node distance=1.5cm, coordinate] {a};
\node [shape=circle, int] (c) [right of=a] {$B$};
\node (d) [left of=c, node distance=1.5cm, coordinate] {c};
\node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$};
\node (f) [left of=e, node distance=1.5cm, coordinate] {e};
\node [shape=circle, int] (g) [right of=e] {$D$};
\node (h) [left of=g, node distance=1.5cm, coordinate] {g};
\node [shape=circle, int] (i) [right of=g] {$E$};
\node (j) [left of=i, node distance=1.5cm, coordinate] {i};
\node [block] (k) [right of=i] {};
\node (l) [left of=k, node distance=0.5cm, coordinate] {k};
\path[->] (o) edge node {$0$} (a);
\path[<->] (a) edge node {$0$} (c);
\path[<->] (c) edge node {$0$} (e);
\path[<->] (e) edge node {$0$} (g);
\path[<->] (g) edge node {$0$} (i);
\draw[->] (i) edge node {$1$} (k);
\end{tikzpicture}
}
\section{Background}
\label{preliminaries}
A reinforcement learning agent interacts with an environment: it observes states,
makes sequential decisions that influence the environment, and obtains
rewards.
Consider an infinite-horizon discounted
Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma
\rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$
is a finite set of actions of the agent;
$R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward
function; $P:S\times A\times S \rightarrow [0,1]$ is the transition
probability distribution; and $\gamma\in (0,1)$
is the discount factor \cite{Sutton2018book}.
Due to the requirements of online learning, value iteration based on sampling
is considered in this paper.
In each sampling, an experience (or transition) $\langle s, a, s', r\rangle$ is
obtained.
A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the
agent is to find an optimal policy $\pi^*$ that maximizes the expected
discounted cumulative reward in the long run.
The state value function $V^{\pi}(s)$ for a stationary policy $\pi$ is
defined as:
\begin{equation*}
V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s].
\label{valuefunction}
\end{equation*}
The linear value function for state $s\in S$ is defined as:
\begin{equation}
V_{{\theta}}(s):= {\theta}^{\top}{\phi}(s) = \sum_{i=1}^{m}
\theta_i \phi_i(s),
\label{linearvaluefunction}
\end{equation}
where ${\theta}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in
\mathbb{R}^m$ is a parameter vector,
${\phi}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature
function defined on state space $S$, and $m$ is the feature size.
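For example, with tabular (one-hot) features, where $\phi_i(s)=1$ if $s=i$ and $0$ otherwise, (\ref{linearvaluefunction}) reduces to $V_{\theta}(s)=\theta_s$, i.e., one parameter per state.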
Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems.
To deal with the well-known curse of dimensionality of large-scale MDPs, the value
function is usually approximated by a linear model, kernel methods, decision
trees, neural networks, etc. This paper focuses on the linear model, where
features are usually hand-coded by domain experts.
TD learning can also be used to find optimal policies. The problem of finding an optimal policy is
often called the control problem. Two popular TD control methods are Sarsa and Q-learning: the former is an on-policy
method, while the latter is off-policy.
It is well known that the TDC algorithm \cite{sutton2009fast} guarantees
convergence under off-policy conditions, while the off-policy TD algorithm may diverge. The
objective function of TDC is the MSPBE.
TDC is essentially an adjustment or correction of the TD update so that it
follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm
is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted
as GQ(0).
\begin{figure*}[htb]
\vskip 0.2in
\begin{center}
\subfigure[Maze]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/maze_complete.pdf}
\label{MazeFull}
}
\subfigure[Cliff Walking]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/cw_complete.pdf}
\label{CliffWalkingFull}
}
\\
\subfigure[Mountain Car]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/mt_complete.pdf}
\label{MountainCarFull}
}
\subfigure[Acrobot]{
\includegraphics[width=0.3\columnwidth, height=0.2\columnwidth]{main/pic/Acrobot_complete.pdf}
\label{AcrobotFull}
}
\caption{Learning curves of four control environments.}
\label{Complete_full}
\end{center}
\vskip -0.2in
\end{figure*}
\section{Related Work}
\subsection{Difference between VMQ and R-learning}
\begin{table*}[htb]
\centering
\caption{Difference between R-learning and tabular VMQ.}
\vskip 0.15in
\begin{tabular}{c|c}
\hline
Algorithm & Update formula \\
\hline
R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\
&$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\
tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\
&$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\
\hline
\end{tabular}
\label{differenceRandVMQ}
\vskip -0.1in
\end{table*}
Tabular VMQ's update formula bears some resemblance
to R-learning's update formula. As shown in Table \ref{differenceRandVMQ}, the update formulas of the two algorithms have the following differences:
\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average
reward, rather than the cumulative reward, by learning an estimate
of the average reward. This estimate $m$ is then used to update the Q-values.
In contrast, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$.
\\(2) When $\gamma=1$ in the tabular VMQ update formula, the
R-learning update formula is formally
the same as the tabular VMQ update formula.
Therefore, the R-learning algorithm can formally be
considered a special case of the VMQ algorithm.
\subsection{Variance Reduction for TD Learning}
The TD with centering (CTD) algorithm \cite{korda2015td}
directly applies variance reduction techniques to
the TD algorithm. The CTD algorithm updates its parameters using the
average gradient of a batch of Markovian samples and a projection operator.
Unfortunately, the authors’ analysis of the CTD algorithm contains technical
errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm that updates
its parameters using the average gradient of a batch of i.i.d. samples. The
authors of VRTD provide a technically sound analysis to demonstrate the
advantages of variance reduction.
\subsection{Variance Reduction for Policy Gradient Algorithms}
Policy gradient algorithms are a class of reinforcement
learning algorithms that directly optimize cumulative rewards.
REINFORCE is a Monte Carlo algorithm that estimates
gradients through sampling, but it may suffer from high variance.
Baselines are introduced to reduce variance and to
accelerate learning \cite{Sutton2018book}. In Actor-Critic methods,
the value function is used as a baseline and bootstrapping
is employed to reduce variance, which also accelerates convergence \cite{Sutton2018book}.
TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal}
use generalized advantage
estimation, which combines multi-step bootstrapping and Monte Carlo
estimation to reduce variance, making gradient estimation more stable and
accelerating convergence.
In Variance Minimization,
the incorporation of $\omega \doteq \mathbb{E}[\delta]$
bears a striking resemblance to the use of a baseline
in policy gradient methods. The introduction of a baseline
in policy gradient techniques does not alter
the expected value of the update;
rather, it can significantly reduce the variance of gradient estimation.
The addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization
preserves the invariance of the optimal
policy while stabilizing gradient estimation,
reducing the variance of gradient estimation,
and hastening convergence.
\section{Theoretical Analysis}
The purpose of this section is to establish the stability of the VMTD
and VMTDC algorithms, and to present a corollary on the convergence rate of VMTD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $A$ is non-singular.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth1} for the detailed proof.
Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale
algorithms. VMTD satisfies the assumptions of this theorem, leading
to the following corollary.
\begin{corollary}
\label{corollary4_2}
Consider the Sparsely Projected variant of VMTD. Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$,
$0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have
\begin{equation}
||\theta'_{k} - \theta^{*}|| \le C_{3,\theta} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}}
\end{equation}
\begin{equation}
||\omega'_{k} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\beta / 2}},
\end{equation}
\end{corollary}
where $d_1$ and $d_2$ represent the dimensions of $\theta$ and $\omega$, respectively. For VMTD, $d_2 =1$.
The meanings of $N_3$, $C_{3,\theta}$, and $C_{3,\omega}$ are explained in \cite{dalal2020tale}.
The formulas for $\theta'_{k}$ and $\omega'_{k}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}).
Please refer to Appendix \ref{proofcorollary4_2} for the detailed proof.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC.
Let the step-size sequences $\alpha_k$, $\zeta_k$, and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\zeta_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\zeta_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\zeta_k),
$
$
\zeta_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$.
Assume that $A$ and $C$ are non-singular matrices.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to Appendix \ref{proofth2} for the detailed proof.
\begin{thebibliography}{}
\bibitem[\protect\citeauthoryear{Baird}{1995}]{baird1995residual}
Leemon Baird.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In {\em Proc. 12th Int. Conf. Mach. Learn.}, pages 30--37, 1995.
\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic}
Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu.
\newblock Logistic q-learning.
\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021.
\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode}
Vivek~S Borkar and Sean~P Meyn.
\newblock The ode method for convergence of stochastic approximation and reinforcement learning.
\newblock {\em SIAM J. Control Optim.}, 38(2):447--469, 2000.
\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic}
Vivek~S Borkar.
\newblock Stochastic approximation with two time scales.
\newblock {\em Syst. \& Control Letters}, 29(5):291--294, 1997.
\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified}
Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao.
\newblock Modified retrace for off-policy temporal difference learning.
\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023.
\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale}
Gal Dalal, Balazs Szorenyi, and Gugan Thoppe.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020.
\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic}
Sam Devlin and Daniel Kudenko.
\newblock Dynamic potential-based reward shaping.
\newblock In {\em Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, pages 433--440, 2012.
\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel}
Yihao Feng, Lihong Li, and Qiang Liu.
\newblock A kernel loss for solving the bellman equation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019.
\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi}
Arash Givchi and Maziar Palhang.
\newblock Quasi newton temporal difference learning.
\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015.
\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster}
Leah Hackman.
\newblock {\em Faster Gradient-TD Algorithms}.
\newblock PhD thesis, University of Alberta, 2012.
\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized}
Assaf Hallak, Aviv Tamar, Remi Munos, and Shie Mannor.
\newblock Generalized emphatic temporal difference learning: bias-variance analysis.
\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016.
\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent}
Morris~W Hirsch.
\newblock Convergent activation dynamics in continuous time networks.
\newblock {\em Neural Netw.}, 2(5):331--349, 1989.
\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating}
R.~Johnson and T.~Zhang.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013.
\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td}
Nathaniel Korda and Prashanth La.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In {\em International conference on machine learning}, pages 626--634. PMLR, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Finite-sample analysis of proximal gradient td algorithms.
\newblock In {\em Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In {\em Proceedings of the International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016.
\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal}
Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018.
\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient}
Hamid~Reza Maei.
\newblock {\em Gradient temporal-difference learning algorithms}.
\newblock PhD thesis, University of Alberta, 2011.
\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy}
Andrew~Y Ng, Daishi Harada, and Stuart Russell.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In {\em Proc. 16th Int. Conf. Mach. Learn.}, pages 278--287, 1999.
\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated}
Yangchen Pan, Adam White, and Martha White.
\newblock Accelerated gradient temporal difference learning.
\newblock In {\em Proceedings of the 21st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust}
J.~Schulman, S.~Levine, P.~Abbeel, M.~Jordan, and P.~Moritz.
\newblock Trust region policy optimization.
\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015.
\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal}
J.~Schulman, F.~Wolski, P.~Dhariwal, A.~Radford, and O.~Klimov.
\newblock Proximal policy optimization algorithms.
\newblock {\em arXiv preprint arXiv:1707.06347}, 2017.
\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement}
Anton Schwartz.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In {\em Proc. 10th Int. Conf. Mach. Learn.}, volume 298, pages 298--305, 1993.
\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book}
Richard~S. Sutton and Andrew~G. Barto.
\newblock {\em Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent}
Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616. Cambridge, MA: MIT Press, 2008.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast}
R.S. Sutton, H.R. Maei, D.~Precup, S.~Bhatnagar, D.~Silver, C.~Szepesv{\'a}ri, and E.~Wiewiora.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In {\em Proc. 26th Int. Conf. Mach. Learn.}, pages 993--1000, 2009.
\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
Richard~S Sutton, A~Rupam Mahmood, and Martha White.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.
\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
Richard~S Sutton.
\newblock Learning to predict by the methods of temporal differences.
\newblock {\em Machine learning}, 3(1):9--44, 1988.
\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
John~N Tsitsiklis and Benjamin Van~Roy.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In {\em International Conference on Learning Representations}, 2019.
\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
T.~Xu, Z.~Wang, Y.~Zhou, and Y.~Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.
\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
Shangtong Zhang and Shimon Whiteson.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022.
\end{thebibliography}
\BOOKMARK [1][-]{section.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1
\BOOKMARK [1][-]{section.2}{\376\377\000B\000a\000c\000k\000g\000r\000o\000u\000n\000d}{}% 2
\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{}% 3
\BOOKMARK [2][-]{subsection.3.1}{\376\377\000M\000o\000t\000i\000v\000a\000t\000i\000o\000n}{section.3}% 4
\BOOKMARK [2][-]{subsection.3.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D}{section.3}% 5
\BOOKMARK [2][-]{subsection.3.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000M\000i\000n\000i\000m\000i\000z\000a\000t\000i\000o\000n\000\040\000T\000D\000C\000\040\000L\000e\000a\000r\000n\000i\000n\000g\000:\000\040\000V\000M\000T\000D\000C}{section.3}% 6
\BOOKMARK [1][-]{section.4}{\376\377\000T\000h\000e\000o\000r\000e\000t\000i\000c\000a\000l\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{}% 7
\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000S\000t\000u\000d\000i\000e\000s}{}% 8
\BOOKMARK [2][-]{subsection.5.1}{\376\377\000T\000e\000s\000t\000i\000n\000g\000\040\000T\000a\000s\000k\000s}{section.5}% 9
\BOOKMARK [2][-]{subsection.5.2}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000R\000e\000s\000u\000l\000t\000s\000\040\000a\000n\000d\000\040\000A\000n\000a\000l\000y\000s\000i\000s}{section.5}% 10
\BOOKMARK [1][-]{section.6}{\376\377\000R\000e\000l\000a\000t\000e\000d\000\040\000W\000o\000r\000k}{}% 11
\BOOKMARK [2][-]{subsection.6.1}{\376\377\000D\000i\000f\000f\000e\000r\000e\000n\000c\000e\000\040\000b\000e\000t\000w\000e\000e\000n\000\040\000V\000M\000Q\000\040\000a\000n\000d\000\040\000R\000-\000l\000e\000a\000r\000n\000i\000n\000g}{section.6}% 12
\BOOKMARK [2][-]{subsection.6.2}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000T\000D\000\040\000L\000e\000a\000r\000n\000i\000n\000g}{section.6}% 13
\BOOKMARK [2][-]{subsection.6.3}{\376\377\000V\000a\000r\000i\000a\000n\000c\000e\000\040\000R\000e\000d\000u\000c\000t\000i\000o\000n\000\040\000f\000o\000r\000\040\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\000\040\000A\000l\000g\000o\000r\000i\000t\000h\000m\000s}{section.6}% 14
\BOOKMARK [1][-]{section.7}{\376\377\000C\000o\000n\000c\000l\000u\000s\000i\000o\000n\000\040\000a\000n\000d\000\040\000F\000u\000t\000u\000r\000e\000\040\000W\000o\000r\000k}{}% 15
\BOOKMARK [1][-]{appendix.A}{\376\377\000R\000e\000l\000e\000v\000a\000n\000t\000\040\000p\000r\000o\000o\000f\000s}{}% 16
\BOOKMARK [2][-]{subsection.A.1}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0001}{appendix.A}% 17
\BOOKMARK [2][-]{subsection.A.2}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000C\000o\000r\000o\000l\000l\000a\000r\000y\000\040\0004\000.\0002}{appendix.A}% 18
\BOOKMARK [2][-]{subsection.A.3}{\376\377\000P\000r\000o\000o\000f\000\040\000o\000f\000\040\000T\000h\000e\000o\000r\000e\000m\000\040\0004\000.\0003}{appendix.A}% 19
\BOOKMARK [1][-]{appendix.B}{\376\377\000E\000x\000p\000e\000r\000i\000m\000e\000n\000t\000a\000l\000\040\000d\000e\000t\000a\000i\000l\000s}{}% 20
\documentclass{article}
% if you need to pass options to natbib, use, e.g.:
% \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2024
% ready for submission
\usepackage{neurips_2024}
% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
% \usepackage[preprint]{neurips_2024}
% to compile a camera-ready version, add the [final] option, e.g.:
% \usepackage[final]{neurips_2024}
% to avoid loading the natbib package, add option nonatbib:
% \usepackage[nonatbib]{neurips_2024}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage{hyperref} % hyperlinks
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
\usepackage{xcolor} % colors
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{wrapfig}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\usepackage{algorithm}
\usepackage{algorithmic}
\title{Is Minimizing Errors the Only Option for Value-based Reinforcement Learning?}
% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.
\author{%
David S.~Hippocampus\thanks{Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213 \\
\texttt{hippo@cs.cranberry-lemon.edu} \\
% examples of more authors
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \AND
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
% \And
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email} \\
}
\begin{document}
\maketitle
\begin{abstract}
Existing research on value-based reinforcement learning
focuses on minimizing errors.
However, is error minimization really the only option
for value-based reinforcement learning?
We observe that a policy's action-selection probabilities
typically depend only on the relative values of the actions,
not on their absolute values
(a brief illustration follows this abstract).
Based on this observation, we propose the objective
of variance minimization instead of error minimization,
derive several new variance minimization algorithms, each of which introduces a parameter $\omega$,
and analyze their convergence rates and evaluate them experimentally.
The experimental results show that our proposed variance minimization algorithms
converge considerably faster.
\end{abstract}
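As a minimal illustration of the observation above (a sketch under the assumption of a softmax action-selection rule; the symbols $Q(s,a)$ for an estimated action value and $c$ for an arbitrary constant are introduced only for this example), shifting every action value by the same constant leaves the action-selection probabilities unchanged:
\begin{equation*}
\pi(a \mid s) \;=\; \frac{\exp\bigl(Q(s,a)+c\bigr)}{\sum_{b}\exp\bigl(Q(s,b)+c\bigr)}
\;=\; \frac{\exp\bigl(Q(s,a)\bigr)}{\sum_{b}\exp\bigl(Q(s,b)\bigr)}.
\end{equation*}
A greedy policy is likewise unaffected, since $\arg\max_{a}\bigl(Q(s,a)+c\bigr)=\arg\max_{a}Q(s,a)$.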
\input{main/introduction.tex}
\input{main/preliminaries.tex}
\input{main/motivation.tex}
\input{main/theory.tex}
\input{main/experiment.tex}
\input{main/relatedwork.tex}
\input{main/conclusion.tex}
\appendix
\input{main/appendix.tex}
\bibliographystyle{named}
\bibliography{neurips_2024}
% \bibliographystyle{neurips_2024}
\end{document}