Commit ff4dbddd by GongYu

AAAI25 template; filled in everything that could be written so far. Experiments to be supplemented as soon as possible.

\relax
\bibstyle{aaai25}
\citation{sutton1988learning}
\citation{tsitsiklis1997analysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2008convergent}
\citation{sutton2009fast}
\citation{sutton2016emphatic}
\citation{chen2023modified}
\citation{hackman2012faster}
\citation{liu2015finite,liu2016proximal,liu2018proximal}
\citation{givchi2015quasi}
\citation{pan2017accelerated}
\citation{hallak2016generalized}
\citation{zhang2022truncated}
\citation{johnson2013accelerating}
\citation{korda2015td}
\citation{xu2019reanalysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2009fast}
\citation{sutton2009fast}
\citation{feng2019kernel}
\citation{basserrano2021logistic}
\newlabel{introduction}{{}{1}}
\citation{Sutton2018book}
\citation{Sutton2018book}
\citation{ng1999policy}
\citation{devlin2012dynamic}
\newlabel{preliminaries}{{}{2}}
\newlabel{valuefunction}{{}{2}}
\newlabel{linearvaluefunction}{{1}{2}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{example_bias}{{1}{2}}
\newlabel{alg:algorithm 1}{{1}{3}}
\newlabel{omega}{{3}{3}}
\newlabel{delta}{{4}{3}}
\newlabel{theta}{{5}{3}}
\newlabel{deltaSarsa}{{8}{3}}
\newlabel{deltaQ}{{9}{3}}
\newlabel{alg:algorithm 2}{{2}{3}}
\newlabel{thetavmtdc}{{11}{3}}
\newlabel{uvmtdc}{{12}{3}}
\newlabel{omegavmtdc}{{13}{3}}
\newlabel{fvmetd}{{18}{3}}
\newlabel{thetavmetd}{{19}{3}}
\newlabel{omegavmetd}{{20}{3}}
\citation{borkar1997stochastic}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{alg:algorithm 5}{{3}{4}}
\newlabel{theorem1}{{1}{4}}
\newlabel{th1proof}{{}{4}}
\newlabel{thetaFast}{{22}{4}}
\newlabel{omegaFast}{{23}{4}}
\newlabel{omegaFastFinal}{{24}{4}}
\newlabel{omegaInfty}{{25}{4}}
\citation{Sutton2018book}
\citation{sutton2009fast}
\citation{baird1995residual,sutton2009fast}
\newlabel{odetheta}{{26}{5}}
\newlabel{covariance}{{27}{5}}
\newlabel{odethetafinal}{{28}{5}}
\newlabel{theorem2}{{2}{5}}
\newlabel{randomwalk}{{1}{5}}
\newlabel{bairdexample}{{2}{5}}
\newlabel{theorem3}{{3}{5}}
\citation{schwartz1993reinforcement}
\citation{korda2015td}
\citation{xu2020reanalysis}
\citation{Sutton2018book}
\citation{Sutton2018book}
\citation{schulman2015trust}
\citation{schulman2017proximal}
\bibdata{aaai25}
\bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}}
\bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano et~al.}}{{Bas-Serrano, Curi, Krause, and Neu}}}
\bibcite{borkar1997stochastic}{{3}{1997}{{Borkar}}{{}}}
\bibcite{borkar2000ode}{{4}{2000}{{Borkar and Meyn}}{{}}}
\bibcite{chen2023modified}{{5}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}}
\bibcite{devlin2012dynamic}{{6}{2012}{{Devlin and Kudenko}}{{}}}
\bibcite{feng2019kernel}{{7}{2019}{{Feng, Li, and Liu}}{{}}}
\bibcite{givchi2015quasi}{{8}{2015}{{Givchi and Palhang}}{{}}}
\bibcite{hackman2012faster}{{9}{2012}{{Hackman}}{{}}}
\bibcite{hallak2016generalized}{{10}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}}
\bibcite{hirsch1989convergent}{{11}{1989}{{Hirsch}}{{}}}
\bibcite{johnson2013accelerating}{{12}{2013}{{Johnson and Zhang}}{{}}}
\bibcite{korda2015td}{{13}{2015}{{Korda and La}}{{}}}
\bibcite{liu2018proximal}{{14}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}}
\bibcite{liu2015finite}{{15}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
\bibcite{liu2016proximal}{{16}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
\bibcite{ng1999policy}{{17}{1999}{{Ng, Harada, and Russell}}{{}}}
\bibcite{pan2017accelerated}{{18}{2017}{{Pan, White, and White}}{{}}}
\bibcite{schulman2015trust}{{19}{2015}{{Schulman et~al.}}{{Schulman, Levine, Abbeel, Jordan, and Moritz}}}
\bibcite{schulman2017proximal}{{20}{2017}{{Schulman et~al.}}{{Schulman, Wolski, Dhariwal, Radford, and Klimov}}}
\bibcite{schwartz1993reinforcement}{{21}{1993}{{Schwartz}}{{}}}
\bibcite{sutton2009fast}{{22}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
\bibcite{sutton1988learning}{{23}{1988}{{Sutton}}{{}}}
\bibcite{Sutton2018book}{{24}{2018}{{Sutton and Barto}}{{}}}
\bibcite{sutton2008convergent}{{25}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}}
\bibcite{sutton2016emphatic}{{26}{2016}{{Sutton, Mahmood, and White}}{{}}}
\bibcite{tsitsiklis1997analysis}{{27}{1997}{{Tsitsiklis and Van~Roy}}{{}}}
\bibcite{xu2019reanalysis}{{28}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
\bibcite{xu2020reanalysis}{{29}{2020}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
\bibcite{zhang2022truncated}{{30}{2022}{{Zhang and Whiteson}}{{}}}
\gdef \@abspage@last{7}
\begin{thebibliography}{30}
\providecommand{\natexlab}[1]{#1}
\bibitem[{Baird et~al.(1995)}]{baird1995residual}
Baird, L.; et~al. 1995.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proc. 12th Int. Conf. Mach. Learn.}, 30--37.
\bibitem[{Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu}]{basserrano2021logistic}
Bas-Serrano, J.; Curi, S.; Krause, A.; and Neu, G. 2021.
\newblock Logistic Q-Learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, 3610--3618.
\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Syst. \& Control Letters}, 29(5): 291--294.
\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469.
\bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified}
Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023.
\newblock Modified Retrace for Off-Policy Temporal Difference Learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, 303--312. PMLR.
\bibitem[{Devlin and Kudenko(2012)}]{devlin2012dynamic}
Devlin, S.; and Kudenko, D. 2012.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, 433--440.
\bibitem[{Feng, Li, and Liu(2019)}]{feng2019kernel}
Feng, Y.; Li, L.; and Liu, Q. 2019.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 15430--15441.
\bibitem[{Givchi and Palhang(2015)}]{givchi2015quasi}
Givchi, A.; and Palhang, M. 2015.
\newblock Quasi newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, 159--172.
\bibitem[{Hackman(2012)}]{hackman2012faster}
Hackman, L. 2012.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Ph.D. thesis, University of Alberta.
\bibitem[{Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor}]{hallak2016generalized}
Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016.
\newblock Generalized emphatic temporal difference learning: bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637.
\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Netw.}, 2(5): 331--349.
\bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating}
Johnson, R.; and Zhang, T. 2013.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, 315--323.
\bibitem[{Korda and La(2015)}]{korda2015td}
Korda, N.; and La, P. 2015.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International conference on machine learning}, 626--634. PMLR.
\bibitem[{Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}]{liu2018proximal}
Liu, B.; Gemp, I.; Ghavamzadeh, M.; Liu, J.; Mahadevan, S.; and Petrik, M. 2018.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63: 461--494.
\bibitem[{Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2015finite}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2015.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, 504--513.
\bibitem[{Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2016proximal}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2016.
\newblock Proximal Gradient Temporal Difference Learning Algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, 4195--4199.
\bibitem[{Ng, Harada, and Russell(1999)}]{ng1999policy}
Ng, A.~Y.; Harada, D.; and Russell, S. 1999.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proc. 16th Int. Conf. Mach. Learn.}, 278--287.
\bibitem[{Pan, White, and White(2017)}]{pan2017accelerated}
Pan, Y.; White, A.; and White, M. 2017.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, 2464--2470.
\bibitem[{Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz}]{schulman2015trust}
Schulman, J.; Levine, S.; Abbeel, P.; Jordan, M.; and Moritz, P. 2015.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, 1889--1897.
\bibitem[{Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov}]{schulman2017proximal}
Schulman, J.; Wolski, F.; Dhariwal, P.; Radford, A.; and Klimov, O. 2017.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}.
\bibitem[{Schwartz(1993)}]{schwartz1993reinforcement}
Schwartz, A. 1993.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proc. 10th Int. Conf. Mach. Learn.}, volume 298, 298--305.
\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proc. 26th Int. Conf. Mach. Learn.}, 993--1000.
\bibitem[{Sutton(1988)}]{sutton1988learning}
Sutton, R.~S. 1988.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine learning}, 3(1): 9--44.
\bibitem[{Sutton and Barto(2018)}]{Sutton2018book}
Sutton, R.~S.; and Barto, A.~G. 2018.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition.
\bibitem[{Sutton, Maei, and Szepesv{\'a}ri(2008)}]{sutton2008convergent}
Sutton, R.~S.; Maei, H.~R.; and Szepesv{\'a}ri, C. 2008.
\newblock A Convergent $O(n)$ Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1609--1616. Cambridge, MA: MIT Press.
\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.
\bibitem[{Tsitsiklis and Van~Roy(1997)}]{tsitsiklis1997analysis}
Tsitsiklis, J.~N.; and Van~Roy, B. 1997.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1075--1081.
\bibitem[{Xu et~al.(2019)Xu, Wang, Zhou, and Liang}]{xu2019reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2019.
\newblock Reanalysis of Variance Reduced Temporal Difference Learning.
\newblock In \emph{International Conference on Learning Representations}.
\bibitem[{Xu et~al.(2020)Xu, Wang, Zhou, and Liang}]{xu2020reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2020.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}.
\bibitem[{Zhang and Whiteson(2022)}]{zhang2022truncated}
Zhang, S.; and Whiteson, S. 2022.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23(1): 6859--6917.
\end{thebibliography}
This is BibTeX, Version 0.99d (TeX Live 2023)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: anonymous-submission-latex-2025.aux
The style file: aaai25.bst
Database file #1: aaai25.bib
You've used 30 entries,
2840 wiz_defined-function locations,
758 strings with 9820 characters,
and the built_in function-call counts, 22009 in all, are:
= -- 1873
> -- 1021
< -- 1
+ -- 379
- -- 340
* -- 1463
:= -- 3421
add.period$ -- 123
call.type$ -- 30
change.case$ -- 252
chr.to.int$ -- 31
cite$ -- 30
duplicate$ -- 1509
empty$ -- 1557
format.name$ -- 414
if$ -- 4459
int.to.chr$ -- 1
int.to.str$ -- 1
missing$ -- 302
newline$ -- 154
num.names$ -- 120
pop$ -- 709
preamble$ -- 1
purify$ -- 213
quote$ -- 0
skip$ -- 793
stack$ -- 0
substring$ -- 1161
swap$ -- 801
text.length$ -- 1
text.prefix$ -- 0
top$ -- 0
type$ -- 267
warning$ -- 0
while$ -- 188
width$ -- 0
write$ -- 394
%File: anonymous-submission-latex-2025.tex
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai25} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\usepackage{bm}
\usepackage{esvect}
\usepackage{multirow}
\theoremstyle{plain}
% \newtheorem{theorem}{Theorem}[section]
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%
% These are recommended to typeset listings but not required. See the subsubsection on listings. Remove this block if you don't have listings in your paper.
\usepackage{newfloat}
\usepackage{listings}
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{%
basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
aboveskip=0pt,belowskip=0pt,%
showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2025.1)
}
% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
% The file aaai25.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%
% Title
% Your title must be in mixed case, not sentence case.
% That means all verbs (including short verbs like be, is, using, and go),
% nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while
% articles, conjunctions, and prepositions are lower case unless they
% directly follow a colon or long dash
\title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
\author{
%Authors
% All authors must be in the same font size and format.
Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
AAAI Style Contributions by Pater Patel Schneider,
Sunil Issar,\\
J. Scott Penberthy,
George Ferguson,
Hans Guesgen,
Francisco Cruz\equalcontrib,
Marc Pujol-Gonzalez\equalcontrib
}
\affiliations{
%Affiliations
\textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
% If you have multiple authors and multiple affiliations
% use superscripts in text and roman font to identify them.
% For example,
% Sunil Issar\textsuperscript{\rm 2},
% J. Scott Penberthy\textsuperscript{\rm 3},
% George Ferguson\textsuperscript{\rm 4},
% Hans Guesgen\textsuperscript{\rm 5}
% Note that the comma should be placed after the superscript
1101 Pennsylvania Ave, NW Suite 300\\
Washington, DC 20004 USA\\
% email address must be in roman text type, not monospace or sans serif
proceedings-questions@aaai.org
%
% See more examples next
}
%Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\iffalse
\title{My Publication Title --- Single Author}
\author {
Author Name
}
\affiliations{
Affiliation\\
Affiliation Line 2\\
name@example.com
}
\fi
\iffalse
%Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\title{My Publication Title --- Multiple Authors}
\author {
% Authors
First Author Name\textsuperscript{\rm 1},
Second Author Name\textsuperscript{\rm 2},
Third Author Name\textsuperscript{\rm 1}
}
\affiliations {
% Affiliations
\textsuperscript{\rm 1}Affiliation 1\\
\textsuperscript{\rm 2}Affiliation 2\\
firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com
}
\fi
% REMOVE THIS: bibentry
% This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it.
\usepackage{bibentry}
% END REMOVE bibentry
\begin{document}
\setcounter{theorem}{0}
\maketitle
% \setcounter{theorem}{0}
\begin{abstract}
Existing research on value-based reinforcement learning
generally minimizes some form of error.
However, is error minimization really the only option
for value-based reinforcement learning?
We observe that the action-selection probabilities of a policy
typically depend on the relative values of actions
and have nothing to do with their absolute values.
Based on this observation, we propose the objective
of variance minimization instead of error minimization,
derive several new variance minimization algorithms, each of which introduces an auxiliary parameter $\omega$,
and provide an analysis of the convergence rate together with experiments.
The experimental results show that the proposed variance minimization algorithms
converge considerably faster.
\end{abstract}
% Uncomment the following to link to your code, datasets, an extended version or similar.
%
% \begin{links}
% \link{Code}{https://aaai.org/example/code}
% \link{Datasets}{https://aaai.org/example/datasets}
% \link{Extended version}{https://aaai.org/example/extended-version}
% \end{links}
\input{main/introduction.tex}
\input{main/preliminaries.tex}
\input{main/motivation.tex}
\input{main/theory.tex}
\input{main/experiment.tex}
\input{main/relatedwork.tex}
\input{main/conclusion.tex}
\bibliography{aaai25}
\end{document}
\section{Conclusion and Future Work}
Value-based reinforcement learning typically minimizes
some form of error as its optimization objective.
As an alternative, this study proposes two new objective
functions, the VBE and the VPBE, and derives several variance minimization algorithms, including VMTD,
VMTDC, and VMETD.
% The VMTD algorithm
% is essentially an adjustment or correction to the traditional
% TD update.
% Both
% algorithms are capable of stabilizing gradient estimation, reducing
% the variance of gradient estimation and accelerating convergence.
All algorithms demonstrated superior performance in policy
evaluation and control experiments.
Future work includes, but is not limited
to, (1) analysis of the convergence rates of VMTDC and VMETD,
(2) extensions of VBE and VPBE to multi-step returns, and
(3) extensions to nonlinear approximations, such as neural networks.
\section{Experimental Studies}
This section assesses algorithm performance through experiments,
which are divided into policy evaluation experiments and control experiments.
\subsection{Testing Tasks}
\textbf{Random-walk:} as shown in Figure \ref{randomwalk}, all episodes
start in the center state, $C$, and proceed either left or right by one state on each
step, with equal probability. Episodes terminate at either the extreme left or
the extreme right, with a reward of $+1$ for terminating on the right and
$0$ otherwise. In this task, the true value of each state is the
probability of terminating on the right when starting from that state
\cite{Sutton2018book}.
Thus, the true values of states $A$ through $E$ are
$\frac{1}{6},\frac{2}{6},\frac{3}{6},\frac{4}{6},\frac{5}{6}$, respectively.
The discount factor is $\gamma=1.0$.
There are three standard kinds of features for random-walk problems: tabular
features, inverted features, and dependent features \cite{sutton2009fast}.
The feature matrices corresponding to the three random walks are shown in the Appendix.
Experiments in the Random-walk environment are conducted in an on-policy setting.
\begin{figure}
\begin{center}
\input{main/pic/randomwalk.tex}
\caption{Random walk.}
\label{randomwalk}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\input{main/pic/BairdExample.tex}
\caption{7-state version of Baird's off-policy counterexample.}
\label{bairdexample}
\end{center}
\end{figure}
\textbf{Baird's off-policy counterexample:} This task is well known as a
counterexample in which TD diverges \cite{baird1995residual,sutton2009fast}. As
shown in Figure \ref{bairdexample}, the reward for every transition is zero; thus the true values are zero for all states under any policy. The behaviour policy
chooses the actions represented by solid lines with probability $\frac{1}{7}$
and the actions represented by dotted lines with probability $\frac{6}{7}$. The
target policy is expected to choose the solid-line action with probability greater than $\frac{1}{7}$;
in this paper it chooses the solid-line action with probability $1$.
The discount factor is $\gamma =0.99$, and the feature matrix is
% defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
defined in the Appendix.
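For reference, the per-step importance sampling ratio used by standard off-policy TD methods, $\rho(s,a)=\pi(a|s)/\mu(a|s)$ with target policy $\pi$ and behaviour policy $\mu$, takes only two values under these two policies:
\begin{equation*}
\rho(s,a)=
\begin{cases}
\dfrac{1}{1/7}=7, & \text{if $a$ is the solid-line action},\\[4pt]
\dfrac{0}{6/7}=0, & \text{if $a$ is a dotted-line action}.
\end{cases}
\end{equation*}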
\textbf{Maze}: The learning agent must find a shortest path from the upper
left corner to the lower right corner. In each state,
there are four available actions: $up$, $down$, $left$, and $right$, each of which
moves the agent deterministically to the corresponding neighbouring state, except when
% \begin{wrapfigure}{r}{3cm}
% \centering
% \includegraphics[scale=0.15]{main/pic/maze_13_13.pdf}
% % \caption{The 2-state counterexample.}
% \end{wrapfigure}
a movement is blocked by an obstacle or the edge
of the maze. The reward is $-1$ on every transition until the
agent reaches the goal state.
The discount factor is $\gamma=0.99$, and states $s$ are represented by tabular
features. The maximum number of moves per episode is set to 1000.
\begin{figure}
\centering
\includegraphics[scale=0.20]{main/pic/maze_13_13.pdf}
\caption{Maze.}
\end{figure}
\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are
selected from the official Gym website and correspond to the following
versions: ``CliffWalking-v0'', ``MountainCar-v0'', and ``Acrobot-v1''.
For further details, please refer to the official Gym documentation.
The maximum number of steps per episode in Mountain Car is set to 1000,
while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding.
Please refer to the Appendix for the learning rates used in all experiments.
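For completeness, the sketch below illustrates the grid-style tile coding assumed for these two tasks; the numbers of tilings and tiles shown here are illustrative defaults, not the settings used in the experiments (those are listed in the Appendix).
\begin{lstlisting}[language=Python]
import numpy as np

def tile_features(state, low, high, n_tilings=8, tiles_per_dim=8):
    """Indices of active binary features for a continuous state.
    Each tiling is offset by a fraction of a tile width; exactly
    one feature per tiling is active (value 1)."""
    state = np.asarray(state, dtype=float)
    low = np.asarray(low, dtype=float)
    high = np.asarray(high, dtype=float)
    scaled = (state - low) / (high - low)   # normalise to [0, 1]^d
    active = []
    for t in range(n_tilings):
        offset = t / float(n_tilings * tiles_per_dim)
        coords = np.floor((scaled + offset) * tiles_per_dim).astype(int)
        coords = np.minimum(coords, tiles_per_dim)   # clip upper edge
        index = t   # flatten (tiling, grid cell) into one index
        for c in coords:
            index = index * (tiles_per_dim + 1) + int(c)
        active.append(index)
    return active
\end{lstlisting}
In Mountain Car, for instance, \texttt{low} and \texttt{high} would be the lower and upper bounds of the (position, velocity) observation space.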
\subsection{Experimental Results and Analysis}
% \begin{figure}[htb]
% \vskip 0.2in
% \begin{center}
% \subfigure[Dependent]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/dependent_new.pdf}
% \label{DependentFull}
% }
% \subfigure[Tabular]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/tabular_new.pdf}
% \label{TabularFull}
% }
% \\
% \subfigure[Inverted]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/inverted_new.pdf}
% \label{InvertedFull}
% }
% \subfigure[counterexample]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/counterexample_quanju_new.pdf}
% \label{CounterExampleFull}
% }
% \caption{Learning curves of four evaluation environments.}
% \label{Evaluation_full}
% \end{center}
% \vskip -0.2in
% \end{figure}
% \begin{figure*}[htb]
% \vskip 0.2in
% \begin{center}
% \subfigure[Maze]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/maze_complete.pdf}
% \label{MazeFull}
% }
% \subfigure[Cliff Walking]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/cw_complete.pdf}
% \label{CliffWalkingFull}
% }
% \\
% \subfigure[Mountain Car]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/mt_complete.pdf}
% \label{MountainCarFull}
% }
% \subfigure[Acrobot]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/Acrobot_complete.pdf}
% \label{AcrobotFull}
% }
% \caption{Learning curves of four control environments.}
% \label{Complete_full}
% \end{center}
% \vskip -0.2in
% \end{figure*}
% \begin{table*}[htb]
% \centering
% \caption{Difference between R-learning and tabular VMQ.}
% \vskip 0.15in
% \begin{tabular}{c|cc}
% \hline
% algorithms&update formula \\
% \hline
% R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\
% &$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\
% tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\
% &$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\
% \hline
% \end{tabular}
% \label{differenceRandVMQ}
% \vskip -0.1in
% \end{table*}
The experiment needs further elaboration.
% For policy evaluation experiments, compare the performance of the VMTD,
% VMTDC, TD, and TDC algorithms.
% The vertical axis is unified as RVBE.
% For policy evaluation experiments, the criteria for evaluating
% algorithms vary. The objective function minimized by our proposed
% new algorithm differs from that of other algorithms. Therefore, to
% ensure fairness in comparisons, this study only contrasts algorithm
% experiments in controlled settings.
% This study will compare the performance of Sarsa, Q-learning, GQ(0),
% AC, VMSarsa, VMQ, and VMGQ(0) in four control environments.
% % All experiments involved in this paper were run independently for 100 times.
% The learning curves of the algorithms corresponding to
% policy evaluation experiments and control experiments are
% shown in Figures \ref{Evaluation_full} and \ref{Complete_full}, respectively.
% The shaded area in Figure \ref{Evaluation_full}, \ref{Complete_full} represents the standard deviation (std).
% In the random-walk tasks, VMTD and VMTDC exhibit excellent performance,
% outperforming TD and TDC in the case of dependent random-walk.
% In the 7-state example counter task, TD diverges,
% while VMTDC converges and performs better than TDC.
% From the update formula, it can be observed that the VMTD algorithm, like TDC,
% is also an adjustment or correction of the TD update.
% What is more surprising is that VMTD also maintains
% convergence and demonstrates the best performance.
% In Maze, Mountain Car, and Acrobot,
% the convergence speed of VMSarsa, VMQ, and VMGQ(0) has
% been significantly improved compared to Sarsa, Q-learning,
% and GQ(0), respectively. The performance of the AC algorithm
% is at an intermediate level. The performances of VMSarsa,
% VMQ, and VMGQ(0) in these three experimental environments
% have no significant differences.
% In Cliff Walking, Sarsa and
% VMSarsa converge to slightly worse solutions compared to
% other algorithms. The convergence speed of VMSarsa is significantly
% better than that of Sarsa. The convergence speed of VMGQ(0) and VMQ
% is better than other algorithms, and the performance of VMGQ(0) is
% slightly better than that of VMQ.
% In summary, the performance of VMSarsa,
% VMQ, and VMGQ(0) is better than that of other algorithms.
% In the Cliff Walking environment,
% the performance of VMGQ(0) is slightly better than that of
% VMSarsa and VMQ. In the other three experimental environments,
% the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\section{Introduction}
\label{introduction}
Reinforcement learning methods can be broadly divided into two
categories: value-based reinforcement learning
and policy-gradient-based reinforcement learning. This
paper focuses on temporal difference (TD) learning with
linearly approximated value functions. Research in this area
usually proceeds in two steps: the first step is to establish the convergence of an algorithm, and the second
step is to accelerate it.
In terms of stability, \citet{sutton1988learning} established the
convergence of on-policy TD(0), and \citet{tsitsiklis1997analysis}
established the convergence of on-policy TD($\lambda$).
However, ``the deadly triad'' consisting of off-policy learning,
bootstrapping, and function approximation makes
stability a difficult problem \citep{Sutton2018book}.
To address this problem, convergent off-policy temporal difference
learning algorithms have been proposed, e.g., BR \cite{baird1995residual},
GTD \cite{sutton2008convergent}, GTD2 and TDC \cite{sutton2009fast},
ETD \cite{sutton2016emphatic}, and MRetrace \cite{chen2023modified}.
In terms of acceleration, \citet{hackman2012faster}
proposed the Hybrid TD algorithm with an on-policy matrix.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
true stochastic algorithms, i.e., GTD-MP and GTD2-MP, based on
a convex-concave saddle-point formulation.
Second-order methods have been used to accelerate TD learning,
e.g., Quasi-Newton TD \cite{givchi2015quasi} and
accelerated TD (ATD) \citep{pan2017accelerated}.
\citet{hallak2016generalized} introduced a new parameter
to reduce the variance of ETD.
\citet{zhang2022truncated} proposed a truncated ETD with lower variance.
Variance-reduced TD, which applies the variance reduction technique of \citet{johnson2013accelerating} directly to TD, was proposed by \citet{korda2015td}
and reanalysed by \citet{xu2019reanalysis}.
How to further improve the convergence rate of reinforcement learning
algorithms remains an open problem.
Algorithm stability is prominently reflected in changes
to the objective function, transitioning from the mean squared
error (MSE) \citep{Sutton2018book} to the mean squared Bellman error (MSBE) \cite{baird1995residual}, then to
the norm of the expected TD update \cite{sutton2009fast}, and further to
the mean squared projected Bellman error (MSPBE) \cite{sutton2009fast}. Algorithm
acceleration, on the other hand, centers on optimizing the iterative
update formula of the algorithm itself without altering the
objective function, thereby speeding up convergence.
New optimization objective
functions often lead to the development of novel algorithms,
and new algorithms, in turn, tend to inspire
researchers to explore methods for accelerating them,
leading to increasingly effective algorithms.
The kernel loss can be optimized with standard
gradient-based methods, addressing the double-sampling issue
of the residual gradient algorithm \cite{feng2019kernel}, and it ensures convergence
in both on-policy and off-policy scenarios. The logistic Bellman
error is convex and smooth in the action-value function parameters,
with bounded gradients \cite{basserrano2021logistic}. In contrast, the squared Bellman error is
not convex in the action-value function parameters, and RL algorithms
based on recursively optimizing it are known to be unstable.
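To recall the double-sampling issue: the MSBE can be written as
\begin{equation*}
\mathrm{MSBE}(\bm{\theta})=\mathbb{E}_{s}\Big[\big(\mathbb{E}\left[\delta \mid s\right]\big)^{2}\Big],
\end{equation*}
where $\delta$ is the TD error. Since $\mathbb{E}[\delta^{2}\mid s]=\big(\mathbb{E}[\delta\mid s]\big)^{2}+\mathrm{Var}(\delta\mid s)$, squaring a single sampled TD error yields a biased estimate of this objective, and an unbiased gradient requires two independent samples of the successor state.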
% The value-based algorithms mentioned above aim to
% minimize some errors, e.g., mean squared errors \citep{Sutton2018book},
% mean squared Bellman errors \cite{baird1995residual}, norm
% of the expected TD update \cite{sutton2009fast},
% mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}, etc.
All of the objective functions mentioned above, however, are some form of error.
Is minimizing error the only option for value-based reinforcement learning?
For policy evaluation,
differences in objective functions may result
in different fixed points, which
makes it difficult to compare algorithms derived from different
objective functions on a uniform footing.
For control, however, since the choice of actions
depends on the relative values of the Q-values rather than on their
absolute values, a bias in the solution is acceptable.
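In particular, both greedy and softmax action selection are invariant to a constant shift $c$ of all action values in a state:
\begin{gather*}
\arg\max_{a}\big(Q(s,a)-c\big)=\arg\max_{a}Q(s,a),\\
\frac{e^{Q(s,a)-c}}{\sum_{b}e^{Q(s,b)-c}}=\frac{e^{Q(s,a)}}{\sum_{b}e^{Q(s,b)}}.
\end{gather*}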
Based on this observation, we propose alternative objective functions
instead of minimizing errors: we minimize the Variance of the Bellman Error (VBE) and
the Variance of the Projected Bellman Error (VPBE)
and derive Variance Minimization (VM) algorithms.
These algorithms preserve the optimal policy in control environments
while significantly reducing the variance of gradient estimation,
thus hastening convergence.
The contributions of this paper are as follows:
(1) Introduction of novel objective functions based on
the invariance of the optimal policy.
(2) Derivation of three variance minimization algorithms, covering both on-policy and off-policy settings.
(3) Proofs of their convergence.
(4) Analysis of the convergence rate of the on-policy algorithm.
(5) Experiments demonstrating the faster convergence of the proposed algorithms.
\resizebox{5cm}{3cm}{
\begin{tikzpicture}[smooth]
\node[coordinate] (origin) at (0.3,0) {};
\node[coordinate] (num7) at (3,0) {};
\node[coordinate] (num1) at (1,2.5) {};
\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {};
\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {};
\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {};
\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {};
\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {};
\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {};
\path (num1) ++ (0:1cm) node (num2) [coordinate] {};
\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
%\draw[->](0,0) -- (1,1);
%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
%\fill (0.5,0.5) circle (0.5);
%\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
coordinates{(num7) (origin) (origin_above) (origin_aright)};
\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)};
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
\draw[->,>=stealth,line width = 0.02cm] (a)--(g);
\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
\end{tikzpicture}
}
% \tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
% \tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
% \tikzstyle{init} = [pin edge={to-,thin,black}]
% \resizebox{8cm}{1.2cm}{
% \begin{tikzpicture}[node distance=1.5cm,auto,>=latex']
% \node [block] (o) {};
% \node (p) [left of=o,node distance=0.5cm, coordinate] {o};
% \node [shape=circle,int] (a) [right of=o]{$A$};
% \node (b) [left of=a,node distance=1.5cm, coordinate] {a};
% \node [shape=circle,int] (c) [right of=a] {$B$};
% \node (d) [left of=c,node distance=1.5cm, coordinate] {c};
% \node [shape=circle,int, pin={[init]above:$$}] (e) [right of=c]{$C$};
% \node (f) [left of=e,node distance=1.5cm, coordinate] {e};
% \node [shape=circle,int] (g) [right of=e] {$D$};
% \node (h) [left of=g,node distance=1.5cm, coordinate] {g};
% \node [shape=circle,int] (i) [right of=g] {$E$};
% \node (j) [left of=i,node distance=1.5cm, coordinate] {i};
% \node [block] (k) [right of=i] {};
% \node (l) [left of=k,node distance=0.5cm, coordinate] {k};
% \path[<-] (o) edge node {$0$} (a);
% \path[<->] (a) edge node {$0$} (c);
% \path[<->] (c) edge node {$0$} (e);
% \path[<->] (e) edge node {$0$} (g);
% \path[<->] (g) edge node {$0$} (i);
% \draw[->] (i) edge node {$1$} (k);
% \end{tikzpicture}
% }
\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
\tikzstyle{init} = [pin edge={to-,thin,black}]
\resizebox{5cm}{1cm}{
\begin{tikzpicture}[node distance=1.5cm, auto, >=latex]
\node [block] (o) {};
\node (p) [left of=o, node distance=0.5cm, coordinate] {o};
\node [shape=circle, int] (a) [right of=o] {$A$};
\node (b) [left of=a, node distance=1.5cm, coordinate] {a};
\node [shape=circle, int] (c) [right of=a] {$B$};
\node (d) [left of=c, node distance=1.5cm, coordinate] {c};
\node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$};
\node (f) [left of=e, node distance=1.5cm, coordinate] {e};
\node [shape=circle, int] (g) [right of=e] {$D$};
\node (h) [left of=g, node distance=1.5cm, coordinate] {g};
\node [shape=circle, int] (i) [right of=g] {$E$};
\node (j) [left of=i, node distance=1.5cm, coordinate] {i};
\node [block] (k) [right of=i] {};
\node (l) [left of=k, node distance=0.5cm, coordinate] {k};
\path[->] (o) edge node {$0$} (a);
\path[<->] (a) edge node {$0$} (c);
\path[<->] (c) edge node {$0$} (e);
\path[<->] (e) edge node {$0$} (g);
\path[<->] (g) edge node {$0$} (i);
\draw[->] (i) edge node {$1$} (k);
\end{tikzpicture}
}
\section{Background}
\label{preliminaries}
A reinforcement learning agent interacts with its environment: it observes states,
makes sequential decisions that influence the environment, and obtains
rewards.
Consider an infinite-horizon discounted
Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma
\rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$
is a finite set of actions of the agent;
$R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward
function; $P:S\times A\times S \rightarrow [0,1]$ is the transition
probability distribution; and $\gamma\in (0,1)$
is the discount factor \cite{Sutton2018book}.
Owing to the requirements of online learning, this paper considers value iteration based on sampling,
where each sample yields an experience (or transition) $\langle s, a, s', r\rangle$.
A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the
agent is to find an optimal policy $\pi^*$ that maximizes the expected
discounted cumulative reward over the long run.
State value function $V^{\pi}(s)$ for a stationary policy $\pi$ is
defined as:
\begin{equation*}
V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s].
\label{valuefunction}
\end{equation*}
Linear value function for state $s\in S$ is defined as:
\begin{equation}
V_{{\theta}}(s):= {\bm{\theta}}^{\top}{\bm{\phi}}(s) = \sum_{i=1}^{m}
\theta_i \phi_i(s),
\label{linearvaluefunction}
\end{equation}
where ${\bm{\theta}}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in
\mathbb{R}^m$ is a parameter vector,
${\bm{\phi}}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature
function defined on state space $S$, and $m$ is the feature size.
Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems.
To deal with the well-known curse of dimensionality of large-scale MDPs, the value
function is usually approximated by a linear model, kernel methods, decision
trees, or neural networks. This paper focuses on the linear model, where
features are usually hand-coded by domain experts.
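As a concrete reference point, a single semi-gradient TD(0) update of the linear value function in Eq.~(\ref{linearvaluefunction}) can be sketched as follows (standard TD(0), not one of the algorithms proposed later):
\begin{lstlisting}[language=Python]
import numpy as np

def td0_update(theta, phi_s, phi_s_next, reward,
               gamma, alpha, terminal=False):
    """One semi-gradient TD(0) step for V(s) = theta^T phi(s)."""
    v_s = float(theta @ phi_s)
    v_next = 0.0 if terminal else float(theta @ phi_s_next)
    delta = reward + gamma * v_next - v_s   # TD error
    theta = theta + alpha * delta * phi_s   # update along phi(s)
    return theta, delta
\end{lstlisting}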
% TD learning can also be used to find optimal strategies. The problem of finding an optimal policy is
% often called the control problem. Two popular TD methods are Sarsa and Q-leaning. The former is an on-policy
% TD control, while the latter is an off-policy control.
% It is well known that TDC algorithm \cite{sutton2009fast} guarantees
% convergence under off-policy conditions while the off-policy TD algorithm may diverge. The
% objective function of TDC is MSPBE.
% TDC is essentially an adjustment or correction of the TD update so that it
% follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm
% is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted
% as GQ(0).
\section{Related Work}
\subsection{Difference between VMQ and R-learning}
Tabular VMQ's update formula bears some resemblance
to R-learning's update formula. The two update rules, restated below, differ in the following ways:
\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average
reward, rather than the cumulative reward, by learning an estimate
of the average reward. This estimate $m$ is then used to update the Q-values.
In contrast, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$.
\\(2) When $\gamma=1$ in the tabular VMQ update formula, the
R-learning update formula takes the same form as
the tabular VMQ update formula.
Therefore, R-learning can formally be
regarded as a special case of tabular VMQ.
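For reference, the two pairs of updates are (with $s'$ denoting the successor state):
\begin{align*}
\text{R-learning:}\quad
&Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k\big(r_{k+1}-m_{k}+\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)\big),\\
&m_{k+1}\leftarrow m_{k}+\beta_k\big(r_{k+1}+\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-m_{k}\big);\\
\text{tabular VMQ:}\quad
&Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k\big(r_{k+1}+\gamma\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-\omega_k\big),\\
&\omega_{k+1}\leftarrow \omega_{k}+\beta_k\big(r_{k+1}+\gamma\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-\omega_{k}\big).
\end{align*}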
\subsection{Variance Reduction for TD Learning}
The TD with centering (CTD) algorithm \cite{korda2015td}
directly applies variance reduction techniques to
the TD algorithm. The CTD algorithm updates its parameters using the
average gradient of a batch of Markovian samples and a projection operator.
Unfortunately, the authors' analysis of the CTD algorithm contains technical
errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm; it updates
its parameters using the average gradient of a batch of i.i.d. samples. The
authors of VRTD provide a technically sound analysis demonstrating the
advantages of variance reduction.
\subsection{Variance Reduction for Policy Gradient Algorithms}
Policy gradient algorithms are a class of reinforcement
learning algorithms that directly optimize the cumulative reward.
REINFORCE is a Monte Carlo algorithm that estimates
gradients through sampling but may suffer from high variance.
Baselines are introduced to reduce the variance and
accelerate learning \cite{Sutton2018book}. In Actor-Critic methods,
the value function serves as a baseline and bootstrapping
is used to reduce variance, which also accelerates convergence \cite{Sutton2018book}.
TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal}
use generalized advantage
estimation, which combines multi-step bootstrapping and Monte Carlo
estimation to reduce variance, making gradient estimation more stable and
accelerating convergence.
In Variance Minimization,
the incorporation of $\omega \doteq \mathbb{E}[\delta]$
bears a striking resemblance to the use of a baseline
in policy gradient methods. Introducing a baseline
in policy gradient methods does not alter
the expected value of the update,
but it can significantly reduce the variance of the gradient estimate.
Likewise, the addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization
preserves the optimal
policy while stabilizing gradient estimation,
reducing its variance,
and hastening convergence.
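The underlying reason a state-dependent baseline $b(s)$ leaves the expected policy gradient unchanged is the standard identity
\begin{equation*}
\mathbb{E}_{a\sim\pi(\cdot|s)}\big[\nabla_{\bm{\vartheta}}\log\pi(a|s;\bm{\vartheta})\,b(s)\big]
=b(s)\,\nabla_{\bm{\vartheta}}\sum_{a}\pi(a|s;\bm{\vartheta})
=b(s)\,\nabla_{\bm{\vartheta}}1=0,
\end{equation*}
where $\bm{\vartheta}$ denotes the policy parameters.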
\relax
\bibstyle{aaai24}
\citation{sutton2009fast}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{hirsch1989convergent}
\newlabel{proofth2}{{A.1}{1}}
\newlabel{thetavmtdcFastest}{{A-1}{1}}
\newlabel{uvmtdcFastest}{{A-2}{1}}
\newlabel{omegavmtdcFastest}{{A-3}{1}}
\newlabel{omegavmtdcFastestFinal}{{A-4}{1}}
\newlabel{omegavmtdcInfty}{{A-5}{1}}
\newlabel{thetavmtdcFaster}{{A-6}{1}}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar1997stochastic}
\newlabel{uvmtdcFaster}{{A-7}{2}}
\newlabel{uvmtdcFasterFinal}{{A-8}{2}}
\newlabel{uvmtdcInfty}{{A-9}{2}}
\newlabel{thetavmtdcSlowerFinal}{{A-11}{2}}
\newlabel{odethetavmtdcfinal}{{A-12}{2}}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{proofVMETD}{{A.2}{3}}
\newlabel{th1proof}{{A.2}{3}}
\newlabel{thetaFast}{{A-13}{3}}
\newlabel{omegaFast}{{A-14}{3}}
\newlabel{omegaFastFinal}{{A-15}{3}}
\newlabel{omegaInfty}{{A-16}{3}}
\citation{sutton2016emphatic}
\newlabel{odetheta}{{A-17}{4}}
\newlabel{rowsum}{{A-20}{4}}
\newlabel{columnsum}{{A-21}{5}}
\newlabel{odethetafinal}{{A-22}{5}}
\newlabel{mathematicalanalysis}{{B}{5}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{keymatrices}{{1}{5}}
\newlabel{minimumeigenvalues}{{2}{5}}
\newlabel{experimentaldetails}{{C}{5}}
\newlabel{bairdcounterexample}{{\caption@xref {bairdcounterexample}{ on input line 731}}{6}}
\newlabel{randomwalk}{{\caption@xref {randomwalk}{ on input line 754}}{6}}
\newlabel{boyanchain}{{\caption@xref {boyanchain}{ on input line 777}}{6}}
\bibdata{aaai24}
\bibcite{borkar1997stochastic}{{1}{1997}{{Borkar}}{{}}}
\bibcite{borkar2000ode}{{2}{2000}{{Borkar and Meyn}}{{}}}
\bibcite{hirsch1989convergent}{{3}{1989}{{Hirsch}}{{}}}
\bibcite{sutton2009fast}{{4}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
\bibcite{sutton2016emphatic}{{5}{2016}{{Sutton, Mahmood, and White}}{{}}}
\newlabel{lrofways}{{6}{7}}
\gdef \@abspage@last{7}
\begin{thebibliography}{5}
\providecommand{\natexlab}[1]{#1}
\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Syst. \& Control Letters}, 29(5): 291--294.
\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469.
\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Netw.}, 2(5): 331--349.
\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proc. 26th Int. Conf. Mach. Learn.}, 993--1000.
\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.
\end{thebibliography}
This is BibTeX, Version 0.99d (TeX Live 2023)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: anonymous-submission-latex-2024.aux
The style file: aaai24.bst
Database file #1: aaai24.bib
You've used 5 entries,
2840 wiz_defined-function locations,
619 strings with 5446 characters,
and the built_in function-call counts, 3370 in all, are:
= -- 277
> -- 153
< -- 0
+ -- 60
- -- 52
* -- 242
:= -- 547
add.period$ -- 20
call.type$ -- 5
change.case$ -- 36
chr.to.int$ -- 6
cite$ -- 5
duplicate$ -- 223
empty$ -- 240
format.name$ -- 60
if$ -- 649
int.to.chr$ -- 1
int.to.str$ -- 1
missing$ -- 49
newline$ -- 29
num.names$ -- 20
pop$ -- 92
preamble$ -- 1
purify$ -- 34
quote$ -- 0
skip$ -- 96
stack$ -- 0
substring$ -- 200
swap$ -- 128
text.length$ -- 0
text.prefix$ -- 0
top$ -- 0
type$ -- 45
warning$ -- 0
while$ -- 31
width$ -- 0
write$ -- 68