Commit ff4dbddd by GongYu

AAAI25 template; filled in everything that could be written so far. Experiments to be supplemented as soon as possible.

\relax
\bibstyle{aaai25}
\citation{sutton1988learning}
\citation{tsitsiklis1997analysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2008convergent}
\citation{sutton2009fast}
\citation{sutton2016emphatic}
\citation{chen2023modified}
\citation{hackman2012faster}
\citation{liu2015finite,liu2016proximal,liu2018proximal}
\citation{givchi2015quasi}
\citation{pan2017accelerated}
\citation{hallak2016generalized}
\citation{zhang2022truncated}
\citation{johnson2013accelerating}
\citation{korda2015td}
\citation{xu2019reanalysis}
\citation{Sutton2018book}
\citation{baird1995residual}
\citation{sutton2009fast}
\citation{sutton2009fast}
\citation{feng2019kernel}
\citation{basserrano2021logistic}
\newlabel{introduction}{{}{1}}
\citation{Sutton2018book}
\citation{Sutton2018book}
\citation{ng1999policy}
\citation{devlin2012dynamic}
\newlabel{preliminaries}{{}{2}}
\newlabel{valuefunction}{{}{2}}
\newlabel{linearvaluefunction}{{1}{2}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{example_bias}{{1}{2}}
\newlabel{alg:algorithm 1}{{1}{3}}
\newlabel{omega}{{3}{3}}
\newlabel{delta}{{4}{3}}
\newlabel{theta}{{5}{3}}
\newlabel{deltaSarsa}{{8}{3}}
\newlabel{deltaQ}{{9}{3}}
\newlabel{alg:algorithm 2}{{2}{3}}
\newlabel{thetavmtdc}{{11}{3}}
\newlabel{uvmtdc}{{12}{3}}
\newlabel{omegavmtdc}{{13}{3}}
\newlabel{fvmetd}{{18}{3}}
\newlabel{thetavmetd}{{19}{3}}
\newlabel{omegavmetd}{{20}{3}}
\citation{borkar1997stochastic}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{alg:algorithm 5}{{3}{4}}
\newlabel{theorem1}{{1}{4}}
\newlabel{th1proof}{{}{4}}
\newlabel{thetaFast}{{22}{4}}
\newlabel{omegaFast}{{23}{4}}
\newlabel{omegaFastFinal}{{24}{4}}
\newlabel{omegaInfty}{{25}{4}}
\citation{Sutton2018book}
\citation{sutton2009fast}
\citation{baird1995residual,sutton2009fast}
\newlabel{odetheta}{{26}{5}}
\newlabel{covariance}{{27}{5}}
\newlabel{odethetafinal}{{28}{5}}
\newlabel{theorem2}{{2}{5}}
\newlabel{randomwalk}{{1}{5}}
\newlabel{bairdexample}{{2}{5}}
\newlabel{theorem3}{{3}{5}}
\citation{schwartz1993reinforcement}
\citation{korda2015td}
\citation{xu2020reanalysis}
\citation{Sutton2018book}
\citation{Sutton2018book}
\citation{schulman2015trust}
\citation{schulman2017proximal}
\bibdata{aaai25}
\bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}}
\bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano et~al.}}{{Bas-Serrano, Curi, Krause, and Neu}}}
\bibcite{borkar1997stochastic}{{3}{1997}{{Borkar}}{{}}}
\bibcite{borkar2000ode}{{4}{2000}{{Borkar and Meyn}}{{}}}
\bibcite{chen2023modified}{{5}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}}
\bibcite{devlin2012dynamic}{{6}{2012}{{Devlin and Kudenko}}{{}}}
\bibcite{feng2019kernel}{{7}{2019}{{Feng, Li, and Liu}}{{}}}
\bibcite{givchi2015quasi}{{8}{2015}{{Givchi and Palhang}}{{}}}
\bibcite{hackman2012faster}{{9}{2012}{{Hackman}}{{}}}
\bibcite{hallak2016generalized}{{10}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}}
\bibcite{hirsch1989convergent}{{11}{1989}{{Hirsch}}{{}}}
\bibcite{johnson2013accelerating}{{12}{2013}{{Johnson and Zhang}}{{}}}
\bibcite{korda2015td}{{13}{2015}{{Korda and La}}{{}}}
\bibcite{liu2018proximal}{{14}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}}
\bibcite{liu2015finite}{{15}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
\bibcite{liu2016proximal}{{16}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
\bibcite{ng1999policy}{{17}{1999}{{Ng, Harada, and Russell}}{{}}}
\bibcite{pan2017accelerated}{{18}{2017}{{Pan, White, and White}}{{}}}
\bibcite{schulman2015trust}{{19}{2015}{{Schulman et~al.}}{{Schulman, Levine, Abbeel, Jordan, and Moritz}}}
\bibcite{schulman2017proximal}{{20}{2017}{{Schulman et~al.}}{{Schulman, Wolski, Dhariwal, Radford, and Klimov}}}
\bibcite{schwartz1993reinforcement}{{21}{1993}{{Schwartz}}{{}}}
\bibcite{sutton2009fast}{{22}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
\bibcite{sutton1988learning}{{23}{1988}{{Sutton}}{{}}}
\bibcite{Sutton2018book}{{24}{2018}{{Sutton and Barto}}{{}}}
\bibcite{sutton2008convergent}{{25}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}}
\bibcite{sutton2016emphatic}{{26}{2016}{{Sutton, Mahmood, and White}}{{}}}
\bibcite{tsitsiklis1997analysis}{{27}{1997}{{Tsitsiklis and Van~Roy}}{{}}}
\bibcite{xu2019reanalysis}{{28}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
\bibcite{xu2020reanalysis}{{29}{2020}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
\bibcite{zhang2022truncated}{{30}{2022}{{Zhang and Whiteson}}{{}}}
\gdef \@abspage@last{7}
\begin{thebibliography}{30}
\providecommand{\natexlab}[1]{#1}
\bibitem[{Baird et~al.(1995)}]{baird1995residual}
Baird, L.; et~al. 1995.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proc. 12th Int. Conf. Mach. Learn.}, 30--37.
\bibitem[{Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu}]{basserrano2021logistic}
Bas-Serrano, J.; Curi, S.; Krause, A.; and Neu, G. 2021.
\newblock Logistic Q-Learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, 3610--3618.
\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Syst. \& Control Letters}, 29(5): 291--294.
\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469.
\bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified}
Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023.
\newblock Modified Retrace for Off-Policy Temporal Difference Learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, 303--312. PMLR.
\bibitem[{Devlin and Kudenko(2012)}]{devlin2012dynamic}
Devlin, S.; and Kudenko, D. 2012.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems}, 433--440.
\bibitem[{Feng, Li, and Liu(2019)}]{feng2019kernel}
Feng, Y.; Li, L.; and Liu, Q. 2019.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 15430--15441.
\bibitem[{Givchi and Palhang(2015)}]{givchi2015quasi}
Givchi, A.; and Palhang, M. 2015.
\newblock Quasi newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, 159--172.
\bibitem[{Hackman(2012)}]{hackman2012faster}
Hackman, L. 2012.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Ph.D. thesis, University of Alberta.
\bibitem[{Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor}]{hallak2016generalized}
Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016.
\newblock Generalized emphatic temporal difference learning: bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637.
\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Netw.}, 2(5): 331--349.
\bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating}
Johnson, R.; and Zhang, T. 2013.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, 315--323.
\bibitem[{Korda and La(2015)}]{korda2015td}
Korda, N.; and La, P. 2015.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International conference on machine learning}, 626--634. PMLR.
\bibitem[{Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}]{liu2018proximal}
Liu, B.; Gemp, I.; Ghavamzadeh, M.; Liu, J.; Mahadevan, S.; and Petrik, M. 2018.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63: 461--494.
\bibitem[{Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2015finite}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2015.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, 504--513.
\bibitem[{Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2016proximal}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2016.
\newblock Proximal Gradient Temporal Difference Learning Algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, 4195--4199.
\bibitem[{Ng, Harada, and Russell(1999)}]{ng1999policy}
Ng, A.~Y.; Harada, D.; and Russell, S. 1999.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proc. 16th Int. Conf. Mach. Learn.}, 278--287.
\bibitem[{Pan, White, and White(2017)}]{pan2017accelerated}
Pan, Y.; White, A.; and White, M. 2017.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, 2464--2470.
\bibitem[{Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz}]{schulman2015trust}
Schulman, J.; Levine, S.; Abbeel, P.; Jordan, M.; and Moritz, P. 2015.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, 1889--1897.
\bibitem[{Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov}]{schulman2017proximal}
Schulman, J.; Wolski, F.; Dhariwal, P.; Radford, A.; and Klimov, O. 2017.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}.
\bibitem[{Schwartz(1993)}]{schwartz1993reinforcement}
Schwartz, A. 1993.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proc. 10th Int. Conf. Mach. Learn.}, volume 298, 298--305.
\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proc. 26th Int. Conf. Mach. Learn.}, 993--1000.
\bibitem[{Sutton(1988)}]{sutton1988learning}
Sutton, R.~S. 1988.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine learning}, 3(1): 9--44.
\bibitem[{Sutton and Barto(2018)}]{Sutton2018book}
Sutton, R.~S.; and Barto, A.~G. 2018.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition.
\bibitem[{Sutton, Maei, and Szepesv{\'a}ri(2008)}]{sutton2008convergent}
Sutton, R.~S.; Maei, H.~R.; and Szepesv{\'a}ri, C. 2008.
\newblock A Convergent $O(n)$ Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1609--1616. Cambridge, MA: MIT Press.
\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.
\bibitem[{Tsitsiklis and Van~Roy(1997)}]{tsitsiklis1997analysis}
Tsitsiklis, J.~N.; and Van~Roy, B. 1997.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1075--1081.
\bibitem[{Xu et~al.(2019)Xu, Wang, Zhou, and Liang}]{xu2019reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2019.
\newblock Reanalysis of Variance Reduced Temporal Difference Learning.
\newblock In \emph{International Conference on Learning Representations}.
\bibitem[{Xu et~al.(2020)Xu, Wang, Zhou, and Liang}]{xu2020reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2020.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}.
\bibitem[{Zhang and Whiteson(2022)}]{zhang2022truncated}
Zhang, S.; and Whiteson, S. 2022.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23(1): 6859--6917.
\end{thebibliography}
This is BibTeX, Version 0.99d (TeX Live 2023)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: anonymous-submission-latex-2025.aux
The style file: aaai25.bst
Database file #1: aaai25.bib
You've used 30 entries,
2840 wiz_defined-function locations,
758 strings with 9820 characters,
and the built_in function-call counts, 22009 in all, are:
= -- 1873
> -- 1021
< -- 1
+ -- 379
- -- 340
* -- 1463
:= -- 3421
add.period$ -- 123
call.type$ -- 30
change.case$ -- 252
chr.to.int$ -- 31
cite$ -- 30
duplicate$ -- 1509
empty$ -- 1557
format.name$ -- 414
if$ -- 4459
int.to.chr$ -- 1
int.to.str$ -- 1
missing$ -- 302
newline$ -- 154
num.names$ -- 120
pop$ -- 709
preamble$ -- 1
purify$ -- 213
quote$ -- 0
skip$ -- 793
stack$ -- 0
substring$ -- 1161
swap$ -- 801
text.length$ -- 1
text.prefix$ -- 0
top$ -- 0
type$ -- 267
warning$ -- 0
while$ -- 188
width$ -- 0
write$ -- 394
%File: anonymous-submission-latex-2025.tex
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai25} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\usepackage{bm}
\usepackage{esvect}
\usepackage{multirow}
\theoremstyle{plain}
% \newtheorem{theorem}{Theorem}[section]
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
%
% These are recommended to typeset listings but not required. See the subsubsection on listings. Remove this block if you don't have listings in your paper.
\usepackage{newfloat}
\usepackage{listings}
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{%
basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
aboveskip=0pt,belowskip=0pt,%
showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2025.1)
}
% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
% The file aaai25.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%
% Title
% Your title must be in mixed case, not sentence case.
% That means all verbs (including short verbs like be, is, using, and go),
% nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while
% articles, conjunctions, and prepositions are lower case unless they
% directly follow a colon or long dash
\title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
\author{
%Authors
% All authors must be in the same font size and format.
Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
AAAI Style Contributions by Pater Patel Schneider,
Sunil Issar,\\
J. Scott Penberthy,
George Ferguson,
Hans Guesgen,
Francisco Cruz\equalcontrib,
Marc Pujol-Gonzalez\equalcontrib
}
\affiliations{
%Affiliations
\textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
% If you have multiple authors and multiple affiliations
% use superscripts in text and roman font to identify them.
% For example,
% Sunil Issar\textsuperscript{\rm 2},
% J. Scott Penberthy\textsuperscript{\rm 3},
% George Ferguson\textsuperscript{\rm 4},
% Hans Guesgen\textsuperscript{\rm 5}
% Note that the comma should be placed after the superscript
1101 Pennsylvania Ave, NW Suite 300\\
Washington, DC 20004 USA\\
% email address must be in roman text type, not monospace or sans serif
proceedings-questions@aaai.org
%
% See more examples next
}
%Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\iffalse
\title{My Publication Title --- Single Author}
\author {
Author Name
}
\affiliations{
Affiliation\\
Affiliation Line 2\\
name@example.com
}
\fi
\iffalse
%Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\title{My Publication Title --- Multiple Authors}
\author {
% Authors
First Author Name\textsuperscript{\rm 1},
Second Author Name\textsuperscript{\rm 2},
Third Author Name\textsuperscript{\rm 1}
}
\affiliations {
% Affiliations
\textsuperscript{\rm 1}Affiliation 1\\
\textsuperscript{\rm 2}Affiliation 2\\
firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com
}
\fi
% REMOVE THIS: bibentry
% This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it.
\usepackage{bibentry}
% END REMOVE bibentry
\begin{document}
\setcounter{theorem}{0}
\maketitle
% \setcounter{theorem}{0}
\begin{abstract}
Existing research on value-based reinforcement learning
generally minimizes some form of error.
However, is error minimization really the only option
for value-based reinforcement learning?
We observe that the action-selection probabilities of a policy
typically depend on the relative values of actions
and have nothing to do with their absolute values.
Based on this observation, we propose the objective
of variance minimization instead of error minimization,
derive several new variance minimization algorithms, each of which introduces an auxiliary parameter $\omega$,
and provide an analysis of the convergence rate together with experiments.
The experimental results show that the proposed variance minimization algorithms
converge considerably faster.
\end{abstract}
% Uncomment the following to link to your code, datasets, an extended version or similar.
%
% \begin{links}
% \link{Code}{https://aaai.org/example/code}
% \link{Datasets}{https://aaai.org/example/datasets}
% \link{Extended version}{https://aaai.org/example/extended-version}
% \end{links}
\input{main/introduction.tex}
\input{main/preliminaries.tex}
\input{main/motivation.tex}
\input{main/theory.tex}
\input{main/experiment.tex}
\input{main/relatedwork.tex}
\input{main/conclusion.tex}
\bibliography{aaai25}
\end{document}
\section{Conclusion and Future Work}
Value-based reinforcement learning typically minimizes
some form of error as its optimization objective.
As an alternative, this study proposes two new objective
functions, the VBE and the VPBE, and derives several variance minimization algorithms, including VMTD,
VMTDC, and VMETD.
% The VMTD algorithm
% is essentially an adjustment or correction to the traditional
% TD update.
% Both
% algorithms are capable of stabilizing gradient estimation, reducing
% the variance of gradient estimation and accelerating convergence.
All algorithms demonstrated superior performance in policy
evaluation and control experiments.
Future work includes, but is not limited
to, (1) analysis of the convergence rates of VMTDC and VMETD,
(2) extensions of VBE and VPBE to multi-step returns, and
(3) extensions to nonlinear approximations, such as neural networks.
\section{Experimental Studies}
This section assesses algorithm performance through experiments,
which are divided into policy evaluation experiments and control experiments.
\subsection{Testing Tasks}
\textbf{Random-walk:} as shown in Figure \ref{randomwalk}, all episodes
start in the center state, $C$, and proceed either left or right by one state on each
step, with equal probability. Episodes terminate at either the extreme left or
the extreme right, with a reward of $+1$ for terminating on the right and
$0$ otherwise. In this task, the true value of each state is the
probability of terminating on the right when starting from that state
\cite{Sutton2018book}.
Thus, the true values of states $A$ through $E$ are
$\frac{1}{6},\frac{2}{6},\frac{3}{6},\frac{4}{6},\frac{5}{6}$, respectively.
The discount factor is $\gamma=1.0$.
There are three standard kinds of features for random-walk problems: tabular
features, inverted features, and dependent features \cite{sutton2009fast}.
The feature matrices corresponding to the three random walks are shown in the Appendix.
Experiments in the Random-walk environment are conducted in an on-policy setting.
\begin{figure}
\begin{center}
\input{main/pic/randomwalk.tex}
\caption{Random walk.}
\label{randomwalk}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\input{main/pic/BairdExample.tex}
\caption{7-state version of Baird's off-policy counterexample.}
\label{bairdexample}
\end{center}
\end{figure}
\textbf{Baird's off-policy counterexample:} This task is well known as a
counterexample in which TD diverges \cite{baird1995residual,sutton2009fast}. As
shown in Figure \ref{bairdexample}, the reward for every transition is zero; thus the true values are zero for all states under any policy. The behaviour policy
chooses the actions represented by solid lines with probability $\frac{1}{7}$
and the actions represented by dotted lines with probability $\frac{6}{7}$. The
target policy is expected to choose the solid-line action with probability greater than $\frac{1}{7}$;
in this paper it chooses the solid-line action with probability $1$.
The discount factor is $\gamma =0.99$, and the feature matrix is
% defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
defined in the Appendix.
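For reference, the per-step importance sampling ratio used by standard off-policy TD methods, $\rho(s,a)=\pi(a|s)/\mu(a|s)$ with target policy $\pi$ and behaviour policy $\mu$, takes only two values under these two policies:
\begin{equation*}
\rho(s,a)=
\begin{cases}
\dfrac{1}{1/7}=7, & \text{if $a$ is the solid-line action},\\[4pt]
\dfrac{0}{6/7}=0, & \text{if $a$ is a dotted-line action}.
\end{cases}
\end{equation*}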
\textbf{Maze}: The learning agent must find a shortest path from the upper
left corner to the lower right corner. In each state,
there are four available actions: $up$, $down$, $left$, and $right$, each of which
moves the agent deterministically to the corresponding neighbouring state, except when
% \begin{wrapfigure}{r}{3cm}
% \centering
% \includegraphics[scale=0.15]{main/pic/maze_13_13.pdf}
% % \caption{The 2-state counterexample.}
% \end{wrapfigure}
a movement is blocked by an obstacle or the edge
of the maze. The reward is $-1$ on every transition until the
agent reaches the goal state.
The discount factor is $\gamma=0.99$, and states $s$ are represented by tabular
features. The maximum number of moves per episode is set to 1000.
\begin{figure}
\centering
\includegraphics[scale=0.20]{main/pic/maze_13_13.pdf}
\caption{Maze.}
\end{figure}
\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are
selected from the official Gym website and correspond to the following
versions: ``CliffWalking-v0'', ``MountainCar-v0'', and ``Acrobot-v1''.
For further details, please refer to the official Gym documentation.
The maximum number of steps per episode in Mountain Car is set to 1000,
while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding.
Please refer to the Appendix for the learning rates used in all experiments.
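For completeness, the sketch below illustrates the grid-style tile coding assumed for these two tasks; the numbers of tilings and tiles shown here are illustrative defaults, not the settings used in the experiments (those are listed in the Appendix).
\begin{lstlisting}[language=Python]
import numpy as np

def tile_features(state, low, high, n_tilings=8, tiles_per_dim=8):
    """Indices of active binary features for a continuous state.
    Each tiling is offset by a fraction of a tile width; exactly
    one feature per tiling is active (value 1)."""
    state = np.asarray(state, dtype=float)
    low = np.asarray(low, dtype=float)
    high = np.asarray(high, dtype=float)
    scaled = (state - low) / (high - low)   # normalise to [0, 1]^d
    active = []
    for t in range(n_tilings):
        offset = t / float(n_tilings * tiles_per_dim)
        coords = np.floor((scaled + offset) * tiles_per_dim).astype(int)
        coords = np.minimum(coords, tiles_per_dim)   # clip upper edge
        index = t   # flatten (tiling, grid cell) into one index
        for c in coords:
            index = index * (tiles_per_dim + 1) + int(c)
        active.append(index)
    return active
\end{lstlisting}
In Mountain Car, for instance, \texttt{low} and \texttt{high} would be the lower and upper bounds of the (position, velocity) observation space.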
\subsection{Experimental Results and Analysis}
% \begin{figure}[htb]
% \vskip 0.2in
% \begin{center}
% \subfigure[Dependent]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/dependent_new.pdf}
% \label{DependentFull}
% }
% \subfigure[Tabular]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/tabular_new.pdf}
% \label{TabularFull}
% }
% \\
% \subfigure[Inverted]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/inverted_new.pdf}
% \label{InvertedFull}
% }
% \subfigure[counterexample]{
% \includegraphics[width=0.4\columnwidth, height=0.3\columnwidth]{main/pic/counterexample_quanju_new.pdf}
% \label{CounterExampleFull}
% }
% \caption{Learning curves of four evaluation environments.}
% \label{Evaluation_full}
% \end{center}
% \vskip -0.2in
% \end{figure}
% \begin{figure*}[htb]
% \vskip 0.2in
% \begin{center}
% \subfigure[Maze]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/maze_complete.pdf}
% \label{MazeFull}
% }
% \subfigure[Cliff Walking]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/cw_complete.pdf}
% \label{CliffWalkingFull}
% }
% \\
% \subfigure[Mountain Car]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/mt_complete.pdf}
% \label{MountainCarFull}
% }
% \subfigure[Acrobot]{
% \includegraphics[width=0.55\columnwidth, height=0.4\columnwidth]{main/pic/Acrobot_complete.pdf}
% \label{AcrobotFull}
% }
% \caption{Learning curves of four control environments.}
% \label{Complete_full}
% \end{center}
% \vskip -0.2in
% \end{figure*}
% \begin{table*}[htb]
% \centering
% \caption{Difference between R-learning and tabular VMQ.}
% \vskip 0.15in
% \begin{tabular}{c|cc}
% \hline
% algorithms&update formula \\
% \hline
% R-learning&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}-m_{k}+ \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a))$\\
% &$m_{k+1}\leftarrow m_{k}+\beta_k(r_{k+1}+\max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-m_{k})$\\
% tabular VMQ&$Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_k)$\\
% &$\omega_{k+1}\leftarrow \omega_{k}+\beta_k(r_{k+1}+\gamma \max_{b\in A}Q_{k}(s,b) - Q_{k}(s,a)-\omega_{k})$\\
% \hline
% \end{tabular}
% \label{differenceRandVMQ}
% \vskip -0.1in
% \end{table*}
The experiment needs further elaboration.
% For policy evaluation experiments, compare the performance of the VMTD,
% VMTDC, TD, and TDC algorithms.
% The vertical axis is unified as RVBE.
% For policy evaluation experiments, the criteria for evaluating
% algorithms vary. The objective function minimized by our proposed
% new algorithm differs from that of other algorithms. Therefore, to
% ensure fairness in comparisons, this study only contrasts algorithm
% experiments in controlled settings.
% This study will compare the performance of Sarsa, Q-learning, GQ(0),
% AC, VMSarsa, VMQ, and VMGQ(0) in four control environments.
% % All experiments involved in this paper were run independently for 100 times.
% The learning curves of the algorithms corresponding to
% policy evaluation experiments and control experiments are
% shown in Figures \ref{Evaluation_full} and \ref{Complete_full}, respectively.
% The shaded area in Figure \ref{Evaluation_full}, \ref{Complete_full} represents the standard deviation (std).
% In the random-walk tasks, VMTD and VMTDC exhibit excellent performance,
% outperforming TD and TDC in the case of dependent random-walk.
% In the 7-state example counter task, TD diverges,
% while VMTDC converges and performs better than TDC.
% From the update formula, it can be observed that the VMTD algorithm, like TDC,
% is also an adjustment or correction of the TD update.
% What is more surprising is that VMTD also maintains
% convergence and demonstrates the best performance.
% In Maze, Mountain Car, and Acrobot,
% the convergence speed of VMSarsa, VMQ, and VMGQ(0) has
% been significantly improved compared to Sarsa, Q-learning,
% and GQ(0), respectively. The performance of the AC algorithm
% is at an intermediate level. The performances of VMSarsa,
% VMQ, and VMGQ(0) in these three experimental environments
% have no significant differences.
% In Cliff Walking, Sarsa and
% VMSarsa converge to slightly worse solutions compared to
% other algorithms. The convergence speed of VMSarsa is significantly
% better than that of Sarsa. The convergence speed of VMGQ(0) and VMQ
% is better than other algorithms, and the performance of VMGQ(0) is
% slightly better than that of VMQ.
% In summary, the performance of VMSarsa,
% VMQ, and VMGQ(0) is better than that of other algorithms.
% In the Cliff Walking environment,
% the performance of VMGQ(0) is slightly better than that of
% VMSarsa and VMQ. In the other three experimental environments,
% the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\section{Introduction}
\label{introduction}
Reinforcement learning methods can be broadly divided into two
categories: value-based reinforcement learning
and policy-gradient-based reinforcement learning. This
paper focuses on temporal difference (TD) learning with
linearly approximated value functions. Research in this area
usually proceeds in two steps: the first step is to establish the convergence of an algorithm, and the second
step is to accelerate it.
In terms of stability, \citet{sutton1988learning} established the
convergence of on-policy TD(0), and \citet{tsitsiklis1997analysis}
established the convergence of on-policy TD($\lambda$).
However, ``the deadly triad'' consisting of off-policy learning,
bootstrapping, and function approximation makes
stability a difficult problem \citep{Sutton2018book}.
To address this problem, convergent off-policy temporal difference
learning algorithms have been proposed, e.g., BR \cite{baird1995residual},
GTD \cite{sutton2008convergent}, GTD2 and TDC \cite{sutton2009fast},
ETD \cite{sutton2016emphatic}, and MRetrace \cite{chen2023modified}.
In terms of acceleration, \citet{hackman2012faster}
proposed the Hybrid TD algorithm with an on-policy matrix.
\citet{liu2015finite,liu2016proximal,liu2018proximal} proposed
true stochastic algorithms, i.e., GTD-MP and GTD2-MP, based on
a convex-concave saddle-point formulation.
Second-order methods have been used to accelerate TD learning,
e.g., Quasi-Newton TD \cite{givchi2015quasi} and
accelerated TD (ATD) \citep{pan2017accelerated}.
\citet{hallak2016generalized} introduced a new parameter
to reduce the variance of ETD.
\citet{zhang2022truncated} proposed a truncated ETD with lower variance.
Variance-reduced TD, which applies the variance reduction technique of \citet{johnson2013accelerating} directly to TD, was proposed by \citet{korda2015td}
and reanalysed by \citet{xu2019reanalysis}.
How to further improve the convergence rate of reinforcement learning
algorithms remains an open problem.
Algorithm stability is prominently reflected in changes
to the objective function, transitioning from the mean squared
error (MSE) \citep{Sutton2018book} to the mean squared Bellman error (MSBE) \cite{baird1995residual}, then to
the norm of the expected TD update \cite{sutton2009fast}, and further to
the mean squared projected Bellman error (MSPBE) \cite{sutton2009fast}. Algorithm
acceleration, on the other hand, centers on optimizing the iterative
update formula of the algorithm itself without altering the
objective function, thereby speeding up convergence.
New optimization objective
functions often lead to the development of novel algorithms,
and new algorithms, in turn, tend to inspire
researchers to explore methods for accelerating them,
leading to increasingly effective algorithms.
The kernel loss can be optimized with standard
gradient-based methods, addressing the double-sampling issue
of the residual gradient algorithm \cite{feng2019kernel}, and it ensures convergence
in both on-policy and off-policy scenarios. The logistic Bellman
error is convex and smooth in the action-value function parameters,
with bounded gradients \cite{basserrano2021logistic}. In contrast, the squared Bellman error is
not convex in the action-value function parameters, and RL algorithms
based on recursively optimizing it are known to be unstable.
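To recall the double-sampling issue: the MSBE can be written as
\begin{equation*}
\mathrm{MSBE}(\bm{\theta})=\mathbb{E}_{s}\Big[\big(\mathbb{E}\left[\delta \mid s\right]\big)^{2}\Big],
\end{equation*}
where $\delta$ is the TD error. Since $\mathbb{E}[\delta^{2}\mid s]=\big(\mathbb{E}[\delta\mid s]\big)^{2}+\mathrm{Var}(\delta\mid s)$, squaring a single sampled TD error yields a biased estimate of this objective, and an unbiased gradient requires two independent samples of the successor state.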
% The value-based algorithms mentioned above aim to
% minimize some errors, e.g., mean squared errors \citep{Sutton2018book},
% mean squared Bellman errors \cite{baird1995residual}, norm
% of the expected TD update \cite{sutton2009fast},
% mean squared projected Bellman errors (MSPBE) \cite{sutton2009fast}, etc.
All of the objective functions mentioned above, however, are some form of error.
Is minimizing error the only option for value-based reinforcement learning?
For policy evaluation,
differences in objective functions may result
in different fixed points, which
makes it difficult to compare algorithms derived from different
objective functions on a uniform footing.
For control, however, since the choice of actions
depends on the relative values of the Q-values rather than on their
absolute values, a bias in the solution is acceptable.
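In particular, both greedy and softmax action selection are invariant to a constant shift $c$ of all action values in a state:
\begin{gather*}
\arg\max_{a}\big(Q(s,a)-c\big)=\arg\max_{a}Q(s,a),\\
\frac{e^{Q(s,a)-c}}{\sum_{b}e^{Q(s,b)-c}}=\frac{e^{Q(s,a)}}{\sum_{b}e^{Q(s,b)}}.
\end{gather*}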
Based on this observation, we propose alternative objective functions
instead of minimizing errors: we minimize the Variance of the Bellman Error (VBE) and
the Variance of the Projected Bellman Error (VPBE)
and derive Variance Minimization (VM) algorithms.
These algorithms preserve the optimal policy in control environments
while significantly reducing the variance of gradient estimation,
thus hastening convergence.
The contributions of this paper are as follows:
(1) Introduction of novel objective functions based on
the invariance of the optimal policy.
(2) Derivation of three variance minimization algorithms, covering both on-policy and off-policy settings.
(3) Proofs of their convergence.
(4) Analysis of the convergence rate of the on-policy algorithm.
(5) Experiments demonstrating the faster convergence of the proposed algorithms.
\resizebox{5cm}{3cm}{
\begin{tikzpicture}[smooth]
\node[coordinate] (origin) at (0.3,0) {};
\node[coordinate] (num7) at (3,0) {};
\node[coordinate] (num1) at (1,2.5) {};
\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {};
\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {};
\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {};
\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {};
\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {};
\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {};
\path (num1) ++ (0:1cm) node (num2) [coordinate] {};
\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
%\draw[->](0,0) -- (1,1);
%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
%\fill (0.5,0.5) circle (0.5);
%\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
coordinates{(num7) (origin) (origin_above) (origin_aright)};
\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)};
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
\draw[->,>=stealth,line width = 0.02cm] (a)--(g);
\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
\end{tikzpicture}
}
% \tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
% \tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
% \tikzstyle{init} = [pin edge={to-,thin,black}]
% \resizebox{8cm}{1.2cm}{
% \begin{tikzpicture}[node distance=1.5cm,auto,>=latex']
% \node [block] (o) {};
% \node (p) [left of=o,node distance=0.5cm, coordinate] {o};
% \node [shape=circle,int] (a) [right of=o]{$A$};
% \node (b) [left of=a,node distance=1.5cm, coordinate] {a};
% \node [shape=circle,int] (c) [right of=a] {$B$};
% \node (d) [left of=c,node distance=1.5cm, coordinate] {c};
% \node [shape=circle,int, pin={[init]above:$$}] (e) [right of=c]{$C$};
% \node (f) [left of=e,node distance=1.5cm, coordinate] {e};
% \node [shape=circle,int] (g) [right of=e] {$D$};
% \node (h) [left of=g,node distance=1.5cm, coordinate] {g};
% \node [shape=circle,int] (i) [right of=g] {$E$};
% \node (j) [left of=i,node distance=1.5cm, coordinate] {i};
% \node [block] (k) [right of=i] {};
% \node (l) [left of=k,node distance=0.5cm, coordinate] {k};
% \path[<-] (o) edge node {$0$} (a);
% \path[<->] (a) edge node {$0$} (c);
% \path[<->] (c) edge node {$0$} (e);
% \path[<->] (e) edge node {$0$} (g);
% \path[<->] (g) edge node {$0$} (i);
% \draw[->] (i) edge node {$1$} (k);
% \end{tikzpicture}
% }
\tikzstyle{int}=[draw, fill=blue!20, minimum size=2em]
\tikzstyle{block}=[draw, fill=gray, minimum size=1.5em]
\tikzstyle{init} = [pin edge={to-,thin,black}]
\resizebox{5cm}{1cm}{
\begin{tikzpicture}[node distance=1.5cm, auto, >=latex]
\node [block] (o) {};
\node (p) [left of=o, node distance=0.5cm, coordinate] {o};
\node [shape=circle, int] (a) [right of=o] {$A$};
\node (b) [left of=a, node distance=1.5cm, coordinate] {a};
\node [shape=circle, int] (c) [right of=a] {$B$};
\node (d) [left of=c, node distance=1.5cm, coordinate] {c};
\node [shape=circle, int, pin={[init]above:$ $}] (e) [right of=c] {$C$};
\node (f) [left of=e, node distance=1.5cm, coordinate] {e};
\node [shape=circle, int] (g) [right of=e] {$D$};
\node (h) [left of=g, node distance=1.5cm, coordinate] {g};
\node [shape=circle, int] (i) [right of=g] {$E$};
\node (j) [left of=i, node distance=1.5cm, coordinate] {i};
\node [block] (k) [right of=i] {};
\node (l) [left of=k, node distance=0.5cm, coordinate] {k};
\path[->] (o) edge node {$0$} (a);
\path[<->] (a) edge node {$0$} (c);
\path[<->] (c) edge node {$0$} (e);
\path[<->] (e) edge node {$0$} (g);
\path[<->] (g) edge node {$0$} (i);
\draw[->] (i) edge node {$1$} (k);
\end{tikzpicture}
}
\section{Background}
\label{preliminaries}
A reinforcement learning agent interacts with its environment: it observes states,
makes sequential decisions that influence the environment, and obtains
rewards.
Consider an infinite-horizon discounted
Markov Decision Process (MDP), defined by a tuple $\langle S,A,R,P,\gamma
\rangle$, where $S=\{1,2,\ldots,N\}$ is a finite set of states of the environment; $A$
is a finite set of actions of the agent;
$R:S\times A \times S \rightarrow \mathbb{R}$ is a bounded deterministic reward
function; $P:S\times A\times S \rightarrow [0,1]$ is the transition
probability distribution; and $\gamma\in (0,1)$
is the discount factor \cite{Sutton2018book}.
Owing to the requirements of online learning, this paper considers value iteration based on sampling,
where each sample yields an experience (or transition) $\langle s, a, s', r\rangle$.
A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the
agent is to find an optimal policy $\pi^*$ that maximizes the expected
discounted cumulative reward over the long run.
State value function $V^{\pi}(s)$ for a stationary policy $\pi$ is
defined as:
\begin{equation*}
V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s].
\label{valuefunction}
\end{equation*}
Linear value function for state $s\in S$ is defined as:
\begin{equation}
V_{{\theta}}(s):= {\bm{\theta}}^{\top}{\bm{\phi}}(s) = \sum_{i=1}^{m}
\theta_i \phi_i(s),
\label{linearvaluefunction}
\end{equation}
where ${\bm{\theta}}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in
\mathbb{R}^m$ is a parameter vector,
${\bm{\phi}}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature
function defined on state space $S$, and $m$ is the feature size.
Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems.
To deal with the well-known curse of dimensionality of large-scale MDPs, the value
function is usually approximated by a linear model, kernel methods, decision
trees, or neural networks. This paper focuses on the linear model, where
features are usually hand-coded by domain experts.
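As a concrete reference point, a single semi-gradient TD(0) update of the linear value function in Eq.~(\ref{linearvaluefunction}) can be sketched as follows (standard TD(0), not one of the algorithms proposed later):
\begin{lstlisting}[language=Python]
import numpy as np

def td0_update(theta, phi_s, phi_s_next, reward,
               gamma, alpha, terminal=False):
    """One semi-gradient TD(0) step for V(s) = theta^T phi(s)."""
    v_s = float(theta @ phi_s)
    v_next = 0.0 if terminal else float(theta @ phi_s_next)
    delta = reward + gamma * v_next - v_s   # TD error
    theta = theta + alpha * delta * phi_s   # update along phi(s)
    return theta, delta
\end{lstlisting}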
% TD learning can also be used to find optimal strategies. The problem of finding an optimal policy is
% often called the control problem. Two popular TD methods are Sarsa and Q-leaning. The former is an on-policy
% TD control, while the latter is an off-policy control.
% It is well known that TDC algorithm \cite{sutton2009fast} guarantees
% convergence under off-policy conditions while the off-policy TD algorithm may diverge. The
% objective function of TDC is MSPBE.
% TDC is essentially an adjustment or correction of the TD update so that it
% follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm
% is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted
% as GQ(0).
\section{Related Work}
\subsection{Difference between VMQ and R-learning}
Tabular VMQ's update formula bears some resemblance
to R-learning's update formula. The two update rules, restated below, differ in the following ways:
\\(1) The goal of the R-learning algorithm \cite{schwartz1993reinforcement} is to maximize the average
reward, rather than the cumulative reward, by learning an estimate
of the average reward. This estimate $m$ is then used to update the Q-values.
In contrast, the $\omega$ in the tabular VMQ update formula eventually converges to $\mathbb{E}[\delta]$.
\\(2) When $\gamma=1$ in the tabular VMQ update formula, the
R-learning update formula takes the same form as
the tabular VMQ update formula.
Therefore, R-learning can formally be
regarded as a special case of tabular VMQ.
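For reference, the two pairs of updates are (with $s'$ denoting the successor state):
\begin{align*}
\text{R-learning:}\quad
&Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k\big(r_{k+1}-m_{k}+\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)\big),\\
&m_{k+1}\leftarrow m_{k}+\beta_k\big(r_{k+1}+\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-m_{k}\big);\\
\text{tabular VMQ:}\quad
&Q_{k+1}(s,a)\leftarrow Q_{k}(s,a)+\alpha_k\big(r_{k+1}+\gamma\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-\omega_k\big),\\
&\omega_{k+1}\leftarrow \omega_{k}+\beta_k\big(r_{k+1}+\gamma\max_{b\in A}Q_{k}(s',b)-Q_{k}(s,a)-\omega_{k}\big).
\end{align*}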
\subsection{Variance Reduction for TD Learning}
The TD with centering (CTD) algorithm \cite{korda2015td}
directly applies variance reduction techniques to
the TD algorithm. The CTD algorithm updates its parameters using the
average gradient of a batch of Markovian samples and a projection operator.
Unfortunately, the authors' analysis of the CTD algorithm contains technical
errors. The VRTD algorithm \cite{xu2020reanalysis} is also a variance-reduced algorithm; it updates
its parameters using the average gradient of a batch of i.i.d. samples. The
authors of VRTD provide a technically sound analysis demonstrating the
advantages of variance reduction.
\subsection{Variance Reduction for Policy Gradient Algorithms}
Policy gradient algorithms are a class of reinforcement
learning algorithms that directly optimize the cumulative reward.
REINFORCE is a Monte Carlo algorithm that estimates
gradients through sampling but may suffer from high variance.
Baselines are introduced to reduce the variance and
accelerate learning \cite{Sutton2018book}. In Actor-Critic methods,
the value function serves as a baseline and bootstrapping
is used to reduce variance, which also accelerates convergence \cite{Sutton2018book}.
TRPO \cite{schulman2015trust} and PPO \cite{schulman2017proximal}
use generalized advantage
estimation, which combines multi-step bootstrapping and Monte Carlo
estimation to reduce variance, making gradient estimation more stable and
accelerating convergence.
In Variance Minimization,
the incorporation of $\omega \doteq \mathbb{E}[\delta]$
bears a striking resemblance to the use of a baseline
in policy gradient methods. Introducing a baseline
in policy gradient methods does not alter
the expected value of the update,
but it can significantly reduce the variance of the gradient estimate.
Likewise, the addition of $\omega \doteq \mathbb{E}[\delta]$ in Variance Minimization
preserves the optimal
policy while stabilizing gradient estimation,
reducing its variance,
and hastening convergence.
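The underlying reason a state-dependent baseline $b(s)$ leaves the expected policy gradient unchanged is the standard identity
\begin{equation*}
\mathbb{E}_{a\sim\pi(\cdot|s)}\big[\nabla_{\bm{\vartheta}}\log\pi(a|s;\bm{\vartheta})\,b(s)\big]
=b(s)\,\nabla_{\bm{\vartheta}}\sum_{a}\pi(a|s;\bm{\vartheta})
=b(s)\,\nabla_{\bm{\vartheta}}1=0,
\end{equation*}
where $\bm{\vartheta}$ denotes the policy parameters.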
\relax
\bibstyle{aaai24}
\citation{sutton2009fast}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{hirsch1989convergent}
\newlabel{proofth2}{{A.1}{1}}
\newlabel{thetavmtdcFastest}{{A-1}{1}}
\newlabel{uvmtdcFastest}{{A-2}{1}}
\newlabel{omegavmtdcFastest}{{A-3}{1}}
\newlabel{omegavmtdcFastestFinal}{{A-4}{1}}
\newlabel{omegavmtdcInfty}{{A-5}{1}}
\newlabel{thetavmtdcFaster}{{A-6}{1}}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar1997stochastic}
\newlabel{uvmtdcFaster}{{A-7}{2}}
\newlabel{uvmtdcFasterFinal}{{A-8}{2}}
\newlabel{uvmtdcInfty}{{A-9}{2}}
\newlabel{thetavmtdcSlowerFinal}{{A-11}{2}}
\newlabel{odethetavmtdcfinal}{{A-12}{2}}
\citation{hirsch1989convergent}
\citation{borkar2000ode}
\citation{borkar2000ode}
\citation{borkar2000ode}
\newlabel{proofVMETD}{{A.2}{3}}
\newlabel{th1proof}{{A.2}{3}}
\newlabel{thetaFast}{{A-13}{3}}
\newlabel{omegaFast}{{A-14}{3}}
\newlabel{omegaFastFinal}{{A-15}{3}}
\newlabel{omegaInfty}{{A-16}{3}}
\citation{sutton2016emphatic}
\newlabel{odetheta}{{A-17}{4}}
\newlabel{rowsum}{{A-20}{4}}
\newlabel{columnsum}{{A-21}{5}}
\newlabel{odethetafinal}{{A-22}{5}}
\newlabel{mathematicalanalysis}{{B}{5}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{keymatrices}{{1}{5}}
\newlabel{minimumeigenvalues}{{2}{5}}
\newlabel{experimentaldetails}{{C}{5}}
\newlabel{bairdcounterexample}{{\caption@xref {bairdcounterexample}{ on input line 731}}{6}}
\newlabel{randomwalk}{{\caption@xref {randomwalk}{ on input line 754}}{6}}
\newlabel{boyanchain}{{\caption@xref {boyanchain}{ on input line 777}}{6}}
\bibdata{aaai24}
\bibcite{borkar1997stochastic}{{1}{1997}{{Borkar}}{{}}}
\bibcite{borkar2000ode}{{2}{2000}{{Borkar and Meyn}}{{}}}
\bibcite{hirsch1989convergent}{{3}{1989}{{Hirsch}}{{}}}
\bibcite{sutton2009fast}{{4}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
\bibcite{sutton2016emphatic}{{5}{2016}{{Sutton, Mahmood, and White}}{{}}}
\newlabel{lrofways}{{6}{7}}
\gdef \@abspage@last{7}
\begin{thebibliography}{5}
\providecommand{\natexlab}[1]{#1}
\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Syst. \& Control Letters}, 29(5): 291--294.
\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469.
\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Netw.}, 2(5): 331--349.
\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proc. 26th Int. Conf. Mach. Learn.}, 993--1000.
\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.
\end{thebibliography}
This is BibTeX, Version 0.99d (TeX Live 2023)
Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
The top-level auxiliary file: anonymous-submission-latex-2024.aux
The style file: aaai24.bst
Database file #1: aaai24.bib
You've used 5 entries,
2840 wiz_defined-function locations,
619 strings with 5446 characters,
and the built_in function-call counts, 3370 in all, are:
= -- 277
> -- 153
< -- 0
+ -- 60
- -- 52
* -- 242
:= -- 547
add.period$ -- 20
call.type$ -- 5
change.case$ -- 36
chr.to.int$ -- 6
cite$ -- 5
duplicate$ -- 223
empty$ -- 240
format.name$ -- 60
if$ -- 649
int.to.chr$ -- 1
int.to.str$ -- 1
missing$ -- 49
newline$ -- 29
num.names$ -- 20
pop$ -- 92
preamble$ -- 1
purify$ -- 34
quote$ -- 0
skip$ -- 96
stack$ -- 0
substring$ -- 200
swap$ -- 128
text.length$ -- 0
text.prefix$ -- 0
top$ -- 0
type$ -- 45
warning$ -- 0
while$ -- 31
width$ -- 0
write$ -- 68