\relax \providecommand\hyper@newdestlabel[2]{} \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined \global\let\oldnewlabel\newlabel \gdef\newlabel#1#2{\newlabelxx{#1}#2} \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\ifx\hyper@anchor\@undefined \let\newlabel\oldnewlabel \fi} \fi} \global\let\hyper@last\relax \gdef\HyperFirstAtBeginDocument#1{#1} \providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToCoFields[2]{} \citation{sutton1988learning} \citation{tsitsiklis1997analysis} \citation{Sutton2018book} \citation{baird1995residual} \citation{sutton2008convergent} \citation{sutton2009fast} \citation{sutton2016emphatic} \citation{chen2023modified} \citation{hackman2012faster} \citation{liu2015finite,liu2016proximal,liu2018proximal} \citation{givchi2015quasi} \citation{pan2017accelerated} \citation{hallak2016generalized} \citation{zhang2022truncated} \citation{johnson2013accelerating} \citation{korda2015td} \citation{xu2019reanalysis} \citation{Sutton2018book} \citation{baird1995residual} \citation{sutton2009fast} \citation{sutton2009fast} \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent } \newlabel{introduction}{{1}{1}{Introduction}{section.1}{}} \citation{feng2019kernel} \citation{basserrano2021logistic} \citation{Sutton2018book} \citation{Sutton2018book} \@writefile{toc}{\contentsline {section}{\numberline {2}Background}{2}{section.2}\protected@file@percent } \newlabel{preliminaries}{{2}{2}{Background}{section.2}{}} \newlabel{valuefunction}{{2}{2}{Background}{section.2}{}} \newlabel{linearvaluefunction}{{1}{2}{Background}{equation.2.1}{}} \citation{sutton2009fast} \citation{sutton2009fast} \citation{ng1999policy} \citation{devlin2012dynamic} \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Classification accuracies for naive Bayes and flexible Bayes on various data sets.}}{3}{table.1}\protected@file@percent } \newlabel{example_bias}{{1}{3}{Classification accuracies for naive Bayes and flexible Bayes on various data sets}{table.1}{}} \@writefile{toc}{\contentsline {section}{\numberline {3}Variance Minimization Algorithms}{3}{section.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Motivation}{3}{subsection.3.1}\protected@file@percent } \@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces VMTD algorithm with linear function approximation in the on-policy setting}}{4}{algorithm.1}\protected@file@percent } \newlabel{alg:algorithm 1}{{1}{4}{Variance Minimization TD Learning: VMTD}{algorithm.1}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Variance Minimization TD Learning: VMTD}{4}{subsection.3.2}\protected@file@percent } \newlabel{omega}{{3}{4}{Variance Minimization TD Learning: VMTD}{equation.3.3}{}} \newlabel{delta}{{4}{4}{Variance Minimization TD Learning: VMTD}{equation.3.4}{}} \newlabel{theta}{{5}{4}{Variance Minimization TD Learning: VMTD}{equation.3.5}{}} \newlabel{deltaSarsa}{{8}{4}{Variance Minimization TD Learning: VMTD}{equation.3.8}{}} \newlabel{deltaQ}{{9}{4}{Variance Minimization TD Learning: VMTD}{equation.3.9}{}} \citation{dalal2020tale} \citation{dalal2020tale} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Variance Minimization TDC Learning: VMTDC}{5}{subsection.3.3}\protected@file@percent } \newlabel{thetavmtdc}{{11}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.11}{}} \newlabel{uvmtdc}{{12}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.12}{}} \newlabel{omegavmtdc}{{13}{5}{Variance Minimization TDC Learning: VMTDC}{equation.3.13}{}} \@writefile{toc}{\contentsline {section}{\numberline {4}Theoretical Analysis}{5}{section.4}\protected@file@percent } \newlabel{theorem1}{{4.1}{5}{}{theorem.4.1}{}} \newlabel{corollary4_2}{{4.2}{5}{}{theorem.4.2}{}} \citation{Sutton2018book} \citation{sutton2009fast} \citation{baird1995residual,sutton2009fast} \citation{baird1995residual,sutton2009fast,maei2011gradient} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Random walk.}}{6}{figure.1}\protected@file@percent } \newlabel{randomwalk}{{1}{6}{Random walk}{figure.1}{}} \@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces 7-state version of Baird's off-policy counterexample.}}{6}{figure.2}\protected@file@percent } \newlabel{bairdexample}{{2}{6}{7-state version of Baird's off-policy counterexample}{figure.2}{}} \newlabel{theorem2}{{4.3}{6}{}{theorem.4.3}{}} \@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Studies}{6}{section.5}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Testing Tasks}{6}{subsection.5.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Experimental Results and Analysis}{7}{subsection.5.2}\protected@file@percent } \newlabel{DependentFull}{{3(a)}{7}{Subfigure 3(a)}{subfigure.3.1}{}} \newlabel{sub@DependentFull}{{(a)}{7}{Subfigure 3(a)\relax }{subfigure.3.1}{}} \newlabel{TabularFull}{{3(b)}{7}{Subfigure 3(b)}{subfigure.3.2}{}} \newlabel{sub@TabularFull}{{(b)}{7}{Subfigure 3(b)\relax }{subfigure.3.2}{}} \newlabel{InvertedFull}{{3(c)}{7}{Subfigure 3(c)}{subfigure.3.3}{}} \newlabel{sub@InvertedFull}{{(c)}{7}{Subfigure 3(c)\relax }{subfigure.3.3}{}} \newlabel{CounterExampleFull}{{3(d)}{7}{Subfigure 3(d)}{subfigure.3.4}{}} \newlabel{sub@CounterExampleFull}{{(d)}{7}{Subfigure 3(d)\relax }{subfigure.3.4}{}} \@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Learning curses of four evaluation environments.}}{7}{figure.3}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Dependent}}}{7}{figure.3}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Tabular}}}{7}{figure.3}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Inverted}}}{7}{figure.3}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {counterexample}}}{7}{figure.3}\protected@file@percent } \newlabel{Evaluation_full}{{3}{7}{Learning curses of four evaluation environments}{figure.3}{}} \citation{schwartz1993reinforcement} \newlabel{MazeFull}{{4(a)}{8}{Subfigure 4(a)}{subfigure.4.1}{}} \newlabel{sub@MazeFull}{{(a)}{8}{Subfigure 4(a)\relax }{subfigure.4.1}{}} \newlabel{CliffWalkingFull}{{4(b)}{8}{Subfigure 4(b)}{subfigure.4.2}{}} \newlabel{sub@CliffWalkingFull}{{(b)}{8}{Subfigure 4(b)\relax }{subfigure.4.2}{}} \newlabel{MountainCarFull}{{4(c)}{8}{Subfigure 4(c)}{subfigure.4.3}{}} \newlabel{sub@MountainCarFull}{{(c)}{8}{Subfigure 4(c)\relax }{subfigure.4.3}{}} \newlabel{AcrobotFull}{{4(d)}{8}{Subfigure 4(d)}{subfigure.4.4}{}} \newlabel{sub@AcrobotFull}{{(d)}{8}{Subfigure 4(d)\relax }{subfigure.4.4}{}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Learning curses of four contral environments.}}{8}{figure.4}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Maze}}}{8}{figure.4}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Cliff Walking}}}{8}{figure.4}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Mountain Car}}}{8}{figure.4}\protected@file@percent } \@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Acrobot}}}{8}{figure.4}\protected@file@percent } \newlabel{Complete_full}{{4}{8}{Learning curses of four contral environments}{figure.4}{}} \@writefile{toc}{\contentsline {section}{\numberline {6}Related Work}{8}{section.6}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Difference between VMQ and R-learning}{8}{subsection.6.1}\protected@file@percent } \@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Difference between R-learning and tabular VMQ.}}{8}{table.2}\protected@file@percent } \newlabel{differenceRandVMQ}{{2}{8}{Difference between R-learning and tabular VMQ}{table.2}{}} \citation{korda2015td} \citation{xu2020reanalysis} \citation{Sutton2018book} \citation{Sutton2018book} \citation{schulman2015trust} \citation{schulman2017proximal} \citation{borkar1997stochastic} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2}Variance Reduction for TD Learning}{9}{subsection.6.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {6.3}Variance Reduction for Policy Gradient Algorithms}{9}{subsection.6.3}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion and Future Work}{9}{section.7}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {A}Relevant proofs}{9}{appendix.A}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {A.1}Proof of Theorem \ref {theorem1}}{9}{subsection.A.1}\protected@file@percent } \newlabel{proofth1}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}} \newlabel{th1proof}{{A.1}{9}{Proof of Theorem \ref {theorem1}}{subsection.A.1}{}} \citation{hirsch1989convergent} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \newlabel{thetaFast}{{19}{10}{Proof of Theorem \ref {theorem1}}{equation.A.19}{}} \newlabel{omegaFast}{{20}{10}{Proof of Theorem \ref {theorem1}}{equation.A.20}{}} \newlabel{omegaFastFinal}{{21}{10}{Proof of Theorem \ref {theorem1}}{equation.A.21}{}} \newlabel{omegaInfty}{{22}{10}{Proof of Theorem \ref {theorem1}}{equation.A.22}{}} \newlabel{odetheta}{{23}{10}{Proof of Theorem \ref {theorem1}}{equation.A.23}{}} \citation{dalal2020tale} \citation{dalal2020tale} \newlabel{covariance}{{24}{11}{Proof of Theorem \ref {theorem1}}{equation.A.24}{}} \newlabel{odethetafinal}{{25}{11}{Proof of Theorem \ref {theorem1}}{equation.A.25}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {A.2}Proof of Corollary \ref {corollary4_2}}{11}{subsection.A.2}\protected@file@percent } \newlabel{proofcorollary4_2}{{A.2}{11}{Proof of Corollary \ref {corollary4_2}}{subsection.A.2}{}} \newlabel{matrixassumption}{{A.1}{11}{}{theorem.A.1}{}} \newlabel{stepsizeassumption}{{A.2}{11}{}{theorem.A.2}{}} \newlabel{sparseprojection}{{A.3}{11}{}{theorem.A.3}{}} \citation{dalal2020tale} \citation{dalal2020tale} \citation{sutton2009fast} \citation{hirsch1989convergent} \newlabel{sparseprojectiontheta}{{30}{12}{}{equation.A.30}{}} \newlabel{sparseprojectionomega}{{31}{12}{}{equation.A.31}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {A.3}Proof of Theorem \ref {theorem2}}{12}{subsection.A.3}\protected@file@percent } \newlabel{proofth2}{{A.3}{12}{Proof of Theorem \ref {theorem2}}{subsection.A.3}{}} \newlabel{thetavmtdcFastest}{{32}{12}{Proof of Theorem \ref {theorem2}}{equation.A.32}{}} \newlabel{uvmtdcFastest}{{33}{12}{Proof of Theorem \ref {theorem2}}{equation.A.33}{}} \newlabel{omegavmtdcFastest}{{34}{12}{Proof of Theorem \ref {theorem2}}{equation.A.34}{}} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \citation{hirsch1989convergent} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \newlabel{omegavmtdcFastestFinal}{{35}{13}{Proof of Theorem \ref {theorem2}}{equation.A.35}{}} \newlabel{omegavmtdcInfty}{{36}{13}{Proof of Theorem \ref {theorem2}}{equation.A.36}{}} \newlabel{thetavmtdcFaster}{{37}{13}{Proof of Theorem \ref {theorem2}}{equation.A.37}{}} \newlabel{uvmtdcFaster}{{38}{13}{Proof of Theorem \ref {theorem2}}{equation.A.38}{}} \newlabel{uvmtdcFasterFinal}{{39}{13}{Proof of Theorem \ref {theorem2}}{equation.A.39}{}} \newlabel{uvmtdcInfty}{{40}{13}{Proof of Theorem \ref {theorem2}}{equation.A.40}{}} \newlabel{thetavmtdcSlowerFinal}{{42}{14}{Proof of Theorem \ref {theorem2}}{equation.A.42}{}} \newlabel{odethetavmtdcfinal}{{43}{14}{Proof of Theorem \ref {theorem2}}{equation.A.43}{}} \@writefile{toc}{\contentsline {section}{\numberline {B}Experimental details}{14}{appendix.B}\protected@file@percent } \newlabel{experimentaldetails}{{B}{14}{Experimental details}{appendix.B}{}} \@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces VMTDC algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.2}\protected@file@percent } \newlabel{alg:algorithm 2}{{2}{15}{Proof of Theorem \ref {theorem2}}{algorithm.2}{}} \@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces VMGTD algorithm with linear function approximation in the off-policy setting}}{15}{algorithm.3}\protected@file@percent } \newlabel{alg:algorithm 3}{{3}{15}{Proof of Theorem \ref {theorem2}}{algorithm.3}{}} \bibstyle{named} \bibdata{neurips_2024} \bibcite{baird1995residual}{{1}{1995}{{Baird and others}}{{}}} \bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano \bgroup \em et al.\egroup }}{{}}} \@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces VMGTD2 algorithm with linear function approximation in the off-policy setting}}{16}{algorithm.4}\protected@file@percent } \newlabel{alg:algorithm 4}{{4}{16}{Proof of Theorem \ref {theorem2}}{algorithm.4}{}} \@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Learning rates ($lr$) of four control experiments.}}{16}{table.3}\protected@file@percent } \newlabel{lrofways}{{3}{16}{Learning rates ($lr$) of four control experiments}{table.3}{}} \bibcite{borkar2000ode}{{3}{2000}{{Borkar and Meyn}}{{}}} \bibcite{borkar1997stochastic}{{4}{1997}{{Borkar}}{{}}} \bibcite{chen2023modified}{{5}{2023}{{Chen \bgroup \em et al.\egroup }}{{}}} \bibcite{dalal2020tale}{{6}{2020}{{Dalal \bgroup \em et al.\egroup }}{{}}} \bibcite{devlin2012dynamic}{{7}{2012}{{Devlin and Kudenko}}{{}}} \bibcite{feng2019kernel}{{8}{2019}{{Feng \bgroup \em et al.\egroup }}{{}}} \bibcite{givchi2015quasi}{{9}{2015}{{Givchi and Palhang}}{{}}} \bibcite{hackman2012faster}{{10}{2012}{{Hackman}}{{}}} \bibcite{hallak2016generalized}{{11}{2016}{{Hallak \bgroup \em et al.\egroup }}{{}}} \bibcite{hirsch1989convergent}{{12}{1989}{{Hirsch}}{{}}} \bibcite{johnson2013accelerating}{{13}{2013}{{Johnson and Zhang}}{{}}} \bibcite{korda2015td}{{14}{2015}{{Korda and La}}{{}}} \bibcite{liu2015finite}{{15}{2015}{{Liu \bgroup \em et al.\egroup }}{{}}} \bibcite{liu2016proximal}{{16}{2016}{{Liu \bgroup \em et al.\egroup }}{{}}} \bibcite{liu2018proximal}{{17}{2018}{{Liu \bgroup \em et al.\egroup }}{{}}} \bibcite{maei2011gradient}{{18}{2011}{{Maei}}{{}}} \bibcite{ng1999policy}{{19}{1999}{{Ng \bgroup \em et al.\egroup }}{{}}} \bibcite{pan2017accelerated}{{20}{2017}{{Pan \bgroup \em et al.\egroup }}{{}}} \bibcite{schulman2015trust}{{21}{2015}{{Schulman \bgroup \em et al.\egroup }}{{}}} \bibcite{schulman2017proximal}{{22}{2017}{{Schulman \bgroup \em et al.\egroup }}{{}}} \bibcite{schwartz1993reinforcement}{{23}{1993}{{Schwartz}}{{}}} \bibcite{Sutton2018book}{{24}{2018}{{Sutton and Barto}}{{}}} \bibcite{sutton2008convergent}{{25}{2008}{{Sutton \bgroup \em et al.\egroup }}{{}}} \bibcite{sutton2009fast}{{26}{2009}{{Sutton \bgroup \em et al.\egroup }}{{}}} \bibcite{sutton2016emphatic}{{27}{2016}{{Sutton \bgroup \em et al.\egroup }}{{}}} \bibcite{sutton1988learning}{{28}{1988}{{Sutton}}{{}}} \bibcite{tsitsiklis1997analysis}{{29}{1997}{{Tsitsiklis and Van~Roy}}{{}}} \bibcite{xu2019reanalysis}{{30}{2019}{{Xu \bgroup \em et al.\egroup }}{{}}} \bibcite{xu2020reanalysis}{{31}{2020}{{Xu \bgroup \em et al.\egroup }}{{}}} \bibcite{zhang2022truncated}{{32}{2022}{{Zhang and Whiteson}}{{}}} \gdef \@abspage@last{18}