\relax \providecommand\hyper@newdestlabel[2]{} \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined \global\let\oldnewlabel\newlabel \gdef\newlabel#1#2{\newlabelxx{#1}#2} \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} \AtEndDocument{\ifx\hyper@anchor\@undefined \let\newlabel\oldnewlabel \fi} \fi} \global\let\hyper@last\relax \gdef\HyperFirstAtBeginDocument#1{#1} \providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToCoFields[2]{} \citation{sutton1988learning} \citation{tsitsiklis1997analysis} \citation{Sutton2018book} \citation{baird1995residual} \citation{sutton2008convergent} \citation{sutton2009fast} \citation{sutton2016emphatic} \citation{chen2023modified} \citation{hackman2012faster} \citation{liu2015finite,liu2016proximal,liu2018proximal} \citation{givchi2015quasi} \citation{pan2017accelerated} \citation{hallak2016generalized} \citation{zhang2022truncated} \citation{johnson2013accelerating} \citation{korda2015td} \citation{xu2019reanalysis} \citation{Sutton2018book} \citation{baird1995residual} \citation{sutton2009fast} \citation{sutton2009fast} \citation{feng2019kernel} \citation{basserrano2021logistic} \newlabel{introduction}{{1}{1}{}{section.1}{}} \newlabel{introduction@cref}{{[section][1][]1}{[1][1][]1}} \citation{zhou2021machine} \citation{Sutton2018book} \citation{Sutton2018book} \citation{sutton2009fast} \citation{sutton2009fast} \citation{ng1999policy} \newlabel{preliminaries}{{2}{2}{}{section.2}{}} \newlabel{preliminaries@cref}{{[section][2][]2}{[1][2][]2}} \newlabel{valuefunction}{{2}{2}{}{section.2}{}} \newlabel{valuefunction@cref}{{[section][2][]2}{[1][2][]2}} \newlabel{linearvaluefunction}{{1}{2}{}{equation.2.1}{}} \newlabel{linearvaluefunction@cref}{{[equation][1][]1}{[1][2][]2}} \citation{devlin2012dynamic} \newlabel{example_bias}{{1}{3}{Classification accuracies for naive Bayes and flexible Bayes on various data sets}{table.1}{}} \newlabel{example_bias@cref}{{[table][1][]1}{[1][2][]3}} \newlabel{omega}{{3}{3}{}{equation.3.3}{}} \newlabel{omega@cref}{{[equation][3][]3}{[1][3][]3}} \newlabel{delta}{{4}{3}{}{equation.3.4}{}} \newlabel{delta@cref}{{[equation][4][]4}{[1][3][]3}} \newlabel{theta}{{5}{3}{}{equation.3.5}{}} \newlabel{theta@cref}{{[equation][5][]5}{[1][3][]3}} \newlabel{deltaSarsa}{{8}{3}{}{equation.3.8}{}} \newlabel{deltaSarsa@cref}{{[equation][8][]8}{[1][3][]3}} \newlabel{deltaQ}{{9}{3}{}{equation.3.9}{}} \newlabel{deltaQ@cref}{{[equation][9][]9}{[1][3][]3}} \citation{borkar1997stochastic} \citation{hirsch1989convergent} \newlabel{alg:algorithm 1}{{1}{4}{}{algorithm.1}{}} \newlabel{alg:algorithm 1@cref}{{[algorithm][1][]1}{[1][3][]4}} \newlabel{thetavmtdc}{{11}{4}{}{equation.3.11}{}} \newlabel{thetavmtdc@cref}{{[equation][11][]11}{[1][3][]4}} \newlabel{uvmtdc}{{12}{4}{}{equation.3.12}{}} \newlabel{uvmtdc@cref}{{[equation][12][]12}{[1][3][]4}} \newlabel{omegavmtdc}{{13}{4}{}{equation.3.13}{}} \newlabel{omegavmtdc@cref}{{[equation][13][]13}{[1][3][]4}} \newlabel{theorem1}{{4.1}{4}{}{theorem.4.1}{}} \newlabel{theorem1@cref}{{[theorem][1][4]4.1}{[1][4][]4}} \newlabel{th1proof}{{4}{4}{}{theorem.4.1}{}} \newlabel{th1proof@cref}{{[section][4][]4}{[1][4][]4}} \newlabel{thetaFast}{{17}{4}{}{equation.4.17}{}} \newlabel{thetaFast@cref}{{[equation][17][]17}{[1][4][]4}} \newlabel{omegaFast}{{18}{4}{}{equation.4.18}{}} \newlabel{omegaFast@cref}{{[equation][18][]18}{[1][4][]4}} \newlabel{omegaFastFinal}{{19}{4}{}{equation.4.19}{}} \newlabel{omegaFastFinal@cref}{{[equation][19][]19}{[1][4][]4}} \newlabel{omegaInfty}{{20}{4}{}{equation.4.20}{}} \newlabel{omegaInfty@cref}{{[equation][20][]20}{[1][4][]4}} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \citation{dalal2020tale} \citation{dalal2020tale} \newlabel{odetheta}{{21}{5}{}{equation.4.21}{}} \newlabel{odetheta@cref}{{[equation][21][]21}{[1][5][]5}} \newlabel{covariance}{{22}{5}{}{equation.4.22}{}} \newlabel{covariance@cref}{{[equation][22][]22}{[1][5][]5}} \newlabel{odethetafinal}{{23}{5}{}{equation.4.23}{}} \newlabel{odethetafinal@cref}{{[equation][23][]23}{[1][5][]5}} \newlabel{corollary4_2}{{4.2}{5}{}{theorem.4.2}{}} \newlabel{corollary4_2@cref}{{[corollary][2][4]4.2}{[1][5][]5}} \newlabel{theorem2}{{4.3}{5}{}{theorem.4.3}{}} \newlabel{theorem2@cref}{{[theorem][3][4]4.3}{[1][5][]5}} \citation{Sutton2018book} \citation{sutton2009fast} \citation{baird1995residual,sutton2009fast} \citation{baird1995residual,sutton2009fast,maei2011gradient} \newlabel{randomwalk}{{1}{6}{Random walk}{figure.1}{}} \newlabel{randomwalk@cref}{{[figure][1][]1}{[1][6][]6}} \newlabel{bairdexample}{{2}{6}{7-state version of Baird's off-policy counterexample}{figure.2}{}} \newlabel{bairdexample@cref}{{[figure][2][]2}{[1][6][]6}} \citation{schwartz1993reinforcement} \citation{korda2015td} \citation{xu2020reanalysis} \newlabel{differenceRandVMQ}{{2}{7}{Difference between R-learning and tabular VMQ}{table.2}{}} \newlabel{differenceRandVMQ@cref}{{[table][2][]2}{[1][6][]7}} \newlabel{DependentFull}{{3(a)}{7}{Subfigure 3(a)}{subfigure.3.1}{}} \newlabel{DependentFull@cref}{{[subfigure][1][3]3(a)}{[1][6][]7}} \newlabel{sub@DependentFull}{{(a)}{7}{Subfigure 3(a)\relax }{subfigure.3.1}{}} \newlabel{TabularFull}{{3(b)}{7}{Subfigure 3(b)}{subfigure.3.2}{}} \newlabel{TabularFull@cref}{{[subfigure][2][3]3(b)}{[1][6][]7}} \newlabel{sub@TabularFull}{{(b)}{7}{Subfigure 3(b)\relax }{subfigure.3.2}{}} \newlabel{InvertedFull}{{3(c)}{7}{Subfigure 3(c)}{subfigure.3.3}{}} \newlabel{InvertedFull@cref}{{[subfigure][3][3]3(c)}{[1][6][]7}} \newlabel{sub@InvertedFull}{{(c)}{7}{Subfigure 3(c)\relax }{subfigure.3.3}{}} \newlabel{CounterExampleFull}{{3(d)}{7}{Subfigure 3(d)}{subfigure.3.4}{}} \newlabel{CounterExampleFull@cref}{{[subfigure][4][3]3(d)}{[1][6][]7}} \newlabel{sub@CounterExampleFull}{{(d)}{7}{Subfigure 3(d)\relax }{subfigure.3.4}{}} \newlabel{Evaluation_full}{{3}{7}{Learning curses of four evaluation environments}{figure.3}{}} \newlabel{Evaluation_full@cref}{{[figure][3][]3}{[1][6][]7}} \citation{Sutton2018book} \citation{Sutton2018book} \citation{schulman2015trust} \citation{schulman2017proximal} \citation{langley00} \bibdata{example_paper} \bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}} \newlabel{MazeFull}{{4(a)}{8}{Subfigure 4(a)}{subfigure.4.1}{}} \newlabel{MazeFull@cref}{{[subfigure][1][4]4(a)}{[1][6][]8}} \newlabel{sub@MazeFull}{{(a)}{8}{Subfigure 4(a)\relax }{subfigure.4.1}{}} \newlabel{CliffWalkingFull}{{4(b)}{8}{Subfigure 4(b)}{subfigure.4.2}{}} \newlabel{CliffWalkingFull@cref}{{[subfigure][2][4]4(b)}{[1][6][]8}} \newlabel{sub@CliffWalkingFull}{{(b)}{8}{Subfigure 4(b)\relax }{subfigure.4.2}{}} \newlabel{MountainCarFull}{{4(c)}{8}{Subfigure 4(c)}{subfigure.4.3}{}} \newlabel{MountainCarFull@cref}{{[subfigure][3][4]4(c)}{[1][6][]8}} \newlabel{sub@MountainCarFull}{{(c)}{8}{Subfigure 4(c)\relax }{subfigure.4.3}{}} \newlabel{AcrobotFull}{{4(d)}{8}{Subfigure 4(d)}{subfigure.4.4}{}} \newlabel{AcrobotFull@cref}{{[subfigure][4][4]4(d)}{[1][6][]8}} \newlabel{sub@AcrobotFull}{{(d)}{8}{Subfigure 4(d)\relax }{subfigure.4.4}{}} \newlabel{Complete_full}{{4}{8}{Learning curses of four contral environments}{figure.4}{}} \newlabel{Complete_full@cref}{{[figure][4][]4}{[1][6][]8}} \bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano et~al.}}{{Bas-Serrano, Curi, Krause, and Neu}}} \bibcite{borkar1997stochastic}{{3}{1997}{{Borkar}}{{}}} \bibcite{borkar2000ode}{{4}{2000}{{Borkar \& Meyn}}{{Borkar and Meyn}}} \bibcite{chen2023modified}{{5}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}} \bibcite{dalal2020tale}{{6}{2020}{{Dalal et~al.}}{{Dalal, Szorenyi, and Thoppe}}} \bibcite{devlin2012dynamic}{{7}{2012}{{Devlin \& Kudenko}}{{Devlin and Kudenko}}} \bibcite{feng2019kernel}{{8}{2019}{{Feng et~al.}}{{Feng, Li, and Liu}}} \bibcite{givchi2015quasi}{{9}{2015}{{Givchi \& Palhang}}{{Givchi and Palhang}}} \bibcite{hackman2012faster}{{10}{2012}{{Hackman}}{{}}} \bibcite{hallak2016generalized}{{11}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}} \bibcite{hirsch1989convergent}{{12}{1989}{{Hirsch}}{{}}} \bibcite{johnson2013accelerating}{{13}{2013}{{Johnson \& Zhang}}{{Johnson and Zhang}}} \bibcite{korda2015td}{{14}{2015}{{Korda \& La}}{{Korda and La}}} \bibcite{langley00}{{15}{2000}{{Langley}}{{}}} \bibcite{liu2015finite}{{16}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} \bibcite{liu2016proximal}{{17}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} \bibcite{liu2018proximal}{{18}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}} \bibcite{maei2011gradient}{{19}{2011}{{Maei}}{{}}} \bibcite{ng1999policy}{{20}{1999}{{Ng et~al.}}{{Ng, Harada, and Russell}}} \bibcite{pan2017accelerated}{{21}{2017}{{Pan et~al.}}{{Pan, White, and White}}} \bibcite{schulman2015trust}{{22}{2015}{{Schulman et~al.}}{{Schulman, Levine, Abbeel, Jordan, and Moritz}}} \bibcite{schulman2017proximal}{{23}{2017}{{Schulman et~al.}}{{Schulman, Wolski, Dhariwal, Radford, and Klimov}}} \bibcite{schwartz1993reinforcement}{{24}{1993}{{Schwartz}}{{}}} \bibcite{sutton2009fast}{{25}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}} \bibcite{sutton1988learning}{{26}{1988}{{Sutton}}{{}}} \bibcite{Sutton2018book}{{27}{2018}{{Sutton \& Barto}}{{Sutton and Barto}}} \bibcite{sutton2008convergent}{{28}{2008}{{Sutton et~al.}}{{Sutton, Maei, and Szepesv{\'a}ri}}} \bibcite{sutton2016emphatic}{{29}{2016}{{Sutton et~al.}}{{Sutton, Mahmood, and White}}} \bibcite{tsitsiklis1997analysis}{{30}{1997}{{Tsitsiklis \& Van~Roy}}{{Tsitsiklis and Van~Roy}}} \bibcite{xu2019reanalysis}{{31}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}} \bibcite{xu2020reanalysis}{{32}{2020}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}} \bibcite{zhang2022truncated}{{33}{2022}{{Zhang \& Whiteson}}{{Zhang and Whiteson}}} \bibcite{zhou2021machine}{{34}{2021}{{Zhou}}{{}}} \bibstyle{icml2024} \citation{dalal2020tale} \citation{dalal2020tale} \citation{dalal2020tale} \citation{dalal2020tale} \citation{sutton2009fast} \newlabel{proofcorollary4_2}{{A.1}{11}{}{subsection.A.1}{}} \newlabel{proofcorollary4_2@cref}{{[subappendix][1][2147483647,1]A.1}{[1][11][]11}} \newlabel{matrixassumption}{{A.1}{11}{}{theorem.A.1}{}} \newlabel{matrixassumption@cref}{{[assumption][1][2147483647,1]A.1}{[1][11][]11}} \newlabel{stepsizeassumption}{{A.2}{11}{}{theorem.A.2}{}} \newlabel{stepsizeassumption@cref}{{[assumption][2][2147483647,1]A.2}{[1][11][]11}} \newlabel{sparseprojection}{{A.3}{11}{}{theorem.A.3}{}} \newlabel{sparseprojection@cref}{{[definition][3][2147483647,1]A.3}{[1][11][]11}} \newlabel{sparseprojectiontheta}{{30}{11}{}{equation.A.30}{}} \newlabel{sparseprojectiontheta@cref}{{[equation][30][2147483647]30}{[1][11][]11}} \newlabel{sparseprojectionomega}{{31}{11}{}{equation.A.31}{}} \newlabel{sparseprojectionomega@cref}{{[equation][31][2147483647]31}{[1][11][]11}} \citation{hirsch1989convergent} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \newlabel{proofth2}{{A.2}{12}{}{subsection.A.2}{}} \newlabel{proofth2@cref}{{[subappendix][2][2147483647,1]A.2}{[1][11][]12}} \newlabel{thetavmtdcFastest}{{32}{12}{}{equation.A.32}{}} \newlabel{thetavmtdcFastest@cref}{{[equation][32][2147483647]32}{[1][12][]12}} \newlabel{uvmtdcFastest}{{33}{12}{}{equation.A.33}{}} \newlabel{uvmtdcFastest@cref}{{[equation][33][2147483647]33}{[1][12][]12}} \newlabel{omegavmtdcFastest}{{34}{12}{}{equation.A.34}{}} \newlabel{omegavmtdcFastest@cref}{{[equation][34][2147483647]34}{[1][12][]12}} \newlabel{omegavmtdcFastestFinal}{{35}{12}{}{equation.A.35}{}} \newlabel{omegavmtdcFastestFinal@cref}{{[equation][35][2147483647]35}{[1][12][]12}} \newlabel{omegavmtdcInfty}{{36}{12}{}{equation.A.36}{}} \newlabel{omegavmtdcInfty@cref}{{[equation][36][2147483647]36}{[1][12][]12}} \citation{hirsch1989convergent} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \newlabel{thetavmtdcFaster}{{37}{13}{}{equation.A.37}{}} \newlabel{thetavmtdcFaster@cref}{{[equation][37][2147483647]37}{[1][13][]13}} \newlabel{uvmtdcFaster}{{38}{13}{}{equation.A.38}{}} \newlabel{uvmtdcFaster@cref}{{[equation][38][2147483647]38}{[1][13][]13}} \newlabel{uvmtdcFasterFinal}{{39}{13}{}{equation.A.39}{}} \newlabel{uvmtdcFasterFinal@cref}{{[equation][39][2147483647]39}{[1][13][]13}} \newlabel{uvmtdcInfty}{{40}{13}{}{equation.A.40}{}} \newlabel{uvmtdcInfty@cref}{{[equation][40][2147483647]40}{[1][13][]13}} \newlabel{thetavmtdcSlowerFinal}{{42}{13}{}{equation.A.42}{}} \newlabel{thetavmtdcSlowerFinal@cref}{{[equation][42][2147483647]42}{[1][13][]13}} \newlabel{alg:algorithm 2}{{2}{14}{}{algorithm.2}{}} \newlabel{alg:algorithm 2@cref}{{[algorithm][2][2147483647]2}{[1][14][]14}} \newlabel{odethetavmtdcfinal}{{43}{14}{}{equation.A.43}{}} \newlabel{odethetavmtdcfinal@cref}{{[equation][43][2147483647]43}{[1][14][]14}} \newlabel{experimentaldetails}{{B}{14}{}{appendix.B}{}} \newlabel{experimentaldetails@cref}{{[appendix][2][2147483647]B}{[1][14][]14}} \newlabel{lrofways}{{3}{15}{Learning rates ($lr$) of four control experiments}{table.3}{}} \newlabel{lrofways@cref}{{[table][3][2147483647]3}{[1][15][]15}} \gdef \@abspage@last{15}