\begin{thebibliography}{34}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Baird(1995)]{baird1995residual}
Baird, L.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proceedings of the 12th International Conference on Machine Learning}, pp.\ 30--37, 1995.

\bibitem[Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu]{basserrano2021logistic}
Bas-Serrano, J., Curi, S., Krause, A., and Neu, G.
\newblock Logistic Q-learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, pp.\ 3610--3618, 2021.

\bibitem[Borkar(1997)]{borkar1997stochastic}
Borkar, V.~S.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Systems \& Control Letters}, 29\penalty0 (5):\penalty0 291--294, 1997.

\bibitem[Borkar \& Meyn(2000)Borkar and Meyn]{borkar2000ode}
Borkar, V.~S. and Meyn, S.~P.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM Journal on Control and Optimization}, 38\penalty0 (2):\penalty0 447--469, 2000.

\bibitem[Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao]{chen2023modified}
Chen, X., Ma, X., Li, Y., Yang, G., Yang, S., and Gao, Y.
\newblock Modified Retrace for off-policy temporal difference learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, pp.\ 303--312. PMLR, 2023.

\bibitem[Dalal et~al.(2020)Dalal, Szorenyi, and Thoppe]{dalal2020tale}
Dalal, G., Szorenyi, B., and Thoppe, G.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pp.\ 3701--3708, 2020.

\bibitem[Devlin \& Kudenko(2012)Devlin and Kudenko]{devlin2012dynamic}
Devlin, S. and Kudenko, D.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, pp.\ 433--440, 2012.

\bibitem[Feng et~al.(2019)Feng, Li, and Liu]{feng2019kernel}
Feng, Y., Li, L., and Liu, Q.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 15430--15441, 2019.

\bibitem[Givchi \& Palhang(2015)Givchi and Palhang]{givchi2015quasi}
Givchi, A. and Palhang, M.
\newblock Quasi Newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, pp.\ 159--172, 2015.

\bibitem[Hackman(2012)]{hackman2012faster}
Hackman, L.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Master's thesis, University of Alberta, 2012.

\bibitem[Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor]{hallak2016generalized}
Hallak, A., Tamar, A., Munos, R., and Mannor, S.
\newblock Generalized emphatic temporal difference learning: Bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pp.\ 1631--1637, 2016.

\bibitem[Hirsch(1989)]{hirsch1989convergent}
Hirsch, M.~W.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Networks}, 2\penalty0 (5):\penalty0 331--349, 1989.

\bibitem[Johnson \& Zhang(2013)Johnson and Zhang]{johnson2013accelerating}
Johnson, R. and Zhang, T.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 315--323, 2013.
\bibitem[Korda \& La(2015)Korda and La]{korda2015td}
Korda, N. and La, P.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 626--634. PMLR, 2015.

\bibitem[Langley(2000)]{langley00}
Langley, P.
\newblock Crafting papers on machine learning.
\newblock In Langley, P. (ed.), \emph{Proceedings of the 17th International Conference on Machine Learning (ICML 2000)}, pp.\ 1207--1216, Stanford, CA, 2000. Morgan Kaufmann.

\bibitem[Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik]{liu2015finite}
Liu, B., Liu, J., Ghavamzadeh, M., Mahadevan, S., and Petrik, M.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, pp.\ 504--513, 2015.

\bibitem[Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik]{liu2016proximal}
Liu, B., Liu, J., Ghavamzadeh, M., Mahadevan, S., and Petrik, M.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, pp.\ 4195--4199, 2016.

\bibitem[Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik]{liu2018proximal}
Liu, B., Gemp, I., Ghavamzadeh, M., Liu, J., Mahadevan, S., and Petrik, M.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63:\penalty0 461--494, 2018.

\bibitem[Maei(2011)]{maei2011gradient}
Maei, H.~R.
\newblock \emph{Gradient Temporal-Difference Learning Algorithms}.
\newblock PhD thesis, University of Alberta, 2011.

\bibitem[Ng et~al.(1999)Ng, Harada, and Russell]{ng1999policy}
Ng, A.~Y., Harada, D., and Russell, S.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proceedings of the 16th International Conference on Machine Learning}, pp.\ 278--287, 1999.

\bibitem[Pan et~al.(2017)Pan, White, and White]{pan2017accelerated}
Pan, Y., White, A., and White, M.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, pp.\ 2464--2470, 2017.

\bibitem[Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz]{schulman2015trust}
Schulman, J., Levine, S., Abbeel, P., Jordan, M., and Moritz, P.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1889--1897, 2015.

\bibitem[Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov]{schulman2017proximal}
Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}, 2017.

\bibitem[Schwartz(1993)]{schwartz1993reinforcement}
Schwartz, A.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proceedings of the 10th International Conference on Machine Learning}, pp.\ 298--305, 1993.

\bibitem[Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora]{sutton2009fast}
Sutton, R., Maei, H., Precup, D., Bhatnagar, S., Silver, D., Szepesv{\'a}ri, C., and Wiewiora, E.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proceedings of the 26th International Conference on Machine Learning}, pp.\ 993--1000, 2009.
\bibitem[Sutton(1988)]{sutton1988learning}
Sutton, R.~S.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine Learning}, 3\penalty0 (1):\penalty0 9--44, 1988.

\bibitem[Sutton \& Barto(2018)Sutton and Barto]{Sutton2018book}
Sutton, R.~S. and Barto, A.~G.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.

\bibitem[Sutton et~al.(2008)Sutton, Maei, and Szepesv{\'a}ri]{sutton2008convergent}
Sutton, R.~S., Maei, H.~R., and Szepesv{\'a}ri, C.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 1609--1616. MIT Press, 2008.

\bibitem[Sutton et~al.(2016)Sutton, Mahmood, and White]{sutton2016emphatic}
Sutton, R.~S., Mahmood, A.~R., and White, M.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17\penalty0 (1):\penalty0 2603--2631, 2016.

\bibitem[Tsitsiklis \& Van~Roy(1997)Tsitsiklis and Van~Roy]{tsitsiklis1997analysis}
Tsitsiklis, J.~N. and Van~Roy, B.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 1075--1081, 1997.

\bibitem[Xu et~al.(2019)Xu, Wang, Zhou, and Liang]{xu2019reanalysis}
Xu, T., Wang, Z., Zhou, Y., and Liang, Y.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In \emph{International Conference on Learning Representations}, 2019.

\bibitem[Xu et~al.(2020)Xu, Wang, Zhou, and Liang]{xu2020reanalysis}
Xu, T., Wang, Z., Zhou, Y., and Liang, Y.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}, 2020.

\bibitem[Zhang \& Whiteson(2022)Zhang and Whiteson]{zhang2022truncated}
Zhang, S. and Whiteson, S.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23\penalty0 (1):\penalty0 6859--6917, 2022.

\bibitem[Zhou(2021)]{zhou2021machine}
Zhou, Z.-H.
\newblock \emph{Machine Learning}.
\newblock Springer Nature, 2021.

\end{thebibliography}