\begin{thebibliography}{}

\bibitem[\protect\citeauthoryear{Baird}{1995}]{baird1995residual}
Leemon Baird.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In {\em Proceedings of the 12th International Conference on Machine Learning}, pages 30--37, 1995.

\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic}
Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu.
\newblock Logistic {Q}-learning.
\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021.

\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode}
Vivek~S Borkar and Sean~P Meyn.
\newblock The {ODE} method for convergence of stochastic approximation and reinforcement learning.
\newblock {\em SIAM Journal on Control and Optimization}, 38(2):447--469, 2000.

\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic}
Vivek~S Borkar.
\newblock Stochastic approximation with two time scales.
\newblock {\em Systems \& Control Letters}, 29(5):291--294, 1997.

\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified}
Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao.
\newblock Modified {R}etrace for off-policy temporal difference learning.
\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023.

\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale}
Gal Dalal, Balazs Szorenyi, and Gugan Thoppe.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020.

\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic}
Sam Devlin and Daniel Kudenko.
\newblock Dynamic potential-based reward shaping.
\newblock In {\em Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, pages 433--440, 2012.

\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel}
Yihao Feng, Lihong Li, and Qiang Liu.
\newblock A kernel loss for solving the {B}ellman equation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019.

\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi}
Arash Givchi and Maziar Palhang.
\newblock Quasi {N}ewton temporal difference learning.
\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015.

\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster}
Leah Hackman.
\newblock {\em Faster Gradient-TD Algorithms}.
\newblock PhD thesis, University of Alberta, 2012.

\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized}
Assaf Hallak, Aviv Tamar, R{\'e}mi Munos, and Shie Mannor.
\newblock Generalized emphatic temporal difference learning: {B}ias-variance analysis.
\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016.

\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent}
Morris~W Hirsch.
\newblock Convergent activation dynamics in continuous time networks.
\newblock {\em Neural Networks}, 2(5):331--349, 1989.

\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating}
Rie Johnson and Tong Zhang.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013.
\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td}
Nathaniel Korda and Prashanth La.
\newblock On {TD}(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In {\em International Conference on Machine Learning}, pages 626--634. PMLR, 2015.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Finite-sample analysis of proximal gradient {TD} algorithms.
\newblock In {\em Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In {\em Proceedings of the 25th International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal}
Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018.

\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient}
Hamid~Reza Maei.
\newblock {\em Gradient Temporal-Difference Learning Algorithms}.
\newblock PhD thesis, University of Alberta, 2011.

\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy}
Andrew~Y Ng, Daishi Harada, and Stuart Russell.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In {\em Proceedings of the 16th International Conference on Machine Learning}, pages 278--287, 1999.

\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated}
Yangchen Pan, Adam White, and Martha White.
\newblock Accelerated gradient temporal difference learning.
\newblock In {\em Proceedings of the 31st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017.

\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust}
John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz.
\newblock Trust region policy optimization.
\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015.

\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal}
John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
\newblock Proximal policy optimization algorithms.
\newblock {\em arXiv preprint arXiv:1707.06347}, 2017.

\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement}
Anton Schwartz.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In {\em Proceedings of the 10th International Conference on Machine Learning}, pages 298--305, 1993.

\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book}
Richard~S. Sutton and Andrew~G. Barto.
\newblock {\em Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent}
Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616, 2008.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast}
Richard~S Sutton, Hamid~R Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesv{\'a}ri, and Eric Wiewiora.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In {\em Proceedings of the 26th International Conference on Machine Learning}, pages 993--1000, 2009.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
Richard~S Sutton, A~Rupam Mahmood, and Martha White.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.

\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
Richard~S Sutton.
\newblock Learning to predict by the methods of temporal differences.
\newblock {\em Machine Learning}, 3(1):9--44, 1988.

\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
John~N Tsitsiklis and Benjamin Van~Roy.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.

\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In {\em International Conference on Learning Representations}, 2019.

\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.

\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
Shangtong Zhang and Shimon Whiteson.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022.

\end{thebibliography}