\begin{thebibliography}{30}
\providecommand{\natexlab}[1]{#1}

\bibitem[{Baird(1995)}]{baird1995residual}
Baird, L. 1995.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proceedings of the 12th International Conference on Machine Learning}, 30--37.

\bibitem[{Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu}]{basserrano2021logistic}
Bas-Serrano, J.; Curi, S.; Krause, A.; and Neu, G. 2021.
\newblock Logistic Q-Learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, 3610--3618.

\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Systems \& Control Letters}, 29(5): 291--294.

\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM Journal on Control and Optimization}, 38(2): 447--469.

\bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified}
Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023.
\newblock Modified Retrace for Off-Policy Temporal Difference Learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, 303--312.

\bibitem[{Devlin and Kudenko(2012)}]{devlin2012dynamic}
Devlin, S.; and Kudenko, D. 2012.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, 433--440.

\bibitem[{Feng, Li, and Liu(2019)}]{feng2019kernel}
Feng, Y.; Li, L.; and Liu, Q. 2019.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 15430--15441.

\bibitem[{Givchi and Palhang(2015)}]{givchi2015quasi}
Givchi, A.; and Palhang, M. 2015.
\newblock Quasi-Newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, 159--172.

\bibitem[{Hackman(2012)}]{hackman2012faster}
Hackman, L. 2012.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Master's thesis, University of Alberta.

\bibitem[{Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor}]{hallak2016generalized}
Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016.
\newblock Generalized emphatic temporal difference learning: Bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637.

\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Networks}, 2(5): 331--349.

\bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating}
Johnson, R.; and Zhang, T. 2013.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, 315--323.

\bibitem[{Korda and La(2015)}]{korda2015td}
Korda, N.; and La, P. 2015.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International Conference on Machine Learning}, 626--634.

\bibitem[{Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}]{liu2018proximal}
Liu, B.; Gemp, I.; Ghavamzadeh, M.; Liu, J.; Mahadevan, S.; and Petrik, M. 2018.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63: 461--494.
\bibitem[{Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2015finite}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2015.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, 504--513.

\bibitem[{Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2016proximal}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2016.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, 4195--4199.

\bibitem[{Ng, Harada, and Russell(1999)}]{ng1999policy}
Ng, A.~Y.; Harada, D.; and Russell, S. 1999.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proceedings of the 16th International Conference on Machine Learning}, 278--287.

\bibitem[{Pan, White, and White(2017)}]{pan2017accelerated}
Pan, Y.; White, A.; and White, M. 2017.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, 2464--2470.

\bibitem[{Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz}]{schulman2015trust}
Schulman, J.; Levine, S.; Abbeel, P.; Jordan, M.; and Moritz, P. 2015.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, 1889--1897.

\bibitem[{Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov}]{schulman2017proximal}
Schulman, J.; Wolski, F.; Dhariwal, P.; Radford, A.; and Klimov, O. 2017.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}.

\bibitem[{Schwartz(1993)}]{schwartz1993reinforcement}
Schwartz, A. 1993.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proceedings of the 10th International Conference on Machine Learning}, 298--305.

\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.~S.; Maei, H.~R.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proceedings of the 26th International Conference on Machine Learning}, 993--1000.

\bibitem[{Sutton(1988)}]{sutton1988learning}
Sutton, R.~S. 1988.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine Learning}, 3(1): 9--44.

\bibitem[{Sutton and Barto(2018)}]{Sutton2018book}
Sutton, R.~S.; and Barto, A.~G. 2018.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition.

\bibitem[{Sutton, Maei, and Szepesv{\'a}ri(2008)}]{sutton2008convergent}
Sutton, R.~S.; Maei, H.~R.; and Szepesv{\'a}ri, C. 2008.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1609--1616. Cambridge, MA: MIT Press.

\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.

\bibitem[{Tsitsiklis and Van~Roy(1997)}]{tsitsiklis1997analysis}
Tsitsiklis, J.~N.; and Van~Roy, B. 1997.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1075--1081.

\bibitem[{Xu et~al.(2019)Xu, Wang, Zhou, and Liang}]{xu2019reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2019.
\newblock Reanalysis of Variance Reduced Temporal Difference Learning.
\newblock In \emph{International Conference on Learning Representations}.

\bibitem[{Xu et~al.(2020)Xu, Wang, Zhou, and Liang}]{xu2020reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2020.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}.

\bibitem[{Zhang and Whiteson(2022)}]{zhang2022truncated}
Zhang, S.; and Whiteson, S. 2022.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23(1): 6859--6917.

\end{thebibliography}