\begin{thebibliography}{}

\bibitem[\protect\citeauthoryear{Baird}{1995}]{baird1995residual}
Leemon Baird.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In {\em Proceedings of the 12th International Conference on Machine Learning}, pages 30--37, 1995.

\bibitem[\protect\citeauthoryear{Bas-Serrano \bgroup \em et al.\egroup }{2021}]{basserrano2021logistic}
Joan Bas-Serrano, Sebastian Curi, Andreas Krause, and Gergely Neu.
\newblock Logistic {Q}-learning.
\newblock In {\em International Conference on Artificial Intelligence and Statistics}, pages 3610--3618, 2021.

\bibitem[\protect\citeauthoryear{Borkar and Meyn}{2000}]{borkar2000ode}
Vivek~S Borkar and Sean~P Meyn.
\newblock The {ODE} method for convergence of stochastic approximation and reinforcement learning.
\newblock {\em SIAM Journal on Control and Optimization}, 38(2):447--469, 2000.

\bibitem[\protect\citeauthoryear{Borkar}{1997}]{borkar1997stochastic}
Vivek~S Borkar.
\newblock Stochastic approximation with two time scales.
\newblock {\em Systems \& Control Letters}, 29(5):291--294, 1997.

\bibitem[\protect\citeauthoryear{Chen \bgroup \em et al.\egroup }{2023}]{chen2023modified}
Xingguo Chen, Xingzhou Ma, Yang Li, Guang Yang, Shangdong Yang, and Yang Gao.
\newblock Modified {R}etrace for off-policy temporal difference learning.
\newblock In {\em Uncertainty in Artificial Intelligence}, pages 303--312. PMLR, 2023.

\bibitem[\protect\citeauthoryear{Dalal \bgroup \em et al.\egroup }{2020}]{dalal2020tale}
Gal Dalal, Balazs Szorenyi, and Gugan Thoppe.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In {\em Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pages 3701--3708, 2020.

\bibitem[\protect\citeauthoryear{Devlin and Kudenko}{2012}]{devlin2012dynamic}
Sam Devlin and Daniel Kudenko.
\newblock Dynamic potential-based reward shaping.
\newblock In {\em Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, pages 433--440, 2012.

\bibitem[\protect\citeauthoryear{Feng \bgroup \em et al.\egroup }{2019}]{feng2019kernel}
Yihao Feng, Lihong Li, and Qiang Liu.
\newblock A kernel loss for solving the {B}ellman equation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 15430--15441, 2019.

\bibitem[\protect\citeauthoryear{Givchi and Palhang}{2015}]{givchi2015quasi}
Arash Givchi and Maziar Palhang.
\newblock Quasi {N}ewton temporal difference learning.
\newblock In {\em Asian Conference on Machine Learning}, pages 159--172, 2015.

\bibitem[\protect\citeauthoryear{Hackman}{2012}]{hackman2012faster}
Leah Hackman.
\newblock {\em Faster Gradient-TD Algorithms}.
\newblock PhD thesis, University of Alberta, 2012.

\bibitem[\protect\citeauthoryear{Hallak \bgroup \em et al.\egroup }{2016}]{hallak2016generalized}
Assaf Hallak, Aviv Tamar, R{\'e}mi Munos, and Shie Mannor.
\newblock Generalized emphatic temporal difference learning: {B}ias-variance analysis.
\newblock In {\em Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pages 1631--1637, 2016.

\bibitem[\protect\citeauthoryear{Hirsch}{1989}]{hirsch1989convergent}
Morris~W Hirsch.
\newblock Convergent activation dynamics in continuous time networks.
\newblock {\em Neural Networks}, 2(5):331--349, 1989.

\bibitem[\protect\citeauthoryear{Johnson and Zhang}{2013}]{johnson2013accelerating}
Rie Johnson and Tong Zhang.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 315--323, 2013.
\bibitem[\protect\citeauthoryear{Korda and La}{2015}]{korda2015td}
Nathaniel Korda and Prashanth La.
\newblock On {TD}(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In {\em International Conference on Machine Learning}, pages 626--634. PMLR, 2015.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2015}]{liu2015finite}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Finite-sample analysis of proximal gradient {TD} algorithms.
\newblock In {\em Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, pages 504--513, 2015.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2016}]{liu2016proximal}
Bo~Liu, Ji~Liu, Mohammad Ghavamzadeh, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In {\em Proceedings of the 25th International Joint Conference on Artificial Intelligence}, pages 4195--4199, 2016.

\bibitem[\protect\citeauthoryear{Liu \bgroup \em et al.\egroup }{2018}]{liu2018proximal}
Bo~Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji~Liu, Sridhar Mahadevan, and Marek Petrik.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock {\em Journal of Artificial Intelligence Research}, 63:461--494, 2018.

\bibitem[\protect\citeauthoryear{Maei}{2011}]{maei2011gradient}
Hamid~Reza Maei.
\newblock {\em Gradient Temporal-Difference Learning Algorithms}.
\newblock PhD thesis, University of Alberta, 2011.

\bibitem[\protect\citeauthoryear{Ng \bgroup \em et al.\egroup }{1999}]{ng1999policy}
Andrew~Y Ng, Daishi Harada, and Stuart Russell.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In {\em Proceedings of the 16th International Conference on Machine Learning}, pages 278--287, 1999.

\bibitem[\protect\citeauthoryear{Pan \bgroup \em et al.\egroup }{2017}]{pan2017accelerated}
Yangchen Pan, Adam White, and Martha White.
\newblock Accelerated gradient temporal difference learning.
\newblock In {\em Proceedings of the 31st AAAI Conference on Artificial Intelligence}, pages 2464--2470, 2017.

\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2015}]{schulman2015trust}
John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz.
\newblock Trust region policy optimization.
\newblock In {\em International Conference on Machine Learning}, pages 1889--1897, 2015.

\bibitem[\protect\citeauthoryear{Schulman \bgroup \em et al.\egroup }{2017}]{schulman2017proximal}
John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
\newblock Proximal policy optimization algorithms.
\newblock {\em arXiv preprint arXiv:1707.06347}, 2017.

\bibitem[\protect\citeauthoryear{Schwartz}{1993}]{schwartz1993reinforcement}
Anton Schwartz.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In {\em Proceedings of the 10th International Conference on Machine Learning}, pages 298--305, 1993.

\bibitem[\protect\citeauthoryear{Sutton and Barto}{2018}]{Sutton2018book}
Richard~S. Sutton and Andrew~G. Barto.
\newblock {\em Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2008}]{sutton2008convergent}
Richard~S Sutton, Hamid~R Maei, and Csaba Szepesv{\'a}ri.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1609--1616, 2008.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2009}]{sutton2009fast}
Richard~S Sutton, Hamid~R Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesv{\'a}ri, and Eric Wiewiora.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In {\em Proceedings of the 26th International Conference on Machine Learning}, pages 993--1000, 2009.

\bibitem[\protect\citeauthoryear{Sutton \bgroup \em et al.\egroup }{2016}]{sutton2016emphatic}
Richard~S Sutton, A~Rupam Mahmood, and Martha White.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock {\em The Journal of Machine Learning Research}, 17(1):2603--2631, 2016.

\bibitem[\protect\citeauthoryear{Sutton}{1988}]{sutton1988learning}
Richard~S Sutton.
\newblock Learning to predict by the methods of temporal differences.
\newblock {\em Machine Learning}, 3(1):9--44, 1988.

\bibitem[\protect\citeauthoryear{Tsitsiklis and Van~Roy}{1997}]{tsitsiklis1997analysis}
John~N Tsitsiklis and Benjamin Van~Roy.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In {\em Advances in Neural Information Processing Systems}, pages 1075--1081, 1997.

\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2019}]{xu2019reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In {\em International Conference on Learning Representations}, 2019.

\bibitem[\protect\citeauthoryear{Xu \bgroup \em et al.\egroup }{2020}]{xu2020reanalysis}
Tengyu Xu, Zhe Wang, Yi~Zhou, and Yingbin Liang.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock {\em arXiv preprint arXiv:2001.01898}, 2020.

\bibitem[\protect\citeauthoryear{Zhang and Whiteson}{2022}]{zhang2022truncated}
Shangtong Zhang and Shimon Whiteson.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock {\em The Journal of Machine Learning Research}, 23(1):6859--6917, 2022.

\end{thebibliography}