\begin{thebibliography}{30}
\providecommand{\natexlab}[1]{#1}

\bibitem[{Baird(1995)}]{baird1995residual}
Baird, L. 1995.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proceedings of the 12th International Conference on Machine Learning}, 30--37.

\bibitem[{Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu}]{basserrano2021logistic}
Bas-Serrano, J.; Curi, S.; Krause, A.; and Neu, G. 2021.
\newblock Logistic Q-Learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, 3610--3618.

\bibitem[{Borkar(1997)}]{borkar1997stochastic}
Borkar, V.~S. 1997.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Systems \& Control Letters}, 29(5): 291--294.

\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
Borkar, V.~S.; and Meyn, S.~P. 2000.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM Journal on Control and Optimization}, 38(2): 447--469.

\bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified}
Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023.
\newblock Modified Retrace for Off-Policy Temporal Difference Learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, 303--312.

\bibitem[{Devlin and Kudenko(2012)}]{devlin2012dynamic}
Devlin, S.; and Kudenko, D. 2012.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, 433--440.

\bibitem[{Feng, Li, and Liu(2019)}]{feng2019kernel}
Feng, Y.; Li, L.; and Liu, Q. 2019.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 15430--15441.

\bibitem[{Givchi and Palhang(2015)}]{givchi2015quasi}
Givchi, A.; and Palhang, M. 2015.
\newblock Quasi-Newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, 159--172.

\bibitem[{Hackman(2012)}]{hackman2012faster}
Hackman, L. 2012.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Master's thesis, University of Alberta.

\bibitem[{Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor}]{hallak2016generalized}
Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016.
\newblock Generalized emphatic temporal difference learning: Bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637.

\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
Hirsch, M.~W. 1989.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Networks}, 2(5): 331--349.

\bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating}
Johnson, R.; and Zhang, T. 2013.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, 315--323.

\bibitem[{Korda and La(2015)}]{korda2015td}
Korda, N.; and La, P. 2015.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International Conference on Machine Learning}, 626--634.

\bibitem[{Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}]{liu2018proximal}
Liu, B.; Gemp, I.; Ghavamzadeh, M.; Liu, J.; Mahadevan, S.; and Petrik, M. 2018.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63: 461--494.
\bibitem[{Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2015finite}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2015.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, 504--513.

\bibitem[{Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}]{liu2016proximal}
Liu, B.; Liu, J.; Ghavamzadeh, M.; Mahadevan, S.; and Petrik, M. 2016.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, 4195--4199.

\bibitem[{Ng, Harada, and Russell(1999)}]{ng1999policy}
Ng, A.~Y.; Harada, D.; and Russell, S. 1999.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proceedings of the 16th International Conference on Machine Learning}, 278--287.

\bibitem[{Pan, White, and White(2017)}]{pan2017accelerated}
Pan, Y.; White, A.; and White, M. 2017.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, 2464--2470.

\bibitem[{Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz}]{schulman2015trust}
Schulman, J.; Levine, S.; Abbeel, P.; Jordan, M.; and Moritz, P. 2015.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, 1889--1897.

\bibitem[{Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov}]{schulman2017proximal}
Schulman, J.; Wolski, F.; Dhariwal, P.; Radford, A.; and Klimov, O. 2017.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}.

\bibitem[{Schwartz(1993)}]{schwartz1993reinforcement}
Schwartz, A. 1993.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proceedings of the 10th International Conference on Machine Learning}, 298--305.

\bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
Sutton, R.~S.; Maei, H.~R.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proceedings of the 26th International Conference on Machine Learning}, 993--1000.

\bibitem[{Sutton(1988)}]{sutton1988learning}
Sutton, R.~S. 1988.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine Learning}, 3(1): 9--44.

\bibitem[{Sutton and Barto(2018)}]{Sutton2018book}
Sutton, R.~S.; and Barto, A.~G. 2018.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition.

\bibitem[{Sutton, Maei, and Szepesv{\'a}ri(2008)}]{sutton2008convergent}
Sutton, R.~S.; Maei, H.~R.; and Szepesv{\'a}ri, C. 2008.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1609--1616. Cambridge, MA: MIT Press.

\bibitem[{Sutton, Mahmood, and White(2016)}]{sutton2016emphatic}
Sutton, R.~S.; Mahmood, A.~R.; and White, M. 2016.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17(1): 2603--2631.

\bibitem[{Tsitsiklis and Van~Roy(1997)}]{tsitsiklis1997analysis}
Tsitsiklis, J.~N.; and Van~Roy, B. 1997.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, 1075--1081.

\bibitem[{Xu et~al.(2019)Xu, Wang, Zhou, and Liang}]{xu2019reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2019.
\newblock Reanalysis of Variance Reduced Temporal Difference Learning.
\newblock In \emph{International Conference on Learning Representations}.

\bibitem[{Xu et~al.(2020)Xu, Wang, Zhou, and Liang}]{xu2020reanalysis}
Xu, T.; Wang, Z.; Zhou, Y.; and Liang, Y. 2020.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}.

\bibitem[{Zhang and Whiteson(2022)}]{zhang2022truncated}
Zhang, S.; and Whiteson, S. 2022.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23(1): 6859--6917.

\end{thebibliography}