\begin{thebibliography}{34}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Baird(1995)]{baird1995residual}
Baird, L.
\newblock Residual algorithms: Reinforcement learning with function approximation.
\newblock In \emph{Proceedings of the 12th International Conference on Machine Learning}, pp.\ 30--37, 1995.

\bibitem[Bas-Serrano et~al.(2021)Bas-Serrano, Curi, Krause, and Neu]{basserrano2021logistic}
Bas-Serrano, J., Curi, S., Krause, A., and Neu, G.
\newblock Logistic Q-learning.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, pp.\ 3610--3618, 2021.

\bibitem[Borkar(1997)]{borkar1997stochastic}
Borkar, V.~S.
\newblock Stochastic approximation with two time scales.
\newblock \emph{Systems \& Control Letters}, 29\penalty0 (5):\penalty0 291--294, 1997.

\bibitem[Borkar \& Meyn(2000)Borkar and Meyn]{borkar2000ode}
Borkar, V.~S. and Meyn, S.~P.
\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
\newblock \emph{SIAM Journal on Control and Optimization}, 38\penalty0 (2):\penalty0 447--469, 2000.

\bibitem[Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao]{chen2023modified}
Chen, X., Ma, X., Li, Y., Yang, G., Yang, S., and Gao, Y.
\newblock Modified Retrace for off-policy temporal difference learning.
\newblock In \emph{Uncertainty in Artificial Intelligence}, pp.\ 303--312. PMLR, 2023.

\bibitem[Dalal et~al.(2020)Dalal, Szorenyi, and Thoppe]{dalal2020tale}
Dalal, G., Szorenyi, B., and Thoppe, G.
\newblock A tale of two-timescale reinforcement learning with the tightest finite-time bound.
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pp.\ 3701--3708, 2020.

\bibitem[Devlin \& Kudenko(2012)Devlin and Kudenko]{devlin2012dynamic}
Devlin, S. and Kudenko, D.
\newblock Dynamic potential-based reward shaping.
\newblock In \emph{Proceedings of the 11th International Conference on Autonomous Agents and Multiagent Systems}, pp.\ 433--440, 2012.

\bibitem[Feng et~al.(2019)Feng, Li, and Liu]{feng2019kernel}
Feng, Y., Li, L., and Liu, Q.
\newblock A kernel loss for solving the Bellman equation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 15430--15441, 2019.

\bibitem[Givchi \& Palhang(2015)Givchi and Palhang]{givchi2015quasi}
Givchi, A. and Palhang, M.
\newblock Quasi Newton temporal difference learning.
\newblock In \emph{Asian Conference on Machine Learning}, pp.\ 159--172, 2015.

\bibitem[Hackman(2012)]{hackman2012faster}
Hackman, L.
\newblock \emph{Faster Gradient-TD Algorithms}.
\newblock Master's thesis, University of Alberta, 2012.

\bibitem[Hallak et~al.(2016)Hallak, Tamar, Munos, and Mannor]{hallak2016generalized}
Hallak, A., Tamar, A., Munos, R., and Mannor, S.
\newblock Generalized emphatic temporal difference learning: Bias-variance analysis.
\newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, pp.\ 1631--1637, 2016.

\bibitem[Hirsch(1989)]{hirsch1989convergent}
Hirsch, M.~W.
\newblock Convergent activation dynamics in continuous time networks.
\newblock \emph{Neural Networks}, 2\penalty0 (5):\penalty0 331--349, 1989.

\bibitem[Johnson \& Zhang(2013)Johnson and Zhang]{johnson2013accelerating}
Johnson, R. and Zhang, T.
\newblock Accelerating stochastic gradient descent using predictive variance reduction.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 315--323, 2013.
\bibitem[Korda \& La(2015)Korda and La]{korda2015td}
Korda, N. and La, P.
\newblock On TD(0) with function approximation: Concentration bounds and a centered variant with exponential convergence.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 626--634. PMLR, 2015.

\bibitem[Langley(2000)]{langley00}
Langley, P.
\newblock Crafting papers on machine learning.
\newblock In Langley, P. (ed.), \emph{Proceedings of the 17th International Conference on Machine Learning (ICML 2000)}, pp.\ 1207--1216, Stanford, CA, 2000. Morgan Kaufmann.

\bibitem[Liu et~al.(2015)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik]{liu2015finite}
Liu, B., Liu, J., Ghavamzadeh, M., Mahadevan, S., and Petrik, M.
\newblock Finite-sample analysis of proximal gradient TD algorithms.
\newblock In \emph{Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence}, pp.\ 504--513, 2015.

\bibitem[Liu et~al.(2016)Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik]{liu2016proximal}
Liu, B., Liu, J., Ghavamzadeh, M., Mahadevan, S., and Petrik, M.
\newblock Proximal gradient temporal difference learning algorithms.
\newblock In \emph{Proceedings of the International Joint Conference on Artificial Intelligence}, pp.\ 4195--4199, 2016.

\bibitem[Liu et~al.(2018)Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik]{liu2018proximal}
Liu, B., Gemp, I., Ghavamzadeh, M., Liu, J., Mahadevan, S., and Petrik, M.
\newblock Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity.
\newblock \emph{Journal of Artificial Intelligence Research}, 63:\penalty0 461--494, 2018.

\bibitem[Maei(2011)]{maei2011gradient}
Maei, H.~R.
\newblock \emph{Gradient Temporal-Difference Learning Algorithms}.
\newblock PhD thesis, University of Alberta, 2011.

\bibitem[Ng et~al.(1999)Ng, Harada, and Russell]{ng1999policy}
Ng, A.~Y., Harada, D., and Russell, S.
\newblock Policy invariance under reward transformations: Theory and application to reward shaping.
\newblock In \emph{Proceedings of the 16th International Conference on Machine Learning}, pp.\ 278--287, 1999.

\bibitem[Pan et~al.(2017)Pan, White, and White]{pan2017accelerated}
Pan, Y., White, A., and White, M.
\newblock Accelerated gradient temporal difference learning.
\newblock In \emph{Proceedings of the 31st AAAI Conference on Artificial Intelligence}, pp.\ 2464--2470, 2017.

\bibitem[Schulman et~al.(2015)Schulman, Levine, Abbeel, Jordan, and Moritz]{schulman2015trust}
Schulman, J., Levine, S., Abbeel, P., Jordan, M., and Moritz, P.
\newblock Trust region policy optimization.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1889--1897, 2015.

\bibitem[Schulman et~al.(2017)Schulman, Wolski, Dhariwal, Radford, and Klimov]{schulman2017proximal}
Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O.
\newblock Proximal policy optimization algorithms.
\newblock \emph{arXiv preprint arXiv:1707.06347}, 2017.

\bibitem[Schwartz(1993)]{schwartz1993reinforcement}
Schwartz, A.
\newblock A reinforcement learning method for maximizing undiscounted rewards.
\newblock In \emph{Proceedings of the 10th International Conference on Machine Learning}, pp.\ 298--305, 1993.

\bibitem[Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora]{sutton2009fast}
Sutton, R., Maei, H., Precup, D., Bhatnagar, S., Silver, D., Szepesv{\'a}ri, C., and Wiewiora, E.
\newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.
\newblock In \emph{Proceedings of the 26th International Conference on Machine Learning}, pp.\ 993--1000, 2009.
\bibitem[Sutton(1988)]{sutton1988learning}
Sutton, R.~S.
\newblock Learning to predict by the methods of temporal differences.
\newblock \emph{Machine Learning}, 3\penalty0 (1):\penalty0 9--44, 1988.

\bibitem[Sutton \& Barto(2018)Sutton and Barto]{Sutton2018book}
Sutton, R.~S. and Barto, A.~G.
\newblock \emph{Reinforcement Learning: An Introduction}.
\newblock The MIT Press, second edition, 2018.

\bibitem[Sutton et~al.(2008)Sutton, Maei, and Szepesv{\'a}ri]{sutton2008convergent}
Sutton, R.~S., Maei, H.~R., and Szepesv{\'a}ri, C.
\newblock A convergent $O(n)$ temporal-difference algorithm for off-policy learning with linear function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 1609--1616. MIT Press, 2008.

\bibitem[Sutton et~al.(2016)Sutton, Mahmood, and White]{sutton2016emphatic}
Sutton, R.~S., Mahmood, A.~R., and White, M.
\newblock An emphatic approach to the problem of off-policy temporal-difference learning.
\newblock \emph{The Journal of Machine Learning Research}, 17\penalty0 (1):\penalty0 2603--2631, 2016.

\bibitem[Tsitsiklis \& Van~Roy(1997)Tsitsiklis and Van~Roy]{tsitsiklis1997analysis}
Tsitsiklis, J.~N. and Van~Roy, B.
\newblock Analysis of temporal-difference learning with function approximation.
\newblock In \emph{Advances in Neural Information Processing Systems}, pp.\ 1075--1081, 1997.

\bibitem[Xu et~al.(2019)Xu, Wang, Zhou, and Liang]{xu2019reanalysis}
Xu, T., Wang, Z., Zhou, Y., and Liang, Y.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock In \emph{International Conference on Learning Representations}, 2019.

\bibitem[Xu et~al.(2020)Xu, Wang, Zhou, and Liang]{xu2020reanalysis}
Xu, T., Wang, Z., Zhou, Y., and Liang, Y.
\newblock Reanalysis of variance reduced temporal difference learning.
\newblock \emph{arXiv preprint arXiv:2001.01898}, 2020.

\bibitem[Zhang \& Whiteson(2022)Zhang and Whiteson]{zhang2022truncated}
Zhang, S. and Whiteson, S.
\newblock Truncated emphatic temporal difference methods for prediction and control.
\newblock \emph{The Journal of Machine Learning Research}, 23\penalty0 (1):\penalty0 6859--6917, 2022.

\bibitem[Zhou(2021)]{zhou2021machine}
Zhou, Z.-H.
\newblock \emph{Machine Learning}.
\newblock Springer Nature, 2021.

\end{thebibliography}