@inproceedings{langley00,
  author    = {P. Langley},
  title     = {Crafting Papers on Machine Learning},
  booktitle = {Proceedings of the 17th International Conference on Machine Learning (ICML 2000)},
  editor    = {Pat Langley},
  pages     = {1207--1216},
  publisher = {Morgan Kaufmann},
  address   = {Stanford, CA},
  year      = {2000},
}

@techreport{mitchell80,
  author      = {T. M. Mitchell},
  title       = {The Need for Biases in Learning Generalizations},
  institution = {Computer Science Department, Rutgers University},
  address     = {New Brunswick, NJ},
  year        = {1980},
}

@phdthesis{kearns89,
  author = {M. J. Kearns},
  title  = {Computational Complexity of Machine Learning},
  school = {Department of Computer Science, Harvard University},
  year   = {1989},
}

@book{MachineLearningI,
  editor    = {R. S. Michalski and J. G. Carbonell and T. M. Mitchell},
  title     = {Machine Learning: An Artificial Intelligence Approach, Vol. I},
  publisher = {Tioga},
  address   = {Palo Alto, CA},
  year      = {1983},
}

@book{DudaHart2nd,
  author    = {R. O. Duda and P. E. Hart and D. G. Stork},
  title     = {Pattern Classification},
  edition   = {Second},
  publisher = {John Wiley and Sons},
  year      = {2000},
}

@misc{anonymous,
  author = {Author, N. N.},
  title  = {Suppressed for Anonymity},
  year   = {2021},
}

@incollection{Newell81,
  author    = {A. Newell and P. S. Rosenbloom},
  title     = {Mechanisms of Skill Acquisition and the Law of Practice},
  booktitle = {Cognitive Skills and Their Acquisition},
  editor    = {J. R. Anderson},
  chapter   = {1},
  pages     = {1--51},
  publisher = {Lawrence Erlbaum Associates, Inc.},
  address   = {Hillsdale, NJ},
  year      = {1981},
}


@article{Samuel59,
  author  = {A. L. Samuel},
  title   = {Some Studies in Machine Learning Using the Game of Checkers},
  journal = {IBM Journal of Research and Development},
  volume  = {3},
  number  = {3},
  pages   = {211--229},
  year    = {1959},
}
@comment{Repeated definition of key langley00 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key mitchell80 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key kearns89 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key MachineLearningI disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key DudaHart2nd disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key anonymous disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@comment{Repeated definition of key Newell81 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}


@comment{Repeated definition of key Samuel59 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}

@book{em:86,
  editor    = {Engelmore, Robert and Morgan, Anthony},
  title     = {Blackboard Systems},
  publisher = {Addison-Wesley},
  address   = {Reading, Mass.},
  year      = {1986},
}
@inproceedings{dalal2018finite,
  author    = {Dalal, Gal and Szorenyi, Balazs and Thoppe, Gugan and Mannor, Shie},
  title     = {Finite sample analyses for {TD(0)} with function approximation},
  booktitle = {Proceedings of the Thirty-Second {AAAI} Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth {AAAI} Symposium on Educational Advances in Artificial Intelligence},
  pages     = {6144--6160},
  year      = {2018},
}
@inproceedings{xu2019reanalysis,
  author    = {Xu, Tengyu and Wang, Zhe and Zhou, Yi and Liang, Yingbin},
  title     = {Reanalysis of Variance Reduced Temporal Difference Learning},
  booktitle = {International Conference on Learning Representations},
  year      = {2019},
}
@inproceedings{c:83,
  author    = {Clancey, William J.},
  title     = {{Communication, Simulation, and Intelligent Agents: Implications of Personal Intelligent Machines for Medical Education}},
  booktitle = {Proceedings of the Eighth International Joint Conference on Artificial Intelligence {(IJCAI-83)}},
  pages     = {556--560},
  publisher = {{IJCAI Organization}},
  address   = {Menlo Park, Calif.},
  year      = {1983},
}
@inproceedings{c:84,
  author    = {Clancey, William J.},
  title     = {{Classification Problem Solving}},
  booktitle = {Proceedings of the Fourth National Conference on Artificial Intelligence},
  pages     = {45--54},
  publisher = {AAAI Press},
  address   = {Menlo Park, Calif.},
  year      = {1984},
}
@article{r:80,
  author    = {Robinson, Arthur L.},
  title     = {New Ways to Make Microcircuits Smaller},
  journal   = {Science},
  volume    = {208},
  number    = {4447},
  pages     = {1019--1022},
  year      = {1980},
  publisher = {American Association for the Advancement of Science},
  issn      = {0036-8075},
  doi       = {10.1126/science.208.4447.1019},
  url       = {https://science.sciencemag.org/content/208/4447/1019},
  eprint    = {https://science.sciencemag.org/content/208/4447/1019.full.pdf},
}
@article{r:80x,
  author  = {Robinson, Arthur L.},
  title   = {{New Ways to Make Microcircuits Smaller---Duplicate Entry}},
  journal = {Science},
  volume  = {208},
  pages   = {1019--1026},
  year    = {1980},
}
@article{hcr:83,
  author   = {Diane Warner Hasling and William J. Clancey and Glenn Rennels},
  title    = {Strategic explanations for a diagnostic consultation system},
  journal  = {International Journal of Man-Machine Studies},
  volume   = {20},
  number   = {1},
  pages    = {3--19},
  year     = {1984},
  issn     = {0020-7373},
  doi      = {10.1016/S0020-7373(84)80003-6},
  url      = {https://www.sciencedirect.com/science/article/pii/S0020737384800036},
  abstract = {This article examines the problem of automatic explanation of reasoning, especially as it relates to expert systems. By explanation we mean the ability of a program to discuss what it is doing in some understandable way. We first present a general framework in which to view explanation and review some of the research done in this area. We then focus on the explanation system for NEOMYCIN, a medical consultation program. A consultation program interactively helps a user to solve a problem. Our goal is to have NEOMYCIN explain its problem-solving strategies. An explanation of strategy describes the plan the program is using to reach a solution. Such an explanation is usually concrete, referring to aspects of the current problem situation. Abstract explanations articulate a general principle, which can be applied in different situations; such explanations are useful in teaching and in explaining by analogy. We describe the aspects of NEOMYCIN that make abstract strategic explanations possible---the representation of strategic knowledge explicitly and separately from domain knowledge---and demonstrate how this representation can be used to generate explanations.},
}
@article{hcrt:83,
  author  = {Hasling, Diane Warner and Clancey, William J. and Rennels, Glenn R. and Test, Thomas},
  title   = {{Strategic Explanations in Consultation---Duplicate}},
  journal = {The International Journal of Man-Machine Studies},
  volume  = {20},
  number  = {1},
  pages   = {3--19},
  year    = {1983},
}
@techreport{r:86,
  author      = {Rice, James},
  title       = {{Poligon: A System for Parallel Problem Solving}},
  institution = {Dept.\ of Computer Science, Stanford Univ.},
  type        = {Technical Report},
  number      = {KSL-86-19},
  year        = {1986},
}
@phdthesis{c:79,
  author  = {Clancey, William J.},
  title   = {{Transfer of Rule-Based Expertise through a Tutorial Dialogue}},
  school  = {Dept.\ of Computer Science, Stanford Univ.},
  type    = {{Ph.D.} diss.},
  address = {Stanford, Calif.},
  year    = {1979},
}
@unpublished{c:21,
  author = {Clancey, William J.},
  title  = {{The Engineering of Qualitative Models}},
  note   = {Forthcoming},
  year   = {2021},
}
@misc{c:22,
  author        = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  title         = {Attention Is All You Need},
  year          = {2017},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@misc{c:23,
  author       = {{NASA}},
  title        = {Pluto: The 'Other' Red Planet},
  howpublished = {\url{https://www.nasa.gov/nh/pluto-the-other-red-planet}},
  note         = {Accessed: 2018-12-06},
  year         = {2015},
}
@comment{Repeated definition of key r:80x disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}
@comment{Repeated definition of key hcrt:83 disabled: the entry is already defined earlier in this file, and a second definition triggers a BibTeX repeated-entry error.}
@article{xu2013online,
  author    = {Xu, Xin and Hou, Zhongsheng and Lian, Chuanqiang and He, Haibo},
  title     = {Online learning control using adaptive critic designs with sparse kernel machines},
  journal   = {IEEE Trans. Neural Netw. Learn. Syst.},
  volume    = {24},
  number    = {5},
  pages     = {762--775},
  publisher = {IEEE},
  year      = {2013},
}
@article{bertsekas2017value,
  author    = {Bertsekas, Dimitri P},
  title     = {Value and policy iterations in optimal control and adaptive dynamic programming},
  journal   = {IEEE Trans. Neural Netw. Learn. Syst.},
  volume    = {28},
  number    = {3},
  pages     = {500--509},
  publisher = {IEEE},
  year      = {2017},
}
@phdthesis{hackman2012faster,
  author = {Hackman, Leah},
  title  = {Faster {Gradient-TD} Algorithms},
  school = {University of Alberta},
  year   = {2012},
}
@inproceedings{harutyunyan2015multi,
  author       = {Harutyunyan, Anna and Brys, Tim and Vrancx, Peter and Now{\'e}, Ann},
  title        = {Multi-scale reward shaping via an off-policy ensemble},
  booktitle    = {Proc. 2015 Int. Conf. Autonomous Agents and Multiagent Systems},
  pages        = {1641--1642},
  organization = {International Foundation for Autonomous Agents and Multiagent Systems},
  year         = {2015},
}
@inproceedings{harutyunyan2015expressing,
  author    = {Harutyunyan, Anna and Devlin, Sam and Vrancx, Peter and Now{\'e}, Ann},
  title     = {Expressing Arbitrary Reward Functions as Potential-Based Advice},
  booktitle = {AAAI},
  pages     = {2652--2658},
  year      = {2015},
}
@article{wiewiora2003potential,
  author  = {Wiewiora, Eric},
  title   = {Potential-based shaping and {Q-value} initialization are equivalent},
  journal = {J. Artif. Intell. Res.},
  volume  = {19},
  pages   = {205--208},
  year    = {2003},
}
@article{grzes2010online,
  author    = {Grze{\'s}, Marek and Kudenko, Daniel},
  title     = {Online learning of shaping rewards in reinforcement learning},
  journal   = {Neural Netw.},
  volume    = {23},
  number    = {4},
  pages     = {541--550},
  publisher = {Elsevier},
  year      = {2010},
}
@inproceedings{marthi2007automatic,
  author    = {Marthi, Bhaskara},
  title     = {Automatic shaping and decomposition of reward functions},
  booktitle = {Proc. 24th Int. Conf. Mach. Learn.},
  pages     = {601--608},
  year      = {2007},
}
@inproceedings{laud2003influence,
  author    = {Laud, Adam and Dejong, Gerald},
  title     = {The Influence of Reward on the Speed of Reinforcement Learning: An Analysis of Shaping},
  booktitle = {Proc. 20th Int. Conf. Mach. Learn.},
  pages     = {440--447},
  year      = {2003},
}
@phdthesis{laud2004theory,
  author = {Laud, Adam Daniel},
  title  = {Theory and application of reward shaping in reinforcement learning},
  school = {University of Illinois at Urbana-Champaign},
  year   = {2004},
}
@article{geist2013algorithmic,
  author    = {Geist, Matthieu and Pietquin, Olivier},
  title     = {Algorithmic survey of parametric value function approximation},
  journal   = {IEEE Trans. Neural Netw. Learn. Syst.},
  volume    = {24},
  number    = {6},
  pages     = {845--867},
  publisher = {IEEE},
  year      = {2013},
}
@article{furmston2016approximate,
  author  = {Furmston, Thomas and Lever, Guy and Barber, David},
  title   = {Approximate {Newton} Methods for Policy Search in {Markov} Decision Processes},
  journal = {J. Mach. Learn. Res.},
  volume  = {17},
  number  = {227},
  pages   = {1--51},
  year    = {2016},
}
@article{silver2016mastering,
  author    = {Silver, David and Huang, Aja and Maddison, Chris J and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
  title     = {Mastering the game of {Go} with deep neural networks and tree search},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  publisher = {Nature Publishing Group},
  year      = {2016},
}

@article{mnih2015human,
  author    = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and others},
  title     = {Human-level control through deep reinforcement learning},
  journal   = {Nature},
  volume    = {518},
  number    = {7540},
  pages     = {529--533},
  publisher = {Nature Publishing Group},
  year      = {2015},
}
@inproceedings{guo2014deep,
  author    = {Guo, Xiaoxiao and Singh, Satinder and Lee, Honglak and Lewis, Richard L and Wang, Xiaoshi},
  title     = {Deep learning for real-time {Atari} game play using offline {Monte-Carlo} tree search planning},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {3338--3346},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2014},
}
@inproceedings{scherrer2010should,
  author    = {Scherrer, Bruno},
  title     = {Should one compute the Temporal Difference fix point or minimize the {Bellman} Residual? The unified oblique projection view},
  booktitle = {Proc. 27th Int. Conf. Mach. Learn.},
  pages     = {959--966},
  year      = {2010},
}
@article{hirsch1989convergent,
  author    = {Hirsch, Morris W},
  title     = {Convergent activation dynamics in continuous time networks},
  journal   = {Neural Netw.},
  volume    = {2},
  number    = {5},
  pages     = {331--349},
  publisher = {Elsevier},
  year      = {1989},
}
@article{borkar1997stochastic,
  author    = {Borkar, Vivek S},
  title     = {Stochastic approximation with two time scales},
  journal   = {Syst. \& Control Letters},
  volume    = {29},
  number    = {5},
  pages     = {291--294},
  publisher = {Elsevier},
  year      = {1997},
}
@article{ortner2013adaptive,
  author    = {Ortner, Ronald},
  title     = {Adaptive aggregation for reinforcement learning in average reward {Markov} decision processes},
  journal   = {Annals Oper. Res.},
  volume    = {208},
  number    = {1},
  pages     = {321--336},
  publisher = {Springer},
  year      = {2013},
}
@article{jaksch2010near,
  author  = {Jaksch, Thomas and Ortner, Ronald and Auer, Peter},
  title   = {Near-optimal regret bounds for reinforcement learning},
  journal = {Journal of Machine Learning Research},
  volume  = {11},
  pages   = {1563--1600},
  month   = apr,
  year    = {2010},
}
@inproceedings{ortner2007logarithmic,
  author    = {Auer, Peter and Ortner, Ronald},
  title     = {Logarithmic online regret bounds for undiscounted reinforcement learning},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {19},
  pages     = {49},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2007},
}
@article{das1999solving,
  author    = {Das, Tapas K and Gosavi, Abhijit and Mahadevan, Sridhar and Marchalleck, Nicholas},
  title     = {Solving {semi-Markov} decision problems using average reward reinforcement learning},
  journal   = {Management Science},
  volume    = {45},
  number    = {4},
  pages     = {560--574},
  publisher = {INFORMS},
  year      = {1999},
}
@article{abounadi2001learning,
  author    = {Abounadi, Jinane and Bertsekas, D and Borkar, Vivek S},
  title     = {Learning algorithms for {Markov} decision processes with average cost},
  journal   = {SIAM J. Control Optim.},
  volume    = {40},
  number    = {3},
  pages     = {681--698},
  publisher = {SIAM},
  year      = {2001},
}
@inproceedings{singh1994reinforcement,
  author    = {Singh, Satinder P},
  title     = {Reinforcement learning algorithms for average-payoff {Markovian} decision processes},
  booktitle = {AAAI},
  pages     = {700--705},
  year      = {1994},
}
@inproceedings{schwartz1993reinforcement,
  author    = {Schwartz, Anton},
  title     = {A reinforcement learning method for maximizing undiscounted rewards},
  booktitle = {Proc. 10th Int. Conf. Mach. Learn.},
  pages     = {298--305},
  year      = {1993},
}

@inproceedings{yang2016efficient,
  author    = {Yang, Shangdong and Gao, Yang and An, Bo and Wang, Hao and Chen, Xingguo},
  title     = {Efficient Average Reward Reinforcement Learning Using Constant Shifting Values},
  booktitle = {Thirtieth {AAAI} Conference on Artificial Intelligence},
  pages     = {2258--2264},
  year      = {2016},
}
@inproceedings{devlin2012dynamic,
  author    = {Devlin, Sam and Kudenko, Daniel},
  title     = {Dynamic potential-based reward shaping},
  booktitle = {Proc. 11th Int. Conf. Autonomous Agents and Multiagent Systems},
  pages     = {433--440},
  year      = {2012},
}

@inproceedings{ng1999policy,
  author    = {Ng, Andrew Y and Harada, Daishi and Russell, Stuart},
  title     = {Policy invariance under reward transformations: Theory and application to reward shaping},
  booktitle = {Proc. 16th Int. Conf. Mach. Learn.},
  pages     = {278--287},
  year      = {1999},
}
@article{borkar2000ode,
  author    = {Borkar, Vivek S and Meyn, Sean P},
  title     = {The {ODE} method for convergence of stochastic approximation and reinforcement learning},
  journal   = {SIAM J. Control Optim.},
  volume    = {38},
  number    = {2},
  pages     = {447--469},
  publisher = {SIAM},
  year      = {2000},
}
@phdthesis{maei2011gradient,
  author = {Maei, Hamid Reza},
  title  = {Gradient temporal-difference learning algorithms},
  school = {University of Alberta},
  year   = {2011},
}
@phdthesis{baird1999reinforcement,
  author = {Baird, III, Leemon C},
  title  = {Reinforcement learning through gradient descent},
  school = {US Air Force Academy, US},
  year   = {1999},
}
@phdthesis{Driessens2004,
  author = {Kurt Driessens},
  title  = {Relational Reinforcement Learning},
  school = {Catholic University of Leuven},
  year   = {2004},
}
@article{tsitsiklis1996feature,
  author    = {Tsitsiklis, John N and Van Roy, Benjamin},
  title     = {Feature-based methods for large scale dynamic programming},
  journal   = {Mach. Learn.},
  volume    = {22},
  number    = {1--3},
  pages     = {59--94},
  publisher = {Springer},
  year      = {1996},
}
@inproceedings{chen2009apply,
  author       = {Chen, X. and Wang, H. and Wang, W. and Shi, Y. and Gao, Y.},
  title        = {Apply ant colony optimization to {Tetris}},
  booktitle    = {Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation ({GECCO})},
  pages        = {1741--1742},
  organization = {ACM},
  year         = {2009},
}
@incollection{farias2006tetris,
  author    = {Farias, Vivek F and Van Roy, Benjamin},
  title     = {Tetris: A study of randomized constraint sampling},
  booktitle = {Probabilistic and Randomized Methods for Design Under Uncertainty},
  pages     = {189--201},
  publisher = {Springer},
  year      = {2006},
}
@techreport{bertsekas1996temporal,
  author      = {Bertsekas, Dimitri P and Ioffe, Sergey},
  title       = {Temporal differences-based policy iteration and applications in neuro-dynamic programming},
  institution = {Laboratory for Information and Decision Systems, MIT},
  number      = {LIDS-P-2349},
  address     = {Cambridge, MA},
  year        = {1996},
}
@inproceedings{kakade2001natural,
  author    = {Kakade, Sham},
  title     = {A Natural Policy Gradient},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {14},
  pages     = {1531--1538},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2001},
}
@article{peters2008natural,
  author    = {Peters, Jan and Schaal, Stefan},
  title     = {Natural actor-critic},
  journal   = {Neurocomputing},
  volume    = {71},
  number    = {7},
  pages     = {1180--1190},
  publisher = {Elsevier},
  year      = {2008},
}
@article{baxter2001infinite,
  author  = {Baxter, Jonathan and Bartlett, Peter L.},
  title   = {Infinite-horizon policy-gradient estimation},
  journal = {J. Artif. Intell. Res.},
  volume  = {15},
  pages   = {319--350},
  year    = {2001},
}
@inproceedings{sutton1999policy,
  author    = {Sutton, Richard S and McAllester, David A and Singh, Satinder P and Mansour, Yishay},
  title     = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1057--1063},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {1999},
}
@inproceedings{bohm2005evolutionary,
  author    = {B{\"o}hm, Niko and K{\'o}kai, Gabriella and Mandl, Stefan},
  title     = {An evolutionary approach to {Tetris}},
  booktitle = {Proc. 6th Metaheuristics Int. Conf.},
  pages     = {137--148},
  year      = {2005},
}
@article{szita2006learning,
  author    = {Szita, Istv{\'a}n and L{\"o}rincz, Andr{\'a}s},
  title     = {Learning {Tetris} using the noisy cross-entropy method},
  journal   = {Neural Comput.},
  volume    = {18},
  number    = {12},
  pages     = {2936--2941},
  publisher = {MIT Press},
  year      = {2006},
}
@inproceedings{thiery2010least,
  author    = {Thiery, Christophe and Scherrer, Bruno},
  title     = {Least-Squares {$\lambda$} Policy Iteration: Bias-Variance Trade-off in Control Problems},
  booktitle = {Proc. 27th Int. Conf. Mach. Learn.},
  pages     = {1071--1078},
  year      = {2010},
}

@inproceedings{gabillon2013approximate,
  author    = {Gabillon, Victor and Ghavamzadeh, Mohammad and Scherrer, Bruno},
  title     = {Approximate dynamic programming finally performs well in the game of {Tetris}},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1754--1762},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2013},
}
@article{scherrer2013performance,
  author    = {Scherrer, Bruno},
  title     = {Performance bounds for {$\lambda$} policy iteration and application to the game of {Tetris}},
  journal   = {J. Mach. Learn. Res.},
  volume    = {14},
  number    = {1},
  pages     = {1181--1227},
  publisher = {JMLR.org},
  year      = {2013},
}
@article{thiery2009improvements,
  author  = {Thiery, Christophe and Scherrer, Bruno},
  title   = {Improvements on Learning {Tetris} with Cross Entropy},
  journal = {Int. Computer Games Assoc. J.},
  volume  = {32},
  number  = {1},
  pages   = {23--33},
  year    = {2009},
}
@article{scherrer2015approximate,
  author  = {Scherrer, Bruno and Ghavamzadeh, Mohammad and Gabillon, Victor and Lesner, Boris and Geist, Matthieu},
  title   = {Approximate Modified Policy Iteration and its Application to the Game of {Tetris}},
  journal = {J. Mach. Learn. Res.},
  volume  = {16},
  pages   = {1629--1676},
  year    = {2015},
}

@article{efron2004least,
  author    = {Efron, Bradley and Hastie, Trevor and Johnstone, Iain and Tibshirani, Robert},
  title     = {Least angle regression},
  journal   = {The Annals of Statistics},
  volume    = {32},
  number    = {2},
  pages     = {407--499},
  publisher = {Institute of Mathematical Statistics},
  year      = {2004},
}
@mastersthesis{Brzustowski1992,
  author = {John Brzustowski},
  title  = {Can you win at {Tetris}?},
  school = {University of British Columbia},
  year   = {1992},
}
@article{Breukelaar04,
  author  = {Ron Breukelaar and Erik D. Demaine and Susan Hohenberger and Hendrik Jan Hoogeboom and Walter A. Kosters and David Liben-Nowell},
  title   = {{Tetris} is Hard, Even to Approximate},
  journal = {International Journal of Computational Geometry and Applications},
  volume  = {14},
  number  = {1--2},
  pages   = {41--68},
  month   = apr,
  year    = {2004},
}
@book{Bertsekas1996,
  author    = {Bertsekas, D. and Tsitsiklis, J. N.},
  title     = {Neuro-Dynamic Programming},
  publisher = {Athena Scientific},
  year      = {1996},
}
@inproceedings{maei2010gq,
  author    = {Maei, Hamid Reza and Sutton, Richard S},
  title     = {{GQ($\lambda$)}: A general gradient algorithm for temporal-difference prediction learning with eligibility traces},
  booktitle = {Proceedings of the Third Conference on Artificial General Intelligence},
  volume    = {1},
  pages     = {91--96},
  year      = {2010},
}
@inproceedings{maei2010toward,
  author    = {Maei, Hamid R and Szepesv{\'a}ri, Csaba and Bhatnagar, Shalabh and Sutton, Richard S},
  title     = {Toward off-policy learning control with function approximation},
  booktitle = {Proc. 27th Int. Conf. Mach. Learn.},
  pages     = {719--726},
  year      = {2010},
}
@inproceedings{phua2007tracking,
  author       = {Phua, Chee Wee and Fitch, Robert},
  title        = {Tracking value function dynamics to improve reinforcement learning with piecewise linear function approximation},
  booktitle    = {Proc. 24th Int. Conf. Mach. Learn.},
  pages        = {751--758},
  organization = {ACM},
  year         = {2007},
}
@inproceedings{szubert2014temporal,
  author       = {Szubert, Marcin and Jaskowski, Wojciech},
  title        = {Temporal difference learning of N-tuple networks for the game 2048},
  booktitle    = {2014 IEEE Conference on Computational Intelligence and Games (CIG)},
  pages        = {1--8},
  organization = {IEEE},
  year         = {2014},
}
@article{chen2013online,
  author    = {Chen, Xingguo and Gao, Yang and Wang, Ruili},
  title     = {Online Selective Kernel-based Temporal Difference Learning},
  journal   = {IEEE Trans. Neural Netw. Learn. Syst.},
  volume    = {24},
  number    = {12},
  pages     = {1944--1956},
  publisher = {IEEE},
  year      = {2013},
}

@article{xu2007kernel,
  author    = {Xu, Xin and Hu, Dewen and Lu, Xicheng},
  title     = {Kernel-based least squares policy iteration for reinforcement learning},
  journal   = {IEEE Trans. Neural Netw.},
  volume    = {18},
  number    = {4},
  pages     = {973--992},
  publisher = {IEEE},
  year      = {2007},
}
@inproceedings{Engel03bayesmeets,
  author    = {Yaakov Engel and Shie Mannor and Ron Meir},
  title     = {Bayes meets {Bellman}: the {Gaussian} process approach to temporal difference learning},
  booktitle = {Proc. 20th Int. Conf. Mach. Learn.},
  pages     = {154--161},
  address   = {Washington, DC},
  month     = aug,
  year      = {2003},
}
@inproceedings{robards2011sparse,
  author    = {Robards, M. and Sunehag, P. and Sanner, S. and Marthi, B.},
  title     = {Sparse {Kernel-SARSA($\lambda$)} with an eligibility trace},
  booktitle = {Proc. 22nd Eur. Conf. Mach. Learn.},
  pages     = {1--17},
  address   = {Athens, Greece},
  month     = sep,
  year      = {2011},
}
@inproceedings{reisinger2008online,
  author    = {Reisinger, J. and Stone, P. and Miikkulainen, R.},
  title     = {Online kernel selection for {Bayesian} reinforcement learning},
  booktitle = {Proc. 25th Int. Conf. Mach. Learn.},
  pages     = {816--823},
  address   = {Helsinki, Finland},
  month     = jul,
  year      = {2008},
}
@book{Sutton1998,
  author    = {Sutton, R. S. and Barto, A. G.},
  title     = {Reinforcement Learning: An Introduction},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {1998},
}
@book{Sutton2018book,
  author    = {Sutton, Richard S. and Barto, Andrew G.},
  title     = {Reinforcement Learning: An Introduction},
  edition   = {Second},
  publisher = {The MIT Press},
  year      = {2018},
}
@phdthesis{Bradtke1994phd,
  author  = {Bradtke, Steven J},
  title   = {Incremental Dynamic Programming for On-line Adaptive Optimal Control},
  school  = {University of Massachusetts},
  address = {Amherst},
  month   = sep,
  year    = {1994},
}
@inproceedings{baird1995residual,
  author    = {Baird, Leemon},
  title     = {Residual algorithms: Reinforcement learning with function approximation},
  booktitle = {Proc. 12th Int. Conf. Mach. Learn.},
  pages     = {30--37},
  year      = {1995},
}
@article{bradtke1996linear,
  author    = {Bradtke, S. J. and Barto, A. G.},
  title     = {Linear least-squares algorithms for temporal difference learning},
  journal   = {Mach. Learn.},
  volume    = {22},
  number    = {1},
  pages     = {33--57},
  publisher = {Springer},
  year      = {1996},
}
@article{lagoudakis2003least,
  author    = {Lagoudakis, M. G. and Parr, R.},
  title     = {Least-squares policy iteration},
  journal   = {J. Mach. Learn. Res.},
  volume    = {4},
  pages     = {1107--1149},
  publisher = {JMLR.org},
  year      = {2003},
}
@article{boyan2002technical,
  author    = {Boyan, J. A.},
  title     = {Technical update: Least-squares temporal difference learning},
  journal   = {Mach. Learn.},
  volume    = {49},
  number    = {2},
  pages     = {233--246},
  publisher = {Springer},
  year      = {2002},
}
@inproceedings{geramifard2006incremental,
  author    = {Geramifard, A. and Bowling, M. and Sutton, R. S.},
  title     = {Incremental least-squares temporal difference learning},
  booktitle = {Proc. 21st AAAI Conf. Artif. Intell.},
  pages     = {356--361},
  address   = {Boston, Massachusetts},
  month     = jul,
  year      = {2006},
}
@inproceedings{sutton2009fast,
  author    = {Sutton, R. S. and Maei, H. R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, C. and Wiewiora, E.},
  title     = {Fast gradient-descent methods for temporal-difference learning with linear function approximation},
  booktitle = {Proc. 26th Int. Conf. Mach. Learn.},
  pages     = {993--1000},
  year      = {2009},
}
@inproceedings{sutton2008convergent,
  author    = {Sutton, Richard S and Maei, Hamid R and Szepesv{\'a}ri, Csaba},
  title     = {A Convergent {$O(n)$} Temporal-difference Algorithm for Off-policy Learning with Linear Function Approximation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1609--1616},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2008},
}
@inproceedings{dabney2014natural,
  author    = {Dabney, William and Thomas, Philip},
  title     = {Natural Temporal Difference Learning},
  booktitle = {Twenty-Eighth AAAI Conference on Artificial Intelligence},
  year      = {2014},
}
@inproceedings{mahmood2014weighted,
  author    = {Mahmood, A Rupam and van Hasselt, Hado P and Sutton, Richard S},
  title     = {Weighted importance sampling for off-policy learning with linear function approximation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {3014--3022},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  year      = {2014},
}
@inproceedings{seijen2014true,
  author    = {van Seijen, Harm and Sutton, Rich},
  title     = {True Online {TD($\lambda$)}},
  booktitle = {Proc. 31st Int. Conf. Mach. Learn.},
  pages     = {692--700},
  year      = {2014},
}
@article{ormoneit2002kernel,
  author    = {Ormoneit, D. and Sen, {\'S}.},
  title     = {Kernel-based reinforcement learning},
  journal   = {Mach. Learn.},
  volume    = {49},
  number    = {2--3},
  pages     = {161--178},
  publisher = {Springer-Verlag},
  address   = {Hingham, MA, USA},
  issn      = {0885-6125},
  year      = {2002},
}
@inproceedings{Ghavamzadeh2010lstd,
  author    = {M. Ghavamzadeh and A. Lazaric and O. A. Maillard and R. Munos},
  title     = {{LSTD} with Random Projections},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {23},
  pages     = {721--729},
  publisher = {Cambridge, MA: MIT Press},
  address   = {Lake Tahoe, Nevada, USA},
  year      = {2010},
}
@inproceedings{loth2007sparse,
  author       = {Loth, M. and Davy, M. and Preux, P.},
  title        = {Sparse temporal difference learning using {LASSO}},
  booktitle    = {Proc. IEEE Symp. Approx. Dynamic Program. Reinforce. Learn.},
  pages        = {352--359},
  organization = {IEEE},
  year         = {2007},
}
@inproceedings{kolter2009regularization,
  author       = {Kolter, J. Z. and Ng, A. Y.},
  title        = {Regularization and feature selection in least-squares temporal difference learning},
  booktitle    = {Proc. 26th Int. Conf. Mach. Learn.},
  pages        = {521--528},
  organization = {ACM},
  year         = {2009},
}
@inproceedings{hoffman2011regularized,
  author    = {Hoffman, M. W. and Lazaric, A. and Ghavamzadeh, M. and Munos, R.},
  title     = {Regularized least squares temporal difference learning with nested l2 and l1 penalization},
  booktitle = {Proc. Eur. Workshop Reinforce. Learn.},
  year      = {2011},
}
@inproceedings{Ghavamzadeh2011finite,
  author    = {Ghavamzadeh, M. and Lazaric, A. and Munos, R. and Hoffman, M.},
  title     = {Finite-Sample Analysis of {Lasso-TD}},
  booktitle = {Proc. 28th Int. Conf. Mach. Learn.},
  pages     = {1177--1184},
  address   = {Bellevue, Washington, USA},
  month     = jun,
  year      = {2011}
}
@inproceedings{johnson2013accelerating,
  author    = {Johnson, R. and Zhang, T.},
  title     = {Accelerating stochastic gradient descent using predictive variance reduction},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2013},
  pages     = {315--323}
}
@article{xu2020reanalysis,
  author  = {Xu, T. and Wang, Z. and Zhou, Y. and Liang, Y.},
  title   = {Reanalysis of variance reduced temporal difference learning},
  year    = {2020},
  journal = {arXiv preprint arXiv:2001.01898}
}
@inproceedings{schulman2015trust,
  author    = {Schulman, J. and Levine, S. and Abbeel, P. and Jordan, M. and Moritz, P.},
  title     = {Trust region policy optimization},
  booktitle = {International Conference on Machine Learning},
  year      = {2015},
  pages     = {1889--1897}
}
@article{schulman2017proximal,
  author  = {Schulman, J. and Wolski, F. and Dhariwal, P. and Radford, A. and Klimov, O.},
  title   = {Proximal policy optimization algorithms},
  year    = {2017},
  journal = {arXiv preprint arXiv:1707.06347}
}
@inproceedings{defazio2014saga,
  author    = {Defazio, A. and Bach, F. and Lacoste-Julien, S.},
  title     = {{SAGA}: A fast incremental gradient method with support for non-strongly convex composite objectives},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1646--1654},
  year      = {2014}
}
@inproceedings{du2017stochastic,
  author    = {Du, S. S. and Chen, J. and Li, L. and Xiao, L. and Zhou, D.},
  title     = {Stochastic variance reduction methods for policy evaluation},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  year      = {2017},
  pages     = {1049--1058}
}
@inproceedings{chen2023modified,
  author       = {Chen, Xingguo and Ma, Xingzhou and Li, Yang and Yang, Guang and Yang, Shangdong and Gao, Yang},
  title        = {Modified {Retrace} for Off-Policy Temporal Difference Learning},
  booktitle    = {Uncertainty in Artificial Intelligence},
  pages        = {303--312},
  year         = {2023},
  organization = {PMLR}
}
@article{dalal2017finite,
  author  = {Dalal, Gal and Sz{\"o}r{\'e}nyi, Bal{\'a}zs and Thoppe, Gugan and Mannor, Shie},
  title   = {Finite Sample Analyses for {TD(0)} with Function Approximation},
  journal = {arXiv preprint arXiv:1704.01161},
  year    = {2017}
}
@article{sutton1988learning,
  author    = {Sutton, Richard S.},
  title     = {Learning to predict by the methods of temporal differences},
  journal   = {Machine Learning},
  volume    = {3},
  number    = {1},
  pages     = {9--44},
  year      = {1988},
  publisher = {Springer}
}
@inproceedings{tsitsiklis1997analysis,
  author    = {Tsitsiklis, John N. and Van Roy, Benjamin},
  title     = {Analysis of temporal-difference learning with function approximation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1075--1081},
  year      = {1997}
}
@article{sutton2016emphatic,
  author    = {Sutton, Richard S. and Mahmood, A. Rupam and White, Martha},
  title     = {An emphatic approach to the problem of off-policy temporal-difference learning},
  journal   = {The Journal of Machine Learning Research},
  volume    = {17},
  number    = {1},
  pages     = {2603--2631},
  year      = {2016},
  publisher = {JMLR.org}
}
@inproceedings{liu2015finite,
  author    = {Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek},
  title     = {Finite-sample analysis of proximal gradient {TD} algorithms},
  booktitle = {Proceedings of the 31st Conference on Uncertainty in Artificial Intelligence},
  pages     = {504--513},
  year      = {2015}
}
@inproceedings{liu2016proximal,
  author    = {Liu, Bo and Liu, Ji and Ghavamzadeh, Mohammad and Mahadevan, Sridhar and Petrik, Marek},
  title     = {Proximal Gradient Temporal Difference Learning Algorithms},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},
  pages     = {4195--4199},
  year      = {2016}
}
@article{liu2018proximal,
  author  = {Liu, Bo and Gemp, Ian and Ghavamzadeh, Mohammad and Liu, Ji and Mahadevan, Sridhar and Petrik, Marek},
  title   = {Proximal gradient temporal difference learning: Stable reinforcement learning with polynomial sample complexity},
  journal = {Journal of Artificial Intelligence Research},
  year    = {2018},
  volume  = {63},
  pages   = {461--494}
}
@inproceedings{givchi2015quasi,
  author    = {Givchi, Arash and Palhang, Maziar},
  title     = {Quasi {Newton} temporal difference learning},
  booktitle = {Asian Conference on Machine Learning},
  pages     = {159--172},
  year      = {2015}
}
@inproceedings{pan2017accelerated,
  author    = {Pan, Yangchen and White, Adam and White, Martha},
  title     = {Accelerated gradient temporal difference learning},
  booktitle = {Proceedings of the 31st {AAAI} Conference on Artificial Intelligence},
  pages     = {2464--2470},
  year      = {2017}
}
@inproceedings{hallak2016generalized,
  author    = {Hallak, Assaf and Tamar, Aviv and Munos, Remi and Mannor, Shie},
  title     = {Generalized emphatic temporal difference learning: bias-variance analysis},
  booktitle = {Proceedings of the 30th AAAI Conference on Artificial Intelligence},
  year      = {2016},
  pages     = {1631--1637}
}
@article{zhang2022truncated,
  author    = {Zhang, Shangtong and Whiteson, Shimon},
  title     = {Truncated emphatic temporal difference methods for prediction and control},
  journal   = {The Journal of Machine Learning Research},
  volume    = {23},
  number    = {1},
  pages     = {6859--6917},
  year      = {2022},
  publisher = {JMLR.org}
}
@inproceedings{korda2015td,
  author       = {Korda, Nathaniel and {L. A.}, Prashanth},
  title        = {On {TD(0)} with function approximation: Concentration bounds and a centered variant with exponential convergence},
  booktitle    = {International Conference on Machine Learning},
  pages        = {626--634},
  year         = {2015},
  organization = {PMLR}
}
@book{zhou2021machine,
  author    = {Zhou, Zhi-Hua},
  title     = {Machine Learning},
  publisher = {Springer Nature},
  year      = {2021}
}
@inproceedings{dalal2020tale,
  author    = {Dalal, Gal and Sz{\"o}r{\'e}nyi, Bal{\'a}zs and Thoppe, Gugan},
  title     = {A tale of two-timescale reinforcement learning with the tightest finite-time bound},
  booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
  volume    = {34},
  number    = {04},
  pages     = {3701--3708},
  year      = {2020}
}
@inproceedings{feng2019kernel,
  author    = {Feng, Yihao and Li, Lihong and Liu, Qiang},
  title     = {A kernel loss for solving the {Bellman} equation},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {15430--15441},
  year      = {2019}
}
@inproceedings{basserrano2021logistic,
  author    = {Bas-Serrano, Joan and Curi, Sebastian and Krause, Andreas and Neu, Gergely},
  title     = {Logistic {Q}-Learning},
  booktitle = {International Conference on Artificial Intelligence and Statistics},
  pages     = {3610--3618},
  year      = {2021}
}