From 60bf36ccd12594a97a666c08aafefd8de4945d7d Mon Sep 17 00:00:00 2001 From: GongYu <1356681720@qq.com> Date: Wed, 14 Aug 2024 06:27:52 +0800 Subject: [PATCH] 新版 --- AAAI控制实验图/acrobot.pdf | Bin 171121 -> 0 bytes AAAI控制实验图/cl.pdf | Bin 171847 -> 0 bytes AAAI控制实验图/maze.pdf | Bin 147657 -> 0 bytes AAAI控制实验图/mt.pdf | Bin 154787 -> 0 bytes Apendix/anonymous-submission-latex-2024.aux | 81 +++++++++++++++++++++++++++++++++++++++++++++------------------------------------ Apendix/anonymous-submission-latex-2024.bbl | 12 +----------- Apendix/anonymous-submission-latex-2024.blg | 58 +++++++++++++++++++++++++++++----------------------------- Apendix/anonymous-submission-latex-2024.log | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ Apendix/anonymous-submission-latex-2024.pdf | Bin 200712 -> 0 bytes Apendix/anonymous-submission-latex-2024.synctex.gz | Bin 88567 -> 0 bytes Apendix/anonymous-submission-latex-2024.tex | 671 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- NEW_aaai/anonymous-submission-latex-2025.aux | 100 ++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------- NEW_aaai/anonymous-submission-latex-2025.bbl | 12 +----------- NEW_aaai/anonymous-submission-latex-2025.blg | 58 +++++++++++++++++++++++++++++----------------------------- NEW_aaai/anonymous-submission-latex-2025.log | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------- NEW_aaai/anonymous-submission-latex-2025.pdf | Bin 1216302 -> 0 bytes NEW_aaai/anonymous-submission-latex-2025.synctex.gz | Bin 166442 -> 0 bytes NEW_aaai/anonymous-submission-latex-2025.tex | 31 +++++++++++++++++++++++-------- NEW_aaai/main/conclusion.tex | 35 +++++++++++++++++++++++++++++------ NEW_aaai/main/experiment.tex | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------- NEW_aaai/main/introduction.tex | 32 +++++++++++++++++--------------- NEW_aaai/main/motivation.tex | 483 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- NEW_aaai/main/pic/2-state-offpolicy.pdf | Bin 0 -> 222301 bytes NEW_aaai/main/pic/2-state-onpolicy.pdf | Bin 0 -> 163732 bytes NEW_aaai/main/pic/2StateExample.pdf | Bin 158477 -> 0 bytes NEW_aaai/main/pic/BairdExample copy 2.tex | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
NEW_aaai/main/pic/BairdExample copy.tex | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ NEW_aaai/main/pic/acrobot.pdf | Bin 171121 -> 0 bytes NEW_aaai/main/pic/cl.pdf | Bin 171847 -> 0 bytes NEW_aaai/main/pic/maze.pdf | Bin 147657 -> 0 bytes NEW_aaai/main/pic/mt.pdf | Bin 154787 -> 0 bytes NEW_aaai/main/preliminaries.tex | 393 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------- NEW_aaai/main/theory.tex | 390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 环境图片/2-state.png | Bin 0 -> 11708 bytes 画图.pptx | Bin 0 -> 38493 bytes 论文草稿.txt | 26 ++++++++++++++++++++++++++ 评估实验图/2-state-offpolicy.pdf | Bin 0 -> 222301 bytes 评估实验图/2-state-onpolicy.pdf | Bin 0 -> 163732 bytes 38 files changed, 1860 insertions(+), 1060 deletions(-) create mode 100644 NEW_aaai/main/pic/2-state-offpolicy.pdf create mode 100644 NEW_aaai/main/pic/2-state-onpolicy.pdf create mode 100644 NEW_aaai/main/pic/BairdExample copy 2.tex create mode 100644 NEW_aaai/main/pic/BairdExample copy.tex create mode 100644 环境图片/2-state.png create mode 100644 画图.pptx create mode 100644 评估实验图/2-state-offpolicy.pdf create mode 100644 评估实验图/2-state-onpolicy.pdf diff --git "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/acrobot.pdf" "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/acrobot.pdf" index dbb3116..373e4f1 100644 Binary files "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/acrobot.pdf" and "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/acrobot.pdf" differ diff --git "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/cl.pdf" "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/cl.pdf" index 0bf0a70..32f9c69 100644 Binary files "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/cl.pdf" and "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/cl.pdf" differ diff --git "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/maze.pdf" "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/maze.pdf" index 947501e..baf79bf 100644 Binary files "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/maze.pdf" and "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/maze.pdf" differ diff --git "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/mt.pdf" "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/mt.pdf" index 89548ac..2641925 100644 Binary files "a/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/mt.pdf" and "b/AAAI\346\216\247\345\210\266\345\256\236\351\252\214\345\233\276/mt.pdf" differ diff --git a/Apendix/anonymous-submission-latex-2024.aux b/Apendix/anonymous-submission-latex-2024.aux index 20866e9..fccf89e 100644 --- a/Apendix/anonymous-submission-latex-2024.aux +++ 
b/Apendix/anonymous-submission-latex-2024.aux @@ -1,54 +1,63 @@ \relax \bibstyle{aaai24} +\citation{borkar1997stochastic} +\citation{hirsch1989convergent} +\citation{borkar2000ode} +\citation{borkar2000ode} +\citation{borkar2000ode} +\newlabel{proofth1}{{A.1}{1}} +\newlabel{th1proof}{{A.1}{1}} +\newlabel{thetaFast}{{A-1}{1}} +\newlabel{omegaFast}{{A-2}{1}} +\newlabel{omegaFastFinal}{{A-3}{1}} +\newlabel{omegaInfty}{{A-4}{1}} \citation{sutton2009fast} +\newlabel{odetheta}{{A-5}{2}} +\newlabel{covariance}{{A-6}{2}} +\newlabel{odethetafinal}{{A-7}{2}} +\newlabel{proofth2}{{A.2}{2}} \citation{hirsch1989convergent} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} \citation{hirsch1989convergent} -\newlabel{proofth2}{{A.1}{1}} -\newlabel{thetavmtdcFastest}{{A-1}{1}} -\newlabel{uvmtdcFastest}{{A-2}{1}} -\newlabel{omegavmtdcFastest}{{A-3}{1}} -\newlabel{omegavmtdcFastestFinal}{{A-4}{1}} -\newlabel{omegavmtdcInfty}{{A-5}{1}} -\newlabel{thetavmtdcFaster}{{A-6}{1}} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} +\newlabel{thetavmtdcFastest}{{A-8}{3}} +\newlabel{uvmtdcFastest}{{A-9}{3}} +\newlabel{omegavmtdcFastest}{{A-10}{3}} +\newlabel{omegavmtdcFastestFinal}{{A-11}{3}} +\newlabel{omegavmtdcInfty}{{A-12}{3}} +\newlabel{thetavmtdcFaster}{{A-13}{3}} +\newlabel{uvmtdcFaster}{{A-14}{3}} +\newlabel{uvmtdcFasterFinal}{{A-15}{3}} +\newlabel{uvmtdcInfty}{{A-16}{3}} \citation{borkar1997stochastic} -\newlabel{uvmtdcFaster}{{A-7}{2}} -\newlabel{uvmtdcFasterFinal}{{A-8}{2}} -\newlabel{uvmtdcInfty}{{A-9}{2}} -\newlabel{thetavmtdcSlowerFinal}{{A-11}{2}} -\newlabel{odethetavmtdcfinal}{{A-12}{2}} \citation{hirsch1989convergent} +\newlabel{thetavmtdcSlowerFinal}{{A-18}{4}} +\newlabel{odethetavmtdcfinal}{{A-19}{4}} +\newlabel{proofVMETD}{{A.3}{4}} +\newlabel{th1proof}{{A.3}{4}} +\newlabel{thetaFast}{{A-20}{4}} +\newlabel{omegaFast}{{A-21}{4}} +\newlabel{omegaFastFinal}{{A-22}{4}} \citation{borkar2000ode} \citation{borkar2000ode} \citation{borkar2000ode} -\newlabel{proofVMETD}{{A.2}{3}} -\newlabel{th1proof}{{A.2}{3}} -\newlabel{thetaFast}{{A-13}{3}} -\newlabel{omegaFast}{{A-14}{3}} -\newlabel{omegaFastFinal}{{A-15}{3}} -\newlabel{omegaInfty}{{A-16}{3}} +\newlabel{omegaInfty}{{A-23}{5}} +\newlabel{odetheta}{{A-24}{5}} \citation{sutton2016emphatic} -\newlabel{odetheta}{{A-17}{4}} -\newlabel{rowsum}{{A-20}{4}} -\citation{baird1995residual,sutton2009fast} -\citation{baird1995residual,sutton2009fast,maei2011gradient} -\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} -\newlabel{bairdexample}{{1}{5}} -\newlabel{columnsum}{{A-21}{5}} -\newlabel{odethetafinal}{{A-22}{5}} -\newlabel{experimentaldetails}{{B}{5}} +\newlabel{rowsum}{{A-27}{6}} +\newlabel{columnsum}{{A-28}{6}} +\newlabel{odethetafinal}{{A-29}{6}} +\newlabel{experimentaldetails}{{B}{6}} \bibdata{aaai24} -\bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}} -\bibcite{borkar1997stochastic}{{2}{1997}{{Borkar}}{{}}} -\bibcite{borkar2000ode}{{3}{2000}{{Borkar and Meyn}}{{}}} -\bibcite{hirsch1989convergent}{{4}{1989}{{Hirsch}}{{}}} -\bibcite{maei2011gradient}{{5}{2011}{{Maei}}{{}}} -\bibcite{sutton2009fast}{{6}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}} -\bibcite{sutton2016emphatic}{{7}{2016}{{Sutton, Mahmood, and White}}{{}}} -\newlabel{lrofways}{{1}{6}} -\gdef \@abspage@last{6} +\bibcite{borkar1997stochastic}{{1}{1997}{{Borkar}}{{}}} +\bibcite{borkar2000ode}{{2}{2000}{{Borkar and Meyn}}{{}}} 
+\bibcite{hirsch1989convergent}{{3}{1989}{{Hirsch}}{{}}} +\bibcite{sutton2009fast}{{4}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}} +\bibcite{sutton2016emphatic}{{5}{2016}{{Sutton, Mahmood, and White}}{{}}} +\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} +\newlabel{lrofways}{{1}{7}} +\gdef \@abspage@last{7} diff --git a/Apendix/anonymous-submission-latex-2024.bbl b/Apendix/anonymous-submission-latex-2024.bbl index c409b1b..8bd13dd 100644 --- a/Apendix/anonymous-submission-latex-2024.bbl +++ b/Apendix/anonymous-submission-latex-2024.bbl @@ -1,11 +1,6 @@ -\begin{thebibliography}{7} +\begin{thebibliography}{5} \providecommand{\natexlab}[1]{#1} -\bibitem[{Baird et~al.(1995)}]{baird1995residual} -Baird, L.; et~al. 1995. -\newblock Residual algorithms: Reinforcement learning with function approximation. -\newblock In \emph{Proc. 12th Int. Conf. Mach. Learn.}, 30--37. - \bibitem[{Borkar(1997)}]{borkar1997stochastic} Borkar, V.~S. 1997. \newblock Stochastic approximation with two time scales. @@ -21,11 +16,6 @@ Hirsch, M.~W. 1989. \newblock Convergent activation dynamics in continuous time networks. \newblock \emph{Neural Netw.}, 2(5): 331--349. -\bibitem[{Maei(2011)}]{maei2011gradient} -Maei, H.~R. 2011. -\newblock \emph{Gradient temporal-difference learning algorithms}. -\newblock Ph.D. thesis, University of Alberta. - \bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast} Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009. \newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation. diff --git a/Apendix/anonymous-submission-latex-2024.blg b/Apendix/anonymous-submission-latex-2024.blg index 020924d..117d52a 100644 --- a/Apendix/anonymous-submission-latex-2024.blg +++ b/Apendix/anonymous-submission-latex-2024.blg @@ -3,44 +3,44 @@ Capacity: max_strings=200000, hash_size=200000, hash_prime=170003 The top-level auxiliary file: anonymous-submission-latex-2024.aux The style file: aaai24.bst Database file #1: aaai24.bib -You've used 7 entries, +You've used 5 entries, 2840 wiz_defined-function locations, - 630 strings with 5707 characters, -and the built_in function-call counts, 4424 in all, are: -= -- 372 -> -- 189 + 619 strings with 5446 characters, +and the built_in function-call counts, 3370 in all, are: += -- 277 +> -- 153 < -- 0 -+ -- 74 -- -- 64 -* -- 295 -:= -- 731 -add.period$ -- 28 -call.type$ -- 7 -change.case$ -- 49 -chr.to.int$ -- 8 -cite$ -- 7 -duplicate$ -- 302 -empty$ -- 320 -format.name$ -- 75 -if$ -- 861 ++ -- 60 +- -- 52 +* -- 242 +:= -- 547 +add.period$ -- 20 +call.type$ -- 5 +change.case$ -- 36 +chr.to.int$ -- 6 +cite$ -- 5 +duplicate$ -- 223 +empty$ -- 240 +format.name$ -- 60 +if$ -- 649 int.to.chr$ -- 1 int.to.str$ -- 1 -missing$ -- 63 -newline$ -- 39 -num.names$ -- 28 -pop$ -- 125 +missing$ -- 49 +newline$ -- 29 +num.names$ -- 20 +pop$ -- 92 preamble$ -- 1 -purify$ -- 45 +purify$ -- 34 quote$ -- 0 -skip$ -- 134 +skip$ -- 96 stack$ -- 0 -substring$ -- 246 -swap$ -- 160 +substring$ -- 200 +swap$ -- 128 text.length$ -- 0 text.prefix$ -- 0 top$ -- 0 -type$ -- 63 +type$ -- 45 warning$ -- 0 -while$ -- 42 +while$ -- 31 width$ -- 0 -write$ -- 94 +write$ -- 68 diff --git a/Apendix/anonymous-submission-latex-2024.log b/Apendix/anonymous-submission-latex-2024.log index 1b461bb..9af2568 100644 --- a/Apendix/anonymous-submission-latex-2024.log +++ 
b/Apendix/anonymous-submission-latex-2024.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 12 AUG 2024 17:11 +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 14 AUG 2024 06:25 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -582,7 +582,29 @@ File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm. File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX) \l__color_backend_stack_int=\count335 \l__pdf_internal_box=\box82 -) (./anonymous-submission-latex-2024.aux) +) (./anonymous-submission-latex-2024.aux + +LaTeX Warning: Label `th1proof' multiply defined. + + +LaTeX Warning: Label `thetaFast' multiply defined. + + +LaTeX Warning: Label `omegaFast' multiply defined. + + +LaTeX Warning: Label `omegaFastFinal' multiply defined. + + +LaTeX Warning: Label `omegaInfty' multiply defined. + + +LaTeX Warning: Label `odetheta' multiply defined. + + +LaTeX Warning: Label `odethetafinal' multiply defined. + +) \openout1 = `anonymous-submission-latex-2024.aux'. LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 183. @@ -627,41 +649,46 @@ Package caption Info: listings package is loaded. Package caption Info: End \AtBeginDocument code. Package newfloat Info: `float' package detected. \c@lstlisting=\count342 -LaTeX Font Info: Trying to load font information for U+msa on input line 234. +LaTeX Font Info: Trying to load font information for U+msa on input line 206. (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsa.fd File: umsa.fd 2013/01/14 v3.01 AMS symbols A ) -LaTeX Font Info: Trying to load font information for U+msb on input line 234. +LaTeX Font Info: Trying to load font information for U+msb on input line 206. (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsb.fd File: umsb.fd 2013/01/14 v3.01 AMS symbols B ) -LaTeX Font Info: Trying to load font information for U+esvect on input line 234. +LaTeX Font Info: Trying to load font information for U+esvect on input line 206. (d:/software/texlive/2023/texmf-dist/tex/latex/esvect/uesvect.fd File: uesvect.fd -) [1 +) + +LaTeX Warning: Reference `omega' on page 1 undefined on input line 265. + +[1 + + + +{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}] [2] [3] [4] [5] [6] (./anonymous-submission-latex-2024.bbl) [7] (./anonymous-submission-latex-2024.aux) + +LaTeX Warning: There were undefined references. +LaTeX Warning: There were multiply-defined labels. -{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}] [2] [3] [4] (./pic/BairdExample.tex) - -File: pic/maze_13_13.pdf Graphic file (type pdf) - -Package pdftex.def Info: pic/maze_13_13.pdf used on input line 902. -(pdftex.def) Requested size: 172.61018pt x 135.67113pt. 
- [5] (./anonymous-submission-latex-2024.bbl) [6 <./pic/maze_13_13.pdf>] (./anonymous-submission-latex-2024.aux) ) + ) Here is how much of TeX's memory you used: - 22926 strings out of 476025 - 482831 string characters out of 5789524 - 1878382 words of memory out of 5000000 - 43000 multiletter control sequences out of 15000+600000 - 531474 words of font info for 71 fonts, out of 8000000 for 9000 + 22606 strings out of 476025 + 476412 string characters out of 5789524 + 1879382 words of memory out of 5000000 + 42668 multiletter control sequences out of 15000+600000 + 539762 words of font info for 95 fonts, out of 8000000 for 9000 1141 hyphenation exceptions out of 8191 - 84i,22n,89p,423b,789s stack positions out of 10000i,1000n,20000p,200000b,200000s - -Output written on anonymous-submission-latex-2024.pdf (6 pages, 200712 bytes). + 84i,22n,89p,423b,526s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on anonymous-submission-latex-2024.pdf (7 pages, 208835 bytes). PDF statistics: - 110 PDF objects out of 1000 (max. 8388607) - 68 compressed objects within 1 object stream + 117 PDF objects out of 1000 (max. 8388607) + 73 compressed objects within 1 object stream 0 named destinations out of 1000 (max. 500000) - 18 words of extra memory for PDF output out of 10000 (max. 10000000) + 13 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/Apendix/anonymous-submission-latex-2024.pdf b/Apendix/anonymous-submission-latex-2024.pdf index 48159e7..d36b9b8 100644 Binary files a/Apendix/anonymous-submission-latex-2024.pdf and b/Apendix/anonymous-submission-latex-2024.pdf differ diff --git a/Apendix/anonymous-submission-latex-2024.synctex.gz b/Apendix/anonymous-submission-latex-2024.synctex.gz index b0bf64b..054ddcc 100644 Binary files a/Apendix/anonymous-submission-latex-2024.synctex.gz and b/Apendix/anonymous-submission-latex-2024.synctex.gz differ diff --git a/Apendix/anonymous-submission-latex-2024.tex b/Apendix/anonymous-submission-latex-2024.tex index 3d18dca..a8479fb 100644 --- a/Apendix/anonymous-submission-latex-2024.tex +++ b/Apendix/anonymous-submission-latex-2024.tex @@ -187,122 +187,287 @@ \onecolumn \appendix \section{Relevant proofs} -% \subsection{VMTD} -% \begin{equation} -% \begin{array}{ccl} -% \text{VBE}(\bm{\theta})&=&\mathbb{E}[(\mathbb{E}[\delta|s]-\kappa \mathbb{E}[\mathbb{E}[\delta|s]])^2]. -% \end{array} -% \end{equation} - -% semi-gradient: -% \begin{equation} -% \begin{array}{ccl} -% 0&=&\mathbb{E}[\mathbb{E}[\delta|s]-\kappa \mathbb{E}[\mathbb{E}[\delta|s]](\bm{\phi} - \kappa\mathbb{E}[\bm{\phi}])]\\ -% &=&\mathbb{E}[\delta \phi] - (2\kappa - \kappa^{2})\mathbb{E}[\delta]\mathbb{E}[\phi]. -% \end{array} -% \end{equation} - -% or -% \begin{equation} -% \begin{array}{ccl} -% 0&=&\mathbb{E}[\delta \phi] - \kappa\mathbb{E}[\delta]\mathbb{E}[\phi]. -% \end{array} -% \end{equation} - -% Therefore: -% \begin{equation} -% \begin{array}{ccl} -% \textbf{A}_{\text{VMTD}}&=&{\bm{\Phi}}^{\top} (\textbf{D}_{\mu}-(2\kappa - \kappa^{2})\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}. -% \end{array} -% \end{equation} - -% or -% \begin{equation} -% \begin{array}{ccl} -% \textbf{A}_{\text{VMTD}}&=&{\bm{\Phi}}^{\top} (\textbf{D}_{\mu}-\kappa\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}. 
-% \end{array} -% \end{equation} - - - \subsection{Proof of Theorem 1} +\label{proofth1} +\begin{proof} + \label{th1proof} + The proof is based on Borkar's Theorem for + general stochastic approximation recursions with two time scales + \cite{borkar1997stochastic}. + + % The new TD error for the linear setting is + % \begin{equation*} + % \delta_{\text{new}}=r+\gamma + % \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta]. + % \end{equation*} + A new one-step + linear TD solution is defined + as: + \begin{equation*} + 0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b. + \end{equation*} + Thus, the VMTD's solution is + $\theta_{\text{VMTD}}=A^{-1}b$. + + First, note that recursion (5) can be rewritten as + \begin{equation*} + \theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k), + \end{equation*} + where + \begin{equation*} + \xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k. + \end{equation*} + Due to the settings of the step-size schedule $\alpha_k = o(\beta_k)$, + $\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. + That is, the increments in iteration (4) are uniformly larger than + those in (5); thus (4) is the faster recursion. + Along the faster time scale, iterations of (4) and (5) + are associated with the following ODE system: + \begin{equation} + \dot{\theta}(t) = 0, + \label{thetaFast} + \end{equation} + \begin{equation} + \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t). + \label{omegaFast} + \end{equation} + Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when + viewed from the faster timescale. + By the Hirsch lemma \cite{hirsch1989convergent}, it follows that + $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some + $\theta$ that depends on the initial condition $\theta_0$ of recursion + (5). + Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as + \begin{equation} + \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t). + \label{omegaFastFinal} + \end{equation} + Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$, + i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). + It is easy to see that the function $h$ is Lipschitz with coefficient + $1$. + Let $h_{\infty}(\cdot)$ be the function defined by + $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. + Then $h_{\infty}(\omega)= -\omega$ is well-defined. + For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$ + is the unique globally asymptotically stable equilibrium. + For the ODE + \begin{equation} + \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), + \label{omegaInfty} + \end{equation} + apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its + associated strict Liapunov function. Then, + the origin of (\ref{omegaInfty}) is a globally asymptotically stable + equilibrium. + + + Consider now the recursion (\ref{omega}). + Let + $M_{k+1}=(\delta_k-\omega_k) + -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, + where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, $k\geq0$, + are the sigma fields generated by the history of the iterates. + It is easy to verify that $M_{k+1}$, $k\geq0$, are integrable random variables + that satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$. + Because $\phi_k$, $r_k$ and $\phi'_k$ have uniformly bounded second moments, + it follows that, for some constant $c_1>0$, $\forall k\geq0$, + \begin{equation*} + \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq + c_1(1+||\omega_k||^2+||\theta_k||^2). + \end{equation*} + + + Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. + Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our + conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, + by Theorem 2.2 of \cite{borkar2000ode} we obtain that + $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
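To make the two-timescale structure of recursions (4)-(5) concrete, the following is a minimal illustrative sketch, not part of the original derivation: $\omega$ tracks $\mathbb{E}[\delta]$ with the faster step size $\beta_k$, while $\theta$ follows the centred TD error with the slower step size $\alpha_k=o(\beta_k)$. The helpers \texttt{sample\_transition} and \texttt{phi}, and the particular decay exponents, are assumptions for illustration only.
\begin{lstlisting}[language=Python]
def vmtd(theta, phi, sample_transition, n_steps,
         alpha0=0.1, beta0=0.1, gamma=0.99):
    """Illustrative two-timescale VMTD sketch.

    theta and phi(s) are NumPy arrays; recursion (4) updates omega on the
    faster timescale, recursion (5) updates theta on the slower one.
    """
    omega = 0.0
    for k in range(n_steps):
        alpha_k = alpha0 / (1.0 + k)        # slower step size
        beta_k = beta0 / (1.0 + k) ** 0.6   # faster step size, alpha_k = o(beta_k)
        s, r, s_next = sample_transition()  # one transition under the behaviour policy
        delta = r + gamma * theta @ phi(s_next) - theta @ phi(s)
        theta = theta + alpha_k * (delta - omega) * phi(s)  # recursion (5)
        omega = omega + beta_k * (delta - omega)            # recursion (4)
    return theta, omega
\end{lstlisting}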
+ + Consider now the slower time scale recursion (5). + Based on the above analysis, (5) can be rewritten as + \begin{equation*} + \theta_{k+1}\leftarrow + \theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k. + \end{equation*} + + Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, $k\geq0$, + be the sigma fields generated by the history of the iterates, and let + $Z_{k+1}=(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k + -\mathbb{E}[(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k|\mathcal{G}(k)]$. + It is easy to verify that $Z_{k+1}$, $k\geq0$, are integrable random variables + that satisfy $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$. + Because $\phi_k$, $r_k$ and $\phi'_k$ have uniformly bounded second moments, + it follows that, for some constant $c_2>0$, $\forall k\geq0$, + \begin{equation*} + \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq + c_2(1+||\theta_k||^2). + \end{equation*} + + Consider now the following ODE associated with (5): + \begin{equation} + \begin{array}{ccl} + \dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\ + &=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\ + &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\ + &=&-A\theta(t)+b. + \end{array} + \label{odetheta} + \end{equation} + Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE + (\ref{odetheta}). + \begin{equation*} + \vec{h}(\theta(t))=-A\theta(t)+b. + \end{equation*} + Consider the cross-covariance matrix, + \begin{equation} + \begin{array}{ccl} + A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\ + &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\ + \end{array} + \label{covariance} + \end{equation} + where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ + \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to + the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or + is initialized randomly in the Markov chain under on-policy updating.}. + Note that the covariance matrices $\mathrm{Cov}(\phi,\phi)$ and + $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive + definite. Then, the matrix $A$ is semi-positive definite because $A$ is + a positive-weighted linear combination of two semi-positive definite matrices + (\ref{covariance}). + Furthermore, $A$ is nonsingular by assumption. + Hence, the cross-covariance matrix $A$ is positive definite. + + Therefore, + $\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically + stable equilibrium for ODE (\ref{odetheta}). + Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow + \infty}\frac{\vec{h}(r\theta)}{r}$. Then + $\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined. + Consider now the ODE + \begin{equation} + \dot{\theta}(t)=-A\theta(t). + \label{odethetafinal} + \end{equation} + The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. + Thus, Assumptions (A1) and (A2) are verified. + \end{proof} + +\subsection{Proof of Theorem 2} \label{proofth2} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation.
For the VMTDC algorithm, a new one-step linear TD solution is defined as: \begin{equation*} - 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). + 0=\mathbb{E}[({\phi} - \gamma {\phi}' - \mathbb{E}[{\phi} - \gamma {\phi}']){\phi}^\top]\mathbb{E}[{\phi} {\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta]){\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}{\theta}+{b}). \end{equation*} The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is -$\bm{\theta}_{\text{VMTDC}}=\textbf{A}^{-1}\bm{b}$. +${\theta}_{\text{VMTDC}}=\textbf{A}^{-1}{b}$. First, note that recursion (5) and (6) can be rewritten as, respectively, \begin{equation*} - \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\zeta_k \bm{x}(k), + {\theta}_{k+1}\leftarrow {\theta}_k+\zeta_k {x}(k), \end{equation*} \begin{equation*} - \bm{u}_{k+1}\leftarrow \bm{u}_k+\beta_k \bm{y}(k), + {u}_{k+1}\leftarrow {u}_k+\beta_k {y}(k), \end{equation*} where \begin{equation*} - \bm{x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], + {x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) {\phi}_k - \gamma{\phi}'_{k}({\phi}^{\top}_k {u}_k)], \end{equation*} \begin{equation*} - \bm{y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_k]\bm{\phi}_k. + {y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - {\phi}^{\top}_k {u}_k]{\phi}_k. \end{equation*} Recursion (5) can also be rewritten as \begin{equation*} - \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k z(k), + {\theta}_{k+1}\leftarrow {\theta}_k+\beta_k z(k), \end{equation*} where \begin{equation*} - z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], + z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) {\phi}_k - \gamma{\phi}'_{k}({\phi}^{\top}_k {u}_k)], \end{equation*} Due to the settings of step-size schedule -$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $\bm{x}(k)\rightarrow 0$, $\bm{y}(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. +$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, ${x}(k)\rightarrow 0$, ${y}(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. That is that the increments in iteration (7) are uniformly larger than those in (6) and the increments in iteration (6) are uniformly larger than those in (5), thus (7) is the fastest recursion, (6) is the second fast recursion and (5) is the slower recursion. Along the fastest time scale, iterations of (5), (6) and (7) are associated to ODEs system as follows: \begin{equation} - \dot{\bm{\theta}}(t) = 0, + \dot{{\theta}}(t) = 0, \label{thetavmtdcFastest} \end{equation} \begin{equation} - \dot{\bm{u}}(t) = 0, + \dot{{u}}(t) = 0, \label{uvmtdcFastest} \end{equation} \begin{equation} - \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)]-\omega(t). + \dot{\omega}(t)=\mathbb{E}[\delta_t|{u}(t),{\theta}(t)]-\omega(t). \label{omegavmtdcFastest} \end{equation} -Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\bm{\theta}(t)\equiv \bm{\theta}$ -and $\bm{u}(t)\equiv \bm{u}$ when viewed from the fastest timescale. 
+Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both ${\theta}(t)\equiv {\theta}$ +and ${u}(t)\equiv {u}$ when viewed from the fastest timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that -$||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some -$\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion -(5) and $||\bm{u}_k-\bm{u}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +$||{\theta}_k-{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +${\theta}$ that depends on the initial condition ${\theta}_0$ of recursion +(5) and $||{u}_k-{u}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $u$ that depends on the initial condition $u_0$ of recursion (6). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) can be written as \begin{equation} - \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u},\bm{\theta}]-\omega(t). + \dot{\omega}(t)=\mathbb{E}[\delta_t|{u},{\theta}]-\omega(t). \label{omegavmtdcFastestFinal} \end{equation} -Consider the function $h(\omega)=\mathbb{E}[\delta|\bm{\theta},\bm{u}]-\omega$, +Consider the function $h(\omega)=\mathbb{E}[\delta|{\theta},{u}]-\omega$, i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. - For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta},\bm{u}]$ + For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|{\theta},{u}]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} @@ -318,18 +483,18 @@ Consider now the recursion (7). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, -where $\mathcal{F}(k)=\sigma(\omega_l,\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq -c_1(1+||\omega_k||^2+||\bm{u}_k||^2+||\bm{\theta}_k||^2). +c_1(1+||\omega_k||^2+||{u}_k||^2+||{\theta}_k||^2). \end{equation*} @@ -342,53 +507,53 @@ $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the second time scale recursion (6). Based on the above analysis, (6) can be rewritten as % \begin{equation*} -% \bm{u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|\bm{u}_k,\bm{\theta}_k] - \bm{\phi}^{\top} (s_k) \bm{u}_k]\bm{\phi}(s_k). +% {u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|{u}_k,{\theta}_k] - {\phi}^{\top} (s_k) {u}_k]{\phi}(s_k). % \end{equation*} \begin{equation} - \dot{\bm{\theta}}(t) = 0, + \dot{{\theta}}(t) = 0, \label{thetavmtdcFaster} \end{equation} \begin{equation} - \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)])\bm{\phi}_t|\bm{\theta}(t)] - \textbf{C}\bm{u}(t). + \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|{u}(t),{\theta}(t)]){\phi}_t|{\theta}(t)] - \textbf{C}{u}(t). \label{uvmtdcFaster} \end{equation} -The ODE (\ref{thetavmtdcFaster}) suggests that $\bm{\theta}(t)\equiv \bm{\theta}$ (i.e., a time invariant parameter) +The ODE (\ref{thetavmtdcFaster}) suggests that ${\theta}(t)\equiv {\theta}$ (i.e., a time invariant parameter) when viewed from the second fast timescale. 
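As a concrete illustration of the three-timescale scheme analysed here, a minimal sketch of one VMTDC update of recursions (5)-(7) is given below; it is not the paper's implementation. The feature map \texttt{phi}, the sampled transition tuple, and the specific decay exponents (chosen so that $\alpha_k=o(\zeta_k)$ and $\zeta_k=o(\beta_k)$) are assumptions.
\begin{lstlisting}[language=Python]
def vmtdc_step(theta, u, omega, transition, phi, k,
               alpha0=0.1, zeta0=0.1, beta0=0.1, gamma=0.99):
    """One illustrative VMTDC update; theta, u and phi(s) are NumPy arrays."""
    s, r, s_next = transition
    f, f_next = phi(s), phi(s_next)
    delta = r + gamma * theta @ f_next - theta @ f
    alpha_k = alpha0 / (1.0 + k)          # slowest step size
    zeta_k = zeta0 / (1.0 + k) ** 0.75    # intermediate step size
    beta_k = beta0 / (1.0 + k) ** 0.5     # fastest step size
    theta = theta + alpha_k * ((delta - omega) * f - gamma * f_next * (f @ u))  # (5)
    u = u + zeta_k * (delta - omega - f @ u) * f                                # (6)
    omega = omega + beta_k * (delta - omega)                                    # (7)
    return theta, u, omega
\end{lstlisting}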
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that -$||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some -$\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion +$||{\theta}_k-{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +${\theta}$ that depends on the initial condition ${\theta}_0$ of recursion (5). Consider now the recursion (6). Let -$N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k)|\mathcal{I} (k)]$, -where $\mathcal{I}(k)=\sigma(\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq -c_2(1+||\bm{u}_k||^2+||\bm{\theta}_k||^2). +c_2(1+||{u}_k||^2+||{\theta}_k||^2). \end{equation*} -Because $\bm{\theta}(t)\equiv \bm{\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) +Because ${\theta}(t)\equiv {\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) can be written as \begin{equation} - \dot{\bm{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{\theta}])\bm{\phi}_t|\bm{\theta}] - \textbf{C}\bm{u}(t). + \dot{{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|{\theta}]){\phi}_t|{\theta}] - \textbf{C}{u}(t). \label{uvmtdcFasterFinal} \end{equation} -Now consider the function $h(\bm{u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\bm{\theta}]|\bm{\theta}] -\textbf{C}\bm{u}$, i.e., the +Now consider the function $h({u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|{\theta}]|{\theta}] -\textbf{C}{u}$, i.e., the driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}), -$\bm{u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}])\bm{\phi}|\bm{\theta}]$ is the unique globally asymptotically -stable equilibrium. Let $h_{\infty}(\bm{u})=-\textbf{C}\bm{u}$. +${u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|{\theta}]){\phi}|{\theta}]$ is the unique globally asymptotically +stable equilibrium. Let $h_{\infty}({u})=-\textbf{C}{u}$. For the ODE \begin{equation} - \dot{\bm{u}}(t) = h_{\infty}(\bm{u}(t))= -\textbf{C}\bm{u}(t), + \dot{{u}}(t) = h_{\infty}({u}(t))= -\textbf{C}{u}(t), \label{uvmtdcInfty} \end{equation} the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable @@ -397,60 +562,60 @@ Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that -$||\bm{u}_k-\bm{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. +$||{u}_k-{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower timescale recursion (5). In the light of the above, (5) can be rewritten as \begin{equation} - \bm{\theta}_{k+1} \leftarrow \bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ - - \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). + {\theta}_{k+1} \leftarrow {\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|{\theta}_k]) {\phi}_k\\ + - \alpha_k \gamma{\phi}'_{k}({\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|{\theta}_k]){\phi}|{\theta}_k]). 
\end{equation} -Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq - c_1(1+||\omega_k||^2+||\bm{\theta}_k||^2). + c_1(1+||\omega_k||^2+||{\theta}_k||^2). \end{equation*} @@ -561,23 +726,23 @@ The VMTD's solution is Consider now the slower time scale recursion (12). Based on the above analysis, (12) can be rewritten as % \begin{equation*} - % \bm{\theta}_{k+1}\leftarrow - % \bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k. + % {\theta}_{k+1}\leftarrow + % {\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|{\theta}_k]){\phi}_k. % \end{equation*} \begin{equation*} \begin{split} - \bm{\theta}_{k+1}&\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k)\bm{\phi}_k -\alpha_k \omega_{k+1}\bm{\phi}_k\\ - &=\bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k\\ - &=\bm{\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k)\bm{\phi}_k -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]\bm{\phi}_k\\ - &= \bm{\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}])\bm{\phi}_k}_{\bm{b}_{\text{VMETD},k}} - -\underbrace{(F_k\rho_k\bm{\phi}_k(\bm{\phi}_k-\gamma\bm{\phi}_{k+1})^{\top}-\bm{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k (\bm{\phi}_k-\gamma\bm{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}\bm{\theta}_k\} + {\theta}_{k+1}&\leftarrow {\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k){\phi}_k -\alpha_k \omega_{k+1}{\phi}_k\\ + &={\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|{\theta}_k]){\phi}_k\\ + &={\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma {\theta}_k^{\top}{\phi}_{k+1}-{\theta}_k^{\top}{\phi}_k){\phi}_k -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]{\phi}_k\\ + &= {\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}]){\phi}_k}_{{b}_{\text{VMETD},k}} + -\underbrace{(F_k\rho_k{\phi}_k({\phi}_k-\gamma{\phi}_{k+1})^{\top}-{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k ({\phi}_k-\gamma{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}{\theta}_k\} \end{split} \end{equation*} - Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq - c_2(1+||\bm{\theta}_k||^2). + c_2(1+||{\theta}_k||^2). \end{equation*} Consider now the following ODE associated with (12): \begin{equation} \begin{array}{ccl} - \dot{\bm{\theta}}(t)&=&-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}. + \dot{{\theta}}(t)&=&-\textbf{A}_{\text{VMETD}}{\theta}(t)+{b}_{\text{VMETD}}. 
\end{array} \label{odetheta} \end{equation} \begin{equation} \begin{split} \textbf{A}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k \bm{\phi}_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - % &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\underbrace{\bm{\phi}_k}_{X}\underbrace{F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}}_{Y}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - &=\sum_{s} f(s) \bm{\phi}(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} \\ - &={\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) \bm{\Phi} \\ - &={\bm{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\ - &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\ - &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}} \\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k {\phi}_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ + % &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\underbrace{{\phi}_k}_{X}\underbrace{F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}}_{Y}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ + &=\sum_{s} f(s) {\phi}(s)({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s'))^{\top} - \sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s'))^{\top} \\ + &={{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) {\Phi} - {{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) {\Phi} \\ + &={{\Phi}}^{\top} (\textbf{F} - 
\textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){{\Phi}} \\ + &={{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){{\Phi}} \\ + &={{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){{\Phi}} \\ \end{split} \end{equation} \begin{equation} \begin{split} - \bm{b}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\bm{b}_{\text{VMETD},k}]\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}\bm{\phi}_k]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ - &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ - &=\sum_{s} f(s) \bm{\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)r_{\pi} \\ - &=\bm{\bm{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi} \\ + {b}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[{b}_{\text{VMETD},k}]\\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}{\phi}_k]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ + &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ + &=\sum_{s} f(s) {\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)r_{\pi} \\ + &={{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi} \\ \end{split} \end{equation} - Let $\vec{h}(\bm{\theta}(t))$ be the driving vector field of the ODE + Let $\vec{h}({\theta}(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} - \vec{h}(\bm{\theta}(t))=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}. + \vec{h}({\theta}(t))=-\textbf{A}_{\text{VMETD}}{\theta}(t)+{b}_{\text{VMETD}}. \end{equation*} - An $\bm{\Phi}^{\top}\bm{\text{X}}\bm{\Phi}$ matrix of this - form will be positive definite whenever the matrix $\bm{\text{X}}$ is positive definite. - Any matrix $\bm{\text{X}}$ is positive definite if and only if - the symmetric matrix $\bm{\text{S}}=\bm{\text{X}}+\bm{\text{X}}^{\top}$ is positive definite. - Any symmetric real matrix $\bm{\text{S}}$ is positive definite if the absolute values of + An ${\Phi}^{\top}{\text{X}}{\Phi}$ matrix of this + form will be positive definite whenever the matrix ${\text{X}}$ is positive definite. + Any matrix ${\text{X}}$ is positive definite if and only if + the symmetric matrix ${\text{S}}={\text{X}}+{\text{X}}^{\top}$ is positive definite. + Any symmetric real matrix ${\text{S}}$ is positive definite if the absolute values of its diagonal entries are greater than the sum of the absolute values of the corresponding off-diagonal entries\cite{sutton2016emphatic}. 
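The positive-definiteness argument used here rests on a simple sufficient condition: ${\Phi}^{\top}\textbf{X}{\Phi}$ is positive definite when $\textbf{S}=\textbf{X}+\textbf{X}^{\top}$ is, and a symmetric $\textbf{S}$ with a positive diagonal that strictly dominates the absolute off-diagonal row sums is positive definite. The short check below is only an illustration of that test, not code from the paper.
\begin{lstlisting}[language=Python]
import numpy as np

def diagonally_dominant_pd(X):
    """Sufficient test: S = X + X^T has a positive diagonal, and each diagonal
    entry strictly exceeds the sum of the absolute values of the off-diagonal
    entries in its row."""
    S = X + X.T
    d = np.diag(S)
    off_row_sums = np.sum(np.abs(S), axis=1) - np.abs(d)
    return bool(np.all(d > 0) and np.all(d > off_row_sums))
\end{lstlisting}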
@@ -693,14 +858,14 @@ The VMTD's solution is Therefore, - $\bm{\theta}^*=\textbf{A}_{\text{VMETD}}^{-1}\bm{b}_{\text{VMETD}}$ can be seen to be the unique globally asymptotically + ${\theta}^*=\textbf{A}_{\text{VMETD}}^{-1}{b}_{\text{VMETD}}$ can be seen to be the unique globally asymptotically stable equilibrium for ODE (\ref{odetheta}). - Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{r\rightarrow - \infty}\frac{\vec{h}(r\bm{\theta})}{r}$. Then - $\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}_{\text{VMETD}}\bm{\theta}$ is well-defined. + Let $\vec{h}_{\infty}({\theta})=\lim_{r\rightarrow + \infty}\frac{\vec{h}(r{\theta})}{r}$. Then + $\vec{h}_{\infty}({\theta})=-\textbf{A}_{\text{VMETD}}{\theta}$ is well-defined. Consider now the ODE \begin{equation} - \dot{\bm{\theta}}(t)=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t). + \dot{{\theta}}(t)=-\textbf{A}_{\text{VMETD}}{\theta}(t). \label{odethetafinal} \end{equation} The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. @@ -719,25 +884,25 @@ The VMTD's solution is % { % \begin{tabular}{cccc} % \toprule -% Algorithm&Key matrix $\textbf{A}$&{Positive definite}&{$\bm{b}$}\\\midrule -% On-policy TD&$\bm{\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma -% \textbf{P}_{\pi})\bm{\Phi}$&$\checkmark$&$\bm{b}_{\text{on}}=\bm{\Phi}^{\top}\textbf{D}_{\pi}\textbf{r}_{\pi}$\\ -% On-policy VMTD&${\bm{\Phi}}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$ -% &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top})\textbf{r}_{\pi}$\\ +% Algorithm&Key matrix $\textbf{A}$&{Positive definite}&{${b}$}\\\midrule +% On-policy TD&${\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma +% \textbf{P}_{\pi}){\Phi}$&$\checkmark$&${b}_{\text{on}}={\Phi}^{\top}\textbf{D}_{\pi}\textbf{r}_{\pi}$\\ +% On-policy VMTD&${{\Phi}}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){{\Phi}}$ +% &$\checkmark$&${\Phi}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top})\textbf{r}_{\pi}$\\ % \midrule -% Off-policy TD&$\textbf{A}_{\text{off}}={\bm{\Phi}}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma -% \textbf{P}_{\pi}){\bm{\Phi}}$&$\times$&$\bm{b}_{\text{off}}=\bm{\Phi}^{\top}\textbf{D}_{\mu}\textbf{r}_{\pi}$\\ -% TDC& $\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$&$\checkmark$&$\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{off}}$ +% Off-policy TD&$\textbf{A}_{\text{off}}={{\Phi}}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma +% \textbf{P}_{\pi}){{\Phi}}$&$\times$&${b}_{\text{off}}={\Phi}^{\top}\textbf{D}_{\mu}\textbf{r}_{\pi}$\\ +% TDC& $\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$&$\checkmark$&$\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}{b}_{\text{off}}$ % \\ -% ETD& ${\bm{\Phi}}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}}$ -% &$\checkmark$&$\bm{\Phi}^{\top}\textbf{F}\textbf{r}_{\pi}$\\ +% ETD& ${{\Phi}}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}){{\Phi}}$ +% &$\checkmark$&${\Phi}^{\top}\textbf{F}\textbf{r}_{\pi}$\\ % \midrule -% Off-policy VMTD&$\textbf{A}_{\text{VMTD}}={\bm{\Phi}}^{\top} (\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$ -% &$\times$&$\bm{b}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\ -% VMTDC& 
$\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VMTD}}$&$\checkmark$&$\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{VMTD}}$ +% Off-policy VMTD&$\textbf{A}_{\text{VMTD}}={{\Phi}}^{\top} (\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){{\Phi}}$ +% &$\times$&${b}_{\text{VMTD}}={\Phi}^{\top}(\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\ +% VMTDC& $\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VMTD}}$&$\checkmark$&$\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}{b}_{\text{VMTD}}$ % \\ -% VMETD& ${\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$ -% &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}$\\ +% VMETD& ${{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){{\Phi}}$ +% &$\checkmark$&${\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}$\\ % \bottomrule % \end{tabular} % } @@ -835,50 +1000,57 @@ The VMTD's solution is \section{Experimental details} \label{experimentaldetails} -The 2-state counterexample and the 7-state counterexample -are well-known off-policy experimental environments. The 2-state -counterexample is relatively simple, so next, I'll provide a detailed -description of the 7-state counterexample environment. - -\textbf{Baird's off-policy counterexample:} This task is well known as a -counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As -shown in Figure \ref{bairdexample}, reward for each transition is zero. Thus the true values are zeros for all states and for any given policy. The behaviour policy -chooses actions represented by solid lines with a probability of $\frac{1}{7}$ -and actions represented by dotted lines with a probability of $\frac{6}{7}$. The -target policy is expected to choose the solid line with more probability than $\frac{1}{7}$, -and it chooses the solid line with probability of $1$ in this paper. - The discount factor $\gamma =0.99$, and the feature matrix is -defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}. -\begin{figure} - \begin{center} - \input{pic/BairdExample.tex} - \caption{7-state version of Baird's off-policy counterexample.} - \label{bairdexample} - \end{center} -\end{figure} - -The feature matrix of 7-state version of Baird's off-policy counterexample is -defined as follow: -\begin{equation*} -\Phi_{Counter}=\left[ -\begin{array}{cccccccc} -1 & 2& 0& 0& 0& 0& 0& 0\\ -1 & 0& 2& 0& 0& 0& 0& 0\\ -1 & 0& 0& 2& 0& 0& 0& 0\\ -1 & 0& 0& 0& 2& 0& 0& 0\\ -1 & 0& 0& 0& 0& 2& 0& 0\\ -1 & 0& 0& 0& 0& 0& 2& 0\\ -2 & 0& 0& 0& 0& 0& 0& 1 -\end{array}\right] -\end{equation*} +% The 2-state counterexample and the 7-state counterexample +% are well-known off-policy experimental environments. The 2-state +% counterexample is relatively simple, so next, I'll provide a detailed +% description of the 7-state counterexample environment. + +% \textbf{Baird's off-policy counterexample:} This task is well known as a +% counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As +% shown in Figure \ref{bairdexample}, reward for each transition is zero. Thus the true values are zeros for all states and for any given policy. 
The behaviour policy +% chooses actions represented by solid lines with a probability of $\frac{1}{7}$ +% and actions represented by dotted lines with a probability of $\frac{6}{7}$. The +% target policy is expected to choose the solid line with more probability than $\frac{1}{7}$, +% and it chooses the solid line with probability of $1$ in this paper. +% The discount factor $\gamma =0.99$, and the feature matrix is +% defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}. +% \begin{figure} +% \begin{center} +% \input{pic/BairdExample.tex} +% \caption{7-state version of Baird's off-policy counterexample.} +% \label{bairdexample} +% \end{center} +% \end{figure} + +% The feature matrix of 7-state version of Baird's off-policy counterexample is +% defined as follow: +% \begin{equation*} +% \Phi_{Counter}=\left[ +% \begin{array}{cccccccc} +% 1 & 2& 0& 0& 0& 0& 0& 0\\ +% 1 & 0& 2& 0& 0& 0& 0& 0\\ +% 1 & 0& 0& 2& 0& 0& 0& 0\\ +% 1 & 0& 0& 0& 2& 0& 0& 0\\ +% 1 & 0& 0& 0& 0& 2& 0& 0\\ +% 1 & 0& 0& 0& 0& 0& 2& 0\\ +% 2 & 0& 0& 0& 0& 0& 0& 1 +% \end{array}\right] +% \end{equation*} +2-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. +For TD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. +For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$ and $\alpha_0 = 0.1$. +For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$. +For VMTD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. 2-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. +For TD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$ and $\alpha_0 = 0.1$.For ETD algorithm, $\alpha_0 = 0.1$. -For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For ETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. +For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For VMETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. +For VMTD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. -7-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. -For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$ and $\alpha_0 = 0.1$.For ETD algorithm, $\alpha_0 = 0.1$. -For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For ETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. +% 7-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. +% For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$ and $\alpha_0 = 0.1$.For ETD algorithm, $\alpha_0 = 0.1$. +% For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For ETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For all policy evaluation experiments, each experiment is independently run 100 times. @@ -887,28 +1059,28 @@ For the four control experiments: The learning rates for each algorithm in all experiments are shown in Table \ref{lrofways}. For all control experiments, each experiment is independently run 50 times. -\textbf{Maze}: The learning agent should find a shortest path from the upper -left corner to the lower right corner. 
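For reference, the step-size configuration described above (linear decay from the initial value $\alpha_0$, with fixed ratios such as $\frac{\alpha_k}{\zeta_k}=5$ and $\frac{\alpha_k}{\omega_k}=4$) can be realised as in the sketch below. This is an illustration only; the decay horizon \texttt{n\_steps} is an assumption rather than a value taken from the experiments.
\begin{lstlisting}[language=Python]
def linear_decay_schedule(n_steps, alpha0=0.1,
                          alpha_over_zeta=5.0, alpha_over_omega=4.0):
    """Yield (alpha_k, zeta_k, omega_k): every rate decays linearly to zero
    while the reported ratios alpha_k/zeta_k and alpha_k/omega_k stay fixed."""
    for k in range(n_steps):
        decay = 1.0 - k / n_steps
        alpha_k = alpha0 * decay
        zeta_k = alpha_k / alpha_over_zeta    # alpha_k / zeta_k = 5
        omega_k = alpha_k / alpha_over_omega  # alpha_k / omega_k = 4
        yield alpha_k, zeta_k, omega_k
\end{lstlisting}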
- In each state, -there are four alternative actions: $up$, $down$, $left$, and $right$, which -takes the agent deterministically to the corresponding neighbour state, -except when a movement is blocked by an obstacle or the edge -of the maze. Rewards are $-1$ in all transitions until the -agent reaches the goal state. -The discount factor $\gamma=0.99$, and states $s$ are represented by tabular -features.The maximum number of moves in the game is set to 1000. - \begin{figure} -\centering -\includegraphics[scale=0.35]{pic/maze_13_13.pdf} -\caption{Maze.} -\end{figure} - -\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are -selected from the gym official website and correspond to the following -versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''. -For specific details, please refer to the gym official website. -The maximum number of steps for the Mountain Car environment is set to 1000, -while the default settings are used for the other two environments. In Mountain car and Acrobot, features are generated by tile coding. +% \textbf{Maze}: The learning agent should find a shortest path from the upper +% left corner to the lower right corner. +% In each state, +% there are four alternative actions: $up$, $down$, $left$, and $right$, which +% takes the agent deterministically to the corresponding neighbour state, +% except when a movement is blocked by an obstacle or the edge +% of the maze. Rewards are $-1$ in all transitions until the +% agent reaches the goal state. +% The discount factor $\gamma=0.99$, and states $s$ are represented by tabular +% features.The maximum number of moves in the game is set to 1000. +% \begin{figure} +% \centering +% \includegraphics[scale=0.35]{pic/maze_13_13.pdf} +% \caption{Maze.} +% \end{figure} + +% \textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are +% selected from the gym official website and correspond to the following +% versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''. +% For specific details, please refer to the gym official website. +% The maximum number of steps for the Mountain Car environment is set to 1000, +% while the default settings are used for the other two environments. In Mountain car and Acrobot, features are generated by tile coding. \begin{table*}[htb] \centering @@ -918,15 +1090,14 @@ while the default settings are used for the other two environments. 
In Mountain \hline \multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\ \hline - % Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ + Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ GQ($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\ EQ($\alpha$)&$0.006$ &$0.005$ &$0.001$ &$0.0005$ \\ - % VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ + VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ VMGQ($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\ VMEQ($\alpha,\beta$)&$0.001,0.0005$ &$0.005,0.0001$ &$0.001,0.0001$ &$0.0005,0.0001$ \\ - % AC($lr_{\text{actor}},lr_{\text{critic}}$)&$0.01,0.1$ &$0.01,0.01$ &$0.01,0.05$ &$0.01,0.05$ \\ - % Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ - % VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ + Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ + VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ \hline \end{tabular} \label{lrofways} diff --git a/NEW_aaai/anonymous-submission-latex-2025.aux b/NEW_aaai/anonymous-submission-latex-2025.aux index 18db901..a78317c 100644 --- a/NEW_aaai/anonymous-submission-latex-2025.aux +++ b/NEW_aaai/anonymous-submission-latex-2025.aux @@ -26,81 +26,69 @@ \newlabel{introduction}{{}{1}} \citation{Sutton2018book} \citation{Sutton2018book} -\citation{sutton2016emphatic} \newlabel{preliminaries}{{}{2}} \newlabel{valuefunction}{{}{2}} \newlabel{linearvaluefunction}{{1}{2}} -\newlabel{thetatd_onpolicy}{{}{2}} -\newlabel{thetatd_offpolicy}{{}{2}} -\newlabel{thetatdc}{{}{3}} -\newlabel{utdc}{{}{3}} -\newlabel{fvmetd}{{2}{3}} -\newlabel{thetaetd}{{}{3}} \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} -\newlabel{alg:algorithm 2}{{1}{3}} -\newlabel{alg:algorithm 5}{{2}{4}} -\newlabel{thetavmtdc}{{5}{4}} -\newlabel{uvmtdc}{{6}{4}} -\newlabel{omegavmtdc}{{7}{4}} -\newlabel{rho_VPBE}{{8}{4}} +\newlabel{tab:min_eigenvalues}{{1}{3}} +\newlabel{delta}{{3}{3}} +\newlabel{omega}{{4}{3}} +\newlabel{theta}{{5}{3}} +\newlabel{thetavmtdc}{{8}{4}} +\newlabel{uvmtdc}{{9}{4}} +\newlabel{omegavmtdc}{{10}{4}} +\newlabel{fvmetd}{{11}{4}} \newlabel{thetavmetd}{{12}{4}} \newlabel{omegavmetd}{{13}{4}} +\citation{borkar1997stochastic} \citation{sutton2009fast} -\citation{hirsch1989convergent} -\newlabel{theorem2}{{1}{5}} -\newlabel{thetavmtdcFastest}{{14}{5}} -\newlabel{uvmtdcFastest}{{15}{5}} -\newlabel{omegavmtdcFastest}{{16}{5}} -\newlabel{omegavmtdcFastestFinal}{{17}{5}} -\newlabel{omegavmtdcInfty}{{18}{5}} -\citation{borkar2000ode} -\citation{borkar2000ode} -\citation{borkar2000ode} \citation{borkar1997stochastic} +\newlabel{theorem1}{{1}{5}} +\newlabel{th1proof}{{}{5}} +\newlabel{covariance}{{14}{5}} +\newlabel{theorem2}{{2}{5}} +\newlabel{theorem3}{{3}{5}} +\newlabel{rowsum}{{15}{5}} +\newlabel{columnsum}{{16}{5}} \citation{ng1999policy} \citation{devlin2012dynamic} -\newlabel{theorem3}{{2}{6}} -\newlabel{rowsum}{{19}{6}} \newlabel{example_bias}{{2}{6}} -\newlabel{columnsum}{{20}{6}} \bibdata{aaai25} \bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}} -\newlabel{2-state}{{1(a)}{7}} +\newlabel{2-state}{{3(a)}{7}} \newlabel{sub@2-state}{{(a)}{7}} -\newlabel{7-state}{{1(b)}{7}} +\newlabel{7-state}{{3(b)}{7}} \newlabel{sub@7-state}{{(b)}{7}} -\newlabel{MazeFull}{{1(c)}{7}} +\newlabel{MazeFull}{{3(c)}{7}} 
\newlabel{sub@MazeFull}{{(c)}{7}} -\newlabel{CliffWalkingFull}{{1(d)}{7}} +\newlabel{CliffWalkingFull}{{3(d)}{7}} \newlabel{sub@CliffWalkingFull}{{(d)}{7}} -\newlabel{MountainCarFull}{{1(e)}{7}} +\newlabel{MountainCarFull}{{3(e)}{7}} \newlabel{sub@MountainCarFull}{{(e)}{7}} -\newlabel{AcrobotFull}{{1(f)}{7}} +\newlabel{AcrobotFull}{{3(f)}{7}} \newlabel{sub@AcrobotFull}{{(f)}{7}} -\newlabel{Complete_full}{{1}{7}} +\newlabel{Complete_full}{{3}{7}} \bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano et~al.}}{{Bas-Serrano, Curi, Krause, and Neu}}} \bibcite{borkar1997stochastic}{{3}{1997}{{Borkar}}{{}}} -\bibcite{borkar2000ode}{{4}{2000}{{Borkar and Meyn}}{{}}} -\bibcite{chen2023modified}{{5}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}} -\bibcite{devlin2012dynamic}{{6}{2012}{{Devlin and Kudenko}}{{}}} -\bibcite{feng2019kernel}{{7}{2019}{{Feng, Li, and Liu}}{{}}} -\bibcite{givchi2015quasi}{{8}{2015}{{Givchi and Palhang}}{{}}} -\bibcite{hackman2012faster}{{9}{2012}{{Hackman}}{{}}} -\bibcite{hallak2016generalized}{{10}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}} -\bibcite{hirsch1989convergent}{{11}{1989}{{Hirsch}}{{}}} -\bibcite{johnson2013accelerating}{{12}{2013}{{Johnson and Zhang}}{{}}} -\bibcite{korda2015td}{{13}{2015}{{Korda and La}}{{}}} -\bibcite{liu2018proximal}{{14}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}} -\bibcite{liu2015finite}{{15}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} -\bibcite{liu2016proximal}{{16}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} -\bibcite{ng1999policy}{{17}{1999}{{Ng, Harada, and Russell}}{{}}} -\bibcite{pan2017accelerated}{{18}{2017}{{Pan, White, and White}}{{}}} -\bibcite{sutton2009fast}{{19}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}} -\bibcite{sutton1988learning}{{20}{1988}{{Sutton}}{{}}} -\bibcite{Sutton2018book}{{21}{2018}{{Sutton and Barto}}{{}}} -\bibcite{sutton2008convergent}{{22}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}} -\bibcite{sutton2016emphatic}{{23}{2016}{{Sutton, Mahmood, and White}}{{}}} -\bibcite{tsitsiklis1997analysis}{{24}{1997}{{Tsitsiklis and Van~Roy}}{{}}} -\bibcite{xu2019reanalysis}{{25}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}} -\bibcite{zhang2022truncated}{{26}{2022}{{Zhang and Whiteson}}{{}}} +\bibcite{chen2023modified}{{4}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}} +\bibcite{devlin2012dynamic}{{5}{2012}{{Devlin and Kudenko}}{{}}} +\bibcite{feng2019kernel}{{6}{2019}{{Feng, Li, and Liu}}{{}}} +\bibcite{givchi2015quasi}{{7}{2015}{{Givchi and Palhang}}{{}}} +\bibcite{hackman2012faster}{{8}{2012}{{Hackman}}{{}}} +\bibcite{hallak2016generalized}{{9}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}} +\bibcite{johnson2013accelerating}{{10}{2013}{{Johnson and Zhang}}{{}}} +\bibcite{korda2015td}{{11}{2015}{{Korda and La}}{{}}} +\bibcite{liu2018proximal}{{12}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}} +\bibcite{liu2015finite}{{13}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} +\bibcite{liu2016proximal}{{14}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}} +\bibcite{ng1999policy}{{15}{1999}{{Ng, Harada, and Russell}}{{}}} +\bibcite{pan2017accelerated}{{16}{2017}{{Pan, White, and White}}{{}}} +\bibcite{sutton2009fast}{{17}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}} 
+\bibcite{sutton1988learning}{{18}{1988}{{Sutton}}{{}}} +\bibcite{Sutton2018book}{{19}{2018}{{Sutton and Barto}}{{}}} +\bibcite{sutton2008convergent}{{20}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}} +\bibcite{sutton2016emphatic}{{21}{2016}{{Sutton, Mahmood, and White}}{{}}} +\bibcite{tsitsiklis1997analysis}{{22}{1997}{{Tsitsiklis and Van~Roy}}{{}}} +\bibcite{xu2019reanalysis}{{23}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}} +\bibcite{zhang2022truncated}{{24}{2022}{{Zhang and Whiteson}}{{}}} \gdef \@abspage@last{8} diff --git a/NEW_aaai/anonymous-submission-latex-2025.bbl b/NEW_aaai/anonymous-submission-latex-2025.bbl index d058220..cfde2c2 100644 --- a/NEW_aaai/anonymous-submission-latex-2025.bbl +++ b/NEW_aaai/anonymous-submission-latex-2025.bbl @@ -1,4 +1,4 @@ -\begin{thebibliography}{26} +\begin{thebibliography}{24} \providecommand{\natexlab}[1]{#1} \bibitem[{Baird et~al.(1995)}]{baird1995residual} @@ -16,11 +16,6 @@ Borkar, V.~S. 1997. \newblock Stochastic approximation with two time scales. \newblock \emph{Syst. \& Control Letters}, 29(5): 291--294. -\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode} -Borkar, V.~S.; and Meyn, S.~P. 2000. -\newblock The ODE method for convergence of stochastic approximation and reinforcement learning. -\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469. - \bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified} Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023. \newblock Modified Retrace for Off-Policy Temporal Difference Learning. @@ -51,11 +46,6 @@ Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016. \newblock Generalized emphatic temporal difference learning: bias-variance analysis. \newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637. -\bibitem[{Hirsch(1989)}]{hirsch1989convergent} -Hirsch, M.~W. 1989. -\newblock Convergent activation dynamics in continuous time networks. -\newblock \emph{Neural Netw.}, 2(5): 331--349. - \bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating} Johnson, R.; and Zhang, T. 2013. \newblock Accelerating stochastic gradient descent using predictive variance reduction. 
diff --git a/NEW_aaai/anonymous-submission-latex-2025.blg b/NEW_aaai/anonymous-submission-latex-2025.blg index 470ba75..20e6de0 100644 --- a/NEW_aaai/anonymous-submission-latex-2025.blg +++ b/NEW_aaai/anonymous-submission-latex-2025.blg @@ -3,44 +3,44 @@ Capacity: max_strings=200000, hash_size=200000, hash_prime=170003 The top-level auxiliary file: anonymous-submission-latex-2025.aux The style file: aaai25.bst Database file #1: aaai25.bib -You've used 26 entries, +You've used 24 entries, 2840 wiz_defined-function locations, - 737 strings with 9168 characters, -and the built_in function-call counts, 19179 in all, are: -= -- 1644 -> -- 870 + 723 strings with 8880 characters, +and the built_in function-call counts, 18055 in all, are: += -- 1547 +> -- 832 < -- 0 -+ -- 321 -- -- 288 -* -- 1273 -:= -- 2961 -add.period$ -- 107 -call.type$ -- 26 -change.case$ -- 217 -chr.to.int$ -- 27 -cite$ -- 26 -duplicate$ -- 1316 -empty$ -- 1372 -format.name$ -- 353 -if$ -- 3900 ++ -- 305 +- -- 276 +* -- 1196 +:= -- 2777 +add.period$ -- 99 +call.type$ -- 24 +change.case$ -- 206 +chr.to.int$ -- 25 +cite$ -- 24 +duplicate$ -- 1237 +empty$ -- 1285 +format.name$ -- 338 +if$ -- 3685 int.to.chr$ -- 1 int.to.str$ -- 1 -missing$ -- 261 -newline$ -- 134 -num.names$ -- 104 -pop$ -- 614 +missing$ -- 244 +newline$ -- 124 +num.names$ -- 96 +pop$ -- 586 preamble$ -- 1 -purify$ -- 182 +purify$ -- 171 quote$ -- 0 -skip$ -- 694 +skip$ -- 664 stack$ -- 0 -substring$ -- 1043 -swap$ -- 703 +substring$ -- 969 +swap$ -- 658 text.length$ -- 0 text.prefix$ -- 0 top$ -- 0 -type$ -- 231 +type$ -- 213 warning$ -- 0 -while$ -- 166 +while$ -- 154 width$ -- 0 -write$ -- 343 +write$ -- 317 diff --git a/NEW_aaai/anonymous-submission-latex-2025.log b/NEW_aaai/anonymous-submission-latex-2025.log index 0a8ff42..c38e178 100644 --- a/NEW_aaai/anonymous-submission-latex-2025.log +++ b/NEW_aaai/anonymous-submission-latex-2025.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 12 AUG 2024 17:12 +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31) 14 AUG 2024 06:11 entering extended mode restricted \write18 enabled. file:line:error style messages enabled. @@ -581,15 +581,15 @@ Package newfloat Info: `float' package detected. \c@eqfn=\count330 \titlearea=\box76 \actualheight=\skip75 -LaTeX Font Info: Trying to load font information for U+msa on input line 198. +LaTeX Font Info: Trying to load font information for U+msa on input line 213. (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsa.fd File: umsa.fd 2013/01/14 v3.01 AMS symbols A ) -LaTeX Font Info: Trying to load font information for U+msb on input line 198. +LaTeX Font Info: Trying to load font information for U+msb on input line 213. (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsb.fd File: umsb.fd 2013/01/14 v3.01 AMS symbols B ) -LaTeX Font Info: Trying to load font information for U+esvect on input line 198. +LaTeX Font Info: Trying to load font information for U+esvect on input line 213. (d:/software/texlive/2023/texmf-dist/tex/latex/esvect/uesvect.fd File: uesvect.fd ) (./main/introduction.tex @@ -597,89 +597,120 @@ Underfull \hbox (badness 3884) in paragraph at lines 39--52 []\OT1/ptm/m/n/10 Algorithm sta-bil-ity is promi-nently re-flected in the [] +LaTeX Font Info: Trying to load font information for TS1+ptm on input line 90. 
+(d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/ts1ptm.fd +File: ts1ptm.fd 2001/06/04 font definitions for TS1/ptm. +) +Underfull \hbox (badness 1320) in paragraph at lines 90--91 +[]\OT1/ptm/m/n/10 Introduction of novel ob-jec-tive func-tions, VBE and + [] + [1{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc} -]) (./main/preliminaries.tex [2]) (./main/motivation.tex [3] -Overfull \hbox (19.88512pt too wide) detected at line 90 -[] - [] +]) (./main/preliminaries.tex +LaTeX Warning: Command \textellipsis invalid in math mode on input line 23. -Underfull \hbox (badness 4120) in paragraph at lines 140--149 -\OT1/ptm/m/n/10 a given set of sub-samples|in the form of triples +LaTeX Font Info: Trying to load font information for OMS+ptm on input line 58. +(d:/software/texlive/2023/texmf-dist/tex/latex/psnfss/omsptm.fd +File: omsptm.fd +) +LaTeX Font Info: Font shape `OMS/ptm/m/sc' in size <9> not available +(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 58. +
+File: main/pic/2StateExample.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/2StateExample.pdf used on input line 98. +(pdftex.def) Requested size: 71.81705pt x 35.90657pt. + [2 <./main/pic/2StateExample.pdf>] +Underfull \hbox (badness 1472) in paragraph at lines 255--262 +\OT1/ptm/m/n/10 where $[]$ is a di-ag-o-nal ma-trix with di-ag-o-nal el-e-ments [] -Overfull \hbox (32.7115pt too wide) detected at line 184 -[] +Underfull \hbox (badness 1931) in paragraph at lines 282--290 +[]\OT1/ptm/m/n/10 Minimum eigen-value larger, al-go-rithm's con-ver-gence [] - -Overfull \hbox (38.64899pt too wide) detected at line 264 +) (./main/motivation.tex +Overfull \hbox (36.0123pt too wide) detected at line 119 [] [] - -Overfull \hbox (30.29318pt too wide) detected at line 287 +[3] +Overfull \hbox (13.08682pt too wide) detected at line 182 [] [] -[4] -Overfull \hbox (19.45059pt too wide) detected at line 300 -[] - [] -) (./main/theory.tex [5] -Underfull \hbox (badness 2035) in paragraph at lines 140--145 -\OT1/ptm/m/n/10 2000) are ver-i-fied. Fur-ther-more, As-sump-tions (TS) of +Overfull \hbox (50.91507pt too wide) detected at line 381 +[] [] -
-File: main/pic/2-state.pdf Graphic file (type pdf) - -Package pdftex.def Info: main/pic/2-state.pdf used on input line 483. -(pdftex.def) Requested size: 131.66446pt x 113.71024pt. -
-File: main/pic/7-state.pdf Graphic file (type pdf) - -Package pdftex.def Info: main/pic/7-state.pdf used on input line 487. -(pdftex.def) Requested size: 131.66617pt x 113.70975pt. -
+) (./main/theory.tex [4] [5]) (./main/experiment.tex +
+File: main/pic/maze_13_13.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/maze_13_13.pdf used on input line 4. +(pdftex.def) Requested size: 98.63116pt x 77.52382pt. +
+File: main/pic/2-state-onpolicy.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/2-state-onpolicy.pdf used on input line 13. +(pdftex.def) Requested size: 155.6025pt x 138.84679pt. +
+File: main/pic/2-state-offpolicy.pdf Graphic file (type pdf) + +Package pdftex.def Info: main/pic/2-state-offpolicy.pdf used on input line 17. +(pdftex.def) Requested size: 155.59775pt x 138.85121pt. +
File: main/pic/maze.pdf Graphic file (type pdf) -Package pdftex.def Info: main/pic/maze.pdf used on input line 491. -(pdftex.def) Requested size: 131.65952pt x 113.71227pt. -
+Package pdftex.def Info: main/pic/maze.pdf used on input line 21. +(pdftex.def) Requested size: 155.60243pt x 138.84613pt. +
File: main/pic/cl.pdf Graphic file (type pdf) -Package pdftex.def Info: main/pic/cl.pdf used on input line 495. -(pdftex.def) Requested size: 131.65952pt x 113.71227pt. -
+Package pdftex.def Info: main/pic/cl.pdf used on input line 25. +(pdftex.def) Requested size: 155.60243pt x 138.84613pt. +
File: main/pic/mt.pdf Graphic file (type pdf) -Package pdftex.def Info: main/pic/mt.pdf used on input line 499. -(pdftex.def) Requested size: 131.65952pt x 113.71227pt. -
+Package pdftex.def Info: main/pic/mt.pdf used on input line 29. +(pdftex.def) Requested size: 155.60243pt x 138.84613pt. +
File: main/pic/acrobot.pdf Graphic file (type pdf) -Package pdftex.def Info: main/pic/acrobot.pdf used on input line 503. -(pdftex.def) Requested size: 131.65952pt x 113.71227pt. -[6]) (./main/experiment.tex) (./main/conclusion.tex) (./anonymous-submission-latex-2025.bbl [7 <./main/pic/2-state.pdf> <./main/pic/7-state.pdf> <./main/pic/maze.pdf> <./main/pic/cl.pdf> <./main/pic/mt.pdf> <./main/pic/acrobot.pdf>]) [8] (./anonymous-submission-latex-2025.aux) ) +Package pdftex.def Info: main/pic/acrobot.pdf used on input line 33. +(pdftex.def) Requested size: 155.60243pt x 138.84613pt. + + +LaTeX Warning: `h' float specifier changed to `ht'. + +[6 <./main/pic/maze_13_13.pdf>] +Underfull \vbox (badness 10000) has occurred while \output is active [] + +) (./main/conclusion.tex +Underfull \hbox (badness 2951) in paragraph at lines 32--33 +[]\OT1/ptm/m/n/10 analysis of the con-ver-gence rate of VMTDC and + [] + +) (./anonymous-submission-latex-2025.bbl [7 <./main/pic/2-state-onpolicy.pdf> <./main/pic/2-state-offpolicy.pdf> <./main/pic/maze.pdf> <./main/pic/cl.pdf> <./main/pic/mt.pdf> <./main/pic/acrobot.pdf>]) [8] (./anonymous-submission-latex-2025.aux) ) Here is how much of TeX's memory you used: - 18678 strings out of 476025 - 360333 string characters out of 5789524 - 1897382 words of memory out of 5000000 - 38742 multiletter control sequences out of 15000+600000 - 546921 words of font info for 105 fonts, out of 8000000 for 9000 + 18693 strings out of 476025 + 361060 string characters out of 5789524 + 1890382 words of memory out of 5000000 + 38746 multiletter control sequences out of 15000+600000 + 549956 words of font info for 109 fonts, out of 8000000 for 9000 1141 hyphenation exceptions out of 8191 - 84i,17n,89p,423b,663s stack positions out of 10000i,1000n,20000p,200000b,200000s - -Output written on anonymous-submission-latex-2025.pdf (8 pages, 1216302 bytes). + 84i,17n,89p,423b,419s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on anonymous-submission-latex-2025.pdf (8 pages, 1896294 bytes). PDF statistics: - 345 PDF objects out of 1000 (max. 8388607) - 136 compressed objects within 2 object streams + 375 PDF objects out of 1000 (max. 8388607) + 147 compressed objects within 2 object streams 0 named destinations out of 1000 (max. 500000) - 43 words of extra memory for PDF output out of 10000 (max. 10000000) + 53 words of extra memory for PDF output out of 10000 (max. 
10000000) diff --git a/NEW_aaai/anonymous-submission-latex-2025.pdf b/NEW_aaai/anonymous-submission-latex-2025.pdf index 72ffdda..c6511c0 100644 Binary files a/NEW_aaai/anonymous-submission-latex-2025.pdf and b/NEW_aaai/anonymous-submission-latex-2025.pdf differ diff --git a/NEW_aaai/anonymous-submission-latex-2025.synctex.gz b/NEW_aaai/anonymous-submission-latex-2025.synctex.gz index 33a3b0c..c79140c 100644 Binary files a/NEW_aaai/anonymous-submission-latex-2025.synctex.gz and b/NEW_aaai/anonymous-submission-latex-2025.synctex.gz differ diff --git a/NEW_aaai/anonymous-submission-latex-2025.tex b/NEW_aaai/anonymous-submission-latex-2025.tex index 74c0836..7382cc3 100644 --- a/NEW_aaai/anonymous-submission-latex-2025.tex +++ b/NEW_aaai/anonymous-submission-latex-2025.tex @@ -119,7 +119,7 @@ % nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while % articles, conjunctions, and prepositions are lower case unless they % directly follow a colon or long dash -\title{A Variance Minimization Approach to Off-policy Temporal-Difference Learning} +\title{A Variance Minimization Approach to Temporal-Difference Learning} \author{ %Authors % All authors must be in the same font size and format. @@ -194,16 +194,31 @@ \maketitle % \setcounter{theorem}{0} \begin{abstract} - In this paper, we introduce the concept of improving the performance of parametric - Temporal-Difference (TD) learning algorithms by the Variance Minimization (VM) parameter, $\omega$, - which is dynamically updated at each time step. Specifically, we incorporate the VM parameter into off-policy linear algorithms such as TDC and ETD, resulting in the - Variance Minimization TDC (VMTDC) algorithm and the Variance Minimization ETD (VMETD) algorithm. In the two-state counterexample, + % In this paper, we introduce the concept of improving the performance of parametric + % Temporal-Difference (TD) learning algorithms by the Variance Minimization (VM) parameter, $\omega$, + % which is dynamically updated at each time step. Specifically, we incorporate the VM parameter into off-policy linear algorithms such as TDC and ETD, resulting in the + % Variance Minimization TDC (VMTDC) algorithm and the Variance Minimization ETD (VMETD) algorithm. In the two-state counterexample, + % we analyze + % the convergence speed of these algorithms by calculating the minimum eigenvalue of the key + % matrices and find that the VMTDC algorithm converges faster than TDC, while VMETD is more stable in convergence than ETD + % through the + % experiment.In controlled experiments, the VM algorithms demonstrate + % superior performance. + Under certain conditions, the larger the smallest + eigenvalue of the key matrix of an algorithm, the + faster the algorithm converges. By observation, most + current objective functions aim to minimize error. + Therefore, in this paper, we propose two new objective + functions and derive three Variance Minimization (VM) algorithms, including VMTD, VMTDC and VMETD. + A scalar parameter, $\omega$, is introduced, to improve the performance of parametric + Temporal-Difference (TD) learning algorithms. 
+  In the 2-state policy evaluation experiment, we analyze
  the convergence speed of these algorithms by calculating the minimum eigenvalue of the key
-  matrices and find that the VMTDC algorithm converges faster than TDC, while VMETD is more stable in convergence than ETD
-  through the
-  experiment.In controlled experiments, the VM algorithms demonstrate
+  matrices in both the on-policy and off-policy settings. In the control experiments, the VM algorithms demonstrate
  superior performance.
+
 \end{abstract}

% Uncomment the following to link to your code, datasets, an extended version or similar.

diff --git a/NEW_aaai/main/conclusion.tex b/NEW_aaai/main/conclusion.tex
index a5ff5d1..04e454a 100644
--- a/NEW_aaai/main/conclusion.tex
+++ b/NEW_aaai/main/conclusion.tex
@@ -1,12 +1,35 @@
 \section{Conclusion and Future Work}
+% Value-based reinforcement learning typically aims
+% to minimize error as an optimization objective.
+% As an alternation, this study proposes new objective
+% functions: VBE and VPBE, and derives many variance minimization algorithms, including VMTD,
+% VMTDC and VMETD.
+% All algorithms demonstrated superior performance in policy
+% evaluation and control experiments.
+% Future work may include, but are not limited
+% to, (1) analysis of the convergence rate of VMTDC and VMETD.
+% (2) extensions of VBE and VPBE to multi-step returns.
+% (3) extensions to nonlinear approximations, such as neural networks.
+
 Value-based reinforcement learning typically aims
 to minimize error as an optimization objective.
-As an alternation, this study proposes new objective
-functions: VBE and VPBE, and derives many variance minimization algorithms, including VMTD,
-VMTDC and VMETD.
+As an alternative, this study proposes two new objective
+functions, VBE and VPBE, and derives one on-policy algorithm,
+VMTD, and two off-policy algorithms, VMTDC and VMETD.
+% The VMTD algorithm
+% is essentially an adjustment or correction to the traditional
+% TD update.
+% Both
+% algorithms are capable of stabilizing gradient estimation, reducing
+% the variance of gradient estimation and accelerating convergence.
 All algorithms demonstrated superior performance in policy
 evaluation and control experiments.
 Future work may include, but is not limited
-to, (1) analysis of the convergence rate of VMTDC and VMETD.
-(2) extensions of VBE and VPBE to multi-step returns.
-(3) extensions to nonlinear approximations, such as neural networks. \ No newline at end of file
+to,
+\begin{itemize}
+  \item analysis of the convergence rate of VMTDC and VMETD.
+  \item extensions of VBE and VPBE to multi-step returns.
+  \item extensions to nonlinear approximations, such as neural networks.
+\end{itemize}
\ No newline at end of file
diff --git a/NEW_aaai/main/experiment.tex b/NEW_aaai/main/experiment.tex
index 10bff8a..12b2f64 100644
--- a/NEW_aaai/main/experiment.tex
+++ b/NEW_aaai/main/experiment.tex
@@ -1,35 +1,157 @@
+% \subsection{Testing Tasks}
+\begin{figure}[h]
+    \centering
+    \includegraphics[scale=0.2]{main/pic/maze_13_13.pdf}
+    \caption{Maze.}
+  \end{figure}
+\begin{figure*}[tb]
+    \vskip 0.2in
+    \begin{center}
+    \subfigure[on-policy 2-state]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/2-state-onpolicy.pdf}
+    \label{2-state}
+    }
+    \subfigure[off-policy 2-state]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/2-state-offpolicy.pdf}
+    \label{7-state}
+    }
+    \subfigure[Maze]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/maze.pdf}
+    \label{MazeFull}
+    }\\
+    \subfigure[Cliff Walking]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/cl.pdf}
+    \label{CliffWalkingFull}
+    }
+    \subfigure[Mountain Car]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/mt.pdf}
+    \label{MountainCarFull}
+    }
+    \subfigure[Acrobot]{
+    \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/acrobot.pdf}
+    \label{AcrobotFull}
+    }
+    \caption{Learning curves of one evaluation environment and four control environments.}
+    \label{Complete_full}
+    \end{center}
+    \vskip -0.2in
+  \end{figure*}
 \section{Experimental Studies}
 This section assesses algorithm performance through experiments,
 which are divided into policy evaluation experiments and control experiments.
-The control algorithms for TDC, ETD, VMTDC, and VMETD are named GQ, EQ, VMGQ, and VMEQ, respectively.
-The evaluation experimental environments are the 2-state and 7-state counterexample.
+The evaluation environment is the 2-state counterexample.
+In the 2-state environment, we conducted two types of experiments, on-policy
+and off-policy, to verify the relationship between an algorithm's convergence
+speed and the smallest eigenvalue of its key matrix $\textbf{A}$.
+Control experiments, in which the algorithm interacts
+with the environment to optimize its policy, evaluate how well it
+learns the optimal policy and thus provide a more
+comprehensive assessment of the algorithm's overall capabilities.
 The control experimental environments are Maze, CliffWalking-v0,
 MountainCar-v0, and Acrobot-v1.
+The control algorithms corresponding to TDC, ETD, VMTDC, and VMETD are named GQ, EQ, VMGQ, and VMEQ, respectively.
+For the TD and VMTD control algorithms, there are two variants each: Sarsa and Q-learning for TD, and VMSarsa and VMQ for VMTD.
+
+% For specific experimental parameters, please refer to the appendix.
+
+% \textbf{Baird's off-policy counterexample:} This task is well known as a
+% counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As
+% shown in Figure \ref{bairdexample}, reward for each transition is zero. Thus the true values are zeros for all states and for any given policy. The behaviour policy
+% chooses actions represented by solid lines with a probability of $\frac{1}{7}$
+% and actions represented by dotted lines with a probability of $\frac{6}{7}$. The
+% target policy is expected to choose the solid line with more probability than $\frac{1}{7}$,
+% and it chooses the solid line with probability of $1$ in this paper.
+% The discount factor $\gamma =0.99$, and the feature matrix is +% defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}. +% \begin{figure} +% \begin{center} +% \input{main/pic/BairdExample.tex} +% \caption{7-state.} +% \label{bairdexample} +% \end{center} +% \end{figure} + +% The feature matrix of 7-state version of Baird's off-policy counterexample is +% defined as follow: +% \begin{equation*} +% \Phi_{Counter}=\left[ +% \begin{array}{cccccccc} +% 1 & 2& 0& 0& 0& 0& 0& 0\\ +% 1 & 0& 2& 0& 0& 0& 0& 0\\ +% 1 & 0& 0& 2& 0& 0& 0& 0\\ +% 1 & 0& 0& 0& 2& 0& 0& 0\\ +% 1 & 0& 0& 0& 0& 2& 0& 0\\ +% 1 & 0& 0& 0& 0& 0& 2& 0\\ +% 2 & 0& 0& 0& 0& 0& 0& 1 +% \end{array}\right] +% \end{equation*} +\subsection{Testing Tasks} +% \begin{figure}[h] +% \centering +% \includegraphics[scale=0.2]{main/pic/maze_13_13.pdf} +% \caption{Maze.} +% \end{figure} +\textbf{Maze}: The learning agent should find a shortest path from the upper +left corner to the lower right corner. + In each state, +there are four alternative actions: $up$, $down$, $left$, and $right$, which +takes the agent deterministically to the corresponding neighbour state, +except when a movement is blocked by an obstacle or the edge +of the maze. Rewards are $-1$ in all transitions until the +agent reaches the goal state. +The discount factor $\gamma=0.99$, and states $s$ are represented by tabular +features.The maximum number of moves in the game is set to 1000. + +\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are +selected from the gym official website and correspond to the following +versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''. +For specific details, please refer to the gym official website. +The maximum number of steps for the Mountain Car environment is set to 1000, +while the default settings are used for the other two environments. In Mountain car and Acrobot, features are generated by tile coding. + +For all policy evaluation experiments, each experiment +is independently run 100 times. +For all control experiments, each experiment is independently run 50 times. For specific experimental parameters, please refer to the appendix. -For the evaluation experiment, the experimental results -align with our previous analysis. In the 2-state counterexample -environment, the TDC algorithm has the smallest minimum -eigenvalue of the key matrix, resulting in the slowest -convergence speed. In contrast, the minimum eigenvalue -of VMTDC is larger, leading to faster convergence. -Although VMETD's minimum eigenvalue is larger than ETD's, -causing VMETD to converge more slowly than ETD in the -2-state counterexample, the standard deviation (shaded area) -of VMETD is smaller than that of ETD, indicating that VMETD -converges more smoothly. In the 7-state counterexample -environment, VMTDC converges faster than TDC and both VMETD and ETD are diverge. - -For the control experiments, the results for the maze and -cliff walking environments are similar: VMGQ -outperforms GQ, EQ outperforms VMGQ, and VMEQ performs -the best. In the mountain car and Acrobot experiments, -VMGQ and VMEQ show comparable performance, both outperforming -GQ and EQ. In summary, for control experiments, VM algorithms -outperform non-VM algorithms. - -In summary, the performance of VMSarsa, -VMQ, and VMGQ(0) is better than that of other algorithms. -In the Cliff Walking environment, -the performance of VMGQ(0) is slightly better than that of -VMSarsa and VMQ. 
In the other three experimental environments,
-the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\ No newline at end of file
+\subsection{Experimental Results and Analysis}
+Figure \ref{2-state} shows the learning curves for the on-policy
+2-state policy evaluation experiment. In this setting,
+the convergence speed of TD, VMTD, TDC, and VMTDC decreases
+in that order. Table \ref{tab:min_eigenvalues} indicates that the smallest eigenvalue
+of the key matrix for these four algorithms is greater than 0
+and decreases in the same order, which is consistent with the
+experimental curves.
+
+Figure \ref{7-state} shows the learning curves for the off-policy
+2-state policy evaluation experiment. In this setting,
+the convergence speed of ETD, VMETD, VMTD, VMTDC, and
+TDC decreases in that order, while TD diverges. Table \ref{tab:min_eigenvalues}
+shows that the smallest eigenvalue of the key matrix for
+ETD, VMETD, VMTD, VMTDC, and TDC is greater than 0 and
+decreases in the same order, while the smallest eigenvalue
+for TD is less than 0. This is consistent with the
+experimental curves. Remarkably,
+although VMTD is only guaranteed to converge in the
+on-policy setting, it still converges in the
+off-policy 2-state scenario. The update formula
+of VMTD indicates that it is essentially an
+adjustment and correction of the TD update:
+the introduction of the parameter $\omega$
+makes the variance of the gradient estimate
+more stable, and thereby makes the update of $\theta$ more stable.
+
+Figures \ref{MazeFull}, \ref{CliffWalkingFull}, \ref{MountainCarFull} and \ref{AcrobotFull} show the learning curves
+for the four control experiments. A common feature
+observed across these experiments is that VMEQ
+outperforms EQ, VMGQ outperforms GQ, VMQ outperforms
+Q-learning, and VMSarsa outperforms Sarsa. In the
+Maze and Cliff Walking experiments, VMEQ demonstrates
+the best performance with the fastest convergence speed.
+In the Mountain Car and Acrobot experiments, the performance
+of the four VM algorithms is nearly identical, and all of them
+outperform the other algorithms.
+
+Overall, in both the policy evaluation experiments and the
+control experiments, the VM algorithms
+demonstrate superior performance,
+especially in the control experiments.
\ No newline at end of file
diff --git a/NEW_aaai/main/introduction.tex b/NEW_aaai/main/introduction.tex
index d70882b..194f822 100644
--- a/NEW_aaai/main/introduction.tex
+++ b/NEW_aaai/main/introduction.tex
@@ -68,26 +68,28 @@ based on recursive optimization using it are known to be
 unstable. It is necessary to propose a new objective function,
 but the mentioned objective functions above are all some form of error.
 Is minimizing error the only option for value-based reinforcement learning?
-For policy evaluation experiments,
-differences in objective functions may result
-in inconsistent fixed points. This inconsistency
-makes it difficult to uniformly compare the superiority
-of algorithms derived from different objective functions.
+% For policy evaluation experiments,
+% differences in objective functions may result
+% in inconsistent fixed points. This inconsistency
+% makes it difficult to uniformly compare the superiority
+% of algorithms derived from different objective functions.
+% However, for control experiments, since the choice of actions +% depends on the relative values of the Q values rather than their +% absolute values, the presence of solution bias is acceptable. Based on this observation, we propose alternate objective functions -instead of minimizing errors. We minimize +instead of minimizing errors. We minimize Variance of Bellman Error (VBE) and Variance of Projected Bellman Error (VPBE) and derive Variance Minimization (VM) algorithms. These algorithms preserve the invariance of the optimal policy in the control environments, -but significantly reduce the variance of gradient estimation, +and significantly reduce the variance of gradient estimation, and thus hastening convergence. The contributions of this paper are as follows: -(1) Introduction of novel objective functions based on -the invariance of the optimal policy. -(2) Propose two off-policy variance minimization algorithms. -(3) Proof of their convergence. -(5) Experiments demonstrating the faster convergence speed of the proposed algorithms. +\begin{itemize} + \item Introduction of novel objective functions, VBE and VPBE. + \item Propose a on-policy VM algorithm and two off-policy VM algorithms. + \item Proof of their convergence. + \item The experiments demonstrate the superiority of the VM algorithms. +\end{itemize} + diff --git a/NEW_aaai/main/motivation.tex b/NEW_aaai/main/motivation.tex index 3223cb5..16e450b 100644 --- a/NEW_aaai/main/motivation.tex +++ b/NEW_aaai/main/motivation.tex @@ -1,116 +1,198 @@ \section{Variance Minimization Algorithms} -To derive an algorithm with a larger minimum eigenvalue for matrix -$\textbf{A}$, it is necessary to propose new objective functions. -The mentioned objective functions in the Introduction -are all forms of error. Is minimizing error the only option -for value-based reinforcement learning? Based on this observation, -we propose alternative objective functions instead of minimizing errors. -We minimize the Variance of Projected Bellman Error (VPBE) and derive the -VMTDC algorithm. This idea is then innovatively applied to ETD, resulting -in the VMETD algorithm. +This section will introduce two new objective functions and +three new algorithms, including one on-policy algorithm and two off-policy algorithms, and calculate the minimum eigenvalue +of $\textbf{A}$ for each of the three algorithms under on-policy and +off-policy in a 2-state environment, thereby comparing the +convergence speed of the three algorithms. +% To derive an algorithm with a larger minimum eigenvalue for matrix +% $\textbf{A}$, it is necessary to propose new objective functions. +% The mentioned objective functions in the Introduction +% are all forms of error. Is minimizing error the only option +% for value-based reinforcement learning? Based on this observation, +% We propose alternative objective functions instead of minimizing errors. +% We minimize the Variance of Projected Bellman Error (VPBE) and derive the +% VMTDC algorithm. This idea is then innovatively applied to ETD, resulting +% in the VMETD algorithm. 
% \subsection{Motivation} % gagagga -\begin{algorithm}[t] - \caption{VMTDC algorithm with linear function approximation in the off-policy setting} - \label{alg:algorithm 2} -\begin{algorithmic} - \STATE {\bfseries Input:} $\bm{\theta}_{0}$, $\bm{u}_0$, $\omega_{0}$, $\gamma - $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$ - \REPEAT - \STATE For any episode, initialize $\bm{\theta}_{0}$ arbitrarily, $\bm{u}_{0}$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\ - % \textbf{Output}: $\bm{\theta}^*$.\\ - \FOR{$t=0$ {\bfseries to} $T-1$} - \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\ - \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\ - \STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$ - \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$ - \STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t [ (\rho_t\delta_t-\omega_t)\bm{\phi}_t - \gamma \rho_t\bm{\phi}_{t+1}(\bm{\phi}^{\top}_{t} \bm{u}_{t})]$ - \STATE $\bm{u}_{t+1}\leftarrow \bm{u}_{t}+\zeta_t[(\rho_t\delta_t-\omega_t) - \bm{\phi}^{\top}_{t} \bm{u}_{t}] \bm{\phi}_t$ - \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (\rho_t\delta_t-\omega_t)$ - \STATE $S_t=S_{t+1}$ - \ENDFOR - \UNTIL{terminal episode} -\end{algorithmic} -\end{algorithm} -\begin{algorithm}[t] - \caption{VMETD algorithm with linear function approximation in the off-policy setting} - \label{alg:algorithm 5} -\begin{algorithmic} - \STATE {\bfseries Input:} $\bm{\theta}_{0}$, $F_0$, $\omega_{0}$, $\gamma - $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$ - \REPEAT - \STATE For any episode, initialize $\bm{\theta}_{0}$ arbitrarily, $F_{0}$ to $1$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\ - % \textbf{Output}: $\theta^*$.\\ - \FOR{$t=0$ {\bfseries to} $T-1$} - \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\ - \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\ - \STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$ - \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$ - \STATE $F_{t}\leftarrow \gamma \rho_t F_{t-1} +1$ - \STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t (F_t \rho_t\delta_t-\omega_t)\bm{\phi}_t$ - \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (F_t \rho_t\delta_t-\omega_t)$ - \STATE $S_t=S_{t+1}$ - \ENDFOR - \UNTIL{terminal episode} -\end{algorithmic} -\end{algorithm} +% \begin{algorithm}[t] +% \caption{VMTDC algorithm with linear function approximation in the off-policy setting} +% \label{alg:algorithm 2} +% \begin{algorithmic} +% \STATE {\bfseries Input:} $\bm{\theta}_{0}$, $\bm{u}_0$, $\omega_{0}$, $\gamma +% $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$ +% \REPEAT +% \STATE For any episode, initialize $\bm{\theta}_{0}$ arbitrarily, $\bm{u}_{0}$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\ +% % \textbf{Output}: $\bm{\theta}^*$.\\ +% \FOR{$t=0$ {\bfseries to} $T-1$} +% \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\ +% \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step 
$t$ (with their corresponding state feature vectors)\\ +% \STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$ +% \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$ +% \STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t [ (\rho_t\delta_t-\omega_t)\bm{\phi}_t - \gamma \rho_t\bm{\phi}_{t+1}(\bm{\phi}^{\top}_{t} \bm{u}_{t})]$ +% \STATE $\bm{u}_{t+1}\leftarrow \bm{u}_{t}+\zeta_t[(\rho_t\delta_t-\omega_t) - \bm{\phi}^{\top}_{t} \bm{u}_{t}] \bm{\phi}_t$ +% \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (\rho_t\delta_t-\omega_t)$ +% \STATE $S_t=S_{t+1}$ +% \ENDFOR +% \UNTIL{terminal episode} +% \end{algorithmic} +% \end{algorithm} +% \begin{algorithm}[t] +% \caption{VMETD algorithm with linear function approximation in the off-policy setting} +% \label{alg:algorithm 5} +% \begin{algorithmic} +% \STATE {\bfseries Input:} $\bm{\theta}_{0}$, $F_0$, $\omega_{0}$, $\gamma +% $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$ +% \REPEAT +% \STATE For any episode, initialize $\bm{\theta}_{0}$ arbitrarily, $F_{0}$ to $1$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\ +% % \textbf{Output}: $\theta^*$.\\ +% \FOR{$t=0$ {\bfseries to} $T-1$} +% \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\ +% \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\ +% \STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$ +% \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$ +% \STATE $F_{t}\leftarrow \gamma \rho_t F_{t-1} +1$ +% \STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t (F_t \rho_t\delta_t-\omega_t)\bm{\phi}_t$ +% \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (F_t \rho_t\delta_t-\omega_t)$ +% \STATE $S_t=S_{t+1}$ +% \ENDFOR +% \UNTIL{terminal episode} +% \end{algorithmic} +% \end{algorithm} + +\subsection{Variance Minimization TD Learning: VMTD} +For on-policy learning, +a novel objective function, Variance of Bellman Error (VBE), is proposed as follows: +% \begin{equation} +% \begin{array}{ccl} +% \arg \min_{\theta}\text{VBE}(\theta)&=&\arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta|s]-\mathbb{E}[\mathbb{E}[\delta|s]])^2]\\ +% &=&\arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2]. +% \end{array} +% \end{equation} -\subsection{Variance Minimization TDC Learning: VMTDC} -For off-policy learning, we propose a new objective function, -called Variance of Projected Bellman error (VPBE), -and the corresponding algorithm is called VMTDC. 
\begin{align} - \text{VPBE}(\bm{\theta}) &= \mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1} \\ - & \mathbb{E}[(\delta -\mathbb{E}[\delta ])\bm{\phi}] \notag \\ - &= (\bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}))^{\top}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\ - & \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ - &= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\textbf{D}^{\top}\bm{\Phi}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\ - & \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ - &= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\Pi^{\top}\textbf{D}\Pi \notag \\ - & (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ - &= (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}}))^{\top}\textbf{D} \notag \\ - & (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}})) \notag \\ - &= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}} - \textbf{W}_{\bm{\theta}})||^{2}_{\mu} \notag \\ - &= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}) - \Pi\textbf{W}_{\bm{\theta}}||^{2}_{\mu} \notag \\ - &= \mathbb{E}[(\delta-\omega) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\omega)\bm{\phi}] + \arg \min_{\theta}\text{VBE}(\theta) &= \arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\mathbb{E}[\mathbb{E}[\delta_t|S_t]])^2] \\ + &= \arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\omega)^2]\notag \end{align} -where $\textbf{W}_{\bm{\theta}}$ is viewed as vectors with every element being equal to $||\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}||^{2}_{\mu}$ and $\omega$ is used to approximate $\mathbb{E}[\delta]$, i.e., $\omega \doteq\mathbb{E}[\delta] $. +where $\delta_t$ is the TD error as follows: +\begin{equation} +\delta_t = r_{t+1}+\gamma +\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t. +\label{delta} +\end{equation} +Clearly, it is no longer to minimize Bellman errors. + +First, the parameter $\omega$ is derived directly based on +stochastic gradient descent: +\begin{equation} +\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t), +\label{omega} +\end{equation} + +Then, based on stochastic semi-gradient descent, the update of +the parameter $\theta$ is as follows: +\begin{equation} +\theta_{t+1}\leftarrow +\theta_{t}+\alpha_t(\delta_t-\omega_t)\phi_t. 
+\label{theta} +\end{equation} -The gradient of the (3) with respect to $\theta$ is +The semi-gradient of the (2) with respect to $\theta$ is \begin{equation*} \begin{array}{ccl} - -\frac{1}{2}\nabla \text{VPBE}(\bm{\theta}) &=& -\mathbb{E}\Big[\Big( (\gamma \bm{\phi}' - \bm{\phi}) - \mathbb{E}[ (\gamma \bm{\phi}' - \bm{\phi})]\Big)\bm{\phi}^{\top} \Big] \\ - & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1} \mathbb{E}[( \delta -\mathbb{E}[ \delta])\bm{\phi}]\\ - &=& \mathbb{E}\Big[\Big( (\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ (\bm{\phi} - \gamma \bm{\phi}')]\Big)\bm{\phi}^{\top} \Big] \\ - & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\ - & & \mathbb{E}\Big[\Big( r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta}\\ - & & \hspace{2em} -\mathbb{E}[ r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta}]\Big)\bm{\phi} \Big]\\ - &=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}) - \end{array} + &&-\frac{1}{2}\nabla \text{VBE}({\theta}) \\ + &=& \mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\mathbb{E}[\mathbb{E}[\delta_t|S_t]])(\phi_t -\mathbb{E}[\phi_t])] \\ + &=& \mathbb{E}[\delta_t \phi_t] -\mathbb{E}[\delta_t] \mathbb{E}[\phi_t] , + % &=&-\mathbb{E}\Big[\Big( (\phi_t - \gamma\phi_t')- \mathbb{E}[ (\phi_t- \gamma {\phi_t}')]\Big)\phi_t^{\top} \Big]\theta + \mathbb{E}( r_{t+1}- \mathbb{E}[r_{t+1}])\bm{\phi_t} +\end{array} \end{equation*} -where +The key matrix $\textbf{A}_{\text{VMTD}}$ and $b_{\text{VMTD}}$ of on-policy VMTD is \begin{equation*} \begin{array}{ccl} - \textbf{A} &=& \mathbb{E}\Big[\Big( (\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ (\bm{\phi} - \gamma \bm{\phi}')]\Big)\bm{\phi}^{\top} \Big] \\ - &=& \mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}')\bm{\phi}^{\top}] - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}']\mathbb{E}[\bm{\phi}^{\top}]\\ - &=& \mathrm{Cov}(\bm{\bm{\phi}},\bm{\bm{\phi}}-\gamma\bm{\bm{\phi}}'), + &&\textbf{A}_{\text{VMTD}} \\ + &=& \mathbb{E}[(\phi - \gamma \phi')\phi^{\top}] - \mathbb{E}[\phi - \gamma \phi']\mathbb{E}[\phi^{\top}]\\ + &=&\sum_{s}d_{\pi}(s)\phi(s)\Big(\phi(s) -\gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\phi(s') \Big)^{\top} \\ + && -\sum_{s}d_{\pi}(s)\phi(s) \cdot \sum_{s}d_{\pi}(s)\Big(\phi(s) -\gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\phi(s') \Big)^{\top}\\ + &=& \bm{\Phi}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi} -\bm{\Phi}^{\top}d_{\pi}d_{\pi}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}\\ + &=& \bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}, \end{array} \end{equation*} +% \begin{equation*} +% \begin{array}{ccl} +% \textbf{C} &=& \mathbb{E}[\bm{\bm{\phi}}\bm{\bm{\phi}}^{\top}], +% \end{array} +% \end{equation*} \begin{equation*} \begin{array}{ccl} - \textbf{C} &=& \mathbb{E}[\bm{\bm{\phi}}\bm{\bm{\phi}}^{\top}], + &&b_{\text{VMTD}}\\ + &=& \mathbb{E}( r- \mathbb{E}[r])\phi \\ + &=& \mathbb{E}[r\phi] - \mathbb{E}[r]\mathbb{E}[\phi]\\ + &=& \bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})r_\pi. 
\end{array} \end{equation*} +It can be easily obtained that The key matrix $\textbf{A}_{\text{VMTD}}$ and $b_{\text{VMTD}}$ of off-policy VMTD are, respectively, +\begin{equation*} + \textbf{A}_{\text{VMTD}} = \bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}, + \end{equation*} + \begin{equation*} + b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})r_\pi, + \end{equation*} + In the on-policy 2-state environment, the minimum eigenvalue +of the key matrix for VMTD is greater than that of on-policy TDC and smaller than that of on-policy TD(0), +indicating that VMTD converges faster than TDC and slower than TD(0) in this +environment. In the off-policy 2-state environment, the +minimum eigenvalue of the key matrix for VMTD is greater than 0, +suggesting that VMTD can converge stably. + +\subsection{Variance Minimization TDC Learning: VMTDC} +For off-policy learning, we propose a new objective function, +called Variance of Projected Bellman error (VPBE), +and the corresponding algorithm is called VMTDC. +\begin{align} + &\text{VPBE}(\bm{\theta}) \notag\\ + &= \mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta ])\bm{\phi}] \\ + % &= (\bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}))^{\top}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\ + % & \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ + % &= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\textbf{D}^{\top}\bm{\Phi}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\ + % & \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ + % &= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\Pi^{\top}\textbf{D}\Pi \notag \\ + % & (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\ + % &= (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}}))^{\top}\textbf{D} \notag \\ + % & (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}})) \notag \\ + % &= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}} - \textbf{W}_{\bm{\theta}})||^{2}_{\mu} \notag \\ + % &= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}) - \Pi\textbf{W}_{\bm{\theta}}||^{2}_{\mu} \notag \\ + &= \mathbb{E}[(\delta-\omega) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\omega)\bm{\phi}] , +\end{align} +where +% $\textbf{W}_{\bm{\theta}}$ is viewed as vectors with every element being equal to +% $||\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}||^{2}_{\mu}$ and +$\omega$ is used to approximate $\mathbb{E}[\delta]$, i.e., $\omega \doteq\mathbb{E}[\delta] $. 
+ +The gradient of the (6) with respect to $\theta$ is \begin{equation*} \begin{array}{ccl} - \textbf{b} &=& \mathbb{E}( r- \mathbb{E}[r])\bm{\phi} \\ - &=& \mathbb{E}[r\bm{\phi}] - \mathbb{E}[r]\mathbb{E}[\bm{\phi}]\\ - &=& \mathrm{Cov}(r,\bm{\bm{\phi}}), + -\frac{1}{2}\nabla \text{VPBE}({\theta}) &=& -\mathbb{E}\Big[\Big( (\gamma {\phi}' - {\phi}) - \mathbb{E}[ (\gamma {\phi}' - {\phi})]\Big){\phi}^{\top} \Big] \\ + & & \mathbb{E}[{\phi} {\phi}^{\top}]^{-1} \mathbb{E}[( \delta -\mathbb{E}[ \delta]){\phi}]\\ + &=& \mathbb{E}\Big[\Big( ({\phi} - \gamma {\phi}')- \mathbb{E}[ ({\phi} - \gamma {\phi}')]\Big){\phi}^{\top} \Big] \\ + & & \mathbb{E}[{\phi} {\phi}^{\top}]^{-1}\\ + & & \mathbb{E}\Big[\Big( r + \gamma {{\phi}'}^{\top} {\theta} -{\phi}^{\top} {\theta}\\ + & & \hspace{2em} -\mathbb{E}[ r + \gamma {{\phi}'}^{\top} {\theta} -{\phi}^{\top} {\theta}]\Big){\phi} \Big]. + % &=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}) \end{array} \end{equation*} -where $\mathrm{Cov}(\cdot,\cdot )$ is a covariance operator. +It can be easily obtained that The key matrix $\textbf{A}_{\text{VMTDC}}$ and $b_{\text{VMTDC}}$ of VMTDC are, respectively, +\begin{equation*} + \textbf{A}_{\text{VMTDC}} = \textbf{A}_{\text{VMTD}}^{\top} \textbf{C}^{-1}\textbf{A}_{\text{VMTD}}, + \end{equation*} + \begin{equation*} + b_{\text{VMTDC}}=\textbf{A}_{\text{VMTD}}^{\top} \textbf{C}^{-1}b_{\text{VMTD}}, + \end{equation*} +where, for on-policy, $\textbf{A}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$ +and $b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})r_\pi$ and, for off-policy, +$\textbf{A}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$ +and $b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})r_\pi$. -In the process of computing the gradient of the (4) with respect to $\theta$, +In the process of computing the gradient of the (7) with respect to $\theta$, $\omega$ is treated as a constant. So, the derivation process of the VMTDC algorithm is the same as that of the TDC algorithm, the only difference is that the original $\delta$ is replaced by $\delta-\omega$. @@ -133,60 +215,67 @@ Therefore, we can easily get the updated formula of VMTDC, as follows: \end{equation} and \begin{equation} - \omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k), + \omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k). \label{omegavmtdc} \end{equation} - The VMTDC algorithm (\ref{thetavmtdc}) is derived to work with a given set of sub-samples—in the form of triples $(S_k, R_k, S'_k)$ that match transitions -from both the behavior and target policies. What if -we wanted to use all the data? The data -is generated according to the behavior policy $\pi_b$, -while our objective is to learn about the target -policy $\pi$. We should use importance-sampling. 
-The VPBE with importance sampling is: -\begin{equation} - \label{rho_VPBE} - \begin{array}{ccl} - \text{VPBE}(\bm{\theta})&=&\mathbb{E}[(\rho\delta-\mathbb{E}[\rho\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\ - & &\mathbb{E}[(\rho\delta -\mathbb{E}[\rho\delta ])\bm{\phi}], -\end{array} -\end{equation} -Following the linear VMTDC derivation, we get the following algorithm (linear VMTDC algorithm -based on importance weighting scenario): -\begin{equation} - \bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\rho_k\delta_{k}- \omega_k) \bm{\phi}_k\\ - - \gamma\rho_k\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})], -\end{equation} -\begin{equation} - \bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[(\rho_k\delta_{k}-\omega_k) - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k, -\end{equation} -and -\begin{equation} - \omega_{k+1}\leftarrow \omega_{k}+\beta_k (\rho_k\delta_k- \omega_k), -\end{equation} +from both the behavior and target policies. -The gradient of the (\ref{rho_VPBE}) with respect to $\theta$ is -\begin{equation*} - \begin{array}{ccl} - -\frac{1}{2}\nabla \text{VPBE}(\bm{\theta}) &=& \mathbb{E}\Big[\Big( \rho(\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\Big)\bm{\phi}^{\top} \Big] \\ - & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\ - & & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\ - & & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\ - &=& \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')\bm{\phi}^{\top}]- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\mathbb{E}[\bm{\phi}^{\top}] \\ - & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\ - & & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\ - & & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\ - % &=&\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}\\ - &=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}), - \end{array} -\end{equation*} -where $\textbf{A}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$, -$\textbf{b}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$ and -$\textbf{r}_{\pi}$ is viewed as vectors. -In the 2-state counterexample, -$\textbf{A}_{\text{VMTDC}}=0.025$, meaning that VMTDC can stably converge and converges faster than TDC. +% What if +% we wanted to use all the data? The data +% is generated according to the behavior policy $\pi_b$, +% while our objective is to learn about the target +% policy $\pi$. We should use importance-sampling. 
+% The VPBE with importance sampling is:
+% \begin{equation}
+%     \label{rho_VPBE}
+%     \begin{array}{ccl}
+%         \text{VPBE}(\bm{\theta})&=&\mathbb{E}[(\rho\delta-\mathbb{E}[\rho\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
+%         & &\mathbb{E}[(\rho\delta -\mathbb{E}[\rho\delta ])\bm{\phi}],
+% \end{array}
+% \end{equation}
+% Following the linear VMTDC derivation, we get the following algorithm (linear VMTDC algorithm
+% based on importance weighting scenario):
+% \begin{equation}
+%     \bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\rho_k\delta_{k}- \omega_k) \bm{\phi}_k\\
+%     - \gamma\rho_k\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})],
+% \end{equation}
+% \begin{equation}
+%     \bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[(\rho_k\delta_{k}-\omega_k)
+%     - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k,
+% \end{equation}
+% and
+% \begin{equation}
+%     \omega_{k+1}\leftarrow \omega_{k}+\beta_k (\rho_k\delta_k- \omega_k),
+% \end{equation}
+
+% The gradient of the (\ref{rho_VPBE}) with respect to $\theta$ is
+% \begin{equation*}
+%     \begin{array}{ccl}
+%         -\frac{1}{2}\nabla \text{VPBE}(\bm{\theta}) &=& \mathbb{E}\Big[\Big( \rho(\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\Big)\bm{\phi}^{\top} \Big] \\
+%         & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
+%         & & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\
+%         & & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\
+%         &=& \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')\bm{\phi}^{\top}]- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\mathbb{E}[\bm{\phi}^{\top}] \\
+%         & & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
+%         & & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\
+%         & & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\
+%         % &=&\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}\\
+%         &=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}),
+%     \end{array}
+% \end{equation*}
+% where $\textbf{A}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$,
+% $\textbf{b}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$ and
+% $\textbf{r}_{\pi}$ is viewed as vectors.
+
+In the on-policy 2-state environment, the minimum eigenvalue
+of the key matrix for VMTDC is smaller than those of TD(0), TDC and VMTD,
+indicating that VMTDC converges more slowly than these methods in the
+on-policy setting. In the off-policy 2-state environment, the
+minimum eigenvalue of the key matrix for VMTDC is greater than that of TDC,
+suggesting that VMTDC converges faster than TDC in the off-policy
+setting.
@@ -213,24 +302,24 @@ $\textbf{A}_{\text{VMTDC}}=0.025$, meaning that VMTDC can stably converge and co
 % \label{deltaQ}
 % \end{equation}
 % and $A^{*}_{k+1}={\arg \max}_{a}(\bm{\theta}_{k}^{\top}\bm{\phi}(s_{k+1},a))$.
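For concreteness, the sketch below spells out one linear VMTDC update in code. It is only an illustration of the rule stated above (the TDC update with $\delta$ replaced by $\delta-\omega$, together with the $\omega$ recursion); it is written without importance weighting, and the function and variable names are ours rather than from any released implementation.
\begin{verbatim}
import numpy as np

def vmtdc_step(theta, u, omega, phi, r, phi_next, gamma, alpha, zeta, beta):
    """One linear VMTDC update: the TDC update with delta replaced by (delta - omega)."""
    delta = r + gamma * phi_next @ theta - phi @ theta      # TD error
    theta = theta + alpha * ((delta - omega) * phi
                             - gamma * phi_next * (phi @ u))
    u     = u + zeta * ((delta - omega) - phi @ u) * phi
    omega = omega + beta * (delta - omega)
    return theta, u, omega

# toy call with random features; step sizes ordered as alpha < zeta < beta
rng = np.random.default_rng(0)
theta, u, omega = np.zeros(4), np.zeros(4), 0.0
theta, u, omega = vmtdc_step(theta, u, omega, rng.random(4), 1.0, rng.random(4),
                             gamma=0.9, alpha=0.01, zeta=0.05, beta=0.1)
\end{verbatim}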
-\begin{table*}[t] - \caption{Minimum eigenvalues of various algorithms in the 2-state counterexample.} - \vskip 0.15in - \begin{center} - \begin{small} - \begin{sc} - \begin{tabular}{lcccccr} - \toprule - algorithm & off-policy TD & TDC & ETD & VMTDC & VMETD \\ - \midrule - Minimum eigenvalues&$-0.2$ & $0.016$ & $3.4$ & $0.025$ & $1.15$ \\ - \bottomrule - \end{tabular} - \end{sc} - \end{small} - \end{center} - \vskip -0.1in -\end{table*} +% \begin{table*}[t] +% \caption{Minimum eigenvalues of various algorithms in the 2-state counterexample.} +% \vskip 0.15in +% \begin{center} +% \begin{small} +% \begin{sc} +% \begin{tabular}{lcccccr} +% \toprule +% algorithm & off-policy TD & TDC & ETD & VMTDC & VMETD \\ +% \midrule +% Minimum eigenvalues&$-0.2$ & $0.016$ & $3.4$ & $0.025$ & $1.15$ \\ +% \bottomrule +% \end{tabular} +% \end{sc} +% \end{small} +% \end{center} +% \vskip -0.1in +% \end{table*} \subsection{Variance Minimization ETD Learning: VMETD} Based on the off-policy TD algorithm, a scalar, $F$, is introduced to obtain the ETD algorithm, @@ -242,65 +331,73 @@ VMETD by the following update: % \delta_{t}= R_{t+1}+\gamma \theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t. % \end{equation} \begin{equation} + \label{fvmetd} + F_t \leftarrow \gamma \rho_{t-1}F_{t-1}+1, +\end{equation} +\begin{equation} \label{thetavmetd} - \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_{k})\bm{\phi}_k, + {\theta}_{t+1}\leftarrow {\theta}_t+\alpha_t (F_t \rho_t\delta_t - \omega_{t}){\phi}_t, \end{equation} \begin{equation} \label{omegavmetd} - \omega_{k+1} \leftarrow \omega_k+\beta_k(F_k \rho_k \delta_k - \omega_k), + \omega_{t+1} \leftarrow \omega_t+\beta_t(F_t \rho_t \delta_t - \omega_t), \end{equation} where $\omega$ is used to estimate $\mathbb{E}[F \rho\delta]$, i.e., $\omega \doteq \mathbb{E}[F \rho\delta]$. (\ref{thetavmetd}) can be rewritten as \begin{equation*} \begin{array}{ccl} - \bm{\theta}_{k+1}&\leftarrow& \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k)\bm{\phi}_k -\alpha_k \omega_{k+1}\bm{\phi}_k\\ - &=&\bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k\\ - &=&\bm{\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k)\bm{\phi}_k\\ - & & \hspace{2em} -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]\bm{\phi}_k\\ - &=& \bm{\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}])\bm{\phi}_k}_{\textbf{b}_{\text{VMETD},k}}\\ - &&\hspace{-5em}- \underbrace{(F_k\rho_k\bm{\phi}_k(\bm{\phi}_k-\gamma\bm{\phi}_{k+1})^{\top}-\bm{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k (\bm{\phi}_k-\gamma\bm{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}\bm{\theta}_k\}. + {\theta}_{t+1}&\leftarrow& {\theta}_t+\alpha_t (F_t \rho_t\delta_t - \omega_t){\phi}_t -\alpha_t \omega_{t+1}{\phi}_t\\ + &=&{\theta}_{t}+\alpha_t(F_t\rho_t\delta_t-\mathbb{E}_{\mu}[F_t\rho_t\delta_t|{\theta}_t]){\phi}_t\\ + &=&{\theta}_t+\alpha_t F_t \rho_t (r_{t+1}+\gamma {\theta}_t^{\top}{\phi}_{t+1}-{\theta}_t^{\top}{\phi}_t){\phi}_t\\ + & & \hspace{2em} -\alpha_t \mathbb{E}_{\mu}[F_t \rho_t \delta_t]{\phi}_t\\ + &=& {\theta}_t+\alpha_t \{\underbrace{(F_t\rho_tr_{t+1}-\mathbb{E}_{\mu}[F_t\rho_t r_{t+1}]){\phi}_t}_{{b}_{\text{VMETD},t}}\\ + &&\hspace{-7em}- \underbrace{(F_t\rho_t{\phi}_t({\phi}_t-\gamma{\phi}_{t+1})^{\top}-{\phi}_t\mathbb{E}_{\mu}[F_t\rho_t ({\phi}_t-\gamma{\phi}_{t+1})]^{\top})}_{\textbf{A}_{\text{VMETD},t}}{\theta}_t\}. 
\end{array} \end{equation*} Therefore, \begin{equation*} \begin{array}{ccl} - \textbf{A}_{\text{VMETD}}&=&\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\ - &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k \bm{\phi}_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]\\ - &&\hspace{-1em}- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]\\ - &&\hspace{-1em}- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]\\ - &&\hspace{-2em}-\lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ - && \hspace{-9em}=\sum_{s} d_{\mu}(s)\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k = s]\mathbb{E}_{\mu}[\rho_k\phi_k(\phi_k - \gamma \phi_{k+1})^{\top}|S_k = s]\\ - &&\hspace{-2em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} d_{\mu}(s)\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k = s]\\ - &&\hspace{2em}\mathbb{E}_{\mu}[\rho_k(\phi_k - \gamma \phi_{k+1})^{\top}|S_k = s]\\ - &=& \sum_{s} f(s)\mathbb{E}_{\pi}[\phi_k(\phi_k - \gamma \phi_{k+1})^{\top}|S_k = s]\\ - &&\hspace{-3em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} f(s)\mathbb{E}_{\pi}[(\phi_k - \gamma \phi_{k+1})^{\top}|S_k = s]\\ + &&\textbf{A}_{\text{VMETD}}\\ + &=&\lim_{t \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},t}]\\ + &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[F_t \rho_t {\phi}_t ({\phi}_t - \gamma {\phi}_{t+1})^{\top}]\\ + &&\hspace{1em}- \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\mathbb{E}_{\mu}[F_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})]^{\top}\\ + % &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})^{\top}]\\ + % &&\hspace{1em}- \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\mathbb{E}_{\mu}[F_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})]^{\top}\\ + &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})^{\top}]\\ + &&\hspace{1em}-\lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})]^{\top}\\ + && \hspace{-2em}=\sum_{s} d_{\mu}(s)\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t = s]\mathbb{E}_{\mu}[\rho_t\phi_t(\phi_t - \gamma \phi_{t+1})^{\top}|S_t= s]\\ + &&\hspace{1em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} d_{\mu}(s)\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t = s]\\ + &&\hspace{7em}\mathbb{E}_{\mu}[\rho_t(\phi_t - \gamma \phi_{t+1})^{\top}|S_t = s]\\ + &=& \sum_{s} f(s)\mathbb{E}_{\pi}[\phi_t(\phi_t- \gamma \phi_{t+1})^{\top}|S_t = s]\\ + &&\hspace{1em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} f(s)\mathbb{E}_{\pi}[(\phi_t- \gamma \phi_{t+1})^{\top}|S_t = s]\\ &=&\sum_{s} f(s) \bm{\phi}(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} \\ - &&\hspace{-4em} -\sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top}\\ - &=&{\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) \bm{\Phi} 
\\
-  &=&{\bm{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\
-  &=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\
-  &=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}},
+  &&\hspace{1em} -\sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s'))^{\top}\\
+  &=&{\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} {d}_{\mu} {f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} \\
+  &=&{\bm{\Phi}}^{\top} (\textbf{F} - {d}_{\mu} {f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\
+  &=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\
+  &=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} ){\bm{\Phi}},
 \end{array}
\end{equation*}
\begin{equation*}
 \begin{array}{ccl}
-    \textbf{b}_{\text{VMETD}}&=&\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{b}_{\text{VMETD},k}]\\
-    &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}\bm{\phi}_k]\\
-    &&- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\
-    &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]\\
-    &&- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\
-    &=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]\\
-    &&- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\
-    &=&\sum_{s} f(s) \bm{\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)r_{\pi} \\
-    &=&\bm{\bm{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}.
+    &&{b}_{\text{VMETD}}\\
+    &=&\lim_{t \rightarrow \infty} \mathbb{E}[{b}_{\text{VMETD},t}]\\
+    &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[F_t\rho_t r_{t+1}{\phi}_t]\\
+    &&\hspace{2em} - \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_t]\mathbb{E}_{\mu}[F_t\rho_t r_{t+1}]\\
+    &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t\rho_tr_{t+1}]\\
+    &&\hspace{2em} - \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\mathbb{E}_{\mu}[F_t\rho_tr_{t+1}]\\
+    &=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t\rho_tr_{t+1}]\\
+    &&\hspace{2em} - \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t\rho_tr_{t+1}]\\
+    &=&\sum_{s} f(s) {\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)r_{\pi} \\
+    &=&\bm{\Phi}^{\top}(\textbf{F}-{d}_{\mu} {f}^{\top}){r}_{\pi}.
 \end{array}
\end{equation*}
-Therefore, in the 2-state counterexample,
-$\textbf{A}_{\text{VMETD}}=1.15$, meaning that VMETD can stably converge and converges slower than ETD.
-However, subsequent experiments showed that the VMETD algorithm converges more smoothly and performs better in controlled experiments.
+In the off-policy 2-state environment, the minimum eigenvalue
+of the key matrix for VMETD is greater than those of TD(0), TDC and VMTD, and smaller than that of ETD,
+indicating that VMETD converges faster than TD(0), TDC and VMTD but slower than ETD in the
+off-policy setting. 
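As a quick sanity check on these expressions, the following sketch (ours, purely illustrative) evaluates $\textbf{A}_{\text{ETD}}=\bm{\Phi}^{\top}\textbf{F}(\textbf{I}-\gamma\textbf{P}_{\pi})\bm{\Phi}$ and $\textbf{A}_{\text{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}(\textbf{I}-\gamma\textbf{P}_{\pi})-d_{\mu}d_{\mu}^{\top})\bm{\Phi}$ in the off-policy 2-state setting of the preliminaries, using the emphatic weighting $f=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}d_{\mu}$.
\begin{verbatim}
import numpy as np

Phi   = np.array([[1.0], [2.0]])             # one feature per state
gamma = 0.9
P_pi  = np.array([[0.0, 1.0], [0.0, 1.0]])   # target policy ("always right")
d_mu  = np.array([0.5, 0.5])                 # behavior-policy state distribution

f = np.linalg.solve(np.eye(2) - gamma * P_pi.T, d_mu)   # emphatic weights
F = np.diag(f)

A_etd   = Phi.T @ F @ (np.eye(2) - gamma * P_pi) @ Phi
A_vmetd = Phi.T @ (F @ (np.eye(2) - gamma * P_pi) - np.outer(d_mu, d_mu)) @ Phi
print(A_etd.item(), A_vmetd.item())          # approx. 3.4 and 1.15
\end{verbatim}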
+However, subsequent experiments showed that the VMETD algorithm converges more smoothly and performs best in controlled experiments. % In this paper, we refer to the control algorithm for ETD as EQ. % Now, we will introduce the improved version of the EQ algorithm, named VMEQ: % \begin{equation} diff --git a/NEW_aaai/main/pic/2-state-offpolicy.pdf b/NEW_aaai/main/pic/2-state-offpolicy.pdf new file mode 100644 index 0000000..7bd91ce Binary files /dev/null and b/NEW_aaai/main/pic/2-state-offpolicy.pdf differ diff --git a/NEW_aaai/main/pic/2-state-onpolicy.pdf b/NEW_aaai/main/pic/2-state-onpolicy.pdf new file mode 100644 index 0000000..a010a5e Binary files /dev/null and b/NEW_aaai/main/pic/2-state-onpolicy.pdf differ diff --git a/NEW_aaai/main/pic/2StateExample.pdf b/NEW_aaai/main/pic/2StateExample.pdf index a2520f9..1f799e2 100644 Binary files a/NEW_aaai/main/pic/2StateExample.pdf and b/NEW_aaai/main/pic/2StateExample.pdf differ diff --git a/NEW_aaai/main/pic/BairdExample copy 2.tex b/NEW_aaai/main/pic/BairdExample copy 2.tex new file mode 100644 index 0000000..af5a22e --- /dev/null +++ b/NEW_aaai/main/pic/BairdExample copy 2.tex @@ -0,0 +1,69 @@ +\resizebox{7cm}{4.4cm}{ +\begin{tikzpicture}[smooth] +\node[coordinate] (origin) at (0.3,0) {}; +\node[coordinate] (num7) at (3,0) {}; +\node[coordinate] (num1) at (1,2.5) {}; +\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {}; +\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {}; +\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {}; +\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {}; +\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {}; +\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {}; +\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {}; +\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {}; + +\path (num1) ++ (0:1cm) node (num2) [coordinate] {}; +\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {}; +\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {}; +\path (num2) ++ (0:1cm) node (num3) [coordinate] {}; +\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {}; +\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {}; +\path (num3) ++ (0:1cm) node (num4) [coordinate] {}; +\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {}; +\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {}; +\path (num4) ++ (0:1cm) node (num5) [coordinate] {}; +\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {}; +\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {}; +\path (num5) ++ (0:1cm) node (num6) [coordinate] {}; +\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {}; +\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {}; + + +%\draw[->](0,0) -- (1,1); +%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1); + %\fill (0.5,0.5) circle (0.5); + %\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7}; + + +\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06] +coordinates{(num7) (origin) (origin_above) (origin_aright)}; + +\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5] +coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)}; + +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7}; + + + +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ; +\node[line width = 
0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6}; + +\draw[->,>=stealth,line width = 0.02cm] (a)--(g); +\draw[->,>=stealth,line width = 0.02cm] (b)--(g); +\draw[->,>=stealth,line width = 0.02cm] (c)--(g); +\draw[->,>=stealth,line width = 0.02cm] (d)--(g); +\draw[->,>=stealth,line width = 0.02cm] (e)--(g); +\draw[->,>=stealth,line width = 0.02cm] (f)--(g); +\end{tikzpicture} +} + diff --git a/NEW_aaai/main/pic/BairdExample copy.tex b/NEW_aaai/main/pic/BairdExample copy.tex new file mode 100644 index 0000000..af5a22e --- /dev/null +++ b/NEW_aaai/main/pic/BairdExample copy.tex @@ -0,0 +1,69 @@ +\resizebox{7cm}{4.4cm}{ +\begin{tikzpicture}[smooth] +\node[coordinate] (origin) at (0.3,0) {}; +\node[coordinate] (num7) at (3,0) {}; +\node[coordinate] (num1) at (1,2.5) {}; +\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {}; +\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {}; +\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {}; +\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {}; +\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {}; +\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {}; +\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {}; +\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {}; + +\path (num1) ++ (0:1cm) node (num2) [coordinate] {}; +\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {}; +\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {}; +\path (num2) ++ (0:1cm) node (num3) [coordinate] {}; +\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {}; +\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {}; +\path (num3) ++ (0:1cm) node (num4) [coordinate] {}; +\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {}; +\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {}; +\path (num4) ++ (0:1cm) node (num5) [coordinate] {}; +\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {}; +\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {}; +\path (num5) ++ (0:1cm) node (num6) [coordinate] {}; +\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {}; +\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {}; + + +%\draw[->](0,0) -- (1,1); +%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1); + %\fill (0.5,0.5) circle (0.5); + %\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7}; + + +\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06] +coordinates{(num7) (origin) (origin_above) (origin_aright)}; + +\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5] +coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)}; + +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7}; + + + +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- 
(num2_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5}; +\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ; +\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6}; + +\draw[->,>=stealth,line width = 0.02cm] (a)--(g); +\draw[->,>=stealth,line width = 0.02cm] (b)--(g); +\draw[->,>=stealth,line width = 0.02cm] (c)--(g); +\draw[->,>=stealth,line width = 0.02cm] (d)--(g); +\draw[->,>=stealth,line width = 0.02cm] (e)--(g); +\draw[->,>=stealth,line width = 0.02cm] (f)--(g); +\end{tikzpicture} +} + diff --git a/NEW_aaai/main/pic/acrobot.pdf b/NEW_aaai/main/pic/acrobot.pdf index dbb3116..373e4f1 100644 Binary files a/NEW_aaai/main/pic/acrobot.pdf and b/NEW_aaai/main/pic/acrobot.pdf differ diff --git a/NEW_aaai/main/pic/cl.pdf b/NEW_aaai/main/pic/cl.pdf index 0bf0a70..32f9c69 100644 Binary files a/NEW_aaai/main/pic/cl.pdf and b/NEW_aaai/main/pic/cl.pdf differ diff --git a/NEW_aaai/main/pic/maze.pdf b/NEW_aaai/main/pic/maze.pdf index 947501e..baf79bf 100644 Binary files a/NEW_aaai/main/pic/maze.pdf and b/NEW_aaai/main/pic/maze.pdf differ diff --git a/NEW_aaai/main/pic/mt.pdf b/NEW_aaai/main/pic/mt.pdf index 89548ac..2641925 100644 Binary files a/NEW_aaai/main/pic/mt.pdf and b/NEW_aaai/main/pic/mt.pdf differ diff --git a/NEW_aaai/main/preliminaries.tex b/NEW_aaai/main/preliminaries.tex index 1a14e0f..a914bfa 100644 --- a/NEW_aaai/main/preliminaries.tex +++ b/NEW_aaai/main/preliminaries.tex @@ -19,172 +19,243 @@ Reinforcement learning agent interacts with environment, observes state, A policy is a mapping $\pi:S\times A \rightarrow [0,1]$. The goal of the agent is to find an optimal policy $\pi^*$ to maximize the expectation of a - discounted cumulative rewards in a long period. + discounted cumulative rewards in a long period. For each discrete time step + $t=0,1,2,3,…$, State value function $V^{\pi}(s)$ for a stationary policy $\pi$ is defined as: \begin{equation*} - V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{k}|s_0=s]. + V^{\pi}(s)=\mathbb{E}_{\pi}[\sum_{k=0}^{\infty} \gamma^k R_{t+k+1}|S_t=s]. \label{valuefunction} \end{equation*} Linear value function for state $s\in S$ is defined as: \begin{equation} - V_{{\theta}}(s):= {\bm{\theta}}^{\top}{\bm{\phi}}(s) = \sum_{i=1}^{m} + V_{{\theta}}(s):= {{\theta}}^{\top}{{\phi}}(s) = \sum_{i=1}^{m} \theta_i \phi_i(s), \label{linearvaluefunction} \end{equation} - where ${\bm{\theta}}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in + where ${{\theta}}:=(\theta_1,\theta_2,\ldots,\theta_m)^{\top}\in \mathbb{R}^m$ is a parameter vector, - ${\bm{\phi}}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature + ${{\phi}}:=(\phi_1,\phi_2,\ldots,\phi_m)^{\top}\in \mathbb{R}^m$ is a feature function defined on state space $S$, and $m$ is the feature size. Tabular temporal difference (TD) learning \cite{Sutton2018book} has been successfully applied to small-scale problems. 
To deal with the well-known curse of dimensionality of large scale MDPs, value function is usually approximated by a linear model (the focus of this paper), kernel methods, decision trees, or neural networks, etc. - % This paper focuses on the linear model. -% TD learning can also be used to find optimal strategies. The problem of finding an optimal policy is -% often called the control problem. Two popular TD methods are Sarsa and Q-leaning. The former is an on-policy -% TD control, while the latter is an off-policy control. - -% It is well known that TDC algorithm \cite{sutton2009fast} guarantees -% convergence under off-policy conditions while the off-policy TD algorithm may diverge. The -% objective function of TDC is MSPBE. -% TDC is essentially an adjustment or correction of the TD update so that it -% follows the gradient of the MSPBE objective function. In the context of the TDC algorithm, the control algorithm -% is known as Greedy-GQ($\lambda$) \cite{sutton2009fast}. When $\lambda$ is set to 0, it is denoted -% as GQ(0). - \subsection{On-policy and Off-policy} -On-policy and off-policy algorithms are currently hot topics in research. -Off-policy algorithms, in particular, present greater challenges due to the -difficulty in ensuring their convergence, making them more complex to study. +\begin{table*}[t] + \caption{Minimum eigenvalues of various algorithms in the 2-state counterexample.} + \label{tab:min_eigenvalues} % 添加标签 + \vskip 0.15in + \begin{center} + \begin{small} + \begin{sc} + \begin{tabular}{lccccccr} + \toprule + algorithm &TD & TDC & ETD & VMTD & VMTDC & VMETD \\ + \midrule + on-policy 2-state&$0.475$ & $0.09025$ & \text{\textbackslash}& $0.25$ & $0.025$ & \text{\textbackslash} \\ + off-policy 2-state&$-0.2$ & $0.016$ & $3.4$ & $0.25$ & $0.025$ & $1.15$\\ + \bottomrule + \end{tabular} + \end{sc} + \end{small} + \end{center} + \vskip -0.1in +\end{table*} +On-policy and off-policy algorithms are currently hot topics in research. The main difference between the two lies in the fact that in on-policy algorithms, the behavior policy $\mu$ and the target policy $\pi$ are the same during the learning process. -The algorithm directly generates data from the current policy and optimizes it. In off-policy algorithms, however, the behavior policy and the target policy are different. The algorithm uses data generated from the behavior policy to optimize the target policy, which leads to higher sample efficiency and complex stability issues. -Taking the TD(0) algorithm as an example can help understand the different -performances of on-policy and off-policy: +% In the on-policy TD(0) algorithm, since the behavior policy and the target policy +% are consistent, the convergence of TD(0) is more assured. In each time step $t$ of the +% update, the algorithm is based on the actual behavior of the current policy, +% which gradually leads the value function estimate to converge to the true +% value of the target policy. -In the on-policy TD(0) algorithm, the behavior policy and the target policy -are the same. The algorithm uses the data generated by the current policy to -update its value estimates. Since the behavior policy and the target policy -are consistent, the convergence of TD(0) is more assured. In each step of the -update, the algorithm is based on the actual behavior of the current policy, -which gradually leads the value function estimate to converge to the true -value of the target policy. 
- -The on-policy TD(0) update formula is +From the theory of stochastic methods, the +convergence point of linear TD algorithms, is a parameter vector, say $\bm{\theta}$, that satisfies \begin{equation*} - \label{thetatd_onpolicy} \begin{array}{ccl} - \bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_k+\alpha_k \delta_k\bm{\phi}_k, + b - \textbf{A}{\theta}&=&0, \end{array} \end{equation*} -where $\delta_k = r_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k$ and -the key matrix $\textbf{A}_{\text{on}}$ of on-policy TD(0) is +where $\textbf{A}\in \mathbb{R}^{|S| \times m}$ and $b\in \mathbb{R}^{m}$. +If the matrix +$\textbf{A}$ is positive definite, then the algorithm converges. +The convergence rate of the algorithm is related to the matrix +$\textbf{A}$. The larger the minimum eigenvalue of +$\textbf{A}$, the faster the convergence rate. +Next, we will compute the minimum eigenvalue of +$\textbf{A}$ for TD(0), TDC, and ETD in both on-policy and off-policy settings in a 2-state environment. +First, we will introduce the environment setup for the 2-state case in both on-policy and off-policy settings. +\begin{figure}[h] + \begin{center} + \includegraphics[width=0.3\columnwidth, height=0.15\columnwidth]{main/pic/2StateExample.pdf} + \caption{2-state} + \end{center} + \end{figure} + +The "1"$\rightarrow$"2" problem has only two states. From each +state, there are two actions, left and right, which take +the agent to the left or right state. All rewards are zero. +The feature $\bm{\Phi}=(1,2)^{\top}$ +are assigned to the left and +the right state. The first policy takes the equal +probability to left or right +in both states, i.e., +$ +\textbf{P}_{1}= +\begin{bmatrix} +0.5 & 0.5 \\ +0.5 & 0.5 +\end{bmatrix} +$. +The second policy only selects action right in both states, i.e., +$ +\textbf{P}_{2}= +\begin{bmatrix} +0 & 1 \\ +0 & 1 +\end{bmatrix} +$. +The state distribution of +the first policy is $d_1 =(0.5,0.5)^{\top}$. +The state distribution of +the second policy is $d_1 =(0,1)^{\top}$. +The discount factor is $\gamma=0.9$. +In the on-policy setting, the behavior policy +and the target policy are the same, so +let $\textbf{P}_{\mu}=\textbf{P}_{\pi}=\textbf{P}_{1}$. +In the off-policy setting, +let $\textbf{P}_{\mu}=\textbf{P}_{1}$ and $\textbf{P}_{\pi}=\textbf{P}_{2}$. + + +% The on-policy TD(0) update formula is +% \begin{equation*} +% \label{thetatd_onpolicy} +% \begin{array}{ccl} +% \bm{\theta}_{t+1}&\leftarrow&\bm{\theta}_t+\alpha_t \delta_t\bm{\phi}_t, +% \end{array} +% \end{equation*} +% where $\delta_t = r_{t+1}+\gamma \bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$ is one-step TD error and +The key matrix $\textbf{A}_{\text{on}}$ of on-policy TD(0) is \begin{equation*} \textbf{A}_{\text{on}} = \bm{\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}, \end{equation*} -where $\bm{\Phi}$ is the $N \times n$ matrix with the $\phi(s)$ as its rows, and $\textbf{D}_{\pi}$ is the $N \times N$ diagonal -matrix with $\textbf{d}_{\pi}$ on its diagonal. $\textbf{d}_{\pi}$ is a vector, each component representing the steady-state -distribution under $\pi$. $\textbf{P}_{\pi}$ denote the $N \times N$ matrix of transition probabilities under $\pi$. And $\textbf{P}_{\pi}^{\top}\textbf{d}_{\pi}=\textbf{d}_{\pi}$. - -An $\bm{\Phi}^{\top}\bm{\text{X}}\bm{\Phi}$ matrix of this - form will be positive definite whenever the matrix $\bm{\text{X}}$ is positive definite. 
- Any matrix $\bm{\text{X}}$ is positive definite if and only if - the symmetric matrix $\bm{\text{S}}=\bm{\text{X}}+\bm{\text{X}}^{\top}$ is positive definite. - Any symmetric real matrix $\bm{\text{S}}$ is positive definite if the absolute values of - its diagonal entries are greater than the sum of the absolute values of the corresponding -off-diagonal entries\cite{sutton2016emphatic}. - -All components of the matrix $\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are positive. -The row sums of $\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are positive. And The row sums of -$\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are -\begin{equation*} - \begin{array}{ccl} - \textbf{1}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})&=&\textbf{d}_{\pi}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\\ - &=& \textbf{d}_{\pi}^{\top} - \gamma \textbf{d}_{\pi}^{\top}\textbf{P}_{\pi}\\ - &=& \textbf{d}_{\pi}^{\top} - \gamma \textbf{d}_{\pi}^{\top}\\ - &=& (1-\gamma)\textbf{d}_{\pi}^{\top}, - \end{array} -\end{equation*} -all components of which are positive. Thus, the key matrix and its $\textbf{A}_{\text{on}}$ matrix are positive -definite, and on-policy TD(0) is stable +where $\bm{\Phi}$ is the $|S| \times m$ matrix with the $\phi(s)$ as its rows, and $\textbf{D}_{\pi}$ is the $|S| \times |S|$ diagonal +matrix with $d_{\pi}$ on its diagonal. $d_{\pi}$ is a vector, each component representing the steady-state +distribution under policy $\pi$. $\textbf{P}_{\pi}$ denote the $|S| \times |S|$ matrix of transition probabilities under $\pi$. And $\textbf{P}_{\pi}^{\top}d_{\pi}=d_{\pi}$. -The off-policy TD(0) update formula is -\begin{equation*} - \label{thetatd_offpolicy} - \begin{array}{ccl} - \bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_k+\alpha_k \rho_k \delta_k\bm{\phi}_k, - \end{array} -\end{equation*} -where $\rho_k =\frac{\pi(A_k | S_k)}{\mu(A_k | S_k)}$, called importance sampling ratio, - and the key matrix $\textbf{A}_{\text{off}}$ of off-policy TD(0) is -\begin{equation*} - \textbf{A}_{\text{off}} = \bm{\Phi}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}. -\end{equation*} -where $\textbf{D}_{\mu}$ is the $N \times N$ diagonal -matrix with $\textbf{d}_{\mu}$ on its diagonal. $\textbf{d}_{\mu}$ is a vector, each component representing the steady-state -distribution under $\mu$ - -If the key matrix -$\textbf{A}$ in the algorithm is positive definite, then the -algorithm is stable and converges. However, in the off-policy TD(0) - algorithm, it cannot be guaranteed that -$\textbf{A}$ is a positive definite matrix. In the 2-state counterexample, -$\textbf{A}_{\text{off}}=-0.2$, which means that off-policy TD(0) cannot stably converge. - -TDC and ETD are two well-known off-policy algorithms. -The former is an off-policy algorithm derived from the -objective function Mean Squared Projected Bellman error (MSPBE), while the latter employs a technique -to transform the key matrix -$\textbf{A}$ in the original off-policy TD(0) from non-positive -definite to positive definite, thereby ensuring the algorithm's -convergence under off-policy conditions. 
- -The MSPBE with importance sampling is -\begin{equation*} - \begin{array}{ccl} - \text{MSPBE}(\bm{\theta})&=&||\textbf{V}_{\bm{\theta}} - \Pi \textbf{T}^{\pi}\textbf{V}_{\bm{\theta}}||^{2}_{\mu}\\ - &=&||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}^{\pi}\textbf{V}_{\bm{\theta}})||^{2}_{\mu}\\ - &=&\mathbb{E}[\rho \delta \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[\rho \delta \bm{\phi}], - \end{array} -\end{equation*} -where $\textbf{V}_{\bm{\theta}}$ is viewed as vectors with one element for each state, - the norm $||\bm{v}||^{2}_{\mu}=\sum_{s}^{}\mu(s)\bm{v}^{2}(s)$, $\textbf{T}^{\pi}$, simplified to - $\textbf{T}$ in the following text, is Bellman operator and $\bm{\Pi}=\bm{\Phi}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1}\bm{\Phi}^{\top}\textbf{D}$. -The TDC update formula with importance sampling is -\begin{equation*} - \bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k} \rho_{k}[\delta_{k} \bm{\phi}_k- \gamma\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})], -\label{thetatdc} -\end{equation*} +% An $\bm{\Phi}^{\top}\bm{\text{X}}\bm{\Phi}$ matrix of this +% form will be positive definite whenever the matrix $\bm{\text{X}}$ is positive definite. +% Any matrix $\bm{\text{X}}$ is positive definite if and only if +% the symmetric matrix $\bm{\text{S}}=\bm{\text{X}}+\bm{\text{X}}^{\top}$ is positive definite. +% Any symmetric real matrix $\bm{\text{S}}$ is positive definite if the absolute values of +% its diagonal entries are greater than the sum of the absolute values of the corresponding +% off-diagonal entries\cite{sutton2016emphatic}. + +% All components of the matrix $\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are positive. +% The row sums of $\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are positive. And The row sums of +% $\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are +% \begin{equation*} +% \begin{array}{ccl} +% \textbf{1}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})&=&\textbf{d}_{\pi}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\\ +% &=& \textbf{d}_{\pi}^{\top} - \gamma \textbf{d}_{\pi}^{\top}\textbf{P}_{\pi}\\ +% &=& \textbf{d}_{\pi}^{\top} - \gamma \textbf{d}_{\pi}^{\top}\\ +% &=& (1-\gamma)\textbf{d}_{\pi}^{\top}, +% \end{array} +% \end{equation*} +% all components of which are positive. Thus, the key matrix and its $\textbf{A}_{\text{on}}$ matrix are positive +% definite, and on-policy TD(0) is stable + +% The off-policy TD(0) update formula is +% \begin{equation*} +% \label{thetatd_offpolicy} +% \begin{array}{ccl} +% \bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_k+\alpha_k \rho_k \delta_k\bm{\phi}_k, +% \end{array} +% \end{equation*} +% where $\rho_k =\frac{\pi(A_k | S_k)}{\mu(A_k | S_k)}$, called importance sampling ratio, and +The key matrix $\textbf{A}_{\text{off}}$ of off-policy TD(0) is \begin{equation*} - \bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[\rho_k \delta_{k} - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k. -\label{utdc} + \textbf{A}_{\text{off}} = \bm{\Phi}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}, \end{equation*} +where $\textbf{D}_{\mu}$ is the $|S| \times |S|$ diagonal +matrix with $d_{\mu}$ on its diagonal. $d_{\mu}$ is a vector, each component representing the steady-state +distribution under behavior policy $\mu$. + +% If the key matrix +% $\textbf{A}$ in the algorithm is positive definite, then the +% algorithm is stable and converges. However, in the off-policy TD(0) +% algorithm, it cannot be guaranteed that +% $\textbf{A}$ is a positive definite matrix. 
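The quantities discussed here are easy to compute explicitly. The sketch below (illustrative only) evaluates $\textbf{A}_{\text{on}}$, $\textbf{A}_{\text{off}}$ and the corresponding TDC key matrices for the 2-state setup introduced above, i.e., features $(1,2)^{\top}$, $\gamma=0.9$, a uniform behavior policy and an ``always right'' target policy.
\begin{verbatim}
import numpy as np

Phi   = np.array([[1.0], [2.0]])
gamma = 0.9
P_mu  = np.array([[0.5, 0.5], [0.5, 0.5]])   # behavior policy (uniform)
P_pi  = np.array([[0.0, 1.0], [0.0, 1.0]])   # target policy ("always right")
d_mu  = np.array([0.5, 0.5])                 # stationary distribution of the behavior policy
D_mu  = np.diag(d_mu)

# On-policy, behavior and target coincide, so d_pi = d_mu here.
A_on  = Phi.T @ D_mu @ (np.eye(2) - gamma * P_mu) @ Phi   # approx.  0.475
A_off = Phi.T @ D_mu @ (np.eye(2) - gamma * P_pi) @ Phi   # approx. -0.2

C = Phi.T @ D_mu @ Phi                                    # E[phi phi^T] = 2.5
A_tdc_on  = A_on.T  @ np.linalg.inv(C) @ A_on             # approx. 0.09025
A_tdc_off = A_off.T @ np.linalg.inv(C) @ A_off            # approx. 0.016
\end{verbatim}
These values match the entries reported in Table \ref{tab:min_eigenvalues} and in the text below.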
+In the off-policy 2-state environment,
+$\textbf{A}_{\text{off}}=-0.2$, which means that off-policy TD(0) cannot converge stably,
+while, in the on-policy 2-state environment, $\textbf{A}_{\text{on}}=0.475$, which means that on-policy TD(0) converges stably.
+
+% TDC and ETD are two well-known off-policy algorithms.
+% The former is an off-policy algorithm derived from the
+% objective function Mean Squared Projected Bellman error (MSPBE), while the latter employs a technique
+% to transform the key matrix
+% $\textbf{A}$ in the original off-policy TD(0) from non-positive
+% definite to positive definite, thereby ensuring the algorithm's
+% convergence under off-policy conditions.
+
+% The MSPBE with importance sampling is
+% \begin{equation*}
+%     \begin{array}{ccl}
+%         \text{MSPBE}(\bm{\theta})&=&||\textbf{V}_{\bm{\theta}} - \Pi \textbf{T}^{\pi}\textbf{V}_{\bm{\theta}}||^{2}_{\mu}\\
+%         &=&||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}^{\pi}\textbf{V}_{\bm{\theta}})||^{2}_{\mu}\\
+%         &=&\mathbb{E}[\rho \delta \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[\rho \delta \bm{\phi}],
+%     \end{array}
+% \end{equation*}
+% where $\textbf{V}_{\bm{\theta}}$ is viewed as vectors with one element for each state,
+% the norm $||\bm{v}||^{2}_{\mu}=\sum_{s}^{}\mu(s)\bm{v}^{2}(s)$, $\textbf{T}^{\pi}$, simplified to
+% $\textbf{T}$ in the following text, is Bellman operator and $\bm{\Pi}=\bm{\Phi}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1}\bm{\Phi}^{\top}\textbf{D}$.
+% The TDC update formula with importance sampling is
+% \begin{equation*}
+% \bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k} \rho_{k}[\delta_{k} \bm{\phi}_k- \gamma\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})],
+% \label{thetatdc}
+% \end{equation*}
+% \begin{equation*}
+% \bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[\rho_k \delta_{k} - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k.
+% \label{utdc}
+% \end{equation*}
 The key matrix of TDC has the form $\textbf{A}_{\text{TDC}}= \textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$,
 where $\textbf{C}=\mathbb{E}[\bm{\phi}\bm{\phi}^{\top}]$.
-$\textbf{A}_{\text{TDC}}=0.016$, which means that TDC can stably converge.
-The ETD update formula is
-\begin{equation}
-  \label{fvmetd}
-  F_k \leftarrow \gamma \rho_{k-1}F_{k-1}+1,
-\end{equation}
+The key matrix $\textbf{A}_{\text{TDC}}$ of on-policy TDC is
 \begin{equation*}
-  \label{thetaetd}
-  \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\alpha_k F_k \rho_k\delta_k\bm{\phi}_k,
+  \textbf{A}_{\text{TDC}} = \textbf{A}^{\top}_{\text{on}}\textbf{C}^{-1}\textbf{A}_{\text{on}}.
+\end{equation*}
+The key matrix $\textbf{A}_{\text{TDC}}$ of off-policy TDC is
+\begin{equation*}
+  \textbf{A}_{\text{TDC}} = \textbf{A}^{\top}_{\text{off}}\textbf{C}^{-1}\textbf{A}_{\text{off}}.
+\end{equation*}
+$\textbf{A}_{\text{TDC}}=0.016$ in the off-policy 2-state environment and $\textbf{A}_{\text{TDC}}=0.09025$
+in the on-policy 2-state environment, which means that TDC converges stably in both settings.
+
+To address the issue of the key matrix $\textbf{A}_{\text{off}}$
+in off-policy TD(0) being non-positive definite,
+ a scalar variable, $F_t$,
+is introduced to obtain the ETD algorithm,
+which ensures convergence under off-policy
+conditions.
+
+The key matrix $\textbf{A}_{\text{ETD}}$ is
+\begin{equation*}
+  \textbf{A}_{\text{ETD}} = \bm{\Phi}^{\top}\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi},
 \end{equation*}
-where $F_t$ is a scalar variable and $F_0=1$.
-The key matrix $\textbf{A}_{\text{ETD}}= \bm{\Phi}^{\top}\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$,
 where
 $\textbf{F}$ is a diagonal matrix with diagonal elements
-$f(s)\dot{=}d_{\mu}(s)\lim_{t\rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k=s]$,
+$f(s)\dot{=}d_{\mu}(s)\lim_{t\rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t=s]$,
 which we assume exists. The vector $\textbf{f}\in \mathbb{R}^N$ with components
 $[\textbf{f}]_s\dot{=}f(s)$ can be written as
@@ -193,21 +264,71 @@ $[\textbf{f}]_s\dot{=}f(s)$ can be written as
 	\textbf{f}&=\textbf{d}_{\mu}+\gamma \textbf{P}_{\pi}^{\top}\textbf{d}_{\mu}+(\gamma \textbf{P}_{\pi}^{\top})^2\textbf{d}_{\mu}+\ldots\\
 	&=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}.
 	\end{split}
-\end{equation*}.
-The row sums of
-$\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are
-\begin{equation*}
-    \begin{array}{ccl}
-        \textbf{1}^{\top}\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})&=&\textbf{f}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\\
-        &=& \textbf{d}_{\mu}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})^{-1}(\textbf{I}-\gamma \textbf{P}_{\pi})\\
-        &=& \textbf{d}_{\mu}^{\top},
-    \end{array}
\end{equation*}
-and in the 2-state counterexample,
+In the off-policy 2-state environment,
 $\textbf{A}_{\text{ETD}}=3.4$, which means that ETD can stably converge.
-The convergence rate of the algorithm is related to the matrix
-$\textbf{A}$. The larger the minimum eigenvalue of
-$\textbf{A}$, the faster the convergence rate. In the 2-state case, the minimum eigenvalue of the matrix
-$\textbf{A}$ in ETD is the largest, so it converges the fastest.
-Based on this theorem, can we derive an algorithm with a larger minimum eigenvalue for matrix $\textbf{A}$.
+
+Table \ref{tab:min_eigenvalues} shows the minimum eigenvalues
+of the various algorithms in the 2-state environments.
+
+In the on-policy 2-state environment, the minimum eigenvalue
+of the key matrix for TDC is smaller than that of TD(0),
+indicating that TDC converges more slowly than TD(0) in this
+environment. In the off-policy 2-state environment, the
+minimum eigenvalue of the key matrix for ETD is the largest,
+suggesting that ETD has the fastest convergence rate.
+
+The larger the minimum eigenvalue, the faster the algorithm converges.
+To derive an algorithm whose matrix
+$\textbf{A}$ has a larger minimum eigenvalue, it is necessary to propose new objective functions.
+The objective functions mentioned in the Introduction
+are all forms of error. Is minimizing an error the only option
+for value-based reinforcement learning?
+Based on this observation,
+we propose alternative objective functions instead of minimizing errors.
+
+
+
+% The ETD update formula is
+% \begin{equation}
+% \label{fvmetd}
+% F_k \leftarrow \gamma \rho_{k-1}F_{k-1}+1,
+% \end{equation}
+% \begin{equation*}
+% \label{thetaetd}
+% \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\alpha_k F_k \rho_k\delta_k\bm{\phi}_k,
+% \end{equation*}
+% where $F_t$ is a scalar variable and $F_0=1$.
+% The key matrix $\textbf{A}_{\text{ETD}}= \bm{\Phi}^{\top}\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$,
+% where
+% $\textbf{F}$ is a diagonal matrix with diagonal elements
+% $f(s)\dot{=}d_{\mu}(s)\lim_{t\rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k=s]$,
+% which we assume exists. 
+% The vector $\textbf{f}\in \mathbb{R}^N$ with components +% $[\textbf{f}]_s\dot{=}f(s)$ can be written as +% \begin{equation*} +% \begin{split} +% \textbf{f}&=\textbf{d}_{\mu}+\gamma \textbf{P}_{\pi}^{\top}\textbf{d}_{\mu}+(\gamma \textbf{P}_{\pi}^{\top})^2\textbf{d}_{\mu}+\ldots\\ +% &=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}. +% \end{split} +% \end{equation*}. +% The row sums of +% $\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})$ are +% \begin{equation*} +% \begin{array}{ccl} +% \textbf{1}^{\top}\textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})&=&\textbf{f}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\\ +% &=& \textbf{d}_{\mu}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})^{-1}(\textbf{I}-\gamma \textbf{P}_{\pi})\\ +% &=& \textbf{d}_{\mu}^{\top}, +% \end{array} +% \end{equation*} +% and in the 2-state counterexample, +% $\textbf{A}_{\text{ETD}}=3.4$, which means that ETD can stably converge. + +% In the 2-state case, the minimum eigenvalue of the matrix +% $\textbf{A}$ in ETD is the largest, so it converges the fastest. +% Based on this theorem, can we derive an algorithm with a larger minimum eigenvalue for matrix $\textbf{A}$. diff --git a/NEW_aaai/main/theory.tex b/NEW_aaai/main/theory.tex index 604b341..0aff01c 100644 --- a/NEW_aaai/main/theory.tex +++ b/NEW_aaai/main/theory.tex @@ -1,6 +1,69 @@ \section{Theoretical Analysis} -The purpose of this section is to establish the stabilities of the VMTDC algorithm -and the VMETD algorithm. +This section primarily focuses on proving the convergence of VMTD, VMTDC, and VMETD. +\begin{theorem} + \label{theorem1}(Convergence of VMTD). + In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD. + Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, + $ + \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, + $ + $ + \sum_{k=0}^{\infty}\alpha_k^2<\infty, + $ + $ + \sum_{k=0}^{\infty}\beta_k^2<\infty, + $ + and + $ + \alpha_k = o(\beta_k). + $ + Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with + uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain. + Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, + $b=\mathrm{Cov}(r,\phi)$. + Assume that matrix $\textbf{A}$ is non-singular. + Then the parameter vector $\theta_k$ converges with probability one + to $\textbf{A}^{-1}b$. +\end{theorem} + +\begin{proof} +\label{th1proof} + The proof is based on Borkar's Theorem for + general stochastic approximation recursions with two time scales + \cite{borkar1997stochastic}. + + A sketch proof is given as follows. + In the fast time scale, the parameter $w$ converges to + $\mathbb{E}[\delta|\theta_k]$. + In the slow time scale, + the associated ODE is + \begin{equation*} + \vec{h}(\theta(t))=-\textbf{A}\theta(t)+b. 
+ \end{equation*} + \begin{equation} + \begin{array}{ccl} + A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\ + &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\ + &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\ + \end{array} + \label{covariance} + \end{equation} + where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ + \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to + the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or + initialized randomly in a Markov chain for on-policy update.}. + Note that the covariance matrix $\mathrm{Cov}(\phi,\phi)$ and + $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive + definite. Then, the matrix $\textbf{A}$ is semi-positive definite because $\textbf{A}$ is + linearly combined by two positive-weighted semi-positive definite matrice + (\ref{covariance}). + Furthermore, $\textbf{A}$ is nonsingular due to the assumption. + Hence, the matrix $\textbf{A}$ is positive definite. And, + the parameter $\theta$ converges to $\textbf{A}^{-1}b$. +\end{proof} +Please refer to the appendix for VMTD's detailed proof process. + \begin{theorem} \label{theorem2}(Convergence of VMTDC). In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC. @@ -24,129 +87,147 @@ and the VMETD algorithm. $ \zeta_k = o(\beta_k). $ - Assume that $(\bm{\bm{\phi}}_k,r_k,\bm{\bm{\phi}}_k')$ is an i.i.d. sequence with + Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments. - Let $\textbf{A} = \mathrm{Cov}(\bm{\bm{\phi}},\bm{\bm{\phi}}-\gamma\bm{\bm{\phi}}')$, - $\bm{b}=\mathrm{Cov}(r,\bm{\bm{\phi}})$, and $\textbf{C}=\mathbb{E}[\bm{\bm{\phi}}\bm{\bm{\phi}}^{\top}]$. + Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, + $b=\mathrm{Cov}(r,\phi)$, and $\textbf{C}=\mathbb{E}[\phi\phi^{\top}]$. Assume that $\textbf{A}$ and $\textbf{C}$ are non-singular matrices. - Then the parameter vector $\bm{\theta}_k$ converges with probability one - to $\textbf{A}^{-1}\bm{b}$. + Then the parameter vector $\theta_k$ converges with probability one + to $\textbf{A}^{-1}b$. \end{theorem} \begin{proof} - The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. - % For the VMTDC algorithm, a new one-step linear TD solution is defined as: - % \begin{equation*} - % 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). - % \end{equation*} - % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is - % $\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$. + The proof is similar to that given by \cite{sutton2009fast} for TDC, + but it is based on multi-time-scale stochastic approximation. 
- First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, -\begin{equation*} - \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k), -\end{equation*} -\begin{equation*} - u_{k+1}\leftarrow u_k+\beta_k y(k), -\end{equation*} -where -\begin{equation*} - x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], -\end{equation*} -\begin{equation*} - y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k. -\end{equation*} + A sketch proof is given as follows. + In the fastest time scale, the parameter $w$ converges to + $\mathbb{E}[\delta|u_k,\theta_k]$. + In the second fast time scale, + the parameter $u$ converges to $\textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta_k])\phi|\theta_k]$. + In the slower time scale, + the associated ODE is + \begin{equation*} + \vec{h}(\theta(t))=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\theta(t)+b). + \end{equation*} + The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, + the parameter $\theta$ converges to $\textbf{A}^{-1}b$. +\end{proof} +Please refer to the appendix for VMTDC's detailed proof process. +% \begin{proof} +% The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. +% % For the VMTDC algorithm, a new one-step linear TD solution is defined as: +% % \begin{equation*} +% % 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). +% % \end{equation*} +% % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is +% % $\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$. + +% First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, +% \begin{equation*} +% \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k), +% \end{equation*} +% \begin{equation*} +% u_{k+1}\leftarrow u_k+\beta_k y(k), +% \end{equation*} +% where +% \begin{equation*} +% x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], +% \end{equation*} +% \begin{equation*} +% y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k. +% \end{equation*} -Recursion (\ref{thetavmtdc}) can also be rewritten as -\begin{equation*} - \theta_{k+1}\leftarrow \theta_k+\beta_k z(k), -\end{equation*} -where -\begin{equation*} - z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], -\end{equation*} +% Recursion (\ref{thetavmtdc}) can also be rewritten as +% \begin{equation*} +% \theta_{k+1}\leftarrow \theta_k+\beta_k z(k), +% \end{equation*} +% where +% \begin{equation*} +% z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], +% \end{equation*} -Due to the settings of step-size schedule -$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. 
-That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than -those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than -those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. -Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) -are associated to ODEs system as follows: -\begin{equation} - \dot{\theta}(t) = 0, - \label{thetavmtdcFastest} -\end{equation} -\begin{equation} - \dot{u}(t) = 0, - \label{uvmtdcFastest} -\end{equation} -\begin{equation} - \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t). - \label{omegavmtdcFastest} -\end{equation} +% Due to the settings of step-size schedule +% $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. +% That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than +% those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than +% those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. +% Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) +% are associated to ODEs system as follows: +% \begin{equation} +% \dot{\theta}(t) = 0, +% \label{thetavmtdcFastest} +% \end{equation} +% \begin{equation} +% \dot{u}(t) = 0, +% \label{uvmtdcFastest} +% \end{equation} +% \begin{equation} +% \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t). +% \label{omegavmtdcFastest} +% \end{equation} -Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ -and $u(t)\equiv u$ when viewed from the fastest timescale. -By the Hirsch lemma \cite{hirsch1989convergent}, it follows that -$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some -$\theta$ that depends on the initial condition $\theta_0$ of recursion -(\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some -$u$ that depends on the initial condition $u_0$ of recursion -(\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) -can be written as -\begin{equation} - \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t). - \label{omegavmtdcFastestFinal} -\end{equation} +% Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ +% and $u(t)\equiv u$ when viewed from the fastest timescale. +% By the Hirsch lemma \cite{hirsch1989convergent}, it follows that +% $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +% $\theta$ that depends on the initial condition $\theta_0$ of recursion +% (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some +% $u$ that depends on the initial condition $u_0$ of recursion +% (\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) +% can be written as +% \begin{equation} +% \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t). +% \label{omegavmtdcFastestFinal} +% \end{equation} -Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$, -i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). 
-It is easy to find that the function $h$ is Lipschitz with coefficient -$-1$. -Let $h_{\infty}(\cdot)$ be the function defined by - $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. - Then $h_{\infty}(\omega)= -\omega$, is well-defined. - For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$ -is the unique globally asymptotically stable equilibrium. -For the ODE -\begin{equation} - \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), - \label{omegavmtdcInfty} -\end{equation} -apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its -associated strict Liapunov function. Then, -the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable -equilibrium. +% Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$, +% i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). +% It is easy to find that the function $h$ is Lipschitz with coefficient +% $-1$. +% Let $h_{\infty}(\cdot)$ be the function defined by +% $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. +% Then $h_{\infty}(\omega)= -\omega$, is well-defined. +% For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$ +% is the unique globally asymptotically stable equilibrium. +% For the ODE +% \begin{equation} +% \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), +% \label{omegavmtdcInfty} +% \end{equation} +% apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its +% associated strict Liapunov function. Then, +% the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable +% equilibrium. -Consider now the recursion (\ref{omegavmtdc}). -Let -$M_{k+1}=(\delta_k-\omega_k) --\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, -where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, -\begin{equation*} -\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq -c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). -\end{equation*} +% Consider now the recursion (\ref{omegavmtdc}). +% Let +% $M_{k+1}=(\delta_k-\omega_k) +% -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, +% where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, +% \begin{equation*} +% \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq +% c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). +% \end{equation*} -Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. -Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our -conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, -by Theorem 2.2 of \cite{borkar2000ode} we obtain that -$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. +% Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. +% Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our +% conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, +% by Theorem 2.2 of \cite{borkar2000ode} we obtain that +% $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. -Recursion (\ref{uvmtdc}) is considered the second timescale. -Recursion (\ref{thetavmtdc}) is considered the slower timescale. -For the convergence properties of $u$ and $\theta$, please refer to the appendix. -\end{proof} +% Recursion (\ref{uvmtdc}) is considered the second timescale. +% Recursion (\ref{thetavmtdc}) is considered the slower timescale. +% For the convergence properties of $u$ and $\theta$, please refer to the appendix. 
+% \end{proof}

% \begin{proof}
% 	The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation.
@@ -426,18 +507,20 @@ For the convergence properties of $u$ and $\theta$, please refer to the appendix
 	$
 	Assume that $(\bm{\bm{\phi}}_k,r_k,\bm{\bm{\phi}}_k')$ is an i.i.d. sequence with
 	uniformly bounded second moments, where $\bm{\bm{\phi}}_k$ and $\bm{\bm{\phi}}'_{k}$ are sampled from the same Markov chain.
-	Let $\textbf{A}_{\textbf{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$,
-	$\bm{b}_{\textbf{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}$.
-	Assume that matrix $A$ is non-singular.
-	Then the parameter vector $\bm{\theta}_k$ converges with probability one
-	to $\textbf{A}_{\textbf{VMETD}}^{-1}\bm{b}_{\textbf{VMETD}}$.
+	Let $\textbf{A}_{\text{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top} ){\bm{\Phi}}$,
+	$b_{\text{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-d_{\mu} f^{\top})r_{\pi}$.
+	Assume that the matrix $\textbf{A}_{\text{VMETD}}$ is non-singular.
+	Then the parameter vector $\theta_k$ converges with probability one
+	to $\textbf{A}_{\text{VMETD}}^{-1}b_{\text{VMETD}}$.
 \end{theorem}
 \begin{proof}
 The proof of VMETD's convergence is also based on Borkar's Theorem for general stochastic approximation recursions with
 two time scales \cite{borkar1997stochastic}.
-	Recursion (\ref{omegavmetd}) is considered the faster timescale. For the convergence properties of $\omega$, please refer to the appendix.
+	A sketch of the proof is given as follows.
+	On the faster time scale, the parameter $\omega$ converges to
+	$\mathbb{E}_{\mu}[F\rho\delta|\theta_k]$.

 Recursion (\ref{thetavmetd}) is considered the slower timescale. If the key matrix
 $\textbf{A}_{\text{VMETD}}$ is positive definite, then
@@ -445,70 +528,37 @@ $\theta$ converges.
 \begin{equation}
	\label{rowsum}
	\begin{split}
-		(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )\textbf{1}
-		&=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
-		&=\textbf{F}(\textbf{1}-\gamma \textbf{P}_{\pi} \textbf{1})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
-		&=(1-\gamma)\textbf{F}\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
-		&=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
-		&=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \\
-		&=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}-\textbf{d}_{\mu} \\
-		&=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]\textbf{d}_{\mu} \\
-		&=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]\textbf{d}_{\mu} \\
-		&=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]\textbf{d}_{\mu} > 0 \\
+		&(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )e\\
+		&=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})e-{d}_{\mu} {d}_{\mu}^{\top} e\\
+		% &=\textbf{F}(e-\gamma \textbf{P}_{\pi} e)-{d}_{\mu} {d}_{\mu}^{\top} e\\
+		&=(1-\gamma)\textbf{F}e-{d}_{\mu} {d}_{\mu}^{\top} e\\
+		% &=(1-\gamma){f}-{d}_{\mu} {d}_{\mu}^{\top} e\\
+		&=(1-\gamma){f}-{d}_{\mu} \\
+		&=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}{d}_{\mu}-{d}_{\mu} \\
+		&=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]{d}_{\mu} \\
+		&=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]{d}_{\mu} \\
+		&=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]{d}_{\mu} > 0, \\
	\end{split}
 \end{equation}
 \begin{equation}
	\label{columnsum}
	\begin{split}
-		\textbf{1}^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )
-		&=\textbf{1}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\
-		&=\textbf{d}_{\mu}^{\top}-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\
-		&=\textbf{d}_{\mu}^{\top}- \textbf{d}_{\mu}^{\top} \\
-		&=0
+		&e^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )\\
+		&=e^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
+		&={d}_{\mu}^{\top}-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
+		&={d}_{\mu}^{\top}- {d}_{\mu}^{\top} \\
+		&=0,
	\end{split}
 \end{equation}
-(\ref{rowsum}) and (\ref{columnsum}) show that the matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top}$ of
+where $e$ is the all-ones vector.
+Equations (\ref{rowsum}) and (\ref{columnsum}) show that the row sums of the matrix
+$\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top}$ are positive and its column sums are zero,
 so each row sum plus the corresponding column sum is positive.
 Moreover, its diagonal entries are positive and its off-diagonal entries are negative.
 Therefore, $\textbf{A}_{\text{VMETD}}$ is positive definite.
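The last step invokes a standard sufficient condition for positive definiteness: a real square matrix with positive diagonal entries, non-positive off-diagonal entries, and a positive value of each row sum plus the corresponding column sum has a strictly diagonally dominant symmetric part, and is therefore positive definite. The sketch below only illustrates this criterion; the matrix K in it is a small hypothetical example, not the actual VMETD key matrix.

# Minimal sketch of the sufficient condition cited above (hypothetical example matrix).
import numpy as np

def is_pd_by_sum_criterion(K: np.ndarray) -> bool:
    """Positive diagonal, non-positive off-diagonal,
    and (row sum + column sum) > 0 for every index."""
    off = K - np.diag(np.diag(K))
    if np.any(np.diag(K) <= 0) or np.any(off > 0):
        return False
    return bool(np.all(K.sum(axis=1) + K.sum(axis=0) > 0))

K = np.array([[ 1.0, -0.3, -0.2],
              [-0.4,  0.9, -0.1],
              [-0.1, -0.2,  0.8]])

print(is_pd_by_sum_criterion(K))              # True: the criterion is satisfied
print(np.linalg.eigvalsh(K + K.T).min() > 0)  # symmetric part is PD, so x^T K x > 0

Since x^T K x = x^T (K + K^T) x / 2, positive definiteness of the symmetric part is exactly positive definiteness of K itself.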
-	
 \end{proof}
-\begin{figure*}[htb]
-	\vskip 0.2in
-	\begin{center}
-		\subfigure[2-state counterexample]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/2-state.pdf}
-			\label{2-state}
-		}
-		\subfigure[7-state counterexample]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/7-state.pdf}
-			\label{7-state}
-		}
-		\subfigure[Maze]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/maze.pdf}
-			\label{MazeFull}
-		}\\
-		\subfigure[Cliff Walking]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/cl.pdf}
-			\label{CliffWalkingFull}
-		}
-		\subfigure[Mountain Car]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/mt.pdf}
-			\label{MountainCarFull}
-		}
-		\subfigure[Acrobot]{
-			\includegraphics[width=0.55\columnwidth, height=0.475\columnwidth]{main/pic/acrobot.pdf}
-			\label{AcrobotFull}
-		}
-		\caption{Learning curses of two evaluation environments and four contral environments.}
-		\label{Complete_full}
-	\end{center}
-	\vskip -0.2in
-	\end{figure*}
 \subsection{Optimal Policy Invariance}
 This section proves
-the optimal policy invariance of
+the optimal policy invariance of VMTD, VMTDC, and VMETD
 in control experiments,
 laying the groundwork for subsequent experiments.
@@ -518,7 +568,7 @@ true value and the predicted value, action $a_3$ is still chosen under
 the greedy policy. In contrast, supervised
 learning is usually used to predict temperature, humidity, morbidity,
 etc. If the bias is too large, the consequences could be serious.
-\begin{table}[t]
+\begin{table}[ht]
	\caption{Comparison of action selection with and without constant bias in $Q$ values.}
	\label{example_bias}
diff --git "a/\347\216\257\345\242\203\345\233\276\347\211\207/2-state.png" "b/\347\216\257\345\242\203\345\233\276\347\211\207/2-state.png"
new file mode 100644
index 0000000..0fd573e
Binary files /dev/null and "b/\347\216\257\345\242\203\345\233\276\347\211\207/2-state.png" differ
diff --git "a/\347\224\273\345\233\276.pptx" "b/\347\224\273\345\233\276.pptx"
new file mode 100644
index 0000000..0c1c92d
Binary files /dev/null and "b/\347\224\273\345\233\276.pptx" differ
diff --git "a/\350\256\272\346\226\207\350\215\211\347\250\277.txt" "b/\350\256\272\346\226\207\350\215\211\347\250\277.txt"
index 1141469..c664fc7 100644
--- "a/\350\256\272\346\226\207\350\215\211\347\250\277.txt"
+++ "b/\350\256\272\346\226\207\350\215\211\347\250\277.txt"
@@ -104,3 +104,29 @@ to,
 (1) Introduce the scalar parameter into more TD algorithms.
 For the control experiments, the results on Maze and Cliff Walking are similar: VMGQ outperforms GQ, EQ outperforms VMGQ, and VMEQ performs best.
 The results on Mountain Car and Acrobot are similar: VMGQ and VMEQ perform comparably and both outperform GQ and EQ. In short, for the control experiments, the VM algorithms outperform the non-VM algorithms.
+
+Next, we compute, in the 2-state environment, the minimum eigenvalues of the key matrix A of TD(0), TDC, and ETD under the on-policy and off-policy settings, respectively (a numerical sketch is appended below).
+If the matrix A is positive definite, the algorithm converges.
+
+First, we introduce the settings of the 2-state environment under the on-policy and off-policy cases.
+In the on-policy setting, the behavior policy is identical to the target policy; let A = B.
+
+To address the problem that the key matrix A_off of off-policy TD(0) is not positive definite,
+
+For convenience,
+
+In the 2-state environment, we run two kinds of experiments, an on-policy experiment and an off-policy experiment, to verify the relationship between an algorithm's convergence rate and the minimum eigenvalue of its key matrix.
+
+
+Figure A shows the learning curves of the on-policy 2-state policy-evaluation experiment. Under this setting, the convergence rates of TD, VMTD, TDC, and VMTDC decrease in that order, and Table 1 shows that the minimum eigenvalues of the key matrices of these four algorithms are all greater than 0 and likewise decrease in that order. The experimental curves match the values in the table.
+
+Figure B shows the learning curves of the off-policy 2-state policy-evaluation experiment. Under this setting, the convergence rates of ETD, VMETD, VMTD, VMTDC, and TDC decrease in that order, while TD diverges. Table 1 shows that the minimum eigenvalues of the key matrices of ETD, VMETD, VMTD, VMTDC, and TDC are all greater than 0 and decrease in that order, whereas the minimum eigenvalue of TD's key matrix is less than 0. The experimental curves match the values in the table. Pleasantly, although VMTD is only guaranteed to converge in the on-policy setting, it still converges in the off-policy 2-state environment. From VMTD's update rule, its update can be viewed as an adjustment and correction of the TD update; introducing the parameter omega stabilizes the variance of the gradient estimate and thus makes the update of theta more stable.
+
+
+Figures 1, 2, 3, and 4 show the learning curves of the four control experiments. All four exhibit a common pattern: VMEQ outperforms EQ, VMGQ outperforms GQ, VMQ outperforms Q-learning, and VMSarsa outperforms Sarsa. In the Maze and Cliff Walking experiments, VMEQ shows the best performance and the fastest convergence. In the Mountain Car and Acrobot experiments, the four VM algorithms perform almost identically and all outperform the other algorithms.
+
+
+Overall, in both the policy-evaluation and the control experiments, the VM algorithms perform well, and they stand out especially in the control experiments.
+
+In this paper,
+
diff --git "a/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-offpolicy.pdf" "b/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-offpolicy.pdf"
new file mode 100644
index 0000000..7bd91ce
Binary files /dev/null and "b/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-offpolicy.pdf" differ
diff --git "a/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-onpolicy.pdf" "b/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-onpolicy.pdf"
new file mode 100644
index 0000000..a010a5e
Binary files /dev/null and "b/\350\257\204\344\274\260\345\256\236\351\252\214\345\233\276/2-state-onpolicy.pdf" differ
-- 
libgit2 0.26.0
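The draft notes above mention computing the minimum eigenvalues of the key matrices of TD(0), TDC, and ETD in the 2-state environment. The sketch below is a hypothetical illustration only: it uses the classic two-state off-policy counterexample with a single feature (phi(s1) = 1, phi(s2) = 2), a target policy that always moves to s2, and a uniform behavior policy; the paper's actual 2-state setup, features, and policies may differ.

# Hypothetical illustration (not the paper's environment): minimum eigenvalues of the
# symmetric parts of the key matrices of TD(0), TDC, and ETD in a two-state off-policy
# counterexample. A positive minimum eigenvalue is the convergence criterion discussed above.
import numpy as np

gamma = 0.9
Phi = np.array([[1.0], [2.0]])                    # one feature per state
P_pi = np.array([[0.0, 1.0], [0.0, 1.0]])         # target policy: always go to s2
d_mu = np.array([0.5, 0.5])                       # behavior visits both states equally
D = np.diag(d_mu)

A_td = Phi.T @ D @ (np.eye(2) - gamma * P_pi) @ Phi
C = Phi.T @ D @ Phi
A_tdc = A_td.T @ np.linalg.inv(C) @ A_td
f = np.linalg.solve(np.eye(2) - gamma * P_pi.T, d_mu)   # expected followon weights
A_etd = Phi.T @ np.diag(f) @ (np.eye(2) - gamma * P_pi) @ Phi

def min_eig(A):
    return np.linalg.eigvalsh(0.5 * (A + A.T)).min()

for name, A in [("TD(0)", A_td), ("TDC", A_tdc), ("ETD", A_etd)]:
    print(name, min_eig(A))   # TD(0) is negative (divergence); TDC and ETD are positive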