新版

60bf36cc · GongYu · 9c95a8b3 · 60bf36cc · 60bf36cc · 60bf36cc
Commit 60bf36cc authored Aug 14, 2024 by GongYu
38 changed files
--- a/AAAI控制实验图/acrobot.pdf
+++ b/AAAI控制实验图/acrobot.pdf
--- a/AAAI控制实验图/cl.pdf
+++ b/AAAI控制实验图/cl.pdf
--- a/AAAI控制实验图/maze.pdf
+++ b/AAAI控制实验图/maze.pdf
--- a/AAAI控制实验图/mt.pdf
+++ b/AAAI控制实验图/mt.pdf
--- a/Apendix/anonymous-submission-latex-2024.aux
+++ b/Apendix/anonymous-submission-latex-2024.aux
 \relax 
 \bibstyle{aaai24}
+\citation{borkar1997stochastic}
+\citation{hirsch1989convergent}
+\citation{borkar2000ode}
+\citation{borkar2000ode}
+\citation{borkar2000ode}
+\newlabel{proofth1}{{A.1}{1}}
+\newlabel{th1proof}{{A.1}{1}}
+\newlabel{thetaFast}{{A-1}{1}}
+\newlabel{omegaFast}{{A-2}{1}}
+\newlabel{omegaFastFinal}{{A-3}{1}}
+\newlabel{omegaInfty}{{A-4}{1}}
 \citation{sutton2009fast}
+\newlabel{odetheta}{{A-5}{2}}
+\newlabel{covariance}{{A-6}{2}}
+\newlabel{odethetafinal}{{A-7}{2}}
+\newlabel{proofth2}{{A.2}{2}}
 \citation{hirsch1989convergent}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
 \citation{hirsch1989convergent}
-\newlabel{proofth2}{{A.1}{1}}
-\newlabel{thetavmtdcFastest}{{A-1}{1}}
-\newlabel{uvmtdcFastest}{{A-2}{1}}
-\newlabel{omegavmtdcFastest}{{A-3}{1}}
-\newlabel{omegavmtdcFastestFinal}{{A-4}{1}}
-\newlabel{omegavmtdcInfty}{{A-5}{1}}
-\newlabel{thetavmtdcFaster}{{A-6}{1}}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
+\newlabel{thetavmtdcFastest}{{A-8}{3}}
+\newlabel{uvmtdcFastest}{{A-9}{3}}
+\newlabel{omegavmtdcFastest}{{A-10}{3}}
+\newlabel{omegavmtdcFastestFinal}{{A-11}{3}}
+\newlabel{omegavmtdcInfty}{{A-12}{3}}
+\newlabel{thetavmtdcFaster}{{A-13}{3}}
+\newlabel{uvmtdcFaster}{{A-14}{3}}
+\newlabel{uvmtdcFasterFinal}{{A-15}{3}}
+\newlabel{uvmtdcInfty}{{A-16}{3}}
 \citation{borkar1997stochastic}
-\newlabel{uvmtdcFaster}{{A-7}{2}}
-\newlabel{uvmtdcFasterFinal}{{A-8}{2}}
-\newlabel{uvmtdcInfty}{{A-9}{2}}
-\newlabel{thetavmtdcSlowerFinal}{{A-11}{2}}
-\newlabel{odethetavmtdcfinal}{{A-12}{2}}
 \citation{hirsch1989convergent}
+\newlabel{thetavmtdcSlowerFinal}{{A-18}{4}}
+\newlabel{odethetavmtdcfinal}{{A-19}{4}}
+\newlabel{proofVMETD}{{A.3}{4}}
+\newlabel{th1proof}{{A.3}{4}}
+\newlabel{thetaFast}{{A-20}{4}}
+\newlabel{omegaFast}{{A-21}{4}}
+\newlabel{omegaFastFinal}{{A-22}{4}}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
 \citation{borkar2000ode}
-\newlabel{proofVMETD}{{A.2}{3}}
+\newlabel{omegaInfty}{{A-23}{5}}
-\newlabel{th1proof}{{A.2}{3}}
+\newlabel{odetheta}{{A-24}{5}}
-\newlabel{thetaFast}{{A-13}{3}}
-\newlabel{omegaFast}{{A-14}{3}}
-\newlabel{omegaFastFinal}{{A-15}{3}}
-\newlabel{omegaInfty}{{A-16}{3}}
 \citation{sutton2016emphatic}
-\newlabel{odetheta}{{A-17}{4}}
+\newlabel{rowsum}{{A-27}{6}}
-\newlabel{rowsum}{{A-20}{4}}
+\newlabel{columnsum}{{A-28}{6}}
-\citation{baird1995residual,sutton2009fast}
+\newlabel{odethetafinal}{{A-29}{6}}
-\citation{baird1995residual,sutton2009fast,maei2011gradient}
+\newlabel{experimentaldetails}{{B}{6}}
-\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
-\newlabel{bairdexample}{{1}{5}}
-\newlabel{columnsum}{{A-21}{5}}
-\newlabel{odethetafinal}{{A-22}{5}}
-\newlabel{experimentaldetails}{{B}{5}}
 \bibdata{aaai24}
-\bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}}
+\bibcite{borkar1997stochastic}{{1}{1997}{{Borkar}}{{}}}
-\bibcite{borkar1997stochastic}{{2}{1997}{{Borkar}}{{}}}
+\bibcite{borkar2000ode}{{2}{2000}{{Borkar and Meyn}}{{}}}
-\bibcite{borkar2000ode}{{3}{2000}{{Borkar and Meyn}}{{}}}
+\bibcite{hirsch1989convergent}{{3}{1989}{{Hirsch}}{{}}}
-\bibcite{hirsch1989convergent}{{4}{1989}{{Hirsch}}{{}}}
+\bibcite{sutton2009fast}{{4}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
-\bibcite{maei2011gradient}{{5}{2011}{{Maei}}{{}}}
+\bibcite{sutton2016emphatic}{{5}{2016}{{Sutton, Mahmood, and White}}{{}}}
-\bibcite{sutton2009fast}{{6}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
+\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
-\bibcite{sutton2016emphatic}{{7}{2016}{{Sutton, Mahmood, and White}}{{}}}
+\newlabel{lrofways}{{1}{7}}
-\newlabel{lrofways}{{1}{6}}
+\gdef \@abspage@last{7}
-\gdef \@abspage@last{6}
--- a/Apendix/anonymous-submission-latex-2024.bbl
+++ b/Apendix/anonymous-submission-latex-2024.bbl
-\begin{thebibliography}{7}
+\begin{thebibliography}{5}
 \providecommand{\natexlab}[1]{#1}
-\bibitem[{Baird et~al.(1995)}]{baird1995residual}
-Baird, L.; et~al. 1995.
-\newblock Residual algorithms: Reinforcement learning with function approximation.
-\newblock In \emph{Proc. 12th Int. Conf. Mach. Learn.}, 30--37.
 \bibitem[{Borkar(1997)}]{borkar1997stochastic}
 Borkar, V.~S. 1997.
 \newblock Stochastic approximation with two time scales.
@@ -21,11 +16,6 @@ Hirsch, M.~W. 1989.
 \newblock Convergent activation dynamics in continuous time networks.
 \newblock \emph{Neural Netw.}, 2(5): 331--349.
-\bibitem[{Maei(2011)}]{maei2011gradient}
-Maei, H.~R. 2011.
-\newblock \emph{Gradient temporal-difference learning algorithms}.
-\newblock Ph.D. thesis, University of Alberta.
 \bibitem[{Sutton et~al.(2009)Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}]{sutton2009fast}
 Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesv{\'a}ri, C.; and Wiewiora, E. 2009.
 \newblock Fast gradient-descent methods for temporal-difference learning with linear function approximation.

--- a/Apendix/anonymous-submission-latex-2024.blg
+++ b/Apendix/anonymous-submission-latex-2024.blg
@@ -3,44 +3,44 @@ Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
 The top-level auxiliary file: anonymous-submission-latex-2024.aux
 The style file: aaai24.bst
 Database file #1: aaai24.bib
-You've used 7 entries,
+You've used 5 entries,
            2840 wiz_defined-function locations,
-            630 strings with 5707 characters,
+            619 strings with 5446 characters,
-and the built_in function-call counts, 4424 in all, are:
+and the built_in function-call counts, 3370 in all, are:
-= -- 372
+= -- 277
-> -- 189
+> -- 153
 < -- 0
-+ -- 74
+ -- 60
- -- 64
+- -- 52
-* -- 295
+* -- 242
-:= -- 731
+:= -- 547
-add.period$ -- 28
+add.period$ -- 20
-call.type$ -- 7
+call.type$ -- 5
-change.case$ -- 49
+change.case$ -- 36
-chr.to.int$ -- 8
+chr.to.int$ -- 6
-cite$ -- 7
+cite$ -- 5
-duplicate$ -- 302
+duplicate$ -- 223
-empty$ -- 320
+empty$ -- 240
-format.name$ -- 75
+format.name$ -- 60
-if$ -- 861
+if$ -- 649
 int.to.chr$ -- 1
 int.to.str$ -- 1
-missing$ -- 63
+missing$ -- 49
-newline$ -- 39
+newline$ -- 29
-num.names$ -- 28
+num.names$ -- 20
-pop$ -- 125
+pop$ -- 92
 preamble$ -- 1
-purify$ -- 45
+purify$ -- 34
 quote$ -- 0
-skip$ -- 134
+skip$ -- 96
 stack$ -- 0
-substring$ -- 246
+substring$ -- 200
-swap$ -- 160
+swap$ -- 128
 text.length$ -- 0
 text.prefix$ -- 0
 top$ -- 0
-type$ -- 63
+type$ -- 45
 warning$ -- 0
-while$ -- 42
+while$ -- 31
 width$ -- 0
-write$ -- 94
+write$ -- 68
--- a/Apendix/anonymous-submission-latex-2024.log
+++ b/Apendix/anonymous-submission-latex-2024.log
-This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31)  12 AUG 2024 17:11
+This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2023.3.31)  14 AUG 2024 06:25
 entering extended mode
 restricted \write18 enabled.
 file:line:error style messages enabled.
@@ -582,7 +582,29 @@ File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm.
 File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX)
 \l__color_backend_stack_int=\count335
 \l__pdf_internal_box=\box82
-) (./anonymous-submission-latex-2024.aux)
+) (./anonymous-submission-latex-2024.aux
+LaTeX Warning: Label `th1proof' multiply defined.
+LaTeX Warning: Label `thetaFast' multiply defined.
+LaTeX Warning: Label `omegaFast' multiply defined.
+LaTeX Warning: Label `omegaFastFinal' multiply defined.
+LaTeX Warning: Label `omegaInfty' multiply defined.
+LaTeX Warning: Label `odetheta' multiply defined.
+LaTeX Warning: Label `odethetafinal' multiply defined.
+)
 \openout1 = `anonymous-submission-latex-2024.aux'.
 LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 183.
@@ -627,41 +649,46 @@ Package caption Info: listings package is loaded.
 Package caption Info: End \AtBeginDocument code.
 Package newfloat Info: `float' package detected.
 \c@lstlisting=\count342
-LaTeX Font Info:    Trying to load font information for U+msa on input line 234.
+LaTeX Font Info:    Trying to load font information for U+msa on input line 206.
 (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsa.fd
 File: umsa.fd 2013/01/14 v3.01 AMS symbols A
 )
-LaTeX Font Info:    Trying to load font information for U+msb on input line 234.
+LaTeX Font Info:    Trying to load font information for U+msb on input line 206.
 (d:/software/texlive/2023/texmf-dist/tex/latex/amsfonts/umsb.fd
 File: umsb.fd 2013/01/14 v3.01 AMS symbols B
 )
-LaTeX Font Info:    Trying to load font information for U+esvect on input line 234.
+LaTeX Font Info:    Trying to load font information for U+esvect on input line 206.
 (d:/software/texlive/2023/texmf-dist/tex/latex/esvect/uesvect.fd
 File: uesvect.fd 
-) [1
+)
+LaTeX Warning: Reference `omega' on page 1 undefined on input line 265.
+[1
+{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}] [2] [3] [4] [5] [6] (./anonymous-submission-latex-2024.bbl) [7] (./anonymous-submission-latex-2024.aux)
+LaTeX Warning: There were undefined references.
+LaTeX Warning: There were multiply-defined labels.
-{d:/software/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}{d:/software/texlive/2023/texmf-dist/fonts/enc/dvips/base/8r.enc}] [2] [3] [4] (./pic/BairdExample.tex)
+ ) 
-<pic/maze_13_13.pdf, id=34, 493.1646pt x 387.62602pt>
-File: pic/maze_13_13.pdf Graphic file (type pdf)
-<use pic/maze_13_13.pdf>
-Package pdftex.def Info: pic/maze_13_13.pdf  used on input line 902.
-(pdftex.def)             Requested size: 172.61018pt x 135.67113pt.
- [5] (./anonymous-submission-latex-2024.bbl) [6 <./pic/maze_13_13.pdf>] (./anonymous-submission-latex-2024.aux) ) 
 Here is how much of TeX's memory you used:
- 22926 strings out of 476025
+ 22606 strings out of 476025
- 482831 string characters out of 5789524
+ 476412 string characters out of 5789524
- 1878382 words of memory out of 5000000
+ 1879382 words of memory out of 5000000
- 43000 multiletter control sequences out of 15000+600000
+ 42668 multiletter control sequences out of 15000+600000
- 531474 words of font info for 71 fonts, out of 8000000 for 9000
+ 539762 words of font info for 95 fonts, out of 8000000 for 9000
 1141 hyphenation exceptions out of 8191
- 84i,22n,89p,423b,789s stack positions out of 10000i,1000n,20000p,200000b,200000s
+ 84i,22n,89p,423b,526s stack positions out of 10000i,1000n,20000p,200000b,200000s
-<d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmib10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cmextra/cmmib7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/symbols/msbm10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmb8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmr8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmri8a.pfb>
+<d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmex10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi9.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr9.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy5.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy6.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy7.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/symbols/msbm10.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmb8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmr8a.pfb><d:/software/texlive/2023/texmf-dist/fonts/type1/urw/times/utmri8a.pfb>
-Output written on anonymous-submission-latex-2024.pdf (6 pages, 200712 bytes).
+Output written on anonymous-submission-latex-2024.pdf (7 pages, 208835 bytes).
 PDF statistics:
- 110 PDF objects out of 1000 (max. 8388607)
+ 117 PDF objects out of 1000 (max. 8388607)
- 68 compressed objects within 1 object stream
+ 73 compressed objects within 1 object stream
 0 named destinations out of 1000 (max. 500000)
- 18 words of extra memory for PDF output out of 10000 (max. 10000000)
+ 13 words of extra memory for PDF output out of 10000 (max. 10000000)
--- a/Apendix/anonymous-submission-latex-2024.pdf
+++ b/Apendix/anonymous-submission-latex-2024.pdf
--- a/Apendix/anonymous-submission-latex-2024.synctex.gz
+++ b/Apendix/anonymous-submission-latex-2024.synctex.gz
--- a/Apendix/anonymous-submission-latex-2024.tex
+++ b/Apendix/anonymous-submission-latex-2024.tex
--- a/NEW_aaai/anonymous-submission-latex-2025.aux
+++ b/NEW_aaai/anonymous-submission-latex-2025.aux
@@ -26,81 +26,69 @@
 \newlabel{introduction}{{}{1}}
 \citation{Sutton2018book}
 \citation{Sutton2018book}
-\citation{sutton2016emphatic}
 \newlabel{preliminaries}{{}{2}}
 \newlabel{valuefunction}{{}{2}}
 \newlabel{linearvaluefunction}{{1}{2}}
-\newlabel{thetatd_onpolicy}{{}{2}}
-\newlabel{thetatd_offpolicy}{{}{2}}
-\newlabel{thetatdc}{{}{3}}
-\newlabel{utdc}{{}{3}}
-\newlabel{fvmetd}{{2}{3}}
-\newlabel{thetaetd}{{}{3}}
 \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
-\newlabel{alg:algorithm 2}{{1}{3}}
+\newlabel{tab:min_eigenvalues}{{1}{3}}
-\newlabel{alg:algorithm 5}{{2}{4}}
+\newlabel{delta}{{3}{3}}
-\newlabel{thetavmtdc}{{5}{4}}
+\newlabel{omega}{{4}{3}}
-\newlabel{uvmtdc}{{6}{4}}
+\newlabel{theta}{{5}{3}}
-\newlabel{omegavmtdc}{{7}{4}}
+\newlabel{thetavmtdc}{{8}{4}}
-\newlabel{rho_VPBE}{{8}{4}}
+\newlabel{uvmtdc}{{9}{4}}
+\newlabel{omegavmtdc}{{10}{4}}
+\newlabel{fvmetd}{{11}{4}}
 \newlabel{thetavmetd}{{12}{4}}
 \newlabel{omegavmetd}{{13}{4}}
+\citation{borkar1997stochastic}
 \citation{sutton2009fast}
-\citation{hirsch1989convergent}
-\newlabel{theorem2}{{1}{5}}
-\newlabel{thetavmtdcFastest}{{14}{5}}
-\newlabel{uvmtdcFastest}{{15}{5}}
-\newlabel{omegavmtdcFastest}{{16}{5}}
-\newlabel{omegavmtdcFastestFinal}{{17}{5}}
-\newlabel{omegavmtdcInfty}{{18}{5}}
-\citation{borkar2000ode}
-\citation{borkar2000ode}
-\citation{borkar2000ode}
 \citation{borkar1997stochastic}
+\newlabel{theorem1}{{1}{5}}
+\newlabel{th1proof}{{}{5}}
+\newlabel{covariance}{{14}{5}}
+\newlabel{theorem2}{{2}{5}}
+\newlabel{theorem3}{{3}{5}}
+\newlabel{rowsum}{{15}{5}}
+\newlabel{columnsum}{{16}{5}}
 \citation{ng1999policy}
 \citation{devlin2012dynamic}
-\newlabel{theorem3}{{2}{6}}
-\newlabel{rowsum}{{19}{6}}
 \newlabel{example_bias}{{2}{6}}
-\newlabel{columnsum}{{20}{6}}
 \bibdata{aaai25}
 \bibcite{baird1995residual}{{1}{1995}{{Baird et~al.}}{{}}}
-\newlabel{2-state}{{1(a)}{7}}
+\newlabel{2-state}{{3(a)}{7}}
 \newlabel{sub@2-state}{{(a)}{7}}
-\newlabel{7-state}{{1(b)}{7}}
+\newlabel{7-state}{{3(b)}{7}}
 \newlabel{sub@7-state}{{(b)}{7}}
-\newlabel{MazeFull}{{1(c)}{7}}
+\newlabel{MazeFull}{{3(c)}{7}}
 \newlabel{sub@MazeFull}{{(c)}{7}}
-\newlabel{CliffWalkingFull}{{1(d)}{7}}
+\newlabel{CliffWalkingFull}{{3(d)}{7}}
 \newlabel{sub@CliffWalkingFull}{{(d)}{7}}
-\newlabel{MountainCarFull}{{1(e)}{7}}
+\newlabel{MountainCarFull}{{3(e)}{7}}
 \newlabel{sub@MountainCarFull}{{(e)}{7}}
-\newlabel{AcrobotFull}{{1(f)}{7}}
+\newlabel{AcrobotFull}{{3(f)}{7}}
 \newlabel{sub@AcrobotFull}{{(f)}{7}}
-\newlabel{Complete_full}{{1}{7}}
+\newlabel{Complete_full}{{3}{7}}
 \bibcite{basserrano2021logistic}{{2}{2021}{{Bas-Serrano et~al.}}{{Bas-Serrano, Curi, Krause, and Neu}}}
 \bibcite{borkar1997stochastic}{{3}{1997}{{Borkar}}{{}}}
-\bibcite{borkar2000ode}{{4}{2000}{{Borkar and Meyn}}{{}}}
+\bibcite{chen2023modified}{{4}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}}
-\bibcite{chen2023modified}{{5}{2023}{{Chen et~al.}}{{Chen, Ma, Li, Yang, Yang, and Gao}}}
+\bibcite{devlin2012dynamic}{{5}{2012}{{Devlin and Kudenko}}{{}}}
-\bibcite{devlin2012dynamic}{{6}{2012}{{Devlin and Kudenko}}{{}}}
+\bibcite{feng2019kernel}{{6}{2019}{{Feng, Li, and Liu}}{{}}}
-\bibcite{feng2019kernel}{{7}{2019}{{Feng, Li, and Liu}}{{}}}
+\bibcite{givchi2015quasi}{{7}{2015}{{Givchi and Palhang}}{{}}}
-\bibcite{givchi2015quasi}{{8}{2015}{{Givchi and Palhang}}{{}}}
+\bibcite{hackman2012faster}{{8}{2012}{{Hackman}}{{}}}
-\bibcite{hackman2012faster}{{9}{2012}{{Hackman}}{{}}}
+\bibcite{hallak2016generalized}{{9}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}}
-\bibcite{hallak2016generalized}{{10}{2016}{{Hallak et~al.}}{{Hallak, Tamar, Munos, and Mannor}}}
+\bibcite{johnson2013accelerating}{{10}{2013}{{Johnson and Zhang}}{{}}}
-\bibcite{hirsch1989convergent}{{11}{1989}{{Hirsch}}{{}}}
+\bibcite{korda2015td}{{11}{2015}{{Korda and La}}{{}}}
-\bibcite{johnson2013accelerating}{{12}{2013}{{Johnson and Zhang}}{{}}}
+\bibcite{liu2018proximal}{{12}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}}
-\bibcite{korda2015td}{{13}{2015}{{Korda and La}}{{}}}
+\bibcite{liu2015finite}{{13}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
-\bibcite{liu2018proximal}{{14}{2018}{{Liu et~al.}}{{Liu, Gemp, Ghavamzadeh, Liu, Mahadevan, and Petrik}}}
+\bibcite{liu2016proximal}{{14}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
-\bibcite{liu2015finite}{{15}{2015}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
+\bibcite{ng1999policy}{{15}{1999}{{Ng, Harada, and Russell}}{{}}}
-\bibcite{liu2016proximal}{{16}{2016}{{Liu et~al.}}{{Liu, Liu, Ghavamzadeh, Mahadevan, and Petrik}}}
+\bibcite{pan2017accelerated}{{16}{2017}{{Pan, White, and White}}{{}}}
-\bibcite{ng1999policy}{{17}{1999}{{Ng, Harada, and Russell}}{{}}}
+\bibcite{sutton2009fast}{{17}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
-\bibcite{pan2017accelerated}{{18}{2017}{{Pan, White, and White}}{{}}}
+\bibcite{sutton1988learning}{{18}{1988}{{Sutton}}{{}}}
-\bibcite{sutton2009fast}{{19}{2009}{{Sutton et~al.}}{{Sutton, Maei, Precup, Bhatnagar, Silver, Szepesv{\'a}ri, and Wiewiora}}}
+\bibcite{Sutton2018book}{{19}{2018}{{Sutton and Barto}}{{}}}
-\bibcite{sutton1988learning}{{20}{1988}{{Sutton}}{{}}}
+\bibcite{sutton2008convergent}{{20}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}}
-\bibcite{Sutton2018book}{{21}{2018}{{Sutton and Barto}}{{}}}
+\bibcite{sutton2016emphatic}{{21}{2016}{{Sutton, Mahmood, and White}}{{}}}
-\bibcite{sutton2008convergent}{{22}{2008}{{Sutton, Maei, and Szepesv{\'a}ri}}{{}}}
+\bibcite{tsitsiklis1997analysis}{{22}{1997}{{Tsitsiklis and Van~Roy}}{{}}}
-\bibcite{sutton2016emphatic}{{23}{2016}{{Sutton, Mahmood, and White}}{{}}}
+\bibcite{xu2019reanalysis}{{23}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
-\bibcite{tsitsiklis1997analysis}{{24}{1997}{{Tsitsiklis and Van~Roy}}{{}}}
+\bibcite{zhang2022truncated}{{24}{2022}{{Zhang and Whiteson}}{{}}}
-\bibcite{xu2019reanalysis}{{25}{2019}{{Xu et~al.}}{{Xu, Wang, Zhou, and Liang}}}
-\bibcite{zhang2022truncated}{{26}{2022}{{Zhang and Whiteson}}{{}}}
 \gdef \@abspage@last{8}
--- a/NEW_aaai/anonymous-submission-latex-2025.bbl
+++ b/NEW_aaai/anonymous-submission-latex-2025.bbl
-\begin{thebibliography}{26}
+\begin{thebibliography}{24}
 \providecommand{\natexlab}[1]{#1}
 \bibitem[{Baird et~al.(1995)}]{baird1995residual}
@@ -16,11 +16,6 @@ Borkar, V.~S. 1997.
 \newblock Stochastic approximation with two time scales.
 \newblock \emph{Syst. \& Control Letters}, 29(5): 291--294.
-\bibitem[{Borkar and Meyn(2000)}]{borkar2000ode}
-Borkar, V.~S.; and Meyn, S.~P. 2000.
-\newblock The ODE method for convergence of stochastic approximation and reinforcement learning.
-\newblock \emph{SIAM J. Control Optim.}, 38(2): 447--469.
 \bibitem[{Chen et~al.(2023)Chen, Ma, Li, Yang, Yang, and Gao}]{chen2023modified}
 Chen, X.; Ma, X.; Li, Y.; Yang, G.; Yang, S.; and Gao, Y. 2023.
 \newblock Modified Retrace for Off-Policy Temporal Difference Learning.
@@ -51,11 +46,6 @@ Hallak, A.; Tamar, A.; Munos, R.; and Mannor, S. 2016.
 \newblock Generalized emphatic temporal difference learning: bias-variance analysis.
 \newblock In \emph{Proceedings of the 30th AAAI Conference on Artificial Intelligence}, 1631--1637.
-\bibitem[{Hirsch(1989)}]{hirsch1989convergent}
-Hirsch, M.~W. 1989.
-\newblock Convergent activation dynamics in continuous time networks.
-\newblock \emph{Neural Netw.}, 2(5): 331--349.
 \bibitem[{Johnson and Zhang(2013)}]{johnson2013accelerating}
 Johnson, R.; and Zhang, T. 2013.
 \newblock Accelerating stochastic gradient descent using predictive variance reduction.

--- a/NEW_aaai/anonymous-submission-latex-2025.blg
+++ b/NEW_aaai/anonymous-submission-latex-2025.blg
@@ -3,44 +3,44 @@ Capacity: max_strings=200000, hash_size=200000, hash_prime=170003
 The top-level auxiliary file: anonymous-submission-latex-2025.aux
 The style file: aaai25.bst
 Database file #1: aaai25.bib
-You've used 26 entries,
+You've used 24 entries,
            2840 wiz_defined-function locations,
-            737 strings with 9168 characters,
+            723 strings with 8880 characters,
-and the built_in function-call counts, 19179 in all, are:
+and the built_in function-call counts, 18055 in all, are:
-= -- 1644
+= -- 1547
-> -- 870
+> -- 832
 < -- 0
-+ -- 321
+ -- 305
- -- 288
+- -- 276
-* -- 1273
+* -- 1196
-:= -- 2961
+:= -- 2777
-add.period$ -- 107
+add.period$ -- 99
-call.type$ -- 26
+call.type$ -- 24
-change.case$ -- 217
+change.case$ -- 206
-chr.to.int$ -- 27
+chr.to.int$ -- 25
-cite$ -- 26
+cite$ -- 24
-duplicate$ -- 1316
+duplicate$ -- 1237
-empty$ -- 1372
+empty$ -- 1285
-format.name$ -- 353
+format.name$ -- 338
-if$ -- 3900
+if$ -- 3685
 int.to.chr$ -- 1
 int.to.str$ -- 1
-missing$ -- 261
+missing$ -- 244
-newline$ -- 134
+newline$ -- 124
-num.names$ -- 104
+num.names$ -- 96
-pop$ -- 614
+pop$ -- 586
 preamble$ -- 1
-purify$ -- 182
+purify$ -- 171
 quote$ -- 0
-skip$ -- 694
+skip$ -- 664
 stack$ -- 0
-substring$ -- 1043
+substring$ -- 969
-swap$ -- 703
+swap$ -- 658
 text.length$ -- 0
 text.prefix$ -- 0
 top$ -- 0
-type$ -- 231
+type$ -- 213
 warning$ -- 0
-while$ -- 166
+while$ -- 154
 width$ -- 0
-write$ -- 343
+write$ -- 317
--- a/NEW_aaai/anonymous-submission-latex-2025.log
+++ b/NEW_aaai/anonymous-submission-latex-2025.log
--- a/NEW_aaai/anonymous-submission-latex-2025.pdf
+++ b/NEW_aaai/anonymous-submission-latex-2025.pdf
--- a/NEW_aaai/anonymous-submission-latex-2025.synctex.gz
+++ b/NEW_aaai/anonymous-submission-latex-2025.synctex.gz
--- a/NEW_aaai/anonymous-submission-latex-2025.tex
+++ b/NEW_aaai/anonymous-submission-latex-2025.tex
@@ -119,7 +119,7 @@
 % nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while
 % articles, conjunctions, and prepositions are lower case unless they
 % directly follow a colon or long dash
-\title{A Variance Minimization Approach to  Off-policy Temporal-Difference Learning}
+\title{A Variance Minimization Approach to Temporal-Difference Learning}
 \author{
    %Authors
    % All authors must be in the same font size and format.
@@ -194,16 +194,31 @@
 \maketitle
 % \setcounter{theorem}{0}
 \begin{abstract}
-    In this paper, we introduce the concept of improving the performance of parametric 
+    % In this paper, we introduce the concept of improving the performance of parametric 
-    Temporal-Difference (TD) learning algorithms by the Variance Minimization (VM) parameter, $\omega$, 
+    % Temporal-Difference (TD) learning algorithms by the Variance Minimization (VM) parameter, $\omega$, 
-    which is dynamically updated at each time step. Specifically, we incorporate the VM parameter into off-policy linear algorithms such as TDC and ETD, resulting in the 
+    % which is dynamically updated at each time step. Specifically, we incorporate the VM parameter into off-policy linear algorithms such as TDC and ETD, resulting in the 
-    Variance Minimization TDC (VMTDC) algorithm and the Variance Minimization ETD (VMETD) algorithm. In the two-state counterexample, 
+    % Variance Minimization TDC (VMTDC) algorithm and the Variance Minimization ETD (VMETD) algorithm. In the two-state counterexample, 
+    % we analyze 
+    % the convergence speed of these algorithms by calculating the minimum eigenvalue of the key 
+    % matrices and find that the VMTDC algorithm converges faster than TDC, while VMETD is more stable in convergence than ETD
+    %  through the 
+    % experiment.In controlled experiments, the VM algorithms demonstrate 
+    % superior performance.
+    Under certain conditions, the larger the smallest 
+    eigenvalue of the key matrix of an algorithm, the 
+    faster the algorithm converges. By observation, most 
+    current objective functions aim to minimize error. 
+    Therefore, in this paper, we propose two new objective 
+    functions and derive three Variance Minimization (VM) algorithms, including VMTD, VMTDC and VMETD.
+    A scalar parameter, $\omega$, is introduced to improve the performance of parametric 
+    Temporal-Difference (TD) learning algorithms.
+    In the two-state policy evaluation experiment, 
    we analyze 
    the convergence speed of these algorithms by calculating the minimum eigenvalue of the key 
-    matrices and find that the VMTDC algorithm converges faster than TDC, while VMETD is more stable in convergence than ETD
+    matrices in both the on-policy and off-policy settings. In control experiments, the VM algorithms demonstrate 
-     through the 
-    experiment.In controlled experiments, the VM algorithms demonstrate 
    superior performance.
 \end{abstract}
 % Uncomment the following to link to your code, datasets, an extended version or similar.

--- a/NEW_aaai/main/conclusion.tex
+++ b/NEW_aaai/main/conclusion.tex
 \section{Conclusion and Future Work}
+% Value-based reinforcement learning typically aims 
+% to minimize error as an optimization objective. 
+% As an alternation, this study proposes new objective 
+% functions: VBE and VPBE, and derives many variance minimization algorithms, including VMTD, 
+% VMTDC and VMETD. 
+% All algorithms demonstrated superior performance in policy 
+% evaluation and control experiments.
+% Future work may include, but are not limited
+% to, (1) analysis of the convergence rate of VMTDC and VMETD. 
+% (2) extensions of VBE and VPBE to multi-step returns. 
+% (3) extensions to nonlinear approximations, such as neural networks. 
 Value-based reinforcement learning typically aims 
 to minimize error as an optimization objective. 
-As an alternation, this study proposes new objective 
+As an alternative, this study proposes two new objective 
-functions: VBE and VPBE, and derives many variance minimization algorithms, including VMTD, 
+functions: VBE and VPBE, and derives an on-policy algorithm: 
-VMTDC and VMETD. 
+VMTD and two off-policy algorithms: VMTDC and VMETD. 
+% The VMTD algorithm 
+% is essentially an adjustment or correction to the traditional 
+% TD update. 
+%  Both 
+% algorithms are capable of stabilizing gradient estimation, reducing 
+% the variance of gradient estimation and accelerating convergence.
 All algorithms demonstrated superior performance in policy 
 evaluation and control experiments.
+% Both algorithms demonstrated superior performance in policy 
+% evaluation and control experiments.
 Future work may include, but is not limited
-to, (1) analysis of the convergence rate of VMTDC and VMETD. 
+to, 
-(2) extensions of VBE and VPBE to multi-step returns. 
+\begin{itemize}
-(3) extensions to nonlinear approximations, such as neural networks. 
+    \item analysis of the convergence rate of VMTDC and VMETD.
\ No newline at end of file
+    \item extensions of VBE and VPBE to multi-step returns. 
+    \item extensions to nonlinear approximations, such as neural networks. 
+\end{itemize}
\ No newline at end of file
--- a/NEW_aaai/main/experiment.tex
+++ b/NEW_aaai/main/experiment.tex
+% \subsection{Testing Tasks}
+\begin{figure}[h]
+    \centering
+    \includegraphics[scale=0.2]{main/pic/maze_13_13.pdf} 
+    \caption{Maze.}
+    \end{figure}
+\begin{figure*}[tb]
+    \vskip 0.2in
+    \begin{center}
+    \subfigure[on-policy 2-state]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/2-state-onpolicy.pdf}
+        \label{2-state}
+    }
+    \subfigure[off-policy 2-state]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/2-state-offpolicy.pdf}
+        \label{7-state}
+    }
+    \subfigure[Maze]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/maze.pdf}
+        \label{MazeFull}
+    }\\
+    \subfigure[Cliff Walking]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/cl.pdf}
+        \label{CliffWalkingFull}
+    }
+    \subfigure[Mountain Car]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/mt.pdf}
+        \label{MountainCarFull}
+    }
+    \subfigure[Acrobot]{
+        \includegraphics[width=0.65\columnwidth, height=0.58\columnwidth]{main/pic/acrobot.pdf}
+        \label{AcrobotFull}
+    }
+        \caption{Learning curves of one evaluation environment and four control environments.}
+        \label{Complete_full}
+    \end{center}
+    \vskip -0.2in
+  \end{figure*}
 \section{Experimental Studies}
 This section assesses algorithm performance through experiments, 
 which are divided into policy evaluation experiments and control experiments.
-The control algorithms for TDC, ETD, VMTDC, and VMETD are named GQ, EQ, VMGQ, and VMEQ, respectively.
+The evaluation experimental environment is the 2-state environment. 
-The evaluation experimental environments are the 2-state and 7-state counterexample. 
+In a 2-state environment, we conducted two types of experiments—on-policy 
+and off-policy—to verify the relationship between the convergence speed of 
+the algorithm and the smallest eigenvalue of the key matrix $\textbf{A}$.
+Control experiments, by allowing the algorithm to interact 
+with the environment to optimize the policy, can evaluate its 
+performance in learning the optimal policy. This provides a more 
+comprehensive assessment of the algorithm's overall capabilities.
 The control experimental environments are Maze, CliffWalking-v0, MountainCar-v0, and Acrobot-v1.
+The control algorithms for TDC, ETD, VMTDC, and VMETD are named GQ, EQ, VMGQ, and VMEQ, respectively.
+For TD and VMTD control algorithms, there are two variants each: Sarsa and Q-learning for TD, and VMSarsa and VMQ for VMTD.
+% For specific experimental parameters, please refer to the appendix.
+% \textbf{Baird's off-policy counterexample:} This task is well known as a
+% counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As
+% shown in Figure \ref{bairdexample}, reward for each transition is zero. Thus the true values are zeros for all states and for any given policy. The behaviour policy
+% chooses actions represented by solid lines with a probability of $\frac{1}{7}$
+% and actions represented by dotted lines with a probability of $\frac{6}{7}$. The
+% target policy is expected to choose the solid line with more probability than $\frac{1}{7}$,
+% and it chooses the solid line with probability of $1$ in this paper.
+%  The discount factor $\gamma =0.99$, and the feature matrix is
+% defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}.
+% \begin{figure}
+%     \begin{center}
+%     \input{main/pic/BairdExample.tex}
+%     \caption{7-state.}
+%     \label{bairdexample}
+%     \end{center}
+% \end{figure}
+% The feature matrix of 7-state version of Baird's off-policy counterexample is
+% defined as follow:
+% \begin{equation*}
+% \Phi_{Counter}=\left[ 
+% \begin{array}{cccccccc}
+% 1 & 2& 0& 0& 0& 0& 0& 0\\
+% 1 & 0& 2& 0& 0& 0& 0& 0\\
+% 1 & 0& 0& 2& 0& 0& 0& 0\\
+% 1 & 0& 0& 0& 2& 0& 0& 0\\
+% 1 & 0& 0& 0& 0& 2& 0& 0\\
+% 1 & 0& 0& 0& 0& 0& 2& 0\\
+% 2 & 0& 0& 0& 0& 0& 0& 1
+% \end{array}\right]
+% \end{equation*}
+\subsection{Testing Tasks}
+% \begin{figure}[h]
+%     \centering
+%     \includegraphics[scale=0.2]{main/pic/maze_13_13.pdf} 
+%     \caption{Maze.}
+%     \end{figure}
+\textbf{Maze}:  The learning agent should find a shortest path from the upper
+left corner to the lower right corner. 
+ In each state,
+there are four alternative actions: $up$, $down$, $left$, and $right$, which
+take the agent deterministically to the corresponding neighbour state,
+except when a movement is blocked by an obstacle or the edge
+of the maze. Rewards are $-1$ in all transitions until the
+agent reaches the goal state.
+The discount factor $\gamma=0.99$, and states $s$ are represented by tabular
+features. The maximum number of moves in the game is set to 1000.
+\textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are 
+selected from the gym official website and correspond to the following 
+versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''. 
+For specific details, please refer to the gym official website.
+The maximum number of steps for the Mountain Car environment is set to 1000, 
+while the default settings are used for the other two environments. In Mountain Car and Acrobot, features are generated by tile coding.
+For all policy evaluation experiments, each experiment 
+is independently run 100 times.
+For all control experiments, each experiment is independently run 50 times.
 For specific experimental parameters, please refer to the appendix.
-For the evaluation experiment, the experimental results 
+\subsection{Experimental Results and Analysis}
-align with our previous analysis. In the 2-state counterexample 
+Figure \ref{2-state} shows the learning curves for the on-policy 
-environment, the TDC algorithm has the smallest minimum 
+2-state policy evaluation experiment. In this setup, 
-eigenvalue of the key matrix, resulting in the slowest 
+the convergence speed of TD, VMTD, TDC, and VMTDC decreases 
-convergence speed. In contrast, the minimum eigenvalue 
+sequentially. Table \ref{tab:min_eigenvalues} indicates that the smallest eigenvalue 
-of VMTDC is larger, leading to faster convergence. 
+of the key matrix for these four algorithms is greater than 0 
-Although VMETD's minimum eigenvalue is larger than ETD's, 
+and decreases sequentially, so the ordering of the experimental 
-causing VMETD to converge more slowly than ETD in the 
+curves is consistent with the table values.
-2-state counterexample, the standard deviation (shaded area) 
-of VMETD is smaller than that of ETD, indicating that VMETD 
+Figure \ref{7-state} displays the learning curves for the off-policy 
-converges more smoothly. In the 7-state counterexample 
+2-state policy evaluation experiment. In this setup, 
-environment, VMTDC converges faster than TDC and both VMETD and ETD are diverge.
+the convergence speed of ETD, VMETD, VMTD, VMTDC, and 
+TDC decreases sequentially, while TD diverges. Table \ref{tab:min_eigenvalues} 
-For the control experiments, the results for the maze and 
+shows that the smallest eigenvalue of the key matrix for 
-cliff walking environments are similar: VMGQ 
+ETD, VMETD, VMTD, VMTDC, and TDC is greater than 0 and 
-outperforms GQ, EQ outperforms VMGQ, and VMEQ performs 
+decreases sequentially, while the smallest eigenvalue 
-the best. In the mountain car and Acrobot experiments, 
+for TD is less than 0. Again, the experimental curves are 
-VMGQ and VMEQ show comparable performance, both outperforming 
+consistent with the table values. Remarkably, 
-GQ and EQ. In summary, for control experiments, VM algorithms 
+although VMTD is only guaranteed to converge under 
-outperform non-VM algorithms.
+on-policy conditions, it still converges in the 
+off-policy 2-state scenario. The update formula 
-In summary, the performance of VMSarsa, 
+of VMTD indicates that it is essentially an 
-VMQ, and VMGQ(0) is better than that of other algorithms. 
+adjustment and correction of the TD update, 
-In the Cliff Walking environment, 
+with the introduction of the parameter $\omega$ 
-the performance of VMGQ(0) is slightly better than that of 
+making the variance of the gradient estimate 
-VMSarsa and VMQ. In the other three experimental environments, 
+more stable, thereby making the update of $\theta$ more stable.
-the performances of VMSarsa, VMQ, and VMGQ(0) are close.
\ No newline at end of file
+Figures \ref{MazeFull}, \ref{CliffWalkingFull}, \ref{MountainCarFull} and \ref{AcrobotFull} show the learning curves 
+for four control experiments. A common feature 
+observed across these experiments is that VMEQ 
+outperforms EQ, VMGQ outperforms GQ, VMQ outperforms 
+Q-learning, and VMSarsa outperforms Sarsa. For the 
+Maze and Cliff Walking experiments, VMEQ demonstrated 
+the best performance with the fastest convergence speed. 
+In the Mountain Car and Acrobot experiments, the performance 
+of the four VM algorithms was nearly identical and all 
+outperformed the other algorithms.
+Overall, whether in policy evaluation experiments or 
+control experiments, the VM algorithms have 
+demonstrated superior performance, 
+especially excelling in the control experiments.
\ No newline at end of file
--- a/NEW_aaai/main/introduction.tex
+++ b/NEW_aaai/main/introduction.tex
@@ -68,26 +68,28 @@ based on recursive optimization using it are known to be unstable.
 It is necessary to propose a new objective function, but the mentioned objective functions above are all some form of error.
 Is minimizing error the only option for value-based reinforcement learning?
-For policy evaluation experiments, 
+% For policy evaluation experiments, 
-differences in objective functions may result 
+% differences in objective functions may result 
-in inconsistent fixed points. This inconsistency 
+% in inconsistent fixed points. This inconsistency 
-makes it difficult to uniformly compare the superiority 
+% makes it difficult to uniformly compare the superiority 
-of algorithms derived from different objective functions. 
+% of algorithms derived from different objective functions. 
-However, for control experiments, since the choice of actions 
+% However, for control experiments, since the choice of actions 
-depends on the relative values of the Q values rather than their
+% depends on the relative values of the Q values rather than their
- absolute values, the presence of solution bias is acceptable.
+%  absolute values, the presence of solution bias is acceptable.
 Based on this observation, we propose  alternate objective functions 
-instead of minimizing errors. We minimize 
+instead of minimizing errors. We minimize the Variance of Bellman Error (VBE) and the
 Variance of Projected Bellman Error (VPBE)
 and derive Variance Minimization (VM) algorithms.
 These algorithms preserve the invariance of the optimal policy in the control environments,
-but significantly reduce the variance of gradient estimation,
+and significantly reduce the variance of gradient estimation,
 thus hastening convergence.
 The contributions of this paper are as follows:
-(1) Introduction of  novel objective functions based on
+\begin{itemize}
-the invariance of the optimal policy.
+  \item Introduction of  novel objective functions, VBE and VPBE.
-(2) Propose two off-policy variance minimization algorithms.
+  \item Propose an on-policy VM algorithm and two off-policy VM algorithms.
-(3) Proof of their convergence.
+  \item Proof of their convergence. 
-(5) Experiments demonstrating the faster convergence speed of the proposed algorithms.
+  \item The experiments demonstrate the superiority of the VM algorithms.
+\end{itemize}
--- a/NEW_aaai/main/motivation.tex
+++ b/NEW_aaai/main/motivation.tex
--- a/NEW_aaai/main/pic/2-state-offpolicy.pdf
+++ b/NEW_aaai/main/pic/2-state-offpolicy.pdf
--- a/NEW_aaai/main/pic/2-state-onpolicy.pdf
+++ b/NEW_aaai/main/pic/2-state-onpolicy.pdf
--- a/NEW_aaai/main/pic/2StateExample.pdf
+++ b/NEW_aaai/main/pic/2StateExample.pdf
--- a/NEW_aaai/main/pic/BairdExample copy 2.tex
+++ b/NEW_aaai/main/pic/BairdExample copy 2.tex
+\resizebox{7cm}{4.4cm}{ % diagram with states 1-7; the finished picture is scaled to a fixed 7cm x 4.4cm box
+\begin{tikzpicture}[smooth] % 'smooth' is set picture-wide; the two plot[tension=...] paths below rely on it
+\node[coordinate] (origin) at (0.3,0) {}; % anchor coordinates: corner of the dashed route, ...
+\node[coordinate] (num7) at (3,0) {}; % ... position of state 7, ...
+\node[coordinate] (num1) at (1,2.5) {}; % ... and reference point for state 1
+\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {}; % four helper points at polar offsets from num7,
+\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {}; % used only by the solid plot arrow drawn further down
+\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
+\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
+\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {}; % 3cm straight above origin
+\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {}; % then 5.7cm to the right
+\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {}; % num1_a: 0.5cm above num1
+\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {}; % num1_b: 0.3cm below num1 (the circle for state 1 is placed here)
+\path (num1) ++ (0:1cm) node (num2) [coordinate] {}; % states 2..6: each triple (num, num_a, num_b) is the previous one shifted 1cm right
+\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
+\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
+\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
+\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
+\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
+\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
+\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
+\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
+\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
+\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
+\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
+\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
+\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
+\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
+%\draw[->](0,0) -- (1,1);
+%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
+ %\fill (0.5,0.5) circle (0.5);
+ %\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
+\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
+coordinates{(num7) (origin) (origin_above) (origin_aright)}; % dashed route: num7 -> origin -> origin_above -> origin_aright
+\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
+coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)}; % solid arrow leaving num7 and curving back towards it; looks like a self-loop on state 7 -- TODO confirm in the rendered figure
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7}; % white-filled circle labelled 7, drawn after the two paths that start at num7
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ; % for each state i=1..6: dashed double-headed segment num_i -- num_i_a ...
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1}; % ... followed by the labelled circle at num_i_b (nodes a..f)
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
+\draw[->,>=stealth,line width = 0.02cm] (a)--(g); % solid arrows from each of the circles 1..6 (a..f) to circle 7 (g)
+\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
+\end{tikzpicture}
+}
--- a/NEW_aaai/main/pic/BairdExample copy.tex
+++ b/NEW_aaai/main/pic/BairdExample copy.tex
+\resizebox{7cm}{4.4cm}{ % diagram with states 1-7; the finished picture is scaled to a fixed 7cm x 4.4cm box
+\begin{tikzpicture}[smooth] % 'smooth' is set picture-wide; the two plot[tension=...] paths below rely on it
+\node[coordinate] (origin) at (0.3,0) {}; % anchor coordinates: corner of the dashed route, ...
+\node[coordinate] (num7) at (3,0) {}; % ... position of state 7, ...
+\node[coordinate] (num1) at (1,2.5) {}; % ... and reference point for state 1
+\path (num7) ++ (-10:0.5cm) node (num7_bright1) [coordinate] {}; % four helper points at polar offsets from num7,
+\path (num7) ++ (-30:0.7cm) node (num7_bright2) [coordinate] {}; % used only by the solid plot arrow drawn further down
+\path (num7) ++ (-60:0.35cm) node (num7_bright3) [coordinate] {};
+\path (num7) ++ (-60:0.6cm) node (num7_bright4) [coordinate] {};
+\path (origin) ++ (90:3cm) node (origin_above) [coordinate] {}; % 3cm straight above origin
+\path (origin_above) ++ (0:5.7cm) node (origin_aright) [coordinate] {}; % then 5.7cm to the right
+\path (num1) ++ (90:0.5cm) node (num1_a) [coordinate] {}; % num1_a: 0.5cm above num1
+\path (num1) ++ (-90:0.3cm) node (num1_b) [coordinate] {}; % num1_b: 0.3cm below num1 (the circle for state 1 is placed here)
+\path (num1) ++ (0:1cm) node (num2) [coordinate] {}; % states 2..6: each triple (num, num_a, num_b) is the previous one shifted 1cm right
+\path (num1_a) ++ (0:1cm) node (num2_a) [coordinate] {};
+\path (num1_b) ++ (0:1cm) node (num2_b) [coordinate] {};
+\path (num2) ++ (0:1cm) node (num3) [coordinate] {};
+\path (num2_a) ++ (0:1cm) node (num3_a) [coordinate] {};
+\path (num2_b) ++ (0:1cm) node (num3_b) [coordinate] {};
+\path (num3) ++ (0:1cm) node (num4) [coordinate] {};
+\path (num3_a) ++ (0:1cm) node (num4_a) [coordinate] {};
+\path (num3_b) ++ (0:1cm) node (num4_b) [coordinate] {};
+\path (num4) ++ (0:1cm) node (num5) [coordinate] {};
+\path (num4_a) ++ (0:1cm) node (num5_a) [coordinate] {};
+\path (num4_b) ++ (0:1cm) node (num5_b) [coordinate] {};
+\path (num5) ++ (0:1cm) node (num6) [coordinate] {};
+\path (num5_a) ++ (0:1cm) node (num6_a) [coordinate] {};
+\path (num5_b) ++ (0:1cm) node (num6_b) [coordinate] {};
+%\draw[->](0,0) -- (1,1);
+%\draw[dashed,line width = 0.03cm] (0,0) -- (1,1);
+ %\fill (0.5,0.5) circle (0.5);
+ %\draw[shape=circle,fill=white,draw=black] (a) at (num7) {7};
+\draw[dashed,line width = 0.03cm,xshift=3cm] plot[tension=0.06]
+coordinates{(num7) (origin) (origin_above) (origin_aright)}; % dashed route: num7 -> origin -> origin_above -> origin_aright
+\draw[->,>=stealth,line width = 0.02cm,xshift=3cm] plot[tension=0.5]
+coordinates{(num7) (num7_bright1) (num7_bright2)(num7_bright4) (num7_bright3)}; % solid arrow leaving num7 and curving back towards it; looks like a self-loop on state 7 -- TODO confirm in the rendered figure
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (g) at (num7) {7}; % white-filled circle labelled 7, drawn after the two paths that start at num7
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num1) -- (num1_a) ; % for each state i=1..6: dashed double-headed segment num_i -- num_i_a ...
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (a) at (num1_b) {1}; % ... followed by the labelled circle at num_i_b (nodes a..f)
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num2) -- (num2_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (b) at (num2_b) {2};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num3) -- (num3_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (c) at (num3_b) {3};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num4) -- (num4_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (d) at (num4_b) {4};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num5) -- (num5_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (e) at (num5_b) {5};
+\draw[<->,>=stealth,dashed,line width = 0.03cm,] (num6) -- (num6_a) ;
+\node[line width = 0.02cm,shape=circle,fill=white,draw=black] (f) at (num6_b) {6};
+\draw[->,>=stealth,line width = 0.02cm] (a)--(g); % solid arrows from each of the circles 1..6 (a..f) to circle 7 (g)
+\draw[->,>=stealth,line width = 0.02cm] (b)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (c)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (d)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (e)--(g);
+\draw[->,>=stealth,line width = 0.02cm] (f)--(g);
+\end{tikzpicture}
+}
--- a/NEW_aaai/main/pic/acrobot.pdf
+++ b/NEW_aaai/main/pic/acrobot.pdf
--- a/NEW_aaai/main/pic/cl.pdf
+++ b/NEW_aaai/main/pic/cl.pdf
--- a/NEW_aaai/main/pic/maze.pdf
+++ b/NEW_aaai/main/pic/maze.pdf
--- a/NEW_aaai/main/pic/mt.pdf
+++ b/NEW_aaai/main/pic/mt.pdf
--- a/NEW_aaai/main/preliminaries.tex
+++ b/NEW_aaai/main/preliminaries.tex
--- a/NEW_aaai/main/theory.tex
+++ b/NEW_aaai/main/theory.tex
--- a/环境图片/2-state.png
+++ b/环境图片/2-state.png
--- a/画图.pptx
+++ b/画图.pptx
--- a/论文草稿.txt
+++ b/论文草稿.txt
 题目：A Variance Minimization Approach to  Off-policy Temporal-Difference Learning
@@ -104,3 +104,29 @@ to, (1) 将标量参数引入到更多的TD算法中.
 对于控制实验，迷宫和cliff walking的实验结果相似，VMGQ表现优于GQ，EQ表现优于VMGQ，而VMEQ的性能最优。
 mountain car和Acrobot的实验结果相似，VMGQ和VMEQ的性能接近都优于GQ和EQ。总之对于控制实验，VM算法优于非VM算法
+接下来，我们将在2-state环境中分别计算TD(0)、TDC、ETD在on-policy和off-policy下各自关键矩阵A的最小特征值。
+如果矩阵A正定，则算法收敛。
+首先，我们将介绍2-state分别在on-policy和off-policy下的环境设定。
+在on-policy设定下，行为策略与目标策略一样，令A=B。
+为了解决off-policy TD(0)的关键矩阵A_off非正定问题，
+为了方便
+在2-state环境中，我们进行了两种实验——on-policy实验和off-policy实验，来验证算法的收敛速度与关键矩阵的最小特征值的关系。
+图A是on-policy 2-state的策略评估实验的曲线图。在该实验设定下，TD、VMTD、TDC以及VMTDC的收敛速度在依次递减，而表1可以得到这四个算法的关键矩阵的最小特征值都大于0，并且依次递减。实验曲线和表格数值相照应。
+图B是off-policy 2-state的策略评估实验的曲线图。在该实验设定下，ETD、VMETD、VMTD、VMTDC以及TDC的收敛速度在依次递减，TD则发散。而表1可以得到ETD、VMETD、VMTD、VMTDC以及TDC这五个算法的关键矩阵的最小特征值都大于0，并且依次递减，TD算法的关键矩阵的最小特征值小于0。实验曲线和表格数值相照应。令人惊喜的是，尽管VMTD是on-policy下保证收敛的算法，但在off-policy 2-state下依旧可以收敛。由VMTD的更新公式可以看出，VMTD的更新公式相当于是对TD更新的调整与修正，参数omega的引入使得梯度估计的方差更加稳定，从而让theta的更新更加稳定。
+图1，2，3，4分别是四个控制实验的曲线图。四个控制实验都表现出了一个共性特征：VMEQ的表现优于EQ，VMGQ优于GQ，VMQ优于Q-learning，VMSarsa优于Sarsa。对于Maze和Cliffwalking实验，VMEQ都表现出了最佳的性能，收敛速度最快。对于Mountain car和 Acrobot实验，四个VM算法的表现近乎一样，并且都优于其他算法。
+总的来说，不管是策略评估实验还是控制实验，VM算法都表现较为优秀，尤其在控制实验中特别突出。
+在本论文中，
--- a/评估实验图/2-state-offpolicy.pdf
+++ b/评估实验图/2-state-offpolicy.pdf
--- a/评估实验图/2-state-onpolicy.pdf
+++ b/评估实验图/2-state-onpolicy.pdf