%File: anonymous-submission-latex-2024.tex \documentclass[letterpaper]{article} % DO NOT CHANGE THIS \usepackage[submission]{aaai24} % DO NOT CHANGE THIS \usepackage{times} % DO NOT CHANGE THIS \usepackage{helvet} % DO NOT CHANGE THIS \usepackage{courier} % DO NOT CHANGE THIS \usepackage[hyphens]{url} % DO NOT CHANGE THIS \usepackage{graphicx} % DO NOT CHANGE THIS \usepackage{amssymb} \usepackage{amsmath} \usepackage{subfigure} \usepackage{array} \usepackage{diagbox} \usepackage{siunitx} \urlstyle{rm} % DO NOT CHANGE THIS \def\UrlFont{\rm} % DO NOT CHANGE THIS \usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT \usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT \frenchspacing % DO NOT CHANGE THIS \setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS \setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS % % These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper. \usepackage{algorithm} \usepackage{algorithmic} \usepackage{subfigure} \usepackage{diagbox} \usepackage{booktabs} \usepackage{amsmath} \usepackage{amssymb} \usepackage{mathtools} \usepackage{amsthm} \usepackage{tikz} \usepackage{bm} \usepackage{esvect} \usepackage{multirow} % % These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper. \usepackage{newfloat} \usepackage{listings} \numberwithin{equation}{section} \renewcommand{\theequation}{\thesection-\arabic{equation}} % 自定义公式编号格式 \DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS \lstset{% basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers. aboveskip=0pt,belowskip=0pt,% showstringspaces=false,tabsize=2,breaklines=true} \floatstyle{ruled} \newfloat{listing}{tb}{lst}{} \floatname{listing}{Listing} % % Keep the \pdfinfo as shown here. There's no need % for you to add the /Title and /Author tags. \pdfinfo{ /TemplateVersion (2024.1) } % DISALLOWED PACKAGES % \usepackage{authblk} -- This package is specifically forbidden % \usepackage{balance} -- This package is specifically forbidden % \usepackage{color (if used in text) % \usepackage{CJK} -- This package is specifically forbidden % \usepackage{float} -- This package is specifically forbidden % \usepackage{flushend} -- This package is specifically forbidden % \usepackage{fontenc} -- This package is specifically forbidden % \usepackage{fullpage} -- This package is specifically forbidden % \usepackage{geometry} -- This package is specifically forbidden % \usepackage{grffile} -- This package is specifically forbidden % \usepackage{hyperref} -- This package is specifically forbidden % \usepackage{navigator} -- This package is specifically forbidden % (or any other package that embeds links such as navigator or hyperref) % \indentfirst} -- This package is specifically forbidden % \layout} -- This package is specifically forbidden % \multicol} -- This package is specifically forbidden % \nameref} -- This package is specifically forbidden % \usepackage{savetrees} -- This package is specifically forbidden % \usepackage{setspace} -- This package is specifically forbidden % \usepackage{stfloats} -- This package is specifically forbidden % \usepackage{tabu} -- This package is specifically forbidden % \usepackage{titlesec} -- This package is specifically forbidden % \usepackage{tocbibind} -- This package is specifically forbidden % \usepackage{ulem} -- This package is specifically forbidden % \usepackage{wrapfig} -- This package is specifically forbidden % DISALLOWED COMMANDS % \nocopyright -- Your paper will not be published if you use this command % \addtolength -- This command may not be used % \balance -- This command may not be used % \baselinestretch -- Your paper will not be published if you use this command % \clearpage -- No page breaks of any kind may be used for the final version of your paper % \columnsep -- This command may not be used % \newpage -- No page breaks of any kind may be used for the final version of your paper % \pagebreak -- No page breaks of any kind may be used for the final version of your paperr % \pagestyle -- This command may not be used % \tiny -- This is not an acceptable font size. % \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference % \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference \setcounter{secnumdepth}{2} %May be changed to 1 or 2 if section numbers are desired. % The file aaai24.sty is the style file for AAAI Press % proceedings, working notes, and technical reports. % % Title % Your title must be in mixed case, not sentence case. % That means all verbs (including short verbs like be, is, using,and go), % nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while % articles, conjunctions, and prepositions are lower case unless they % directly follow a colon or long dash \title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}} \author{ %Authors % All authors must be in the same font size and format. Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\ AAAI Style Contributions by Pater Patel Schneider, Sunil Issar,\\ J. Scott Penberthy, George Ferguson, Hans Guesgen, Francisco Cruz\equalcontrib, Marc Pujol-Gonzalez\equalcontrib } \affiliations{ %Afiliations \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\ % If you have multiple authors and multiple affiliations % use superscripts in text and roman font to identify them. % For example, % Sunil Issar\textsuperscript{\rm 2}, % J. Scott Penberthy\textsuperscript{\rm 3}, % George Ferguson\textsuperscript{\rm 4}, % Hans Guesgen\textsuperscript{\rm 5} % Note that the comma should be placed after the superscript 1900 Embarcadero Road, Suite 101\\ Palo Alto, California 94303-3310 USA\\ % email address must be in roman text type, not monospace or sans serif proceedings-questions@aaai.org % % See more examples next } %Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it \iffalse \title{My Publication Title --- Single Author} \author { Author Name } \affiliations{ Affiliation\\ Affiliation Line 2\\ name@example.com } \fi \iffalse %Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it \title{My Publication Title --- Multiple Authors} \author { % Authors First Author Name\textsuperscript{\rm 1}, Second Author Name\textsuperscript{\rm 2}, Third Author Name\textsuperscript{\rm 1} } \affiliations { % Affiliations \textsuperscript{\rm 1}Affiliation 1\\ \textsuperscript{\rm 2}Affiliation 2\\ firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com } \fi % REMOVE THIS: bibentry % This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it. \usepackage{bibentry} % END REMOVE bibentry \begin{document} % \maketitle \onecolumn \appendix \section{Relevant proofs} \subsection{Proof of Theorem 2} \label{proofth2} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. For the VMTDC algorithm, a new one-step linear TD solution is defined as: \begin{equation*} 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). \end{equation*} The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is $\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$. First, note that recursion (11) and (12) can be rewritten as, respectively, \begin{equation*} \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\zeta_k \bm{x}(k), \end{equation*} \begin{equation*} \bm{u}_{k+1}\leftarrow \bm{u}_k+\beta_k \bm{y}(k), \end{equation*} where \begin{equation*} \bm{x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], \end{equation*} \begin{equation*} \bm{y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_k]\bm{\phi}_k. \end{equation*} Recursion (11) can also be rewritten as \begin{equation*} \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k z(k), \end{equation*} where \begin{equation*} z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], \end{equation*} Due to the settings of step-size schedule $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $\bm{x}(k)\rightarrow 0$, $\bm{y}(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. That is that the increments in iteration (13) are uniformly larger than those in (12) and the increments in iteration (12) are uniformly larger than those in (11), thus (13) is the fastest recursion, (12) is the second fast recursion and (11) is the slower recursion. Along the fastest time scale, iterations of (11), (12) and (13) are associated to ODEs system as follows: \begin{equation} \dot{\bm{\theta}}(t) = 0, \label{thetavmtdcFastest} \end{equation} \begin{equation} \dot{\bm{u}}(t) = 0, \label{uvmtdcFastest} \end{equation} \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)]-\omega(t). \label{omegavmtdcFastest} \end{equation} Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\bm{\theta}(t)\equiv \bm{\theta}$ and $\bm{u}(t)\equiv \bm{u}$ when viewed from the fastest timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion (11) and $||\bm{u}_k-\bm{u}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $u$ that depends on the initial condition $u_0$ of recursion (12). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) can be written as \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u},\bm{\theta}]-\omega(t). \label{omegavmtdcFastestFinal} \end{equation} Consider the function $h(\omega)=\mathbb{E}[\delta|\bm{\theta},\bm{u}]-\omega$, i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta},\bm{u}]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), \label{omegavmtdcInfty} \end{equation} apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Liapunov function. Then, the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable equilibrium. Consider now the recursion (13). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\bm{u}_k||^2+||\bm{\theta}_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the second time scale recursion (12). Based on the above analysis, (12) can be rewritten as % \begin{equation*} % \bm{u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|\bm{u}_k,\bm{\theta}_k] - \bm{\phi}^{\top} (s_k) \bm{u}_k]\bm{\phi}(s_k). % \end{equation*} \begin{equation} \dot{\bm{\theta}}(t) = 0, \label{thetavmtdcFaster} \end{equation} \begin{equation} \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)])\bm{\phi}_t|\bm{\theta}(t)] - \textbf{C}\bm{u}(t). \label{uvmtdcFaster} \end{equation} The ODE (\ref{thetavmtdcFaster}) suggests that $\bm{\theta}(t)\equiv \bm{\theta}$ (i.e., a time invariant parameter) when viewed from the second fast timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion (11). Consider now the recursion (12). Let $N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k)|\mathcal{I} (k)]$, where $\mathcal{I}(k)=\sigma(\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq c_2(1+||\bm{u}_k||^2+||\bm{\theta}_k||^2). \end{equation*} Because $\bm{\theta}(t)\equiv \bm{\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) can be written as \begin{equation} \dot{\bm{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{\theta}])\bm{\phi}_t|\bm{\theta}] - \textbf{C}\bm{u}(t). \label{uvmtdcFasterFinal} \end{equation} Now consider the function $h(\bm{u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\bm{\theta}]|\bm{\theta}] -\textbf{C}\bm{u}$, i.e., the driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}), $\bm{u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}])\bm{\phi}|\bm{\theta}]$ is the unique globally asymptotically stable equilibrium. Let $h_{\infty}(\bm{u})=-\textbf{C}\bm{u}$. For the ODE \begin{equation} \dot{\bm{u}}(t) = h_{\infty}(\bm{u}(t))= -\textbf{C}\bm{u}(t), \label{uvmtdcInfty} \end{equation} the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable equilibrium because $\textbf{C}$ is a positive definite matrix (because it is nonnegative definite and nonsingular). Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\bm{u}_k-\bm{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower timescale recursion (11). In the light of the above, (11) can be rewritten as \begin{equation} \bm{\theta}_{k+1} \leftarrow \bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ - \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). \end{equation} Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\bm{\theta}_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower time scale recursion (19). Based on the above analysis, (19) can be rewritten as % \begin{equation*} % \bm{\theta}_{k+1}\leftarrow % \bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k. % \end{equation*} \begin{equation*} \begin{split} \bm{\theta}_{k+1}&\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k)\bm{\phi}_k -\alpha_k \omega_{k+1}\bm{\phi}_k\\ &=\bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k\\ &=\bm{\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k)\bm{\phi}_k -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]\bm{\phi}_k\\ &= \bm{\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}])\bm{\phi}_k}_{\bm{b}_{\text{VMETD},k}} -\underbrace{(F_k\rho_k\bm{\phi}_k(\bm{\phi}_k-\gamma\bm{\phi}_{k+1})^{\top}-\bm{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k (\bm{\phi}_k-\gamma\bm{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}\bm{\theta}_k\} \end{split} \end{equation*} Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||\bm{\theta}_k||^2). \end{equation*} Consider now the following ODE associated with (19): \begin{equation} \begin{array}{ccl} \dot{\bm{\theta}}(t)&=&-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}. \end{array} \label{odetheta} \end{equation} \begin{equation} \begin{split} \textbf{A}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k \bm{\phi}_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ % &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\underbrace{\bm{\phi}_k}_{X}\underbrace{F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}}_{Y}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ &=\sum_{s} f(s) \bm{\phi}(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} \\ &={\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) \bm{\Phi} \\ &={\bm{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\ &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\ &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}} \\ \end{split} \end{equation} \begin{equation} \begin{split} \bm{b}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\bm{b}_{\text{VMETD},k}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}\bm{\phi}_k]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &=\sum_{s} f(s) \bm{\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)r_{\pi} \\ &=\bm{\bm{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi} \\ \end{split} \end{equation} Let $\vec{h}(\bm{\theta}(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}(\bm{\theta}(t))=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}. \end{equation*} An $\bm{\Phi}^{\top}\bm{\text{X}}\bm{\Phi}$ matrix of this form will be positive definite whenever the matrix $\bm{\text{X}}$ is positive definite. Any matrix $\bm{\text{X}}$ is positive definite if and only if the symmetric matrix $\bm{\text{S}}=\bm{\text{X}}+\bm{\text{X}}^{\top}$ is positive definite. Any symmetric real matrix $\bm{\text{S}}$ is positive definite if the absolute values of its diagonal entries are greater than the sum of the absolute values of the corresponding off-diagonal entries\cite{sutton2016emphatic}. \begin{equation} \label{rowsum} \begin{split} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )\textbf{1} &=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=\textbf{F}(\textbf{1}-\gamma \textbf{P}_{\pi} \textbf{1})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{F}\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \\ &=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}-\textbf{d}_{\mu} \\ &=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]\textbf{d}_{\mu} \\ &=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]\textbf{d}_{\mu} \\ &=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]\textbf{d}_{\mu} > 0 \\ \end{split} \end{equation} \begin{equation} \label{columnsum} \begin{split} \textbf{1}^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ) &=\textbf{1}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\ &=\textbf{d}_{\mu}^{\top}-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\ &=\textbf{d}_{\mu}^{\top}- \textbf{d}_{\mu}^{\top} \\ &=0 \end{split} \end{equation} (\ref{rowsum}) and (\ref{columnsum}) show that the matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top}$ of diagonal entries are positive and its off-diagonal entries are negative. So its each row sum plus the corresponding column sum is positive. So $\textbf{A}_{\text{VMETD}}$ is positive definite. Therefore, $\bm{\theta}^*=\textbf{A}_{\text{VMETD}}^{-1}\bm{b}_{\text{VMETD}}$ can be seen to be the unique globally asymptotically stable equilibrium for ODE (\ref{odetheta}). Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{r\rightarrow \infty}\frac{\vec{h}(r\bm{\theta})}{r}$. Then $\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}_{\text{VMETD}}\bm{\theta}$ is well-defined. Consider now the ODE \begin{equation} \dot{\bm{\theta}}(t)=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t). \label{odethetafinal} \end{equation} The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, the assumption (A1) and (A2) are verified. \end{proof} \section{Mathematical Analysis} \label{mathematicalanalysis} Table \ref{keymatrices} shows the key matrices of various algorithms. Table \ref{minimumeigenvalues} shows minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples. \begin{table}[htb] \centering \caption{Key matrices of various algorithms.} \label{keymatrices} { \begin{tabular}{cccc} \toprule Algorithm&Key matrix $\textbf{A}$&{Positive definite}&{$\bm{b}$}\\\midrule On-policy TD&$\bm{\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$&$\checkmark$&$\bm{b}_{\text{on}}=\bm{\Phi}^{\top}\textbf{D}_{\pi}\textbf{r}_{\pi}$\\ On-policy VMTD&${\bm{\Phi}}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$ &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top})\textbf{r}_{\pi}$\\ \midrule Off-policy TD&$\textbf{A}_{\text{off}}={\bm{\Phi}}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma \textbf{P}_{\pi}){\bm{\Phi}}$&$\times$&$\bm{b}_{\text{off}}=\bm{\Phi}^{\top}\textbf{D}_{\mu}\textbf{r}_{\pi}$\\ TDC& $\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$&$\checkmark$&$\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{off}}$ \\ ETD& ${\bm{\Phi}}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}}$ &$\checkmark$&$\bm{\Phi}^{\top}\textbf{F}\textbf{r}_{\pi}$\\ \midrule Off-policy VMTD&$\textbf{A}_{\text{VM}}={\bm{\Phi}}^{\top} (\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$ &$\times$&$\bm{b}_{\text{VM}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\ VMTDC& $\textbf{A}_{\text{VM}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VM}}$&$\checkmark$&$\textbf{A}_{\text{VM}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{VM}}$ \\ VMETD& ${\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$ &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\ \bottomrule \end{tabular} } \end{table} \begin{table}[htb] \centering \caption{Minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples.} \label{minimumeigenvalues} { \begin{tabular}{ccccccc} \toprule \multirow{2}{*}{Algorithm}&\multirow{2}{*}{2-state} & \multirow{2}{*}{Baird's}&\multicolumn{3}{c}{Random Walk}&\multirow{2}{*}{Boyan Chain} \\\cmidrule{4-6} %\cline{3-4} &&&Tabular & Inverted & Dependent& \\ \midrule TD& $-0.2$ & $-0.791$&$0.018$&$0.017$&$0.07$&$0.024$\\ VMTD & & & & & & \\ \midrule TDC&$0.016$ & $-0.002$&$0.002$&$0.007$&$0.011$&$0.002$\\ VMTDC & & & & & & \\ \midrule ETD&$\textbf{3.4}$& $-2.82e-16$&$\textbf{0.195}$&$\textbf{0.165}$&$\textbf{0.76}$&$\textbf{0.245}$\\ VMETD & & & & & & \\ \bottomrule \end{tabular} } \end{table} \begin{table}[htb] \label{bairdcounterexample} \centering \caption{Baird's Counterexample.} { \begin{tabular}{ccc} \toprule \multirow{2}{*}{Algorithm}&\multicolumn{2}{c}{Baird's Counterexample} \\\cmidrule{2-3} &Two State & Seven State \\ \midrule TD& [-0.2]& [-7.91e-01, 1.07e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.48e-01]\\ VMTD & [0.25]& [-2.25e-01, -3.45e-17, 5.711e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.87e+00] \\ \midrule TDC&[0.016]&[-0.0024, 0.0078, 0.5714, 0.5714, 0.5714, 0.5714, 0.5729, 1.7682]\\ VMTDC &[0.025] &[-1.55e-03, -3.15e-04, 3.56e-01, 5.47e-01, 5.70e-01, 5.72e-01, 5.83e-01, 5.98e-01] \\ \midrule ETD& [3.4]& [-2.82e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.40e-01, 3.72e+00]\\ VMETD &[1.15] &[-2.25e-01, -3.45e-17, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.86e+00] \\ \bottomrule \end{tabular} } \end{table} \begin{table}[htb] \label{randomwalk} \centering \caption{Random Walk.} { \begin{tabular}{cccc} \toprule \multirow{2}{*}{Algorithm}&\multicolumn{3}{c}{Random Walk} \\\cmidrule{2-4} &Tabular & Inverted & Dependent \\ \midrule TD&[0.018,0.074,0.183,0.260,0.464] &[0.017,0.044,0.065,0.083,0.116] &[0.070,0.113,0.138]\\ VMTD & [1.62e-17,0.073,0.180,0.260,0.463]& [-2.07e-17,0.018,0.045,0.065,0.115] &[0.022,0.115,0.116]\\ \midrule TDC&[0.002,0.046,0.240,0.364,0.769]&[0.007,0.012,0.060,0.091,0.192]&[0.011,0.065,0.182]\\ VMTDC & [6.74e-17,0.044,0.240,0.364,0.769]& [-7.248e-18,0.011,0.060,0.091,0.192] & [0.0008,0.062,0.182]\\ \midrule ETD&[0.195,0.669,1.712,2.765,4.660] & [0.165,0.420,0.678,0.820,1.168]&[0.760,1.084,1.394]\\ VMETD &[-3.40e-04,0.664,1.69,2.76,4.65] & [-0.001,0.167,0.423,0.689,1.163]&[0.221,1.043,1.293] \\ \bottomrule \end{tabular} } \end{table} \begin{table}[htb] \label{boyanchain} \centering \caption{Boyan Chain.} { \begin{tabular}{cc} \toprule Algorithm&eigenvalues\\ \midrule TD&[0.024495,0.054,0.065,0.135]\\ VMTD&[2.70e-18,0.053,0.065,0.135]\\ \midrule TDC&[0.002,0.058,0.067,0.153] \\ VMTDC& [-1.40e-18,0.057,0.067,0.153]\\ \midrule ETD& [0.245,0.540,0.647,1.352]\\ VMETD&[1.57e-17,0.529,0.647,1.352] \\ \bottomrule \end{tabular} } \end{table} \section{Experimental details} \label{experimentaldetails} The feature matrices corresponding to three random walks are shown below respectively: \begin{equation*} \Phi_{tabular}=\left[ \begin{array}{ccccc} 1 & 0& 0& 0& 0\\ 0 & 1& 0& 0& 0\\ 0 & 0& 1& 0& 0\\ 0 & 0& 0& 1& 0\\ 0 & 0& 0& 0& 1 \end{array}\right] \end{equation*} \begin{equation*} \Phi_{inverted}=\left[ \begin{array}{ccccc} 0 & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\ \frac{1}{2} & 0& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\ \frac{1}{2} & \frac{1}{2}& 0& \frac{1}{2}& \frac{1}{2}\\ \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& 0& \frac{1}{2}\\ \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& 0 \end{array}\right] \end{equation*} \begin{equation*} \Phi_{dependent}=\left[ \begin{array}{ccccc} 1 & 0& 0\\ \frac{1}{\sqrt{2}} & \frac{1}{\sqrt{2}}& 0\\ \frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}& \frac{1}{\sqrt{3}}\\ 0 & \frac{1}{\sqrt{2}}& \frac{1}{\sqrt{2}}\\ 0 & 0& 1 \end{array}\right] \end{equation*} Three random walk experiments: the $\alpha$ values for all algorithms are in the range of $\{0.008, 0.015, 0.03, 0.06, 0.12, 0.25, 0.5\}$. For the TDC algorithm, the range of the ratio $\frac{\zeta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. For the VMTD algorithm, the range of the ratio $\frac{\beta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. It can be observed from the update formula of VMTDC that when $\zeta$ takes a very small value, the VMTDC update tends to be similar to VMTD update. Similarly, when $\beta$ takes a very small value, the VMTDC update tends to be similar to TDC update. Through experiments, it was found that setting $\zeta$ to a small value makes VMTDC updates approach VMTD updates, resulting in better performance. Therefore, for the VMTDC algorithm, the range of $\frac{\beta}{\alpha}$ ratio is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$, and the range of $\zeta$ is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. The learning curves in Figure \ref{Evaluation_full} correspond to the optimal parameters. The feature matrix of 7-state version of Baird's off-policy counterexample is defined as follow: \begin{equation*} \Phi_{Counter}=\left[ \begin{array}{cccccccc} 1 & 2& 0& 0& 0& 0& 0& 0\\ 1 & 0& 2& 0& 0& 0& 0& 0\\ 1 & 0& 0& 2& 0& 0& 0& 0\\ 1 & 0& 0& 0& 2& 0& 0& 0\\ 1 & 0& 0& 0& 0& 2& 0& 0\\ 1 & 0& 0& 0& 0& 0& 2& 0\\ 2 & 0& 0& 0& 0& 0& 0& 1 \end{array}\right] \end{equation*} 7-state version of Baird's off-policy counterexample: for TD algorithm, $\alpha$ is set to 0.1. For the TDC algorithm, the range of $\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, and the range of $\zeta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. For the VMTD algorithm, the range of $\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, and the range of $\beta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. Through experiments, it was found that setting $\zeta$ to a small value makes VMTDC updates approach VMTD updates, resulting in better performance. Therefore, for the VMTDC algorithm, The range of values for $\alpha$ and $\beta$ is the same as that of VMTD and the range of $\zeta$ is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. The learning curves in Figure \ref{Complete_full} correspond to the optimal parameters. For all policy evaluation experiments, each experiment is independently run 100 times. For the four control experiments: The learning rates for each algorithm in all experiments are shown in Table \ref{lrofways}. For all control experiments, each experiment is independently run 50 times. \begin{table*}[htb] \centering \caption{Learning rates ($lr$) of four control experiments.} \vskip 0.15in \begin{tabular}{c|ccccc} \hline \multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\ \hline Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ GQ(0)($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\ VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ VMGQ(0)($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\ AC($lr_{\text{actor}},lr_{\text{critic}}$)&$0.01,0.1$ &$0.01,0.01$ &$0.01,0.05$ &$0.01,0.05$ \\ Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ \hline \end{tabular} \label{lrofways} \vskip -0.1in \end{table*} \bibliography{aaai24} \end{document}