%File: anonymous-submission-latex-2024.tex \documentclass[letterpaper]{article} % DO NOT CHANGE THIS \usepackage[submission]{aaai24} % DO NOT CHANGE THIS \usepackage{times} % DO NOT CHANGE THIS \usepackage{helvet} % DO NOT CHANGE THIS \usepackage{courier} % DO NOT CHANGE THIS \usepackage[hyphens]{url} % DO NOT CHANGE THIS \usepackage{graphicx} % DO NOT CHANGE THIS \usepackage{amssymb} \usepackage{amsmath} \usepackage{subfigure} \usepackage{array} \usepackage{diagbox} \usepackage{siunitx} \urlstyle{rm} % DO NOT CHANGE THIS \def\UrlFont{\rm} % DO NOT CHANGE THIS \usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT \usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT \frenchspacing % DO NOT CHANGE THIS \setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS \setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS % % These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper. \usepackage{algorithm} \usepackage{algorithmic} \usepackage{subfigure} \usepackage{diagbox} \usepackage{booktabs} \usepackage{amsmath} \usepackage{amssymb} \usepackage{mathtools} \usepackage{amsthm} \usepackage{tikz} \usepackage{bm} \usepackage{esvect} \usepackage{multirow} % % These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper. \usepackage{newfloat} \usepackage{listings} \numberwithin{equation}{section} \renewcommand{\theequation}{\thesection-\arabic{equation}} % 自定义公式编号格式 \DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS \lstset{% basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers. aboveskip=0pt,belowskip=0pt,% showstringspaces=false,tabsize=2,breaklines=true} \floatstyle{ruled} \newfloat{listing}{tb}{lst}{} \floatname{listing}{Listing} % % Keep the \pdfinfo as shown here. There's no need % for you to add the /Title and /Author tags. \pdfinfo{ /TemplateVersion (2024.1) } % DISALLOWED PACKAGES % \usepackage{authblk} -- This package is specifically forbidden % \usepackage{balance} -- This package is specifically forbidden % \usepackage{color (if used in text) % \usepackage{CJK} -- This package is specifically forbidden % \usepackage{float} -- This package is specifically forbidden % \usepackage{flushend} -- This package is specifically forbidden % \usepackage{fontenc} -- This package is specifically forbidden % \usepackage{fullpage} -- This package is specifically forbidden % \usepackage{geometry} -- This package is specifically forbidden % \usepackage{grffile} -- This package is specifically forbidden % \usepackage{hyperref} -- This package is specifically forbidden % \usepackage{navigator} -- This package is specifically forbidden % (or any other package that embeds links such as navigator or hyperref) % \indentfirst} -- This package is specifically forbidden % \layout} -- This package is specifically forbidden % \multicol} -- This package is specifically forbidden % \nameref} -- This package is specifically forbidden % \usepackage{savetrees} -- This package is specifically forbidden % \usepackage{setspace} -- This package is specifically forbidden % \usepackage{stfloats} -- This package is specifically forbidden % \usepackage{tabu} -- This package is specifically forbidden % \usepackage{titlesec} -- This package is specifically forbidden % \usepackage{tocbibind} -- This package is specifically forbidden % \usepackage{ulem} -- This package is specifically forbidden % \usepackage{wrapfig} -- This package is specifically forbidden % DISALLOWED COMMANDS % \nocopyright -- Your paper will not be published if you use this command % \addtolength -- This command may not be used % \balance -- This command may not be used % \baselinestretch -- Your paper will not be published if you use this command % \clearpage -- No page breaks of any kind may be used for the final version of your paper % \columnsep -- This command may not be used % \newpage -- No page breaks of any kind may be used for the final version of your paper % \pagebreak -- No page breaks of any kind may be used for the final version of your paperr % \pagestyle -- This command may not be used % \tiny -- This is not an acceptable font size. % \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference % \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference \setcounter{secnumdepth}{2} %May be changed to 1 or 2 if section numbers are desired. % The file aaai24.sty is the style file for AAAI Press % proceedings, working notes, and technical reports. % % Title % Your title must be in mixed case, not sentence case. % That means all verbs (including short verbs like be, is, using,and go), % nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while % articles, conjunctions, and prepositions are lower case unless they % directly follow a colon or long dash \title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}} \author{ %Authors % All authors must be in the same font size and format. Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\ AAAI Style Contributions by Pater Patel Schneider, Sunil Issar,\\ J. Scott Penberthy, George Ferguson, Hans Guesgen, Francisco Cruz\equalcontrib, Marc Pujol-Gonzalez\equalcontrib } \affiliations{ %Afiliations \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\ % If you have multiple authors and multiple affiliations % use superscripts in text and roman font to identify them. % For example, % Sunil Issar\textsuperscript{\rm 2}, % J. Scott Penberthy\textsuperscript{\rm 3}, % George Ferguson\textsuperscript{\rm 4}, % Hans Guesgen\textsuperscript{\rm 5} % Note that the comma should be placed after the superscript 1900 Embarcadero Road, Suite 101\\ Palo Alto, California 94303-3310 USA\\ % email address must be in roman text type, not monospace or sans serif proceedings-questions@aaai.org % % See more examples next } %Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it \iffalse \title{My Publication Title --- Single Author} \author { Author Name } \affiliations{ Affiliation\\ Affiliation Line 2\\ name@example.com } \fi \iffalse %Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it \title{My Publication Title --- Multiple Authors} \author { % Authors First Author Name\textsuperscript{\rm 1}, Second Author Name\textsuperscript{\rm 2}, Third Author Name\textsuperscript{\rm 1} } \affiliations { % Affiliations \textsuperscript{\rm 1}Affiliation 1\\ \textsuperscript{\rm 2}Affiliation 2\\ firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com } \fi % REMOVE THIS: bibentry % This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it. \usepackage{bibentry} % END REMOVE bibentry \begin{document} % \maketitle \onecolumn \appendix \section{Relevant proofs} \subsection{Proof of Theorem 1} \label{proofth1} \begin{proof} \label{th1proof} The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}. % The new TD error for the linear setting is % \begin{equation*} % \delta_{\text{new}}=r+\gamma % \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta]. % \end{equation*} A new one-step linear TD solution is defined as: \begin{equation*} 0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b. \end{equation*} Thus, the VMTD's solution is $\theta_{\text{VMTD}}=A^{-1}b$. First, note that recursion (5) can be rewritten as \begin{equation*} \theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k), \end{equation*} where \begin{equation*} \xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k \end{equation*} Due to the settings of step-size schedule $\alpha_k = o(\beta_k)$, $\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. That is the increments in iteration (4) are uniformly larger than those in (5), thus (4) is the faster recursion. Along the faster time scale, iterations of (4) and (5) are associated to ODEs system as follows: \begin{equation} \dot{\theta}(t) = 0, \label{thetaFast} \end{equation} \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t). \label{omegaFast} \end{equation} Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when viewed from the faster timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\theta$ that depends on the initial condition $\theta_0$ of recursion (5). Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t). \label{omegaFastFinal} \end{equation} Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$, i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), \label{omegaInfty} \end{equation} apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Liapunov function. Then, the origin of (\ref{omegaInfty}) is a globally asymptotically stable equilibrium. Consider now the recursion (\ref{omega}). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\theta_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower time scale recursion (5). Based on the above analysis, (5) can be rewritten as \begin{equation*} \theta_{k+1}\leftarrow \theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k. \end{equation*} Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||\theta_k||^2). \end{equation*} Consider now the following ODE associated with (5): \begin{equation} \begin{array}{ccl} \dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\ &=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\ &=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\ &=&-A\theta(t)+b. \end{array} \label{odetheta} \end{equation} Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}(\theta(t))=-A\theta(t)+b. \end{equation*} Consider the cross-covariance matrix, \begin{equation} \begin{array}{ccl} A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\ &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\ &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\ &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\ \end{array} \label{covariance} \end{equation} where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}. Note that the covariance matrix $\mathrm{Cov}(\phi,\phi)$ and $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive definite. Then, the matrix $A$ is semi-positive definite because $A$ is linearly combined by two positive-weighted semi-positive definite matrice (\ref{covariance}). Furthermore, $A$ is nonsingular due to the assumption. Hence, the cross-covariance matrix $A$ is positive definite. Therefore, $\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically stable equilibrium for ODE (\ref{odetheta}). Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow \infty}\frac{\vec{h}(r\theta)}{r}$. Then $\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined. Consider now the ODE \begin{equation} \dot{\theta}(t)=-A\theta(t). \label{odethetafinal} \end{equation} The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, the assumption (A1) and (A2) are verified. \end{proof} \subsection{Proof of Theorem 2} \label{proofth2} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. For the VMTDC algorithm, a new one-step linear TD solution is defined as: \begin{equation*} 0=\mathbb{E}[({\phi} - \gamma {\phi}' - \mathbb{E}[{\phi} - \gamma {\phi}']){\phi}^\top]\mathbb{E}[{\phi} {\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta]){\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}{\theta}+{b}). \end{equation*} The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is ${\theta}_{\text{VMTDC}}=\textbf{A}^{-1}{b}$. First, note that recursion (5) and (6) can be rewritten as, respectively, \begin{equation*} {\theta}_{k+1}\leftarrow {\theta}_k+\zeta_k {x}(k), \end{equation*} \begin{equation*} {u}_{k+1}\leftarrow {u}_k+\beta_k {y}(k), \end{equation*} where \begin{equation*} {x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) {\phi}_k - \gamma{\phi}'_{k}({\phi}^{\top}_k {u}_k)], \end{equation*} \begin{equation*} {y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - {\phi}^{\top}_k {u}_k]{\phi}_k. \end{equation*} Recursion (5) can also be rewritten as \begin{equation*} {\theta}_{k+1}\leftarrow {\theta}_k+\beta_k z(k), \end{equation*} where \begin{equation*} z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) {\phi}_k - \gamma{\phi}'_{k}({\phi}^{\top}_k {u}_k)], \end{equation*} Due to the settings of step-size schedule $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, ${x}(k)\rightarrow 0$, ${y}(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. That is that the increments in iteration (7) are uniformly larger than those in (6) and the increments in iteration (6) are uniformly larger than those in (5), thus (7) is the fastest recursion, (6) is the second fast recursion and (5) is the slower recursion. Along the fastest time scale, iterations of (5), (6) and (7) are associated to ODEs system as follows: \begin{equation} \dot{{\theta}}(t) = 0, \label{thetavmtdcFastest} \end{equation} \begin{equation} \dot{{u}}(t) = 0, \label{uvmtdcFastest} \end{equation} \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|{u}(t),{\theta}(t)]-\omega(t). \label{omegavmtdcFastest} \end{equation} Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both ${\theta}(t)\equiv {\theta}$ and ${u}(t)\equiv {u}$ when viewed from the fastest timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||{\theta}_k-{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some ${\theta}$ that depends on the initial condition ${\theta}_0$ of recursion (5) and $||{u}_k-{u}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $u$ that depends on the initial condition $u_0$ of recursion (6). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) can be written as \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|{u},{\theta}]-\omega(t). \label{omegavmtdcFastestFinal} \end{equation} Consider the function $h(\omega)=\mathbb{E}[\delta|{\theta},{u}]-\omega$, i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|{\theta},{u}]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), \label{omegavmtdcInfty} \end{equation} apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Liapunov function. Then, the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable equilibrium. Consider now the recursion (7). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,{u}_l,{\theta}_l,l\leq k;{\phi}_s,{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||{u}_k||^2+||{\theta}_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the second time scale recursion (6). Based on the above analysis, (6) can be rewritten as % \begin{equation*} % {u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|{u}_k,{\theta}_k] - {\phi}^{\top} (s_k) {u}_k]{\phi}(s_k). % \end{equation*} \begin{equation} \dot{{\theta}}(t) = 0, \label{thetavmtdcFaster} \end{equation} \begin{equation} \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|{u}(t),{\theta}(t)]){\phi}_t|{\theta}(t)] - \textbf{C}{u}(t). \label{uvmtdcFaster} \end{equation} The ODE (\ref{thetavmtdcFaster}) suggests that ${\theta}(t)\equiv {\theta}$ (i.e., a time invariant parameter) when viewed from the second fast timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||{\theta}_k-{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some ${\theta}$ that depends on the initial condition ${\theta}_0$ of recursion (5). Consider now the recursion (6). Let $N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - {\phi}_k {\phi}^{\top}_k {u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - {\phi}_k {\phi}^{\top}_k {u}_k)|\mathcal{I} (k)]$, where $\mathcal{I}(k)=\sigma({u}_l,{\theta}_l,l\leq k;{\phi}_s,{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq c_2(1+||{u}_k||^2+||{\theta}_k||^2). \end{equation*} Because ${\theta}(t)\equiv {\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) can be written as \begin{equation} \dot{{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|{\theta}]){\phi}_t|{\theta}] - \textbf{C}{u}(t). \label{uvmtdcFasterFinal} \end{equation} Now consider the function $h({u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|{\theta}]|{\theta}] -\textbf{C}{u}$, i.e., the driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}), ${u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|{\theta}]){\phi}|{\theta}]$ is the unique globally asymptotically stable equilibrium. Let $h_{\infty}({u})=-\textbf{C}{u}$. For the ODE \begin{equation} \dot{{u}}(t) = h_{\infty}({u}(t))= -\textbf{C}{u}(t), \label{uvmtdcInfty} \end{equation} the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable equilibrium because $\textbf{C}$ is a positive definite matrix (because it is nonnegative definite and nonsingular). Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||{u}_k-{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower timescale recursion (5). In the light of the above, (5) can be rewritten as \begin{equation} {\theta}_{k+1} \leftarrow {\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|{\theta}_k]) {\phi}_k\\ - \alpha_k \gamma{\phi}'_{k}({\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|{\theta}_k]){\phi}|{\theta}_k]). \end{equation} Let $\mathcal{G}(k)=\sigma({\theta}_l,l\leq k;{\phi}_s,{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||{\theta}_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower time scale recursion (12). Based on the above analysis, (12) can be rewritten as % \begin{equation*} % {\theta}_{k+1}\leftarrow % {\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|{\theta}_k]){\phi}_k. % \end{equation*} \begin{equation*} \begin{split} {\theta}_{k+1}&\leftarrow {\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k){\phi}_k -\alpha_k \omega_{k+1}{\phi}_k\\ &={\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|{\theta}_k]){\phi}_k\\ &={\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma {\theta}_k^{\top}{\phi}_{k+1}-{\theta}_k^{\top}{\phi}_k){\phi}_k -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]{\phi}_k\\ &= {\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}]){\phi}_k}_{{b}_{\text{VMETD},k}} -\underbrace{(F_k\rho_k{\phi}_k({\phi}_k-\gamma{\phi}_{k+1})^{\top}-{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k ({\phi}_k-\gamma{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}{\theta}_k\} \end{split} \end{equation*} Let $\mathcal{G}(k)=\sigma({\theta}_l,l\leq k;{\phi}_s,{\phi}_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||{\theta}_k||^2). \end{equation*} Consider now the following ODE associated with (12): \begin{equation} \begin{array}{ccl} \dot{{\theta}}(t)&=&-\textbf{A}_{\text{VMETD}}{\theta}(t)+{b}_{\text{VMETD}}. \end{array} \label{odetheta} \end{equation} \begin{equation} \begin{split} \textbf{A}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k {\phi}_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ % &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\underbrace{{\phi}_k}_{X}\underbrace{F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}}_{Y}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})^{\top}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k ({\phi}_k - \gamma {\phi}_{k+1})]^{\top}\\ &=\sum_{s} f(s) {\phi}(s)({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s'))^{\top} - \sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s'))^{\top} \\ &={{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) {\Phi} - {{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) {\Phi} \\ &={{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){{\Phi}} \\ &={{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){{\Phi}} \\ &={{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){{\Phi}} \\ \end{split} \end{equation} \begin{equation} \begin{split} {b}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[{b}_{\text{VMETD},k}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}{\phi}_k]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\mathbb{E}_{\mu}[{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ &=\sum_{s} f(s) {\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) {\phi}(s) * \sum_{s} f(s)r_{\pi} \\ &={{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi} \\ \end{split} \end{equation} Let $\vec{h}({\theta}(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}({\theta}(t))=-\textbf{A}_{\text{VMETD}}{\theta}(t)+{b}_{\text{VMETD}}. \end{equation*} An ${\Phi}^{\top}{\text{X}}{\Phi}$ matrix of this form will be positive definite whenever the matrix ${\text{X}}$ is positive definite. Any matrix ${\text{X}}$ is positive definite if and only if the symmetric matrix ${\text{S}}={\text{X}}+{\text{X}}^{\top}$ is positive definite. Any symmetric real matrix ${\text{S}}$ is positive definite if the absolute values of its diagonal entries are greater than the sum of the absolute values of the corresponding off-diagonal entries\cite{sutton2016emphatic}. \begin{equation} \label{rowsum} \begin{split} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )\textbf{1} &=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=\textbf{F}(\textbf{1}-\gamma \textbf{P}_{\pi} \textbf{1})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{F}\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\ &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \\ &=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}-\textbf{d}_{\mu} \\ &=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]\textbf{d}_{\mu} \\ &=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]\textbf{d}_{\mu} \\ &=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]\textbf{d}_{\mu} > 0 \\ \end{split} \end{equation} \begin{equation} \label{columnsum} \begin{split} \textbf{1}^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ) &=\textbf{1}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\ &=\textbf{d}_{\mu}^{\top}-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\ &=\textbf{d}_{\mu}^{\top}- \textbf{d}_{\mu}^{\top} \\ &=0 \end{split} \end{equation} (\ref{rowsum}) and (\ref{columnsum}) show that the matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top}$ of diagonal entries are positive and its off-diagonal entries are negative. So its each row sum plus the corresponding column sum is positive. So $\textbf{A}_{\text{VMETD}}$ is positive definite. Therefore, ${\theta}^*=\textbf{A}_{\text{VMETD}}^{-1}{b}_{\text{VMETD}}$ can be seen to be the unique globally asymptotically stable equilibrium for ODE (\ref{odetheta}). Let $\vec{h}_{\infty}({\theta})=\lim_{r\rightarrow \infty}\frac{\vec{h}(r{\theta})}{r}$. Then $\vec{h}_{\infty}({\theta})=-\textbf{A}_{\text{VMETD}}{\theta}$ is well-defined. Consider now the ODE \begin{equation} \dot{{\theta}}(t)=-\textbf{A}_{\text{VMETD}}{\theta}(t). \label{odethetafinal} \end{equation} The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, the assumption (A1) and (A2) are verified. \end{proof} % \section{Mathematical Analysis} % \label{mathematicalanalysis} % Table \ref{keymatrices} shows the key matrices of various algorithms. % Table \ref{minimumeigenvalues} shows minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples. % \begin{table}[htb] % \centering % \caption{Key matrices of various algorithms.} % \label{keymatrices} % { % \begin{tabular}{cccc} % \toprule % Algorithm&Key matrix $\textbf{A}$&{Positive definite}&{${b}$}\\\midrule % On-policy TD&${\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma % \textbf{P}_{\pi}){\Phi}$&$\checkmark$&${b}_{\text{on}}={\Phi}^{\top}\textbf{D}_{\pi}\textbf{r}_{\pi}$\\ % On-policy VMTD&${{\Phi}}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){{\Phi}}$ % &$\checkmark$&${\Phi}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top})\textbf{r}_{\pi}$\\ % \midrule % Off-policy TD&$\textbf{A}_{\text{off}}={{\Phi}}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma % \textbf{P}_{\pi}){{\Phi}}$&$\times$&${b}_{\text{off}}={\Phi}^{\top}\textbf{D}_{\mu}\textbf{r}_{\pi}$\\ % TDC& $\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$&$\checkmark$&$\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}{b}_{\text{off}}$ % \\ % ETD& ${{\Phi}}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}){{\Phi}}$ % &$\checkmark$&${\Phi}^{\top}\textbf{F}\textbf{r}_{\pi}$\\ % \midrule % Off-policy VMTD&$\textbf{A}_{\text{VMTD}}={{\Phi}}^{\top} (\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){{\Phi}}$ % &$\times$&${b}_{\text{VMTD}}={\Phi}^{\top}(\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\ % VMTDC& $\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VMTD}}$&$\checkmark$&$\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}{b}_{\text{VMTD}}$ % \\ % VMETD& ${{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){{\Phi}}$ % &$\checkmark$&${\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}$\\ % \bottomrule % \end{tabular} % } % \end{table} % \begin{table}[htb] % \centering % \caption{Minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples.} % \label{minimumeigenvalues} % { % \begin{tabular}{ccccccc} % \toprule % \multirow{2}{*}{Algorithm}&\multirow{2}{*}{2-state} & \multirow{2}{*}{Baird's}&\multicolumn{3}{c}{Random Walk}&\multirow{2}{*}{Boyan Chain} % \\\cmidrule{4-6} %\cline{3-4} % &&&Tabular & Inverted & Dependent& \\ % \midrule % TD& $-0.2$ & $-0.791$&$0.018$&$0.017$&$0.07$&$0.024$\\ % VMTD & & & & & & \\ % \midrule % TDC&$0.016$ & $-0.002$&$0.002$&$0.007$&$0.011$&$0.002$\\ % VMTDC & & & & & & \\ % \midrule % ETD&$\textbf{3.4}$& $-2.82e-16$&$\textbf{0.195}$&$\textbf{0.165}$&$\textbf{0.76}$&$\textbf{0.245}$\\ % VMETD & & & & & & \\ % \bottomrule % \end{tabular} % } % \end{table} % \begin{table}[htb] % \label{bairdcounterexample} % \centering % \caption{Baird's Counterexample.} % { % \begin{tabular}{ccc} % \toprule % \multirow{2}{*}{Algorithm}&\multicolumn{2}{c}{Baird's Counterexample} % \\\cmidrule{2-3} % &Two State & Seven State \\ % \midrule % TD& [-0.2]& [-7.91e-01, 1.07e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.48e-01]\\ % VMTD & [0.25]& [-2.25e-01, -3.45e-17, 5.711e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.87e+00] \\ % \midrule % TDC&[0.016]&[-0.0024, 0.0078, 0.5714, 0.5714, 0.5714, 0.5714, 0.5729, 1.7682]\\ % VMTDC &[0.025] &[-1.55e-03, -3.15e-04, 3.56e-01, 5.47e-01, 5.70e-01, 5.72e-01, 5.83e-01, 5.98e-01] \\ % \midrule % ETD& [3.4]& [-2.82e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.40e-01, 3.72e+00]\\ % VMETD &[1.15] &[-2.25e-01, -3.45e-17, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.86e+00] \\ % \bottomrule % \end{tabular} % } % \end{table} % \begin{table}[htb] % \label{randomwalk} % \centering % \caption{Random Walk.} % { % \begin{tabular}{cccc} % \toprule % \multirow{2}{*}{Algorithm}&\multicolumn{3}{c}{Random Walk} % \\\cmidrule{2-4} % &Tabular & Inverted & Dependent \\ % \midrule % TD&[0.018,0.074,0.183,0.260,0.464] &[0.017,0.044,0.065,0.083,0.116] &[0.070,0.113,0.138]\\ % VMTD & [1.62e-17,0.073,0.180,0.260,0.463]& [-2.07e-17,0.018,0.045,0.065,0.115] &[0.022,0.115,0.116]\\ % \midrule % TDC&[0.002,0.046,0.240,0.364,0.769]&[0.007,0.012,0.060,0.091,0.192]&[0.011,0.065,0.182]\\ % VMTDC & [6.74e-17,0.044,0.240,0.364,0.769]& [-7.248e-18,0.011,0.060,0.091,0.192] & [0.0008,0.062,0.182]\\ % \midrule % ETD&[0.195,0.669,1.712,2.765,4.660] & [0.165,0.420,0.678,0.820,1.168]&[0.760,1.084,1.394]\\ % VMETD &[-3.40e-04,0.664,1.69,2.76,4.65] & [-0.001,0.167,0.423,0.689,1.163]&[0.221,1.043,1.293] \\ % \bottomrule % \end{tabular} % } % \end{table} % \begin{table}[htb] % \label{boyanchain} % \centering % \caption{Boyan Chain.} % { % \begin{tabular}{cc} % \toprule % Algorithm&eigenvalues\\ % \midrule % TD&[0.024495,0.054,0.065,0.135]\\ % VMTD&[2.70e-18,0.053,0.065,0.135]\\ % \midrule % TDC&[0.002,0.058,0.067,0.153] \\ % VMTDC& [-1.40e-18,0.057,0.067,0.153]\\ % \midrule % ETD& [0.245,0.540,0.647,1.352]\\ % VMETD&[1.57e-17,0.529,0.647,1.352] \\ % \bottomrule % \end{tabular} % } % \end{table} \section{Experimental details} \label{experimentaldetails} % The 2-state counterexample and the 7-state counterexample % are well-known off-policy experimental environments. The 2-state % counterexample is relatively simple, so next, I'll provide a detailed % description of the 7-state counterexample environment. % \textbf{Baird's off-policy counterexample:} This task is well known as a % counterexample, in which TD diverges \cite{baird1995residual,sutton2009fast}. As % shown in Figure \ref{bairdexample}, reward for each transition is zero. Thus the true values are zeros for all states and for any given policy. The behaviour policy % chooses actions represented by solid lines with a probability of $\frac{1}{7}$ % and actions represented by dotted lines with a probability of $\frac{6}{7}$. The % target policy is expected to choose the solid line with more probability than $\frac{1}{7}$, % and it chooses the solid line with probability of $1$ in this paper. % The discount factor $\gamma =0.99$, and the feature matrix is % defined in Appendix \ref{experimentaldetails} \cite{baird1995residual,sutton2009fast,maei2011gradient}. % \begin{figure} % \begin{center} % \input{pic/BairdExample.tex} % \caption{7-state version of Baird's off-policy counterexample.} % \label{bairdexample} % \end{center} % \end{figure} % The feature matrix of 7-state version of Baird's off-policy counterexample is % defined as follow: % \begin{equation*} % \Phi_{Counter}=\left[ % \begin{array}{cccccccc} % 1 & 2& 0& 0& 0& 0& 0& 0\\ % 1 & 0& 2& 0& 0& 0& 0& 0\\ % 1 & 0& 0& 2& 0& 0& 0& 0\\ % 1 & 0& 0& 0& 2& 0& 0& 0\\ % 1 & 0& 0& 0& 0& 2& 0& 0\\ % 1 & 0& 0& 0& 0& 0& 2& 0\\ % 2 & 0& 0& 0& 0& 0& 0& 1 % \end{array}\right] % \end{equation*} 2-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. For TD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$ and $\alpha_0 = 0.1$. For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$. For VMTD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. 2-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. For TD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$ and $\alpha_0 = 0.1$.For ETD algorithm, $\alpha_0 = 0.1$. For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=5$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For VMETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For VMTD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. % 7-state version of Baird's off-policy counterexample: All learning rates follow linear learning rate decay. % For TDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$ and $\alpha_0 = 0.1$.For ETD algorithm, $\alpha_0 = 0.1$. % For VMTDC algorithm, $\frac{\alpha_k}{\zeta_k}=3$, $\frac{\alpha_k}{\omega_k}=4$,and $\alpha_0 = 0.1$.For ETD algorithm, $\frac{\alpha_k}{\omega_k}=4$ and $\alpha_0 = 0.1$. For all policy evaluation experiments, each experiment is independently run 100 times. For the four control experiments: The learning rates for each algorithm in all experiments are shown in Table \ref{lrofways}. For all control experiments, each experiment is independently run 50 times. % \textbf{Maze}: The learning agent should find a shortest path from the upper % left corner to the lower right corner. % In each state, % there are four alternative actions: $up$, $down$, $left$, and $right$, which % takes the agent deterministically to the corresponding neighbour state, % except when a movement is blocked by an obstacle or the edge % of the maze. Rewards are $-1$ in all transitions until the % agent reaches the goal state. % The discount factor $\gamma=0.99$, and states $s$ are represented by tabular % features.The maximum number of moves in the game is set to 1000. % \begin{figure} % \centering % \includegraphics[scale=0.35]{pic/maze_13_13.pdf} % \caption{Maze.} % \end{figure} % \textbf{The other three control environments}: Cliff Walking, Mountain Car, and Acrobot are % selected from the gym official website and correspond to the following % versions: ``CliffWalking-v0'', ``MountainCar-v0'' and ``Acrobot-v1''. % For specific details, please refer to the gym official website. % The maximum number of steps for the Mountain Car environment is set to 1000, % while the default settings are used for the other two environments. In Mountain car and Acrobot, features are generated by tile coding. \begin{table*}[htb] \centering \caption{Learning rates ($lr$) of four control experiments.} \vskip 0.15in \begin{tabular}{c|ccccc} \hline \multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\ \hline Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ GQ($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\ EQ($\alpha$)&$0.006$ &$0.005$ &$0.001$ &$0.0005$ \\ VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ VMGQ($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\ VMEQ($\alpha,\beta$)&$0.001,0.0005$ &$0.005,0.0001$ &$0.001,0.0001$ &$0.0005,0.0001$ \\ Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\ VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\ \hline \end{tabular} \label{lrofways} \vskip -0.1in \end{table*} \bibliography{aaai24} \end{document}