%File: anonymous-submission-latex-2024.tex
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai24}  % DO NOT CHANGE THIS
\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{subfigure}
\usepackage{array}
\usepackage{diagbox}
\usepackage{siunitx}
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm}  % DO NOT CHANGE THIS
\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{subfigure}
\usepackage{diagbox}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\usepackage{bm}
\usepackage{esvect}
\usepackage{multirow}

%
% These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
\usepackage{newfloat}
\usepackage{listings}
\numberwithin{equation}{section}
\renewcommand{\theequation}{\thesection-\arabic{equation}} % 自定义公式编号格式
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{%
	basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
	numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
	aboveskip=0pt,belowskip=0pt,%
	showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2024.1)
}

% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \indentfirst} -- This package is specifically forbidden
% \layout} -- This package is specifically forbidden
% \multicol} -- This package is specifically forbidden
% \nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paperr
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference

\setcounter{secnumdepth}{2} %May be changed to 1 or 2 if section numbers are desired.

% The file aaai24.sty is the style file for AAAI Press
% proceedings, working notes, and technical reports.
%

% Title

% Your title must be in mixed case, not sentence case.
% That means all verbs (including short verbs like be, is, using,and go),
% nouns, adverbs, adjectives should be capitalized, including both words in hyphenated terms, while
% articles, conjunctions, and prepositions are lower case unless they
% directly follow a colon or long dash
\title{AAAI Press Anonymous Submission\\Instructions for Authors Using \LaTeX{}}
\author{
    %Authors
    % All authors must be in the same font size and format.
    Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
    AAAI Style Contributions by Pater Patel Schneider,
    Sunil Issar,\\
    J. Scott Penberthy,
    George Ferguson,
    Hans Guesgen,
    Francisco Cruz\equalcontrib,
    Marc Pujol-Gonzalez\equalcontrib
}
\affiliations{
    %Afiliations
    \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
    % If you have multiple authors and multiple affiliations
    % use superscripts in text and roman font to identify them.
    % For example,

    % Sunil Issar\textsuperscript{\rm 2},
    % J. Scott Penberthy\textsuperscript{\rm 3},
    % George Ferguson\textsuperscript{\rm 4},
    % Hans Guesgen\textsuperscript{\rm 5}
    % Note that the comma should be placed after the superscript

    1900 Embarcadero Road, Suite 101\\
    Palo Alto, California 94303-3310 USA\\
    % email address must be in roman text type, not monospace or sans serif
    proceedings-questions@aaai.org
%
% See more examples next
}

%Example, Single Author, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\iffalse
\title{My Publication Title --- Single Author}
\author {
    Author Name
}
\affiliations{
    Affiliation\\
    Affiliation Line 2\\
    name@example.com
}
\fi

\iffalse
%Example, Multiple Authors, ->> remove \iffalse,\fi and place them surrounding AAAI title to use it
\title{My Publication Title --- Multiple Authors}
\author {
    % Authors
    First Author Name\textsuperscript{\rm 1},
    Second Author Name\textsuperscript{\rm 2},
    Third Author Name\textsuperscript{\rm 1}
}
\affiliations {
    % Affiliations
    \textsuperscript{\rm 1}Affiliation 1\\
    \textsuperscript{\rm 2}Affiliation 2\\
    firstAuthor@affiliation1.com, secondAuthor@affilation2.com, thirdAuthor@affiliation1.com
}
\fi


% REMOVE THIS: bibentry
% This is only needed to show inline citations in the guidelines document. You should not need it and can safely delete it.
\usepackage{bibentry}
% END REMOVE bibentry

\begin{document}

% \maketitle

\onecolumn
\appendix
\section{Relevant proofs}
\subsection{Proof of Theorem 2}
\label{proofth2}
\begin{proof}
The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation.

For the VMTDC algorithm, a new one-step linear TD solution is defined as:
\begin{equation*}
    0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}).
\end{equation*}
The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the  VMTD's solution is
$\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$.

First, note that recursion (11) and (12) can be rewritten as, respectively, 
\begin{equation*}
    \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\zeta_k \bm{x}(k),
\end{equation*}
\begin{equation*}
    \bm{u}_{k+1}\leftarrow \bm{u}_k+\beta_k \bm{y}(k),
\end{equation*}
where 
\begin{equation*}
    \bm{x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)],
\end{equation*}
\begin{equation*}
    \bm{y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_k]\bm{\phi}_k.
\end{equation*}

Recursion (11) can also be rewritten as
\begin{equation*}
    \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k z(k),
\end{equation*}
where
\begin{equation*}
    z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)],
\end{equation*}

Due to the settings of step-size schedule 
$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $\bm{x}(k)\rightarrow 0$, $\bm{y}(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$.
That is that the increments in iteration (13) are uniformly larger than
those in (12) and  the increments in iteration (12) are uniformly larger than
those in (11), thus (13) is the fastest recursion, (12) is the second fast recursion and (11) is the slower recursion.
Along the fastest time scale, iterations of (11), (12) and (13)
are associated to ODEs system as follows:
\begin{equation}
    \dot{\bm{\theta}}(t) = 0,
    \label{thetavmtdcFastest}
\end{equation}
\begin{equation}
    \dot{\bm{u}}(t) = 0,
    \label{uvmtdcFastest}
\end{equation}
\begin{equation}
    \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)]-\omega(t).
    \label{omegavmtdcFastest}
\end{equation}

Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\bm{\theta}(t)\equiv \bm{\theta}$
and $\bm{u}(t)\equiv \bm{u}$ when viewed from the fastest timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion
(11) and $||\bm{u}_k-\bm{u}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$u$ that depends on the initial condition $u_0$ of recursion
(12). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest})
can be written as 
\begin{equation}
    \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u},\bm{\theta}]-\omega(t).
    \label{omegavmtdcFastestFinal}
\end{equation}

Consider the function $h(\omega)=\mathbb{E}[\delta|\bm{\theta},\bm{u}]-\omega$,
i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
It is easy to find that the function $h$ is Lipschitz with coefficient
$-1$.
Let $h_{\infty}(\cdot)$ be the function defined by
 $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
 Then  $h_{\infty}(\omega)= -\omega$,  is well-defined. 
 For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta},\bm{u}]$
is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
 \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
 \label{omegavmtdcInfty}
\end{equation}
apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
associated strict Liapunov function. Then,
the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable
equilibrium.

Consider now the recursion (13).
Let
$M_{k+1}=(\delta_k-\omega_k)
-\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
where $\mathcal{F}(k)=\sigma(\omega_l,\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, 
$k\geq 1$ are the sigma fields
generated by $\omega_0,u_0,\bm{\theta}_0,\omega_{l+1},\bm{u}_{l+1},\bm{\theta}_{l+1},\bm{\phi}_l,\bm{\phi}_l'$,
$0\leq l<k$.
It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that 
satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
Because $\bm{\phi}_k$, $r_k$, and $\bm{\phi}_k'$   have
uniformly bounded second moments, it can be seen that for some constant
$c_1>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
c_1(1+||\omega_k||^2+||\bm{u}_k||^2+||\bm{\theta}_k||^2).
\end{equation*}


Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.

Consider now the second time scale recursion (12).
Based on the above analysis, (12) can be rewritten as
% \begin{equation*}
%     \bm{u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|\bm{u}_k,\bm{\theta}_k] - \bm{\phi}^{\top} (s_k) \bm{u}_k]\bm{\phi}(s_k).
% \end{equation*}
\begin{equation}
    \dot{\bm{\theta}}(t) = 0,
    \label{thetavmtdcFaster}
\end{equation}
\begin{equation}
    \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)])\bm{\phi}_t|\bm{\theta}(t)] - \textbf{C}\bm{u}(t).
    \label{uvmtdcFaster}
\end{equation}
The ODE (\ref{thetavmtdcFaster}) suggests that $\bm{\theta}(t)\equiv \bm{\theta}$ (i.e., a time invariant parameter)
when viewed from the second fast timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion
(11). 

Consider now the recursion (12).
Let
$N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k)|\mathcal{I} (k)]$,
where $\mathcal{I}(k)=\sigma(\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, 
$k\geq 1$ are the sigma fields
generated by $\bm{u}_0,\bm{\theta}_0,\bm{u}_{l+1},\bm{\theta}_{l+1},\bm{\phi}_l,\bm{\phi}_l'$,
$0\leq l<k$.
It is easy to verify that $N_{k+1},k\geq0$ are integrable random variables that 
satisfy $\mathbb{E}[N_{k+1}|\mathcal{I}(k)]=0$, $\forall k\geq0$.
Because $\bm{\phi}_k$, $r_k$, and $\bm{\phi}_k'$   have
uniformly bounded second moments, it can be seen that for some constant
$c_2>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq
c_2(1+||\bm{u}_k||^2+||\bm{\theta}_k||^2).
\end{equation*}

Because $\bm{\theta}(t)\equiv \bm{\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster})
can be written as 
\begin{equation}
    \dot{\bm{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{\theta}])\bm{\phi}_t|\bm{\theta}] - \textbf{C}\bm{u}(t).
    \label{uvmtdcFasterFinal}
\end{equation}
Now consider the function $h(\bm{u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\bm{\theta}]|\bm{\theta}] -\textbf{C}\bm{u}$, i.e., the
driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}),
$\bm{u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}])\bm{\phi}|\bm{\theta}]$ is the unique globally asymptotically
stable equilibrium. Let $h_{\infty}(\bm{u})=-\textbf{C}\bm{u}$.
For the ODE
\begin{equation}
    \dot{\bm{u}}(t) = h_{\infty}(\bm{u}(t))= -\textbf{C}\bm{u}(t),
    \label{uvmtdcInfty}
\end{equation}
the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable
equilibrium because $\textbf{C}$ is a positive definite matrix (because it is nonnegative definite and nonsingular).
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||\bm{u}_k-\bm{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.

Consider now the slower timescale recursion (11). In the light of the above,
(11) can be rewritten as 
\begin{equation}
    \bm{\theta}_{k+1} \leftarrow \bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\
    - \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]).
\end{equation}
Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, 
$k\geq 1$ be the sigma fields
generated by $\bm{\theta}_0,\bm{\theta}_{l+1},\bm{\phi}_l,\bm{\phi}_l'$,
$0\leq l<k$. Let
\begin{equation*}
    \begin{array}{ccl}
    Z_{k+1}&=&((\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k - \gamma \bm{\phi}'_{k}\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k])\\ 
     & &-\mathbb{E}[((\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k - \gamma \bm{\phi}'_{k}\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k])|\mathcal{G}(k)]\\
    &=&((\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k - \gamma \bm{\phi}'_{k}\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k])\\
    & &-\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k|\bm{\theta}_k] - \gamma\mathbb{E}[\bm{\phi}' \bm{\phi}^{\top}]\textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k|\bm{\theta}_k].
    \end{array}
\end{equation*}
It is easy to see that $Z_k$, $k\geq 0$ are integrable random variables and $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$. Further,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
c_3(1+||\bm{\theta}_k||^2), k\geq 0
\end{equation*}
for some constant $c_3 \geq 0$, again beacuse $\bm{\phi}_k$, $r_k$, and $\bm{\phi}_k'$   have
uniformly bounded second moments, it can be seen that for some constant.

Consider now the following ODE associated with (11):
\begin{equation}
    \dot{\bm{\theta}}(t) = (\textbf{I} - \mathbb{E}[\gamma \bm{\phi}' \bm{\phi}^{\top}]\textbf{C}^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\bm{\theta}(t)]) \bm{\phi}|\bm{\theta}(t)].
    \label{thetavmtdcSlowerFinal}
\end{equation}
Let 
\begin{equation*}
\begin{array}{ccl}
    \vec{h}(\bm{\theta}(t))&=&(\textbf{I} - \mathbb{E}[\gamma \bm{\phi}' \bm{\phi}^{\top}]\textbf{C}^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\bm{\theta}(t)]) \bm{\phi}|\bm{\theta}(t)]\\
    &=&(\textbf{C} - \mathbb{E}[\gamma \bm{\phi}' \bm{\phi}^{\top}])\textbf{C}^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\bm{\theta}(t)]) \bm{\phi}|\bm{\theta}(t)]\\
    &=& (\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}] - \mathbb{E}[\gamma \bm{\phi}' \bm{\phi}^{\top}])\textbf{C}^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\bm{\theta}(t)]) \bm{\phi}|\bm{\theta}(t)]\\
    &=& \textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}(t)+\bm{b}),
\end{array}
\end{equation*}
because $\mathbb{E}[(\delta -\mathbb{E}[\delta|\bm{\theta}(t)]) \bm{\phi}|\bm{\theta}(t)]=-\textbf{A}\bm{\theta}(t)+\bm{b}$, where 
$\textbf{A} = \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')$, $\bm{b}=\mathrm{Cov}(r,\bm{\phi})$, and $\textbf{C}=\mathbb{E}[\bm{\phi}\bm{\phi}^{\top}]$

Therefore,
$\bm{\theta}^*=\textbf{A}^{-1}\bm{b}$ can be seen to be the unique globally asymptotically
stable equilibrium for ODE (\ref{thetavmtdcSlowerFinal}).
Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{r\rightarrow
\infty}\frac{\vec{h}(r\bm{\theta})}{r}$. Then
$\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}\bm{\theta}$ is well-defined. 
Consider now the ODE
\begin{equation}
\dot{\bm{\theta}}(t)=-\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}\bm{\theta}(t).
\label{odethetavmtdcfinal}
\end{equation}

Because $\textbf{C}^{-1}$ is positive definite and $\textbf{A}$ has full rank (as it
is nonsingular by assumption), the matrix $\textbf{A}^{\top} \textbf{C}^{-1}\textbf{A}$ is also
positive definite. 
The ODE (\ref{odethetavmtdcfinal}) has the origin as its unique globally asymptotically stable equilibrium.
Thus, the assumption (A1) and (A2) are verified.

The proof is given above.
In the fastest time scale, the parameter $w$ converges to
$\mathbb{E}[\delta|\bm{u}_k,\bm{\theta}_k]$.
In the second fast time scale,
the parameter $u$ converges to $\textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]$.
In the slower time scale,
the parameter $\bm{\theta}$ converges to $\textbf{A}^{-1}\bm{b}$.
\end{proof}

\subsection{Proof of Theorem 3}
\label{proofVMETD}
\begin{proof}
    \label{th1proof}   
        The proof of VMETD's convergence is also based on Borkar's Theorem   for
        general stochastic approximation recursions with two time scales
        \cite{borkar1997stochastic}. 
        
The  VMTD's solution is
        $\bm{\theta}_{\text{VMETD}}=\textbf{A}_{\text{VMETD}}^{-1}\bm{b}_{\text{VMETD}}$.
        
        First, note that recursion (19) can be rewritten as
        \begin{equation*}
        \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k\bm{\xi}(k),
        \end{equation*}
        where
        \begin{equation*}
        \bm{\xi}(k)=\frac{\alpha_k}{\beta_k} (F_k \rho_k\delta_k - \omega_{k+1})\bm{\phi}_k
        \end{equation*}
        Due to the settings of step-size schedule $\alpha_k = o(\beta_k)$,
        $\bm{\xi}(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. 
        That is the increments in iteration (20) are uniformly larger than
        those in (19), thus (20) is the faster recursion.
        Along the faster time scale, iterations of (20) and (19)
        are associated to ODEs system as follows:
        \begin{equation}
        \dot{\bm{\theta}}(t) = 0,
        \label{thetaFast}
        \end{equation}
        \begin{equation}
        \dot{\omega}(t)=\mathbb{E}_{\mu}[F_t\rho_t\delta_t|\bm{\theta}(t)]-\omega(t).
        \label{omegaFast}
        \end{equation}
        Based on the ODE (\ref{thetaFast}), $\bm{\theta}(t)\equiv \bm{\theta}$ when
        viewed from the faster timescale. 
        By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
        $||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
        $\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion
        (19).
        Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as
        \begin{equation}
        \dot{\omega}(t)=\mathbb{E}_{\mu}[F_t\rho_t\delta_t|\bm{\theta}]-\omega(t).
        \label{omegaFastFinal}
        \end{equation}
        Consider the function $h(\omega)=\mathbb{E}_{\mu}[F\rho\delta|\bm{\theta}]-\omega$,
        i.e., the driving vector field of the ODE (\ref{omegaFastFinal}).
        It is easy to find that the function $h$ is Lipschitz with coefficient
        $-1$.
        Let $h_{\infty}(\cdot)$ be the function defined by
         $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$.
         Then  $h_{\infty}(\omega)= -\omega$,  is well-defined. 
         For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}_{\mu}[F\rho\delta|\bm{\theta}]$
        is the unique globally asymptotically stable equilibrium.
         For the ODE
          \begin{equation}
         \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
         \label{omegaInfty}
         \end{equation}
         apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
        associated strict Liapunov function. Then,
        the origin of (\ref{omegaInfty}) is a globally asymptotically stable
        equilibrium.
        
        
        Consider now the recursion (20).
        Let
        $M_{k+1}=(F_k\rho_k\delta_k-\omega_k)
        -\mathbb{E}_{\mu}[(F_k\rho_k\delta_k-\omega_k)|\mathcal{F}(k)]$,
        where $\mathcal{F}(k)=\sigma(\omega_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, 
        $k\geq 1$ are the sigma fields
        generated by $\omega_0,\bm{\theta}_0,\omega_{l+1},\bm{\theta}_{l+1},\bm{\phi}_l,\bm{\phi}_l'$,
        $0\leq l<k$.
        It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that 
        satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
        Because $\bm{\phi}_k$, $r_k$, and $\bm{\phi}_k'$   have
        uniformly bounded second moments, it can be seen that for some constant
        $c_1>0$, $\forall k\geq0$,
        \begin{equation*}
        \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
        c_1(1+||\omega_k||^2+||\bm{\theta}_k||^2).
        \end{equation*}
        
        
        Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
        Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
        conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus,
        by Theorem 2.2 of \cite{borkar2000ode} we obtain that
        $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
        
        Consider now the slower time scale recursion (19).
        Based on the above analysis, (19) can be rewritten as 
        % \begin{equation*}
        % \bm{\theta}_{k+1}\leftarrow
        % \bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k.
        % \end{equation*}

        \begin{equation*}
            \begin{split}
                \bm{\theta}_{k+1}&\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_k)\bm{\phi}_k -\alpha_k \omega_{k+1}\bm{\phi}_k\\
                &=\bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k\\
            &=\bm{\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k)\bm{\phi}_k -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k]\bm{\phi}_k\\
            &= \bm{\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}])\bm{\phi}_k}_{\bm{b}_{\text{VMETD},k}}
            -\underbrace{(F_k\rho_k\bm{\phi}_k(\bm{\phi}_k-\gamma\bm{\phi}_{k+1})^{\top}-\bm{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k (\bm{\phi}_k-\gamma\bm{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}\bm{\theta}_k\}
        \end{split}
        \end{equation*}
        
        Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, 
        $k\geq 1$ be the sigma fields
        generated by $\bm{\theta}_0,\bm{\theta}_{l+1},\bm{\phi}_l,\bm{\phi}_l'$,
        $0\leq l<k$.
        Let 
        $
        Z_{k+1} = Y_{k}-\mathbb{E}[Y_{k}|\mathcal{G}(k)],
        $
        where
        \begin{equation*}
        Y_{k}=(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k.
        \end{equation*}
        Consequently,
        \begin{equation*}
        \begin{array}{ccl}
        \mathbb{E}_{\mu}[Y_k|\mathcal{G}(k)]&=&\mathbb{E}_{\mu}[(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k|\mathcal{G}(k)]\\
        &=&\mathbb{E}_{\mu}[F_k\rho_k\delta_k\bm{\phi}_k|\bm{\theta}_k]
        -\mathbb{E}_{\mu}[\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k]\bm{\phi}_k]\\
        &=&\mathbb{E}_{\mu}[F_k\rho_k\delta_k\bm{\phi}_k|\bm{\theta}_k]
        -\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k]\mathbb{E}_{\mu}[\bm{\phi}_k]\\
        &=&\mathrm{Cov}(F_k\rho_k\delta_k|\bm{\theta}_k,\bm{\phi}_k),
        \end{array}
        \end{equation*}
        where $\mathrm{Cov}(\cdot,\cdot)$ is a covariance operator.
        
         Thus,
         \begin{equation*}
        \begin{array}{ccl}
        Z_{k+1}&=&(F_k\rho_k\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k-\mathrm{Cov}(F_k\rho_k\delta_k|\bm{\theta}_k,\bm{\phi}_k).
        \end{array}
        \end{equation*}
        It is easy to verify that $Z_{k+1},k\geq0$ are integrable random variables that 
        satisfy $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$.
        Also, because $\bm{\phi}_k$, $r_k$, and $\bm{\phi}_k'$  have
        uniformly bounded second moments, it can be seen that for some constant
        $c_2>0$, $\forall k\geq0$,
        \begin{equation*}
        \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
        c_2(1+||\bm{\theta}_k||^2).
        \end{equation*}
        
        Consider now the following ODE associated with (19):
        \begin{equation}
        \begin{array}{ccl}
        \dot{\bm{\theta}}(t)&=&-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}.
        \end{array}
        \label{odetheta}
        \end{equation}
        \begin{equation}
            \begin{split}
                \textbf{A}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k \bm{\phi}_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[  \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\  
                % &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\underbrace{\bm{\phi}_k}_{X}\underbrace{F_k \rho_k  (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}}_{Y}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[  \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\  
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k  (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[  \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\ 
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\   
                &=\sum_{s} f(s) \bm{\phi}(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s'))^{\top}  \\
                &={\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu}) \bm{\Phi}  \\
                &={\bm{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\
                &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\
                &={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}} \\
            \end{split}
        \end{equation}
        \begin{equation}
            \begin{split}
                \bm{b}_{\text{VMETD}}&=\lim_{k \rightarrow \infty} \mathbb{E}[\bm{b}_{\text{VMETD},k}]\\
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}\bm{\phi}_k]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\  
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[  \bm{\phi}_k]\mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\ 
                &= \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\  
                &=\sum_{s} f(s) \bm{\phi}(s)r_{\pi} - \sum_{s} d_{\mu}(s) \bm{\phi}(s) * \sum_{s} f(s)r_{\pi}  \\
                &=\bm{\bm{\Phi}}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi} \\
            \end{split}
        \end{equation}
        Let $\vec{h}(\bm{\theta}(t))$ be the driving vector field of the ODE
        (\ref{odetheta}).
        \begin{equation*}
        \vec{h}(\bm{\theta}(t))=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t)+\bm{b}_{\text{VMETD}}.
        \end{equation*}

        An $\bm{\Phi}^{\top}\bm{\text{X}}\bm{\Phi}$ matrix of this
        form will be positive definite whenever the matrix $\bm{\text{X}}$ is positive definite.
        Any matrix $\bm{\text{X}}$ is positive definite if and only if
        the symmetric matrix $\bm{\text{S}}=\bm{\text{X}}+\bm{\text{X}}^{\top}$ is positive definite. 
        Any symmetric real matrix $\bm{\text{S}}$ is positive definite if the absolute values of
        its diagonal entries are greater than the sum of the absolute values of the corresponding
        off-diagonal entries\cite{sutton2016emphatic}. 
        
        \begin{equation}
            \label{rowsum}
            \begin{split}
            (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )\textbf{1}
            &=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
            &=\textbf{F}(\textbf{1}-\gamma \textbf{P}_{\pi} \textbf{1})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
            &=(1-\gamma)\textbf{F}\textbf{1}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
            &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbf{1}\\
            &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \\
            &=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}-\textbf{d}_{\mu} \\
            &=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]\textbf{d}_{\mu} \\
            &=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]\textbf{d}_{\mu} \\
            &=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]\textbf{d}_{\mu} > 0 \\
            \end{split}
            \end{equation}
        \begin{equation}
            \label{columnsum}
            \begin{split}
            \textbf{1}^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )
            &=\textbf{1}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\
            &=\textbf{d}_{\mu}^{\top}-\textbf{1}^{\top}\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \\
            &=\textbf{d}_{\mu}^{\top}- \textbf{d}_{\mu}^{\top} \\
            &=0
            \end{split}
        \end{equation}
        (\ref{rowsum}) and (\ref{columnsum}) show that the matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top}$ of
         diagonal entries are positive and its off-diagonal entries are negative. So its each row sum plus the corresponding column sum is positive. 
        So $\textbf{A}_{\text{VMETD}}$ is positive definite.
        

        Therefore,
        $\bm{\theta}^*=\textbf{A}_{\text{VMETD}}^{-1}\bm{b}_{\text{VMETD}}$ can be seen to be the unique globally asymptotically
        stable equilibrium for ODE (\ref{odetheta}).
        Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{r\rightarrow
        \infty}\frac{\vec{h}(r\bm{\theta})}{r}$. Then
        $\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}_{\text{VMETD}}\bm{\theta}$ is well-defined. 
        Consider now the ODE
        \begin{equation}
        \dot{\bm{\theta}}(t)=-\textbf{A}_{\text{VMETD}}\bm{\theta}(t).
        \label{odethetafinal}
        \end{equation}
        The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium.
        Thus, the assumption (A1) and (A2) are verified.
    \end{proof}


\section{Mathematical Analysis}
\label{mathematicalanalysis}
Table \ref{keymatrices} shows the key matrices of various algorithms.
Table \ref{minimumeigenvalues} shows minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples.
\begin{table}[htb]
    \centering
       \caption{Key matrices of various algorithms.}
       \label{keymatrices}
       {
       \begin{tabular}{cccc}
           \toprule
           Algorithm&Key matrix $\textbf{A}$&{Positive definite}&{$\bm{b}$}\\\midrule
           On-policy TD&$\bm{\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma
         \textbf{P}_{\pi})\bm{\Phi}$&$\checkmark$&$\bm{b}_{\text{on}}=\bm{\Phi}^{\top}\textbf{D}_{\pi}\textbf{r}_{\pi}$\\
         On-policy VMTD&${\bm{\Phi}}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$
           &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{D}_{\pi}-\textbf{d}_{\pi} \textbf{d}_{\pi}^{\top})\textbf{r}_{\pi}$\\
         \midrule
         Off-policy TD&$\textbf{A}_{\text{off}}={\bm{\Phi}}^{\top}\textbf{D}_{\mu}(\textbf{I}-\gamma
         \textbf{P}_{\pi}){\bm{\Phi}}$&$\times$&$\bm{b}_{\text{off}}=\bm{\Phi}^{\top}\textbf{D}_{\mu}\textbf{r}_{\pi}$\\
           TDC& $\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{off}}$&$\checkmark$&$\textbf{A}_{\text{off}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{off}}$
           \\
           ETD& ${\bm{\Phi}}^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}}$
           &$\checkmark$&$\bm{\Phi}^{\top}\textbf{F}\textbf{r}_{\pi}$\\
           \midrule
           Off-policy VMTD&$\textbf{A}_{\text{VM}}={\bm{\Phi}}^{\top} (\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} )(\textbf{I} - \gamma\textbf{P}_{\pi}){\bm{\Phi}}$
           &$\times$&$\bm{b}_{\text{VM}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\
           VMTDC& $\textbf{A}_{\text{VM}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VM}}$&$\checkmark$&$\textbf{A}_{\text{VM}}^{\top}\textbf{C}^{-1}\bm{b}_{\text{VM}}$
           \\
           VMETD& ${\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$
           &$\checkmark$&$\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$\\
           \bottomrule
       \end{tabular}
       }
\end{table}
\begin{table}[htb]
    \centering
    \caption{Minimum eigenvalues $\frac{1}{2}\lambda_{\min}(\textbf{A} + \textbf{A}^\top)$ of various algorithms on several examples.}
    \label{minimumeigenvalues}
    {
    \begin{tabular}{ccccccc}
        \toprule
        \multirow{2}{*}{Algorithm}&\multirow{2}{*}{2-state} & \multirow{2}{*}{Baird's}&\multicolumn{3}{c}{Random Walk}&\multirow{2}{*}{Boyan Chain}
            \\\cmidrule{4-6} %\cline{3-4}
        &&&Tabular & Inverted & Dependent& \\
        \midrule
        TD& $-0.2$ & $-0.791$&$0.018$&$0.017$&$0.07$&$0.024$\\
        VMTD & & & & & &  \\
        \midrule
        TDC&$0.016$ & $-0.002$&$0.002$&$0.007$&$0.011$&$0.002$\\
        VMTDC & & & & & &  \\
        \midrule
        ETD&$\textbf{3.4}$& $-2.82e-16$&$\textbf{0.195}$&$\textbf{0.165}$&$\textbf{0.76}$&$\textbf{0.245}$\\
        VMETD & & & & & &  \\
        \bottomrule
    \end{tabular}
    }
\end{table}
\begin{table}[htb]
    \label{bairdcounterexample}
    \centering
    \caption{Baird's Counterexample.}
    {
    \begin{tabular}{ccc}
        \toprule
        \multirow{2}{*}{Algorithm}&\multicolumn{2}{c}{Baird's Counterexample}
            \\\cmidrule{2-3} 
        &Two State & Seven State \\
        \midrule
        TD& [-0.2]& [-7.91e-01, 1.07e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.48e-01]\\
        VMTD & [0.25]& [-2.25e-01, -3.45e-17, 5.711e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.87e+00] \\
        \midrule
        TDC&[0.016]&[-0.0024, 0.0078, 0.5714, 0.5714, 0.5714, 0.5714, 0.5729, 1.7682]\\
        VMTDC &[0.025] &[-1.55e-03, -3.15e-04, 3.56e-01, 5.47e-01, 5.70e-01, 5.72e-01, 5.83e-01, 5.98e-01]   \\
        \midrule
        ETD& [3.4]& [-2.82e-16, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 7.40e-01, 3.72e+00]\\
        VMETD &[1.15] &[-2.25e-01, -3.45e-17, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 5.71e-01, 2.86e+00]  \\
        \bottomrule
    \end{tabular}
    }
\end{table}
\begin{table}[htb]
    \label{randomwalk}
    \centering
    \caption{Random Walk.}
    {
    \begin{tabular}{cccc}
        \toprule
        \multirow{2}{*}{Algorithm}&\multicolumn{3}{c}{Random Walk}
            \\\cmidrule{2-4} 
        &Tabular & Inverted & Dependent \\
        \midrule
        TD&[0.018,0.074,0.183,0.260,0.464] &[0.017,0.044,0.065,0.083,0.116] &[0.070,0.113,0.138]\\
        VMTD & [1.62e-17,0.073,0.180,0.260,0.463]& [-2.07e-17,0.018,0.045,0.065,0.115] &[0.022,0.115,0.116]\\
        \midrule
        TDC&[0.002,0.046,0.240,0.364,0.769]&[0.007,0.012,0.060,0.091,0.192]&[0.011,0.065,0.182]\\
        VMTDC & [6.74e-17,0.044,0.240,0.364,0.769]& [-7.248e-18,0.011,0.060,0.091,0.192] & [0.0008,0.062,0.182]\\
        \midrule
        ETD&[0.195,0.669,1.712,2.765,4.660] & [0.165,0.420,0.678,0.820,1.168]&[0.760,1.084,1.394]\\
        VMETD &[-3.40e-04,0.664,1.69,2.76,4.65] & [-0.001,0.167,0.423,0.689,1.163]&[0.221,1.043,1.293] \\
        \bottomrule
    \end{tabular}
    }
\end{table}
\begin{table}[htb]
    \label{boyanchain}
    \centering
       \caption{Boyan Chain.}
       {
       \begin{tabular}{cc}
           \toprule
           Algorithm&eigenvalues\\
           \midrule
           TD&[0.024495,0.054,0.065,0.135]\\
         VMTD&[2.70e-18,0.053,0.065,0.135]\\
         \midrule
           TDC&[0.002,0.058,0.067,0.153] \\
           VMTDC& [-1.40e-18,0.057,0.067,0.153]\\
           \midrule
           ETD& [0.245,0.540,0.647,1.352]\\
           VMETD&[1.57e-17,0.529,0.647,1.352] \\
           \bottomrule
       \end{tabular}
       }
\end{table}

\section{Experimental details}
\label{experimentaldetails}
The feature matrices corresponding to three random walks are shown below respectively:
\begin{equation*}
    \Phi_{tabular}=\left[ 
    \begin{array}{ccccc}
    1 & 0& 0& 0& 0\\
    0 & 1& 0& 0& 0\\
    0 & 0& 1& 0& 0\\
    0 & 0& 0& 1& 0\\
    0 & 0& 0& 0& 1
    \end{array}\right]
    \end{equation*}
    \begin{equation*}
    \Phi_{inverted}=\left[ 
    \begin{array}{ccccc}
    0 & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & 0& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& 0& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& 0& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& 0
    \end{array}\right]
    \end{equation*}
    \begin{equation*}
    \Phi_{dependent}=\left[ 
    \begin{array}{ccccc}
    1 & 0& 0\\
    \frac{1}{\sqrt{2}} & \frac{1}{\sqrt{2}}& 0\\
    \frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}& \frac{1}{\sqrt{3}}\\
    0 & \frac{1}{\sqrt{2}}& \frac{1}{\sqrt{2}}\\
    0 & 0& 1
    \end{array}\right]
    \end{equation*}

Three random walk experiments: the $\alpha$ values for 
all algorithms are in the range of $\{0.008, 0.015, 0.03, 0.06, 0.12, 0.25, 0.5\}$. For the TDC algorithm, 
the range of the ratio $\frac{\zeta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. For the VMTD algorithm, 
the range of the ratio $\frac{\beta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. It can be observed from 
the update formula of VMTDC that when $\zeta$ takes a very small value, 
the VMTDC update tends to be similar to VMTD update. Similarly, 
when $\beta$ takes a very small value, the VMTDC update tends to be 
similar to TDC update. Through experiments, it was found that 
setting $\zeta$ to a small value makes VMTDC updates approach VMTD 
updates, resulting in better performance. Therefore, for the VMTDC 
algorithm, the range of $\frac{\beta}{\alpha}$ ratio is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$, and the range of 
$\zeta$ is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. The learning curves in Figure \ref{Evaluation_full} correspond to the optimal 
parameters.

The feature matrix of 7-state version of Baird's off-policy counterexample is
defined as follow:
\begin{equation*}
\Phi_{Counter}=\left[ 
\begin{array}{cccccccc}
1 & 2& 0& 0& 0& 0& 0& 0\\
1 & 0& 2& 0& 0& 0& 0& 0\\
1 & 0& 0& 2& 0& 0& 0& 0\\
1 & 0& 0& 0& 2& 0& 0& 0\\
1 & 0& 0& 0& 0& 2& 0& 0\\
1 & 0& 0& 0& 0& 0& 2& 0\\
2 & 0& 0& 0& 0& 0& 0& 1
\end{array}\right]
\end{equation*}

7-state version of Baird's off-policy counterexample: 
for TD algorithm, $\alpha$ is set to 0.1. For the TDC algorithm, the range of 
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, 
and the range of 
$\zeta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. 
For the VMTD algorithm, the range of 
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, 
and the range of 
$\beta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. Through experiments, it was found 
that setting $\zeta$ to a small value makes VMTDC updates approach VMTD 
updates, resulting in better performance. Therefore, for the VMTDC 
algorithm, The range of values for $\alpha$ and $\beta$ is the same as that of VMTD  
and the range of $\zeta$ 
is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. 
The learning curves in Figure \ref{Complete_full} correspond to the optimal parameters.
For all policy evaluation experiments, each experiment 
is independently run 100 times.

For the four control experiments: The learning rates for each 
algorithm in all experiments are shown in Table \ref{lrofways}.
For all control experiments, each experiment is independently run 50 times.

\begin{table*}[htb]
    \centering
    \caption{Learning rates ($lr$) of four control experiments.}
    \vskip 0.15in
    \begin{tabular}{c|ccccc}
        \hline
        \multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\
        \hline
         Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
         GQ(0)($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\
         VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
         VMGQ(0)($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\
         AC($lr_{\text{actor}},lr_{\text{critic}}$)&$0.01,0.1$ &$0.01,0.01$ &$0.01,0.05$ &$0.01,0.05$ \\
         Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
         VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
         \hline
    \end{tabular}
    \label{lrofways}
    \vskip -0.1in
\end{table*}

\bibliography{aaai24}

\end{document}