\section{Theoretical Analysis}
The purpose of this section is to establish the stability of the VMTD algorithm and the VMTDC algorithm, and also to present a corollary on the convergence rate of VMTD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD). In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain. Let $\textbf{A} = \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')$, $\bm{b}=\mathrm{Cov}(r,\bm{\phi})$. Assume that the matrix $\textbf{A}$ is non-singular. Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}^{-1}\bm{b}$.
\end{theorem}
\begin{proof}
\label{th1proof}
The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
% The new TD error for the linear setting is
% \begin{equation*}
% \delta_{\text{new}}=r+\gamma
% \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi}-\mathbb{E}[\delta].
% \end{equation*}
A new one-step linear TD solution is defined as:
\begin{equation*}
0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]=-\textbf{A}\bm{\theta}+\bm{b}.
\end{equation*}
Thus, the VMTD's solution is $\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$.
First, note that recursion (\ref{theta}) can be rewritten as
\begin{equation*}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k\bm{\xi}(k),
\end{equation*}
where
\begin{equation*}
\bm{\xi}(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\bm{\phi}_k.
\end{equation*}
Due to the setting of the step-size schedule $\alpha_k = o(\beta_k)$, $\bm{\xi}(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. That is, the increments in iteration (\ref{omega}) are uniformly larger than those in (\ref{theta}); thus, (\ref{omega}) is the faster recursion. Along the faster time scale, the iterations (\ref{omega}) and (\ref{theta}) are associated with the following ODE system:
\begin{equation}
\dot{\bm{\theta}}(t) = 0,
\label{thetaFast}
\end{equation}
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{\theta}(t)]-\omega(t).
\label{omegaFast}
\end{equation}
Based on the ODE (\ref{thetaFast}), $\bm{\theta}(t)\equiv \bm{\theta}$ when viewed from the faster timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion (\ref{theta}). Thus, the ODE pair (\ref{thetaFast})--(\ref{omegaFast}) can be written as
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{\theta}]-\omega(t).
\label{omegaFastFinal}
\end{equation}
Consider the function $h(\omega)=\mathbb{E}[\delta|\bm{\theta}]-\omega$, i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). It is easy to see that the function $h$ is Lipschitz with coefficient $1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. Then $h_{\infty}(\omega)= -\omega$ is well-defined. For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta}]$ is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
\dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
\label{omegaInfty}
\end{equation}
apply $V(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Lyapunov function. Then, the origin of (\ref{omegaInfty}) is a globally asymptotically stable equilibrium.
Consider now the recursion (\ref{omega}). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, $k\geq 1$, denotes an increasing family of $\sigma$-fields. Then $\{M_{k+1}\}$ is a martingale difference sequence, and there exists a constant $c_1>0$ such that, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\bm{\theta}_k||^2).
\end{equation*}
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
Consider now the slower time scale recursion (\ref{theta}). Based on the above analysis, (\ref{theta}) can be rewritten as
\begin{equation*}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k.
\end{equation*}
Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, $k\geq 1$, denote an increasing family of $\sigma$-fields, and let $Z_{k+1}=(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k-\mathbb{E}[(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k|\mathcal{G}(k)]$. Then $\{Z_{k+1}\}$ is a martingale difference sequence, and there exists a constant $c_2>0$ such that, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||\bm{\theta}_k||^2).
\end{equation*} Consider now the following ODE associated with (\ref{theta}): \begin{equation} \begin{array}{ccl} \dot{\bm{\theta}}(t)&=&\mathrm{Cov}(\delta|\bm{\theta}(t),\bm{\phi})\\ &=&\mathrm{Cov}(r+(\gamma\bm{\phi}'-\bm{\phi})^{\top}\bm{\theta}(t),\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\theta}(t)^{\top}(\bm{\phi}-\gamma\bm{\phi}'),\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\bm{\theta}(t)^{\top}\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi})^{\top}\bm{\theta}(t)\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')\bm{\theta}(t)\\ &=&-\textbf{A}\bm{\theta}(t)+\bm{b}. \end{array} \label{odetheta} \end{equation} Let $\vec{h}(\bm{\theta}(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}(\bm{\theta}(t))=-\textbf{A}\bm{\theta}(t)+\bm{b}. \end{equation*} Consider the cross-covariance matrix, \begin{equation} \begin{array}{ccl} \textbf{A} &=& \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')\\ &=&\frac{\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')-\mathrm{Cov}(\gamma\bm{\phi}',\gamma\bm{\phi}')}{2}\\ &=&\frac{\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')-\gamma^2\mathrm{Cov}(\bm{\phi}',\bm{\phi}')}{2}\\ &=&\frac{(1-\gamma^2)\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')}{2},\\ \end{array} \label{covariance} \end{equation} where we eventually used $\mathrm{Cov}(\bm{\phi}',\bm{\phi}')=\mathrm{Cov}(\bm{\phi},\bm{\phi})$ \footnote{The covariance matrix $\mathrm{Cov}(\bm{\phi}',\bm{\phi}')$ is equal to the covariance matrix $\mathrm{Cov}(\bm{\phi},\bm{\phi})$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}. 
Note that the covariance matrices $\mathrm{Cov}(\bm{\phi},\bm{\phi})$ and $\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')$ are positive semi-definite. Then, the matrix $\textbf{A}$ is positive semi-definite because, by (\ref{covariance}), $\textbf{A}$ is a positive-weighted linear combination of two positive semi-definite matrices. Furthermore, $\textbf{A}$ is nonsingular due to the assumption. Hence, the cross-covariance matrix $\textbf{A}$ is positive definite. Therefore, $\bm{\theta}^*=\textbf{A}^{-1}\bm{b}$ can be seen to be the unique globally asymptotically stable equilibrium for the ODE (\ref{odetheta}).
Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{x\rightarrow \infty}\frac{\vec{h}(x\bm{\theta})}{x}$. Then $\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}\bm{\theta}$ is well-defined. Consider now the ODE
\begin{equation}
\dot{\bm{\theta}}(t)=-\textbf{A}\bm{\theta}(t).
\label{odethetafinal}
\end{equation}
The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, Assumptions (A1) and (A2) are verified.
\end{proof}
% Please refer to the appendix \ref{proofth1} for detailed proof process.
% Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale
% algorithms. VMTD satisfies the assumptions of this theorem, leading
% to the following corollary.
% \begin{corollary}
% \label{corollary4_2}
% Consider the Sparsely Projected variant of VMTD.
% Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$,
% $0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have
% \begin{equation}
% ||\bm{\theta}'_{k} - \bm{\theta}^{*}|| \le C_{3,\bm{\theta}} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}}
% \end{equation}
% \begin{equation}
% ||\omega'_{n} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\omega / 2}},
% \end{equation}
% \end{corollary}
% where $d_1$ and $d_2$ represent the dimensions of $\bm{\theta}$ and $\omega$, respectively. For VMTD, $d_2 =1$.
% The meanings of $N_3$, $C_{3,\bm{\theta}}$ and $C_{3,\omega}$ are explained in \cite{dalal2020tale}.
% The formulas for $\bm{\theta}'_{k}$ and $\omega'_{n}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}).
% Please refer to the appendix \ref{proofcorollary4_2} for detailed proof process.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC). In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC. Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\zeta_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\zeta_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\zeta_k), $ $ \zeta_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments. Let $\textbf{A} = \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')$, $\bm{b}=\mathrm{Cov}(r,\bm{\phi})$, and $\textbf{C}=\mathbb{E}[\bm{\phi}\bm{\phi}^{\top}]$. Assume that $\textbf{A}$ and $\textbf{C}$ are non-singular matrices.
Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}^{-1}\bm{b}$.
\end{theorem}
% Please refer to the appendix \ref{proofth2} for detailed proof process.
\begin{theorem}
\label{theorem3}(Convergence of VMETD). In the case of off-policy learning, consider the iterations (\ref{fvmetd}), (\ref{omegavmetd}) and (\ref{thetavmetd}) with (\ref{delta}) of VMETD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain. Let $\textbf{A}_{\textbf{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$, $\bm{b}_{\textbf{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$. Assume that the matrix $\textbf{A}_{\textbf{VMETD}}$ is non-singular. Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}_{\textbf{VMETD}}^{-1}\bm{b}_{\textbf{VMETD}}$.
\end{theorem}
% Please refer to the appendix \ref{proofVMETD} for detailed proof process.