\section{Theoretical Analysis}
This section establishes the convergence of the VMTD, VMTDC, and VMETD algorithms, and presents a corollary on the convergence rate of VMTD.

\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty$,
$\sum_{k=0}^{\infty}\alpha_k^2<\infty$,
$\sum_{k=0}^{\infty}\beta_k^2<\infty$,
and $\alpha_k = o(\beta_k)$.
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain. Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$ and $b=\mathrm{Cov}(r,\phi)$, and assume that the matrix $A$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $A^{-1}b$.
\end{theorem}
See Appendix \ref{proofth1} for the detailed proof.

Theorem 3 of \cite{dalal2020tale} gives a general finite-time bound for all linear two-timescale algorithms. VMTD satisfies the assumptions of that theorem, which yields the following corollary.
\begin{corollary}
\label{corollary4_2}
Consider the Sparsely Projected variant of VMTD. Then, for $\alpha_k = 1/(k+1)^{\alpha}$ and $\beta_k = 1/(k+1)^{\beta}$ with $0<\beta<\alpha<1$ and $p>1$, with probability at least $1-\tau$, for all $k\geq N_3$, we have
\begin{equation}
\|\theta'_{k} - \theta^{*}\| \le C_{3,\theta} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)}}{(k+1)^{\alpha / 2}},
\end{equation}
\begin{equation}
\|\omega'_{k} - \omega^{*}\| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)}}{(k+1)^{\beta / 2}},
\end{equation}
where $d_1$ and $d_2$ denote the dimensions of $\theta$ and $\omega$, respectively (for VMTD, $d_2 =1$).
\end{corollary}
The constants $N_3$, $C_{3,\theta}$, and $C_{3,\omega}$ are as defined in \cite{dalal2020tale}, and the sparsely projected iterates $\theta'_{k}$ and $\omega'_{k}$ are given by (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}). See Appendix \ref{proofcorollary4_2} for the detailed proof.

\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}), and (\ref{thetavmtdc}) of VMTDC. Let the step-size sequences $\alpha_k$, $\zeta_k$, and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\zeta_k,\beta_k>0$ for all $k$,
$\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\zeta_k=\sum_{k=0}^{\infty}\beta_k=\infty$,
$\sum_{k=0}^{\infty}\alpha_k^2<\infty$,
$\sum_{k=0}^{\infty}\zeta_k^2<\infty$,
$\sum_{k=0}^{\infty}\beta_k^2<\infty$,
and $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$.
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments. Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$, and assume that $A$ and $C$ are non-singular. Then the parameter vector $\theta_k$ converges with probability one to $A^{-1}b$.
\end{theorem}
See Appendix \ref{proofth2} for the detailed proof.
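For concreteness, one admissible family of step sizes (an illustrative choice of ours, not prescribed by the theorems) is
\begin{equation*}
\beta_k = \frac{1}{(k+1)^{0.55}}, \qquad
\zeta_k = \frac{1}{(k+1)^{0.7}}, \qquad
\alpha_k = \frac{1}{(k+1)^{0.9}}.
\end{equation*}
Each exponent lies in $(1/2,1]$, so every sequence sums to infinity while its squares are summable; since $0.9>0.7>0.55$, we also have $\alpha_k=o(\zeta_k)$ and $\zeta_k=o(\beta_k)$ as required by Theorem \ref{theorem2}, and the pair $(\alpha_k,\beta_k)$ alone satisfies the two-timescale conditions of Theorem \ref{theorem1}.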
\begin{theorem}
\label{theorem3}(Convergence of VMETD).
In the case of off-policy learning, consider the iterations (\ref{omegavmetd}) and (\ref{thetavmetd}) with (\ref{delta}) of VMETD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty$,
$\sum_{k=0}^{\infty}\alpha_k^2<\infty$,
$\sum_{k=0}^{\infty}\beta_k^2<\infty$,
and $\alpha_k = o(\beta_k)$.
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain. Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$ and $b=\mathrm{Cov}(r,\phi)$, and assume that the matrix $A$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $A^{-1}b$.
\end{theorem}
See Appendix \ref{proofVMETD} for the detailed proof.
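The two-timescale behavior asserted by Theorem \ref{theorem1} can also be checked numerically. The sketch below is illustrative only: it assumes a centered-TD form of the updates, $\delta_k = r_k + \gamma\theta_k^{\top}\phi'_k - \theta_k^{\top}\phi_k$, $\omega_{k+1} = \omega_k + \beta_k(\delta_k - \omega_k)$, and $\theta_{k+1} = \theta_k + \alpha_k(\delta_k - \omega_k)\phi_k$ (the actual iterations are (\ref{delta}), (\ref{omega}), and (\ref{theta})). This form is consistent with the stated fixed point: once the fast iterate $\omega$ tracks $\mathbb{E}[\delta]$, the expected $\theta$-increment is $\mathbb{E}[(\delta-\mathbb{E}[\delta])\phi] = \mathrm{Cov}(\phi,\delta) = b - A\theta$, which vanishes exactly at $\theta^{*}=A^{-1}b$.
\begin{verbatim}
# Illustrative two-timescale sketch (assumed centered-TD form, not the
# paper's exact pseudocode): simulate on-policy VMTD-style updates on a
# random ergodic chain and compare theta with A^{-1} b, where
# A = Cov(phi, phi - gamma*phi') and b = Cov(r, phi).
import numpy as np

rng = np.random.default_rng(0)
n_states, d, gamma = 5, 3, 0.9
Phi = rng.normal(size=(n_states, d))        # feature matrix
R = rng.normal(size=n_states)               # per-state rewards
P = rng.dirichlet(np.ones(n_states), size=n_states)  # transition matrix

# Stationary distribution: left Perron eigenvector of P.
evals, evecs = np.linalg.eig(P.T)
mu = np.real(evecs[:, np.argmax(np.real(evals))])
mu = mu / mu.sum()

# Population covariances under the stationary distribution.
Ephi, Ephi2, Er = mu @ Phi, mu @ (P @ Phi), mu @ R
A, b = np.zeros((d, d)), np.zeros(d)
for s in range(n_states):
    for t in range(n_states):
        w = mu[s] * P[s, t]
        v = (Phi[s] - gamma * Phi[t]) - (Ephi - gamma * Ephi2)
        A += w * np.outer(Phi[s] - Ephi, v)
        b += w * (R[s] - Er) * (Phi[s] - Ephi)
theta_star = np.linalg.solve(A, b)

# Two-timescale simulation; alpha_k = o(beta_k), Robbins-Monro sums.
theta, omega, s = np.zeros(d), 0.0, 0
for k in range(200_000):
    alpha, beta = (k + 1) ** -0.9, (k + 1) ** -0.55
    t = rng.choice(n_states, p=P[s])         # follow the chain
    delta = R[s] + gamma * Phi[t] @ theta - Phi[s] @ theta
    omega += beta * (delta - omega)          # fast: tracks E[delta]
    theta += alpha * (delta - omega) * Phi[s]  # slow: centered TD step
    s = t
print("||theta - A^{-1}b|| =", np.linalg.norm(theta - theta_star))
\end{verbatim}
On a generic random instance the printed distance shrinks as the iteration count grows, in line with the almost-sure convergence claimed above.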