\section{Theoretical Analysis}
The purpose of this section is to establish the stability of the VMTD algorithm and the VMTDC algorithm, and also to present a corollary on the convergence rate of VMTD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD). In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain. Let $\textbf{A} = \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')$, $\bm{b}=\mathrm{Cov}(r,\bm{\phi})$. Assume that the matrix $\textbf{A}$ is non-singular. Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}^{-1}\bm{b}$.
\end{theorem}
\begin{proof}
\label{th1proof}
The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
% The new TD error for the linear setting is
% \begin{equation*}
% \delta_{\text{new}}=r+\gamma
% \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi}-\mathbb{E}[\delta].
% \end{equation*}
A new one-step linear TD solution is defined as:
\begin{equation*}
0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]=-\textbf{A}\bm{\theta}+\bm{b}.
\end{equation*}
Thus, the VMTD's solution is $\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$.
First, note that recursion (\ref{theta}) can be rewritten as
\begin{equation*}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k\bm{\xi}(k),
\end{equation*}
where
\begin{equation*}
\bm{\xi}(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\bm{\phi}_k.
\end{equation*}
Due to the setting of the step-size schedule $\alpha_k = o(\beta_k)$, $\bm{\xi}(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. That is, the increments in iteration (\ref{omega}) are uniformly larger than those in (\ref{theta}); thus, (\ref{omega}) is the faster recursion. Along the faster time scale, the iterations (\ref{omega}) and (\ref{theta}) are associated with the following ODE system:
\begin{equation}
\dot{\bm{\theta}}(t) = 0,
\label{thetaFast}
\end{equation}
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{\theta}(t)]-\omega(t).
\label{omegaFast}
\end{equation}
Based on the ODE (\ref{thetaFast}), $\bm{\theta}(t)\equiv \bm{\theta}$ when viewed from the faster timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\bm{\theta}_k-\bm{\theta}||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\bm{\theta}$ that depends on the initial condition $\bm{\theta}_0$ of recursion (\ref{theta}). Thus, the ODE pair (\ref{thetaFast})--(\ref{omegaFast}) can be written as
\begin{equation}
\dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{\theta}]-\omega(t).
\label{omegaFastFinal}
\end{equation}
Consider the function $h(\omega)=\mathbb{E}[\delta|\bm{\theta}]-\omega$, i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). It is easy to see that the function $h$ is Lipschitz with coefficient $1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. Then $h_{\infty}(\omega)= -\omega$ is well-defined. For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta}]$ is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
\dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
\label{omegaInfty}
\end{equation}
apply $V(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Lyapunov function. Then, the origin of (\ref{omegaInfty}) is a globally asymptotically stable equilibrium.
Consider now the recursion (\ref{omega}). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, $k\geq 1$, denotes an increasing family of $\sigma$-fields. Then $\{M_{k+1}\}$ is a martingale difference sequence, and there exists a constant $c_1>0$ such that, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\bm{\theta}_k||^2).
\end{equation*}
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumption (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
Consider now the slower time scale recursion (\ref{theta}). Based on the above analysis, (\ref{theta}) can be rewritten as
\begin{equation*}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k.
\end{equation*}
Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s<k)$, $k\geq 1$, denote an increasing family of $\sigma$-fields, and let $Z_{k+1}=(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k-\mathbb{E}[(\delta_k-\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}_k|\mathcal{G}(k)]$. Then $\{Z_{k+1}\}$ is a martingale difference sequence, and there exists a constant $c_2>0$ such that, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||\bm{\theta}_k||^2).
\end{equation*} Consider now the following ODE associated with (\ref{theta}): \begin{equation} \begin{array}{ccl} \dot{\bm{\theta}}(t)&=&\mathrm{Cov}(\delta|\bm{\theta}(t),\bm{\phi})\\ &=&\mathrm{Cov}(r+(\gamma\bm{\phi}'-\bm{\phi})^{\top}\bm{\theta}(t),\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\theta}(t)^{\top}(\bm{\phi}-\gamma\bm{\phi}'),\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\bm{\theta}(t)^{\top}\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi})\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi})^{\top}\bm{\theta}(t)\\ &=&\mathrm{Cov}(r,\bm{\phi})-\mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')\bm{\theta}(t)\\ &=&-\textbf{A}\bm{\theta}(t)+\bm{b}. \end{array} \label{odetheta} \end{equation} Let $\vec{h}(\bm{\theta}(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}(\bm{\theta}(t))=-\textbf{A}\bm{\theta}(t)+\bm{b}. \end{equation*} Consider the cross-covariance matrix, \begin{equation} \begin{array}{ccl} \textbf{A} &=& \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')\\ &=&\frac{\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')-\mathrm{Cov}(\gamma\bm{\phi}',\gamma\bm{\phi}')}{2}\\ &=&\frac{\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')-\gamma^2\mathrm{Cov}(\bm{\phi}',\bm{\phi}')}{2}\\ &=&\frac{(1-\gamma^2)\mathrm{Cov}(\bm{\phi},\bm{\phi})+\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')}{2},\\ \end{array} \label{covariance} \end{equation} where we eventually used $\mathrm{Cov}(\bm{\phi}',\bm{\phi}')=\mathrm{Cov}(\bm{\phi},\bm{\phi})$ \footnote{The covariance matrix $\mathrm{Cov}(\bm{\phi}',\bm{\phi}')$ is equal to the covariance matrix $\mathrm{Cov}(\bm{\phi},\bm{\phi})$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}. 
Note that the covariance matrices $\mathrm{Cov}(\bm{\phi},\bm{\phi})$ and $\mathrm{Cov}(\bm{\phi}-\gamma\bm{\phi}',\bm{\phi}-\gamma\bm{\phi}')$ are positive semi-definite. Then, the matrix $\textbf{A}$ is positive semi-definite because, by (\ref{covariance}), $\textbf{A}$ is a positive-weighted linear combination of two positive semi-definite matrices. Furthermore, $\textbf{A}$ is nonsingular due to the assumption. Hence, the cross-covariance matrix $\textbf{A}$ is positive definite. Therefore, $\bm{\theta}^*=\textbf{A}^{-1}\bm{b}$ can be seen to be the unique globally asymptotically stable equilibrium for the ODE (\ref{odetheta}).
Let $\vec{h}_{\infty}(\bm{\theta})=\lim_{x\rightarrow \infty}\frac{\vec{h}(x\bm{\theta})}{x}$. Then $\vec{h}_{\infty}(\bm{\theta})=-\textbf{A}\bm{\theta}$ is well-defined. Consider now the ODE
\begin{equation}
\dot{\bm{\theta}}(t)=-\textbf{A}\bm{\theta}(t).
\label{odethetafinal}
\end{equation}
The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, Assumptions (A1) and (A2) are verified.
\end{proof}
% Please refer to the appendix \ref{proofth1} for detailed proof process.
% Theorem 3 in \cite{dalal2020tale} provides a general conclusion on the convergence speed of all linear two-timescale
% algorithms. VMTD satisfies the assumptions of this theorem, leading
% to the following corollary.
% \begin{corollary}
% \label{corollary4_2}
% Consider the Sparsely Projected variant of VMTD.
% Then, for $\alpha_k = 1/(k+1)^{\alpha}$, $\beta_k = 1/(k+1)^{\beta}$,
% $0<\beta<\alpha<1$, $p>1$, with probability larger than $1- \tau$, for all $k\geq N_3$, we have
% \begin{equation}
% ||\bm{\theta}'_{k} - \bm{\theta}^{*}|| \le C_{3,\bm{\theta}} \frac{\sqrt{\ln (4d_{1}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\alpha / 2}}
% \end{equation}
% \begin{equation}
% ||\omega'_{n} - \omega^{*}|| \le C_{3,\omega} \frac{\sqrt{\ln (4d_{2}^{2}(k+1)^{p}/\tau)} }{(k+1)^{\omega / 2}},
% \end{equation}
% \end{corollary}
% where $d_1$ and $d_2$ represent the dimensions of $\bm{\theta}$ and $\omega$, respectively. For VMTD, $d_2 =1$.
% The meanings of $N_3$, $C_{3,\bm{\theta}}$ and $C_{3,\omega}$ are explained in \cite{dalal2020tale}.
% The formulas for $\bm{\theta}'_{k}$ and $\omega'_{n}$ can be found in (\ref{sparseprojectiontheta}) and (\ref{sparseprojectionomega}).
% Please refer to the appendix \ref{proofcorollary4_2} for detailed proof process.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC). In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC. Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\zeta_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\zeta_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\zeta_k), $ $ \zeta_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments. Let $\textbf{A} = \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}')$, $\bm{b}=\mathrm{Cov}(r,\bm{\phi})$, and $\textbf{C}=\mathbb{E}[\bm{\phi}\bm{\phi}^{\top}]$. Assume that $\textbf{A}$ and $\textbf{C}$ are non-singular matrices.
Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}^{-1}\bm{b}$.
\end{theorem}
% Please refer to the appendix \ref{proofth2} for detailed proof process.
\begin{theorem}
\label{theorem3}(Convergence of VMETD). In the case of off-policy learning, consider the iterations (\ref{fvmetd}), (\ref{omegavmetd}) and (\ref{thetavmetd}) with (\ref{delta}) of VMETD. Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $ Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain. Let $\textbf{A}_{\textbf{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}}$, $\bm{b}_{\textbf{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$. Assume that the matrix $\textbf{A}_{\textbf{VMETD}}$ is non-singular. Then the parameter vector $\bm{\theta}_k$ converges with probability one to $\textbf{A}_{\textbf{VMETD}}^{-1}\bm{b}_{\textbf{VMETD}}$.
\end{theorem}
% Please refer to the appendix \ref{proofVMETD} for detailed proof process.