\section{Relevant proofs} \subsection{Proof of Theorem \ref{theorem1}} \label{proofth1} \begin{proof} \label{th1proof} The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}. % The new TD error for the linear setting is % \begin{equation*} % \delta_{\text{new}}=r+\gamma % \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta]. % \end{equation*} A new one-step linear TD solution is defined as: \begin{equation*} 0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b. \end{equation*} Thus, the VMTD's solution is $\theta_{\text{VMTD}}=A^{-1}b$. First, note that recursion (\ref{theta}) can be rewritten as \begin{equation*} \theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k), \end{equation*} where \begin{equation*} \xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k \end{equation*} Due to the settings of step-size schedule $\alpha_k = o(\beta_k)$, $\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. That is the increments in iteration (\ref{omega}) are uniformly larger than those in (\ref{theta}), thus (\ref{omega}) is the faster recursion. Along the faster time scale, iterations of (\ref{omega}) and (\ref{theta}) are associated to ODEs system as follows: \begin{equation} \dot{\theta}(t) = 0, \label{thetaFast} \end{equation} \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t). \label{omegaFast} \end{equation} Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when viewed from the faster timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\theta$ that depends on the initial condition $\theta_0$ of recursion (\ref{theta}). Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t). \label{omegaFastFinal} \end{equation} Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$, i.e., the driving vector field of the ODE (\ref{omegaFastFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), \label{omegaInfty} \end{equation} apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Liapunov function. Then, the origin of (\ref{omegaInfty}) is a globally asymptotically stable equilibrium. Consider now the recursion (\ref{omega}). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||\theta_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower time scale recursion (\ref{theta}). Based on the above analysis, (\ref{theta}) can be rewritten as \begin{equation*} \theta_{k+1}\leftarrow \theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k. \end{equation*} Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq c_2(1+||\theta_k||^2). \end{equation*} Consider now the following ODE associated with (\ref{theta}): \begin{equation} \begin{array}{ccl} \dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\ &=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\ &=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\ &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\ &=&-A\theta(t)+b. \end{array} \label{odetheta} \end{equation} Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE (\ref{odetheta}). \begin{equation*} \vec{h}(\theta(t))=-A\theta(t)+b. \end{equation*} Consider the cross-covariance matrix, \begin{equation} \begin{array}{ccl} A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\ &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\ &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\ &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\ \end{array} \label{covariance} \end{equation} where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}. Note that the covariance matrix $\mathrm{Cov}(\phi,\phi)$ and $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive definite. Then, the matrix $A$ is semi-positive definite because $A$ is linearly combined by two positive-weighted semi-positive definite matrice (\ref{covariance}). Furthermore, $A$ is nonsingular due to the assumption. Hence, the cross-covariance matrix $A$ is positive definite. Therefore, $\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically stable equilibrium for ODE (\ref{odetheta}). Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow \infty}\frac{\vec{h}(r\theta)}{r}$. Then $\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined. Consider now the ODE \begin{equation} \dot{\theta}(t)=-A\theta(t). \label{odethetafinal} \end{equation} The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium. Thus, the assumption (A1) and (A2) are verified. \end{proof} \subsection{Proof of Corollary \ref{corollary4_2}} \label{proofcorollary4_2} The update formulas in linear two-timescale algorithms are as follows: \begin{equation} \theta_{k+1}=\theta_{k} + \alpha_{k}[h_1(\theta_{k},\omega_{k})+M^{(1)}_{k+1}], \end{equation} \begin{equation} \omega_{k+1}=\omega_{k} + \alpha_{k}[h_2(\theta_{k},\omega_{k})+M^{(2)}_{k+1}]. \end{equation} where $\alpha_k, \beta_k \in \mathbb{R} $ are stepsizes and $M^{(1)} \in \mathbb{R}^{d_1}, M^{(2)} \in \mathbb{R}^{d_2}$ denote noise. $h_1 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{1}}$ and $h_2 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{2}}$ have the form, respectively, \begin{equation} h_{1}(\theta,\omega)=v_1 - \Gamma_1 \theta - W_1\omega, \end{equation} \begin{equation} h_{2}(\theta,\omega)=v_2 - \Gamma_2 \theta - W_2\omega, \end{equation} where $v_1 \in \mathbb{R}^{d_1}$, $v_2 \in \mathbb{R}^{d_2}$, $\Gamma_1 \in \mathbb{R}^{d_1 \times d_1}$ , $\Gamma_2 \in \mathbb{R}^{d_2 \times d_1}$, $W_1 \in \mathbb{R}^{d_1 \times d_2}$ and $W_2 \in \mathbb{R}^{d_2 \times d_2}$. $d_1$ and $d_2$ are the dimensions of vectors $\theta$ and $\omega$, respectively. For Theorem 3 in \cite{dalal2020tale}, the theorem still holds even when $d——1$ is not equal to $d_2$. For the VMTD algorithm, $d_2$ is equal to 1. % Before proving the Corollary \ref{corollary4_2}, \cite{dalal2020tale} presents the matrix assumption, step size assumption, and defines sparse projection. \begin{assumption} \label{matrixassumption} (Matrix Assumption). $W_2$ and $X_1 = \Gamma_1 - W_1 W_{2}^{-1}\Gamma_2$ are positive definite(not necessarily symmetric). \end{assumption} \begin{assumption} \label{stepsizeassumption} (Step Size Assumption). $\alpha_k = (k+1)^{-\alpha}$ and $\beta_k = (k+1)^{-\beta}$, where $1>\alpha > \beta > 0$. \end{assumption} \begin{definition} \label{sparseprojection} (Sparse Projection). For $R>0$, let $\Pi_{R}(x)=\min \{1, R/||x||\}$. $x$ be the projection into the ball with redius R around the origin. The sparse projection operator \begin{equation*} \Pi_{n, R} = \begin{cases} \Pi_{R}, & \text{if } k = n^{n} - 1 \text{ for some } n \in \mathbb{Z}_{>0}, \\ I, & \text{otherwise}. \end{cases} \end{equation*} We call it sparse as it projects only on specific indices that are exponentially far apart. Pick an arbitrary $p>1$. Fix some constant $R^{\theta}_{\text{proj}}>0$ and $R^{\omega}_{\text{proj}}>0$ for the radius of the projection ball. Further, let \begin{equation*} \theta^{*}=X^{-1}_{1}b_{1}, \omega^{*}=W^{-1}_{2}(v_2 - \Gamma_2 \theta^{*}) \end{equation*} with $b_1=v_1 - W_1 W_2^{-1}v_2$. The formula for the sparse projection update in linear two-timescale algorithms is as follows: \begin{equation} \label{sparseprojectiontheta} \theta'_{k+1}=\Pi_{k+1,R^{\theta}_{\text{proj}}}(\theta'_{k} + \alpha_{k}[h_1(\theta'_{k},\omega'_{k})+M^{(1')}_{k+1}]), \end{equation} \begin{equation} \label{sparseprojectionomega} \omega'_{k+1}=\Pi_{k+1,R^{\omega}_{\text{proj}}}(\omega'_{k} + \beta_{k}[h_2(\theta'_{k},\omega'_{k})+M^{(2')}_{k+1}]). \end{equation} \end{definition} \begin{proof} As long as the VMTD algorithm satisfies Assumption \ref{matrixassumption}, the convergence speed of the VMTD algorithm can be obtained. VMTD's update rule is \begin{equation*} \theta_{k+1}=\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k. \end{equation*} \begin{equation*} \omega_{k+1}=\omega_{k}+\beta_k(\delta_k-\omega_k). \end{equation*} Thus, $h_1(\theta, \omega)=\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi - \gamma\phi')\theta$, $h_2(\theta, \omega)=\mathbb{E}[r]+\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]\theta -\omega$, $\Gamma_1 =\mathrm{Cov}(\phi,\phi - \gamma\phi')$, $W_1 = 0$ and $\Gamma_2 = -\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]$, $W_2 = 1$, $v_2 = \mathbb{E}[r]$. Additionally, $X_1=\Gamma_1 - W_1 W^{-1}_2 \Gamma_2 = \mathrm{Cov}(\phi,\phi - \gamma\phi')$. % By the Assumption \ref{matrixassumption}, It can be deduced from the proof \ref{th1proof} that $X_1$ is a positive definite matrix. The VMTD algorithm satisfies the Assumption \ref{matrixassumption}. By the proof \ref{th1proof}, Definition 1 in \cite{dalal2020tale} is satisfied. We can apply the Theorem 3 in \cite{dalal2020tale} to get the Corollary \ref{corollary4_2}. \end{proof} \subsection{Proof of Theorem \ref{theorem2}} \label{proofth2} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. For the VMTDC algorithm, a new one-step linear TD solution is defined as: \begin{equation*} 0=\mathbb{E}[(\phi - \gamma \phi' - \mathbb{E}[\phi - \gamma \phi'])\phi^\top]\mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\phi]=A^{\top}C^{-1}(-A\theta+b). \end{equation*} The matrix $A^{\top}C^{-1}A$ is positive definite. Thus, the VMTD's solution is $\theta_{\text{VMTDC}}=\theta_{\text{VMTD}}=A^{-1}b$. First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, \begin{equation*} \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k), \end{equation*} \begin{equation*} u_{k+1}\leftarrow u_k+\beta_k y(k), \end{equation*} where \begin{equation*} x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], \end{equation*} \begin{equation*} y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k. \end{equation*} Recursion (\ref{thetavmtdc}) can also be rewritten as \begin{equation*} \theta_{k+1}\leftarrow \theta_k+\beta_k z(k), \end{equation*} where \begin{equation*} z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], \end{equation*} Due to the settings of step-size schedule $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) are associated to ODEs system as follows: \begin{equation} \dot{\theta}(t) = 0, \label{thetavmtdcFastest} \end{equation} \begin{equation} \dot{u}(t) = 0, \label{uvmtdcFastest} \end{equation} \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t). \label{omegavmtdcFastest} \end{equation} Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ and $u(t)\equiv u$ when viewed from the fastest timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\theta$ that depends on the initial condition $\theta_0$ of recursion (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $u$ that depends on the initial condition $u_0$ of recursion (\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) can be written as \begin{equation} \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t). \label{omegavmtdcFastestFinal} \end{equation} Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$, i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). It is easy to find that the function $h$ is Lipschitz with coefficient $-1$. Let $h_{\infty}(\cdot)$ be the function defined by $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. Then $h_{\infty}(\omega)= -\omega$, is well-defined. For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$ is the unique globally asymptotically stable equilibrium. For the ODE \begin{equation} \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), \label{omegavmtdcInfty} \end{equation} apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its associated strict Liapunov function. Then, the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable equilibrium. Consider now the recursion (\ref{omegavmtdc}). Let $M_{k+1}=(\delta_k-\omega_k) -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). \end{equation*} Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the second time scale recursion (\ref{uvmtdc}). Based on the above analysis, (\ref{uvmtdc}) can be rewritten as % \begin{equation*} % u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|u_k,\theta_k] - \phi^{\top} (s_k) u_k]\phi(s_k). % \end{equation*} \begin{equation} \dot{\theta}(t) = 0, \label{thetavmtdcFaster} \end{equation} \begin{equation} \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|u(t),\theta(t)])\phi_t|\theta(t)] - Cu(t). \label{uvmtdcFaster} \end{equation} The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter) when viewed from the second fast timescale. By the Hirsch lemma \cite{hirsch1989convergent}, it follows that $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some $\theta$ that depends on the initial condition $\theta_0$ of recursion (\ref{thetavmtdc}). Consider now the recursion (\ref{uvmtdc}). Let $N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k)|\mathcal{I} (k)]$, where $\mathcal{I}(k)=\sigma(u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, \begin{equation*} \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq c_2(1+||u_k||^2+||\theta_k||^2). \end{equation*} Because $\theta(t)\equiv \theta$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) can be written as \begin{equation} \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\theta])\phi_t|\theta] - Cu(t). \label{uvmtdcFasterFinal} \end{equation} Now consider the function $h(u)=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\theta]|\theta] -Cu$, i.e., the driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}), $u^* = C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta])\phi|\theta]$ is the unique globally asymptotically stable equilibrium. Let $h_{\infty}(u)=-Cu$. For the ODE \begin{equation} \dot{u}(t) = h_{\infty}(u(t))= -Cu(t), \label{uvmtdcInfty} \end{equation} the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable equilibrium because $C$ is a positive definite matrix (because it is nonnegative definite and nonsingular). Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, by Theorem 2.2 of \cite{borkar2000ode} we obtain that $||u_k-u^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. Consider now the slower timescale recursion (\ref{thetavmtdc}). In the light of the above, (\ref{thetavmtdc}) can be rewritten as \begin{equation} \theta_{k+1} \leftarrow \theta_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k\\ - \alpha_k \gamma\phi'_{k}(\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k]). \end{equation} Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s