\section{Relevant proofs}
\subsection{Proof of Theorem \ref{theorem1}}
\label{proofth1}
\begin{proof}
    \label{th1proof}   
        The proof is  based on Borkar's Theorem   for
        general stochastic approximation recursions with two time scales
        \cite{borkar1997stochastic}. 
        
        % The new TD error for the linear setting is 
        % \begin{equation*}
        % \delta_{\text{new}}=r+\gamma
        % \theta^{\top}\phi'-\theta^{\top}\phi-\mathbb{E}[\delta].
        % \end{equation*}
        A new one-step
        linear TD solution is defined
        as: 
        \begin{equation*}
        0=\mathbb{E}[(\delta-\mathbb{E}[\delta]) \phi]=-A\theta+b.
        \end{equation*}
        Thus, the  VMTD's solution is
        $\theta_{\text{VMTD}}=A^{-1}b$.
        
        First, note that recursion (\ref{theta}) can be rewritten as
        \begin{equation*}
        \theta_{k+1}\leftarrow \theta_k+\beta_k\xi(k),
        \end{equation*}
        where
        \begin{equation*}
        \xi(k)=\frac{\alpha_k}{\beta_k}(\delta_k-\omega_k)\phi_k
        \end{equation*}
        Due to the settings of step-size schedule $\alpha_k = o(\beta_k)$,
        $\xi(k)\rightarrow 0$ almost surely as $k\rightarrow\infty$. 
        That is the increments in iteration (\ref{omega}) are uniformly larger than
        those in (\ref{theta}), thus (\ref{omega}) is the faster recursion.
        Along the faster time scale, iterations of (\ref{omega}) and (\ref{theta})
        are associated to ODEs system as follows:
        \begin{equation}
        \dot{\theta}(t) = 0,
        \label{thetaFast}
        \end{equation}
        \begin{equation}
        \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta(t)]-\omega(t).
        \label{omegaFast}
        \end{equation}
        Based on the ODE (\ref{thetaFast}), $\theta(t)\equiv \theta$ when
        viewed from the faster timescale. 
        By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
        $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
        $\theta$ that depends on the initial condition $\theta_0$ of recursion
        (\ref{theta}).
        Thus, the ODE pair (\ref{thetaFast})-(\ref{omegaFast}) can be written as
        \begin{equation}
        \dot{\omega}(t)=\mathbb{E}[\delta_t|\theta]-\omega(t).
        \label{omegaFastFinal}
        \end{equation}
        Consider the function $h(\omega)=\mathbb{E}[\delta|\theta]-\omega$,
        i.e., the driving vector field of the ODE (\ref{omegaFastFinal}).
        It is easy to find that the function $h$ is Lipschitz with coefficient
        $-1$.
        Let $h_{\infty}(\cdot)$ be the function defined by
         $h_{\infty}(\omega)=\lim_{x\rightarrow \infty}\frac{h(x\omega)}{x}$.
         Then  $h_{\infty}(\omega)= -\omega$,  is well-defined. 
         For (\ref{omegaFastFinal}), $\omega^*=\mathbb{E}[\delta|\theta]$
        is the unique globally asymptotically stable equilibrium.
         For the ODE
          \begin{equation}
         \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
         \label{omegaInfty}
         \end{equation}
         apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
        associated strict Liapunov function. Then,
        the origin of (\ref{omegaInfty}) is a globally asymptotically stable
        equilibrium.
        
        
        Consider now the recursion (\ref{omega}).
        Let
        $M_{k+1}=(\delta_k-\omega_k)
        -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
        where $\mathcal{F}(k)=\sigma(\omega_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, 
        $k\geq 1$ are the sigma fields
        generated by $\omega_0,\theta_0,\omega_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
        $0\leq l<k$.
        It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that 
        satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
        Because $\phi_k$, $r_k$, and $\phi_k'$   have
        uniformly bounded second moments, it can be seen that for some constant
        $c_1>0$, $\forall k\geq0$,
        \begin{equation*}
        \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
        c_1(1+||\omega_k||^2+||\theta_k||^2).
        \end{equation*}
        
        
        Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
        Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
        conditions on the step-size sequences $\alpha_k$, $\beta_k$. Thus,
        by Theorem 2.2 of \cite{borkar2000ode} we obtain that
        $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
        
        Consider now the slower time scale recursion (\ref{theta}).
        Based on the above analysis, (\ref{theta}) can be rewritten as 
        \begin{equation*}
        \theta_{k+1}\leftarrow
        \theta_{k}+\alpha_k(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k.
        \end{equation*}
        
        Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, 
        $k\geq 1$ be the sigma fields
        generated by $\theta_0,\theta_{l+1},\phi_l,\phi_l'$,
        $0\leq l<k$.
        Let 
        $
        Z_{k+1} = Y_{t}-\mathbb{E}[Y_{t}|\mathcal{G}(k)],
        $
        where
        \begin{equation*}
        Y_{k}=(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k.
        \end{equation*}
        Consequently,
        \begin{equation*}
        \begin{array}{ccl}
        \mathbb{E}[Y_t|\mathcal{G}(k)]&=&\mathbb{E}[(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k|\mathcal{G}(k)]\\
        &=&\mathbb{E}[\delta_k\phi_k|\theta_k]
        -\mathbb{E}[\mathbb{E}[\delta_k|\theta_k]\phi_k]\\
        &=&\mathbb{E}[\delta_k\phi_k|\theta_k]
        -\mathbb{E}[\delta_k|\theta_k]\mathbb{E}[\phi_k]\\
        &=&\mathrm{Cov}(\delta_k|\theta_k,\phi_k),
        \end{array}
        \end{equation*}
        where $\mathrm{Cov}(\cdot,\cdot)$ is a covariance operator.
        
         Thus,
         \begin{equation*}
        \begin{array}{ccl}
        Z_{k+1}&=&(\delta_k-\mathbb{E}[\delta_k|\theta_k])\phi_k-\mathrm{Cov}(\delta_k|\theta_k,\phi_k).
        \end{array}
        \end{equation*}
        It is easy to verify that $Z_{k+1},k\geq0$ are integrable random variables that 
        satisfy $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$.
        Also, because $\phi_k$, $r_k$, and $\phi_k'$  have
        uniformly bounded second moments, it can be seen that for some constant
        $c_2>0$, $\forall k\geq0$,
        \begin{equation*}
        \mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
        c_2(1+||\theta_k||^2).
        \end{equation*}
        
        Consider now the following ODE associated with (\ref{theta}):
        \begin{equation}
        \begin{array}{ccl}
        \dot{\theta}(t)&=&\mathrm{Cov}(\delta|\theta(t),\phi)\\
        &=&\mathrm{Cov}(r+(\gamma\phi'-\phi)^{\top}\theta(t),\phi)\\
        &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\theta(t)^{\top}(\phi-\gamma\phi'),\phi)\\
        &=&\mathrm{Cov}(r,\phi)-\theta(t)^{\top}\mathrm{Cov}(\phi-\gamma\phi',\phi)\\
        &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi-\gamma\phi',\phi)^{\top}\theta(t)\\
        &=&\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi-\gamma\phi')\theta(t)\\
        &=&-A\theta(t)+b.
        \end{array}
        \label{odetheta}
        \end{equation}
        Let $\vec{h}(\theta(t))$ be the driving vector field of the ODE
        (\ref{odetheta}).
        \begin{equation*}
        \vec{h}(\theta(t))=-A\theta(t)+b.
        \end{equation*}
         Consider the cross-covariance matrix,
        \begin{equation}
        \begin{array}{ccl}
        A &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\
          &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\
          &=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\
          &=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\
        \end{array}
        \label{covariance}
        \end{equation}
        where we eventually used $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$
        \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to
        the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or
        initialized randomly in a Markov chain for on-policy update.}.
        Note that the covariance matrix $\mathrm{Cov}(\phi,\phi)$ and
        $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are semi-positive
        definite. Then, the matrix $A$ is semi-positive definite because  $A$ is
        linearly combined  by  two positive-weighted semi-positive definite matrice
        (\ref{covariance}).
        Furthermore, $A$ is nonsingular due to the assumption.
        Hence, the cross-covariance matrix $A$ is positive definite.
        
        Therefore,
        $\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically
        stable equilibrium for ODE (\ref{odetheta}).
        Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow
        \infty}\frac{\vec{h}(r\theta)}{r}$. Then
        $\vec{h}_{\infty}(\theta)=-A\theta$ is well-defined. 
        Consider now the ODE
        \begin{equation}
        \dot{\theta}(t)=-A\theta(t).
        \label{odethetafinal}
        \end{equation}
        The ODE (\ref{odethetafinal}) has the origin as its unique globally asymptotically stable equilibrium.
        Thus, the assumption (A1) and (A2) are verified.
    \end{proof}

\subsection{Proof of Corollary \ref{corollary4_2}}
\label{proofcorollary4_2}
The update formulas in linear two-timescale algorithms are as follows:
\begin{equation}
    \theta_{k+1}=\theta_{k} + \alpha_{k}[h_1(\theta_{k},\omega_{k})+M^{(1)}_{k+1}],
\end{equation}
\begin{equation}
    \omega_{k+1}=\omega_{k} + \alpha_{k}[h_2(\theta_{k},\omega_{k})+M^{(2)}_{k+1}].
\end{equation}
where $\alpha_k, \beta_k \in \mathbb{R} $ are stepsizes and $M^{(1)} \in \mathbb{R}^{d_1}, M^{(2)} \in \mathbb{R}^{d_2}$
denote noise. 
$h_1 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{1}}$ and 
$h_2 : \mathbb{R}^{d_{1}}\times \mathbb{R}^{d_{2}}\rightarrow \mathbb{R}^{d_{2}}$ have the 
form, respectively, 
\begin{equation}
    h_{1}(\theta,\omega)=v_1 - \Gamma_1 \theta - W_1\omega,
\end{equation}
\begin{equation}
    h_{2}(\theta,\omega)=v_2 - \Gamma_2 \theta - W_2\omega,
\end{equation}
where $v_1 \in \mathbb{R}^{d_1}$, $v_2 \in \mathbb{R}^{d_2}$, $\Gamma_1 \in \mathbb{R}^{d_1 \times d_1}$
, $\Gamma_2 \in \mathbb{R}^{d_2 \times d_1}$, $W_1 \in \mathbb{R}^{d_1 \times d_2}$ and 
$W_2 \in \mathbb{R}^{d_2 \times d_2}$. $d_1$ and $d_2$ are the dimensions of vectors $\theta$ and $\omega$, respectively.

For Theorem 3 in \cite{dalal2020tale}, the theorem still holds even when $d——1$ is not equal to $d_2$. For the VMTD algorithm, $d_2$ is equal to 1.
% Before proving the Corollary \ref{corollary4_2}, 
\cite{dalal2020tale} presents 
the matrix assumption, step size assumption, and 
defines sparse projection.

\begin{assumption}
\label{matrixassumption}
(Matrix Assumption).
$W_2$ and $X_1 = \Gamma_1 - W_1 W_{2}^{-1}\Gamma_2$ are positive definite(not necessarily symmetric).
\end{assumption}

\begin{assumption}
    \label{stepsizeassumption}
(Step Size Assumption).
$\alpha_k = (k+1)^{-\alpha}$ and $\beta_k = (k+1)^{-\beta}$, where $1>\alpha > \beta > 0$.
\end{assumption}

\begin{definition}
    \label{sparseprojection}
(Sparse Projection).
For $R>0$, let $\Pi_{R}(x)=\min \{1, R/||x||\}$. $x$ be the projection into the ball with redius
R around the origin. The sparse projection operator
\begin{equation*}
    \Pi_{n, R} = \begin{cases}
        \Pi_{R}, & \text{if } k = n^{n} - 1 \text{ for some } n \in \mathbb{Z}_{>0}, \\
        I, & \text{otherwise}.
    \end{cases}
\end{equation*}
We call it sparse as it projects only on specific indices that are exponentially far apart.

Pick an arbitrary $p>1$. Fix some constant $R^{\theta}_{\text{proj}}>0$ and $R^{\omega}_{\text{proj}}>0$ 
for the radius of the projection ball. Further, let 
\begin{equation*}
    \theta^{*}=X^{-1}_{1}b_{1}, \omega^{*}=W^{-1}_{2}(v_2 - \Gamma_2 \theta^{*})
\end{equation*}
with $b_1=v_1 - W_1 W_2^{-1}v_2$.
The formula for the sparse projection update in linear two-timescale algorithms is as follows:
\begin{equation}
    \label{sparseprojectiontheta}
    \theta'_{k+1}=\Pi_{k+1,R^{\theta}_{\text{proj}}}(\theta'_{k} + \alpha_{k}[h_1(\theta'_{k},\omega'_{k})+M^{(1')}_{k+1}]),
\end{equation}
\begin{equation}
    \label{sparseprojectionomega}
    \omega'_{k+1}=\Pi_{k+1,R^{\omega}_{\text{proj}}}(\omega'_{k} + \beta_{k}[h_2(\theta'_{k},\omega'_{k})+M^{(2')}_{k+1}]).
\end{equation}

\end{definition}

\begin{proof}
    As long as the VMTD algorithm satisfies Assumption \ref{matrixassumption}, 
the convergence speed of the VMTD algorithm can be 
obtained.

VMTD's update rule is 
\begin{equation*}
    \theta_{k+1}=\theta_{k}+\alpha_k(\delta_k-\omega_k)\phi_k.
\end{equation*}
\begin{equation*}
        \omega_{k+1}=\omega_{k}+\beta_k(\delta_k-\omega_k).
\end{equation*}
Thus, $h_1(\theta, \omega)=\mathrm{Cov}(r,\phi)-\mathrm{Cov}(\phi,\phi - \gamma\phi')\theta$, 
$h_2(\theta, \omega)=\mathbb{E}[r]+\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]\theta -\omega$, 
$\Gamma_1 =\mathrm{Cov}(\phi,\phi - \gamma\phi')$, 
$W_1 = 0$ and 
$\Gamma_2 = -\mathbb{E}[\gamma \phi'^{\top}-\phi^{\top}]$, 
$W_2 = 1$, 
$v_2 = \mathbb{E}[r]$. Additionally, 
$X_1=\Gamma_1 - W_1 W^{-1}_2 \Gamma_2 = \mathrm{Cov}(\phi,\phi - \gamma\phi')$.
% By the Assumption \ref{matrixassumption}, 
It can be deduced from the proof \ref{th1proof} that $X_1$ is a positive definite matrix.
The VMTD algorithm satisfies the Assumption \ref{matrixassumption}.
By the proof \ref{th1proof}, Definition 1 in \cite{dalal2020tale} is satisfied.
We can apply the Theorem 3 in \cite{dalal2020tale} to get the Corollary \ref{corollary4_2}.


\end{proof}


\subsection{Proof of Theorem \ref{theorem2}}
\label{proofth2}
\begin{proof}
The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation.

For the VMTDC algorithm, a new one-step linear TD solution is defined as:
\begin{equation*}
    0=\mathbb{E}[(\phi - \gamma \phi' - \mathbb{E}[\phi - \gamma \phi'])\phi^\top]\mathbb{E}[\phi \phi^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\phi]=A^{\top}C^{-1}(-A\theta+b).
\end{equation*}
The matrix $A^{\top}C^{-1}A$ is positive definite. Thus, the  VMTD's solution is
$\theta_{\text{VMTDC}}=\theta_{\text{VMTD}}=A^{-1}b$.

First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, 
\begin{equation*}
    \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k),
\end{equation*}
\begin{equation*}
    u_{k+1}\leftarrow u_k+\beta_k y(k),
\end{equation*}
where 
\begin{equation*}
    x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)],
\end{equation*}
\begin{equation*}
    y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k.
\end{equation*}

Recursion (\ref{thetavmtdc}) can also be rewritten as
\begin{equation*}
    \theta_{k+1}\leftarrow \theta_k+\beta_k z(k),
\end{equation*}
where
\begin{equation*}
    z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)],
\end{equation*}

Due to the settings of step-size schedule 
$\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$.
That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than
those in (\ref{uvmtdc}) and  the increments in iteration (\ref{uvmtdc}) are uniformly larger than
those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion.
Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc})
are associated to ODEs system as follows:
\begin{equation}
    \dot{\theta}(t) = 0,
    \label{thetavmtdcFastest}
\end{equation}
\begin{equation}
    \dot{u}(t) = 0,
    \label{uvmtdcFastest}
\end{equation}
\begin{equation}
    \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t).
    \label{omegavmtdcFastest}
\end{equation}

Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$
and $u(t)\equiv u$ when viewed from the fastest timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\theta$ that depends on the initial condition $\theta_0$ of recursion
(\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$u$ that depends on the initial condition $u_0$ of recursion
(\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest})
can be written as 
\begin{equation}
    \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t).
    \label{omegavmtdcFastestFinal}
\end{equation}

Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$,
i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
It is easy to find that the function $h$ is Lipschitz with coefficient
$-1$.
Let $h_{\infty}(\cdot)$ be the function defined by
 $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
 Then  $h_{\infty}(\omega)= -\omega$,  is well-defined. 
 For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$
is the unique globally asymptotically stable equilibrium.
For the ODE
\begin{equation}
 \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
 \label{omegavmtdcInfty}
\end{equation}
apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
associated strict Liapunov function. Then,
the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable
equilibrium.

Consider now the recursion (\ref{omegavmtdc}).
Let
$M_{k+1}=(\delta_k-\omega_k)
-\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, 
$k\geq 1$ are the sigma fields
generated by $\omega_0,u_0,\theta_0,\omega_{l+1},u_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
It is easy to verify that $M_{k+1},k\geq0$ are integrable random variables that 
satisfy $\mathbb{E}[M_{k+1}|\mathcal{F}(k)]=0$, $\forall k\geq0$.
Because $\phi_k$, $r_k$, and $\phi_k'$   have
uniformly bounded second moments, it can be seen that for some constant
$c_1>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2).
\end{equation*}


Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.

Consider now the second time scale recursion (\ref{uvmtdc}).
Based on the above analysis, (\ref{uvmtdc}) can be rewritten as
% \begin{equation*}
%     u_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|u_k,\theta_k] - \phi^{\top} (s_k) u_k]\phi(s_k).
% \end{equation*}
\begin{equation}
    \dot{\theta}(t) = 0,
    \label{thetavmtdcFaster}
\end{equation}
\begin{equation}
    \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|u(t),\theta(t)])\phi_t|\theta(t)] - Cu(t).
    \label{uvmtdcFaster}
\end{equation}
The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter)
when viewed from the second fast timescale.
By the Hirsch lemma \cite{hirsch1989convergent}, it follows that
$||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some
$\theta$ that depends on the initial condition $\theta_0$ of recursion
(\ref{thetavmtdc}). 

Consider now the recursion (\ref{uvmtdc}).
Let
$N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \phi_k \phi^{\top}_k u_k)|\mathcal{I} (k)]$,
where $\mathcal{I}(k)=\sigma(u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, 
$k\geq 1$ are the sigma fields
generated by $u_0,\theta_0,u_{l+1},\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$.
It is easy to verify that $N_{k+1},k\geq0$ are integrable random variables that 
satisfy $\mathbb{E}[N_{k+1}|\mathcal{I}(k)]=0$, $\forall k\geq0$.
Because $\phi_k$, $r_k$, and $\phi_k'$   have
uniformly bounded second moments, it can be seen that for some constant
$c_2>0$, $\forall k\geq0$,
\begin{equation*}
\mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq
c_2(1+||u_k||^2+||\theta_k||^2).
\end{equation*}

Because $\theta(t)\equiv \theta$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster})
can be written as 
\begin{equation}
    \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\theta])\phi_t|\theta] - Cu(t).
    \label{uvmtdcFasterFinal}
\end{equation}
Now consider the function $h(u)=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\theta]|\theta] -Cu$, i.e., the
driving vector field of the ODE (\ref{uvmtdcFasterFinal}). For (\ref{uvmtdcFasterFinal}),
$u^* = C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta])\phi|\theta]$ is the unique globally asymptotically
stable equilibrium. Let $h_{\infty}(u)=-Cu$.
For the ODE
\begin{equation}
    \dot{u}(t) = h_{\infty}(u(t))= -Cu(t),
    \label{uvmtdcInfty}
\end{equation}
the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable
equilibrium because $C$ is a positive definite matrix (because it is nonnegative definite and nonsingular).
Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
by Theorem 2.2 of \cite{borkar2000ode} we obtain that
$||u_k-u^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.

Consider now the slower timescale recursion (\ref{thetavmtdc}). In the light of the above,
(\ref{thetavmtdc}) can be rewritten as 
\begin{equation}
    \theta_{k+1} \leftarrow \theta_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k\\
    - \alpha_k \gamma\phi'_{k}(\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k]).
\end{equation}
Let $\mathcal{G}(k)=\sigma(\theta_l,l\leq k;\phi_s,\phi_s',r_s,s<k)$, 
$k\geq 1$ be the sigma fields
generated by $\theta_0,\theta_{l+1},\phi_l,\phi_l'$,
$0\leq l<k$. Let
\begin{equation*}
    \begin{array}{ccl}
    Z_{k+1}&=&((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])\\ 
     & &-\mathbb{E}[((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])|\mathcal{G}(k)]\\
    &=&((\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k - \gamma \phi'_{k}\phi^{\top}_k C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k])\phi|\theta_k])\\
    & &-\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k|\theta_k] - \gamma\mathbb{E}[\phi' \phi^{\top}]C^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\theta_k]) \phi_k|\theta_k].
    \end{array}
\end{equation*}
It is easy to see that $Z_k$, $k\geq 0$ are integrable random variables and $\mathbb{E}[Z_{k+1}|\mathcal{G}(k)]=0$, $\forall k\geq0$. Further,
\begin{equation*}
\mathbb{E}[||Z_{k+1}||^2|\mathcal{G}(k)]\leq
c_3(1+||\theta_k||^2), k\geq 0
\end{equation*}
for some constant $c_3 \geq 0$, again beacuse $\phi_k$, $r_k$, and $\phi_k'$   have
uniformly bounded second moments, it can be seen that for some constant.

Consider now the following ODE associated with (\ref{thetavmtdc}):
\begin{equation}
    \dot{\theta}(t) = (I - \mathbb{E}[\gamma \phi' \phi^{\top}]C^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)].
    \label{thetavmtdcSlowerFinal}
\end{equation}
Let 
\begin{equation*}
\begin{array}{ccl}
    \vec{h}(\theta(t))&=&(I - \mathbb{E}[\gamma \phi' \phi^{\top}]C^{-1})\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
    &=&(C - \mathbb{E}[\gamma \phi' \phi^{\top}])C^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
    &=& (\mathbb{E}[\phi \phi^{\top}] - \mathbb{E}[\gamma \phi' \phi^{\top}])C^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]\\
    &=& A^{\top}C^{-1}(-A\theta(t)+b),
\end{array}
\end{equation*}
because $\mathbb{E}[(\delta -\mathbb{E}[\delta|\theta(t)]) \phi|\theta(t)]=-A\theta(t)+b$, where 
$A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$, and $C=\mathbb{E}[\phi\phi^{\top}]$

Therefore,
$\theta^*=A^{-1}b$ can be seen to be the unique globally asymptotically
stable equilibrium for ODE (\ref{thetavmtdcSlowerFinal}).
Let $\vec{h}_{\infty}(\theta)=\lim_{r\rightarrow
\infty}\frac{\vec{h}(r\theta)}{r}$. Then
$\vec{h}_{\infty}(\theta)=-A^{\top}C^{-1}A\theta$ is well-defined. 
Consider now the ODE
\begin{equation}
\dot{\theta}(t)=-A^{\top}C^{-1}A\theta(t).
\label{odethetavmtdcfinal}
\end{equation}

Because $C^{-1}$ is positive definite and $A$ has full rank (as it
is nonsingular by assumption), the matrix $A^{\top} C^{-1}A$ is also
positive definite. 
The ODE (\ref{odethetavmtdcfinal}) has the origin as its unique globally asymptotically stable equilibrium.
Thus, the assumption (A1) and (A2) are verified.

The proof is given above.
In the fastest time scale, the parameter $w$ converges to
$\mathbb{E}[\delta|u_k,\theta_k]$.
In the second fast time scale,
the parameter $u$ converges to $C^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta_k])\phi|\theta_k]$.
In the slower time scale,
the parameter $\theta$ converges to $A^{-1}b$.
\end{proof}

\begin{algorithm}[t]
    \caption{VMTDC algorithm with linear function approximation in the off-policy setting}
    \label{alg:algorithm 2}
\begin{algorithmic}
    \STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma
    $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
    \REPEAT
    \STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_t$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
    \textbf{Output}: $\theta^*$.\\
    \FOR{$t=0$ {\bfseries to} $T-1$}
    \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
    \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
    \STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
    \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
    \STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[ (\delta_t-\omega_t)\phi_t - \gamma \phi_{t+1}(\phi^{\top}_{t} u_t)]$
    \STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) - \phi^{\top}_{t} u_t] \phi_t$
    \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
    \STATE $S_t=S_{t+1}$
    \ENDFOR
    \UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
    \caption{VMGTD algorithm with linear function approximation in the off-policy setting}
    \label{alg:algorithm 3}
\begin{algorithmic}
    \STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma
    $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
    \REPEAT
    \STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_t$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
    \textbf{Output}: $\theta^*$.\\
    \FOR{$t=0$ {\bfseries to} $T-1$}
    \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
    \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
    \STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
    \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
    \STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[\phi_t - \gamma \phi_{t+1}]\phi^{\top}_{t} u_t$
    \STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) \phi_t - u_t]$
    \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
    \STATE $S_t=S_{t+1}$
    \ENDFOR
    \UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
    \caption{VMGTD2 algorithm with linear function approximation in the off-policy setting}
    \label{alg:algorithm 4}
\begin{algorithmic}
    \STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma
    $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
    \REPEAT
    \STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_t$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
    \textbf{Output}: $\theta^*$.\\
    \FOR{$t=0$ {\bfseries to} $T-1$}
    \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
    \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
    \STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
    \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
    \STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t \rho_t[\phi_t - \gamma \phi_{t+1}]\phi^{\top}_{t} u_t$
    \STATE $u_{t+1}\leftarrow u_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) - \phi^{\top}_{t} u_t] \phi_t$
    \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
    \STATE $S_t=S_{t+1}$
    \ENDFOR
    \UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}

% \begin{algorithm}[t]
%     \caption{VMETD algorithm with linear function approximation in the off-policy setting}
%     \label{alg:algorithm 5}
% \begin{algorithmic}
%     \STATE {\bfseries Input:} $\theta_{0}$, $u_0$, $\omega_{0}$, $\gamma
%     $, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
%     \REPEAT
%     \STATE For any episode, initialize $\theta_{0}$ arbitrarily, $u_t$ to $1$ and $\omega_{0}$ to $0$, $\gamma \in (0,1]$, and $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
%     \textbf{Output}: $\theta^*$.\\
%     \FOR{$t=0$ {\bfseries to} $T-1$}
%     \STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
%     \STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
%     \STATE $\delta_t = R_{t+1}+\gamma\theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t$
%     \STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
%     \STATE $\theta_{t+1}\leftarrow \theta_{t}+\alpha_t u_t \rho_t(\delta_t-\omega_t)\phi_t$
%     \STATE $u_{t+1}\leftarrow \gamma \rho_t u_t +1$
%     \STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
%     \STATE $S_t=S_{t+1}$
%     \ENDFOR
%     \UNTIL{terminal episode}
% \end{algorithmic}
% \end{algorithm}

\section{Experimental details}
\label{experimentaldetails}
The feature matrices corresponding to three random walks are shown below respectively:
\begin{equation*}
    \Phi_{tabular}=\left[ 
    \begin{array}{ccccc}
    1 & 0& 0& 0& 0\\
    0 & 1& 0& 0& 0\\
    0 & 0& 1& 0& 0\\
    0 & 0& 0& 1& 0\\
    0 & 0& 0& 0& 1
    \end{array}\right]
    \end{equation*}
    \begin{equation*}
    \Phi_{inverted}=\left[ 
    \begin{array}{ccccc}
    0 & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & 0& \frac{1}{2}& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& 0& \frac{1}{2}& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& 0& \frac{1}{2}\\
    \frac{1}{2} & \frac{1}{2}& \frac{1}{2}& \frac{1}{2}& 0
    \end{array}\right]
    \end{equation*}
    \begin{equation*}
    \Phi_{dependent}=\left[ 
    \begin{array}{ccccc}
    1 & 0& 0\\
    \frac{1}{\sqrt{2}} & \frac{1}{\sqrt{2}}& 0\\
    \frac{1}{\sqrt{3}} & \frac{1}{\sqrt{3}}& \frac{1}{\sqrt{3}}\\
    0 & \frac{1}{\sqrt{2}}& \frac{1}{\sqrt{2}}\\
    0 & 0& 1
    \end{array}\right]
    \end{equation*}

Three random walk experiments: the $\alpha$ values for 
all algorithms are in the range of $\{0.008, 0.015, 0.03, 0.06, 0.12, 0.25, 0.5\}$. For the TDC algorithm, 
the range of the ratio $\frac{\zeta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. For the VMTD algorithm, 
the range of the ratio $\frac{\beta}{\alpha}$ is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$. It can be observed from 
the update formula of VMTDC that when $\zeta$ takes a very small value, 
the VMTDC update tends to be similar to VMTD update. Similarly, 
when $\beta$ takes a very small value, the VMTDC update tends to be 
similar to TDC update. Through experiments, it was found that 
setting $\zeta$ to a small value makes VMTDC updates approach VMTD 
updates, resulting in better performance. Therefore, for the VMTDC 
algorithm, the range of $\frac{\beta}{\alpha}$ ratio is $\{\frac{1}{512}, \frac{1}{256}, \frac{1}{128}, \frac{1}{64}, \frac{1}{32}, \frac{1}{16}, \frac{1}{8}, \frac{1}{4}, \frac{1}{2}, 1, 2\}$, and the range of 
$\zeta$ is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. The learning curves in Figure \ref{Evaluation_full} correspond to the optimal 
parameters.

The feature matrix of 7-state version of Baird's off-policy counterexample is
defined as follow:
\begin{equation*}
\Phi_{Counter}=\left[ 
\begin{array}{cccccccc}
1 & 2& 0& 0& 0& 0& 0& 0\\
1 & 0& 2& 0& 0& 0& 0& 0\\
1 & 0& 0& 2& 0& 0& 0& 0\\
1 & 0& 0& 0& 2& 0& 0& 0\\
1 & 0& 0& 0& 0& 2& 0& 0\\
1 & 0& 0& 0& 0& 0& 2& 0\\
2 & 0& 0& 0& 0& 0& 0& 1
\end{array}\right]
\end{equation*}

7-state version of Baird's off-policy counterexample: 
for TD algorithm, $\alpha$ is set to 0.1. For the TDC algorithm, the range of 
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, 
and the range of 
$\zeta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. 
For the VMTD algorithm, the range of 
$\alpha$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0\}$, 
and the range of 
$\beta$ is $\{0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5\}$. Through experiments, it was found 
that setting $\zeta$ to a small value makes VMTDC updates approach VMTD 
updates, resulting in better performance. Therefore, for the VMTDC 
algorithm, The range of values for $\alpha$ and $\beta$ is the same as that of VMTD  
and the range of $\zeta$ 
is $\{0.1, 0.01, 0.001, 0.0001, 0.00001\}$. 
The learning curves in Figure \ref{Complete_full} correspond to the optimal parameters.
For all policy evaluation experiments, each experiment 
is independently run 100 times.

For the four control experiments: The learning rates for each 
algorithm in all experiments are shown in Table \ref{lrofways}.
For all control experiments, each experiment is independently run 50 times.

\begin{table*}[htb]
    \centering
    \caption{Learning rates ($lr$) of four control experiments.}
    \vskip 0.15in
    \begin{tabular}{c|ccccc}
        \hline
        \multicolumn{1}{c|}{\diagbox{algorithms($lr$)}{envs}} &Maze &Cliff walking &Mountain Car &Acrobot \\
        \hline
         Sarsa($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
         GQ(0)($\alpha,\zeta$)&$0.1,0.003$ &$0.1,0.004$ &$0.1,0.01$ &$0.1,0.01$ \\
         VMSarsa($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
         VMGQ(0)($\alpha,\zeta,\beta$)&$0.1,0.001,0.001$ &$0.1,0.005,\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ &$0.1,\text{5e-4},\text{1e-4}$ \\
         AC($lr_{\text{actor}},lr_{\text{critic}}$)&$0.01,0.1$ &$0.01,0.01$ &$0.01,0.05$ &$0.01,0.05$ \\
         Q-learning($\alpha$)&$0.1$ &$0.1$ &$0.1$ &$0.1$ \\
         VMQ($\alpha,\beta$)&$0.1,0.001$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ &$0.1,\text{1e-4}$ \\
         \hline
    \end{tabular}
    \label{lrofways}
    \vskip -0.1in
\end{table*}