\section{Theoretical Analysis}
This section primarily focuses on proving the convergence of VMTD, VMTDC, and VMETD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $\textbf{A}$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}^{-1}b$.
\end{theorem}
\begin{proof}
\label{th1proof}
The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
A sketch proof is given as follows.
In the fast time scale, the parameter $\omega$ converges to $\mathbb{E}[\delta|\theta_k]$.
In the slow time scale, the associated ODE is
\begin{equation*}
\vec{h}(\theta(t))=-\textbf{A}\theta(t)+b.
\end{equation*}
\begin{equation}
\begin{array}{ccl}
\textbf{A} &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\
&=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\
\end{array}
\label{covariance}
\end{equation}
where the last equality uses $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}.
Note that the covariance matrices $\mathrm{Cov}(\phi,\phi)$ and $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are positive semi-definite.
Then, the matrix $\textbf{A}$ is positive semi-definite because, by (\ref{covariance}), it is a positively weighted linear combination of two positive semi-definite matrices.
Furthermore, $\textbf{A}$ is nonsingular due to the assumption.
Hence, the matrix $\textbf{A}$ is positive definite.
Therefore, the parameter $\theta$ converges to $\textbf{A}^{-1}b$.
\end{proof}
Please refer to the appendix for VMTD's detailed proof process.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC.
Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\zeta_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\zeta_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\zeta_k), $ $ \zeta_k = o(\beta_k).
$ Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments. Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$, and $\textbf{C}=\mathbb{E}[\phi\phi^{\top}]$. Assume that $\textbf{A}$ and $\textbf{C}$ are non-singular matrices. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}^{-1}b$. \end{theorem} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. A sketch proof is given as follows. In the fastest time scale, the parameter $w$ converges to $\mathbb{E}[\delta|u_k,\theta_k]$. In the second fast time scale, the parameter $u$ converges to $\textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta_k])\phi|\theta_k]$. In the slower time scale, the associated ODE is \begin{equation*} \vec{h}(\theta(t))=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\theta(t)+b). \end{equation*} The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the parameter $\theta$ converges to $\textbf{A}^{-1}b$. \end{proof} Please refer to the appendix for VMTDC's detailed proof process. % \begin{proof} % The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. % % For the VMTDC algorithm, a new one-step linear TD solution is defined as: % % \begin{equation*} % % 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). % % \end{equation*} % % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is % % $\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$. 
% First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, % \begin{equation*} % \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k), % \end{equation*} % \begin{equation*} % u_{k+1}\leftarrow u_k+\beta_k y(k), % \end{equation*} % where % \begin{equation*} % x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], % \end{equation*} % \begin{equation*} % y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k. % \end{equation*} % Recursion (\ref{thetavmtdc}) can also be rewritten as % \begin{equation*} % \theta_{k+1}\leftarrow \theta_k+\beta_k z(k), % \end{equation*} % where % \begin{equation*} % z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], % \end{equation*} % Due to the settings of step-size schedule % $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. % That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than % those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than % those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. % Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) % are associated to ODEs system as follows: % \begin{equation} % \dot{\theta}(t) = 0, % \label{thetavmtdcFastest} % \end{equation} % \begin{equation} % \dot{u}(t) = 0, % \label{uvmtdcFastest} % \end{equation} % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t). % \label{omegavmtdcFastest} % \end{equation} % Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ % and $u(t)\equiv u$ when viewed from the fastest timescale. 
% By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $u$ that depends on the initial condition $u_0$ of recursion % (\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) % can be written as % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t). % \label{omegavmtdcFastestFinal} % \end{equation} % Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$, % i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). % It is easy to find that the function $h$ is Lipschitz with coefficient % $-1$. % Let $h_{\infty}(\cdot)$ be the function defined by % $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. % Then $h_{\infty}(\omega)= -\omega$, is well-defined. % For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$ % is the unique globally asymptotically stable equilibrium. % For the ODE % \begin{equation} % \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), % \label{omegavmtdcInfty} % \end{equation} % apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its % associated strict Liapunov function. Then, % the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable % equilibrium. % Consider now the recursion (\ref{omegavmtdc}). % Let % $M_{k+1}=(\delta_k-\omega_k) % -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, % where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, % \begin{equation*} % \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq % c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). % \end{equation*} % Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. 
% Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our % conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, % by Theorem 2.2 of \cite{borkar2000ode} we obtain that % $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. % Recursion (\ref{uvmtdc}) is considered the second timescale. % Recursion (\ref{thetavmtdc}) is considered the slower timescale. % For the convergence properties of $u$ and $\theta$, please refer to the appendix. % \end{proof} % \begin{proof} % The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. % For the VMTDC algorithm, a new one-step linear TD solution is defined as: % \begin{equation*} % \begin{array}{ccl} % 0&=&\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\\ % & &\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]\\ % &=&\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b})\\ % &=&-\textbf{A}_{\text{VMTDC}}\bm{\theta}+\bm{b}_{\text{VMTDC}}. % \end{array} % \end{equation*} % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is % $\bm{\theta}_{\text{VMTDC}}=\textbf{A}^{-1}\bm{b}$. % First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, % \begin{equation*} % \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\zeta_k \bm{x}(k), % \end{equation*} % \begin{equation*} % \bm{u}_{k+1}\leftarrow \bm{u}_k+\beta_k \bm{y}(k), % \end{equation*} % where % \begin{equation*} % \bm{x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], % \end{equation*} % \begin{equation*} % \bm{y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_k]\bm{\phi}_k. 
% \end{equation*} % Recursion (\ref{thetavmtdc}) can also be rewritten as % \begin{equation*} % \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k z(k), % \end{equation*} % where % \begin{equation*} % z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], % \end{equation*} % Due to the settings of step-size schedule % $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. % That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than % those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than % those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. % Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) % are associated to ODEs system as follows: % \begin{equation} % \dot{\bm{\theta}}(t) = 0, % \label{thetavmtdcFastest} % \end{equation} % \begin{equation} % \dot{\bm{u}}(t) = 0, % \label{uvmtdcFastest} % \end{equation} % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)]-\omega(t). % \label{omegavmtdcFastest} % \end{equation} % Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ % and $u(t)\equiv u$ when viewed from the fastest timescale. % By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $u$ that depends on the initial condition $u_0$ of recursion % (\ref{uvmtdc}). 
% Thus, the ODE pair (\ref{thetavmtdcFastest})-(\ref{omegavmtdcFastest})
% can be written as
% \begin{equation}
% \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u},\bm{\theta}]-\omega(t).
% \label{omegavmtdcFastestFinal}
% \end{equation}
% Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$,
% i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
% It is easy to find that the function $h$ is Lipschitz with coefficient
% $-1$.
% Let $h_{\infty}(\cdot)$ be the function defined by
% $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
% Then $h_{\infty}(\omega)= -\omega$, is well-defined.
% For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta},\bm{u}]$
% is the unique globally asymptotically stable equilibrium.
% For the ODE
% \begin{equation}
% \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
% \label{omegavmtdcInfty}
% \end{equation}
% apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
% associated strict Liapunov function. Then,
% the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable
% equilibrium.
% Consider now the recursion (\ref{omegavmtdc}).
% Let
% $M_{k+1}=(\delta_k-\omega_k)
% -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
% where $\mathcal{F}(k)=\sigma(\omega_l,\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$,
% \begin{equation*}
% \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
% c_1(1+||\omega_k||^2+||\bm{u}_k||^2+||\bm{\theta}_k||^2).
% \end{equation*}
% Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
% Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
% conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
% by Theorem 2.2 of \cite{borkar2000ode} we obtain that
% $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
% Consider now the second time scale recursion (\ref{uvmtdc}).
% Based on the above analysis, (\ref{uvmtdc}) can be rewritten as % % \begin{equation*} % % \bm{u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|\bm{u}_k,\bm{\theta}_k] - \bm{\phi}^{\top} (s_k) \bm{u}_k]\bm{\phi}(s_k). % % \end{equation*} % \begin{equation} % \dot{\bm{\theta}}(t) = 0, % \label{thetavmtdcFaster} % \end{equation} % \begin{equation} % \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)])\bm{\phi}_t|\bm{\theta}(t)] - \textbf{C}\bm{u}(t). % \label{uvmtdcFaster} % \end{equation} % The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter) % when viewed from the second fast timescale. % By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}). % Consider now the recursion (\ref{uvmtdc}). % Let % $N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k)|\mathcal{I} (k)]$, % where $\mathcal{I}(k)=\sigma(\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, % \begin{equation*} % \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq % c_2(1+||\bm{u}_k||^2+||\bm{\theta}_k||^2). % \end{equation*} % Because $\bm{\theta}(t)\equiv \bm{\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) % can be written as % \begin{equation} % \dot{\bm{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{\theta}])\bm{\phi}_t|\bm{\theta}] - \textbf{C}\bm{u}(t). % \label{uvmtdcFasterFinal} % \end{equation} % Now consider the function $h(\bm{u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\bm{\theta}]|\bm{\theta}] -\textbf{C}\bm{u}$, i.e., the % driving vector field of the ODE (\ref{uvmtdcFasterFinal}). 
For (\ref{uvmtdcFasterFinal}), % $\bm{u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}])\bm{\phi}|\bm{\theta}]$ is the unique globally asymptotically % stable equilibrium. Let $h_{\infty}(\bm{u})=-\textbf{C}\bm{u}$. % For the ODE % \begin{equation} % \dot{\bm{u}}(t) = h_{\infty}(\bm{u}(t))= -\textbf{C}\bm{u}(t), % \label{uvmtdcInfty} % \end{equation} % the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable % equilibrium because $\textbf{C}$ is a positive definite matrix (because it is nonnegative definite and nonsingular). % Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. % Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our % conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, % by Theorem 2.2 of \cite{borkar2000ode} we obtain that % $||\bm{u}_k-\bm{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. % Consider now the slower timescale recursion (\ref{thetavmtdc}). In the light of the above, % (\ref{thetavmtdc}) can be rewritten as % % \begin{equation} % % \bm{\theta}_{k+1} \leftarrow \bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ % % - \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). % % \end{equation} % \begin{equation} % \begin{array}{ccl} % \bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ % & &- \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). % \end{array} % \end{equation} % Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). 
$
Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain.
Let $\textbf{A}_{\text{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top} ){\bm{\Phi}}$, $b_{\text{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-d_{\mu} f^{\top})r_{\pi}$.
Assume that the matrix $\textbf{A}_{\text{VMETD}}$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}_{\text{VMETD}}^{-1}b_{\text{VMETD}}$.
\end{theorem}
\begin{proof}
The proof of VMETD's convergence is also based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
A sketch proof is given as follows.
In the fast time scale, the parameter $\omega$ converges to $\mathbb{E}_{\mu}[F\rho\delta|\theta_k]$.
Recursion (\ref{thetavmetd}) is considered the slower timescale.
If the key matrix $\textbf{A}_{\text{VMETD}}$ is positive definite, then $\theta$ converges.
\begin{equation}
\label{rowsum}
\begin{split}
&(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )e\\
&=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})e-{d}_{\mu} {d}_{\mu}^{\top} e\\
% &=\textbf{F}(\textbfe-\gamma \textbf{P}_{\pi} \textbfe)-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbfe\\
&=(1-\gamma)\textbf{F}e-{d}_{\mu} {d}_{\mu}^{\top} e\\
% &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbfe\\
&=(1-\gamma){f}-{d}_{\mu} \\
&=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}{d}_{\mu}-{d}_{\mu} \\
&=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]{d}_{\mu} \\
&=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]{d}_{\mu} \\
&=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]{d}_{\mu} > 0, \\
\end{split}
\end{equation}
\begin{equation}
\label{columnsum}
\begin{split}
&e^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )\\
&=e^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
&={d}_{\mu}^{\top}-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
&={d}_{\mu}^{\top}- {d}_{\mu}^{\top} \\
&=0,
\end{split}
\end{equation}
where $e$ is the all-ones vector.
The matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top}$ has positive diagonal entries and negative off-diagonal entries, and (\ref{rowsum}) and (\ref{columnsum}) show that each of its row sums plus the corresponding column sum is positive.
Therefore, $\textbf{A}_{\text{VMETD}}$ is positive definite.
\end{proof}
\subsection{Optimal Policy Invariance}
This section proves the optimal policy invariance of VMTD, VMTDC and VMETD in control experiments, laying the groundwork for subsequent experiments.
As shown in Table \ref{example_bias}, although there is a bias between the true value and the predicted value, action $a_3$ is still chosen under the greedy policy.
On the contrary, supervised learning is usually used to predict temperature, humidity, morbidity, etc.
If the bias is too large, the consequences could be serious.
\begin{table}[ht]
\caption{Comparison of action selection with and without constant bias in $Q$ values.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcc}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1& 5 \\
$Q(s, a_1)$ & 2& 6 \\
$Q(s, a_2)$ & 3& 7 \\
$Q(s, a_3)$ & 4& 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$& $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
In addition, reward shaping can significantly speed up the learning by adding a shaping reward $F(s,s')$ to the original reward $r$, where $F(s,s')$ is the general form of any state-based shaping reward.
Static potential-based reward shaping (Static PBRS) maintains the policy invariance if the shaping reward follows from $F(s,s')=\gamma f(s')-f(s)$ \cite{ng1999policy}.
This means that we can make changes to the TD error $\delta = r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi} $ while still ensuring the invariance of the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi} - \omega,
\end{equation*}
where $\omega$ is a constant, acting as a static PBRS.
This also means that algorithms with the optimization goal of minimizing errors, after introducing reward shaping, may result in larger or smaller bias.
Fortunately, as discussed above, bias is acceptable in reinforcement learning.
However, the problem is that selecting an appropriate $\omega$ requires expert knowledge.
This forces us to learn $\omega$ dynamically, i.e., $\omega=\omega_t$.
Dynamic PBRS can also maintain the policy invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$, where $t$ is the time-step the agent reaches in state $s$ \cite{devlin2012dynamic}.
However, this result requires the convergence guarantee of the dynamic potential function $f(s,t)$.
If $f(s,t)$ does not converge as the time-step $t\rightarrow\infty$, the Q-values of dynamic PBRS are not guaranteed to converge.
Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$.
Thus, $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$ is a dynamic PBRS.
If $\omega$ eventually converges, the dynamic potential function $f(s,t)$ will also converge.
Bias is the expected difference between the predicted value and the true value.
Therefore, under the premise of bootstrapping, it is natural to let $\omega \doteq\mathbb{E}[\delta]$ or $\omega \doteq \mathbb{E}[F \rho\delta]$.