\section{Theoretical Analysis}
This section primarily focuses on proving the convergence of VMTD, VMTDC, and VMETD.
\begin{theorem}
\label{theorem1}(Convergence of VMTD).
In the case of on-policy learning, consider the iterations (\ref{omega}) and (\ref{theta}) with (\ref{delta}) of VMTD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). $
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $\textbf{A}$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}^{-1}b$.
\end{theorem}
\begin{proof}
\label{th1proof}
The proof is based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
A sketch proof is given as follows.
In the fast time scale, the parameter $\omega$ converges to $\mathbb{E}[\delta|\theta_k]$.
In the slow time scale, the associated ODE is
\begin{equation*}
\vec{h}(\theta(t))=-\textbf{A}\theta(t)+b.
\end{equation*}
\begin{equation}
\begin{array}{ccl}
\textbf{A} &=& \mathrm{Cov}(\phi,\phi-\gamma\phi')\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\mathrm{Cov}(\gamma\phi',\gamma\phi')}{2}\\
&=&\frac{\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')-\gamma^2\mathrm{Cov}(\phi',\phi')}{2}\\
&=&\frac{(1-\gamma^2)\mathrm{Cov}(\phi,\phi)+\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')}{2},\\
\end{array}
\label{covariance}
\end{equation}
where the last equality uses $\mathrm{Cov}(\phi',\phi')=\mathrm{Cov}(\phi,\phi)$ \footnote{The covariance matrix $\mathrm{Cov}(\phi',\phi')$ is equal to the covariance matrix $\mathrm{Cov}(\phi,\phi)$ if the initial state is re-reachable or initialized randomly in a Markov chain for on-policy update.}.
Note that the covariance matrices $\mathrm{Cov}(\phi,\phi)$ and $\mathrm{Cov}(\phi-\gamma\phi',\phi-\gamma\phi')$ are positive semi-definite.
Then, the matrix $\textbf{A}$ is positive semi-definite because, by (\ref{covariance}), it is a positively weighted linear combination of two positive semi-definite matrices.
Furthermore, $\textbf{A}$ is nonsingular due to the assumption.
Hence, the matrix $\textbf{A}$ is positive definite.
Therefore, the parameter $\theta$ converges to $\textbf{A}^{-1}b$.
\end{proof}
Please refer to the appendix for VMTD's detailed proof process.
\begin{theorem}
\label{theorem2}(Convergence of VMTDC).
In the case of off-policy learning, consider the iterations (\ref{omegavmtdc}), (\ref{uvmtdc}) and (\ref{thetavmtdc}) of VMTDC.
Let the step-size sequences $\alpha_k$, $\zeta_k$ and $\beta_k$, $k\geq 0$ satisfy in this case $\alpha_k,\zeta_k,\beta_k>0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\sum_{k=0}^{\infty}\zeta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\zeta_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\zeta_k), $ $ \zeta_k = o(\beta_k).
$ Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with uniformly bounded second moments. Let $\textbf{A} = \mathrm{Cov}(\phi,\phi-\gamma\phi')$, $b=\mathrm{Cov}(r,\phi)$, and $\textbf{C}=\mathbb{E}[\phi\phi^{\top}]$. Assume that $\textbf{A}$ and $\textbf{C}$ are non-singular matrices. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}^{-1}b$. \end{theorem} \begin{proof} The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. A sketch proof is given as follows. In the fastest time scale, the parameter $w$ converges to $\mathbb{E}[\delta|u_k,\theta_k]$. In the second fast time scale, the parameter $u$ converges to $\textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\theta_k])\phi|\theta_k]$. In the slower time scale, the associated ODE is \begin{equation*} \vec{h}(\theta(t))=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\theta(t)+b). \end{equation*} The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the parameter $\theta$ converges to $\textbf{A}^{-1}b$. \end{proof} Please refer to the appendix for VMTDC's detailed proof process. % \begin{proof} % The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. % % For the VMTDC algorithm, a new one-step linear TD solution is defined as: % % \begin{equation*} % % 0=\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]=\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b}). % % \end{equation*} % % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is % % $\bm{\theta}_{\text{VMTDC}}=\bm{\theta}_{\text{VMTD}}=\textbf{A}^{-1}\bm{b}$. 
% First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, % \begin{equation*} % \theta_{k+1}\leftarrow \theta_k+\zeta_k x(k), % \end{equation*} % \begin{equation*} % u_{k+1}\leftarrow u_k+\beta_k y(k), % \end{equation*} % where % \begin{equation*} % x(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], % \end{equation*} % \begin{equation*} % y(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \phi^{\top}_k u_k]\phi_k. % \end{equation*} % Recursion (\ref{thetavmtdc}) can also be rewritten as % \begin{equation*} % \theta_{k+1}\leftarrow \theta_k+\beta_k z(k), % \end{equation*} % where % \begin{equation*} % z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \phi_k - \gamma\phi'_{k}(\phi^{\top}_k u_k)], % \end{equation*} % Due to the settings of step-size schedule % $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. % That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than % those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than % those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. % Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) % are associated to ODEs system as follows: % \begin{equation} % \dot{\theta}(t) = 0, % \label{thetavmtdcFastest} % \end{equation} % \begin{equation} % \dot{u}(t) = 0, % \label{uvmtdcFastest} % \end{equation} % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|u(t),\theta(t)]-\omega(t). % \label{omegavmtdcFastest} % \end{equation} % Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ % and $u(t)\equiv u$ when viewed from the fastest timescale. 
% By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $u$ that depends on the initial condition $u_0$ of recursion % (\ref{uvmtdc}). Thus, the ODE pair (\ref{thetavmtdcFastest})-(ref{omegavmtdcFastest}) % can be written as % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|u,\theta]-\omega(t). % \label{omegavmtdcFastestFinal} % \end{equation} % Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$, % i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}). % It is easy to find that the function $h$ is Lipschitz with coefficient % $-1$. % Let $h_{\infty}(\cdot)$ be the function defined by % $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$. % Then $h_{\infty}(\omega)= -\omega$, is well-defined. % For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\theta,u]$ % is the unique globally asymptotically stable equilibrium. % For the ODE % \begin{equation} % \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t), % \label{omegavmtdcInfty} % \end{equation} % apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its % associated strict Liapunov function. Then, % the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable % equilibrium. % Consider now the recursion (\ref{omegavmtdc}). % Let % $M_{k+1}=(\delta_k-\omega_k) % -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$, % where $\mathcal{F}(k)=\sigma(\omega_l,u_l,\theta_l,l\leq k;\phi_s,\phi_s',r_s,s0$, $\forall k\geq0$, % \begin{equation*} % \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq % c_1(1+||\omega_k||^2+||u_k||^2+||\theta_k||^2). % \end{equation*} % Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. 
% Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our % conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, % by Theorem 2.2 of \cite{borkar2000ode} we obtain that % $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. % Recursion (\ref{uvmtdc}) is considered the second timescale. % Recursion (\ref{thetavmtdc}) is considered the slower timescale. % For the convergence properties of $u$ and $\theta$, please refer to the appendix. % \end{proof} % \begin{proof} % The proof is similar to that given by \cite{sutton2009fast} for TDC, but it is based on multi-time-scale stochastic approximation. % For the VMTDC algorithm, a new one-step linear TD solution is defined as: % \begin{equation*} % \begin{array}{ccl} % 0&=&\mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}' - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}'])\bm{\phi}^\top]\\ % & &\mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}]\\ % &=&\textbf{A}^{\top}\textbf{C}^{-1}(-\textbf{A}\bm{\theta}+\bm{b})\\ % &=&-\textbf{A}_{\text{VMTDC}}\bm{\theta}+\bm{b}_{\text{VMTDC}}. % \end{array} % \end{equation*} % The matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is positive definite. Thus, the VMTD's solution is % $\bm{\theta}_{\text{VMTDC}}=\textbf{A}^{-1}\bm{b}$. % First, note that recursion (\ref{thetavmtdc}) and (\ref{uvmtdc}) can be rewritten as, respectively, % \begin{equation*} % \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\zeta_k \bm{x}(k), % \end{equation*} % \begin{equation*} % \bm{u}_{k+1}\leftarrow \bm{u}_k+\beta_k \bm{y}(k), % \end{equation*} % where % \begin{equation*} % \bm{x}(k)=\frac{\alpha_k}{\zeta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], % \end{equation*} % \begin{equation*} % \bm{y}(k)=\frac{\zeta_k}{\beta_k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_k]\bm{\phi}_k. 
% \end{equation*} % Recursion (\ref{thetavmtdc}) can also be rewritten as % \begin{equation*} % \bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\beta_k z(k), % \end{equation*} % where % \begin{equation*} % z(k)=\frac{\alpha_k}{\beta_k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \bm{u}_k)], % \end{equation*} % Due to the settings of step-size schedule % $\alpha_k = o(\zeta_k)$, $\zeta_k = o(\beta_k)$, $x(k)\rightarrow 0$, $y(k)\rightarrow 0$, $z(k)\rightarrow 0$ almost surely as $k\rightarrow 0$. % That is that the increments in iteration (\ref{omegavmtdc}) are uniformly larger than % those in (\ref{uvmtdc}) and the increments in iteration (\ref{uvmtdc}) are uniformly larger than % those in (\ref{thetavmtdc}), thus (\ref{omegavmtdc}) is the fastest recursion, (\ref{uvmtdc}) is the second fast recursion and (\ref{thetavmtdc}) is the slower recursion. % Along the fastest time scale, iterations of (\ref{thetavmtdc}), (\ref{uvmtdc}) and (\ref{omegavmtdc}) % are associated to ODEs system as follows: % \begin{equation} % \dot{\bm{\theta}}(t) = 0, % \label{thetavmtdcFastest} % \end{equation} % \begin{equation} % \dot{\bm{u}}(t) = 0, % \label{uvmtdcFastest} % \end{equation} % \begin{equation} % \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)]-\omega(t). % \label{omegavmtdcFastest} % \end{equation} % Based on the ODE (\ref{thetavmtdcFastest}) and (\ref{uvmtdcFastest}), both $\theta(t)\equiv \theta$ % and $u(t)\equiv u$ when viewed from the fastest timescale. % By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}) and $||u_k-u||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $u$ that depends on the initial condition $u_0$ of recursion % (\ref{uvmtdc}). 
% Thus, the ODE pair (\ref{thetavmtdcFastest})-(\ref{omegavmtdcFastest})
% can be written as
% \begin{equation}
% \dot{\omega}(t)=\mathbb{E}[\delta_t|\bm{u},\bm{\theta}]-\omega(t).
% \label{omegavmtdcFastestFinal}
% \end{equation}
% Consider the function $h(\omega)=\mathbb{E}[\delta|\theta,u]-\omega$,
% i.e., the driving vector field of the ODE (\ref{omegavmtdcFastestFinal}).
% It is easy to find that the function $h$ is Lipschitz with coefficient
% $-1$.
% Let $h_{\infty}(\cdot)$ be the function defined by
% $h_{\infty}(\omega)=\lim_{r\rightarrow \infty}\frac{h(r\omega)}{r}$.
% Then $h_{\infty}(\omega)= -\omega$, is well-defined.
% For (\ref{omegavmtdcFastestFinal}), $\omega^*=\mathbb{E}[\delta|\bm{\theta},\bm{u}]$
% is the unique globally asymptotically stable equilibrium.
% For the ODE
% \begin{equation}
% \dot{\omega}(t) = h_{\infty}(\omega(t))= -\omega(t),
% \label{omegavmtdcInfty}
% \end{equation}
% apply $\vec{V}(\omega)=(-\omega)^{\top}(-\omega)/2$ as its
% associated strict Liapunov function. Then,
% the origin of (\ref{omegavmtdcInfty}) is a globally asymptotically stable
% equilibrium.
% Consider now the recursion (\ref{omegavmtdc}).
% Let
% $M_{k+1}=(\delta_k-\omega_k)
% -\mathbb{E}[(\delta_k-\omega_k)|\mathcal{F}(k)]$,
% where $\mathcal{F}(k)=\sigma(\omega_l,\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$,
% \begin{equation*}
% \mathbb{E}[||M_{k+1}||^2|\mathcal{F}(k)]\leq
% c_1(1+||\omega_k||^2+||\bm{u}_k||^2+||\bm{\theta}_k||^2).
% \end{equation*}
% Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified.
% Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our
% conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus,
% by Theorem 2.2 of \cite{borkar2000ode} we obtain that
% $||\omega_k-\omega^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$.
% Consider now the second time scale recursion (\ref{uvmtdc}).
% Based on the above analysis, (\ref{uvmtdc}) can be rewritten as % % \begin{equation*} % % \bm{u}_{k+1}\leftarrow u_{k}+\zeta_{k}[\delta_{k}-\mathbb{E}[\delta_k|\bm{u}_k,\bm{\theta}_k] - \bm{\phi}^{\top} (s_k) \bm{u}_k]\bm{\phi}(s_k). % % \end{equation*} % \begin{equation} % \dot{\bm{\theta}}(t) = 0, % \label{thetavmtdcFaster} % \end{equation} % \begin{equation} % \dot{u}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{u}(t),\bm{\theta}(t)])\bm{\phi}_t|\bm{\theta}(t)] - \textbf{C}\bm{u}(t). % \label{uvmtdcFaster} % \end{equation} % The ODE (\ref{thetavmtdcFaster}) suggests that $\theta(t)\equiv \theta$ (i.e., a time invariant parameter) % when viewed from the second fast timescale. % By the Hirsch lemma \cite{hirsch1989convergent}, it follows that % $||\theta_k-\theta||\rightarrow 0$ a.s. as $k\rightarrow \infty$ for some % $\theta$ that depends on the initial condition $\theta_0$ of recursion % (\ref{thetavmtdc}). % Consider now the recursion (\ref{uvmtdc}). % Let % $N_{k+1}=((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k) -\mathbb{E}[((\delta_k-\mathbb{E}[\delta_k]) - \bm{\phi}_k \bm{\phi}^{\top}_k \bm{u}_k)|\mathcal{I} (k)]$, % where $\mathcal{I}(k)=\sigma(\bm{u}_l,\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, $\forall k\geq0$, % \begin{equation*} % \mathbb{E}[||N_{k+1}||^2|\mathcal{I}(k)]\leq % c_2(1+||\bm{u}_k||^2+||\bm{\theta}_k||^2). % \end{equation*} % Because $\bm{\theta}(t)\equiv \bm{\theta}$ from (\ref{thetavmtdcFaster}), the ODE pair (\ref{thetavmtdcFaster})-(\ref{uvmtdcFaster}) % can be written as % \begin{equation} % \dot{\bm{u}}(t) = \mathbb{E}[(\delta_t-\mathbb{E}[\delta_t|\bm{\theta}])\bm{\phi}_t|\bm{\theta}] - \textbf{C}\bm{u}(t). % \label{uvmtdcFasterFinal} % \end{equation} % Now consider the function $h(\bm{u})=\mathbb{E}[\delta_t-\mathbb{E}[\delta_t|\bm{\theta}]|\bm{\theta}] -\textbf{C}\bm{u}$, i.e., the % driving vector field of the ODE (\ref{uvmtdcFasterFinal}). 
For (\ref{uvmtdcFasterFinal}), % $\bm{u}^* = \textbf{C}^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta|\bm{\theta}])\bm{\phi}|\bm{\theta}]$ is the unique globally asymptotically % stable equilibrium. Let $h_{\infty}(\bm{u})=-\textbf{C}\bm{u}$. % For the ODE % \begin{equation} % \dot{\bm{u}}(t) = h_{\infty}(\bm{u}(t))= -\textbf{C}\bm{u}(t), % \label{uvmtdcInfty} % \end{equation} % the origin of (\ref{uvmtdcInfty}) is a globally asymptotically stable % equilibrium because $\textbf{C}$ is a positive definite matrix (because it is nonnegative definite and nonsingular). % Now Assumptions (A1) and (A2) of \cite{borkar2000ode} are verified. % Furthermore, Assumptions (TS) of \cite{borkar2000ode} is satisfied by our % conditions on the step-size sequences $\alpha_k$,$\zeta_k$, $\beta_k$. Thus, % by Theorem 2.2 of \cite{borkar2000ode} we obtain that % $||\bm{u}_k-\bm{u}^*||\rightarrow 0$ almost surely as $k\rightarrow \infty$. % Consider now the slower timescale recursion (\ref{thetavmtdc}). In the light of the above, % (\ref{thetavmtdc}) can be rewritten as % % \begin{equation} % % \bm{\theta}_{k+1} \leftarrow \bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ % % - \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). % % \end{equation} % \begin{equation} % \begin{array}{ccl} % \bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_{k} + \alpha_k (\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k]) \bm{\phi}_k\\ % & &- \alpha_k \gamma\bm{\phi}'_{k}(\bm{\phi}^{\top}_k \textbf{C}^{-1}\mathbb{E}[(\delta_k -\mathbb{E}[\delta_k|\bm{\theta}_k])\bm{\phi}|\bm{\theta}_k]). % \end{array} % \end{equation} % Let $\mathcal{G}(k)=\sigma(\bm{\theta}_l,l\leq k;\bm{\phi}_s,\bm{\phi}_s',r_s,s0$, for all $k$, $ \sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty, $ $ \sum_{k=0}^{\infty}\alpha_k^2<\infty, $ $ \sum_{k=0}^{\infty}\beta_k^2<\infty, $ and $ \alpha_k = o(\beta_k). 
$
Assume that $(\bm{\phi}_k,r_k,\bm{\phi}_k')$ is an i.i.d. sequence with uniformly bounded second moments, where $\bm{\phi}_k$ and $\bm{\phi}'_{k}$ are sampled from the same Markov chain.
Let $\textbf{A}_{\text{VMETD}} ={\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top} ){\bm{\Phi}}$, $b_{\text{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}-d_{\mu} f^{\top})r_{\pi}$.
Assume that the matrix $\textbf{A}_{\text{VMETD}}$ is non-singular. Then the parameter vector $\theta_k$ converges with probability one to $\textbf{A}_{\text{VMETD}}^{-1}b_{\text{VMETD}}$.
\end{theorem}
\begin{proof}
The proof of VMETD's convergence is also based on Borkar's Theorem for general stochastic approximation recursions with two time scales \cite{borkar1997stochastic}.
A sketch proof is given as follows.
In the fast time scale, the parameter $\omega$ converges to $\mathbb{E}_{\mu}[F\rho\delta|\theta_k]$.
Recursion (\ref{thetavmetd}) is considered the slower timescale.
If the key matrix $\textbf{A}_{\text{VMETD}}$ is positive definite, then $\theta$ converges.
\begin{equation}
\label{rowsum}
\begin{split}
&(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )e\\
&=\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})e-{d}_{\mu} {d}_{\mu}^{\top} e\\
% &=\textbf{F}(\textbfe-\gamma \textbf{P}_{\pi} \textbfe)-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbfe\\
&=(1-\gamma)\textbf{F}e-{d}_{\mu} {d}_{\mu}^{\top} e\\
% &=(1-\gamma)\textbf{f}-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} \textbfe\\
&=(1-\gamma){f}-{d}_{\mu} \\
&=(1-\gamma)(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}{d}_{\mu}-{d}_{\mu} \\
&=(1-\gamma)[(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}-\textbf{I}]{d}_{\mu} \\
&=(1-\gamma)[\sum_{t=0}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}-\textbf{I}]{d}_{\mu} \\
&=(1-\gamma)[\sum_{t=1}^{\infty}(\gamma\textbf{P}_{\pi}^{\top})^{t}]{d}_{\mu} > 0, \\
\end{split}
\end{equation}
\begin{equation}
\label{columnsum}
\begin{split}
&e^{\top}(\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} )\\
&=e^{\top}\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
&={d}_{\mu}^{\top}-e^{\top}{d}_{\mu} {d}_{\mu}^{\top} \\
&={d}_{\mu}^{\top}- {d}_{\mu}^{\top} \\
&=0,
\end{split}
\end{equation}
where $e$ is the all-ones vector.
The matrix $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-d_{\mu} d_{\mu}^{\top}$ has positive diagonal entries and negative off-diagonal entries, and (\ref{rowsum}) and (\ref{columnsum}) show that each of its row sums plus the corresponding column sum is positive.
Therefore, $\textbf{A}_{\text{VMETD}}$ is positive definite.
\end{proof}
\subsection{Optimal Policy Invariance}
This section proves the optimal policy invariance of VMTD, VMTDC and VMETD in control experiments, laying the groundwork for subsequent experiments.
As shown in Table \ref{example_bias}, although there is a bias between the true value and the predicted value, action $a_3$ is still chosen under the greedy policy.
On the contrary, supervised learning is usually used to predict temperature, humidity, morbidity, etc.
If the bias is too large, the consequences could be serious.
\begin{table}[ht]
\caption{Comparison of action selection with and without constant bias in $Q$ values.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcc}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1& 5 \\
$Q(s, a_1)$ & 2& 6 \\
$Q(s, a_2)$ & 3& 7 \\
$Q(s, a_3)$ & 4& 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$& $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
In addition, reward shaping can significantly speed up the learning by adding a shaping reward $F(s,s')$ to the original reward $r$, where $F(s,s')$ is the general form of any state-based shaping reward.
Static potential-based reward shaping (Static PBRS) maintains the policy invariance if the shaping reward follows from $F(s,s')=\gamma f(s')-f(s)$ \cite{ng1999policy}.
This means that we can make changes to the TD error $\delta = r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi} $ while still ensuring the invariance of the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi} - \omega,
\end{equation*}
where $\omega$ is a constant, acting as a static PBRS.
This also means that algorithms with the optimization goal of minimizing errors, after introducing reward shaping, may result in larger or smaller bias.
Fortunately, as discussed above, bias is acceptable in reinforcement learning.
However, the problem is that selecting an appropriate $\omega$ requires expert knowledge.
This forces us to learn $\omega$ dynamically, i.e., $\omega=\omega_t$.
Dynamic PBRS can also maintain the policy invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$, where $t$ is the time-step the agent reaches in state $s$ \cite{devlin2012dynamic}.
However, this result requires the convergence guarantee of the dynamic potential function $f(s,t)$.
If $f(s,t)$ does not converge as the time-step $t\rightarrow\infty$, the Q-values of dynamic PBRS are not guaranteed to converge.
Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$.
Thus, $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$ is a dynamic PBRS.
If $\omega$ eventually converges, the dynamic potential function $f(s,t)$ will also converge.
Bias is the expected difference between the predicted value and the true value.
Therefore, under the premise of bootstrapping, it is natural to let $\omega \doteq\mathbb{E}[\delta]$ or $\omega \doteq \mathbb{E}[F \rho\delta]$.