\section{Variance Minimization Algorithms}
To obtain an algorithm whose matrix $\textbf{A}$ has a larger minimum eigenvalue, new objective functions are needed. The objective functions reviewed in the Introduction are all forms of error, which raises a natural question: is minimizing an error the only option for value-based reinforcement learning? Motivated by this question, we propose objective functions that minimize variance rather than error. We minimize the Variance of Projected Bellman Error (VPBE) and derive the VMTDC algorithm; the same idea is then applied to ETD, yielding the VMETD algorithm.
\begin{algorithm}[t]
\caption{VMTDC algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 2}
\begin{algorithmic}
\STATE {\bfseries Input:} $\bm{\theta}_{0}$, $\bm{u}_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For each episode, initialize $\bm{\theta}_{0}$ arbitrarily, $\bm{u}_{0}$ and $\omega_{0}$ to $0$, and $\gamma \in (0,1]$; the step sizes $\alpha_t$, $\zeta_t$ and $\beta_t$ are constant.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t [ (\rho_t\delta_t-\omega_t)\bm{\phi}_t - \gamma \rho_t\bm{\phi}_{t+1}(\bm{\phi}^{\top}_{t} \bm{u}_{t})]$
\STATE $\bm{u}_{t+1}\leftarrow \bm{u}_{t}+\zeta_t[(\rho_t\delta_t-\omega_t) - \bm{\phi}^{\top}_{t} \bm{u}_{t}] \bm{\phi}_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (\rho_t\delta_t-\omega_t)$
\STATE $S_t \leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[t]
\caption{VMETD algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 5}
\begin{algorithmic}
\STATE {\bfseries Input:} $\bm{\theta}_{0}$, $F_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For each episode, initialize $\bm{\theta}_{0}$ arbitrarily, $F_{0}$ to $1$ and $\omega_{0}$ to $0$, and $\gamma \in (0,1]$; the step sizes $\alpha_t$ and $\beta_t$ are constant.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ from $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe sample ($S_t$,$R_{t+1}$,$S_{t+1}$) at time step $t$ (with their corresponding state feature vectors)\\
\STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $F_{t}\leftarrow \gamma \rho_t F_{t-1} +1$
\STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t (F_t \rho_t\delta_t-\omega_t)\bm{\phi}_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t (F_t \rho_t\delta_t-\omega_t)$
\STATE $S_t \leftarrow S_{t+1}$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we propose a new objective function, the Variance of Projected Bellman Error (VPBE); the corresponding algorithm is called VMTDC.
\begin{align}
\text{VPBE}(\bm{\theta}) &= \mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1} \label{VPBE-def}\\
&\quad \mathbb{E}[(\delta -\mathbb{E}[\delta])\bm{\phi}] \notag \\
&= (\bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}))^{\top}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\
&\quad \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\
&= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\textbf{D}^{\top}\bm{\Phi}(\bm{\Phi}^{\top}\textbf{D}\bm{\Phi})^{-1} \notag \\
&\quad \bm{\Phi}^{\top}\textbf{D}(\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\
&= (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}})^{\top}\Pi^{\top}\textbf{D}\Pi \notag \\
&\quad (\textbf{W}_{\bm{\theta}} + \textbf{T}\textbf{V}_{\bm{\theta}} -\textbf{V}_{\bm{\theta}}) \notag \\
&= (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}}))^{\top}\textbf{D} \notag \\
&\quad (\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}-\textbf{W}_{\bm{\theta}})) \notag \\
&= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}} - \textbf{W}_{\bm{\theta}})||^{2}_{\mu} \notag \\
&= ||\Pi(\textbf{V}_{\bm{\theta}} - \textbf{T}\textbf{V}_{\bm{\theta}}) - \Pi\textbf{W}_{\bm{\theta}}||^{2}_{\mu} \notag \\
&= \mathbb{E}[(\delta-\omega) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\omega)\bm{\phi}] \label{VPBE-omega}
\end{align}
where $\textbf{W}_{\bm{\theta}}$ is the constant vector whose entries all equal $-\mathbb{E}[\delta]$, the negated mean Bellman error under $\textbf{D}$, and $\omega$ is used to approximate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$.
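To make (\ref{VPBE-def}) concrete, the following is a minimal NumPy sketch, offered only as an illustration, that estimates the VPBE from a batch of sampled transitions; the array names \texttt{Phi}, \texttt{R}, and \texttt{Phi\_next} are hypothetical stand-ins for the sampled $\bm{\phi}$, $r$, and $\bm{\phi}'$.
\begin{verbatim}
import numpy as np

def vpbe(theta, Phi, R, Phi_next, gamma=0.9):
    """Sample estimate of VPBE(theta) = g^T C^{-1} g, with
    g = E[(delta - E[delta]) phi] and C = E[phi phi^T]."""
    n = len(R)
    delta = R + gamma * Phi_next @ theta - Phi @ theta  # TD errors
    delta_c = delta - delta.mean()     # center: delta - E[delta]
    g = Phi.T @ delta_c / n            # E[(delta - E[delta]) phi]
    C = Phi.T @ Phi / n                # E[phi phi^T]
    return g @ np.linalg.solve(C, g)   # quadratic form g^T C^{-1} g
\end{verbatim}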
The gradient of (\ref{VPBE-def}) with respect to $\bm{\theta}$ is
\begin{equation*}
\begin{array}{ccl}
-\frac{1}{2}\nabla \text{VPBE}(\bm{\theta}) &=& -\mathbb{E}\Big[\Big( (\gamma \bm{\phi}' - \bm{\phi}) - \mathbb{E}[ \gamma \bm{\phi}' - \bm{\phi}]\Big)\bm{\phi}^{\top} \Big] \\
& & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1} \mathbb{E}[( \delta -\mathbb{E}[ \delta])\bm{\phi}]\\
&=& \mathbb{E}\Big[\Big( (\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ \bm{\phi} - \gamma \bm{\phi}']\Big)\bm{\phi}^{\top} \Big] \\
& & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
& & \mathbb{E}\Big[\Big( r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta}\\
& & \hspace{2em} -\mathbb{E}[ r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta}]\Big)\bm{\phi} \Big]\\
&=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}),
\end{array}
\end{equation*}
where
\begin{equation*}
\begin{array}{ccl}
\textbf{A}^{\top} &=& \mathbb{E}\Big[\Big( (\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ \bm{\phi} - \gamma \bm{\phi}']\Big)\bm{\phi}^{\top} \Big] \\
&=& \mathbb{E}[(\bm{\phi} - \gamma \bm{\phi}')\bm{\phi}^{\top}] - \mathbb{E}[\bm{\phi} - \gamma \bm{\phi}']\mathbb{E}[\bm{\phi}^{\top}],\\
\text{i.e., } \textbf{A} &=& \mathrm{Cov}(\bm{\phi},\bm{\phi}-\gamma\bm{\phi}'),
\end{array}
\end{equation*}
\begin{equation*}
\textbf{C} = \mathbb{E}[\bm{\phi}\bm{\phi}^{\top}],
\end{equation*}
\begin{equation*}
\begin{array}{ccl}
\textbf{b} &=& \mathbb{E}[( r- \mathbb{E}[r])\bm{\phi}] \\
&=& \mathbb{E}[r\bm{\phi}] - \mathbb{E}[r]\mathbb{E}[\bm{\phi}]\\
&=& \mathrm{Cov}(r,\bm{\phi}),
\end{array}
\end{equation*}
where $\mathrm{Cov}(\cdot,\cdot)$ denotes the covariance operator. In computing the gradient of (\ref{VPBE-omega}) with respect to $\bm{\theta}$, $\omega$ is treated as a constant. The derivation of VMTDC therefore parallels that of TDC; the only difference is that $\delta$ is replaced by $\delta-\omega$. The resulting VMTDC updates are
\begin{equation}
\bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})],
\label{thetavmtdc}
\end{equation}
\begin{equation}
\bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k,
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
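As an illustration (not part of the derivation), a minimal NumPy sketch of one step of updates (\ref{thetavmtdc})--(\ref{omegavmtdc}) follows; the variable names and step-size values are hypothetical.
\begin{verbatim}
import numpy as np

def vmtdc_step(theta, u, omega, phi, r, phi_next,
               gamma=0.99, alpha=0.01, zeta=0.02, beta=0.05):
    """One VMTDC update; omega tracks E[delta]."""
    delta = r + gamma * phi_next @ theta - phi @ theta  # TD error
    theta = theta + alpha * ((delta - omega) * phi
                             - gamma * phi_next * (phi @ u))
    u = u + zeta * (delta - omega - phi @ u) * phi      # auxiliary vector
    omega = omega + beta * (delta - omega)              # scalar tracker
    return theta, u, omega
\end{verbatim}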
The VMTDC algorithm (\ref{thetavmtdc})--(\ref{omegavmtdc}) is derived for a given set of sub-samples, i.e., triples $(S_k, R_k, S'_k)$ whose transitions are consistent with both the behavior and target policies. What if we wanted to use all the data? The data are generated by the behavior policy $\mu$, while our objective is to learn about the target policy $\pi$, so we must use importance sampling. The VPBE with importance sampling is
\begin{equation}
\label{rho_VPBE}
\begin{array}{ccl}
\text{VPBE}(\bm{\theta})&=&\mathbb{E}[(\rho\delta-\mathbb{E}[\rho\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
& &\mathbb{E}[(\rho\delta -\mathbb{E}[\rho\delta])\bm{\phi}].
\end{array}
\end{equation}
Following the derivation of linear VMTDC, we obtain the importance-weighted linear VMTDC algorithm:
\begin{equation}
\bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\rho_k\delta_{k}- \omega_k) \bm{\phi}_k - \gamma\rho_k\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})],
\end{equation}
\begin{equation}
\bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[(\rho_k\delta_{k}-\omega_k) - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k,
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\rho_k\delta_k- \omega_k).
\end{equation}
The gradient of (\ref{rho_VPBE}) with respect to $\bm{\theta}$ is
\begin{equation*}
\begin{array}{ccl}
-\frac{1}{2}\nabla \text{VPBE}(\bm{\theta}) &=& \mathbb{E}\Big[\Big( \rho(\bm{\phi} - \gamma \bm{\phi}')- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\Big)\bm{\phi}^{\top} \Big] \\
& & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
& & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\
& & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\
&=& \Big(\mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')\bm{\phi}^{\top}]- \mathbb{E}[ \rho(\bm{\phi} - \gamma \bm{\phi}')]\mathbb{E}[\bm{\phi}^{\top}]\Big) \\
& & \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\\
& & \mathbb{E}\Big[\Big( \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})\\
& & \hspace{2em} -\mathbb{E}[ \rho(r + \gamma {\bm{\phi}'}^{\top} \bm{\theta} -\bm{\phi}^{\top} \bm{\theta})]\Big)\bm{\phi} \Big]\\
&=& \textbf{A}^{\top} \textbf{C}^{-1}(-\textbf{A}\bm{\theta} + \textbf{b}),
\end{array}
\end{equation*}
where $\textbf{A}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$, $\textbf{b}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}- \textbf{d}_{\mu}\textbf{d}_{\mu}^{\top})\textbf{r}_{\pi}$, and $\textbf{r}_{\pi}$ is the vector of expected one-step rewards under $\pi$. In the 2-state counterexample, the minimum eigenvalue of VMTDC's key matrix $\textbf{A}^{\top}\textbf{C}^{-1}\textbf{A}$ is $0.025$, which is positive and exceeds TDC's $0.016$, meaning that VMTDC converges stably and faster than TDC.
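The following NumPy sketch reproduces the minimum eigenvalues reported in the accompanying table. The environment specifics (features $\bm{\phi}(s_1)=1$, $\bm{\phi}(s_2)=2$, $\gamma=0.9$, uniform behavior distribution, target policy always entering $s_2$) are our assumed reading of the 2-state counterexample, not restated in this section.
\begin{verbatim}
import numpy as np

# Assumed specification of the 2-state counterexample (our reading):
# phi(s1)=1, phi(s2)=2, gamma=0.9, uniform behavior distribution,
# target policy always transitions into s2.
Phi = np.array([[1.0], [2.0]])
d_mu = np.array([0.5, 0.5])
P_pi = np.array([[0.0, 1.0], [0.0, 1.0]])
gamma, I = 0.9, np.eye(2)
D = np.diag(d_mu)

A_td = Phi.T @ D @ (I - gamma * P_pi) @ Phi    # off-policy TD
C = Phi.T @ D @ Phi                            # E[phi phi^T]
A_tdc = A_td.T @ np.linalg.solve(C, A_td)      # TDC: A^T C^{-1} A

f = np.linalg.solve(I - gamma * P_pi.T, d_mu)  # emphasis vector
F = np.diag(f)
A_etd = Phi.T @ F @ (I - gamma * P_pi) @ Phi   # ETD

A_cov = Phi.T @ (D - np.outer(d_mu, d_mu)) @ (I - gamma * P_pi) @ Phi
A_vmtdc = A_cov.T @ np.linalg.solve(C, A_cov)  # VMTDC: A^T C^{-1} A

A_vmetd = Phi.T @ (F @ (I - gamma * P_pi)
                   - np.outer(d_mu, d_mu)) @ Phi

for name, A in [("off-policy TD", A_td), ("TDC", A_tdc),
                ("ETD", A_etd), ("VMTDC", A_vmtdc), ("VMETD", A_vmetd)]:
    print(name, np.real(np.linalg.eigvals(A)).min())
# prints -0.2, 0.016, 3.4, 0.025, 1.15 (up to rounding)
\end{verbatim}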
\begin{table*}[t]
\caption{Minimum eigenvalues of the key matrices of various algorithms in the 2-state counterexample.}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lccccc}
\toprule
Algorithm & off-policy TD & TDC & ETD & VMTDC & VMETD \\
\midrule
Minimum eigenvalue & $-0.2$ & $0.016$ & $3.4$ & $0.025$ & $1.15$ \\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table*}
\subsection{Variance Minimization ETD Learning: VMETD}
ETD introduces a scalar follow-on trace $F$ into off-policy TD to ensure convergence under off-policy training. This paper further introduces a scalar $\omega$ into ETD, yielding VMETD, whose updates are
\begin{equation}
\label{thetavmetd}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_{k})\bm{\phi}_k,
\end{equation}
\begin{equation}
\label{omegavmetd}
\omega_{k+1} \leftarrow \omega_k+\beta_k(F_k \rho_k \delta_k - \omega_k),
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[F \rho\delta]$, i.e., $\omega \doteq \mathbb{E}[F \rho\delta]$. Since $\omega_k$ tracks $\mathbb{E}_{\mu}[F_k\rho_k\delta_k \mid \bm{\theta}_k]$, update (\ref{thetavmetd}) behaves asymptotically like
\begin{equation*}
\begin{array}{ccl}
\bm{\theta}_{k+1}&\leftarrow&\bm{\theta}_{k}+\alpha_k(F_k\rho_k\delta_k-\mathbb{E}_{\mu}[F_k\rho_k\delta_k|\bm{\theta}_k])\bm{\phi}_k\\
&=&\bm{\theta}_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k)\bm{\phi}_k\\
& & \hspace{2em} -\alpha_k \mathbb{E}_{\mu}[F_k \rho_k \delta_k|\bm{\theta}_k]\bm{\phi}_k\\
&=& \bm{\theta}_k+\alpha_k \{\underbrace{(F_k\rho_kR_{k+1}-\mathbb{E}_{\mu}[F_k\rho_k R_{k+1}])\bm{\phi}_k}_{\textbf{b}_{\text{VMETD},k}}\\
&&\hspace{-5em}- \underbrace{(F_k\rho_k\bm{\phi}_k(\bm{\phi}_k-\gamma\bm{\phi}_{k+1})^{\top}-\bm{\phi}_k\mathbb{E}_{\mu}[F_k\rho_k (\bm{\phi}_k-\gamma\bm{\phi}_{k+1})]^{\top})}_{\textbf{A}_{\text{VMETD},k}}\bm{\theta}_k\}.
\end{array}
\end{equation*}
Therefore,
\begin{equation*}
\begin{array}{ccl}
\textbf{A}_{\text{VMETD}}&=&\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},k}]\\
&=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k \rho_k \bm{\phi}_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]\\
&&\hspace{-1em}- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\
&=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}]\\
&&\hspace{-2em}-\lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k \rho_k (\bm{\phi}_k - \gamma \bm{\phi}_{k+1})]^{\top}\\
&& \hspace{-9em}=\sum_{s} d_{\mu}(s)\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k = s]\mathbb{E}_{\mu}[\rho_k\bm{\phi}_k(\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}|S_k = s]\\
&&\hspace{-2em}-\sum_{s} d_{\mu}(s)\bm{\phi}(s)\sum_{s} d_{\mu}(s)\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k = s]\\
&&\hspace{2em}\mathbb{E}_{\mu}[\rho_k(\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}|S_k = s]\\
&=& \sum_{s} f(s)\mathbb{E}_{\pi}[\bm{\phi}_k(\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}|S_k = s]\\
&&\hspace{-3em}-\sum_{s} d_{\mu}(s)\bm{\phi}(s)\sum_{s} f(s)\mathbb{E}_{\pi}[(\bm{\phi}_k - \gamma \bm{\phi}_{k+1})^{\top}|S_k = s]\\
&=&\sum_{s} f(s) \bm{\phi}(s)\Big(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s')\Big)^{\top} \\
&&\hspace{-4em} -\sum_{s} d_{\mu}(s) \bm{\phi}(s) \cdot \sum_{s} f(s)\Big(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s')\Big)^{\top}\\
&=&{\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} \textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} - \textbf{d}_{\mu} \textbf{f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-\textbf{d}_{\mu} \textbf{d}_{\mu}^{\top} ){\bm{\Phi}},
\end{array}
\end{equation*}
where $f(s) \doteq d_{\mu}(s)\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k = s]$, $\textbf{f}$ is the vector with components $f(s)$, $\textbf{F} \doteq \mathrm{diag}(\textbf{f})$, and the last step uses $\textbf{f}^{\top}(\textbf{I} - \gamma \textbf{P}_{\pi}) = \textbf{d}_{\mu}^{\top}$, which follows from $\textbf{f} = (\textbf{I} - \gamma \textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}$. Similarly,
\begin{equation*}
\begin{array}{ccl}
\textbf{b}_{\text{VMETD}}&=&\lim_{k \rightarrow \infty} \mathbb{E}[\textbf{b}_{\text{VMETD},k}]\\
&=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[F_k\rho_kR_{k+1}\bm{\phi}_k]\\
&&- \lim_{k\rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_k]\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\
&=& \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[\bm{\phi}_kF_k\rho_kR_{k+1}]\\
&&- \lim_{k \rightarrow \infty} \mathbb{E}_{\mu}[ \bm{\phi}_k]\lim_{k \rightarrow \infty}\mathbb{E}_{\mu}[F_k\rho_kR_{k+1}]\\
&=&\sum_{s} f(s) \bm{\phi}(s)r_{\pi}(s) - \sum_{s} d_{\mu}(s) \bm{\phi}(s) \cdot \sum_{s} f(s)r_{\pi}(s) \\
&=&\bm{\Phi}^{\top}(\textbf{F}-\textbf{d}_{\mu} \textbf{f}^{\top})\textbf{r}_{\pi}.
\end{array}
\end{equation*}
Therefore, in the 2-state counterexample, the minimum eigenvalue of $\textbf{A}_{\text{VMETD}}$ is $1.15$ (a worked computation is given below), so VMETD converges stably, although its smaller eigenvalue suggests slower convergence than ETD ($3.4$). However, the experiments that follow show that VMETD converges more smoothly than ETD and performs better in control experiments.
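For concreteness, the value $1.15$ can be recovered by hand under the same assumed specification of the counterexample as before ($\bm{\phi}(s_1)=1$, $\bm{\phi}(s_2)=2$, $\gamma=0.9$, $\textbf{d}_{\mu}=(0.5,\ 0.5)^{\top}$, target policy always entering $s_2$):
\begin{equation*}
\begin{array}{ccl}
(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi} &=& (-0.8,\ 0.2)^{\top},\\
\textbf{f} &=& (\textbf{I}-\gamma \textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu} = (0.5,\ 9.5)^{\top},\\
\bm{\Phi}^{\top} \textbf{F}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi} &=& 1 \cdot 0.5 \cdot (-0.8) + 2 \cdot 9.5 \cdot 0.2 = 3.4,\\
\bm{\Phi}^{\top}\textbf{d}_{\mu}\textbf{d}_{\mu}^{\top}\bm{\Phi} &=& 1.5 \times 1.5 = 2.25,\\
\textbf{A}_{\text{VMETD}} &=& 3.4 - 2.25 = 1.15,
\end{array}
\end{equation*}
where the first term, $3.4$, is exactly the ETD value reported in the table.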