\section{Variance Minimization Algorithms}
This section introduces two new objective functions and three new algorithms: one on-policy algorithm and two off-policy algorithms. For each of the three algorithms, we compute the minimum eigenvalue of the key matrix $\textbf{A}$ in a 2-state environment, under both on-policy and off-policy sampling, and use these eigenvalues to compare the convergence rates of the three algorithms.
\subsection{Variance Minimization TD Learning: VMTD}
For on-policy learning, a novel objective function, the Variance of Bellman Error (VBE), is proposed as follows:
\begin{align}
\arg \min_{\theta}\text{VBE}(\theta)
&= \arg \min_{\theta}\mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\mathbb{E}[\mathbb{E}[\delta_t|S_t]])^2] \\
&= \arg \min_{\theta,\omega} \mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\omega)^2],\notag
\end{align}
where $\delta_t$ is the TD error:
\begin{equation}
\delta_t = r_{t+1}+\gamma \theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t.
\label{delta}
\end{equation}
Clearly, the goal is no longer to minimize the Bellman error itself.

First, the parameter $\omega$ is updated directly by stochastic gradient descent:
\begin{equation}
\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t).
\label{omega}
\end{equation}
Then, based on stochastic semi-gradient descent, the parameter $\theta$ is updated as follows:
\begin{equation}
\theta_{t+1}\leftarrow \theta_{t}+\alpha_t(\delta_t-\omega_t)\phi_t.
\label{theta}
\end{equation}
The semi-gradient of the VBE with respect to $\theta$ is
\begin{equation*}
\begin{array}{ccl}
&&-\frac{1}{2}\nabla \text{VBE}({\theta}) \\
&=& \mathbb{E}[(\mathbb{E}[\delta_t|S_t]-\mathbb{E}[\mathbb{E}[\delta_t|S_t]])(\phi_t -\mathbb{E}[\phi_t])] \\
&=& \mathbb{E}[\delta_t \phi_t] -\mathbb{E}[\delta_t] \mathbb{E}[\phi_t].
\end{array}
\end{equation*}
The key matrix $\textbf{A}_{\text{VMTD}}$ and vector $b_{\text{VMTD}}$ of on-policy VMTD are
\begin{equation*}
\begin{array}{ccl}
&&\textbf{A}_{\text{VMTD}} \\
&=& \mathbb{E}[(\phi - \gamma \phi')\phi^{\top}] - \mathbb{E}[\phi - \gamma \phi']\mathbb{E}[\phi^{\top}]\\
&=&\sum_{s}d_{\pi}(s)\phi(s)\Big(\phi(s)
-\gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\phi(s') \Big)^{\top} \\
&& -\sum_{s}d_{\pi}(s)\phi(s)
\cdot \sum_{s}d_{\pi}(s)\Big(\phi(s)
-\gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\phi(s') \Big)^{\top}\\
&=& \bm{\Phi}^{\top}\textbf{D}_{\pi}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}
-\bm{\Phi}^{\top}d_{\pi}d_{\pi}^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}\\
&=& \bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi},
\end{array}
\end{equation*}
\begin{equation*}
\begin{array}{ccl}
&&b_{\text{VMTD}}\\
&=& \mathbb{E}[( r- \mathbb{E}[r])\phi] \\
&=& \mathbb{E}[r\phi] - \mathbb{E}[r]\mathbb{E}[\phi]\\
&=& \bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})r_\pi.
\end{array}
\end{equation*}
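To make the updates concrete, the following is a minimal sketch of one on-policy VMTD step implementing (\ref{delta}), (\ref{omega}) and (\ref{theta}) with linear function approximation. It assumes NumPy, feature vectors \texttt{phi} and \texttt{phi\_next} for the current and successor states, and constant step sizes; it is an illustrative sketch, not a reference implementation.
\begin{verbatim}
import numpy as np

def vmtd_update(theta, omega, phi, r, phi_next, gamma, alpha, beta):
    """One on-policy VMTD step for a transition (phi, r, phi_next)."""
    # delta_t = r_{t+1} + gamma * theta^T phi_{t+1} - theta^T phi_t
    delta = r + gamma * theta @ phi_next - theta @ phi
    # omega_{t+1} <- omega_t + beta * (delta_t - omega_t)
    omega_new = omega + beta * (delta - omega)
    # theta_{t+1} <- theta_t + alpha * (delta_t - omega_t) * phi_t
    theta_new = theta + alpha * (delta - omega) * phi
    return theta_new, omega_new

# Illustrative usage with random features (shapes and values are assumptions).
rng = np.random.default_rng(0)
theta, omega = np.zeros(4), 0.0
for _ in range(100):
    phi, phi_next = rng.standard_normal(4), rng.standard_normal(4)
    r = rng.standard_normal()
    theta, omega = vmtd_update(theta, omega, phi, r, phi_next,
                               gamma=0.9, alpha=0.05, beta=0.05)
\end{verbatim}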
It can easily be shown that the key matrix $\textbf{A}_{\text{VMTD}}$ and vector $b_{\text{VMTD}}$ of off-policy VMTD are, respectively,
\begin{equation*}
\textbf{A}_{\text{VMTD}} 
= \bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi},
\end{equation*}
\begin{equation*}
b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})r_\pi.
\end{equation*}
In the on-policy 2-state environment, the minimum eigenvalue of the key matrix for VMTD is greater than that of on-policy TDC and smaller than that of on-policy TD(0), indicating that VMTD converges faster than TDC but more slowly than TD(0) in this environment. In the off-policy 2-state environment, the minimum eigenvalue of the key matrix for VMTD is greater than $0$, suggesting that VMTD converges stably in the off-policy setting.
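As a numerical illustration of the expressions above, the sketch below assembles $\textbf{A}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$ from its ingredients and reports the smallest real part of its eigenvalues. The feature matrix, transition matrix and state distribution are hypothetical placeholders rather than the 2-state counterexample used in our analysis; the on-policy case is obtained by substituting $d_{\pi}$ and $\textbf{D}_{\pi}$.
\begin{verbatim}
import numpy as np

def key_matrix_vmtd(Phi, d, P_pi, gamma):
    """A_VMTD = Phi^T (D - d d^T) (I - gamma P_pi) Phi,
    where d is the sampling distribution and D = diag(d)."""
    D = np.diag(d)
    I = np.eye(P_pi.shape[0])
    return Phi.T @ (D - np.outer(d, d)) @ (I - gamma * P_pi) @ Phi

# Hypothetical 2-state quantities (placeholder values only).
Phi   = np.array([[1.0], [2.0]])        # one feature per state
P_pi  = np.array([[0.5, 0.5],
                  [0.5, 0.5]])          # target-policy transitions
d_mu  = np.array([0.5, 0.5])            # behavior state distribution
gamma = 0.9

A = key_matrix_vmtd(Phi, d_mu, P_pi, gamma)
print("min eigenvalue:", np.linalg.eigvals(A).real.min())
\end{verbatim}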
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we propose a new objective function, the Variance of Projected Bellman Error (VPBE), and the corresponding algorithm is called VMTDC:
\begin{align}
&\text{VPBE}(\bm{\theta}) \notag\\
&= \mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\mathbb{E}[\delta ])\bm{\phi}] \\
&= \mathbb{E}[(\delta-\omega) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta -\omega)\bm{\phi}],
\end{align}
where $\omega$ is used to approximate $\mathbb{E}[\delta]$, i.e., $\omega \doteq\mathbb{E}[\delta]$.
The gradient of the VPBE with respect to $\theta$ is
\begin{equation*}
\begin{array}{ccl}
-\frac{1}{2}\nabla \text{VPBE}({\theta}) 
&=& -\mathbb{E}\Big[\Big( (\gamma {\phi}' - {\phi}) - \mathbb{E}[ (\gamma {\phi}' - {\phi})]\Big){\phi}^{\top} \Big] \\
& & \mathbb{E}[{\phi} {\phi}^{\top}]^{-1} \mathbb{E}[( \delta -\mathbb{E}[ \delta]){\phi}]\\
&=& \mathbb{E}\Big[\Big( ({\phi} - \gamma {\phi}')- \mathbb{E}[ ({\phi} - \gamma {\phi}')]\Big){\phi}^{\top} \Big] \\
& & \mathbb{E}[{\phi} {\phi}^{\top}]^{-1}\\
& & \mathbb{E}\Big[\Big( r + \gamma {{\phi}'}^{\top} {\theta} -{\phi}^{\top} {\theta}\\
& & \hspace{2em} -\mathbb{E}[ r + \gamma {{\phi}'}^{\top} {\theta} -{\phi}^{\top} {\theta}]\Big){\phi} \Big].
\end{array}
\end{equation*}
It can easily be shown that the key matrix $\textbf{A}_{\text{VMTDC}}$ and vector $b_{\text{VMTDC}}$ of VMTDC are, respectively,
\begin{equation*}
\textbf{A}_{\text{VMTDC}} 
= \textbf{A}_{\text{VMTD}}^{\top} \textbf{C}^{-1}\textbf{A}_{\text{VMTD}},
\end{equation*}
\begin{equation*}
b_{\text{VMTDC}}=\textbf{A}_{\text{VMTD}}^{\top} \textbf{C}^{-1}b_{\text{VMTD}},
\end{equation*}
where $\textbf{C}=\mathbb{E}[\bm{\phi}\bm{\phi}^{\top}]$; for on-policy,
$\textbf{A}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$ 
and 
$b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\pi}-d_{\pi}d_{\pi}^{\top})r_\pi$,
and, for off-policy,
$\textbf{A}_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})(\textbf{I}-\gamma \textbf{P}_{\pi})\bm{\Phi}$ 
and 
$b_{\text{VMTD}}=\bm{\Phi}^{\top}(\textbf{D}_{\mu}-d_{\mu}d_{\mu}^{\top})r_\pi$.
In computing the gradient of the VPBE expressed in terms of $\omega$, $\omega$ is treated as a constant. The derivation of the VMTDC algorithm is therefore the same as that of the TDC algorithm; the only difference is that the original $\delta$ is replaced by $\delta-\omega$. Hence, the VMTDC updates are as follows:
\begin{equation}
\bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\delta_{k}- \omega_k) \bm{\phi}_k
- \gamma\bm{\phi}_{k+1}(\bm{\phi}^{\top}_k \bm{u}_{k})],
\label{thetavmtdc}
\end{equation}
\begin{equation}
\bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[\delta_{k}-\omega_k - \bm{\phi}^{\top}_k \bm{u}_{k}]\bm{\phi}_k,
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
The VMTDC algorithm (\ref{thetavmtdc}) is derived to work with a given set of sub-samples, in the form of triples $(S_k, R_k, S'_k)$, that match transitions from both the behavior and target policies.
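For reference, the following is a minimal sketch of one VMTDC step implementing (\ref{thetavmtdc})--(\ref{omegavmtdc}); the feature vectors and step sizes are assumed inputs. In a fully off-policy, importance-weighted variant one would typically also scale $\delta_k$ (and the correction term) by the ratio $\rho_k$; that variant is omitted here.
\begin{verbatim}
import numpy as np

def vmtdc_update(theta, u, omega, phi, r, phi_next,
                 gamma, alpha, zeta, beta):
    """One VMTDC step for a transition (phi, r, phi_next)."""
    delta = r + gamma * theta @ phi_next - theta @ phi
    # theta update: (delta - omega) * phi - gamma * phi' * (phi^T u)
    theta_new = theta + alpha * ((delta - omega) * phi
                                 - gamma * phi_next * (phi @ u))
    # u update: [(delta - omega) - phi^T u] * phi
    u_new = u + zeta * ((delta - omega) - phi @ u) * phi
    # omega update: omega tracks E[delta]
    omega_new = omega + beta * (delta - omega)
    return theta_new, u_new, omega_new
\end{verbatim}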
In the on-policy 2-state environment, the minimum eigenvalue of the key matrix for VMTDC is smaller than those of TD(0), TDC and VMTD, indicating that VMTDC converges more slowly than these algorithms in the on-policy setting. In the off-policy 2-state environment, the minimum eigenvalue of the key matrix for VMTDC is greater than that of TDC, suggesting that VMTDC converges faster than TDC in the off-policy setting.
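The relation $\textbf{A}_{\text{VMTDC}}=\textbf{A}_{\text{VMTD}}^{\top}\textbf{C}^{-1}\textbf{A}_{\text{VMTD}}$ can also be checked numerically. The sketch below uses placeholder matrices (not the counterexample values) and compares the smallest real parts of the eigenvalues of $\textbf{A}_{\text{VMTD}}$ and $\textbf{A}_{\text{VMTDC}}$; since $\textbf{C}$ is positive definite, $\textbf{A}_{\text{VMTDC}}$ is positive semi-definite by construction.
\begin{verbatim}
import numpy as np

def key_matrix_vmtdc(A_vmtd, C):
    """A_VMTDC = A_VMTD^T C^{-1} A_VMTD."""
    return A_vmtd.T @ np.linalg.inv(C) @ A_vmtd

# Placeholder 2x2 quantities (illustrative only).
A_vmtd = np.array([[ 0.15, -0.05],
                   [-0.02,  0.10]])
C = np.array([[1.0, 0.2],
              [0.2, 1.0]])              # C = E[phi phi^T]

A_vmtdc = key_matrix_vmtdc(A_vmtd, C)
print(np.linalg.eigvals(A_vmtd).real.min(),
      np.linalg.eigvals(A_vmtdc).real.min())
\end{verbatim}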
\subsection{Variance Minimization ETD Learning: VMETD}
Starting from the off-policy TD algorithm, a scalar, $F$, is introduced to obtain the ETD algorithm, which ensures convergence under off-policy conditions. This paper further introduces a scalar, $\omega$, on top of the ETD algorithm to obtain VMETD, defined by the following updates:
\begin{equation}
\label{fvmetd}
F_t \leftarrow \gamma \rho_{t-1}F_{t-1}+1,
\end{equation}
\begin{equation}
\label{thetavmetd}
{\theta}_{t+1}\leftarrow {\theta}_t+\alpha_t (F_t \rho_t\delta_t - \omega_{t}){\phi}_t,
\end{equation}
\begin{equation}
\label{omegavmetd}
\omega_{t+1} \leftarrow \omega_t+\beta_t(F_t \rho_t \delta_t - \omega_t),
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[F \rho\delta]$, i.e., $\omega \doteq \mathbb{E}[F \rho\delta]$.
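A minimal sketch of one VMETD step implementing (\ref{fvmetd})--(\ref{omegavmetd}) is given below; the importance ratios, feature vectors and step sizes are assumed inputs, and the emphatic trace $F$ is carried across time steps.
\begin{verbatim}
import numpy as np

def vmetd_update(theta, F_prev, omega, phi, r, phi_next,
                 rho, rho_prev, gamma, alpha, beta):
    """One VMETD step for a transition (phi, r, phi_next)."""
    delta = r + gamma * theta @ phi_next - theta @ phi
    # F_t <- gamma * rho_{t-1} * F_{t-1} + 1
    F = gamma * rho_prev * F_prev + 1.0
    # theta_{t+1} <- theta_t + alpha * (F_t rho_t delta_t - omega_t) * phi_t
    theta_new = theta + alpha * (F * rho * delta - omega) * phi
    # omega_{t+1} <- omega_t + beta * (F_t rho_t delta_t - omega_t)
    omega_new = omega + beta * (F * rho * delta - omega)
    return theta_new, F, omega_new
\end{verbatim}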
Replacing $\omega_t$ in (\ref{thetavmetd}) by the quantity it tracks, $\mathbb{E}_{\mu}[F_t\rho_t\delta_t]$, the update can be rewritten as
\begin{equation*}
\begin{array}{ccl}
{\theta}_{t+1}&\leftarrow& {\theta}_t+\alpha_t (F_t \rho_t\delta_t - \omega_t){\phi}_t\\
&\approx&{\theta}_{t}+\alpha_t(F_t\rho_t\delta_t-\mathbb{E}_{\mu}[F_t\rho_t\delta_t|{\theta}_t]){\phi}_t\\
&=&{\theta}_t+\alpha_t F_t \rho_t (r_{t+1}+\gamma {\theta}_t^{\top}{\phi}_{t+1}-{\theta}_t^{\top}{\phi}_t){\phi}_t\\
& & \hspace{2em} -\alpha_t \mathbb{E}_{\mu}[F_t \rho_t \delta_t]{\phi}_t\\
&=& {\theta}_t+\alpha_t \{\underbrace{(F_t\rho_tr_{t+1}-\mathbb{E}_{\mu}[F_t\rho_t r_{t+1}]){\phi}_t}_{{b}_{\text{VMETD},t}}\\
&&\hspace{-7em}- \underbrace{(F_t\rho_t{\phi}_t({\phi}_t-\gamma{\phi}_{t+1})^{\top}-{\phi}_t\mathbb{E}_{\mu}[F_t\rho_t ({\phi}_t-\gamma{\phi}_{t+1})]^{\top})}_{\textbf{A}_{\text{VMETD},t}}{\theta}_t\}.
\end{array}
\end{equation*}
Therefore,
\begin{equation*}
\begin{array}{ccl}
&&\textbf{A}_{\text{VMETD}}\\
&=&\lim_{t \rightarrow \infty} \mathbb{E}[\textbf{A}_{\text{VMETD},t}]\\
&=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[F_t \rho_t {\phi}_t ({\phi}_t - \gamma {\phi}_{t+1})^{\top}]\\
&&\hspace{1em}- \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\mathbb{E}_{\mu}[F_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})]^{\top}\\
&=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})^{\top}]\\
&&\hspace{1em}-\lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t \rho_t ({\phi}_t - \gamma {\phi}_{t+1})]^{\top}\\
&& \hspace{-2em}=\sum_{s} d_{\mu}(s)\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t = s]\mathbb{E}_{\mu}[\rho_t\phi_t(\phi_t - \gamma \phi_{t+1})^{\top}|S_t= s]\\
&&\hspace{1em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} d_{\mu}(s)\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t = s]\\
&&\hspace{7em}\mathbb{E}_{\mu}[\rho_t(\phi_t - \gamma \phi_{t+1})^{\top}|S_t = s]\\
&=& \sum_{s} f(s)\mathbb{E}_{\pi}[\phi_t(\phi_t- \gamma \phi_{t+1})^{\top}|S_t = s]\\
&&\hspace{1em}-\sum_{s} d_{\mu}(s)\phi(s)\sum_{s} f(s)\mathbb{E}_{\pi}[(\phi_t- \gamma \phi_{t+1})^{\top}|S_t = s]\\
&=&\sum_{s} f(s) \bm{\phi}(s)\Big(\bm{\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}\bm{\phi}(s')\Big)^{\top} \\
&&\hspace{1em} -\sum_{s} d_{\mu}(s) {\phi}(s) \cdot \sum_{s} f(s)\Big({\phi}(s) - \gamma \sum_{s'}[\textbf{P}_{\pi}]_{ss'}{\phi}(s')\Big)^{\top}\\
&=&{\bm{\Phi}}^{\top} \textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} - {\bm{\Phi}}^{\top} {d}_{\mu} {f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi}) \bm{\Phi} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} - {d}_{\mu} {f}^{\top}) (\textbf{I} - \gamma \textbf{P}_{\pi}){\bm{\Phi}} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {f}^{\top} (\textbf{I} - \gamma \textbf{P}_{\pi})){\bm{\Phi}} \\
&=&{\bm{\Phi}}^{\top} (\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi})-{d}_{\mu} {d}_{\mu}^{\top} ){\bm{\Phi}},
\end{array}
\end{equation*}
where $f(s)\doteq d_{\mu}(s)\lim_{t\rightarrow \infty}\mathbb{E}_{\mu}[F_t|S_t = s]$, $\textbf{F}\doteq\mathrm{diag}(f)$, and the last step uses $f^{\top}(\textbf{I}-\gamma \textbf{P}_{\pi})=d_{\mu}^{\top}$. Similarly,
\begin{equation*}
\begin{array}{ccl}
&&{b}_{\text{VMETD}}\\
&=&\lim_{t \rightarrow \infty} \mathbb{E}[{b}_{\text{VMETD},t}]\\
&=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[F_t\rho_tr_{t+1}{\phi}_t]\\
&&\hspace{2em} - \lim_{t\rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_t]\mathbb{E}_{\mu}[F_t\rho_tr_{t+1}]\\
&=& \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[{\phi}_tF_t\rho_tr_{t+1}]\\
&&\hspace{2em} - \lim_{t \rightarrow \infty} \mathbb{E}_{\mu}[ {\phi}_t]\lim_{t \rightarrow \infty}\mathbb{E}_{\mu}[F_t\rho_tr_{t+1}]\\
&=&\sum_{s} f(s) {\phi}(s)r_{\pi}(s) - \sum_{s} d_{\mu}(s) {\phi}(s) \cdot \sum_{s} f(s)r_{\pi}(s) \\
&=&\bm{\Phi}^{\top}(\textbf{F}-{d}_{\mu} {f}^{\top}){r}_{\pi}.
\end{array}
\end{equation*}
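To check this expression numerically, the sketch below constructs $\textbf{A}_{\text{VMETD}}=\bm{\Phi}^{\top}(\textbf{F}(\textbf{I}-\gamma\textbf{P}_{\pi})-d_{\mu}d_{\mu}^{\top})\bm{\Phi}$, using, consistently with the identity $f^{\top}(\textbf{I}-\gamma\textbf{P}_{\pi})=d_{\mu}^{\top}$ above, the emphatic weighting $f=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}d_{\mu}$ and $\textbf{F}=\mathrm{diag}(f)$. The 2-state quantities are placeholders, not the counterexample values.
\begin{verbatim}
import numpy as np

def key_matrix_vmetd(Phi, d_mu, P_pi, gamma):
    """A_VMETD = Phi^T (F (I - gamma P_pi) - d_mu d_mu^T) Phi,
    with f = (I - gamma P_pi^T)^{-1} d_mu and F = diag(f)."""
    I = np.eye(P_pi.shape[0])
    f = np.linalg.solve(I - gamma * P_pi.T, d_mu)
    F = np.diag(f)
    return Phi.T @ (F @ (I - gamma * P_pi) - np.outer(d_mu, d_mu)) @ Phi

# Placeholder 2-state quantities (illustrative only).
Phi   = np.array([[1.0], [2.0]])
P_pi  = np.array([[0.0, 1.0],
                  [0.0, 1.0]])      # target policy always moves to state 2
d_mu  = np.array([0.5, 0.5])        # behavior state distribution
print(np.linalg.eigvals(key_matrix_vmetd(Phi, d_mu, P_pi, 0.9)).real.min())
\end{verbatim}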
In the off-policy 2-state environment, the minimum eigenvalue of the key matrix for VMETD is greater than those of TD(0), TDC and VMTD and smaller than that of ETD, indicating that VMETD converges faster than TD(0), TDC and VMTD but more slowly than ETD in this off-policy setting. However, subsequent experiments show that the VMETD algorithm converges more smoothly and performs best in the control experiments.