\section{Variance Minimization Algorithms}
\subsection{Motivation}
As shown in Table \ref{example_bias}, although the predicted values are biased relative to the true values, action $a_3$ is still chosen under the greedy policy, because the bias shifts every action value by the same constant. In contrast, supervised learning is typically used to predict quantities such as temperature, humidity, or morbidity, where a large bias directly harms the prediction and can have serious consequences.
\begin{table}[t]
\caption{A uniform bias in the predicted action values does not change the greedy action.}
\label{example_bias}
\vskip 0.15in
\begin{center}
\begin{small}
\begin{sc}
\begin{tabular}{lcc}
\toprule
action & $Q$ value & $Q$ value with bias \\
\midrule
$Q(s, a_0)$ & 1 & 5 \\
$Q(s, a_1)$ & 2 & 6 \\
$Q(s, a_2)$ & 3 & 7 \\
$Q(s, a_3)$ & 4 & 8 \\
$\arg \max_{a}Q(s,a)$ & $a_3$ & $a_3$\\
\bottomrule
\end{tabular}
\end{sc}
\end{small}
\end{center}
\vskip -0.1in
\end{table}
In addition, reward shaping can significantly speed up learning by adding a shaping reward $F(s,s')$ to the original reward $r$, where $F(s,s')$ is the general form of any state-based shaping reward. Static potential-based reward shaping (static PBRS) maintains policy invariance if the shaping reward has the form $F(s,s')=\gamma f(s')-f(s)$ \cite{ng1999policy}. This means that we can modify the TD error $\delta = r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi}$ while preserving the optimal policy,
\begin{equation*}
\delta - \omega= r+\gamma \bm{\theta}^{\top}\bm{\phi}'-\bm{\theta}^{\top}\bm{\phi} - \omega,
\end{equation*}
where $\omega$ is a constant acting as a static PBRS. It also means that algorithms whose optimization goal is to minimize an error may, after introducing such a shaping reward, end up with a larger or smaller bias. Fortunately, as discussed above, this kind of bias is acceptable in reinforcement learning. The problem, however, is that selecting an appropriate $\omega$ requires expert knowledge. This motivates learning $\omega$ dynamically, i.e., setting $\omega=\omega_t$. Dynamic PBRS also maintains policy invariance if the shaping reward is $F(s,t,s',t')=\gamma f(s',t')-f(s,t)$, where $t$ is the time step at which the agent reaches state $s$ \cite{devlin2012dynamic}. However, this result requires the dynamic potential function $f(s,t)$ to converge: if $f(s,t)$ does not converge as $t\rightarrow\infty$, the Q-values under dynamic PBRS are not guaranteed to converge either. Let $f_{\omega_t}(s)=\frac{\omega_t}{\gamma-1}$. Then $F_{\omega_t}(s,s')=\gamma f_{\omega_t}(s')-f_{\omega_t}(s)= \omega_t$ is a dynamic PBRS, and if $\omega_t$ converges, so does the dynamic potential function $f_{\omega_t}(s)$. Bias is the expected difference between the predicted value and the true value. Under bootstrapping, a natural choice is therefore $\omega \doteq \mathbb{E}[\mathbb{E}[\delta|s]]=\mathbb{E}[\delta]$.
% As we all know, the optimization process of linear TD(0) (semi-gradient) and linear TDC are as follows, respectively:
% \begin{equation*}
% \theta^{*}= \arg \min_{\theta} \mathbb{E}[(\mathbb{E}[\delta |s])^2],
% \end{equation*}
% and
% \begin{equation*}
% \theta^{*}=\arg \min_{\theta} \mathbb{E}[\delta \phi]^{\top} \mathbb{E}[\phi \phi^{\top}]^{-1} \mathbb{E}[\delta\phi].
% \end{equation*}
% As a result, two novel objective functions and their corresponding algorithms are proposed,
% where $\omega$ is subsequently proven to converge, meaning that these two algorithms can maintain the invariance of the optimal strategy.
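To see why such a constant shift leaves the greedy policy unchanged, consider, as a brief illustration assuming an infinite-horizon discounted return with $\gamma<1$, a constant shaping reward $F(s,s')\equiv c$. Every return, and hence every action value, is shifted by the same amount,
\begin{equation*}
\sum_{t=0}^{\infty}\gamma^{t}\left(r_{t+1}+c\right)=\sum_{t=0}^{\infty}\gamma^{t}r_{t+1}+\frac{c}{1-\gamma},
\end{equation*}
so $\arg\max_{a}Q(s,a)$ is the same with or without the shift, exactly as for the uniformly biased values in Table \ref{example_bias}.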
\subsection{Variance Minimization TD Learning: VMTD}
For on-policy learning, a novel objective function, the Variance of Bellman Error (VBE), is proposed as follows:
\begin{equation}
\begin{array}{ccl}
\arg \min_{\bm{\theta}}\text{VBE}(\bm{\theta})&=&\arg \min_{\bm{\theta}}\mathbb{E}[(\mathbb{E}[\delta|s]-\mathbb{E}[\mathbb{E}[\delta|s]])^2]\\
&=&\arg \min_{\bm{\theta},\omega} \mathbb{E}[(\mathbb{E}[\delta|s]-\omega)^2].
\end{array}
\end{equation}
Clearly, the objective is no longer to minimize the Bellman error itself. First, the parameter $\omega$ is updated directly by stochastic gradient descent:
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\label{omega}
\end{equation}
where $\delta_k$ is the TD error
\begin{equation}
\delta_k = r_{k+1}+\gamma \bm{\theta}_k^{\top}\bm{\phi}_{k+1}-\bm{\theta}_k^{\top}\bm{\phi}_k.
\label{delta}
\end{equation}
Then, based on stochastic semi-gradient descent, the parameter $\bm{\theta}$ is updated as
\begin{equation}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_{k}+\alpha_k(\delta_k-\omega_k)\bm{\phi}_k.
\label{theta}
\end{equation}
The pseudocode of the VMTD algorithm is shown in Algorithm \ref{alg:algorithm 1}.
For control tasks, two extensions of VMTD, named VMSarsa and VMQ, update as follows:
\begin{equation}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_{k}+\alpha_k(\delta_k-\omega_k)\bm{\phi}(s_k,a_k),
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k-\omega_k),
\end{equation}
where the TD error $\delta_k$ in VMSarsa is
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \bm{\theta}_{k}^{\top}\bm{\phi}(s_{k+1},a_{k+1}) - \bm{\theta}_{k}^{\top}\bm{\phi}(s_{k},a_{k}),
\label{deltaSarsa}
\end{equation}
and in VMQ is
\begin{equation}
\delta_{k}=r_{k+1}+\gamma \max_{a\in A}\bm{\theta}_{k}^{\top}\bm{\phi}(s_{k+1},a) - \bm{\theta}_{k}^{\top}\bm{\phi}(s_{k},a_{k}).
\label{deltaQ}
\end{equation}
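As a concrete illustration of updates (\ref{omega})--(\ref{theta}), a minimal NumPy sketch of one VMTD step is given below; the function name, feature dimension, and step sizes are illustrative assumptions rather than a definitive implementation. VMSarsa and VMQ differ only in how $\delta_k$ is computed, via (\ref{deltaSarsa}) and (\ref{deltaQ}).
\begin{verbatim}
import numpy as np

def vmtd_step(theta, omega, phi, phi_next, reward,
              gamma=0.99, alpha=0.01, beta=0.001):
    """One on-policy VMTD update with linear features."""
    # TD error
    delta = reward + gamma * theta @ phi_next - theta @ phi
    # semi-gradient step on the value weights
    theta = theta + alpha * (delta - omega) * phi
    # running estimate of E[delta]
    omega = omega + beta * (delta - omega)
    return theta, omega

# Toy usage with 4-dimensional random features
# (illustrative only).
rng = np.random.default_rng(0)
theta, omega = np.zeros(4), 0.0
phi, phi_next = rng.random(4), rng.random(4)
theta, omega = vmtd_step(theta, omega, phi, phi_next,
                         reward=1.0)
\end{verbatim}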
\begin{algorithm}[t]
\caption{VMTD algorithm with linear function approximation in the on-policy setting}
\label{alg:algorithm 1}
\begin{algorithmic}
\STATE {\bfseries Input:} $\bm{\theta}_{0}$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$ and $\beta_t$
\STATE Initialize $\bm{\theta}_{0}$ arbitrarily and $\omega_{0}=0$; choose $\gamma \in (0,1]$ and constant step sizes $\alpha_t$ and $\beta_t$.
\REPEAT
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ in $S_t$ according to policy $\pi$ and observe $R_{t+1}$ and $S_{t+1}$ (with the corresponding feature vectors $\bm{\phi}_t$ and $\bm{\phi}_{t+1}$)
\STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$
\STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t(\delta_t-\omega_t)\bm{\phi}_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t(\delta_t-\omega_t)$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[t]
\caption{VMTDC algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 2}
\begin{algorithmic}
\STATE {\bfseries Input:} $\bm{\theta}_{0}$, $\bm{u}_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\STATE Initialize $\bm{\theta}_{0}$ arbitrarily, $\bm{u}_{0}=\bm{0}$ and $\omega_{0}=0$; choose $\gamma \in (0,1]$ and constant step sizes $\alpha_t$, $\zeta_t$ and $\beta_t$.
\REPEAT
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ in $S_t$ according to $\mu$ and observe $R_{t+1}$ and $S_{t+1}$ (with the corresponding feature vectors $\bm{\phi}_t$ and $\bm{\phi}_{t+1}$)
\STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t \rho_t[(\delta_t-\omega_t)\bm{\phi}_t - \gamma \bm{\phi}_{t+1}(\bm{\phi}^{\top}_{t} \bm{u}_{t})]$
\STATE $\bm{u}_{t+1}\leftarrow \bm{u}_{t}+\zeta_t[\rho_t(\delta_t-\omega_t) - \bm{\phi}^{\top}_{t} \bm{u}_{t}] \bm{\phi}_t$
\STATE $\omega_{t+1}\leftarrow \omega_{t}+\beta_t \rho_t(\delta_t-\omega_t)$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[t]
\caption{VMETD algorithm with linear function approximation in the off-policy setting}
\label{alg:algorithm 5}
\begin{algorithmic}
\STATE {\bfseries Input:} $\bm{\theta}_{0}$, $F_0$, $\omega_{0}$, $\gamma$, learning rates $\alpha_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\STATE Initialize $\bm{\theta}_{0}$ arbitrarily, $F_{0}=1$ and $\omega_{0}=0$; choose $\gamma \in (0,1]$ and constant step sizes $\alpha_t$ and $\beta_t$.
\REPEAT
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take $A_t$ in $S_t$ according to $\mu$ and observe $R_{t+1}$ and $S_{t+1}$ (with the corresponding feature vectors $\bm{\phi}_t$ and $\bm{\phi}_{t+1}$)
\STATE $\delta_t = R_{t+1}+\gamma\bm{\theta}_t^{\top}\bm{\phi}_{t+1}-\bm{\theta}_t^{\top}\bm{\phi}_t$
\STATE $\rho_{t} \leftarrow \frac{\pi(A_t | S_t)}{\mu(A_t | S_t)}$
\STATE $F_{t}\leftarrow \gamma \rho_{t-1} F_{t-1} +1$ \quad (for $t\geq 1$; $F_0=1$)
\STATE $\bm{\theta}_{t+1}\leftarrow \bm{\theta}_{t}+\alpha_t (F_t \rho_t\delta_t-\omega_t)\bm{\phi}_t$
\STATE
$\omega_{t+1}\leftarrow \omega_{t}+\beta_t (F_t \rho_t\delta_t-\omega_t)$
\ENDFOR
\UNTIL{terminal episode}
\end{algorithmic}
\end{algorithm}
\subsection{Variance Minimization TDC Learning: VMTDC}
For off-policy learning, we employ a projection operator. The objective function is called the Variance of Projected Bellman Error (VPBE), and the corresponding algorithm is called VMTDC:
\begin{equation}
\begin{array}{ccl}
\text{VPBE}(\bm{\theta})&=&\mathbb{E}[(\delta-\mathbb{E}[\delta]) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta-\mathbb{E}[\delta])\bm{\phi}]\\
&=&\mathbb{E}[(\delta-\omega) \bm{\phi}]^{\top} \mathbb{E}[\bm{\phi} \bm{\phi}^{\top}]^{-1}\mathbb{E}[(\delta-\omega)\bm{\phi}],
\end{array}
\end{equation}
where $\omega$ is used to estimate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$. The derivation of VMTDC parallels that of TDC; the only difference is that the original $\delta$ is replaced by $\delta-\omega$. The VMTDC updates are therefore
\begin{equation}
\bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}+\alpha_{k}[(\delta_{k}- \omega_k) \bm{\phi}(s_k) - \gamma\bm{\phi}(s_{k+1})(\bm{\phi}^{\top} (s_k) \bm{u}_{k})],
\label{thetavmtdc}
\end{equation}
\begin{equation}
\bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[\delta_{k}-\omega_k - \bm{\phi}^{\top} (s_k) \bm{u}_{k}]\bm{\phi}(s_k),
\label{uvmtdc}
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k (\delta_k- \omega_k).
\label{omegavmtdc}
\end{equation}
The pseudocode of VMTDC for the importance-sampling (off-policy) setting is shown in Algorithm \ref{alg:algorithm 2}.
For control tasks, the corresponding extension of GQ(0) is named VMGQ(0):
\begin{equation}
\begin{array}{ccl}
\bm{\theta}_{k+1}\leftarrow\bm{\theta}_{k}&+&\alpha_{k}[(\delta_{k}- \omega_k) \bm{\phi}(s_k,a_k)\\
&-& \gamma\bm{\phi}(s_{k+1},A^{*}_{k+1})(\bm{\phi}^{\top} (s_k,a_k) \bm{u}_{k})],
\end{array}
\end{equation}
\begin{equation}
\bm{u}_{k+1}\leftarrow \bm{u}_{k}+\zeta_{k}[(\delta_{k}-\omega_{k}) - \bm{\phi}^{\top} (s_k,a_k) \bm{u}_{k}]\bm{\phi}(s_k,a_k),
\end{equation}
and
\begin{equation}
\omega_{k+1}\leftarrow \omega_{k}+\beta_k(\delta_k- \omega_k),
\end{equation}
where $\delta_{k}$ is given by (\ref{deltaQ}) and $A^{*}_{k+1}={\arg \max}_{a}(\bm{\theta}_{k}^{\top}\bm{\phi}(s_{k+1},a))$.
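Analogously to the VMTD sketch above, one VMTDC step of (\ref{thetavmtdc})--(\ref{omegavmtdc}) can be sketched as follows; again a minimal NumPy illustration under assumed dense features and constant step sizes, not a definitive implementation.
\begin{verbatim}
import numpy as np

def vmtdc_step(theta, u, omega, phi, phi_next, reward,
               gamma=0.99, alpha=0.01, zeta=0.01,
               beta=0.001):
    """One on-policy VMTDC update with linear features."""
    # TD error
    delta = reward + gamma * theta @ phi_next - theta @ phi
    # main weights: corrected step using (delta - omega)
    theta = theta + alpha * ((delta - omega) * phi
                             - gamma * phi_next * (phi @ u))
    # auxiliary weights for the gradient correction term
    u = u + zeta * (delta - omega - phi @ u) * phi
    # running estimate of E[delta]
    omega = omega + beta * (delta - omega)
    return theta, u, omega

# Toy usage (illustrative only).
rng = np.random.default_rng(0)
theta, u, omega = np.zeros(4), np.zeros(4), 0.0
phi, phi_next = rng.random(4), rng.random(4)
theta, u, omega = vmtdc_step(theta, u, omega,
                             phi, phi_next, 1.0)
\end{verbatim}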
\subsection{Variance Minimization ETD Learning: VMETD}
ETD introduces a scalar followon trace, $F$, into off-policy TD learning to ensure convergence under off-policy training. We further introduce the scalar $\omega$ into ETD to obtain VMETD, which performs the following updates:
\begin{equation}
\rho_{k} \leftarrow \frac{\pi(A_k | S_k)}{\mu(A_k | S_k)},
\end{equation}
\begin{equation}
\label{fvmetd}
F_k \leftarrow \gamma \rho_{k-1}F_{k-1}+1,
\end{equation}
\begin{equation}
\label{thetavmetd}
\bm{\theta}_{k+1}\leftarrow \bm{\theta}_k+\alpha_k (F_k \rho_k\delta_k - \omega_{k})\bm{\phi}_k,
\end{equation}
\begin{equation}
\label{omegavmetd}
\omega_{k+1} \leftarrow \omega_k+\beta_k(F_k \rho_k \delta_k - \omega_k),
\end{equation}
where $\delta_k$ is the TD error (\ref{delta}), $\mu$ is the behavior policy, $\pi$ is the target policy, $F_k$ is a scalar followon trace with $F_0=1$, and $\omega$ is updated toward $F_k\rho_k\delta_k$ and thus estimates $\mathbb{E}_{\mu}[F_k\rho_k\delta_k]$. For the analysis, let $\textbf{F}$ denote the diagonal matrix with diagonal elements $f(s)\dot{=}d_{\mu}(s)\lim_{k\rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k=s]$, which we assume exists. The vector $\textbf{f}\in \mathbb{R}^N$ with components $[\textbf{f}]_s\dot{=}f(s)$ can be written as
\begin{equation}
\begin{split}
\textbf{f}&=\textbf{d}_{\mu}+\gamma \textbf{P}_{\pi}^{\top}\textbf{d}_{\mu}+(\gamma \textbf{P}_{\pi}^{\top})^2\textbf{d}_{\mu}+\ldots\\
&=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}.
\end{split}
\end{equation}
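Finally, a minimal NumPy sketch of one VMETD step following (\ref{fvmetd})--(\ref{omegavmetd}) is given below; passing $\rho_{k-1}$ and the previous followon trace as explicit arguments, and the chosen step sizes, are illustrative assumptions rather than a definitive implementation.
\begin{verbatim}
import numpy as np

def vmetd_step(theta, omega, F, rho_prev, rho,
               phi, phi_next, reward,
               gamma=0.99, alpha=0.01, beta=0.001):
    """One VMETD update. F is the followon trace from the
    previous step; start with F = 0.0 so that the first
    call yields F_0 = 1."""
    # TD error
    delta = reward + gamma * theta @ phi_next - theta @ phi
    # followon trace
    F = gamma * rho_prev * F + 1.0
    # emphasis- and importance-weighted TD error
    target = F * rho * delta
    # weight update and running estimate of E[F*rho*delta]
    theta = theta + alpha * (target - omega) * phi
    omega = omega + beta * (target - omega)
    return theta, omega, F

# Toy usage (illustrative only).
rng = np.random.default_rng(0)
theta, omega, F = np.zeros(4), 0.0, 0.0
phi, phi_next = rng.random(4), rng.random(4)
theta, omega, F = vmetd_step(theta, omega, F, 1.0, 1.0,
                             phi, phi_next, reward=1.0)
\end{verbatim}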