Commit 828184a4 by GongYu

Added the VMETD update rule and its convergence proof.

parent afbe69ae
...@@ -97,7 +97,7 @@ stochastic gradient descent:
\end{equation}
where $\delta_k$ is the TD error as follows:
\begin{equation}
\delta_k = r_{k+1}+\gamma
\theta_k^{\top}\phi_{k}'-\theta_k^{\top}\phi_k.
\label{delta}
\end{equation}
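For concreteness, a small numerical sketch of this TD error under linear function approximation; the feature vectors, reward, step size, and the standard semi-gradient TD(0) update written below are illustrative assumptions, not values or code taken from this paper.
\begin{verbatim}
import numpy as np

gamma, alpha = 0.9, 0.1
theta   = np.array([0.5, -0.2])   # current weights theta_k
phi_k   = np.array([1.0, 0.0])    # phi_k  = phi(S_k)
phi_kp1 = np.array([0.0, 1.0])    # phi_k' = phi(S_{k+1})
r_kp1   = 1.0                     # R_{k+1}

# TD error: delta_k = R_{k+1} + gamma * theta^T phi_k' - theta^T phi_k
delta = r_kp1 + gamma * theta @ phi_kp1 - theta @ phi_k
# semi-gradient TD(0) step: theta_{k+1} = theta_k + alpha * delta_k * phi_k
theta = theta + alpha * delta * phi_k
\end{verbatim}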
...@@ -204,4 +204,41 @@ and
\end{equation}
where $\delta_{k}$ is (\ref{deltaQ}) and $A^{*}_{k+1}={\arg \max}_{a}(\theta_{k}^{\top}\phi(s_{k+1},a))$.
This paper also introduces an additional parameter $\omega$ into the GTD and GTD2 algorithms. For details, please refer to the appendix.
\subsection{Variance Minimization ETD Learning: VMETD}
VMETD is defined by the following updates:
% \begin{equation}
% \delta_{t}= R_{t+1}+\gamma \theta_t^{\top}\phi_{t+1}-\theta_t^{\top}\phi_t.
% \end{equation}
\begin{equation}
\rho_{k} \leftarrow \frac{\pi(A_k | S_k)}{\mu(A_k | S_k)},
\end{equation}
\begin{equation}
\label{fvmetd}
F_k \leftarrow \gamma \rho_{k-1}F_{k-1}+1,
\end{equation}
\begin{equation}
\label{omegavmetd}
\omega_{k+1} \leftarrow \omega_k+\beta_k(F_k \rho_k \delta_k - \omega_k),
\end{equation}
\begin{equation}
\label{thetavmetd}
\theta_{k+1}\leftarrow \theta_k+\alpha_k F_k \rho_k (R_{k+1}+\gamma \theta_k^{\top}\phi_{k+1}-\theta_k^{\top}\phi_k)\phi_k -\alpha_k \omega_{k+1}\phi_k,
\end{equation}
where $\mu$ is the behavior policy and $\pi$ is the target policy,
$F_k$ is a scalar follow-on trace with
$F_0=1$, $\omega$ is used to estimate $\mathbb{E}[\delta]$, i.e., $\omega \doteq \mathbb{E}[\delta]$, and
$\textbf{F}$ is a diagonal matrix with diagonal elements
$f(s)\doteq d_{\mu}(s)\lim_{k\rightarrow \infty}\mathbb{E}_{\mu}[F_k|S_k=s]$,
which we assume exists.
The vector $\textbf{f}\in \mathbb{R}^N$ with components
$[\textbf{f}]_s\doteq f(s)$ can be written as
\begin{equation}
\begin{split}
\textbf{f}&=\textbf{d}_{\mu}+\gamma \textbf{P}_{\pi}^{\top}\textbf{d}_{\mu}+(\gamma \textbf{P}_{\pi}^{\top})^2\textbf{d}_{\mu}+\ldots\\
&=(\textbf{I}-\gamma\textbf{P}_{\pi}^{\top})^{-1}\textbf{d}_{\mu}.
\end{split}
\end{equation}
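As a reading aid, the following is a minimal per-step sketch of the VMETD updates above under linear function approximation. It is not the authors' implementation: \texttt{phi}, \texttt{pi\_prob}, and \texttt{mu\_prob} are hypothetical callables standing in for the feature map and the two policies, and the constant step sizes are placeholders rather than the schedules analyzed later.
\begin{verbatim}
import numpy as np

def vmetd_step(theta, omega, F_prev, rho_prev, transition,
               phi, pi_prob, mu_prob,
               gamma=0.99, alpha=1e-3, beta=1e-2):
    """One VMETD update: rho_k, F_k, omega_{k+1}, theta_{k+1}."""
    s, a, r, s_next = transition
    phi_k, phi_next = phi(s), phi(s_next)

    rho = pi_prob(a, s) / mu_prob(a, s)      # importance-sampling ratio rho_k
    F = gamma * rho_prev * F_prev + 1.0      # follow-on trace F_k
    delta = r + gamma * theta @ phi_next - theta @ phi_k   # TD error delta_k

    omega_new = omega + beta * (F * rho * delta - omega)   # scalar tracker omega_{k+1}
    theta_new = (theta
                 + alpha * F * rho * delta * phi_k         # emphatic TD term
                 - alpha * omega_new * phi_k)              # variance-correction term

    return theta_new, omega_new, F, rho      # carry F_k and rho_k to step k+1
\end{verbatim}
Initializing \texttt{F\_prev} (or \texttt{rho\_prev}) to $0$ before the first call makes the first computed trace equal $F_0=1$, matching the initialization stated above.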
...@@ -82,4 +82,31 @@ Please refer to the appendix \ref{proofcorollary4_2} for detailed proof process.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to the appendix \ref{proofth2} for the detailed proof.
\begin{theorem}
\label{theorem3}(Convergence of VMETD).
In the case of off-policy learning, consider the iterations (\ref{omegavmetd}) and (\ref{thetavmetd}) with (\ref{delta}) of VMETD.
Let the step-size sequences $\alpha_k$ and $\beta_k$, $k\geq 0$, satisfy $\alpha_k,\beta_k>0$ for all $k$,
$
\sum_{k=0}^{\infty}\alpha_k=\sum_{k=0}^{\infty}\beta_k=\infty,
$
$
\sum_{k=0}^{\infty}\alpha_k^2<\infty,
$
$
\sum_{k=0}^{\infty}\beta_k^2<\infty,
$
and
$
\alpha_k = o(\beta_k).
$
Assume that $(\phi_k,r_k,\phi_k')$ is an i.i.d. sequence with
uniformly bounded second moments, where $\phi_k$ and $\phi'_{k}$ are sampled from the same Markov chain.
Let $A = \mathrm{Cov}(\phi,\phi-\gamma\phi')$,
$b=\mathrm{Cov}(r,\phi)$.
Assume that matrix $A$ is non-singular.
Then the parameter vector $\theta_k$ converges with probability one
to $A^{-1}b$.
\end{theorem}
Please refer to the appendix \ref{proofVMETD} for the detailed proof.
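One illustrative step-size schedule satisfying these conditions (given here only as an example, not a choice made in this paper) is
\begin{equation*}
\alpha_k=\frac{1}{k+1},\qquad \beta_k=\frac{1}{(k+1)^{2/3}},
\end{equation*}
for which $\sum_{k}\alpha_k$ and $\sum_{k}\beta_k$ diverge, $\sum_{k}\alpha_k^2$ and $\sum_{k}\beta_k^2$ converge, and $\alpha_k/\beta_k=(k+1)^{-1/3}\rightarrow 0$, so $\alpha_k=o(\beta_k)$: the $\omega$-recursion (\ref{omegavmetd}) runs on the faster timescale while the $\theta$-recursion (\ref{thetavmetd}) runs on the slower one.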
...@@ -42,6 +42,9 @@
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{tikz}
\usepackage{bm}
\usepackage{esvect}
\usepackage{multirow}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
...