$, learning rate $\alpha_t$, $\zeta_t$ and $\beta_t$, behavior policy $\mu$ and target policy $\pi$
\REPEAT
\STATE For each episode, initialize $\theta_{0}$ arbitrarily, set $u_{0}$ and $\omega_{0}$ to $0$, choose $\gamma\in(0,1]$, and keep $\alpha_t$, $\zeta_t$ and $\beta_t$ constant.\\
\textbf{Output}: $\theta^*$.\\
\FOR{$t=0$ {\bfseries to} $T-1$}
\STATE Take action $A_t$ in state $S_t$ according to $\mu$, and arrive at $S_{t+1}$\\
\STATE Observe the sample $(S_t, R_{t+1}, S_{t+1})$ at time step $t$ (with the corresponding state feature vectors)\\
% where $\textbf{E}_\mu \in \mathbb{R}^{N \times d}$ and every row of $\textbf{E}_\mu$ equals $\mathbb{E}_{\mu}[\phi_t - \gamma \phi'_{t}]^{\top}$.
% The key matrix is $\textbf{F} (\textbf{I} - \gamma \textbf{P}_{\pi}) - \textbf{f} \textbf{d}_{\mu}^{\top} (\textbf{I} - \gamma \textbf{P}_{\mu})$, and the vector of its column sums is
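The sampling loop in the algorithm fragment above can be made concrete with a short executable sketch. The listing below is only an illustration, not the paper's method: it assumes a small randomly generated MDP, a fixed linear feature map, a uniform behavior policy $\mu$, an arbitrary target policy $\pi$, and constant step sizes, and because the VMTDC-specific update terms are not part of this excerpt it uses the standard off-policy TDC correction purely as a placeholder for the per-step update.
\begin{verbatim}
import numpy as np

# Minimal sketch of the off-policy sampling loop above.  The MDP, the
# features, the policies and the step sizes are illustrative assumptions;
# the update is the standard TDC correction, used only as a placeholder
# for the paper's VMTDC update.
rng = np.random.default_rng(0)

n_states, n_actions, d = 7, 2, 4
P = rng.dirichlet(np.ones(n_states), size=(n_states, n_actions))  # P[s, a]: next-state dist.
R = rng.normal(size=(n_states, n_actions))                        # expected rewards
Phi = rng.normal(size=(n_states, d))                               # fixed state features

mu = np.full((n_states, n_actions), 1.0 / n_actions)  # behavior policy (uniform)
pi = np.tile([0.9, 0.1], (n_states, 1))                # target policy (assumed)

gamma, alpha, beta = 0.99, 0.01, 0.05  # discount and constant step sizes
theta = np.zeros(d)                    # primary weights
omega = np.zeros(d)                    # secondary (correction) weights

s = 0
for t in range(10000):
    a = rng.choice(n_actions, p=mu[s])        # take A_t from S_t according to mu
    s_next = rng.choice(n_states, p=P[s, a])  # arrive at S_{t+1}
    r = R[s, a]
    rho = pi[s, a] / mu[s, a]                 # importance-sampling ratio

    phi, phi_next = Phi[s], Phi[s_next]
    delta = r + gamma * phi_next @ theta - phi @ theta  # TD error

    # Two-timescale placeholder update (TDC), not the paper's VMTDC rule.
    theta += alpha * rho * (delta * phi - gamma * (phi @ omega) * phi_next)
    omega += beta * rho * (delta - phi @ omega) * phi

    s = s_next
\end{verbatim}
The only intent of the sketch is the control flow: act under $\mu$, form the importance ratio against $\pi$, and run a two-timescale update in which the secondary weights use a faster step size; the extra step sizes $\zeta_t$ and $\beta_t$ in the pseudocode presumably play this two-timescale role, but the mapping here is illustrative only.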