\section{Background}
\subsection{MDP and 2048 game}
Consider a Markov decision process (MDP) $\langle \mathcal{S}, \mathcal{A}, \mathcal{R}, \mathcal{T}\rangle$, where $\mathcal{S}=\{1,2,3,\ldots\}$ is a finite state space with $|\mathcal{S}|=n$, $\mathcal{A}$ is an action space, $\mathcal{T}:\mathcal{S}\times \mathcal{A}\times \mathcal{S}\rightarrow [0,1]$ is a transition function, and $\mathcal{R}:\mathcal{S}\times \mathcal{A}\times \mathcal{S}\rightarrow \mathbb{R}$ is a reward function. A policy $\pi:\mathcal{S}\times \mathcal{A}\rightarrow [0,1]$ selects action $a$ in state $s$ with probability $\pi(a|s)$. The state value function under policy $\pi$, denoted $V^{\pi}:\mathcal{S}\rightarrow \mathbb{R}$, is the expected sum of rewards obtained by following $\pi$: $V^{\pi}(s)=\mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty}r_t \mid s_0=s\right]$.

The 2048 game is played on a $4\times 4$ board of 16 squares. At the beginning of the game, two squares are randomly filled with tiles of value 2 or 4. The player can move in four directions: \textit{up}, \textit{down}, \textit{left}, and \textit{right}. When a direction is chosen, all tiles slide in that direction until they hit the edge of the board or another tile. If two tiles with the same number are adjacent along the moving direction, they merge into a single tile whose value is the sum of the two; each tile can participate in at most one merge per move. After each move, a new tile appears on a random empty square: it is 2 with probability 0.9 and 4 with probability 0.1. The game ends when all squares are filled and no merge is possible.
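To make the move and merge rules concrete, the following minimal sketch (in Python; the helper names \texttt{merge\_row\_left} and \texttt{spawn\_tile} are illustrative choices of ours, not part of any reference 2048 implementation) slides one row to the left, merges equal neighbours at most once, and spawns a new tile with the probabilities stated above.
\begin{verbatim}
import random

def merge_row_left(row):
    """Slide one row of four tiles to the left and merge equal
    neighbours at most once each; 0 denotes an empty square.
    Returns (new_row, reward)."""
    tiles = [v for v in row if v != 0]           # slide: drop empty squares
    merged, reward, i = [], 0, 0
    while i < len(tiles):
        if i + 1 < len(tiles) and tiles[i] == tiles[i + 1]:
            merged.append(tiles[i] * 2)          # each tile merges only once
            reward += tiles[i] * 2
            i += 2
        else:
            merged.append(tiles[i])
            i += 1
    return merged + [0] * (4 - len(merged)), reward

def spawn_tile(board):
    """Place a new tile on a random empty square (assumes one exists):
    value 2 with probability 0.9, value 4 with probability 0.1."""
    empty = [(r, c) for r in range(4) for c in range(4) if board[r][c] == 0]
    r, c = random.choice(empty)
    board[r][c] = 2 if random.random() < 0.9 else 4

print(merge_row_left([2, 2, 4, 0]))  # -> ([4, 4, 0, 0], 4)
\end{verbatim}
Moves in the other three directions can be obtained by applying the same routine to reversed rows or to columns.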
\subsection{Ergodicity and Non-ergodicity of Markov Chains}
Given a stationary policy $\pi$, the MDP becomes a Markov chain on the state space $\mathcal{S}$ with transition matrix $P_{\pi}\in[0,1]^{n\times n}$, where $P_{\pi}(s_1,s_2)=\sum_{a\in \mathcal{A}}\pi(a|s_1)\mathcal{T}(s_1,a,s_2)$ is the transition probability from $s_1$ to $s_2$ and $\sum_{s'\in \mathcal{S}}P_{\pi}(s,s')=1$ for all $s\in \mathcal{S}$. A stationary measure for $P_{\pi}$ is a probability distribution $d_{\pi}$ on $\mathcal{S}$ such that
\begin{equation}
d_{\pi}=P_{\pi}^{\top}d_{\pi}.
\label{invariance}
\end{equation}
That is, for all $s\in \mathcal{S}$,
\begin{equation}
\sum_{s'\in \mathcal{S}}P_{\pi}(s',s)d_{\pi}(s')=d_{\pi}(s).
\end{equation}
\begin{definition}[Ergodicity]
Assume that $d_{\pi}$ exists for every policy $\pi$ and is independent of the initial state. The MDP is ergodic if $d_{\pi}(s)>0$ for all $s\in \mathcal{S}$.
\end{definition}
This means that, under any policy, every state is reachable from the current state after sufficiently many steps \cite{majeed2018q}. A sufficient condition for this assumption is that 1 is a simple eigenvalue of $P_{\pi}$ and all other eigenvalues of $P_{\pi}$ have modulus strictly less than 1.
\input{pic/randomWalk}
The random walk in Figure \ref{randomwalk} is a Markov chain in which the agent starts from node C and moves left or right with probability 0.5 each, until it reaches the leftmost or rightmost node, where it terminates. Such terminal states are usually called absorbing states.
The transition probability matrix of the random walk with absorbing states, $P_{\text{ab}}$, is defined as follows:
\[
P_{\text{ab}}\dot{=}\begin{array}{c|cccccc}
 &\text{T} & \text{A} & \text{B} & \text{C} & \text{D} & \text{E} \\\hline
\text{T} & 1 & 0 & 0 & 0 & 0 & 0 \\
\text{A} & \frac{1}{2} & 0 & \frac{1}{2} & 0 & 0 & 0 \\
\text{B} & 0 & \frac{1}{2} & 0 & \frac{1}{2} & 0 & 0 \\
\text{C} & 0 & 0 & \frac{1}{2} & 0 & \frac{1}{2} & 0 \\
\text{D} & 0 & 0 & 0 & \frac{1}{2} & 0 & \frac{1}{2} \\
\text{E} & \frac{1}{2} & 0 & 0 & 0 & \frac{1}{2} & 0
\end{array}
\]
Note that the two absorbing ends can be combined into a single absorbing state T. According to (\ref{invariance}), the stationary distribution is $d_{\text{ab}}=(1, 0, 0, 0, 0, 0)$. Since the probabilities of A, B, C, D, and E are all zero, the random walk with absorbing states is non-ergodic.
\input{pic/randomWalkRestart}
In reinforcement learning, however, we usually work under the ergodicity assumption: when an absorbing state is reached, the process immediately resets to the initial state. Figure \ref{randomwalkRestart} shows the random walk with restarts. Its transition probability matrix $P_{\text{restart}}$ is defined as follows:
\[
P_{\text{restart}}\dot{=}\begin{array}{c|cccccc}
 &\text{T} & \text{A} & \text{B} & \text{C} & \text{D} & \text{E} \\\hline
\text{T} & 0 & 0 & 0 & 1 & 0 & 0 \\
\text{A} & \frac{1}{2} & 0 & \frac{1}{2} & 0 & 0 & 0 \\
\text{B} & 0 & \frac{1}{2} & 0 & \frac{1}{2} & 0 & 0 \\
\text{C} & 0 & 0 & \frac{1}{2} & 0 & \frac{1}{2} & 0 \\
\text{D} & 0 & 0 & 0 & \frac{1}{2} & 0 & \frac{1}{2} \\
\text{E} & \frac{1}{2} & 0 & 0 & 0 & \frac{1}{2} & 0 \\
\end{array}
\]
According to (\ref{invariance}), the stationary distribution is $d_{\text{restart}}=(0.1, 0.1, 0.2, 0.3, 0.2, 0.1)$. Since the probabilities of T, A, B, C, D, and E are all positive, the random walk with restarts is ergodic.
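Both stationary distributions can be checked numerically. The sketch below (Python with NumPy; the helper \texttt{stationary\_distribution} is our own illustrative function) solves $d=P^{\top}d$ by taking the eigenvector of $P^{\top}$ associated with eigenvalue 1. Applied to $P_{\text{restart}}$ it returns approximately $(0.1, 0.1, 0.2, 0.3, 0.2, 0.1)$; applied to $P_{\text{ab}}$ it returns $(1, 0, 0, 0, 0, 0)$.
\begin{verbatim}
import numpy as np

def stationary_distribution(P):
    """Solve d = P^T d: take the eigenvector of P^T for the
    eigenvalue closest to 1 and normalise it to sum to 1."""
    eigvals, eigvecs = np.linalg.eig(P.T)
    k = np.argmin(np.abs(eigvals - 1.0))
    d = np.real(eigvecs[:, k])
    return d / d.sum()

# States ordered as T, A, B, C, D, E; row T resets to the start state C.
P_restart = np.array([
    [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
    [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],
    [0.0, 0.5, 0.0, 0.5, 0.0, 0.0],
    [0.0, 0.0, 0.5, 0.0, 0.5, 0.0],
    [0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
    [0.5, 0.0, 0.0, 0.0, 0.5, 0.0],
])
print(np.round(stationary_distribution(P_restart), 3))
# [0.1 0.1 0.2 0.3 0.2 0.1]
\end{verbatim}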
\subsection{Ergodicity between non-absorbing states}
For Markov chains with absorbing states, the transition matrix $P$ is usually decomposed into the canonical form
\[
P = \begin{bmatrix}
Q & R \\
0 & I
\end{bmatrix},
\]
where $Q$ contains the transition probabilities between non-absorbing states, $R$ contains the transition probabilities from non-absorbing states to absorbing states, $I$ contains the transition probabilities between absorbing states, and $0$ is a zero matrix. The expected number of visits to non-absorbing states before absorption is given by the fundamental matrix
\begin{equation}
N\dot{=} \sum_{i=0}^{\infty}Q^i=(I_{n-1}-Q)^{-1},
\label{definitionN}
\end{equation}
where $I_{n-1}$ is the $(n-1)\times(n-1)$ identity matrix and $N_{ij}$ is the expected number of visits to state $j$ when starting from state $i$. It is now straightforward to define ergodicity restricted to the non-absorbing states.
\begin{definition}[Ergodicity between non-absorbing states]
Assume that $N$ exists for every policy $\pi$ and does not depend on the initial state. The MDP is ergodic between non-absorbing states if $N_{ij}>0$ for all $i,j \in \mathcal{S}\setminus\{\text{T}\}$.
\label{definition2}
\end{definition}
For the random walk with absorbing states,
\[
P_{\text{ab}} = \begin{bmatrix}
Q_{\text{ab}} & R_{\text{ab}} \\
0 & I_{\text{ab}}
\end{bmatrix},
\]
where
\[
Q_{\text{ab}}\dot{=}\begin{array}{c|ccccc}
 & \text{A} & \text{B} & \text{C} & \text{D} & \text{E} \\\hline
\text{A} & 0 & \frac{1}{2} & 0 & 0 & 0 \\
\text{B} & \frac{1}{2} & 0 & \frac{1}{2} & 0 & 0 \\
\text{C} & 0 & \frac{1}{2} & 0 & \frac{1}{2} & 0 \\
\text{D} & 0 & 0 & \frac{1}{2} & 0 & \frac{1}{2} \\
\text{E} & 0 & 0 & 0 & \frac{1}{2} & 0
\end{array}
\]
Then,
\[
N_{\text{ab}}=(I_5-Q_{\text{ab}})^{-1}=\begin{array}{c|ccccc}
 & \text{A} & \text{B} & \text{C} & \text{D} & \text{E} \\\hline
\text{A} & \frac{5}{3} & \frac{4}{3} & 1 & \frac{2}{3} & \frac{1}{3} \\
\text{B} & \frac{4}{3} & \frac{8}{3} & 2 & \frac{4}{3} & \frac{2}{3} \\
\text{C} & 1 & 2 & 3 & 2 & 1 \\
\text{D} & \frac{2}{3} & \frac{4}{3} & 2 & \frac{8}{3} & \frac{4}{3} \\
\text{E} & \frac{1}{3} & \frac{2}{3} & 1 & \frac{4}{3} & \frac{5}{3} \\
\end{array}
\]
Based on Definition \ref{definition2}, since all entries of $N_{\text{ab}}$ are positive, the random walk with absorbing states is ergodic between non-absorbing states.
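As a numerical check of Definition \ref{definition2} on this example, the short sketch below (Python with NumPy, continuing the same illustrative setup) forms $Q_{\text{ab}}$, computes the fundamental matrix $N_{\text{ab}}=(I_5-Q_{\text{ab}})^{-1}$, and confirms that every entry is positive and that row C matches the table above.
\begin{verbatim}
import numpy as np

# Q_ab: transitions among the non-absorbing states A, B, C, D, E
# (1/2 on the sub- and super-diagonal, 0 elsewhere).
Q_ab = 0.5 * (np.eye(5, k=1) + np.eye(5, k=-1))

# Fundamental matrix: N[i, j] = expected visits to state j before
# absorption when starting from state i.
N_ab = np.linalg.inv(np.eye(5) - Q_ab)

print(np.allclose(N_ab[2], [1, 2, 3, 2, 1]))  # True: row C of the table
print(bool((N_ab > 0).all()))                 # True: ergodic between
                                              # non-absorbing states
\end{verbatim}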