Commit 3d352cbd by GongYu

TDAlgorithms_IEEE24

from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
class ABTD(BaseVariableLmbda):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
zeta = kwargs.get('zeta')
self.old_nu = 0
if self.task.num_policies > 1:
self.old_nu = np.zeros(self.task.num_policies)
xi_zero = self.task.ABTD_xi_zero
xi_max = self.task.ABTD_xi_max
self.xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
@staticmethod
def related_parameters():
return ['alpha', 'zeta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, rho, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
nu = min(self.xi, 1.0 / max(pi, mu))
self.z = x + self.gamma * self.old_nu * self.old_pi * self.z
self.w += alpha * delta * self.z
self.old_nu = nu
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
nu = self.compute_nu_for_multiple_policies(pi, mu)
self.z = (self.gamma_vec_t * self.old_nu * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_nu = nu
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
def compute_nu_for_multiple_policies(self, pi, mu):
xi_vec = np.ones(self.task.num_policies) * self.xi
max_vec = 1.0 / np.maximum.reduce([pi, mu])
return np.minimum.reduce([max_vec, xi_vec])
def reset(self):
super().reset()
self.old_nu = 0
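For reference, the xi computed in __init__ above interpolates from 0 at zeta = 0, through xi_zero at zeta = 0.5, up to xi_max at zeta = 1, and is then used to cap nu = min(xi, 1 / max(pi, mu)). A minimal standalone sketch, assuming hypothetical values for the task constants ABTD_xi_zero and ABTD_xi_max:
# Sketch of the xi computation from __init__ above (xi_zero and xi_max values are hypothetical).
def compute_xi(zeta, xi_zero, xi_max):
    return 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
print(compute_xi(0.5, 1.0, 4.0))  # 1.0 == xi_zero
print(compute_xi(1.0, 1.0, 4.0))  # 4.0 == xi_max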
import numpy as np
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
class BaseGradient(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.v = np.zeros(self.task.num_features)
self.eta = kwargs.get('eta')
if self.task.num_policies > 1:
self.v = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta']
def compute_second_step_size(self):
return self.eta * self.compute_step_size()
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super(BaseGradient, self).learn_multiple_policies(
s, s_p, r, is_terminal)
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x, self.task.stacked_feature_rep[:, :, s_p], \
self.compute_second_step_size() * self.gamma_vec_t / self.gamma
import numpy as np
from numpy.linalg import pinv
from Tasks.BaseTask import BaseTask
from Algorithms.BaseTD import BaseTD
class BaseLS(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super(BaseLS, self).__init__(task, **kwargs)
self.A = np.zeros((self.task.num_features, self.task.num_features))
self.b = np.zeros(self.task.num_features)
self.t = 0
if self.task.num_policies > 1:
self.A = np.zeros((self.task.num_policies, self.task.num_features, self.task.num_features))
self.b = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma_vec_t = np.concatenate((np.ones(2), np.zeros(6))) * self.gamma
self.t = np.zeros(self.task.num_policies)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
self.t += 1
self.A += (np.outer(self.z, (x - self.gamma * x_p)) - self.A) / self.t
self.b += (r * self.z - self.b) / self.t
self.w = np.dot(pinv(self.A), self.b)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
_, _, x, x_p, _, _, _, stacked_x = \
super(BaseLS, self).learn_multiple_policies(s, s_p, r, is_terminal)
for i in range(self.task.num_policies):
if self.gamma_vec_t[i] != 0.0:
self.t[i] += 1
z = self.z[i, :]
self.A[i, :, :] += (np.outer(z, (x - self.gamma_vec_tp[i] * x_p)) - self.A[i, :, :]) / self.t[i]
self.b[i, :] += (self.r_vec[i] * z - self.b[i, :]) / self.t[i]
self.w[i, :] = np.dot(pinv(self.A[i, :, :]), self.b[i, :])
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
from Tasks.BaseTask import BaseTask
class BaseTD:
def __init__(self, task: BaseTask, **kwargs):
self.task = task
self.w = np.zeros(self.task.num_features)
self.z = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.w = np.zeros((self.task.num_policies, self.task.num_features))
self.z = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma = self.task.GAMMA
self.alpha = kwargs['alpha']
self.lmbda = kwargs.get('lmbda')
self.state_values = self.task.load_state_values() # This is of size num_policies * 121
self.d_mu = self.task.load_behavior_dist() # same size as state_values
self.state, self.next_state, self.action = None, None, None
self.r_vec = np.zeros(self.task.num_policies)
self.gamma_vec_tp = np.zeros(self.task.num_policies)
self.gamma_vec_t = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
def compute_value_function(self):
return np.dot(self.w, self.task.feature_rep.T)
def compute_rmsve(self):
error = self.compute_value_function() - self.state_values
error_squared = error * error
return np.sqrt(np.sum(self.d_mu * error_squared.T, 0) / np.sum(self.d_mu, 0)), error
def compute_step_size(self):
return self.alpha
def choose_behavior_action(self):
return self.task.select_behavior_action(self.state)
def choose_target_action(self):
return self.task.select_target_action(self.state)
def learn(self, s, s_p, r, is_terminal):
if self.task.num_policies == 1:
self.learn_single_policy(s, s_p, r, is_terminal)
else:
self.learn_multiple_policies(s, s_p, r, is_terminal)
def get_features(self, s, s_p, is_terminal):
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
x = self.task.get_state_feature_rep(s)
return x, x_p
def get_isr(self, s):
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
return rho
def get_delta(self, r, x, x_p):
return r + self.gamma * np.dot(self.w, x_p) - np.dot(self.w, x)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
alpha = self.compute_step_size()
delta = self.get_delta(r, x, x_p)
self.z = rho * (self.gamma * self.lmbda * self.z + x)
return delta, alpha, x, x_p, rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
active_policies_vec = self.task.get_active_policies(s)
self.r_vec = np.zeros(self.task.num_policies)
if r > 0:
terminal_policies_vec = self.task.get_terminal_policies(s_p)
self.r_vec = r * terminal_policies_vec
alpha_vec = active_policies_vec * self.compute_step_size()
x = self.task.get_state_feature_rep(s)
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
self.gamma_vec_tp = self.task.get_active_policies(s_p) * self.gamma
delta = self.r_vec + self.gamma_vec_tp * np.dot(self.w, x_p) - np.dot(self.w, x)
stacked_x = self.task.stacked_feature_rep[:, :, s]
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x
def reset(self):
self.z = np.zeros(self.task.num_features)
def __str__(self):
return f'agent:{type(self).__name__}'
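compute_rmsve above returns the d_mu-weighted root-mean-squared value error between the learned and true state values. A small self-contained sketch of the same computation on hypothetical arrays (single policy, three states):
import numpy as np
v_true = np.array([1.0, 0.5, 0.0])   # hypothetical true state values
v_hat = np.array([0.8, 0.6, 0.1])    # hypothetical learned values
d_mu = np.array([0.5, 0.3, 0.2])     # hypothetical behavior distribution
error = v_hat - v_true
rmsve = np.sqrt(np.sum(d_mu * error ** 2) / np.sum(d_mu))
print(rmsve)  # approximately 0.158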
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
import numpy as np
class BaseVariableLmbda(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.old_pi, self.old_mu = 0, 1
if self.task.num_policies > 1:
self.old_pi, self.old_mu = np.zeros(self.task.num_policies), np.ones(self.task.num_policies)
self.old_rho = self.old_pi / self.old_mu
def learn_single_policy(self, s, s_p, r, is_terminal):
alpha = self.compute_step_size()
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
x, x_p = self.get_features(s, s_p, is_terminal)
delta = rho * self.get_delta(r, x, x_p)
return delta, alpha, x, x_p, rho, pi, mu
def reset(self):
self.old_pi, self.old_mu = 0, 1
self.old_rho = self.old_pi / self.old_mu
from Algorithms.ETDLB import ETDLB
class ETD(ETDLB):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
from Algorithms.BaseTD import BaseTD
import numpy as np
class ETDLB(BaseTD):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.F = 1
self.old_rho = 0
self.beta = kwargs.get('beta')
if self.task.num_policies > 1:
self.F = np.zeros(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
delta = self.get_delta(r, x, x_p)
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda * 1 + (1 - self.lmbda) * self.F
rho = self.get_isr(s)
self.z = rho * (x * m + self.gamma * self.lmbda * self.z)
self.w += self.compute_step_size() * delta * self.z
self.old_rho = rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, _ = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x = self.task.stacked_feature_rep[:, :, s]
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
self.w += (alpha_vec * delta)[:, None] * self.z
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
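The ETDLB updates above follow the usual ETD(lambda, beta) recursions: the follow-on trace F_t = beta * rho_{t-1} * F_{t-1} + 1 and the emphasis m_t = lambda + (1 - lambda) * F_t. A scalar sketch with a hypothetical sequence of importance sampling ratios:
# Scalar sketch of the follow-on/emphasis recursion used in learn_single_policy above.
beta, lmbda = 0.9, 0.5
F, old_rho = 1.0, 0.0
for rho in [1.2, 0.8, 1.0]:  # hypothetical importance sampling ratios
    F = beta * old_rho * F + 1.0
    m = lmbda + (1.0 - lmbda) * F
    old_rho = rho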
from Algorithms.BaseTD import BaseTD
import numpy as np
class GEMETD(BaseTD):
"""
An ETD(0) implementation that uses GEM (aka GTD2(0) with x and x_p switched) to estimate emphasis.
"""
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
self.gem_alpha = kwargs['gem_alpha'] # Step size for GEM weights.
self.gem_beta = kwargs['gem_beta'] # Regularization parameter for GEM; not needed for a fixed target policy.
self.k = np.zeros(self.task.num_features) # Auxiliary weights for GEM.
self.u = np.zeros(self.task.num_features) # Main weights for GEM.
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'gem_alpha', 'gem_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
delta_bar = 1 + rho * self.gamma * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += self.gem_alpha * (delta_bar - np.dot(self.k, x_p)) * x_p
self.u += self.gem_alpha * ((x_p - self.gamma * rho * x) * np.dot(self.k, x_p) - self.gem_beta * self.u)
delta = self.get_delta(r, x, x_p)
m = np.dot(self.u, x) # Use parametric estimate of expected emphasis.
self.w += self.alpha * m * rho * delta * x
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x_p = self.task.stacked_feature_rep[:, :, s_p]
# GEM update:
gem_alpha_vec = self.task.get_active_policies(s) * self.gem_alpha
delta_bar = np.ones(self.task.num_policies) + rho * self.gamma_vec_t * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += gem_alpha_vec[:, None] * (delta_bar[:, None] - np.sum(x_p * self.k, 1)[:, None]) * stacked_x_p
self.u += gem_alpha_vec[:, None] * ((stacked_x_p - self.gamma_vec_t[:, None] * rho[:, None] * stacked_x) * np.sum(x_p * self.k, 1)[:, None] - self.gem_beta * self.u) # should self.gem_beta be a vector here?
# ETD(0) update:
m = np.dot(self.u, x)
self.w += (alpha_vec * m * rho * delta)[:, None] * stacked_x
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.k = np.zeros(self.task.num_features)
self.u = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
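As the docstring notes, GEM is essentially a GTD2(0) update with the roles of x and x_p swapped: it learns weights u such that u.x approximates the expected emphasis, and the ETD(0) step then scales its update by that estimate instead of an incrementally computed follow-on trace. A minimal single-transition sketch on hypothetical features, mirroring learn_single_policy above:
import numpy as np
x, x_p = np.array([1.0, 0.0]), np.array([0.0, 1.0])   # hypothetical features for s and s'
rho, gamma, gem_alpha, gem_beta = 1.0, 0.9, 0.1, 0.0
u, k = np.zeros(2), np.zeros(2)
delta_bar = 1 + rho * gamma * np.dot(u, x) - np.dot(u, x_p)  # "reversed" TD error driving the emphasis estimate
k += gem_alpha * (delta_bar - np.dot(k, x_p)) * x_p          # auxiliary weights
u += gem_alpha * ((x_p - gamma * rho * x) * np.dot(k, x_p) - gem_beta * u)
m = np.dot(u, x)                                             # parametric estimate of expected emphasis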
from Algorithms.BaseGradient import BaseGradient
import numpy as np
# noinspection DuplicatedCode
class GTD(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class GTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (np.dot(x, self.v) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class HTD(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.z_b = self.gamma * self.lmbda * self.z_b + x
self.w += alpha * ((delta * self.z) + (x - self.gamma * x_p) * np.dot((self.z - self.z_b), self.v))
self.v += alpha_v * ((delta * self.z) - (x - self.gamma * x_p) * np.dot(self.v, self.z_b))
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.z_b = self.lmbda * self.z_b * self.gamma_vec_t[:, None] + stacked_x
gamma_stacked_xp = self.gamma_vec_tp[:, None] * stacked_x_p
delta_z = delta[:, None] * self.z
self.w += alpha_vec[:, None] * (
delta_z + (stacked_x - gamma_stacked_xp) * (np.sum((self.z - self.z_b) * self.v, 1))[:, None])
self.v += alphav_vec[:, None] * (
delta_z - (stacked_x - gamma_stacked_xp) * np.sum(self.v * self.z_b, 1)[:, None])
# TODO: Should the last v be replaced by w?
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseLS import BaseLS
import numpy as np
class LSETD(BaseLS):
def __init__(self, task, **kwargs):
super(LSETD, self).__init__(task, **kwargs)
self.old_rho = 0
self.F = 1
self.beta = kwargs['beta']
if self.task.num_policies > 1:
self.F = np.ones(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda + (1 - self.lmbda) * self.F
x, _ = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
self.z = rho * (self.gamma * self.lmbda * self.z + x * m)
super(LSETD, self).learn_single_policy(s, s_p, r, is_terminal)
self.old_rho = rho
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
stacked_x = self.task.stacked_feature_rep[:, :, s]
rho = self.get_isr(s)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
super(LSETD, self).learn_multiple_policies(s, s_p, r, is_terminal)
self.old_rho = rho
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseLS import BaseLS
class LSTD(BaseLS):
def learn_single_policy(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s) * (self.gamma * self.lmbda * self.z + x)
super(LSTD, self).learn_single_policy(s, s_p, r, is_terminal)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s)[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + x)
super(LSTD, self).learn_multiple_policies(s, s_p, r, is_terminal)
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class PGTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
v_mid = self.v + alpha_v * (delta * self.z - np.dot(x, self.v) * x)
w_mid = self.w + alpha * (np.dot(x, self.v) * x - (1 - self.lmbda) * self.gamma * np.dot(self.z, self.v) * x_p)
delta_mid = r + self.gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
self.w += alpha * (np.dot(x, v_mid) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, v_mid) * x_p)
self.v += alpha_v * (delta_mid * self.z - np.dot(x, v_mid) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
v_mid = self.v + alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
w_mid = self.w + alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
delta_mid = self.r_vec + self.gamma_vec_tp * np.dot(w_mid, x_p) - np.dot(w_mid, x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * v_mid, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * v_mid, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta_mid[:, None] * self.z - np.sum(x * v_mid, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
class TB(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, _ = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = self.gamma * self.lmbda * self.old_pi * self.z + x
self.w = self.w + alpha * delta * self.z
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
self.z = (self.gamma_vec_t * self.lmbda * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseTD import BaseTD
class TD(BaseTD):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, *_ = super().learn_single_policy(s, s_p, r, is_terminal)
self.w += alpha * delta * self.z
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.w += (alpha_vec * delta)[:, None] * self.z
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
# noinspection DuplicatedCode
class TDRC(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.tdrc_beta = kwargs['tdrc_beta']
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta', 'tdrc_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) - alpha_v * self.tdrc_beta * self.v
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(
x * self.v, 1)[:, None] * stacked_x) - (alphav_vec * self.tdrc_beta)[:, None] * self.v
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
class Vtrace(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = min(self.old_rho, 1) * self.gamma * self.lmbda * self.z + x
self.w += alpha * delta * self.z
self.old_rho = pi / mu
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
truncated_old_rho = np.minimum(self.old_rho, np.ones(self.task.num_policies))
self.z = (truncated_old_rho * self.gamma_vec_t * self.lmbda)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
class Chain:
def __init__(self, states_number: int = 8, start_state_number: int = 4, **kwargs):
assert start_state_number < states_number, "start_state_number must be less than states_number"
self._states_number = states_number
self._start_state_number = start_state_number
self._terminal = self._states_number
self._state = None
self.RIGHT_ACTION = 0
self.RETREAT_ACTION = 1
self.num_states = states_number
self._window = None
def reset(self):
self._state = np.random.randint(0, self._start_state_number)
return self._state
def step(self, action):
if action == self.RETREAT_ACTION:
return self._terminal, 0, True, {}
next_state = self._state + 1
if next_state == self._terminal:
return self._terminal, 1, True, {}
self._state = next_state
return self._state, 0, False, {}
def render(self, mode='human'):
if mode == 'human':
import sys
from Environments.utils import colorize
corridor_map = [
str(i) if i > self._start_state_number
else colorize(str(i), "blue", highlight=False)
for i in range(self._states_number)
]
corridor_map.append(colorize("T", "red", highlight=False))
corridor_map[self._state] = colorize(corridor_map[self._state], "green", highlight=True)
sys.stdout.write(f'{"|".join(corridor_map)}\n')
if mode == "rgb" or mode == "screen":
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
img = np.zeros((self.num_states, 1, 3), dtype=np.uint8)
img[:, 0] = RGB_COLORS['grey']
img[:self._start_state_number - 1, 0] = RGB_COLORS['yellow']
img[self._terminal - 1, 0] = RGB_COLORS['black']
img[self._state - 1, 0] = RGB_COLORS['green']
img = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 50
if self._window is None:
self._window = Window(self.num_states * zoom, 1 * zoom)
dt = np.kron(img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(img, axis=0)
if __name__ == '__main__':
env = Chain()
env.reset()
for step in range(1, 1000):
action = np.random.randint(0, 2)
sp, r, terminal, _ = env.step(action=action)
env.render(mode="screen")
if terminal:
env.reset()
print('env reset')
import numpy as np
# from Environments.rendering import Render
# from gym import utils
# import gym
# import sys
BLOCK_NORMAL, BLOCK_WALL, BLOCK_HALLWAY, BLOCK_AGENT = 0, 1, 2, 3
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
four_room_map = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
class FourRoomGridWorld:
def __init__(self, stochasticity_fraction=0.0):
self._grid = np.transpose(np.flip(np.array(four_room_map, dtype=np.uint8), axis=0)[1:-1, 1:-1])
self._max_row, self._max_col = self._grid.shape
self._normal_tiles = np.where(self._grid == BLOCK_NORMAL)
self._hallways_tiles = np.where(self._grid == BLOCK_HALLWAY)
self._walls_tiles = np.where(self._grid == BLOCK_WALL)
self.num_states = self._grid.size
self._state = None
self.ACTION_UP, self.ACTION_DOWN, self.ACTION_RIGHT, self.ACTION_LEFT = 0, 1, 2, 3
self.num_actions = 4
self._stochasticity_fraction = stochasticity_fraction
self.hallways = {
0: (5, 1),
1: (1, 5),
2: (5, 8),
3: (8, 4)
}
self._window, self._info = None, None
def reset(self):
self._state = (0, 0)
return self.get_state_index(*self._state)
def step(self, action):
x, y = self._state
is_stochastic_selected = False
# if self._stochasticity_fraction >= np.random.uniform():
# action_probability = [1 / (self.num_actions - 1) if i != action else 0 for i in range(self.num_actions)]
# action = np.random.choice(self.num_actions, 1, p=action_probability)[0]
# is_stochastic_selected = True
x_p, y_p = self._next(action, *self._state)
is_done = self._grid[x_p, y_p] == BLOCK_HALLWAY
reward = 1 if is_done else 0
self._state = (x_p, y_p)
return self.get_state_index(*self._state), reward, False, {
'x': x, 'y': y,
'x_p': x_p, 'y_p': y_p,
'is_stochastic_selected': is_stochastic_selected,
'selected_action': action}
def get_xy(self, state):
return (state % self._max_row), (state // self._max_col)
def get_state_index(self, x, y):
return y * self._max_col + x
def _next(self, action, x, y):
def move(current_x, current_y, next_x, next_y):
if next_y < 0 or next_x < 0:
return current_x, current_y
if next_y >= self._max_col or next_x >= self._max_row:
return current_x, current_y
if self._grid[next_x, next_y] == BLOCK_WALL:
return current_x, current_y
return next_x, next_y
switcher = {
self.ACTION_DOWN: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y - 1),
self.ACTION_RIGHT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x + 1, pos_y),
self.ACTION_UP: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y + 1),
self.ACTION_LEFT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x - 1, pos_y),
}
move_func = switcher.get(action)
return move_func(x, y)
def render(self, mode='human'):
import sys
from Environments.utils import colorize
color = {
BLOCK_NORMAL: lambda c: colorize(c, "white", highlight=True),
BLOCK_WALL: lambda c: colorize(c, "gray", highlight=True),
BLOCK_HALLWAY: lambda c: colorize(c, "green", highlight=True),
}
if mode == 'human':
outfile = sys.stdout
img = [
[color[b](' ')
for x, b
in enumerate(line)]
for y, line in enumerate(four_room_map)]
img[self._max_row - self._state[1]][self._state[0] + 1] = colorize(' ', "red",
highlight=True)
for line in img:
outfile.write(f'{"".join(line)}\n')
outfile.write('\n')
if mode == "rgb" or mode == "screen":
x, y = self._state
img = np.zeros((*self._grid.shape, 3), dtype=np.uint8)
img[self._normal_tiles] = RGB_COLORS['light_grey']
# if render_cls is not None:
# assert render_cls is not type(Render), "render_cls should be Render class"
# img = render_cls.render(img)
img[self._walls_tiles] = RGB_COLORS['black']
img[self._hallways_tiles] = RGB_COLORS['green']
img[x, y] = RGB_COLORS['red']
ext_img = np.zeros((self._max_row + 2, self._max_col + 2, 3), dtype=np.uint8)
ext_img[1:-1, 1:-1] = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 20
if self._window is None:
self._window = Window((self._max_row + 2) * zoom, (self._max_col + 2) * zoom)
self._info = Label('Four Room Grid World', font_size=10, x=5, y=5)
# self._info.text = f'x: {x}, y: {y}'
dt = np.kron(ext_img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(ext_img, axis=0)
if __name__ == '__main__':
mode = 'human'
mode = 'screen'
env = FourRoomGridWorld()
env.reset()
for step in range(1, 100):
action = np.random.randint(0, 4)
sp, r, terminal, _ = env.step(action=action)
env.render(mode=mode)
if terminal:
env.reset()
print('env reset')
from abc import ABC, abstractmethod
import numpy as np
class Render(ABC):
@abstractmethod
def render(self, img):
raise NotImplementedError
class ErrorRender(Render):
def __init__(self, num_policies, num_steps):
self.num_steps = num_steps
self.num_policies = num_policies
self._error, self._max_error, self._valid_state = None, None, None
def render(self, img):
# self.color_policy(img, 0)
self.color_policy(img, 1)
# self.color_policy(img, 2)
self.color_policy(img, 3)
# self.color_policy(img, 4)
self.color_policy(img, 5)
# self.color_policy(img, 6)
self.color_policy(img, 7)
return img
def add_error(self, error):
if self._max_error is None:
self._max_error = np.abs(error).reshape(8, 11, 11)
self._valid_state = np.array(self._max_error)
self._valid_state[self._valid_state != 0] = 1
self._error = np.abs(error).reshape(8, 11, 11)
def color_policy(self, img, policy_number):
e = self._error[policy_number]
x = self._max_error[policy_number]
d = np.clip((230 * e / x), 10, 255)
d = d * self._valid_state[policy_number]
d = np.nan_to_num(d).astype(np.uint8).T
d = np.repeat(d, 3).reshape(11, 11, 3)
d[:, :, 2] = 230
c = np.where(self._valid_state[policy_number].T == 1)
img[c] = d[c]
return img
"""A set of common utilities used within the environments. These are
not intended as API functions, and will not remain stable over time.
"""
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""Return string surrounded by appropriate terminal color codes to
print colorized text. Valid colors: gray, red, green, yellow,
blue, magenta, cyan, white, crimson
"""
attr = []
num = color2num[color]
if highlight:
num += 10
attr.append(str(num))
if bold:
attr.append('1')
attrs = ';'.join(attr)
return '\x1b[%sm%s\x1b[0m' % (attrs, string)
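A quick usage sketch for colorize (standard ANSI escape behavior; the rendered colors depend on the terminal):
# Wraps the string in ANSI codes: green foreground, or green background when highlight=True.
print(colorize('agent', 'green'))
print(colorize('agent', 'green', bold=True, highlight=True))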
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
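The job scripts further below sweep meta_parameters as a full Cartesian product, so this ABTD config expands to 19 alpha values x 10 zeta values = 190 parameter settings, each repeated for number_of_runs runs. A quick sketch for checking a config's sweep size (the file path here is hypothetical):
import json
from functools import reduce
with open('Experiments/ABTD.json') as f:  # hypothetical path to the config above
    cfg = json.load(f)
sizes = [len(v) for v in cfg['meta_parameters'].values()]
print(reduce(lambda a, b: a * b, sizes, 1))  # 190 combinations for this file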
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
This directory contains all the exports_<algorithm>.dat files created when submitting jobs on Cedar.
\ No newline at end of file
#!/bin/bash
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
rm -f exports_${algorithm}.dat
for A in ${alpha[@]}; do
for L in ${lmbda[@]}; do
for E in ${eta[@]}; do
for B in ${beta[@]}; do
for Z in ${zeta[@]}; do
for T in ${tdrc_beta[@]}; do
for GA in ${gem_alpha[@]}; do
for GB in ${gem_beta[@]}; do
echo export SAVE_PATH=${save_path} ENVIRONMENT=${environment} ALGORITHM=${algorithm} \
TASK=${task} ALPHA=${A} LMBDA=${L} ETA=${E} BETA=${B} ZETA=${Z} TDRCBETA=${T} GEMALPHA=${GA} \
GEMBETA=${GB} NUMOFRUNS=${num_of_runs} NUMSTEPS=${num_steps} SUBSAMPLE=${sub_sample} \
>>exports_${algorithm}.dat
done
done
done
done
done
done
done
done
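# Note: each iteration of the nested loops above appends one export line to exports_${algorithm}.dat,
# which the Cedar array job later executes verbatim. An illustrative line (all values hypothetical):
# export SAVE_PATH=Results/FirstChain ENVIRONMENT=Chain ALGORITHM=TD TASK=EightStateOffPolicyRandomFeat ALPHA=0.25 LMBDA=0.1 ETA=1.0 BETA=0.0 ZETA=0.9 TDRCBETA=1.0 GEMALPHA=0.1 GEMBETA=0.1 NUMOFRUNS=50 NUMSTEPS=20000 SUBSAMPLE=1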
import os
import json
import numpy as np
from utils import ImmutableDict
import time
default_params = ImmutableDict(
{
'agent': 'GEMETD',
'task': 'EightStateCollision',
'environment': 'Chain',
'exp': 'FirstChain',
# 'agent': 'HTD',
# 'task': 'LearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': 'FirstFourRoom',
# 'agent': 'LSTD',
# 'task': 'HighVarianceLearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': '1HVFourRoom',
'save_value_function': True,
'sub_sample': 1,
'num_of_runs': 3,
'num_steps': 20_000,
'meta_parameters': {
'alpha': 0.001953125,
'eta': 16.0,
'beta': 0.9,
'zeta': 0.9,
'lmbda': 0.0,
'tdrc_beta': 1.0,
'gem_alpha': 0.1,
'gem_beta': 0.1
}
}
)
class JobBuilder:
def __init__(self, json_path, server_name):
self._path = json_path
self.server_name = server_name
with open(self._path) as f:
self._params = json.load(f)
self._batch_params = ImmutableDict(
{
'ALPHA': ' '.join([f'{num:.10f}' for num in self.alpha]),
'LMBDA': ' '.join([f'{num:.5f}' for num in self.lmbda]),
'ETA': ' '.join([f'{num:.10f}' for num in self.eta]),
'BETA': ' '.join([f'{num:.5f}' for num in self.beta]),
'ZETA': ' '.join([f'{num:.5f}' for num in self.zeta]),
'TDRCBETA': ' '.join([f'{num:.5f}' for num in self.tdrc_beta]),
'GEMALPHA': ' '.join([f'{num:.5f}' for num in self.gem_alpha]),
'GEMBETA': ' '.join([f'{num:.5f}' for num in self.gem_beta]),
'NUMOFRUNS': f'{self.num_of_runs}',
'NUMSTEPS': f'{self.num_steps}',
'SUBSAMPLE': f'{self.sub_sample}',
'ALGORITHM': self.agent,
'TASK': self.task,
'ENVIRONMENT': self.environment,
'SAVEPATH': self.save_path
})
@property
def tdrc_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('tdrc_beta', [default_params['meta_parameters']['tdrc_beta']]))
@property
def gem_alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_alpha', [default_params['meta_parameters']['gem_alpha']]))
@property
def gem_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_beta', [default_params['meta_parameters']['gem_beta']]))
@property
def alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('alpha', [default_params['meta_parameters']['alpha']]))
@property
def lmbda(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('lmbda', [default_params['meta_parameters']['lmbda']]))
@property
def eta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('eta', [default_params['meta_parameters']['eta']]))
@property
def beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('beta', [default_params['meta_parameters']['beta']]))
@property
def zeta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('zeta', [default_params['meta_parameters']['zeta']]))
@property
def agent(self):
return self._params.get('agent', default_params['agent'])
@property
def task(self):
return self._params.get('task', default_params['task'])
@property
def num_of_runs(self):
return np.asarray(self._params.get('number_of_runs', default_params['num_of_runs']))
@property
def num_steps(self):
return np.asarray(self._params.get('number_of_steps', default_params['num_steps']))
@property
def sub_sample(self):
return np.asarray(self._params.get('sub_sample', default_params['sub_sample']))
@property
def environment(self):
return self._params.get('environment', default_params['environment'])
@property
def save_path(self):
return os.path.dirname(self._path).replace("/Experiments/", "/Results/")
def create_dat_file(self):
with open('Job/Cedar_Create_Config_Template.sh', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
def to_shell(self):
if self.server_name.upper() == 'NODE':
with open('Job/SubmitJobsTemplates.SL', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
elif self.server_name.upper() == 'CPU':
with open('Job/SubmitJobsTemplatesCedar.SL', 'r') as f:
text = f.read()
alg = self._batch_params['ALGORITHM']
num_of_jobs = sum(1 for _ in open(f'exports_{alg}.dat'))
text = text.replace('__ALG__', self._batch_params['ALGORITHM'])
text = text.replace('__NUM_OF_JOBS__', str(num_of_jobs))
text = text.replace('__NAME_OF_EXP__', f'{self._batch_params["TASK"]}_{self._batch_params["ALGORITHM"]}')
return text
def run_batch(self):
if self.server_name.upper() == 'NODE':
print('Submitting the ' + self.agent + ' algorithm jobs on nodes...')
elif self.server_name.upper() == 'CPU':
print('Submitting the ' + self.agent + ' algorithm jobs on individual cpus...')
with open('Create_Configs.sh', 'wt') as f:
f.write(self.create_dat_file())
time.sleep(1)
os.system('bash Create_Configs.sh')
with open('Submit_Jobs.SL', 'wt') as f:
f.write(self.to_shell())
time.sleep(1)
os.system('sbatch Submit_Jobs.SL')
time.sleep(1)
os.remove('Submit_Jobs.SL')
if self.server_name.upper() == 'CPU':
os.remove('Create_Configs.sh')
# alg = self._batch_params['ALGORITHM']
# os.remove(f'exports_{alg}.dat')
def __call__(self):
return self.run_batch()
#!/bin/bash
# SLURM submission script for submitting multiple serial jobs on Niagara
#
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=40
#SBATCH --job-name __TASK_____ALGORITHM__
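# The __PLACEHOLDER__ tokens below are substituted by the job builder (create_dat_file/to_shell) before submission.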
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
source ~/RLENV/bin/activate
module load NiaEnv/2019b
module load gnu-parallel
module load python
cd $SLURM_SUBMIT_DIR || exit
export OMP_NUM_THREADS=1
echo "The number of available cores is echo $NCORES"
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo "Starting run at: $(date)"
HOSTS=$(scontrol show hostnames $SLURM_NODELIST | tr '\n' ,)
NCORES=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
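# Launch one Learning.py process per combination of the meta-parameter arrays, spread over the NCORES workers on the allocated hosts.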
parallel --env OMP_NUM_THREADS,PATH,LD_LIBRARY_PATH --joblog slurm-$SLURM_JOBID.log -j $NCORES -S $HOSTS --wd $PWD \
python Learning.py ::: -sp ::: ${save_path} ::: -e ::: ${environment} ::: -alg ::: ${algorithm} ::: -t ::: ${task[@]} \
::: -a ::: ${alpha[@]} ::: -nr ::: ${num_of_runs} ::: -ns ::: ${num_steps} ::: -et ::: ${eta[@]} \
::: -l ::: ${lmbda[@]} ::: -z ::: ${zeta[@]} ::: -tb ::: ${tdrc_beta[@]} ::: -b ::: ${beta[@]} ::: \
-ga ::: ${gem_alpha[@]} ::: -gb ::: ${gem_beta[@]} ::: -ss ::: ${sub_sample}
echo "Program test finished with exit code $? at: $(date)"
#!/bin/bash
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --cpus-per-task=1
#SBATCH --mem=3G
#SBATCH --array=1-__NUM_OF_JOBS__
#SBATCH --job-name __NAME_OF_EXP__
alg=__ALG__
source ~/RLENV/bin/activate
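# Each array task reads its own line of exports_<alg>.dat and executes it to set the meta-parameter environment variables used below.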
`sed -n "${SLURM_ARRAY_TASK_ID}p" <exports_${alg}.dat`
echo ${SLURM_ARRAY_TASK_ID} $ALPHA $LMBDA $ETA $BETA $ZETA $TDRCBETA $GEMALPHA $GEMBETA $NUMOFRUNS $NUMSTEPS $SUBSAMPLE
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo
echo "Starting run at: $(date)"
python Learning.py \
-a $ALPHA -l $LMBDA -et $ETA -b $BETA -z $ZETA -tb $TDRCBETA -ga $GEMALPHA -gb $GEMBETA -alg $ALGORITHM -t $TASK \
-nr $NUMOFRUNS -e $ENVIRONMENT -sp $SAVE_PATH -ns $NUMSTEPS -ss $SUBSAMPLE
echo "Program test finished with exit code $? at: $(date)"
import os
import numpy as np
import argparse
from data_presister import DataPersister, ParameterBuilder
from utils import save_result, Configuration, save_value_function, get_save_value_function_steps
from Registry.AlgRegistry import alg_dict
from Registry.EnvRegistry import environment_dict
from Registry.TaskRegistry import task_dict
from Job.JobBuilder import default_params
from Environments.rendering import ErrorRender
def learn(config: Configuration):
params = ParameterBuilder().add_algorithm_params(config).build()
if not os.path.exists(config.save_path):
os.makedirs(config.save_path, exist_ok=True)
env = environment_dict[config.environment]()
rmsve = np.zeros((task_dict[config.task].num_of_policies(), config.num_steps, config.num_of_runs))
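# rmsve has shape (num_policies, num_steps, num_runs); each run fills one slice along the last axis.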
for run in range(config.num_of_runs):
random_seed = (run + config.num_of_runs) if config.rerun else run
np.random.seed(random_seed)
task = task_dict[config.task](run_number=run, num_steps=config.num_steps)
agent = alg_dict[config.algorithm](task, **params)
rmsve_of_run = np.zeros((task.num_policies, task.num_steps))
agent.state = env.reset()
error_render = ErrorRender(task.num_policies, task.num_steps)
for step in range(task.num_steps):
rmsve_of_run[:, step], error = agent.compute_rmsve()
if config.render:
error_render.add_error(error)
agent.action = agent.choose_behavior_action()
agent.next_state, r, is_terminal, info = env.step(agent.action)
agent.learn(agent.state, agent.next_state, r, is_terminal)
if config.render:
env.render(mode='screen', render_cls=error_render)
if config.save_value_function and (step in get_save_value_function_steps(task.num_steps)):
save_value_function(agent.compute_value_function(), config.save_path, step, run)
if is_terminal:
agent.state = env.reset()
agent.reset()
continue
agent.state = agent.next_state
print(np.mean(rmsve_of_run, axis=0))
rmsve[:, :, run] = rmsve_of_run
rmsve_of_runs = np.transpose(np.mean(rmsve, axis=0)) # Average over all policies.
# _RMSVE_mean_over_runs
DataPersister.save_result(np.mean(rmsve_of_runs, axis=0), '_RMSVE_mean_over_runs', config)
save_result(config.save_path, '_RMSVE_mean_over_runs', np.mean(rmsve_of_runs, axis=0), params, config.rerun)
# _RMSVE_stderr_over_runs
DataPersister.save_result(np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), '_RMSVE_stderr_over_runs', config)
save_result(config.save_path, '_RMSVE_stderr_over_runs',
np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), params, config.rerun)
# _mean_stderr_final
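# Final performance: RMSVE averaged over roughly the last 1% of steps of each run.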
final_errors_mean_over_steps = np.mean(rmsve_of_runs[:, config.num_steps - int(0.01 * config.num_steps) - 1:],
axis=1)
DataPersister.save_result(np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), '_mean_stderr_final', config)
save_result(config.save_path, '_mean_stderr_final',
np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), params, config.rerun)
# _mean_stderr_auc
auc_mean_over_steps = np.mean(rmsve_of_runs, axis=1)
DataPersister.save_result(np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), '_mean_stderr_auc', config)
save_result(config.save_path, '_mean_stderr_auc',
np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), params, config.rerun)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', '-a', type=float, default=default_params['meta_parameters']['alpha'])
parser.add_argument('--lmbda', '-l', type=float, default=default_params['meta_parameters']['lmbda'])
parser.add_argument('--eta', '-et', type=float, default=default_params['meta_parameters']['eta'])
parser.add_argument('--beta', '-b', type=float, default=default_params['meta_parameters']['beta'])
parser.add_argument('--zeta', '-z', type=float, default=default_params['meta_parameters']['zeta'])
parser.add_argument('--tdrc_beta', '-tb', type=float, default=default_params['meta_parameters']['tdrc_beta'])
parser.add_argument('--gem_alpha', '-ga', type=float, default=default_params['meta_parameters']['gem_alpha'])
parser.add_argument('--gem_beta', '-gb', type=float, default=default_params['meta_parameters']['gem_beta'])
parser.add_argument('--algorithm', '-alg', type=str, default=default_params['agent'])
parser.add_argument('--task', '-t', type=str, default=default_params['task'])
parser.add_argument('--num_of_runs', '-nr', type=int, default=default_params['num_of_runs'])
parser.add_argument('--num_steps', '-ns', type=int, default=default_params['num_steps'])
parser.add_argument('--sub_sample', '-ss', type=int, default=default_params['sub_sample'])
parser.add_argument('--environment', '-e', type=str, default=default_params['environment'])
parser.add_argument('--save_path', '-sp', type=str, default='-')
parser.add_argument('--rerun', '-rrn', type=bool, default=False)
parser.add_argument('--render', '-rndr', type=bool, default=False)
parser.add_argument('--save_value_function', '-svf', type=bool, default=default_params['save_value_function'])
args = parser.parse_args()
if args.save_path == '-':
args.save_path = os.path.join(os.getcwd(), 'Results', default_params['exp'], args.algorithm)
learn(config=Configuration(vars(args)))
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXP_ATTRS, AUC_AND_FINAL
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
plot_alpha = 1.0
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
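# Load one performance value (and its standard error) per step size in the alpha sweep, replacing NaN/inf and diverged values.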
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global plot_alpha
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
if alg == 'ETD':
color = 'red'
elif alg == 'ETDLB':
color = 'grey'
plot_alpha -= 0.1
else:
color = 'black'
ax.plot(alphas, performance, label=lbl, linestyle='-', marker='o',
linewidth=2, markersize=5, color=color, alpha=plot_alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=plot_alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def plot_all_sensitivities_per_alg_emphatics(**kwargs):
global plot_alpha
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
plot_alpha = 1.0
alg = 'ETD'
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
current_params = make_current_params(alg, sp, 0, 0)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, 0, performance, stderr, exp_attrs)
alg = 'ETDLB'
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf',
'orange', '#8c564b', '#e377c2', '#2ca02c',
'#bcbd22', '#d62728']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_extra_alg_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter - 1]
alpha = 1.0
if alg == 'TDRC':
color = ALG_COLORS[alg]
alpha = 1.0
linestyle = '--'
# if alg == 'GTD2':
# linestyle = '-'
# alpha=1.0
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=3, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=3, markersize=5,
color=color, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_all_sensitivities_per_alg_gradients(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
if COUNTER % 2 == 0:
COUNTER += 1
continue
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD2':
extra_alg = 'GTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'PGTD2':
extra_alg = 'GTD2'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD':
extra_alg = 'HTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'HTD':
extra_alg = 'TDRC'
current_params['eta'] = 1.0
current_params['tdrc_beta'] = 1.0
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf', 'orange', '#8c564b', '#e377c2', '#2ca02c','#bcbd22',
'#d62728', 'black', 'cyan']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_all_sensitivities_per_alg_gradients_all_eta(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
if alg == 'TDRC':
_, _, tp_list, _, _ = make_params('GTD', exp)
fop_list = kwargs['tdrc_beta']
for tp in tp_list:
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if alg == 'TDRC':
fig.savefig(
os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta_beta_{kwargs['tdrc_beta']}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
else:
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict, make_current_params, make_params, load_and_replace_large_nan_inf
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = is_smoothed
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
print(alg)
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']) + r' $\lambda=$ ' +
str(best_params.get('lmbda', best_params.get('zeta', 0))))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def find_best_perf(alg, exp, auc_or_final):
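# Search over all second/third/fourth parameter settings and return the combination (including the best alpha) with the lowest AUC or final error.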
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
for sp in sp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
def plot_learning_curve_best_overall_params(**kwargs):
is_smoothed = True if 'is_smoothed' in kwargs else False
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
# ls_rmsve = get_ls_rmsve(alg, exp, sp)
# plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = find_best_perf(alg, exp, auc_or_final)
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}AllLmbda.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import numpy as np
import matplotlib.pyplot as plt
def load_d_mu(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'd_mu.npy'))
def load_state_values(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'state_values.npy'))
def plot_d_mu(ax, d_mu, active_states):
ax.plot(d_mu, linewidth=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
x_labels = list(active_states)
x_ticks = [x for x in range(len(x_labels))]
ax.xaxis.set_ticks(x_ticks)
ax.set_xticklabels(x_labels)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.yaxis.set_ticks([0, 0.005, 0.01, 0.015, 0.02, 0.025])
ax.set_ylim([0.00, 0.025])
ax.set_yticklabels([])
# ax.set_xticklabels([])
def find_active_states(task, d_mu, state_values, policy_no=0):
if task == 'EightStateCollision':
return [x for x in range(d_mu.shape[0])]
return np.where(state_values[policy_no] > 0)[0]
def get_active_d_mu(task, d_mu, active_states, policy_no=0):
if task == 'EightStateCollision':
return d_mu
return d_mu[active_states, policy_no].squeeze()
def plot_distribution(**kwargs):
task = kwargs['task']
d_mu = load_d_mu(task)
state_values = load_state_values(task)
for policy_no in range(state_values.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task, d_mu, state_values, policy_no)
active_d_mu = get_active_d_mu(task, d_mu, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
plt.show()
if task == 'EightStateCollision':
break
def plot_dist_for_two_four_room_tasks(**kwargs):
task1 = 'LearnEightPoliciesTileCodingFeat'
task2 = 'HighVarianceLearnEightPoliciesTileCodingFeat'
save_dir = os.path.join('pdf_plots', 'Misc', 'CompareDistsFR')
d_mu1 = load_d_mu(task1)
d_mu2 = load_d_mu(task2)
state_values1 = load_state_values(task1)
state_values2 = load_state_values(task2)
for policy_no in range(state_values1.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task1, d_mu1, state_values1, policy_no)
active_d_mu = get_active_d_mu(task1, d_mu1, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
active_states = find_active_states(task2, d_mu2, state_values2, policy_no)
active_d_mu = get_active_d_mu(task2, d_mu2, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"dist_policy_{policy_no}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = is_smoothed
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def plot_learning_curve(**kwargs):
is_smoothed = True if 'is_smoothed' in kwargs else False
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import matplotlib.pyplot as plt
import numpy as np
import pylab
from Plotting.plot_params import ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import make_params, get_alphas, make_current_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_specific_params_dict(alg, exp, sp, tp):
if alg == 'TD':
return {'alpha': 0.25, 'lmbda': sp}
if alg == 'ETD':
return {'alpha': 0.00390625, 'lmbda': sp}
if alg == 'ETDLB':
return {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}
if alg == 'TDRC':
return {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}
if alg == 'GTD':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
if alg == 'PGTD2':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_all_learning_curves_for_third(**kwargs):
for exp in kwargs['exps']:
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'all_third_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in kwargs['algs']:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
for tp in kwargs['tp_list']:
for fp in get_alphas(alg, exp):
for fop in [1.0]:
current_params = make_current_params(alg, sp, tp, fop, fp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(kwargs['algs'])}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, RERUN_POSTFIX, \
PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
# noinspection DuplicatedCode
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
# noinspection DuplicatedCode
def plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
color = 'blue' if sp else 'red'
lbl = (alg + r' $\lambda=$ ' + str(sp))
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.tick_params(axis='x', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
# noinspection DuplicatedCode
def plot_learning_curve_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for alg_names in kwargs['alg_groups'].values():
for alg in alg_names:
if alg in ['LSETD', 'LSTD']:
continue
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
save_dir = os.path.join('pdf_plots', 'learning_curves_for_lambdas', auc_or_final)
for sp in kwargs['sp_list']:
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
print(alg, current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{alg}{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
# plt.show()
plt.close(fig)
from Plotting.plot_utils import FirstChainAttr, FirstFourRoomAttr, HVFirstFourRoomAttr
from Registry.AlgRegistry import alg_dict
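# Plotting switches: PLOT_RERUN loads results saved with the RERUN_POSTFIX suffix; PLOT_RERUN_AND_ORIG overlays rerun and original curves.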
PLOT_RERUN = True
PLOT_RERUN_AND_ORIG = False
if PLOT_RERUN and PLOT_RERUN_AND_ORIG:
PLOT_RERUN_AND_ORIG = False
RERUN_POSTFIX = '_rerun'
DEBUG_MODE = True
# noinspection SpellCheckingInspection
COLORS = ['#000000', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22",
"#17becf"]
ALG_COLORS = {alg_name: color for alg_name, color in zip(alg_dict.keys(), COLORS)}
ALG_COLORS['LSTD'] = ALG_COLORS['TD']
ALG_COLORS['LSETD'] = ALG_COLORS['ETD']
ALG_GROUPS = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD']}
EXPS = ['1HVFourRoom', 'FirstFourRoom', 'FirstChain']
ALGS = [key for key in alg_dict.keys()]
ALGS.remove('LSTD')
ALGS.remove('LSETD')
# ALGS.remove('TDRC')
ALL_ALGS = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', 'LSTD', 'LSETD']
# ALL_ALGS = ['TD', 'Vtrace', 'TB', 'ABTD']
LMBDA_AND_ZETA = [0.0, 0.9]
AUC_AND_FINAL = ['auc', 'final']
EXP_ATTRS = {'FirstChain': FirstChainAttr, 'FirstFourRoom': FirstFourRoomAttr, '1HVFourRoom': HVFirstFourRoomAttr}
if DEBUG_MODE:
EXPS = ['FirstFourRoom', '1HVFourRoom']
# ALGS = ['GTD']
# ALL_ALGS.remove('ETDLB')
# ALL_ALGS.remove('LSTD')
# ALL_ALGS.remove('LSETD')
# LMBDA_AND_ZETA = [0.9]
AUC_AND_FINAL = ['final']
# ALG_GROUPS = {'main_algs': ALL_ALGS}
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
PLOT_RERUN_AND_ORIG, RERUN_POSTFIX
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = ALG_COLORS[alg]
if alg == 'TD':
color = 'grey'
alpha=0.7
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_sensitivity_curve(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN, PLOT_RERUN_AND_ORIG, RERUN_POSTFIX, ALGS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = 'blue' if sp else 'red'
if sp not in [0.0, 1.0]:
alpha = 0.3
color = 'grey'
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_min(ax, min_performance):
print(min_performance)
ax.plot([pow(2, -3), pow(2, -2)], [min_performance, min_performance], linewidth=0.2, alpha=0.2)
# ax.axhline(y=min_performance, xmin=pow(2, -3), xmax=pow(2, -2))
def plot_sensitivity_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves_for_lambdas', exp, auc_or_final)
for alg in kwargs['algs']:
min_performance = 1_000
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for sp in kwargs['sp_list']:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, True)
if min(best_performance) < min_performance:
min_performance = min(best_performance)
if kwargs.get('plot_min_performance', False):
plot_min(ax, min_performance)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{alg}{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX, ALGS, ALL_ALGS
from Plotting.plot_utils import load_best_rerun_params_dict, make_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, flag=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
if alg == 'TDRC':
alpha = 0.6
if flag:
color = 'green'
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_specific_learning_curves(**kwargs):
specific_params = kwargs['specific_params']
exp = kwargs['exp']
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in AUC_AND_FINAL:
sp = kwargs['sp']
save_dir = os.path.join('pdf_plots', 'specific_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=(10, 4))
for alg in kwargs['algs']:
flag = False
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
print(alg, exp, sp)
if alg == 'PGTD22':
flag = True
alg = 'PGTD2'
current_params = specific_params[alg]
current_params['eta'] = 1.0
current_params['alpha'] = 0.03125
else:
current_params = specific_params[alg]
print(current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, False, flag)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(ALGS)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import argparse
import json
import numpy as np
import os
from Job.JobBuilder import default_params
from Registry.AlgRegistry import alg_dict
from utils import create_name_for_save_load
def make_res_path(alg, exp):
return os.path.join(os.getcwd(), 'Results', exp, alg)
def make_exp_path(alg, exp):
return os.path.join(os.getcwd(), 'Experiments', exp, alg)
def load_best_rerun_params_dict(alg, exp, auc_or_final, sp):
res_path = make_res_path(alg, exp)
with open(os.path.join(res_path, f"{auc_or_final}_{sp}.json")) as f:
return json.load(f)['meta_parameters']
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def load_best_rerun_params(alg, exp, auc_or_final, sp):
best_res_dict = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
best_fp = best_res_dict.get('alpha', 0)
best_tp = best_res_dict.get('eta', best_res_dict.get('beta', 0))
best_fop = best_res_dict.get('tdrc_beta', 0)
return best_fp, best_tp, best_fop
def make_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', '-n', type=str, default='1HVFourRoom')
# 1HVFourRoom or FirstFourRoom or FirstChain
return parser.parse_args()
def rename_best_old_result(res_path, params_dict, file_name):
name_to_save = create_name_for_save_load(param_dict=params_dict)
path_and_name = os.path.join(res_path, name_to_save)
file_name = path_and_name + file_name
os.rename(file_name + '.npy', file_name + '_old.npy')
def load_best_perf_json(alg, exp, sp, auc_or_final):
res_path = make_res_path(alg, exp)
res_path = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(res_path, 'r') as f:
return json.load(f)
def load_exp_json_file(alg, exp):
res_path = make_res_path(alg, exp)
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
with open(exp_path) as f:
return json.load(f), res_path
def make_params(alg_name, exp_name):
params = dict()
alg_param_names = alg_dict[alg_name].related_parameters()
json_content, res_path = load_exp_json_file(alg_name, exp_name)
json_exp_params = json_content.get('meta_parameters')
for param in alg_param_names:
params[param] = json_exp_params.get(param, default_params['meta_parameters'][param])
if not isinstance(params[param], list):
params[param] = list([params[param]])
fp_list = params['alpha']
tp_list = [0.0]
fop_list = [0.0]
if 'lmbda' in params:
sp_list = params['lmbda']
else:
sp_list = params['zeta']
if 'eta' in params:
tp_list = params['eta']
elif 'beta' in params:
tp_list = params['beta']
if 'tdrc_beta' in params:
fop_list = params['tdrc_beta']
if alg_name == 'TDRC':
tp_list, fop_list = [1.0], [1.0]
return fp_list, sp_list, tp_list, fop_list, res_path
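# Hedged example of make_params (the json contents below are hypothetical):
# given {'meta_parameters': {'alpha': [0.25, 0.5], 'lmbda': [0.9], 'eta': [1.0]}}
# for alg_name='GTD', this returns roughly
#     fp_list  = [0.25, 0.5]   # step sizes alpha
#     sp_list  = [0.9]         # lmbda (zeta for algorithms that use zeta)
#     tp_list  = [1.0]         # eta (beta for ETDLB)
#     fop_list = [0.0]         # tdrc_beta, only meaningful for TDRC
# plus the Results path for that algorithm and experiment.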
def make_current_params(alg_name, sp, tp, fop, fp=0):
current_params = {'alpha': fp}
alg_param_names = alg_dict[alg_name].related_parameters()
if 'lmbda' in alg_param_names:
current_params['lmbda'] = sp
else:
current_params['zeta'] = sp
if 'eta' in alg_param_names:
current_params['eta'] = tp
elif 'beta' in alg_param_names:
current_params['beta'] = tp
if 'tdrc_beta' in alg_param_names:
current_params['tdrc_beta'] = fop
return current_params
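# Hedged example: make_current_params('GTD', sp=0.9, tp=1.0, fop=0.0) gives
# {'alpha': 0, 'lmbda': 0.9, 'eta': 1.0}. alpha defaults to 0 here because the
# callers that sweep over alpha exclude it from the file name (see the
# '_mean_..._over_alpha' loads elsewhere).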
def get_alg_names(exp_name):
path = os.path.join(os.getcwd(), 'Experiments', exp_name)
alg_names = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
return alg_names
def load_sample_json_for_exp(exp):
alg = get_alg_names(exp)[0]
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
if not os.path.exists(exp_path):
print('No algorithms exist in the experiment directory...')
raise FileNotFoundError(exp_path)
with open(exp_path) as f:
json_exp_params = json.load(f)
return json_exp_params
def load_and_replace_large_nan_inf(load_file_name, large, replace_with):
current_perf = np.load(load_file_name)
return replace_large_nan_inf(current_perf, large=large, replace_with=replace_with)
class FirstChainAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 5000, 10000, 15000, 20000]
self.x_tick_labels = [0, '5', '10', '15', '20']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.68910
self.ok_error = 0.4
class FirstFourRoomAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 10000, 20000, 30000, 40000, 50000]
self.x_tick_labels = [0, '10', '20', '30', '40', '50']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.72672
self.ok_error = 0.4
class HVFirstFourRoomAttr(FirstFourRoomAttr):
def __init__(self, exp_name):
super(HVFirstFourRoomAttr, self).__init__(exp_name)
def replace_large_nan_inf(arr, large=1.0, replace_with=2.0):
arr[np.isnan(arr)] = replace_with
arr[np.isinf(arr)] = replace_with
arr[arr > large] = replace_with
return arr
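# Minimal behaviour check (assumed inputs):
#     a = np.array([0.2, np.nan, np.inf, 1.5])
#     replace_large_nan_inf(a, large=1.0, replace_with=2.0)
#     # -> array([0.2, 2. , 2. , 2. ])
# NaNs, infinities, and entries above `large` are all overwritten in place with
# `replace_with`.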
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
RERUN_POSTFIX
from Plotting.plot_utils import make_current_params, replace_large_nan_inf, make_params
from utils import create_name_for_save_load
np.random.seed(0)
def load_all_performances(alg, exp, auc_or_final, sp, exp_attrs):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
all_performance = np.zeros((len(fp_list), len(tp_list), len(fop_list)))
for i, fop in enumerate(fop_list):
for j, tp in enumerate(tp_list):
current_params = make_current_params(alg, sp, tp, fop)
load_file_name = os.path.join(res_path, create_name_for_save_load(
current_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
if PLOT_RERUN and auc_or_final == 'auc':
load_file_name_rerun = load_file_name.replace('.npy', f"{RERUN_POSTFIX}.npy")
if os.path.isfile(load_file_name_rerun):
load_file_name = load_file_name_rerun
performance = np.load(load_file_name)
performance = replace_large_nan_inf(performance, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_waterfall)
all_performance[:, j, i] = performance
return all_performance
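# Shape note (inferred from the loops above): all_performance is
# (len(fp_list), len(tp_list), len(fop_list)); each loaded '_over_alpha' file
# contributes one value per alpha, written into the [:, j, i] slice for the
# current (tp, fop) pair, with divergent values already replaced by the
# waterfall ceiling.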
def plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs):
global ticker, x_axis_names, x_axis_ticks
performance_to_plot = np.array(all_performance.flatten())
percentage_overflowed = round((performance_to_plot > exp_attrs.learning_starting_point).sum() /
performance_to_plot.size, 2)
ok_percentage = round((performance_to_plot < exp_attrs.ok_error).sum() /
performance_to_plot.size, 2)
print(alg, 'percentage_overflowed', percentage_overflowed)
# print(alg, 'OK_percentage', ok_percentage)
color = ALG_COLORS[alg]
ax.scatter([(ticker + 1)] * performance_to_plot.shape[0] + np.random.uniform(
-0.25, 0.25, performance_to_plot.shape[0]), performance_to_plot, marker='o',
facecolors='none', color=color)
x_axis_ticks.append(ticker + 1)
ticker = (ticker + 1) % len(alg_names)
ax.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=True)  # labels along the bottom edge stay on
x_axis_names.append(f'{alg}_{percentage_overflowed}')
ax.xaxis.set_ticks(x_axis_ticks)
ax.set_xticklabels(x_axis_names)
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ticker, x_axis_names, x_axis_ticks = 0.0, [''], [0]
def plot_waterfall_scatter(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'waterfalls', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
global ticker, x_axis_names, x_axis_ticks
ticker, x_axis_names, x_axis_ticks = -0.5, [''], [0]
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
all_performance = load_all_performances(alg, exp, auc_or_final, sp, exp_attrs)
plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_waterfall_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import numpy as np
import matplotlib.pyplot as plt
class ValueFunctionProcessor:
def __init__(self, exp, alg):
result_dir = os.path.join(os.getcwd(), 'Results', exp, alg, 'Sample_value_function')
self.all_value_functions = dict()
self.all_value_functions_of_last_step = dict()
for value_function_name in os.listdir(result_dir):
value_function = np.load(os.path.join(result_dir, value_function_name))
step, run_num = (int(i) for i in value_function_name.replace('.npy', '').split('_'))
self.all_value_functions[(step, run_num)] = value_function
if (step == 19999 and exp == 'FirstChain') or (step == 49999 and exp == 'FirstFourRoom') or (
step == 49999 and exp == '1HVFourRoom'):
self.all_value_functions_of_last_step[run_num] = value_function
def get_value_function_by_step_and_run(self, step, run):
return self.all_value_functions[(step, run)]
def get_value_function_for_last_step(self, run):
return self.all_value_functions_of_last_step[run]
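# Hedged usage sketch (file layout as parsed in __init__):
#     processor = ValueFunctionProcessor('FirstChain', 'TD')
#     v_mid = processor.get_value_function_by_step_and_run(step=1999, run=0)
#     v_end = processor.get_value_function_for_last_step(run=0)
# Both return arrays loaded from
# Results/<exp>/<alg>/Sample_value_function/<step>_<run_num>.npy.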
# STEPS = [199, 999, 1999, 3999, 9999, 19999]
STEPS = [199, 1999, 19999]
# STEPS = [19999]
RUNS = [0, 10, 15, 20, 30, 45]
# RUNS = list(range(50))
EXPS = ['FirstChain'] # FirstChain or FirstFourRoom or 1HVFourRoom
ALGS = ['TD']
TASK = 'EightStateCollision'
def plot_value_function(ax, value_function, step=0, run=0, is_last_step=False):
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(0, 1.0)
label = f"{step}_{run}"
line_style = '-'
line_width = 4
alpha = 1.0
color = 'blue'
if not step:
line_style = '--'
if not step and is_last_step:
line_style = '-'
if is_last_step:
line_width = 2
alpha = 0.2
color = 'red'
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha, color=color)
else:
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
for run in RUNS:
fig, ax = plt.subplots(figsize=(8, 3))
for step in STEPS:
value_function = value_processor.get_value_function_by_step_and_run(step, run)
plot_value_function(ax, value_function, step, run)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"{run}_value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
def plot_all_final_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions', 'asymptotic_value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
fig, ax = plt.subplots(figsize=(8, 3))
for run in range(50):
value_function = value_processor.get_value_function_for_last_step(run)
plot_value_function(ax, value_function, is_last_step=True)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
from Algorithms.TD import TD
from Algorithms.GTD import GTD
from Algorithms.TDRC import TDRC
from Algorithms.GEMETD import GEMETD
from Algorithms.GTD2 import GTD2
from Algorithms.PGTD2 import PGTD2
from Algorithms.HTD import HTD
from Algorithms.ETDLB import ETDLB
from Algorithms.ETD import ETD
from Algorithms.ABTD import ABTD
from Algorithms.Vtrace import Vtrace
from Algorithms.TB import TB
from Algorithms.LSTD import LSTD
from Algorithms.LSETD import LSETD
alg_dict = {'TD': TD, 'Vtrace': Vtrace, 'GTD': GTD, 'ABTD': ABTD, 'ETD': ETD, 'TB': TB, 'GTD2': GTD2, 'HTD': HTD,
'ETDLB': ETDLB, 'PGTD2': PGTD2, 'TDRC': TDRC, 'GEMETD': GEMETD, 'LSTD': LSTD, 'LSETD': LSETD}
# alg_dict = {'TD': TD, 'GTD': GTD, 'GTD2': GTD2, 'PGTD2': PGTD2, 'HTD': HTD, 'TDRC': TDRC, 'ETD': ETD, 'ETDLB': ETDLB,
# 'TB': TB, 'Vtrace': Vtrace, 'ABTD': ABTD, 'LSTD': LSTD, 'LSETD': 'LSETD'}
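# Hedged usage sketch: the registry maps names to classes so a runner can build
# an agent from a config string; `task` below stands in for an already
# constructed task object and the meta-parameter values are illustrative only.
#     AlgClass = alg_dict['GTD']
#     print(AlgClass.related_parameters())  # e.g. ['alpha', 'lmbda', 'eta']
#     # agent = AlgClass(task, alpha=0.25, lmbda=0.9, eta=1.0)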
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
environment_dict = {'FourRoomGridWorld': FourRoomGridWorld, 'Chain': Chain}
from Tasks.EightStateCollision import EightStateCollision
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
task_dict = {'EightStateCollision': EightStateCollision,
'LearnEightPoliciesTileCodingFeat': LearnEightPoliciesTileCodingFeat,
'HighVarianceLearnEightPoliciesTileCodingFeat': HighVarianceLearnEightPoliciesTileCodingFeat}