Commit 3d352cbd by GongYu

TDAlgorithms_IEEE24

from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
class ABTD(BaseVariableLmbda):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
zeta = kwargs.get('zeta')
self.old_nu = 0
if self.task.num_policies > 1:
self.old_nu = np.zeros(self.task.num_policies)
xi_zero = self.task.ABTD_xi_zero
xi_max = self.task.ABTD_xi_max
self.xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
@staticmethod
def related_parameters():
return ['alpha', 'zeta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, rho, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
nu = min(self.xi, 1.0 / max(pi, mu))
self.z = x + self.gamma * self.old_nu * self.old_pi * self.z
self.w += alpha * delta * self.z
self.old_nu = nu
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
nu = self.compute_nu_for_multiple_policies(pi, mu)
self.z = (self.gamma_vec_t * self.old_nu * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_nu = nu
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
def compute_nu_for_multiple_policies(self, pi, mu):
xi_vec = np.ones(self.task.num_policies) * self.xi
max_vec = 1.0 / np.maximum.reduce([pi, mu])
return np.minimum.reduce([max_vec, xi_vec])
def reset(self):
super().reset()
self.old_nu = 0
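For reference, the xi computed in __init__ above interpolates from 0 at zeta = 0, through xi_zero at zeta = 0.5, up to xi_max at zeta = 1, and is then used to cap nu = min(xi, 1 / max(pi, mu)). A minimal standalone sketch, assuming hypothetical values for the task constants ABTD_xi_zero and ABTD_xi_max:
# Sketch of the xi computation from __init__ above (xi_zero and xi_max values are hypothetical).
def compute_xi(zeta, xi_zero, xi_max):
    return 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
print(compute_xi(0.5, 1.0, 4.0))  # 1.0 == xi_zero
print(compute_xi(1.0, 1.0, 4.0))  # 4.0 == xi_max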
import numpy as np
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
class BaseGradient(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.v = np.zeros(self.task.num_features)
self.eta = kwargs.get('eta')
if self.task.num_policies > 1:
self.v = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta']
def compute_second_step_size(self):
return self.eta * self.compute_step_size()
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super(BaseGradient, self).learn_multiple_policies(
s, s_p, r, is_terminal)
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x, self.task.stacked_feature_rep[:, :, s_p], \
self.compute_second_step_size() * self.gamma_vec_t / self.gamma
import numpy as np
from numpy.linalg import pinv
from Tasks.BaseTask import BaseTask
from Algorithms.BaseTD import BaseTD
class BaseLS(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super(BaseLS, self).__init__(task, **kwargs)
self.A = np.zeros((self.task.num_features, self.task.num_features))
self.b = np.zeros(self.task.num_features)
self.t = 0
if self.task.num_policies > 1:
self.A = np.zeros((self.task.num_policies, self.task.num_features, self.task.num_features))
self.b = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma_vec_t = np.concatenate((np.ones(2), np.zeros(6))) * self.gamma
self.t = np.zeros(self.task.num_policies)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
self.t += 1
self.A += (np.outer(self.z, (x - self.gamma * x_p)) - self.A) / self.t
self.b += (r * self.z - self.b) / self.t
self.w = np.dot(pinv(self.A), self.b)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
_, _, x, x_p, _, _, _, stacked_x = \
super(BaseLS, self).learn_multiple_policies(s, s_p, r, is_terminal)
for i in range(self.task.num_policies):
if self.gamma_vec_t[i] != 0.0:
self.t[i] += 1
z = self.z[i, :]
self.A[i, :, :] += (np.outer(z, (x - self.gamma_vec_tp[i] * x_p)) - self.A[i, :, :]) / self.t[i]
self.b[i, :] += (self.r_vec[i] * z - self.b[i, :]) / self.t[i]
self.w[i, :] = np.dot(pinv(self.A[i, :, :]), self.b[i, :])
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
from Tasks.BaseTask import BaseTask
class BaseTD:
def __init__(self, task: BaseTask, **kwargs):
self.task = task
self.w = np.zeros(self.task.num_features)
self.z = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.w = np.zeros((self.task.num_policies, self.task.num_features))
self.z = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma = self.task.GAMMA
self.alpha = kwargs['alpha']
self.lmbda = kwargs.get('lmbda')
self.state_values = self.task.load_state_values() # This is of size num_policies * 121
self.d_mu = self.task.load_behavior_dist() # same size as state_values
self.state, self.next_state, self.action = None, None, None
self.r_vec = np.zeros(self.task.num_policies)
self.gamma_vec_tp = np.zeros(self.task.num_policies)
self.gamma_vec_t = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
def compute_value_function(self):
return np.dot(self.w, self.task.feature_rep.T)
def compute_rmsve(self):
error = self.compute_value_function() - self.state_values
error_squared = error * error
return np.sqrt(np.sum(self.d_mu * error_squared.T, 0) / np.sum(self.d_mu, 0)), error
def compute_step_size(self):
return self.alpha
def choose_behavior_action(self):
return self.task.select_behavior_action(self.state)
def choose_target_action(self):
return self.task.select_target_action(self.state)
def learn(self, s, s_p, r, is_terminal):
if self.task.num_policies == 1:
self.learn_single_policy(s, s_p, r, is_terminal)
else:
self.learn_multiple_policies(s, s_p, r, is_terminal)
def get_features(self, s, s_p, is_terminal):
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
x = self.task.get_state_feature_rep(s)
return x, x_p
def get_isr(self, s):
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
return rho
def get_delta(self, r, x, x_p):
return r + self.gamma * np.dot(self.w, x_p) - np.dot(self.w, x)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
alpha = self.compute_step_size()
delta = self.get_delta(r, x, x_p)
self.z = rho * (self.gamma * self.lmbda * self.z + x)
return delta, alpha, x, x_p, rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
active_policies_vec = self.task.get_active_policies(s)
self.r_vec = np.zeros(self.task.num_policies)
if r > 0:
terminal_policies_vec = self.task.get_terminal_policies(s_p)
self.r_vec = r * terminal_policies_vec
alpha_vec = active_policies_vec * self.compute_step_size()
x = self.task.get_state_feature_rep(s)
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
self.gamma_vec_tp = self.task.get_active_policies(s_p) * self.gamma
delta = self.r_vec + self.gamma_vec_tp * np.dot(self.w, x_p) - np.dot(self.w, x)
stacked_x = self.task.stacked_feature_rep[:, :, s]
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x
def reset(self):
self.z = np.zeros(self.task.num_features)
def __str__(self):
return f'agent:{type(self).__name__}'
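compute_rmsve above returns the d_mu-weighted root-mean-squared value error between the learned and true state values. A small self-contained sketch of the same computation on hypothetical arrays (single policy, three states):
import numpy as np
v_true = np.array([1.0, 0.5, 0.0])   # hypothetical true state values
v_hat = np.array([0.8, 0.6, 0.1])    # hypothetical learned values
d_mu = np.array([0.5, 0.3, 0.2])     # hypothetical behavior distribution
error = v_hat - v_true
rmsve = np.sqrt(np.sum(d_mu * error ** 2) / np.sum(d_mu))
print(rmsve)  # approximately 0.158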
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
import numpy as np
class BaseVariableLmbda(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.old_pi, self.old_mu = 0, 1
if self.task.num_policies > 1:
self.old_pi, self.old_mu = np.zeros(self.task.num_policies), np.ones(self.task.num_policies)
self.old_rho = self.old_pi / self.old_mu
def learn_single_policy(self, s, s_p, r, is_terminal):
alpha = self.compute_step_size()
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
x, x_p = self.get_features(s, s_p, is_terminal)
delta = rho * self.get_delta(r, x, x_p)
return delta, alpha, x, x_p, rho, pi, mu
def reset(self):
self.old_pi, self.old_mu = 0, 1
self.old_rho = self.old_pi / self.old_mu
from Algorithms.ETDLB import ETDLB
class ETD(ETDLB):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
from Algorithms.BaseTD import BaseTD
import numpy as np
class ETDLB(BaseTD):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.F = 1
self.old_rho = 0
self.beta = kwargs.get('beta')
if self.task.num_policies > 1:
self.F = np.zeros(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
delta = self.get_delta(r, x, x_p)
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda * 1 + (1 - self.lmbda) * self.F
rho = self.get_isr(s)
self.z = rho * (x * m + self.gamma * self.lmbda * self.z)
self.w += self.compute_step_size() * delta * self.z
self.old_rho = rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, _ = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x = self.task.stacked_feature_rep[:, :, s]
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
self.w += (alpha_vec * delta)[:, None] * self.z
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
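The ETDLB updates above follow the usual ETD(lambda, beta) recursions: the follow-on trace F_t = beta * rho_{t-1} * F_{t-1} + 1 and the emphasis m_t = lambda + (1 - lambda) * F_t. A scalar sketch with a hypothetical sequence of importance sampling ratios:
# Scalar sketch of the follow-on/emphasis recursion used in learn_single_policy above.
beta, lmbda = 0.9, 0.5
F, old_rho = 1.0, 0.0
for rho in [1.2, 0.8, 1.0]:  # hypothetical importance sampling ratios
    F = beta * old_rho * F + 1.0
    m = lmbda + (1.0 - lmbda) * F
    old_rho = rho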
from Algorithms.BaseTD import BaseTD
import numpy as np
class GEMETD(BaseTD):
"""
An ETD(0) implementation that uses GEM (aka GTD2(0) with x and x_p switched) to estimate emphasis.
"""
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
self.gem_alpha = kwargs['gem_alpha'] # Step size for GEM weights.
self.gem_beta = kwargs['gem_beta'] # Regularization parameter for GEM; not needed for a fixed target policy.
self.k = np.zeros(self.task.num_features) # Auxiliary weights for GEM.
self.u = np.zeros(self.task.num_features) # Main weights for GEM.
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'gem_alpha', 'gem_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
delta_bar = 1 + rho * self.gamma * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += self.gem_alpha * (delta_bar - np.dot(self.k, x_p)) * x_p
self.u += self.gem_alpha * ((x_p - self.gamma * rho * x) * np.dot(self.k, x_p) - self.gem_beta * self.u)
delta = self.get_delta(r, x, x_p)
m = np.dot(self.u, x) # Use parametric estimate of expected emphasis.
self.w += self.alpha * m * rho * delta * x
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x_p = self.task.stacked_feature_rep[:, :, s_p]
# GEM update:
gem_alpha_vec = self.task.get_active_policies(s) * self.gem_alpha
delta_bar = np.ones(self.task.num_policies) + rho * self.gamma_vec_t * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += gem_alpha_vec[:, None] * (delta_bar[:, None] - np.sum(x_p * self.k, 1)[:, None]) * stacked_x_p
self.u += gem_alpha_vec[:, None] * ((stacked_x_p - self.gamma_vec_t[:, None] * rho[:, None] * stacked_x) * np.sum(x_p * self.k, 1)[:, None] - self.gem_beta * self.u) # should self.gem_beta be a vector here?
# ETD(0) update:
m = np.dot(self.u, x)
self.w += (alpha_vec * m * rho * delta)[:, None] * stacked_x
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.k = np.zeros(self.task.num_features)
self.u = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
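As the docstring notes, GEM is essentially a GTD2(0) update with the roles of x and x_p swapped: it learns weights u such that u.x approximates the expected emphasis, and the ETD(0) step then scales its update by that estimate instead of an incrementally computed follow-on trace. A minimal single-transition sketch on hypothetical features, mirroring learn_single_policy above:
import numpy as np
x, x_p = np.array([1.0, 0.0]), np.array([0.0, 1.0])   # hypothetical features for s and s'
rho, gamma, gem_alpha, gem_beta = 1.0, 0.9, 0.1, 0.0
u, k = np.zeros(2), np.zeros(2)
delta_bar = 1 + rho * gamma * np.dot(u, x) - np.dot(u, x_p)  # "reversed" TD error driving the emphasis estimate
k += gem_alpha * (delta_bar - np.dot(k, x_p)) * x_p          # auxiliary weights
u += gem_alpha * ((x_p - gamma * rho * x) * np.dot(k, x_p) - gem_beta * u)
m = np.dot(u, x)                                             # parametric estimate of expected emphasis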
from Algorithms.BaseGradient import BaseGradient
import numpy as np
# noinspection DuplicatedCode
class GTD(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class GTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (np.dot(x, self.v) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class HTD(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.z_b = self.gamma * self.lmbda * self.z_b + x
self.w += alpha * ((delta * self.z) + (x - self.gamma * x_p) * np.dot((self.z - self.z_b), self.v))
self.v += alpha_v * ((delta * self.z) - (x - self.gamma * x_p) * np.dot(self.v, self.z_b))
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.z_b = self.lmbda * self.z_b * self.gamma_vec_t[:, None] + stacked_x
gamma_stacked_xp = self.gamma_vec_tp[:, None] * stacked_x_p
delta_z = delta[:, None] * self.z
self.w += alpha_vec[:, None] * (
delta_z + (stacked_x - gamma_stacked_xp) * (np.sum((self.z - self.z_b) * self.v, 1))[:, None])
self.v += alphav_vec[:, None] * (
delta_z - (stacked_x - gamma_stacked_xp) * np.sum(self.v * self.z_b, 1)[:, None])
# TODO: Should the last v be replaced by w?
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseLS import BaseLS
import numpy as np
class LSETD(BaseLS):
def __init__(self, task, **kwargs):
super(LSETD, self).__init__(task, **kwargs)
self.old_rho = 0
self.F = 1
self.beta = kwargs['beta']
if self.task.num_policies > 1:
self.F = np.ones(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda + (1 - self.lmbda) * self.F
x, _ = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
self.z = rho * (self.gamma * self.lmbda * self.z + x * m)
super(LSETD, self).learn_single_policy(s, s_p, r, is_terminal)
self.old_rho = rho
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
stacked_x = self.task.stacked_feature_rep[:, :, s]
rho = self.get_isr(s)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
super(LSETD, self).learn_multiple_policies(s, s_p, r, is_terminal)
self.old_rho = rho
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseLS import BaseLS
class LSTD(BaseLS):
def learn_single_policy(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s) * (self.gamma * self.lmbda * self.z + x)
super(LSTD, self).learn_single_policy(s, s_p, r, is_terminal)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s)[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + x)
super(LSTD, self).learn_multiple_policies(s, s_p, r, is_terminal)
from Algorithms.BaseGradient import BaseGradient
import numpy as np
class PGTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
v_mid = self.v + alpha_v * (delta * self.z - np.dot(x, self.v) * x)
w_mid = self.w + alpha * (np.dot(x, self.v) * x - (1 - self.lmbda) * self.gamma * np.dot(self.z, self.v) * x_p)
delta_mid = r + self.gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
self.w += alpha * (np.dot(x, v_mid) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, v_mid) * x_p)
self.v += alpha_v * (delta_mid * self.z - np.dot(x, v_mid) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
v_mid = self.v + alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
w_mid = self.w + alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
delta_mid = self.r_vec + self.gamma_vec_tp * np.dot(w_mid, x_p) - np.dot(w_mid, x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * v_mid, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * v_mid, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta_mid[:, None] * self.z - np.sum(x * v_mid, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
class TB(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, _ = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = self.gamma * self.lmbda * self.old_pi * self.z + x
self.w = self.w + alpha * delta * self.z
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
self.z = (self.gamma_vec_t * self.lmbda * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseTD import BaseTD
class TD(BaseTD):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, *_ = super().learn_single_policy(s, s_p, r, is_terminal)
self.w += alpha * delta * self.z
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.w += (alpha_vec * delta)[:, None] * self.z
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
# noinspection DuplicatedCode
class TDRC(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.tdrc_beta = kwargs['tdrc_beta']
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta', 'tdrc_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) - alpha_v * self.tdrc_beta * self.v
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(
x * self.v, 1)[:, None] * stacked_x) - (alphav_vec * self.tdrc_beta)[:, None] * self.v
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
class Vtrace(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = min(self.old_rho, 1) * self.gamma * self.lmbda * self.z + x
self.w += alpha * delta * self.z
self.old_rho = pi / mu
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
truncated_old_rho = np.minimum(self.old_rho, np.ones(self.task.num_policies))
self.z = (truncated_old_rho * self.gamma_vec_t * self.lmbda)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
class Chain:
def __init__(self, states_number: int = 8, start_state_number: int = 4, **kwargs):
assert start_state_number < states_number, "start_state_number must be less than states_number"
self._states_number = states_number
self._start_state_number = start_state_number
self._terminal = self._states_number
self._state = None
self.RIGHT_ACTION = 0
self.RETREAT_ACTION = 1
self.num_states = states_number
self._window = None
def reset(self):
self._state = np.random.randint(0, self._start_state_number)
return self._state
def step(self, action):
if action == self.RETREAT_ACTION:
return self._terminal, 0, True, {}
next_state = self._state + 1
if next_state == self._terminal:
return self._terminal, 1, True, {}
self._state = next_state
return self._state, 0, False, {}
def render(self, mode='human'):
if mode == 'human':
import sys
from Environments.utils import colorize
corridor_map = [
str(i) if i > self._start_state_number
else colorize(str(i), "blue", highlight=False)
for i in range(self._states_number)
]
corridor_map.append(colorize("T", "red", highlight=False))
corridor_map[self._state] = colorize(corridor_map[self._state], "green", highlight=True)
sys.stdout.write(f'{"|".join(corridor_map)}\n')
if mode == "rgb" or mode == "screen":
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
img = np.zeros((self.num_states, 1, 3), dtype=np.uint8)
img[:, 0] = RGB_COLORS['grey']
img[:self._start_state_number - 1, 0] = RGB_COLORS['yellow']
img[self._terminal - 1, 0] = RGB_COLORS['black']
img[self._state - 1, 0] = RGB_COLORS['green']
img = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 50
if self._window is None:
self._window = Window(self.num_states * zoom, 1 * zoom)
dt = np.kron(img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(img, axis=0)
if __name__ == '__main__':
env = Chain()
env.reset()
for step in range(1, 1000):
action = np.random.randint(0, 2)
sp, r, terminal, _ = env.step(action=action)
env.render(mode="screen")
if terminal:
env.reset()
print('env reset')
import numpy as np
# from Environments.rendering import Render
# from gym import utils
# import gym
# import sys
BLOCK_NORMAL, BLOCK_WALL, BLOCK_HALLWAY, BLOCK_AGENT = 0, 1, 2, 3
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
four_room_map = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
class FourRoomGridWorld:
def __init__(self, stochasticity_fraction=0.0):
self._grid = np.transpose(np.flip(np.array(four_room_map, dtype=np.uint8), axis=0)[1:-1, 1:-1])
self._max_row, self._max_col = self._grid.shape
self._normal_tiles = np.where(self._grid == BLOCK_NORMAL)
self._hallways_tiles = np.where(self._grid == BLOCK_HALLWAY)
self._walls_tiles = np.where(self._grid == BLOCK_WALL)
self.num_states = self._grid.size
self._state = None
self.ACTION_UP, self.ACTION_DOWN, self.ACTION_RIGHT, self.ACTION_LEFT = 0, 1, 2, 3
self.num_actions = 4
self._stochasticity_fraction = stochasticity_fraction
self.hallways = {
0: (5, 1),
1: (1, 5),
2: (5, 8),
3: (8, 4)
}
self._window, self._info = None, None
def reset(self):
self._state = (0, 0)
return self.get_state_index(*self._state)
def step(self, action):
x, y = self._state
is_stochastic_selected = False
# if self._stochasticity_fraction >= np.random.uniform():
# action_probability = [1 / (self.num_actions - 1) if i != action else 0 for i in range(self.num_actions)]
# action = np.random.choice(self.num_actions, 1, p=action_probability)[0]
# is_stochastic_selected = True
x_p, y_p = self._next(action, *self._state)
is_done = self._grid[x_p, y_p] == BLOCK_HALLWAY
reward = 1 if is_done else 0
self._state = (x_p, y_p)
return self.get_state_index(*self._state), reward, False, {
'x': x, 'y': y,
'x_p': x_p, 'y_p': y_p,
'is_stochastic_selected': is_stochastic_selected,
'selected_action': action}
def get_xy(self, state):
return (state % self._max_row), (state // self._max_col)
def get_state_index(self, x, y):
return y * self._max_col + x
def _next(self, action, x, y):
def move(current_x, current_y, next_x, next_y):
if next_y < 0 or next_x < 0:
return current_x, current_y
if next_y >= self._max_col or next_x >= self._max_row:
return current_x, current_y
if self._grid[next_x, next_y] == BLOCK_WALL:
return current_x, current_y
return next_x, next_y
switcher = {
self.ACTION_DOWN: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y - 1),
self.ACTION_RIGHT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x + 1, pos_y),
self.ACTION_UP: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y + 1),
self.ACTION_LEFT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x - 1, pos_y),
}
move_func = switcher.get(action)
return move_func(x, y)
def render(self, mode='human'):
import sys
from Environments.utils import colorize
color = {
BLOCK_NORMAL: lambda c: colorize(c, "white", highlight=True),
BLOCK_WALL: lambda c: colorize(c, "gray", highlight=True),
BLOCK_HALLWAY: lambda c: colorize(c, "green", highlight=True),
}
if mode == 'human':
outfile = sys.stdout
img = [
[color[b](' ')
for x, b
in enumerate(line)]
for y, line in enumerate(four_room_map)]
img[self._max_row - self._state[1]][self._state[0] + 1] = colorize(' ', "red",
highlight=True)
for line in img:
outfile.write(f'{"".join(line)}\n')
outfile.write('\n')
if mode == "rgb" or mode == "screen":
x, y = self._state
img = np.zeros((*self._grid.shape, 3), dtype=np.uint8)
img[self._normal_tiles] = RGB_COLORS['light_grey']
# if render_cls is not None:
# assert render_cls is not type(Render), "render_cls should be Render class"
# img = render_cls.render(img)
img[self._walls_tiles] = RGB_COLORS['black']
img[self._hallways_tiles] = RGB_COLORS['green']
img[x, y] = RGB_COLORS['red']
ext_img = np.zeros((self._max_row + 2, self._max_col + 2, 3), dtype=np.uint8)
ext_img[1:-1, 1:-1] = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 20
if self._window is None:
self._window = Window((self._max_row + 2) * zoom, (self._max_col + 2) * zoom)
self._info = Label('Four Room Grid World', font_size=10, x=5, y=5)
# self._info.text = f'x: {x}, y: {y}'
dt = np.kron(ext_img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(ext_img, axis=0)
if __name__ == '__main__':
mode = 'human'
mode = 'screen'
env = FourRoomGridWorld()
env.reset()
for step in range(1, 100):
action = np.random.randint(0, 4)
sp, r, terminal, _ = env.step(action=action)
env.render(mode=mode)
if terminal:
env.reset()
print('env reset')
from abc import ABC, abstractmethod
import numpy as np
class Render(ABC):
@abstractmethod
def render(self, img):
raise NotImplementedError
class ErrorRender(Render):
def __init__(self, num_policies, num_steps):
self.num_steps = num_steps
self.num_policies = num_policies
self._error, self._max_error, self._valid_state = None, None, None
def render(self, img):
# self.color_policy(img, 0)
self.color_policy(img, 1)
# self.color_policy(img, 2)
self.color_policy(img, 3)
# self.color_policy(img, 4)
self.color_policy(img, 5)
# self.color_policy(img, 6)
self.color_policy(img, 7)
return img
def add_error(self, error):
if self._max_error is None:
self._max_error = np.abs(error).reshape(8, 11, 11)
self._valid_state = np.array(self._max_error)
self._valid_state[self._valid_state != 0] = 1
self._error = np.abs(error).reshape(8, 11, 11)
def color_policy(self, img, policy_number):
e = self._error[policy_number]
x = self._max_error[policy_number]
d = np.clip((230 * e / x), 10, 255)
d = d * self._valid_state[policy_number]
d = np.nan_to_num(d).astype(np.uint8).T
d = np.repeat(d, 3).reshape(11, 11, 3)
d[:, :, 2] = 230
c = np.where(self._valid_state[policy_number].T == 1)
img[c] = d[c]
return img
"""A set of common utilities used within the environments. These are
not intended as API functions, and will not remain stable over time.
"""
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""Return string surrounded by appropriate terminal color codes to
print colorized text. Valid colors: gray, red, green, yellow,
blue, magenta, cyan, white, crimson
"""
attr = []
num = color2num[color]
if highlight:
num += 10
attr.append(str(num))
if bold:
attr.append('1')
attrs = ';'.join(attr)
return '\x1b[%sm%s\x1b[0m' % (attrs, string)
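A quick usage sketch for colorize (standard ANSI escape behavior; the rendered colors depend on the terminal):
# Wraps the string in ANSI codes: green foreground, or green background when highlight=True.
print(colorize('agent', 'green'))
print(colorize('agent', 'green', bold=True, highlight=True))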
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
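The job scripts further below sweep meta_parameters as a full Cartesian product, so this ABTD config expands to 19 alpha values x 10 zeta values = 190 parameter settings, each repeated for number_of_runs runs. A quick sketch for checking a config's sweep size (the file path here is hypothetical):
import json
from functools import reduce
with open('Experiments/ABTD.json') as f:  # hypothetical path to the config above
    cfg = json.load(f)
sizes = [len(v) for v in cfg['meta_parameters'].values()]
print(reduce(lambda a, b: a * b, sizes, 1))  # 190 combinations for this file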
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
This directory contains all the exports_<algorithm>.dat files created when submitting jobs on Cedar.
\ No newline at end of file
#!/bin/bash
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
rm -f exports_${algorithm}.dat
for A in ${alpha[@]}; do
for L in ${lmbda[@]}; do
for E in ${eta[@]}; do
for B in ${beta[@]}; do
for Z in ${zeta[@]}; do
for T in ${tdrc_beta[@]}; do
for GA in ${gem_alpha[@]}; do
for GB in ${gem_beta[@]}; do
echo export SAVE_PATH=${save_path} ENVIRONMENT=${environment} ALGORITHM=${algorithm} \
TASK=${task} ALPHA=${A} LMBDA=${L} ETA=${E} BETA=${B} ZETA=${Z} TDRCBETA=${T} GEMALPHA=${GA} \
GEMBETA=${GB} NUMOFRUNS=${num_of_runs} NUMSTEPS=${num_steps} SUBSAMPLE=${sub_sample} \
>>exports_${algorithm}.dat
done
done
done
done
done
done
done
done
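# Note: each iteration of the nested loops above appends one export line to exports_${algorithm}.dat,
# which the Cedar array job later executes verbatim. An illustrative line (all values hypothetical):
# export SAVE_PATH=Results/FirstChain ENVIRONMENT=Chain ALGORITHM=TD TASK=EightStateOffPolicyRandomFeat ALPHA=0.25 LMBDA=0.1 ETA=1.0 BETA=0.0 ZETA=0.9 TDRCBETA=1.0 GEMALPHA=0.1 GEMBETA=0.1 NUMOFRUNS=50 NUMSTEPS=20000 SUBSAMPLE=1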
import os
import json
import numpy as np
from utils import ImmutableDict
import time
default_params = ImmutableDict(
{
'agent': 'GEMETD',
'task': 'EightStateCollision',
'environment': 'Chain',
'exp': 'FirstChain',
# 'agent': 'HTD',
# 'task': 'LearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': 'FirstFourRoom',
# 'agent': 'LSTD',
# 'task': 'HighVarianceLearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': '1HVFourRoom',
'save_value_function': True,
'sub_sample': 1,
'num_of_runs': 3,
'num_steps': 20_000,
'meta_parameters': {
'alpha': 0.001953125,
'eta': 16.0,
'beta': 0.9,
'zeta': 0.9,
'lmbda': 0.0,
'tdrc_beta': 1.0,
'gem_alpha': 0.1,
'gem_beta': 0.1
}
}
)
class JobBuilder:
def __init__(self, json_path, server_name):
self._path = json_path
self.server_name = server_name
with open(self._path) as f:
self._params = json.load(f)
self._batch_params = ImmutableDict(
{
'ALPHA': ' '.join([f'{num:.10f}' for num in self.alpha]),
'LMBDA': ' '.join([f'{num:.5f}' for num in self.lmbda]),
'ETA': ' '.join([f'{num:.10f}' for num in self.eta]),
'BETA': ' '.join([f'{num:.5f}' for num in self.beta]),
'ZETA': ' '.join([f'{num:.5f}' for num in self.zeta]),
'TDRCBETA': ' '.join([f'{num:.5f}' for num in self.tdrc_beta]),
'GEMALPHA': ' '.join([f'{num:.5f}' for num in self.gem_alpha]),
'GEMBETA': ' '.join([f'{num:.5f}' for num in self.gem_beta]),
'NUMOFRUNS': f'{self.num_of_runs}',
'NUMSTEPS': f'{self.num_steps}',
'SUBSAMPLE': f'{self.sub_sample}',
'ALGORITHM': self.agent,
'TASK': self.task,
'ENVIRONMENT': self.environment,
'SAVEPATH': self.save_path
})
@property
def tdrc_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('tdrc_beta', [default_params['meta_parameters']['tdrc_beta']]))
@property
def gem_alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_alpha', [default_params['meta_parameters']['gem_alpha']]))
@property
def gem_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_beta', [default_params['meta_parameters']['gem_beta']]))
@property
def alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('alpha', [default_params['meta_parameters']['alpha']]))
@property
def lmbda(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('lmbda', [default_params['meta_parameters']['lmbda']]))
@property
def eta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('eta', [default_params['meta_parameters']['eta']]))
@property
def beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('beta', [default_params['meta_parameters']['beta']]))
@property
def zeta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('zeta', [default_params['meta_parameters']['zeta']]))
@property
def agent(self):
return self._params.get('agent', default_params['agent'])
@property
def task(self):
return self._params.get('task', default_params['task'])
@property
def num_of_runs(self):
return np.asarray(self._params.get('number_of_runs', default_params['num_of_runs']))
@property
def num_steps(self):
return np.asarray(self._params.get('number_of_steps', default_params['num_steps']))
@property
def sub_sample(self):
return np.asarray(self._params.get('sub_sample', default_params['sub_sample']))
@property
def environment(self):
return self._params.get('environment', default_params['environment'])
@property
def save_path(self):
return os.path.dirname(self._path).replace("/Experiments/", "/Results/")
def create_dat_file(self):
with open('Job/Cedar_Create_Config_Template.sh', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
def to_shell(self):
if self.server_name.upper() == 'NODE':
with open('Job/SubmitJobsTemplates.SL', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
elif self.server_name.upper() == 'CPU':
with open('Job/SubmitJobsTemplatesCedar.SL', 'r') as f:
text = f.read()
alg = self._batch_params['ALGORITHM']
num_of_jobs = sum(1 for _ in open(f'exports_{alg}.dat'))
text = text.replace('__ALG__', self._batch_params['ALGORITHM'])
text = text.replace('__NUM_OF_JOBS__', str(num_of_jobs))
text = text.replace('__NAME_OF_EXP__', f'{self._batch_params["TASK"]}_{self._batch_params["ALGORITHM"]}')
return text
def run_batch(self):
if self.server_name.upper() == 'NODE':
print('Submitting the ' + self.agent + ' algorithm jobs on nodes...')
elif self.server_name.upper() == 'CPU':
print('Submitting the ' + self.agent + ' algorithm jobs on individual cpus...')
with open('Create_Configs.sh', 'wt') as f:
f.write(self.create_dat_file())
time.sleep(1)
os.system('bash Create_Configs.sh')
with open('Submit_Jobs.SL', 'wt') as f:
f.write(self.to_shell())
time.sleep(1)
os.system('sbatch Submit_Jobs.SL')
time.sleep(1)
os.remove('Submit_Jobs.SL')
if self.server_name.upper() == 'CPU':
os.remove('Create_Configs.sh')
# alg = self._batch_params['ALGORITHM']
# os.remove(f'exports_{alg}.dat')
def __call__(self):
return self.run_batch()
#!/bin/bash
# SLURM submission script for submitting multiple serial jobs on Niagara
#
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=40
#SBATCH --job-name __TASK_____ALGORITHM__
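# The __PLACEHOLDER__ tokens below are substituted by the job builder (create_dat_file/to_shell) before submission.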
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
source ~/RLENV/bin/activate
module load NiaEnv/2019b
module load gnu-parallel
module load python
cd $SLURM_SUBMIT_DIR || exit
export OMP_NUM_THREADS=1
echo "The number of available cores is echo $NCORES"
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo "Starting run at: $(date)"
HOSTS=$(scontrol show hostnames $SLURM_NODELIST | tr '\n' ,)
NCORES=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
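# Launch one Learning.py process per combination of the meta-parameter arrays, spread over the NCORES workers on the allocated hosts.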
parallel --env OMP_NUM_THREADS,PATH,LD_LIBRARY_PATH --joblog slurm-$SLURM_JOBID.log -j $NCORES -S $HOSTS --wd $PWD \
python Learning.py ::: -sp ::: ${save_path} ::: -e ::: ${environment} ::: -alg ::: ${algorithm} ::: -t ::: ${task[@]} \
::: -a ::: ${alpha[@]} ::: -nr ::: ${num_of_runs} ::: -ns ::: ${num_steps} ::: -et ::: ${eta[@]} \
::: -l ::: ${lmbda[@]} ::: -z ::: ${zeta[@]} ::: -tb ::: ${tdrc_beta[@]} ::: -b ::: ${beta[@]} ::: \
-ga ::: ${gem_alpha[@]} ::: -gb ::: ${gem_beta[@]} ::: -ss ::: ${sub_sample}
echo "Program test finished with exit code $? at: $(date)"
#!/bin/bash
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --cpus-per-task=1
#SBATCH --mem=3G
#SBATCH --array=1-__NUM_OF_JOBS__
#SBATCH --job-name __NAME_OF_EXP__
alg=__ALG__
source ~/RLENV/bin/activate
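# Each array task reads its own line of exports_<alg>.dat and executes it to set the meta-parameter environment variables used below.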
`sed -n "${SLURM_ARRAY_TASK_ID}p" <exports_${alg}.dat`
echo ${SLURM_ARRAY_TASK_ID} $ALPHA $LMBDA $ETA $BETA $ZETA $TDRCBETA $GEMALPHA $GEMBETA $NUMOFRUNS $NUMSTEPS $SUBSAMPLE
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo
echo "Starting run at: $(date)"
python Learning.py \
-a $ALPHA -l $LMBDA -et $ETA -b $BETA -z $ZETA -tb $TDRCBETA -ga $GEMALPHA -gb $GEMBETA -alg $ALGORITHM -t $TASK \
-nr $NUMOFRUNS -e $ENVIRONMENT -sp $SAVE_PATH -ns $NUMSTEPS -ss $SUBSAMPLE
echo "Program test finished with exit code $? at: $(date)"
import os
import numpy as np
import argparse
from data_presister import DataPersister, ParameterBuilder
from utils import save_result, Configuration, save_value_function, get_save_value_function_steps
from Registry.AlgRegistry import alg_dict
from Registry.EnvRegistry import environment_dict
from Registry.TaskRegistry import task_dict
from Job.JobBuilder import default_params
from Environments.rendering import ErrorRender
def learn(config: Configuration):
params = ParameterBuilder().add_algorithm_params(config).build()
if not os.path.exists(config.save_path):
os.makedirs(config.save_path, exist_ok=True)
env = environment_dict[config.environment]()
rmsve = np.zeros((task_dict[config.task].num_of_policies(), config.num_steps, config.num_of_runs))
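# rmsve has shape (num_policies, num_steps, num_runs); each run fills one slice along the last axis.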
for run in range(config.num_of_runs):
random_seed = (run + config.num_of_runs) if config.rerun else run
np.random.seed(random_seed)
task = task_dict[config.task](run_number=run, num_steps=config.num_steps)
agent = alg_dict[config.algorithm](task, **params)
rmsve_of_run = np.zeros((task.num_policies, task.num_steps))
agent.state = env.reset()
error_render = ErrorRender(task.num_policies, task.num_steps)
for step in range(task.num_steps):
rmsve_of_run[:, step], error = agent.compute_rmsve()
if config.render:
error_render.add_error(error)
agent.action = agent.choose_behavior_action()
agent.next_state, r, is_terminal, info = env.step(agent.action)
agent.learn(agent.state, agent.next_state, r, is_terminal)
if config.render:
env.render(mode='screen', render_cls=error_render)
if config.save_value_function and (step in get_save_value_function_steps(task.num_steps)):
save_value_function(agent.compute_value_function(), config.save_path, step, run)
if is_terminal:
agent.state = env.reset()
agent.reset()
continue
agent.state = agent.next_state
print(np.mean(rmsve_of_run, axis=0))
rmsve[:, :, run] = rmsve_of_run
rmsve_of_runs = np.transpose(np.mean(rmsve, axis=0)) # Average over all policies.
# _RMSVE_mean_over_runs
DataPersister.save_result(np.mean(rmsve_of_runs, axis=0), '_RMSVE_mean_over_runs', config)
save_result(config.save_path, '_RMSVE_mean_over_runs', np.mean(rmsve_of_runs, axis=0), params, config.rerun)
# _RMSVE_stderr_over_runs
DataPersister.save_result(np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), '_RMSVE_stderr_over_runs', config)
save_result(config.save_path, '_RMSVE_stderr_over_runs',
np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), params, config.rerun)
# _mean_stderr_final
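# Final performance: RMSVE averaged over roughly the last 1% of steps of each run.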
final_errors_mean_over_steps = np.mean(rmsve_of_runs[:, config.num_steps - int(0.01 * config.num_steps) - 1:],
axis=1)
DataPersister.save_result(np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), '_mean_stderr_final', config)
save_result(config.save_path, '_mean_stderr_final',
np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), params, config.rerun)
# _mean_stderr_auc
auc_mean_over_steps = np.mean(rmsve_of_runs, axis=1)
DataPersister.save_result(np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), '_mean_stderr_auc', config)
save_result(config.save_path, '_mean_stderr_auc',
np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), params, config.rerun)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', '-a', type=float, default=default_params['meta_parameters']['alpha'])
parser.add_argument('--lmbda', '-l', type=float, default=default_params['meta_parameters']['lmbda'])
parser.add_argument('--eta', '-et', type=float, default=default_params['meta_parameters']['eta'])
parser.add_argument('--beta', '-b', type=float, default=default_params['meta_parameters']['beta'])
parser.add_argument('--zeta', '-z', type=float, default=default_params['meta_parameters']['zeta'])
parser.add_argument('--tdrc_beta', '-tb', type=float, default=default_params['meta_parameters']['tdrc_beta'])
parser.add_argument('--gem_alpha', '-ga', type=float, default=default_params['meta_parameters']['gem_alpha'])
parser.add_argument('--gem_beta', '-gb', type=float, default=default_params['meta_parameters']['gem_beta'])
parser.add_argument('--algorithm', '-alg', type=str, default=default_params['agent'])
parser.add_argument('--task', '-t', type=str, default=default_params['task'])
parser.add_argument('--num_of_runs', '-nr', type=int, default=default_params['num_of_runs'])
parser.add_argument('--num_steps', '-ns', type=int, default=default_params['num_steps'])
parser.add_argument('--sub_sample', '-ss', type=int, default=default_params['sub_sample'])
parser.add_argument('--environment', '-e', type=str, default=default_params['environment'])
parser.add_argument('--save_path', '-sp', type=str, default='-')
parser.add_argument('--rerun', '-rrn', type=bool, default=False)
parser.add_argument('--render', '-rndr', type=bool, default=False)
parser.add_argument('--save_value_function', '-svf', type=bool, default=default_params['save_value_function'])
args = parser.parse_args()
if args.save_path == '-':
args.save_path = os.path.join(os.getcwd(), 'Results', default_params['exp'], args.algorithm)
learn(config=Configuration(vars(args)))
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXP_ATTRS, AUC_AND_FINAL
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
plot_alpha = 1.0
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
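# Load one performance value (and its standard error) per step size in the alpha sweep, replacing NaN/inf and diverged values.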
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global plot_alpha
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
if alg == 'ETD':
color = 'red'
elif alg == 'ETDLB':
color = 'grey'
plot_alpha -= 0.1
else:
color = 'black'
ax.plot(alphas, performance, label=lbl, linestyle='-', marker='o',
linewidth=2, markersize=5, color=color, alpha=plot_alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=plot_alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def plot_all_sensitivities_per_alg_emphatics(**kwargs):
global plot_alpha
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
plot_alpha = 1.0
alg = 'ETD'
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
current_params = make_current_params(alg, sp, 0, 0)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, 0, performance, stderr, exp_attrs)
alg = 'ETDLB'
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf',
'orange', '#8c564b', '#e377c2', '#2ca02c',
'#bcbd22', '#d62728']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_extra_alg_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter - 1]
alpha = 1.0
if alg == 'TDRC':
color = ALG_COLORS[alg]
alpha = 1.0
linestyle = '--'
# if alg == 'GTD2':
# linestyle = '-'
# alpha=1.0
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=3, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=3, markersize=5,
color=color, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_all_sensitivities_per_alg_gradients(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
if COUNTER % 2 == 0:
COUNTER += 1
continue
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD2':
extra_alg = 'GTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'PGTD2':
extra_alg = 'GTD2'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD':
extra_alg = 'HTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'HTD':
extra_alg = 'TDRC'
current_params['eta'] = 1.0
current_params['tdrc_beta'] = 1.0
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf', 'orange', '#8c564b', '#e377c2', '#2ca02c','#bcbd22',
'#d62728', 'black', 'cyan']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_all_sensitivities_per_alg_gradients_all_eta(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
if alg == 'TDRC':
_, _, tp_list, _, _ = make_params('GTD', exp)
fop_list = kwargs['tdrc_beta']
for tp in tp_list:
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if alg == 'TDRC':
fig.savefig(
os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta_beta_{kwargs['tdrc_beta']}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
else:
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict, make_current_params, make_params, load_and_replace_large_nan_inf
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = is_smoothed
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
print(alg)
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']) + r' $\lambda=$ ' +
str(best_params.get('lmbda', best_params.get('zeta', 0))))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def find_best_perf(alg, exp, auc_or_final):
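# Search over all second/third/fourth parameter settings and return the combination (including the best alpha) with the lowest AUC or final error.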
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
for sp in sp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
def plot_learning_curve_best_overall_params(**kwargs):
is_smoothed = True if 'is_smoothed' in kwargs else False
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
# ls_rmsve = get_ls_rmsve(alg, exp, sp)
# plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = find_best_perf(alg, exp, auc_or_final)
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}AllLmbda.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import numpy as np
import matplotlib.pyplot as plt
def load_d_mu(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'd_mu.npy'))
def load_state_values(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'state_values.npy'))
def plot_d_mu(ax, d_mu, active_states):
ax.plot(d_mu, linewidth=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
x_labels = list(active_states)
x_ticks = [x for x in range(len(x_labels))]
ax.xaxis.set_ticks(x_ticks)
ax.set_xticklabels(x_labels)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.yaxis.set_ticks([0, 0.005, 0.01, 0.015, 0.02, 0.025])
ax.set_ylim([0.00, 0.025])
ax.set_yticklabels([])
# ax.set_xticklabels([])
def find_active_states(task, d_mu, state_values, policy_no=0):
if task == 'EightStateCollision':
return [x for x in range(d_mu.shape[0])]
return np.where(state_values[policy_no] > 0)[0]
def get_active_d_mu(task, d_mu, active_states, policy_no=0):
if task == 'EightStateCollision':
return d_mu
return d_mu[active_states, policy_no].squeeze()
def plot_distribution(**kwargs):
task = kwargs['task']
d_mu = load_d_mu(task)
state_values = load_state_values(task)
for policy_no in range(state_values.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task, d_mu, state_values, policy_no)
active_d_mu = get_active_d_mu(task, d_mu, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
plt.show()
if task == 'EightStateCollision':
break
def plot_dist_for_two_four_room_tasks(**kwargs):
task1 = 'LearnEightPoliciesTileCodingFeat'
task2 = 'HighVarianceLearnEightPoliciesTileCodingFeat'
save_dir = os.path.join('pdf_plots', 'Misc', 'CompareDistsFR')
d_mu1 = load_d_mu(task1)
d_mu2 = load_d_mu(task2)
state_values1 = load_state_values(task1)
state_values2 = load_state_values(task2)
for policy_no in range(state_values1.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task1, d_mu1, state_values1, policy_no)
active_d_mu = get_active_d_mu(task1, d_mu1, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
active_states = find_active_states(task2, d_mu2, state_values2, policy_no)
active_d_mu = get_active_d_mu(task2, d_mu2, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"dist_policy_{policy_no}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = is_smoothed
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def plot_learning_curve(**kwargs):
is_smoothed = True if 'is_smoothed' in kwargs else False
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import matplotlib.pyplot as plt
import numpy as np
import pylab
from Plotting.plot_params import ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import make_params, get_alphas, make_current_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_specific_params_dict(alg, exp, sp, tp):
if alg == 'TD':
return {'alpha': 0.25, 'lmbda': sp}
if alg == 'ETD':
return {'alpha': 0.00390625, 'lmbda': sp}
if alg == 'ETDLB':
return {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}
if alg == 'TDRC':
return {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}
if alg == 'GTD':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
if alg == 'PGTD2':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_all_learning_curves_for_third(**kwargs):
for exp in kwargs['exps']:
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'all_third_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in kwargs['algs']:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
for tp in kwargs['tp_list']:
for fp in get_alphas(alg, exp):
for fop in [1.0]:
current_params = make_current_params(alg, sp, tp, fop, fp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(kwargs['algs'])}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, RERUN_POSTFIX, \
PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
# noinspection DuplicatedCode
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
# noinspection DuplicatedCode
def plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
color = 'blue' if sp else 'red'
lbl = (alg + r' $\lambda=$ ' + str(sp))
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.tick_params(axis='x', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
# noinspection DuplicatedCode
def plot_learning_curve_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for alg_names in kwargs['alg_groups'].values():
for alg in alg_names:
if alg in ['LSETD', 'LSTD']:
continue
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
save_dir = os.path.join('pdf_plots', 'learning_curves_for_lambdas', auc_or_final)
for sp in kwargs['sp_list']:
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
print(alg, current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{alg}{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
# plt.show()
plt.close(fig)
from Plotting.plot_utils import FirstChainAttr, FirstFourRoomAttr, HVFirstFourRoomAttr
from Registry.AlgRegistry import alg_dict
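# Plotting switches: PLOT_RERUN loads results saved with the RERUN_POSTFIX suffix; PLOT_RERUN_AND_ORIG overlays rerun and original curves.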
PLOT_RERUN = True
PLOT_RERUN_AND_ORIG = False
if PLOT_RERUN and PLOT_RERUN_AND_ORIG:
PLOT_RERUN_AND_ORIG = False
RERUN_POSTFIX = '_rerun'
DEBUG_MODE = True
# noinspection SpellCheckingInspection
COLORS = ['#000000', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22",
"#17becf"]
ALG_COLORS = {alg_name: color for alg_name, color in zip(alg_dict.keys(), COLORS)}
ALG_COLORS['LSTD'] = ALG_COLORS['TD']
ALG_COLORS['LSETD'] = ALG_COLORS['ETD']
ALG_GROUPS = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD']}
EXPS = ['1HVFourRoom', 'FirstFourRoom', 'FirstChain']
ALGS = [key for key in alg_dict.keys()]
ALGS.remove('LSTD')
ALGS.remove('LSETD')
# ALGS.remove('TDRC')
ALL_ALGS = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', 'LSTD', 'LSETD']
# ALL_ALGS = ['TD', 'Vtrace', 'TB', 'ABTD']
LMBDA_AND_ZETA = [0.0, 0.9]
AUC_AND_FINAL = ['auc', 'final']
EXP_ATTRS = {'FirstChain': FirstChainAttr, 'FirstFourRoom': FirstFourRoomAttr, '1HVFourRoom': HVFirstFourRoomAttr}
if DEBUG_MODE:
EXPS = ['FirstFourRoom', '1HVFourRoom']
# ALGS = ['GTD']
# ALL_ALGS.remove('ETDLB')
# ALL_ALGS.remove('LSTD')
# ALL_ALGS.remove('LSETD')
# LMBDA_AND_ZETA = [0.9]
AUC_AND_FINAL = ['final']
# ALG_GROUPS = {'main_algs': ALL_ALGS}
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
PLOT_RERUN_AND_ORIG, RERUN_POSTFIX
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = ALG_COLORS[alg]
if alg == 'TD':
color = 'grey'
alpha=0.7
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_sensitivity_curve(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN, PLOT_RERUN_AND_ORIG, RERUN_POSTFIX, ALGS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = 'blue' if sp else 'red'
if sp not in [0.0, 1.0]:
alpha = 0.3
color = 'grey'
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_min(ax, min_performance):
print(min_performance)
ax.plot([pow(2, -3), pow(2, -2)], [min_performance, min_performance], linewidth=0.2, alpha=0.2)
# ax.axhline(y=min_performance, xmin=pow(2, -3), xmax=pow(2, -2))
def plot_sensitivity_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves_for_lambdas', exp, auc_or_final)
for alg in kwargs['algs']:
min_performance = 1_000
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for sp in kwargs['sp_list']:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, True)
if min(best_performance) < min_performance:
min_performance = min(best_performance)
if kwargs.get('plot_min_performance', False):
plot_min(ax, min_performance)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{alg}{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX, ALGS, ALL_ALGS
from Plotting.plot_utils import load_best_rerun_params_dict, make_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, flag=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
if alg == 'TDRC':
alpha = 0.6
if flag:
color = 'green'
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_specific_learning_curves(**kwargs):
specific_params = kwargs['specific_params']
exp = kwargs['exp']
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in AUC_AND_FINAL:
sp = kwargs['sp']
save_dir = os.path.join('pdf_plots', 'specific_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=(10, 4))
for alg in kwargs['algs']:
flag = False
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
print(alg, exp, sp)
if alg == 'PGTD22':
flag = True
alg = 'PGTD2'
current_params = specific_params[alg]
current_params['eta'] = 1.0
current_params['alpha'] = 0.03125
else:
current_params = specific_params[alg]
print(current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, False, flag)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(ALGS)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import argparse
import json
import numpy as np
import os
from Job.JobBuilder import default_params
from Registry.AlgRegistry import alg_dict
from utils import create_name_for_save_load
def make_res_path(alg, exp):
return os.path.join(os.getcwd(), 'Results', exp, alg)
def make_exp_path(alg, exp):
return os.path.join(os.getcwd(), 'Experiments', exp, alg)
def load_best_rerun_params_dict(alg, exp, auc_or_final, sp):
res_path = make_res_path(alg, exp)
with open(os.path.join(res_path, f"{auc_or_final}_{sp}.json")) as f:
return json.load(f)['meta_parameters']
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def load_best_rerun_params(alg, exp, auc_or_final, sp):
best_res_dict = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
best_fp = best_res_dict.get('alpha', 0)
best_tp = best_res_dict.get('eta', best_res_dict.get('beta', 0))
best_fop = best_res_dict.get('tdrc_beta', 0)
return best_fp, best_tp, best_fop
def make_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', '-n', type=str, default='1HVFourRoom')
# 1HVFourRoom or FirstFourRoom or FirstChain
return parser.parse_args()
def rename_best_old_result(res_path, params_dict, file_name):
name_to_save = create_name_for_save_load(param_dict=params_dict)
path_and_name = os.path.join(res_path, name_to_save)
file_name = path_and_name + file_name
os.rename(file_name + '.npy', file_name + '_old.npy')
def load_best_perf_json(alg, exp, sp, auc_or_final):
res_path = make_res_path(alg, exp)
res_path = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(res_path, 'r') as f:
return json.load(f)
def load_exp_json_file(alg, exp):
res_path = make_res_path(alg, exp)
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
with open(exp_path) as f:
return json.load(f), res_path
def make_params(alg_name, exp_name):
params = dict()
alg_param_names = alg_dict[alg_name].related_parameters()
json_content, res_path = load_exp_json_file(alg_name, exp_name)
json_exp_params = json_content.get('meta_parameters')
for param in alg_param_names:
params[param] = json_exp_params.get(param, default_params['meta_parameters'][param])
if not isinstance(params[param], list):
params[param] = list([params[param]])
fp_list = params['alpha']
tp_list = [0.0]
fop_list = [0.0]
if 'lmbda' in params:
sp_list = params['lmbda']
else:
sp_list = params['zeta']
if 'eta' in params:
tp_list = params['eta']
elif 'beta' in params:
tp_list = params['beta']
if 'tdrc_beta' in params:
fop_list = params['tdrc_beta']
if alg_name == 'TDRC':
tp_list, fop_list = [1.0], [1.0]
return fp_list, sp_list, tp_list, fop_list, res_path
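# Hedged example of make_params (the json contents below are hypothetical):
# given {'meta_parameters': {'alpha': [0.25, 0.5], 'lmbda': [0.9], 'eta': [1.0]}}
# for alg_name='GTD', this returns roughly
#     fp_list  = [0.25, 0.5]   # step sizes alpha
#     sp_list  = [0.9]         # lmbda (zeta for algorithms that use zeta)
#     tp_list  = [1.0]         # eta (beta for ETDLB)
#     fop_list = [0.0]         # tdrc_beta, only meaningful for TDRC
# plus the Results path for that algorithm and experiment.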
def make_current_params(alg_name, sp, tp, fop, fp=0):
current_params = {'alpha': fp}
alg_param_names = alg_dict[alg_name].related_parameters()
if 'lmbda' in alg_param_names:
current_params['lmbda'] = sp
else:
current_params['zeta'] = sp
if 'eta' in alg_param_names:
current_params['eta'] = tp
elif 'beta' in alg_param_names:
current_params['beta'] = tp
if 'tdrc_beta' in alg_param_names:
current_params['tdrc_beta'] = fop
return current_params
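# Hedged example: make_current_params('GTD', sp=0.9, tp=1.0, fop=0.0) gives
# {'alpha': 0, 'lmbda': 0.9, 'eta': 1.0}. alpha defaults to 0 here because the
# callers that sweep over alpha exclude it from the file name (see the
# '_mean_..._over_alpha' loads elsewhere).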
def get_alg_names(exp_name):
path = os.path.join(os.getcwd(), 'Experiments', exp_name)
alg_names = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
return alg_names
def load_sample_json_for_exp(exp):
alg = get_alg_names(exp)[0]
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
if not os.path.exists(exp_path):
print('No algorithms exist in the experiment directory...')
raise FileNotFoundError(exp_path)
with open(exp_path) as f:
json_exp_params = json.load(f)
return json_exp_params
def load_and_replace_large_nan_inf(load_file_name, large, replace_with):
current_perf = np.load(load_file_name)
return replace_large_nan_inf(current_perf, large=large, replace_with=replace_with)
class FirstChainAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 5000, 10000, 15000, 20000]
self.x_tick_labels = [0, '5', '10', '15', '20']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.68910
self.ok_error = 0.4
class FirstFourRoomAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 10000, 20000, 30000, 40000, 50000]
self.x_tick_labels = [0, '10', '20', '30', '40', '50']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.72672
self.ok_error = 0.4
class HVFirstFourRoomAttr(FirstFourRoomAttr):
def __init__(self, exp_name):
super(HVFirstFourRoomAttr, self).__init__(exp_name)
def replace_large_nan_inf(arr, large=1.0, replace_with=2.0):
arr[np.isnan(arr)] = replace_with
arr[np.isinf(arr)] = replace_with
arr[arr > large] = replace_with
return arr
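# Minimal behaviour check (assumed inputs):
#     a = np.array([0.2, np.nan, np.inf, 1.5])
#     replace_large_nan_inf(a, large=1.0, replace_with=2.0)
#     # -> array([0.2, 2. , 2. , 2. ])
# NaNs, infinities, and entries above `large` are all overwritten in place with
# `replace_with`.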
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
RERUN_POSTFIX
from Plotting.plot_utils import make_current_params, replace_large_nan_inf, make_params
from utils import create_name_for_save_load
np.random.seed(0)
def load_all_performances(alg, exp, auc_or_final, sp, exp_attrs):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
all_performance = np.zeros((len(fp_list), len(tp_list), len(fop_list)))
for i, fop in enumerate(fop_list):
for j, tp in enumerate(tp_list):
current_params = make_current_params(alg, sp, tp, fop)
load_file_name = os.path.join(res_path, create_name_for_save_load(
current_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
if PLOT_RERUN and auc_or_final == 'auc':
load_file_name_rerun = load_file_name.replace('.npy', f"{RERUN_POSTFIX}.npy")
if os.path.isfile(load_file_name_rerun):
load_file_name = load_file_name_rerun
performance = np.load(load_file_name)
performance = replace_large_nan_inf(performance, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_waterfall)
all_performance[:, j, i] = performance
return all_performance
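# Shape note (inferred from the loops above): all_performance is
# (len(fp_list), len(tp_list), len(fop_list)); each loaded '_over_alpha' file
# contributes one value per alpha, written into the [:, j, i] slice for the
# current (tp, fop) pair, with divergent values already replaced by the
# waterfall ceiling.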
def plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs):
global ticker, x_axis_names, x_axis_ticks
performance_to_plot = np.array(all_performance.flatten())
percentage_overflowed = round((performance_to_plot > exp_attrs.learning_starting_point).sum() /
performance_to_plot.size, 2)
ok_percentage = round((performance_to_plot < exp_attrs.ok_error).sum() /
performance_to_plot.size, 2)
print(alg, 'percentage_overflowed', percentage_overflowed)
# print(alg, 'OK_percentage', ok_percentage)
color = ALG_COLORS[alg]
ax.scatter([(ticker + 1)] * performance_to_plot.shape[0] + np.random.uniform(
-0.25, 0.25, performance_to_plot.shape[0]), performance_to_plot, marker='o',
facecolors='none', color=color)
x_axis_ticks.append(ticker + 1)
ticker = (ticker + 1) % len(alg_names)
ax.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=True)  # labels along the bottom edge stay on
x_axis_names.append(f'{alg}_{percentage_overflowed}')
ax.xaxis.set_ticks(x_axis_ticks)
ax.set_xticklabels(x_axis_names)
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ticker, x_axis_names, x_axis_ticks = 0.0, [''], [0]
def plot_waterfall_scatter(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'waterfalls', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
global ticker, x_axis_names, x_axis_ticks
ticker, x_axis_names, x_axis_ticks = -0.5, [''], [0]
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
all_performance = load_all_performances(alg, exp, auc_or_final, sp, exp_attrs)
plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_waterfall_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import numpy as np
import matplotlib.pyplot as plt
class ValueFunctionProcessor:
def __init__(self, exp, alg):
result_dir = os.path.join(os.getcwd(), 'Results', exp, alg, 'Sample_value_function')
self.all_value_functions = dict()
self.all_value_functions_of_last_step = dict()
for value_function_name in os.listdir(result_dir):
value_function = np.load(os.path.join(result_dir, value_function_name))
step, run_num = (int(i) for i in value_function_name.replace('.npy', '').split('_'))
self.all_value_functions[(step, run_num)] = value_function
if (step == 19999 and exp == 'FirstChain') or (step == 49999 and exp == 'FirstFourRoom') or (
step == 49999 and exp == '1HVFourRoom'):
self.all_value_functions_of_last_step[run_num] = value_function
def get_value_function_by_step_and_run(self, step, run):
return self.all_value_functions[(step, run)]
def get_value_function_for_last_step(self, run):
return self.all_value_functions_of_last_step[run]
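# Hedged usage sketch (file layout as parsed in __init__):
#     processor = ValueFunctionProcessor('FirstChain', 'TD')
#     v_mid = processor.get_value_function_by_step_and_run(step=1999, run=0)
#     v_end = processor.get_value_function_for_last_step(run=0)
# Both return arrays loaded from
# Results/<exp>/<alg>/Sample_value_function/<step>_<run_num>.npy.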
# STEPS = [199, 999, 1999, 3999, 9999, 19999]
STEPS = [199, 1999, 19999]
# STEPS = [19999]
RUNS = [0, 10, 15, 20, 30, 45]
# RUNS = list(range(50))
EXPS = ['FirstChain'] # FirstChain or FirstFourRoom or 1HVFourRoom
ALGS = ['TD']
TASK = 'EightStateCollision'
def plot_value_function(ax, value_function, step=0, run=0, is_last_step=False):
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(0, 1.0)
label = f"{step}_{run}"
line_style = '-'
line_width = 4
alpha = 1.0
color = 'blue'
if not step:
line_style = '--'
if not step and is_last_step:
line_style = '-'
if is_last_step:
line_width = 2
alpha = 0.2
color = 'red'
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha, color=color)
else:
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
for run in RUNS:
fig, ax = plt.subplots(figsize=(8, 3))
for step in STEPS:
value_function = value_processor.get_value_function_by_step_and_run(step, run)
plot_value_function(ax, value_function, step, run)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"{run}_value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
def plot_all_final_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions', 'asymptotic_value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
fig, ax = plt.subplots(figsize=(8, 3))
for run in range(50):
value_function = value_processor.get_value_function_for_last_step(run)
plot_value_function(ax, value_function, is_last_step=True)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
from Algorithms.TD import TD
from Algorithms.GTD import GTD
from Algorithms.TDRC import TDRC
from Algorithms.GEMETD import GEMETD
from Algorithms.GTD2 import GTD2
from Algorithms.PGTD2 import PGTD2
from Algorithms.HTD import HTD
from Algorithms.ETDLB import ETDLB
from Algorithms.ETD import ETD
from Algorithms.ABTD import ABTD
from Algorithms.Vtrace import Vtrace
from Algorithms.TB import TB
from Algorithms.LSTD import LSTD
from Algorithms.LSETD import LSETD
alg_dict = {'TD': TD, 'Vtrace': Vtrace, 'GTD': GTD, 'ABTD': ABTD, 'ETD': ETD, 'TB': TB, 'GTD2': GTD2, 'HTD': HTD,
'ETDLB': ETDLB, 'PGTD2': PGTD2, 'TDRC': TDRC, 'GEMETD': GEMETD, 'LSTD': LSTD, 'LSETD': LSETD}
# alg_dict = {'TD': TD, 'GTD': GTD, 'GTD2': GTD2, 'PGTD2': PGTD2, 'HTD': HTD, 'TDRC': TDRC, 'ETD': ETD, 'ETDLB': ETDLB,
# 'TB': TB, 'Vtrace': Vtrace, 'ABTD': ABTD, 'LSTD': LSTD, 'LSETD': 'LSETD'}
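# Hedged usage sketch: the registry maps names to classes so a runner can build
# an agent from a config string; `task` below stands in for an already
# constructed task object and the meta-parameter values are illustrative only.
#     AlgClass = alg_dict['GTD']
#     print(AlgClass.related_parameters())  # e.g. ['alpha', 'lmbda', 'eta']
#     # agent = AlgClass(task, alpha=0.25, lmbda=0.9, eta=1.0)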
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
environment_dict = {'FourRoomGridWorld': FourRoomGridWorld, 'Chain': Chain}
from Tasks.EightStateCollision import EightStateCollision
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
task_dict = {'EightStateCollision': EightStateCollision,
'LearnEightPoliciesTileCodingFeat': LearnEightPoliciesTileCodingFeat,
'HighVarianceLearnEightPoliciesTileCodingFeat': HighVarianceLearnEightPoliciesTileCodingFeat}