Commit 3d352cbd by GongYu

TDAlgorithms_IEEE24

# Created by https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### NotepadPP ###
# Notepad++ backups #
*.bak
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
### SublimeText ###
# Cache files for Sublime Text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache
# Workspace files are user-specific
*.sublime-workspace
# Project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using Sublime Text
# *.sublime-project
# SFTP configuration file
sftp-config.json
# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
Package Control.merged-ca-bundle
Package Control.user-ca-bundle
oscrypto-ca-bundle.crt
bh_unicode_properties.cache
# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings
### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*[.json, .xml, .info]
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
/.idea
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
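# ABTD(zeta): action-dependent bootstrapping TD for off-policy prediction (cf. ABQ, Mahmood, Yu & Sutton, 2017).
# The trace is decayed by nu = min(xi, 1 / max(pi, mu)) instead of an importance-sampling ratio, with xi
# interpolated between the task's ABTD_xi_zero and ABTD_xi_max by the meta-parameter zeta.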
class ABTD(BaseVariableLmbda):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
zeta = kwargs.get('zeta')
self.old_nu = 0
if self.task.num_policies > 1:
self.old_nu = np.zeros(self.task.num_policies)
xi_zero = self.task.ABTD_xi_zero
xi_max = self.task.ABTD_xi_max
self.xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
@staticmethod
def related_parameters():
return ['alpha', 'zeta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, rho, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
nu = min(self.xi, 1.0 / max(pi, mu))
self.z = x + self.gamma * self.old_nu * self.old_pi * self.z
self.w += alpha * delta * self.z
self.old_nu = nu
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
nu = self.compute_nu_for_multiple_policies(pi, mu)
self.z = (self.gamma_vec_t * self.old_nu * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_nu = nu
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
def compute_nu_for_multiple_policies(self, pi, mu):
xi_vec = np.ones(self.task.num_policies) * self.xi
max_vec = 1.0 / np.maximum.reduce([pi, mu])
return np.minimum.reduce([max_vec, xi_vec])
def reset(self):
super().reset()
self.old_nu = 0
import numpy as np
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
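# Shared base for the gradient-TD family (GTD, GTD2, PGTD2, HTD, TDRC): adds the secondary weight vector v
# and a second step size computed as eta * alpha.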
class BaseGradient(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.v = np.zeros(self.task.num_features)
self.eta = kwargs.get('eta')
if self.task.num_policies > 1:
self.v = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta']
def compute_second_step_size(self):
return self.eta * self.compute_step_size()
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super(BaseGradient, self).learn_multiple_policies(
s, s_p, r, is_terminal)
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x, self.task.stacked_feature_rep[:, :, s_p], \
self.compute_second_step_size() * self.gamma_vec_t / self.gamma
import numpy as np
from numpy.linalg import pinv
from Tasks.BaseTask import BaseTask
from Algorithms.BaseTD import BaseTD
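# Shared base for the least-squares methods (LSTD, LSETD): maintains running averages of the matrix A and
# vector b and re-solves w = pinv(A) b after every transition.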
class BaseLS(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super(BaseLS, self).__init__(task, **kwargs)
self.A = np.zeros((self.task.num_features, self.task.num_features))
self.b = np.zeros(self.task.num_features)
self.t = 0
if self.task.num_policies > 1:
self.A = np.zeros((self.task.num_policies, self.task.num_features, self.task.num_features))
self.b = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma_vec_t = np.concatenate((np.ones(2), np.zeros(6))) * self.gamma
self.t = np.zeros(self.task.num_policies)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
self.t += 1
self.A += (np.outer(self.z, (x - self.gamma * x_p)) - self.A) / self.t
self.b += (r * self.z - self.b) / self.t
self.w = np.dot(pinv(self.A), self.b)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
_, _, x, x_p, _, _, _, stacked_x = \
super(BaseLS, self).learn_multiple_policies(s, s_p, r, is_terminal)
for i in range(self.task.num_policies):
if self.gamma_vec_t[i] != 0.0:
self.t[i] += 1
z = self.z[i, :]
self.A[i, :, :] += (np.outer(z, (x - self.gamma_vec_tp[i] * x_p)) - self.A[i, :, :]) / self.t[i]
self.b[i, :] += (self.r_vec[i] * z - self.b[i, :]) / self.t[i]
self.w[i, :] = np.dot(pinv(self.A[i, :, :]), self.b[i, :])
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
from Tasks.BaseTask import BaseTask
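# Common machinery for all agents: weights, eligibility traces, step sizes, feature lookup, importance-sampling
# ratios, and RMSVE computed against the precomputed state values under the behavior distribution d_mu.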
class BaseTD:
def __init__(self, task: BaseTask, **kwargs):
self.task = task
self.w = np.zeros(self.task.num_features)
self.z = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.w = np.zeros((self.task.num_policies, self.task.num_features))
self.z = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma = self.task.GAMMA
self.alpha = kwargs['alpha']
self.lmbda = kwargs.get('lmbda')
self.state_values = self.task.load_state_values() # This is of size num_policies * 121
self.d_mu = self.task.load_behavior_dist() # same size as state_values
self.state, self.next_state, self.action = None, None, None
self.r_vec = np.zeros(self.task.num_policies)
self.gamma_vec_tp = np.zeros(self.task.num_policies)
self.gamma_vec_t = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
def compute_value_function(self):
return np.dot(self.w, self.task.feature_rep.T)
def compute_rmsve(self):
error = self.compute_value_function() - self.state_values
error_squared = error * error
return np.sqrt(np.sum(self.d_mu * error_squared.T, 0) / np.sum(self.d_mu, 0)), error
def compute_step_size(self):
return self.alpha
def choose_behavior_action(self):
return self.task.select_behavior_action(self.state)
def choose_target_action(self):
return self.task.select_target_action(self.state)
def learn(self, s, s_p, r, is_terminal):
if self.task.num_policies == 1:
self.learn_single_policy(s, s_p, r, is_terminal)
else:
self.learn_multiple_policies(s, s_p, r, is_terminal)
def get_features(self, s, s_p, is_terminal):
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
x = self.task.get_state_feature_rep(s)
return x, x_p
def get_isr(self, s):
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
return rho
def get_delta(self, r, x, x_p):
return r + self.gamma * np.dot(self.w, x_p) - np.dot(self.w, x)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
alpha = self.compute_step_size()
delta = self.get_delta(r, x, x_p)
self.z = rho * (self.gamma * self.lmbda * self.z + x)
return delta, alpha, x, x_p, rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
active_policies_vec = self.task.get_active_policies(s)
self.r_vec = np.zeros(self.task.num_policies)
if r > 0:
terminal_policies_vec = self.task.get_terminal_policies(s_p)
self.r_vec = r * terminal_policies_vec
alpha_vec = active_policies_vec * self.compute_step_size()
x = self.task.get_state_feature_rep(s)
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
self.gamma_vec_tp = self.task.get_active_policies(s_p) * self.gamma
delta = self.r_vec + self.gamma_vec_tp * np.dot(self.w, x_p) - np.dot(self.w, x)
stacked_x = self.task.stacked_feature_rep[:, :, s]
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x
def reset(self):
self.z = np.zeros(self.task.num_features)
def __str__(self):
return f'agent:{type(self).__name__}'
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
import numpy as np
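# Base for algorithms whose bootstrapping varies with the action probabilities (ABTD, TB, Vtrace);
# it carries the previous step's pi, mu, and rho across transitions.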
class BaseVariableLmbda(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.old_pi, self.old_mu = 0, 1
if self.task.num_policies > 1:
self.old_pi, self.old_mu = np.zeros(self.task.num_policies), np.ones(self.task.num_policies)
self.old_rho = self.old_pi / self.old_mu
def learn_single_policy(self, s, s_p, r, is_terminal):
alpha = self.compute_step_size()
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
x, x_p = self.get_features(s, s_p, is_terminal)
delta = rho * self.get_delta(r, x, x_p)
return delta, alpha, x, x_p, rho, pi, mu
def reset(self):
self.old_pi, self.old_mu = 0, 1
self.old_rho = self.old_pi / self.old_mu
from Algorithms.ETDLB import ETDLB
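# Emphatic TD(lambda): the special case of ETD(lambda, beta) with beta fixed to gamma.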
class ETD(ETDLB):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
from Algorithms.BaseTD import BaseTD
import numpy as np
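# ETD(lambda, beta) (cf. Hallak et al., 2016): maintains the followon trace F and the emphasis
# m = lambda + (1 - lambda) * F, and scales the emphasized trace by the importance-sampling ratio.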
class ETDLB(BaseTD):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.F = 1
self.old_rho = 0
self.beta = kwargs.get('beta')
if self.task.num_policies > 1:
self.F = np.zeros(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
delta = self.get_delta(r, x, x_p)
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda * 1 + (1 - self.lmbda) * self.F
rho = self.get_isr(s)
self.z = rho * (x * m + self.gamma * self.lmbda * self.z)
self.w += self.compute_step_size() * delta * self.z
self.old_rho = rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, _ = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x = self.task.stacked_feature_rep[:, :, s]
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
self.w += (alpha_vec * delta)[:, None] * self.z
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseTD import BaseTD
import numpy as np
class GEMETD(BaseTD):
"""
An ETD(0) implementation that uses GEM (aka GTD2(0) with x and x_p switched) to estimate emphasis.
"""
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
self.gem_alpha = kwargs['gem_alpha'] # Step size for GEM weights.
self.gem_beta = kwargs['gem_beta'] # Regularization parameter for GEM; not needed for a fixed target policy.
self.k = np.zeros(self.task.num_features) # Auxiliary weights for GEM.
self.u = np.zeros(self.task.num_features) # Main weights for GEM.
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'gem_alpha', 'gem_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
delta_bar = 1 + rho * self.gamma * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += self.gem_alpha * (delta_bar - np.dot(self.k, x_p)) * x_p
self.u += self.gem_alpha * ((x_p - self.gamma * rho * x) * np.dot(self.k, x_p) - self.gem_beta * self.u)
delta = self.get_delta(r, x, x_p)
m = np.dot(self.u, x) # Use parametric estimate of expected emphasis.
self.w += self.alpha * m * rho * delta * x
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x_p = self.task.stacked_feature_rep[:, :, s_p]
# GEM update:
gem_alpha_vec = self.task.get_active_policies(s) * self.gem_alpha
delta_bar = np.ones(self.task.num_policies) + rho * self.gamma_vec_t * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += gem_alpha_vec[:, None] * (delta_bar[:, None] - np.sum(x_p * self.k, 1)[:, None]) * stacked_x_p
self.u += gem_alpha_vec[:, None] * ((stacked_x_p - self.gamma_vec_t[:, None] * rho[:, None] * stacked_x) * np.sum(x_p * self.k, 1)[:, None] - self.gem_beta * self.u) # should self.gem_beta be a vector here?
# ETD(0) update:
m = np.dot(self.u, x)
self.w += (alpha_vec * m * rho * delta)[:, None] * stacked_x
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.k = np.zeros(self.task.num_features)
self.u = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseGradient import BaseGradient
import numpy as np
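# GTD(lambda), also known as TDC(lambda) (cf. Sutton et al., 2009; Maei, 2011): the TD update on w with a
# gradient-correction term, plus a secondary update on v.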
# noinspection DuplicatedCode
class GTD(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
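# GTD2(lambda) (cf. Sutton et al., 2009): like GTD, but the main update uses np.dot(x, v) * x in place of delta * z.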
class GTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (np.dot(x, self.v) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
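# HTD(lambda), a hybrid of TD and GTD: keeps an additional on-policy trace z_b (no importance-sampling ratio)
# and uses the difference between the two traces in the corrections to w and v.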
class HTD(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.z_b = self.gamma * self.lmbda * self.z_b + x
self.w += alpha * ((delta * self.z) + (x - self.gamma * x_p) * np.dot((self.z - self.z_b), self.v))
self.v += alpha_v * ((delta * self.z) - (x - self.gamma * x_p) * np.dot(self.v, self.z_b))
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.z_b = self.lmbda * self.z_b * self.gamma_vec_t[:, None] + stacked_x
gamma_stacked_xp = self.gamma_vec_tp[:, None] * stacked_x_p
delta_z = delta[:, None] * self.z
self.w += alpha_vec[:, None] * (
delta_z + (stacked_x - gamma_stacked_xp) * (np.sum((self.z - self.z_b) * self.v, 1))[:, None])
self.v += alphav_vec[:, None] * (
delta_z - (stacked_x - gamma_stacked_xp) * np.sum(self.v * self.z_b, 1)[:, None])
# TODO: Should the last v be replaced by w?
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseLS import BaseLS
import numpy as np
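# Least-squares emphatic TD(lambda, beta): forms the emphatic trace (followon F, emphasis m) and then reuses
# the incremental least-squares solve from BaseLS.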
class LSETD(BaseLS):
def __init__(self, task, **kwargs):
super(LSETD, self).__init__(task, **kwargs)
self.old_rho = 0
self.F = 1
self.beta = kwargs['beta']
if self.task.num_policies > 1:
self.F = np.ones(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda + (1 - self.lmbda) * self.F
x, _ = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
self.z = rho * (self.gamma * self.lmbda * self.z + x * m)
super(LSETD, self).learn_single_policy(s, s_p, r, is_terminal)
self.old_rho = rho
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
stacked_x = self.task.stacked_feature_rep[:, :, s]
rho = self.get_isr(s)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
super(LSETD, self).learn_multiple_policies(s, s_p, r, is_terminal)
self.old_rho = rho
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseLS import BaseLS
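# Off-policy LSTD(lambda): accumulates the importance-sampling-weighted trace and solves for w with BaseLS.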
class LSTD(BaseLS):
def learn_single_policy(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s) * (self.gamma * self.lmbda * self.z + x)
super(LSTD, self).learn_single_policy(s, s_p, r, is_terminal)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s)[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + x)
super(LSTD, self).learn_multiple_policies(s, s_p, r, is_terminal)
from Algorithms.BaseGradient import BaseGradient
import numpy as np
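# PGTD2: a two-step, extragradient-style variant of GTD2 that first forms intermediate weights w_mid and v_mid
# and then applies the final update using them (cf. proximal gradient-TD methods).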
class PGTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
v_mid = self.v + alpha_v * (delta * self.z - np.dot(x, self.v) * x)
w_mid = self.w + alpha * (np.dot(x, self.v) * x - (1 - self.lmbda) * self.gamma * np.dot(self.z, self.v) * x_p)
delta_mid = r + self.gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
self.w += alpha * (np.dot(x, v_mid) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, v_mid) * x_p)
self.v += alpha_v * (delta_mid * self.z - np.dot(x, v_mid) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
v_mid = self.v + alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
w_mid = self.w + alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
delta_mid = self.r_vec + self.gamma_vec_tp * np.dot(w_mid, x_p) - np.dot(w_mid, x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * v_mid, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * v_mid, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta_mid[:, None] * self.z - np.sum(x * v_mid, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
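# TB(lambda), Tree Backup (cf. Precup, Sutton & Singh, 2000): the trace is decayed by the target probability of
# the previous action rather than by an importance-sampling ratio.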
class TB(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, _ = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = self.gamma * self.lmbda * self.old_pi * self.z + x
self.w = self.w + alpha * delta * self.z
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
self.z = (self.gamma_vec_t * self.lmbda * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseTD import BaseTD
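# Off-policy TD(lambda) with importance-sampling-weighted eligibility traces (the per-step quantities come from BaseTD).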
class TD(BaseTD):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, *_ = super().learn_single_policy(s, s_p, r, is_terminal)
self.w += alpha * delta * self.z
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.w += (alpha_vec * delta)[:, None] * self.z
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
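# TDRC(lambda), TD with regularized corrections (cf. Ghiassian et al., 2020): the GTD/TDC update with an extra
# L2 penalty of strength tdrc_beta on the secondary weights v.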
# noinspection DuplicatedCode
class TDRC(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.tdrc_beta = kwargs['tdrc_beta']
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta', 'tdrc_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) - alpha_v * self.tdrc_beta * self.v
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(
x * self.v, 1)[:, None] * stacked_x) - (alphav_vec * self.tdrc_beta)[:, None] * self.v
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
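# Vtrace (cf. Espeholt et al., 2018): like off-policy TD(lambda), but the trace is decayed by the previous
# importance-sampling ratio clipped at 1.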
class Vtrace(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = min(self.old_rho, 1) * self.gamma * self.lmbda * self.z + x
self.w += alpha * delta * self.z
self.old_rho = pi / mu
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
truncated_old_rho = np.minimum(self.old_rho, np.ones(self.task.num_policies))
self.z = (truncated_old_rho * self.gamma_vec_t * self.lmbda)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
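# A small corridor environment: the agent starts uniformly in one of the first start_state_number states,
# RIGHT moves one state toward the terminal state (reward 1 on arrival), and RETREAT ends the episode with reward 0.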
class Chain:
def __init__(self, states_number: int = 8, start_state_number: int = 4, **kwargs):
assert start_state_number < states_number, "start state number must be less than the number of states"
self._states_number = states_number
self._start_state_number = start_state_number
self._terminal = self._states_number
self._state = None
self.RIGHT_ACTION = 0
self.RETREAT_ACTION = 1
self.num_states = states_number
self._window = None
def reset(self):
self._state = np.random.randint(0, self._start_state_number)
return self._state
def step(self, action):
if action == self.RETREAT_ACTION:
return self._terminal, 0, True, {}
next_state = self._state + 1
if next_state == self._terminal:
return self._terminal, 1, True, {}
self._state = next_state
return self._state, 0, False, {}
def render(self, mode='human'):
if mode == 'human':
import sys
from Environments.utils import colorize
corridor_map = [
str(i) if i > self._start_state_number
else colorize(str(i), "blue", highlight=False)
for i in range(self._states_number)
]
corridor_map.append(colorize("T", "red", highlight=False))
corridor_map[self._state] = colorize(corridor_map[self._state], "green", highlight=True)
sys.stdout.write(f'{"|".join(corridor_map)}\n')
if mode == "rgb" or mode == "screen":
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
img = np.zeros((self.num_states, 1, 3), dtype=np.uint8)
img[:, 0] = RGB_COLORS['grey']
img[:self._start_state_number - 1, 0] = RGB_COLORS['yellow']
img[self._terminal - 1, 0] = RGB_COLORS['black']
img[self._state - 1, 0] = RGB_COLORS['green']
img = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 50
if self._window is None:
self._window = Window(self.num_states * zoom, 1 * zoom)
dt = np.kron(img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(img, axis=0)
if __name__ == '__main__':
env = Chain()
env.reset()
for step in range(1, 1000):
action = np.random.randint(0, 2)
sp, r, terminal, _ = env.step(action=action)
env.render(mode="screen")
if terminal:
env.reset()
print('env reset')
import numpy as np
# from Environments.rendering import Render
# from gym import utils
# import gym
# import sys
BLOCK_NORMAL, BLOCK_WALL, BLOCK_HALLWAY, BLOCK_AGENT = 0, 1, 2, 3
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
four_room_map = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
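# The classic four-room gridworld (cf. Sutton, Precup & Singh, 1999), built from four_room_map above
# (0: open cell, 1: wall, 2: hallway). Entering a hallway cell yields reward 1; the environment itself
# never signals termination.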
class FourRoomGridWorld:
def __init__(self, stochasticity_fraction=0.0):
self._grid = np.transpose(np.flip(np.array(four_room_map, dtype=np.uint8), axis=0)[1:-1, 1:-1])
self._max_row, self._max_col = self._grid.shape
self._normal_tiles = np.where(self._grid == BLOCK_NORMAL)
self._hallways_tiles = np.where(self._grid == BLOCK_HALLWAY)
self._walls_tiles = np.where(self._grid == BLOCK_WALL)
self.num_states = self._grid.size
self._state = None
self.ACTION_UP, self.ACTION_DOWN, self.ACTION_RIGHT, self.ACTION_LEFT = 0, 1, 2, 3
self.num_actions = 4
self._stochasticity_fraction = stochasticity_fraction
self.hallways = {
0: (5, 1),
1: (1, 5),
2: (5, 8),
3: (8, 4)
}
self._window, self._info = None, None
def reset(self):
self._state = (0, 0)
return self.get_state_index(*self._state)
def step(self, action):
x, y = self._state
is_stochastic_selected = False
# if self._stochasticity_fraction >= np.random.uniform():
# action_probability = [1 / (self.num_actions - 1) if i != action else 0 for i in range(self.num_actions)]
# action = np.random.choice(self.num_actions, 1, p=action_probability)[0]
# is_stochastic_selected = True
x_p, y_p = self._next(action, *self._state)
is_done = self._grid[x_p, y_p] == BLOCK_HALLWAY
reward = 1 if is_done else 0
self._state = (x_p, y_p)
return self.get_state_index(*self._state), reward, False, {
'x': x, 'y': y,
'x_p': x_p, 'y_p': y_p,
'is_stochastic_selected': is_stochastic_selected,
'selected_action': action}
def get_xy(self, state):
return (state % self._max_row), (state // self._max_col)
def get_state_index(self, x, y):
return y * self._max_col + x
def _next(self, action, x, y):
def move(current_x, current_y, next_x, next_y):
if next_y < 0 or next_x < 0:
return current_x, current_y
if next_y >= self._max_col or next_x >= self._max_row:
return current_x, current_y
if self._grid[next_x, next_y] == BLOCK_WALL:
return current_x, current_y
return next_x, next_y
switcher = {
self.ACTION_DOWN: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y - 1),
self.ACTION_RIGHT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x + 1, pos_y),
self.ACTION_UP: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y + 1),
self.ACTION_LEFT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x - 1, pos_y),
}
move_func = switcher.get(action)
return move_func(x, y)
def render(self, mode='human'):
import sys
from Environments.utils import colorize
color = {
BLOCK_NORMAL: lambda c: colorize(c, "white", highlight=True),
BLOCK_WALL: lambda c: colorize(c, "gray", highlight=True),
BLOCK_HALLWAY: lambda c: colorize(c, "green", highlight=True),
}
if mode == 'human':
outfile = sys.stdout
img = [
[color[b](' ')
for x, b
in enumerate(line)]
for y, line in enumerate(four_room_map)]
img[self._max_row - self._state[1]][self._state[0] + 1] = colorize(' ', "red",
highlight=True)
for line in img:
outfile.write(f'{"".join(line)}\n')
outfile.write('\n')
if mode == "rgb" or mode == "screen":
x, y = self._state
img = np.zeros((*self._grid.shape, 3), dtype=np.uint8)
img[self._normal_tiles] = RGB_COLORS['light_grey']
# if render_cls is not None:
# assert render_cls is not type(Render), "render_cls should be Render class"
# img = render_cls.render(img)
img[self._walls_tiles] = RGB_COLORS['black']
img[self._hallways_tiles] = RGB_COLORS['green']
img[x, y] = RGB_COLORS['red']
ext_img = np.zeros((self._max_row + 2, self._max_col + 2, 3), dtype=np.uint8)
ext_img[1:-1, 1:-1] = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 20
if self._window is None:
self._window = Window((self._max_row + 2) * zoom, (self._max_col + 2) * zoom)
self._info = Label('Four Room Grid World', font_size=10, x=5, y=5)
# self._info.text = f'x: {x}, y: {y}'
dt = np.kron(ext_img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(ext_img, axis=0)
if __name__ == '__main__':
mode = 'human'
mode = 'screen'
env = FourRoomGridWorld()
env.reset()
for step in range(1, 100):
action = np.random.randint(0, 4)
sp, r, terminal, _ = env.step(action=action)
env.render(mode=mode)
if terminal:
env.reset()
print('env reset')
from abc import ABC, abstractmethod
import numpy as np
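# Rendering helpers. ErrorRender overlays each policy's absolute value error (normalized by the first error
# recorded) onto the grid image produced by the environment.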
class Render(ABC):
@abstractmethod
def render(self, img):
raise NotImplementedError
class ErrorRender(Render):
def __init__(self, num_policies, num_steps):
self.num_steps = num_steps
self.num_policies = num_policies
self._error, self._max_error, self._valid_state = None, None, None
def render(self, img):
# self.color_policy(img, 0)
self.color_policy(img, 1)
# self.color_policy(img, 2)
self.color_policy(img, 3)
# self.color_policy(img, 4)
self.color_policy(img, 5)
# self.color_policy(img, 6)
self.color_policy(img, 7)
return img
def add_error(self, error):
if self._max_error is None:
self._max_error = np.abs(error).reshape(8, 11, 11)
self._valid_state = np.array(self._max_error)
self._valid_state[self._valid_state != 0] = 1
self._error = np.abs(error).reshape(8, 11, 11)
def color_policy(self, img, policy_number):
e = self._error[policy_number]
x = self._max_error[policy_number]
d = np.clip((230 * e / x), 10, 255)
d = d * self._valid_state[policy_number]
d = np.nan_to_num(d).astype(np.uint8).T
d = np.repeat(d, 3).reshape(11, 11, 3)
d[:, :, 2] = 230
c = np.where(self._valid_state[policy_number].T == 1)
img[c] = d[c]
return img
"""A set of common utilities used within the environments. These are
not intended as API functions, and will not remain stable over time.
"""
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""Return string surrounded by appropriate terminal color codes to
print colorized text. Valid colors: gray, red, green, yellow,
blue, magenta, cyan, white, crimson
"""
attr = []
num = color2num[color]
if highlight:
num += 10
attr.append(str(num))
if bold:
attr.append('1')
attrs = ';'.join(attr)
return '\x1b[%sm%s\x1b[0m' % (attrs, string)
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ABTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "ETD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "ETDLB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "GTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "GTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "HTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "PGTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TDRC",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
],
"tdrc_beta": [
1.0
]
}
}
{
"agent": "Vtrace",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
This directory contains the exports_<algorithm>.dat files created when submitting jobs on Cedar.
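Each line of an exports_<algorithm>.dat file is a single export statement that fixes one combination of meta-parameters for one SLURM array task, for example (illustrative values only):
export SAVE_PATH=<results_dir> ENVIRONMENT=Chain ALGORITHM=TD TASK=EightStateCollision ALPHA=0.25 LMBDA=0.0 ETA=1.0 BETA=0.9 ZETA=0.9 TDRCBETA=1.0 GEMALPHA=0.1 GEMBETA=0.1 NUMOFRUNS=50 NUMSTEPS=20000 SUBSAMPLE=1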
\ No newline at end of file
#!/bin/bash
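# Template filled in by JobBuilder: each __PLACEHOLDER__ below is replaced by a space-separated
# list of meta-parameter values, and the nested loops write one export line per combination
# (Cartesian product) into exports_<algorithm>.dat for the Cedar array job.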
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
rm -f exports_${algorithm}.dat
for A in ${alpha[@]}; do
for L in ${lmbda[@]}; do
for E in ${eta[@]}; do
for B in ${beta[@]}; do
for Z in ${zeta[@]}; do
for T in ${tdrc_beta[@]}; do
for GA in ${gem_alpha[@]}; do
for GB in ${gem_beta[@]}; do
echo export SAVE_PATH=${save_path} ENVIRONMENT=${environment} ALGORITHM=${algorithm} \
TASK=${task} ALPHA=${A} LMBDA=${L} ETA=${E} BETA=${B} ZETA=${Z} TDRCBETA=${T} GEMALPHA=${GA} \
GEMBETA=${GB} NUMOFRUNS=${num_of_runs} NUMSTEPS=${num_steps} SUBSAMPLE=${sub_sample} \
>>exports_${algorithm}.dat
done
done
done
done
done
done
done
done
import os
import json
import numpy as np
from utils import ImmutableDict
import time
default_params = ImmutableDict(
{
'agent': 'GEMETD',
'task': 'EightStateCollision',
'environment': 'Chain',
'exp': 'FirstChain',
# 'agent': 'HTD',
# 'task': 'LearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': 'FirstFourRoom',
# 'agent': 'LSTD',
# 'task': 'HighVarianceLearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': '1HVFourRoom',
'save_value_function': True,
'sub_sample': 1,
'num_of_runs': 3,
'num_steps': 20_000,
'meta_parameters': {
'alpha': 0.001953125,
'eta': 16.0,
'beta': 0.9,
'zeta': 0.9,
'lmbda': 0.0,
'tdrc_beta': 1.0,
'gem_alpha': 0.1,
'gem_beta': 0.1
}
}
)
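# JobBuilder loads an experiment's .json description, falls back to default_params for any missing
# entry, and substitutes the resulting values into the shell/SLURM templates before submitting the
# jobs (full-node Niagara jobs for 'NODE', Cedar array jobs for 'CPU').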
class JobBuilder:
def __init__(self, json_path, server_name):
self._path = json_path
self.server_name = server_name
with open(self._path) as f:
self._params = json.load(f)
self._batch_params = ImmutableDict(
{
'ALPHA': ' '.join([f'{num:.10f}' for num in self.alpha]),
'LMBDA': ' '.join([f'{num:.5f}' for num in self.lmbda]),
'ETA': ' '.join([f'{num:.10f}' for num in self.eta]),
'BETA': ' '.join([f'{num:.5f}' for num in self.beta]),
'ZETA': ' '.join([f'{num:.5f}' for num in self.zeta]),
'TDRCBETA': ' '.join([f'{num:.5f}' for num in self.tdrc_beta]),
'GEMALPHA': ' '.join([f'{num:.5f}' for num in self.gem_alpha]),
'GEMBETA': ' '.join([f'{num:.5f}' for num in self.gem_beta]),
'NUMOFRUNS': f'{self.num_of_runs}',
'NUMSTEPS': f'{self.num_steps}',
'SUBSAMPLE': f'{self.sub_sample}',
'ALGORITHM': self.agent,
'TASK': self.task,
'ENVIRONMENT': self.environment,
'SAVEPATH': self.save_path
})
@property
def tdrc_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('tdrc_beta', [default_params['meta_parameters']['tdrc_beta']]))
@property
def gem_alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_alpha', [default_params['meta_parameters']['gem_alpha']]))
@property
def gem_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_beta', [default_params['meta_parameters']['gem_beta']]))
@property
def alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('alpha', [default_params['meta_parameters']['alpha']]))
@property
def lmbda(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('lmbda', [default_params['meta_parameters']['lmbda']]))
@property
def eta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('eta', [default_params['meta_parameters']['eta']]))
@property
def beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('beta', [default_params['meta_parameters']['beta']]))
@property
def zeta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('zeta', [default_params['meta_parameters']['zeta']]))
@property
def agent(self):
return self._params.get('agent', default_params['agent'])
@property
def task(self):
return self._params.get('task', default_params['task'])
@property
def num_of_runs(self):
return np.asarray(self._params.get('number_of_runs', default_params['num_of_runs']))
@property
def num_steps(self):
return np.asarray(self._params.get('number_of_steps', default_params['num_steps']))
@property
def sub_sample(self):
return np.asarray(self._params.get('sub_sample', default_params['sub_sample']))
@property
def environment(self):
return self._params.get('environment', default_params['environment'])
@property
def save_path(self):
return os.path.dirname(self._path).replace("/Experiments/", "/Results/")
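# Render the Cedar config-generation template, replacing every __KEY__ token with its batch value.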
def create_dat_file(self):
with open('Job/Cedar_Create_Config_Template.sh', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
def to_shell(self):
if self.server_name.upper() == 'NODE':
with open('Job/SubmitJobsTemplates.SL', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
elif self.server_name.upper() == 'CPU':
with open('Job/SubmitJobsTemplatesCedar.SL', 'r') as f:
text = f.read()
alg = self._batch_params['ALGORITHM']
num_of_jobs = sum(1 for _ in open(f'exports_{alg}.dat'))
text = text.replace('__ALG__', self._batch_params['ALGORITHM'])
text = text.replace('__NUM_OF_JOBS__', str(num_of_jobs))
text = text.replace('__NAME_OF_EXP__', f'{self._batch_params["TASK"]}_{self._batch_params["ALGORITHM"]}')
return text
def run_batch(self):
if self.server_name.upper() == 'NODE':
print('Submitting the ' + self.agent + ' algorithm jobs on nodes...')
elif self.server_name.upper() == 'CPU':
print('Submitting the ' + self.agent + ' algorithm jobs on individual cpus...')
with open('Create_Configs.sh', 'wt') as f:
f.write(self.create_dat_file())
time.sleep(1)
os.system('bash Create_Configs.sh')
with open('Submit_Jobs.SL', 'wt') as f:
f.write(self.to_shell())
time.sleep(1)
os.system('sbatch Submit_Jobs.SL')
time.sleep(1)
os.remove('Submit_Jobs.SL')
if self.server_name.upper() == 'CPU':
os.remove('Create_Configs.sh')
# alg = self._batch_params['ALGORITHM']
# os.remove(f'exports_{alg}.dat')
def __call__(self):
return self.run_batch()
#!/bin/bash
# SLURM submission script for submitting multiple serial jobs on Niagara
#
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=40
#SBATCH --job-name __TASK_____ALGORITHM__
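# Meta-parameter sweeps; each __PLACEHOLDER__ is replaced by JobBuilder with a space-separated list of values.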
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
source ~/RLENV/bin/activate
module load NiaEnv/2019b
module load gnu-parallel
module load python
cd $SLURM_SUBMIT_DIR || exit
export OMP_NUM_THREADS=1
echo "The number of available cores is echo $NCORES"
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo "Starting run at: $(date)"
HOSTS=$(scontrol show hostnames $SLURM_NODELIST | tr '\n' ,)
NCORES=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
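# Launch one Learning.py process per combination of the meta-parameter lists, spread over all allocated cores.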
parallel --env OMP_NUM_THREADS,PATH,LD_LIBRARY_PATH --joblog slurm-$SLURM_JOBID.log -j $NCORES -S $HOSTS --wd $PWD \
python Learning.py ::: -sp ::: ${save_path} ::: -e ::: ${environment} ::: -alg ::: ${algorithm} ::: -t ::: ${task[@]} \
::: -a ::: ${alpha[@]} ::: -nr ::: ${num_of_runs} ::: -ns ::: ${num_steps} ::: -et ::: ${eta[@]} \
::: -l ::: ${lmbda[@]} ::: -z ::: ${zeta[@]} ::: -tb ::: ${tdrc_beta[@]} ::: -b ::: ${beta[@]} ::: \
-ga ::: ${gem_alpha[@]} ::: -gb ::: ${gem_beta[@]} ::: -ss ::: ${sub_sample}
echo "Program test finished with exit code $? at: $(date)"
#!/bin/bash
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --cpus-per-task=1
#SBATCH --mem=3G
#SBATCH --array=1-__NUM_OF_JOBS__
#SBATCH --job-name __NAME_OF_EXP__
alg=__ALG__
source ~/RLENV/bin/activate
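# Evaluate line ${SLURM_ARRAY_TASK_ID} of exports_${alg}.dat, exporting this array task's hyper-parameters.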
`sed -n "${SLURM_ARRAY_TASK_ID}p" <exports_${alg}.dat`
echo ${SLURM_ARRAY_TASK_ID} $ALPHA $LMBDA $ETA $BETA $ZETA $TDRCBETA $GEMALPHA $GEMBETA $NUMOFRUNS $NUMSTEPS $SUBSAMPLE
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo
echo "Starting run at: $(date)"
python Learning.py \
-a $ALPHA -l $LMBDA -et $ETA -b $BETA -z $ZETA -tb $TDRCBETA -ga $GEMALPHA -gb $GEMBETA -alg $ALGORITHM -t $TASK \
-nr $NUMOFRUNS -e $ENVIRONMENT -sp $SAVE_PATH -ns $NUMSTEPS -ss $SUBSAMPLE
echo "Program test finished with exit code $? at: $(date)"
import os
import numpy as np
import argparse
from data_presister import DataPersister, ParameterBuilder
from utils import save_result, Configuration, save_value_function, get_save_value_function_steps
from Registry.AlgRegistry import alg_dict
from Registry.EnvRegistry import environment_dict
from Registry.TaskRegistry import task_dict
from Job.JobBuilder import default_params
from Environments.rendering import ErrorRender
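# Run config.num_of_runs independent runs of the selected algorithm on the selected task, record the
# RMSVE of every target policy at every step, and save mean/stderr learning curves plus AUC and final
# performance summaries.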
def learn(config: Configuration):
params = ParameterBuilder().add_algorithm_params(config).build()
if not os.path.exists(config.save_path):
os.makedirs(config.save_path, exist_ok=True)
env = environment_dict[config.environment]()
rmsve = np.zeros((task_dict[config.task].num_of_policies(), config.num_steps, config.num_of_runs))
for run in range(config.num_of_runs):
random_seed = (run + config.num_of_runs) if config.rerun else run
np.random.seed(random_seed)
task = task_dict[config.task](run_number=run, num_steps=config.num_steps)
agent = alg_dict[config.algorithm](task, **params)
rmsve_of_run = np.zeros((task.num_policies, task.num_steps))
agent.state = env.reset()
error_render = ErrorRender(task.num_policies, task.num_steps)
for step in range(task.num_steps):
rmsve_of_run[:, step], error = agent.compute_rmsve()
if config.render:
error_render.add_error(error)
agent.action = agent.choose_behavior_action()
agent.next_state, r, is_terminal, info = env.step(agent.action)
agent.learn(agent.state, agent.next_state, r, is_terminal)
if config.render:
env.render(mode='screen', render_cls=error_render)
if config.save_value_function and (step in get_save_value_function_steps(task.num_steps)):
save_value_function(agent.compute_value_function(), config.save_path, step, run)
if is_terminal:
agent.state = env.reset()
agent.reset()
continue
agent.state = agent.next_state
print(np.mean(rmsve_of_run, axis=0))
rmsve[:, :, run] = rmsve_of_run
rmsve_of_runs = np.transpose(np.mean(rmsve, axis=0)) # Average over all policies.
# _RMSVE_mean_over_runs
DataPersister.save_result(np.mean(rmsve_of_runs, axis=0), '_RMSVE_mean_over_runs', config)
save_result(config.save_path, '_RMSVE_mean_over_runs', np.mean(rmsve_of_runs, axis=0), params, config.rerun)
# _RMSVE_stderr_over_runs
DataPersister.save_result(np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), '_RMSVE_stderr_over_runs', config)
save_result(config.save_path, '_RMSVE_stderr_over_runs',
np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), params, config.rerun)
# _mean_stderr_final
final_errors_mean_over_steps = np.mean(rmsve_of_runs[:, config.num_steps - int(0.01 * config.num_steps) - 1:],
axis=1)
DataPersister.save_result(np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), '_mean_stderr_final', config)
save_result(config.save_path, '_mean_stderr_final',
np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), params, config.rerun)
# _mean_stderr_auc
auc_mean_over_steps = np.mean(rmsve_of_runs, axis=1)
DataPersister.save_result(np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), '_mean_stderr_auc', config)
save_result(config.save_path, '_mean_stderr_auc',
np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), params, config.rerun)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', '-a', type=float, default=default_params['meta_parameters']['alpha'])
parser.add_argument('--lmbda', '-l', type=float, default=default_params['meta_parameters']['lmbda'])
parser.add_argument('--eta', '-et', type=float, default=default_params['meta_parameters']['eta'])
parser.add_argument('--beta', '-b', type=float, default=default_params['meta_parameters']['beta'])
parser.add_argument('--zeta', '-z', type=float, default=default_params['meta_parameters']['zeta'])
parser.add_argument('--tdrc_beta', '-tb', type=float, default=default_params['meta_parameters']['tdrc_beta'])
parser.add_argument('--gem_alpha', '-ga', type=float, default=default_params['meta_parameters']['gem_alpha'])
parser.add_argument('--gem_beta', '-gb', type=float, default=default_params['meta_parameters']['gem_beta'])
parser.add_argument('--algorithm', '-alg', type=str, default=default_params['agent'])
parser.add_argument('--task', '-t', type=str, default=default_params['task'])
parser.add_argument('--num_of_runs', '-nr', type=int, default=default_params['num_of_runs'])
parser.add_argument('--num_steps', '-ns', type=int, default=default_params['num_steps'])
parser.add_argument('--sub_sample', '-ss', type=int, default=default_params['sub_sample'])
parser.add_argument('--environment', '-e', type=str, default=default_params['environment'])
parser.add_argument('--save_path', '-sp', type=str, default='-')
parser.add_argument('--rerun', '-rrn', type=bool, default=False)
parser.add_argument('--render', '-rndr', type=bool, default=False)
parser.add_argument('--save_value_function', '-svf', type=bool, default=default_params['save_value_function'])
args = parser.parse_args()
if args.save_path == '-':
args.save_path = os.path.join(os.getcwd(), 'Results', default_params['exp'], args.algorithm)
learn(config=Configuration(vars(args)))
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXP_ATTRS, AUC_AND_FINAL
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
plot_alpha = 1.0
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global plot_alpha
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
if alg == 'ETD':
color = 'red'
elif alg == 'ETDLB':
color = 'grey'
plot_alpha -= 0.1
else:
color = 'black'
ax.plot(alphas, performance, label=lbl, linestyle='-', marker='o',
linewidth=2, markersize=5, color=color, alpha=plot_alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=plot_alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
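# Overlay the step-size sensitivity of ETD and of every ETDLB parameter setting on one axis per
# experiment, performance measure, and lambda, and save the figure as a PDF.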
def plot_all_sensitivities_per_alg_emphatics(**kwargs):
global plot_alpha
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
plot_alpha = 1.0
alg = 'ETD'
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
current_params = make_current_params(alg, sp, 0, 0)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, 0, performance, stderr, exp_attrs)
alg = 'ETDLB'
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf',
'orange', '#8c564b', '#e377c2', '#2ca02c',
'#bcbd22', '#d62728']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
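# Same as plot_sensitivity, but draws a second, reference algorithm with a dashed line on the same axis.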
def plot_extra_alg_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter - 1]
alpha = 1.0
if alg == 'TDRC':
color = ALG_COLORS[alg]
alpha = 1.0
linestyle = '--'
# if alg == 'GTD2':
# linestyle = '-'
# alpha=1.0
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=3, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=3, markersize=5,
color=color, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_all_sensitivities_per_alg_gradients(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
if COUNTER % 2 == 0:
COUNTER += 1
continue
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD2':
extra_alg = 'GTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'PGTD2':
extra_alg = 'GTD2'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD':
extra_alg = 'HTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'HTD':
extra_alg = 'TDRC'
current_params['eta'] = 1.0
current_params['tdrc_beta'] = 1.0
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf', 'orange', '#8c564b', '#e377c2', '#2ca02c','#bcbd22',
'#d62728', 'black', 'cyan']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_all_sensitivities_per_alg_gradients_all_eta(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
if alg == 'TDRC':
_, _, tp_list, _, _ = make_params('GTD', exp)
fop_list = kwargs['tdrc_beta']
for tp in tp_list:
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if alg == 'TDRC':
fig.savefig(
os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta_beta_{kwargs['tdrc_beta']}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
else:
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict, make_current_params, make_params, load_and_replace_large_nan_inf
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = True if is_smoothed else False
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
print(alg)
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']) + r' $\lambda=$ ' +
str(best_params.get('lmbda', best_params.get('zeta', 0))))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def find_best_perf(alg, exp, auc_or_final):
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
for sp in sp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
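# Plot learning curves using, for each algorithm, the overall best meta-parameter setting returned by find_best_perf.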
def plot_learning_curve_best_overall_params(**kwargs):
is_smoothed = kwargs.get('is_smoothed', False)
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
# ls_rmsve = get_ls_rmsve(alg, exp, sp)
# plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = find_best_perf(alg, exp, auc_or_final)
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}AllLmbda.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import numpy as np
import matplotlib.pyplot as plt
def load_d_mu(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'd_mu.npy'))
def load_state_values(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'state_values.npy'))
def plot_d_mu(ax, d_mu, active_states):
ax.plot(d_mu, linewidth=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
x_labels = list(active_states)
x_ticks = [x for x in range(len(x_labels))]
ax.xaxis.set_ticks(x_ticks)
ax.set_xticklabels(x_labels)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.yaxis.set_ticks([0, 0.005, 0.01, 0.015, 0.02, 0.025])
ax.set_ylim([0.00, 0.025])
ax.set_yticklabels([])
# ax.set_xticklabels([])
def find_active_states(task, d_mu, state_values, policy_no=0):
if task == 'EightStateCollision':
return [x for x in range(d_mu.shape[0])]
return np.where(state_values[policy_no] > 0)[0]
def get_active_d_mu(task, d_mu, active_states, policy_no=0):
if task == 'EightStateCollision':
return d_mu
return d_mu[active_states, policy_no].squeeze()
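# Plot the state-visitation distribution d_mu over each policy's active states (all states for EightStateCollision).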
def plot_distribution(**kwargs):
task = kwargs['task']
d_mu = load_d_mu(task)
state_values = load_state_values(task)
for policy_no in range(state_values.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task, d_mu, state_values, policy_no)
active_d_mu = get_active_d_mu(task, d_mu, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
plt.show()
if task == 'EightStateCollision':
break
def plot_dist_for_two_four_room_tasks(**kwargs):
task1 = 'LearnEightPoliciesTileCodingFeat'
task2 = 'HighVarianceLearnEightPoliciesTileCodingFeat'
save_dir = os.path.join('pdf_plots', 'Misc', 'CompareDistsFR')
d_mu1 = load_d_mu(task1)
d_mu2 = load_d_mu(task2)
state_values1 = load_state_values(task1)
state_values2 = load_state_values(task2)
for policy_no in range(state_values1.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task1, d_mu1, state_values1, policy_no)
active_d_mu = get_active_d_mu(task1, d_mu1, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
active_states = find_active_states(task2, d_mu2, state_values2, policy_no)
active_d_mu = get_active_d_mu(task2, d_mu2, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"dist_policy_{policy_no}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = True if is_smoothed else False
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def plot_learning_curve(**kwargs):
is_smoothed = kwargs.get('is_smoothed', False)
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import matplotlib.pyplot as plt
import numpy as np
import pylab
from Plotting.plot_params import ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import make_params, get_alphas, make_current_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_specific_params_dict(alg, exp, sp, tp):
if alg == 'TD':
return {'alpha': 0.25, 'lmbda': sp}
if alg == 'ETD':
return {'alpha': 0.00390625, 'lmbda': sp}
if alg == 'ETDLB':
return {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}
if alg == 'TDRC':
return {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}
if alg == 'GTD':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
if alg == 'PGTD2':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_all_learning_curves_for_third(**kwargs):
for exp in kwargs['exps']:
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'all_third_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in kwargs['algs']:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
for tp in kwargs['tp_list']:
for fp in get_alphas(alg, exp):
for fop in [1.0]:
current_params = make_current_params(alg, sp, tp, fop, fp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(kwargs['algs'])}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, RERUN_POSTFIX, \
PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
# noinspection DuplicatedCode
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
# noinspection DuplicatedCode
def plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
color = 'blue' if sp else 'red'
lbl = (alg + r' $\lambda=$ ' + str(sp))
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.tick_params(axis='x', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
# noinspection DuplicatedCode
def plot_learning_curve_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for alg_names in kwargs['alg_groups'].values():
for alg in alg_names:
if alg in ['LSETD', 'LSTD']:
continue
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
save_dir = os.path.join('pdf_plots', 'learning_curves_for_lambdas', auc_or_final)
for sp in kwargs['sp_list']:
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
print(alg, current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{alg}{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
# plt.show()
plt.close(fig)
from Plotting.plot_utils import FirstChainAttr, FirstFourRoomAttr, HVFirstFourRoomAttr
from Registry.AlgRegistry import alg_dict
PLOT_RERUN = True
PLOT_RERUN_AND_ORIG = False
if PLOT_RERUN and PLOT_RERUN_AND_ORIG:
PLOT_RERUN_AND_ORIG = False
RERUN_POSTFIX = '_rerun'
DEBUG_MODE = True
# noinspection SpellCheckingInspection
COLORS = ['#000000', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22",
"#17becf"]
ALG_COLORS = {alg_name: color for alg_name, color in zip(alg_dict.keys(), COLORS)}
ALG_COLORS['LSTD'] = ALG_COLORS['TD']
ALG_COLORS['LSETD'] = ALG_COLORS['ETD']
ALG_GROUPS = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD']}
EXPS = ['1HVFourRoom', 'FirstFourRoom', 'FirstChain']
ALGS = [key for key in alg_dict.keys()]
ALGS.remove('LSTD')
ALGS.remove('LSETD')
# ALGS.remove('TDRC')
ALL_ALGS = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', 'LSTD', 'LSETD']
# ALL_ALGS = ['TD', 'Vtrace', 'TB', 'ABTD']
LMBDA_AND_ZETA = [0.0, 0.9]
AUC_AND_FINAL = ['auc', 'final']
EXP_ATTRS = {'FirstChain': FirstChainAttr, 'FirstFourRoom': FirstFourRoomAttr, '1HVFourRoom': HVFirstFourRoomAttr}
if DEBUG_MODE:
EXPS = ['FirstFourRoom', '1HVFourRoom']
# ALGS = ['GTD']
# ALL_ALGS.remove('ETDLB')
# ALL_ALGS.remove('LSTD')
# ALL_ALGS.remove('LSETD')
# LMBDA_AND_ZETA = [0.9]
AUC_AND_FINAL = ['final']
# ALG_GROUPS = {'main_algs': ALL_ALGS}
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
PLOT_RERUN_AND_ORIG, RERUN_POSTFIX
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = ALG_COLORS[alg]
if alg == 'TD':
color = 'grey'
alpha=0.7
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_sensitivity_curve(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN, PLOT_RERUN_AND_ORIG, RERUN_POSTFIX, ALGS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = 'blue' if sp else 'red'
if sp not in [0.0, 1.0]:
alpha = 0.3
color = 'grey'
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_min(ax, min_performance):
print(min_performance)
ax.plot([pow(2, -3), pow(2, -2)], [min_performance, min_performance], linewidth=0.2, alpha=0.2)
# ax.axhline(y=min_performance, xmin=pow(2, -3), xmax=pow(2, -2))
def plot_sensitivity_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves_for_lambdas', exp, auc_or_final)
for alg in kwargs['algs']:
min_performance = 1_000
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for sp in kwargs['sp_list']:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, True)
if min(best_performance) < min_performance:
min_performance = min(best_performance)
if kwargs.get('plot_min_performance', False):
plot_min(ax, min_performance)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{alg}{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX, ALGS, ALL_ALGS
from Plotting.plot_utils import load_best_rerun_params_dict, make_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, flag=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
if alg == 'TDRC':
alpha = 0.6
if flag:
color = 'green'
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_specific_learning_curves(**kwargs):
specific_params = kwargs['specific_params']
exp = kwargs['exp']
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in AUC_AND_FINAL:
sp = kwargs['sp']
save_dir = os.path.join('pdf_plots', 'specific_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=(10, 4))
for alg in kwargs['algs']:
flag = False
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
print(alg, exp, sp)
if alg == 'PGTD22':
flag = True
alg = 'PGTD2'
current_params = specific_params[alg]
current_params['eta'] = 1.0
current_params['alpha'] = 0.03125
else:
current_params = specific_params[alg]
print(current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, False, flag)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(ALGS)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import argparse
import json
import numpy as np
import os
from Job.JobBuilder import default_params
from Registry.AlgRegistry import alg_dict
from utils import create_name_for_save_load
def make_res_path(alg, exp):
return os.path.join(os.getcwd(), 'Results', exp, alg)
def make_exp_path(alg, exp):
return os.path.join(os.getcwd(), 'Experiments', exp, alg)
def load_best_rerun_params_dict(alg, exp, auc_or_final, sp):
res_path = make_res_path(alg, exp)
with open(os.path.join(res_path, f"{auc_or_final}_{sp}.json")) as f:
return json.load(f)['meta_parameters']
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def load_best_rerun_params(alg, exp, auc_or_final, sp):
best_res_dict = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
best_fp = best_res_dict.get('alpha', 0)
best_tp = best_res_dict.get('eta', best_res_dict.get('beta', 0))
best_fop = best_res_dict.get('tdrc_beta', 0)
return best_fp, best_tp, best_fop
def make_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', '-n', type=str, default='1HVFourRoom')
# 1HVFourRoom or FirstFourRoom or FirstChain
return parser.parse_args()
def rename_best_old_result(res_path, params_dict, file_name):
name_to_save = create_name_for_save_load(param_dict=params_dict)
path_and_name = os.path.join(res_path, name_to_save)
file_name = path_and_name + file_name
os.rename(file_name + '.npy', file_name + '_old.npy')
def load_best_perf_json(alg, exp, sp, auc_or_final):
res_path = make_res_path(alg, exp)
res_path = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(res_path, 'r') as f:
return json.load(f)
def load_exp_json_file(alg, exp):
res_path = make_res_path(alg, exp)
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
with open(exp_path) as f:
return json.load(f), res_path
def make_params(alg_name, exp_name):
params = dict()
alg_param_names = alg_dict[alg_name].related_parameters()
json_content, res_path = load_exp_json_file(alg_name, exp_name)
json_exp_params = json_content.get('meta_parameters')
for param in alg_param_names:
params[param] = json_exp_params.get(param, default_params['meta_parameters'][param])
if not isinstance(params[param], list):
params[param] = list([params[param]])
fp_list = params.get('alpha', params['alpha'])
tp_list = [0.0]
fop_list = [0.0]
if 'lmbda' in params:
sp_list = params['lmbda']
else:
sp_list = params['zeta']
if 'eta' in params:
tp_list = params['eta']
elif 'beta' in params:
tp_list = params['beta']
if 'tdrc_beta' in params:
fop_list = params['tdrc_beta']
if alg_name == 'TDRC':
tp_list, fop_list = [1.0], [1.0]
return fp_list, sp_list, tp_list, fop_list, res_path
def make_current_params(alg_name, sp, tp, fop, fp=0):
current_params = {'alpha': fp}
alg_param_names = alg_dict[alg_name].related_parameters()
if 'lmbda' in alg_param_names:
current_params['lmbda'] = sp
else:
current_params['zeta'] = sp
if 'eta' in alg_param_names:
current_params['eta'] = tp
elif 'beta' in alg_param_names:
current_params['beta'] = tp
if 'tdrc_beta' in alg_param_names:
current_params['tdrc_beta'] = fop
return current_params
def get_alg_names(exp_name):
path = os.path.join(os.getcwd(), 'Experiments', exp_name)
alg_names = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
return alg_names
def load_sample_json_for_exp(exp):
alg = get_alg_names(exp)[0]
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
if not os.path.exists(exp_path):
print('No algorithms exist in the experiment directory...')
raise FileExistsError
with open(exp_path) as f:
json_exp_params = json.load(f)
return json_exp_params
def load_and_replace_large_nan_inf(load_file_name, large, replace_with):
current_perf = np.load(load_file_name)
return replace_large_nan_inf(current_perf, large=large, replace_with=replace_with)
class FirstChainAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 5000, 10000, 15000, 20000]
self.x_tick_labels = [0, '5', '10', '15', '20']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.68910
self.ok_error = 0.4
class FirstFourRoomAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 10000, 20000, 30000, 40000, 50000]
self.x_tick_labels = [0, '10', '20', '30', '40', '50']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.72672
self.ok_error = 0.4
class HVFirstFourRoomAttr(FirstFourRoomAttr):
def __init__(self, exp_name):
super(HVFirstFourRoomAttr, self).__init__(exp_name)
def replace_large_nan_inf(arr, large=1.0, replace_with=2.0):
arr[np.isnan(arr)], arr[np.isinf(arr)], arr[arr > large] = replace_with, replace_with, replace_with
return arr
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
RERUN_POSTFIX
from Plotting.plot_utils import make_current_params, replace_large_nan_inf, make_params
from utils import create_name_for_save_load
np.random.seed(0)
def load_all_performances(alg, exp, auc_or_final, sp, exp_attrs):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
all_performance = np.zeros((len(fp_list), len(tp_list), len(fop_list)))
for i, fop in enumerate(fop_list):
for j, tp in enumerate(tp_list):
current_params = make_current_params(alg, sp, tp, fop)
load_file_name = os.path.join(res_path, create_name_for_save_load(
current_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
if PLOT_RERUN and auc_or_final == 'auc':
load_file_name_rerun = load_file_name.replace('.npy', f"{RERUN_POSTFIX}.npy")
if os.path.isfile(load_file_name_rerun):
load_file_name = load_file_name_rerun
performance = np.load(load_file_name)
performance = replace_large_nan_inf(performance, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_waterfall)
all_performance[:, j, i] = performance
return all_performance
def plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs):
global ticker, x_axis_names, x_axis_ticks
performance_to_plot = np.array(all_performance.flatten())
percentage_overflowed = round((performance_to_plot > exp_attrs.learning_starting_point).sum() /
performance_to_plot.size, 2)
ok_percentage = round((performance_to_plot < exp_attrs.ok_error).sum() /
performance_to_plot.size, 2)
print(alg, 'percentage_overflowed', percentage_overflowed)
# print(alg, 'OK_percentage', ok_percentage)
color = ALG_COLORS[alg]
ax.scatter([(ticker + 1)] * performance_to_plot.shape[0] + np.random.uniform(
-0.25, 0.25, performance_to_plot.shape[0]), performance_to_plot, marker='o',
facecolors='none', color=color)
x_axis_ticks.append(ticker + 1)
ticker = (ticker + 1) % len(alg_names)
ax.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=True) # labels along the bottom edge are off
x_axis_names.append(f'{alg}_{percentage_overflowed}')
ax.xaxis.set_ticks(x_axis_ticks)
ax.set_xticklabels(x_axis_names)
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ticker, x_axis_names, x_axis_ticks = 0.0, [''], [0]
def plot_waterfall_scatter(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'waterfalls', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
global ticker, x_axis_names, x_axis_ticks
ticker, x_axis_names, x_axis_ticks = -0.5, [''], [0]
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
all_performance = load_all_performances(alg, exp, auc_or_final, sp, exp_attrs)
plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_waterfall_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import numpy as np
import matplotlib.pyplot as plt
class ValueFunctionProcessor:
def __init__(self, exp, alg):
result_dir = os.path.join(os.getcwd(), 'Results', exp, alg, 'Sample_value_function')
self.all_value_functions = dict()
self.all_value_functions_of_last_step = dict()
for value_function_name in os.listdir(result_dir):
value_function = np.load(os.path.join(result_dir, value_function_name))
step, run_num = (int(i) for i in value_function_name.replace('.npy', '').split('_'))
self.all_value_functions[(step, run_num)] = value_function
if (step == 19999 and exp == 'FirstChain') or (step == 49999 and exp == 'FirstFourRoom') or (
step == 49999 and exp == '1HVFourRoom'):
self.all_value_functions_of_last_step[run_num] = value_function
def get_value_function_by_step_and_run(self, step, run):
return self.all_value_functions[(step, run)]
def get_value_function_for_last_step(self, run):
return self.all_value_functions_of_last_step[run]
# STEPS = [199, 999, 1999, 3999, 9999, 19999]
STEPS = [199, 1999, 19999]
# STEPS = [19999]
RUNS = [0, 10, 15, 20, 30, 45]
# RUNS = list(range(50))
EXPS = ['FirstChain'] # FirstChain or FirstFourRoom or 1HVFourRoom
ALGS = ['TD']
TASK = 'EightStateCollision'
def plot_value_function(ax, value_function, step=0, run=0, is_last_step=False):
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(0, 1.0)
label = f"{step}_{run}"
line_style = '-'
line_width = 4
alpha = 1.0
color = 'blue'
if not step:
line_style = '--'
if not step and is_last_step:
line_style = '-'
if is_last_step:
line_width = 2
alpha = 0.2
color = 'red'
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha, color=color)
else:
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
for run in RUNS:
fig, ax = plt.subplots(figsize=(8, 3))
for step in STEPS:
value_function = value_processor.get_value_function_by_step_and_run(step, run)
plot_value_function(ax, value_function, step, run)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"{run}_value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
def plot_all_final_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions', 'asymptotic_value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
fig, ax = plt.subplots(figsize=(8, 3))
for run in range(50):
value_function = value_processor.get_value_function_for_last_step(run)
plot_value_function(ax, value_function, is_last_step=True)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
<p align="center">
<img width="100" src="/Assets/rlai.png" />
</p>
<br>
<div align="center">
:steam_locomotive::train::train::train::train::train:
</div>
<h2 align=center>An Empirical Comparison of Off-policy Prediction Learning Algorithms on the Collision Task</h2>
This repository includes the code for the "empirical off-policy" paper.
<br>
<p align="center">
<img src="/Assets/FourRoomGridWorld.gif" />
<img src="/Assets/chain.gif" />
</p>
## Table of Contents
- **[Specification of Dependencies](#specifications)**
- **[Algorithms](#algorithms)**
- **TD**: [Off-policy TD](#td)
- **Gradient-TD family** : [GTD](#gtd), [GTD2](#gtd2), [HTD](#htd), [PGTD2](#pgtd2), [TDRC](#tdrc)
- **Emphatic-TD family** : [Emphatic TD](#etd), [Emphatic TDβ](#etdb)
- **Variable-λ family** : [TB](#tb), [Vtrace](#vtrace), [ABTD](#abtd)
- **[Algorithm Glossary](#glossary)**
- **[Environments](#environment)** : [Chain](#chain_env), [Four Room Grid World](#four_room_grid_world)
- **[How to run the code](#how-to-run)**: [Learning.py](#learning.py), [Job Builder](#job_builder)
- **[Plotting the results](#Plot-results)**
<a name='specifications'></a>
## Specification of Dependencies
This code requires Python 3.5 or above. The packages required for running the code are all listed in the `requirements.txt`
file. To install these dependencies, run the following command if your `pip` is set to `python3.x`:
```text
pip install -r requirements.txt
```
otherwise, run:
```text
pip3 install -r requirements.txt
```
<a name='algorithms'></a>
## Algorithms
Each algorithm learns a weight vector, [**w**](#var_w), such that the dot product of [**w**](#var_w) and a state's feature
vector approximates the value of that state.
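For instance, with linear function approximation the estimated value of a state is simply the inner product of the learned
weights and that state's feature vector. A minimal sketch (the variable names follow the [glossary](#glossary) below):
```python
import numpy as np

w = np.zeros(6)                          # learned weight vector (six features per state in the Collision task)
x = np.array([0., 0., 1., 0., 1., 1.])   # binary feature vector of one state
v_hat = np.dot(w, x)                     # approximate value of that state
```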
<a name='td'></a>
### Off-policy TD
**Paper** [Off-Policy Temporal-Difference Learning with Function Approximation](
https://www.cs.mcgill.ca/~dprecup/publications/PSD-01.pdf)<br>
**Authors** Doina Precup, Richard S. Sutton, Sanjoy Dasgupta<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * delta * z
```
### Gradient-TD algorithms
<a name='gtd'></a>
#### GTD/TDC
**Paper** [Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation](
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)<br>
**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri,
Eric Wiewiora<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x)
```
<a name='gtd2'></a>
#### GTD2
**Paper** [Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation](
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)<br>
**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri,
Eric Wiewiora<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (np.dot(x, v) * x - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x)
```
<a name='htd'></a>
#### HTD
**Paper** [Investigating Practical Linear Temporal Difference Learning](
https://arxiv.org/pdf/1602.08771.pdf)<br>
**Authors** Adam White, Martha White<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
z_b = gamma * lmbda * z_b + x
w += alpha * ((delta * z) + (x - gamma * x_p) * np.dot((z - z_b), v))
v += alpha_v * ((delta * z) - (x - gamma * x_p) * np.dot(v, z_b))
```
<a name='pgtd2'></a>
#### Proximal GTD2
**Paper** [Proximal Gradient Temporal Difference Learning: Stable Reinforcement Learning with Polynomial Sample Complexity](
https://arxiv.org/pdf/2006.03976.pdf)<br>
**Authors** Bo Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji Liu, Sridhar Mahadevan, Marek Petrik<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
v_mid = v + alpha_v * (delta * z - np.dot(x, v) * x)
w_mid = w + alpha * (np.dot(x, v) * x - (1 - lmbda) * gamma * np.dot(z, v) * x_p)
delta_mid = r + gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
w += alpha * (np.dot(x, v_mid) * x - gamma * (1 - lmbda) * np.dot(z, v_mid) * x_p)
v += alpha_v * (delta_mid * z - np.dot(x, v_mid) * x)
```
<a name='tdrc'></a>
#### TDRC
**Paper** [Gradient Temporal-Difference Learning with Regularized Corrections](
http://proceedings.mlr.press/v119/ghiassian20a/ghiassian20a.pdf)<br>
**Authors** Sina Ghiassian, Andrew Patterson, Shivam Garg, Dhawal Gupta, Adam White, Martha White <br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x) - alpha_v * tdrc_beta * v
```
### Emphatic-TD algorithms
<a name='etd'></a>
#### Emphatic TD
**Paper** [An Emphatic Approach to the Problem of Off-policy Temporal-Difference Learning](
https://jmlr.org/papers/volume17/14-488/14-488.pdf)<br>
**Authors** Richard S. Sutton, A. Rupam Mahmood, Martha White<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
F = gamma * old_rho * F + 1
m = lmbda * 1 + (1 - lmbda) * F
z = rho * (x * m + gamma * lmbda * z)
w += alpha * delta * z
```
<a name='etdb'></a>
#### Emphatic TDβ
**Paper** [Generalized Emphatic Temporal Difference Learning: Bias-Variance Analysis](
https://ojs.aaai.org/index.php/AAAI/article/view/10227/10086)<br>
**Authors** Assaf Hallak, Aviv Tamar, Remi Munos, Shie Mannor<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
F = beta * old_rho * F + 1
m = lmbda * 1 + (1 - lmbda) * F
z = rho * (x * m + gamma * lmbda * z)
w += alpha * delta * z
```
### Variable-λ algorithms
<a name='tb'></a>
#### Tree backup/ Tree backup for prediction
**Paper** [Eligibility Traces for Off-Policy Policy Evaluation](
https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1079&=&context=cs_faculty_pubs&=&sei-redir=1&referer=https%253A%252F%252Fscholar.google.com%252Fscholar%253Fhl%253Den%2526as_sdt%253D0%25252C5%2526q%253Dtree%252Bbackup%252Balgorithm%252Bdoina%252Bprecup%2526btnG%253D#search=%22tree%20backup%20algorithm%20doina%20precup%22)<br>
**Authors** Doina Precup, Richard S. Sutton, Satinder Singh<br>
The algorithm pseudo-code described below is the prediction variant of the original Tree backup algorithm proposed by
Precup, Sutton, and Singh (2000). The prediction variant of the algorithm used here is first derived in the current paper.
```python
delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x))
z = gamma * lmbda * old_pi * z + x
w = w + alpha * delta * z
```
<a name='vtrace'></a>
#### Vtrace (simplified)
**Paper** [IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures](
http://proceedings.mlr.press/v80/espeholt18a/espeholt18a.pdf)<br>
**Authors** Lasse Espeholt, Hubert Soyer, Remi Munos, Karen Simonyan, Volodymyr Mnih, Tom Ward, Yotam Doron, Vlad Firoiu, Tim Harley, Iain Dunning, Shane Legg, Koray Kavukcuoglu <br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = min(1, rho) * (gamma * lmbda * z + x)
w += alpha * delta * z
```
<a name='abtd'></a>
#### ABQ/ABTD
**Paper** [Multi-step Off-policy Learning Without Importance Sampling Ratios](
https://arxiv.org/pdf/1702.03006)<br>
**Authors** A. Rupam Mahmood, Huizhen Yu, Richard S. Sutton <br>
The algorithm pseudo-code described below is the prediction variant of the original ABQ algorithm proposed by
Mahmood, Yu, and Sutton (2017). The prediction variant of the algorithm used here is first derived in the current paper.
This algorithm first needs to compute the following:
```python
xi_zero = 1
xi_max = 2
xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
```
`xi_zero` and `xi_max` are specifically computed here for the Collision problem.
To see how these are computed for the task see the original paper referenced above.
```python
nu = min(xi, 1.0 / max(pi, mu))
delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x))
z = x + gamma * old_nu * old_pi * z
w += alpha * delta * z
```
<a name='glossary'></a>
### Algorithm Glossary
Here, we briefly explain all the symbols and variable names that we use in our implementation.
#### meta-parameters
- Common parameters of all algorithms:
- alpha (α): is the step size that defines how much the weight vector [**w**](#var_w) is updated at each time step.
- lambda (λ): is the bootstrapping parameter.
- Common parameters of Gradient-TD algorithms:
- alpha_v (α<sub>v</sub>): is the second step size that defines how much the second weight vector [**v**](#var_v) is
updated at each time step.
- beta (β): is the parameter used by the [**ETDβ**](#etdb) algorithm that defines how much the product of importance sampling ratios
from the past affects the current update.
- tdrc_beta (tdrc<sub>β</sub>): is the regularization parameter of the [**TDRC**](#tdrc) algorithms. This parameter is often set to 1.
- zeta (ζ): is only used in the [**ABTD**](#abtd) algorithm. It is similar to the bootstrapping parameter of other algorithms.
#### Algorithms variables
<a name='var_w'></a>
- **w**: is the main weight vector being learned. ```init: w=0```.
<a name='var_v'></a>
- **v**: is the secondary weight vector learned by Gradient-TD algorithms. ```init: v=0```.
<a name='var_z'></a>
- **z**: is the eligibility trace vector. ```init: z=0```.
<a name='var_zb'></a>
- **z<sub>b</sub>**: is the extra eligibility trace vector used by [**HTD**](#htd). ```init: z_b=0```.
<a name='var_delta'></a>
- delta (𝛿): is the td-error, which, in the full bootstrapping case, is equal to the reward plus the discounted value of
the next state minus the value of the current state.
<a name='var_s'></a>
- s: is the current state (scalar).
<a name='var_x'></a>
- **x**: is the feature vector of the current state.
<a name='var_s_p'></a>
- s_p: is the next state (scalar).
<a name='var_x_p'></a>
- **x_p**: is the feature vector of the next state.
<a name='var_r'></a>
- r: is the reward.
<a name='var_rho'></a>
- rho (ρ): is the importance sampling ratio, which is equal to the probability of taking an action under the target policy
divided by the probability of taking the same action under the behavior policy.
<a name='var_oldrho'></a>
- old_rho (oldρ): is the importance sampling ratio at the previous time step.
<a name='var_pi'></a>
- pi (π): is the probability of taking an action under the target policy at the current time step.
<a name='var_oldpi'></a>
- old_pi (oldπ): is the probability of taking an action under the target policy at the previous time step.
<a name='var_F'></a>
- F : is the follow-on trace used by [Emphatic-TD](#etd) algorithms.
<a name='var_m'></a>
- m : is the emphasis used by [Emphatic-TD](#etd) algorithms.
<a name='var_nu'></a>
- nu (ν): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation.
<a name='var_si'></a>
- xi (ψ): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation.
<a name='var_mu'></a>
- mu (μ): is the probability of taking an action under the behavior policy at the current time step.
<a name='var_oldmu'></a>
- old_mu (oldμ): is the probability of taking an action under the behavior policy at the previous time step.
- gamma (γ): is the discount factor parameter.
<a name='environment'></a>
## Environment
At the heart of an environment is an MDP.
The MDP defines the states, actions, rewards, transition probability matrix, and the discount factor.
<a name="chain_env"></a>
### Chain Environment and the Collision Task
<br>
<p align="center">
<img width="800" src="/Assets/eight_state_collision.png" />
</p>
<br>
An MDP with eight states is at the heart of the task.
The agent starts in one of the four leftmost states with equal probability.
One action is available in the four leftmost states: forward. Two actions are available in the four rightmost states:
forward and turn. By taking the forward action, the agent transitions one state to the right, and by taking the turn
action, it moves away from the wall and transitions to one of the four leftmost states equiprobably. Rewards are all
zero except for taking forward in state 8, for which a +1 is emitted. The termination function (discount factor) returns
0.9 for all transitions except for taking turn in any state or taking forward in state 8, for which it returns zero.
```python
import numpy as np

from Environments.Chain import Chain

env = Chain()
env.reset() # returns to one of the four leftmost states with equal probability.
for step in range(1, 1000):
action = np.random.randint(0, 2) # forward=0, turn=1
sp, r, is_wall = env.step(action=action)
if is_wall:
env.reset()
```
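The termination (discounting) behavior described above can be summarized in a small helper. This is only an illustrative
sketch under the conventions of the snippet above (forward=0, turn=1, states indexed 0 to 7); the task class implements it
through its `GAMMA` attribute and the environment's wall signal:
```python
def termination(s, action, forward=0, turn=1, last_state=7):
    # 0.9 for ordinary transitions; 0 when turning anywhere or when taking
    # forward from state 8 (index 7), i.e. when the agent hits the wall.
    if action == turn or (action == forward and s == last_state):
        return 0.0
    return 0.9
```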
We applied eleven algorithms to the Collision task: Off-policy TD(λ), GTD(λ), GTD2(λ), HTD(λ), Proximal GTD2(λ), TDRC(λ)
, ETD(λ), ETD(λ,β), Tree Backup(λ), Vtrace(λ), ABTD(ζ). The target policy was π(forward|·) = 1.0. The behavior policy
was b(forward|·) = 1.0 for the four leftmost states and b(forward|·) = 0.5, b(turn|·) = 0.5 for the four rightmost
states. Each algorithm was applied to the task with a range of parameters. We refer to an algorithm with a specific
parameter setting as an instance of that algorithm. Each algorithm instance was applied to the Collision task for
20,000 time steps, which we call a run. We repeated the 20,000 time steps for fifty runs. All instances of all
algorithms experienced the same fifty trajectories.
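With these policies the importance sampling ratio has a simple closed form. The sketch below is only for illustration (the
repository computes it from the task's `get_pi` and `get_mu` methods), assuming states are indexed 0 to 7 and forward=0:
```python
def rho(s, action, forward=0):
    # pi(a|s) / b(a|s): the target policy always takes forward; the behavior
    # policy takes forward with probability 1 in the four leftmost states and
    # with probability 0.5 in the four rightmost states.
    pi = 1.0 if action == forward else 0.0
    b = 1.0 if s < 4 else 0.5
    return pi / b
```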
Linear function approximation was used to approximate the true value function. Each state was represented by a
six-dimensional binary feature vector. The feature representation of each state had exactly three zeros and three ones.
The locations of the zeros and ones were selected randomly. This selection was repeated at the beginning of each run,
so the representation used in one run most probably differs from the other runs. At the beginning of each run we set
**w**<sub>0</sub> = **0**, and thus the error is the same for all algorithms at the beginning of the runs.
#### Feature representation
The feature representation for the Collision task is an array of size `8, 6, 50`, where 8 corresponds to the number of
states, 6 corresponds to the number of features for each state, and 50 corresponds to the number of runs.
The feature representations used for the set of results presented here and in the paper are saved in:
```
Resources/EightStateCollision/feature_rep.npy
```
Note that the feature representation for each run is different in the Collision task.
For example, the feature representation for the first run is:
```
array([[0., 0., 1., 0., 1., 1.],
[1., 1., 1., 0., 0., 0.],
[0., 1., 1., 0., 0., 1.],
[1., 0., 1., 1., 0., 0.],
[1., 1., 0., 0., 1., 0.],
[0., 1., 1., 1., 0., 0.],
[1., 1., 0., 0., 0., 1.],
[1., 0., 1., 0., 0., 1.]])
```
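To get the features of a particular run, index the last dimension of the array. A minimal sketch:
```python
import numpy as np

features = np.load('Resources/EightStateCollision/feature_rep.npy')  # shape (8, 6, 50)
run_features = features[:, :, 0]  # the 8 x 6 binary feature matrix used by the first run
```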
#### State distribution induced by the behavior policy
To compute an approximation of the mean squared value error at each time step, the state weighting induced by the
behavior policy was approximated by following the behavior policy for 20,000,000 time steps and computing the fraction
of time spent in each state. The resulting distribution is saved in:
```
Resources/EightStateCollision/d_mu.npy
```
`d_mu.npy` is a one dimensional numpy array of size `8`:
```
array([0.05715078, 0.1142799 , 0.17142456, 0.22856842, 0.22856842, 0.11428067, 0.05715311, 0.02857415])
```
#### True state values
To compute an approximation of the mean squared value error at each time step, we need the true state values.
Luckily, for the Collision task, these values are easy to compute.
We computed these true values by following the target policy from each state to the wall once.
The resulting values are saved in:
```
Resources/EightStateCollision/state_values.npy
```
`state_values.npy` is a one dimensional numpy array of size `8`:
```
array([0.4782969, 0.531441, 0.59049, 0.6561, 0.729, 0.81, 0.9, 1])
```
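Putting the three resource files together, the root mean squared value error for a given weight vector can be computed as
below. This is only a sketch of the error measure described above, not the repository's own evaluation code; `w` stands for
whatever weights a learning algorithm currently holds:
```python
import numpy as np

feature_rep = np.load('Resources/EightStateCollision/feature_rep.npy')[:, :, 0]  # features of the first run
d_mu = np.load('Resources/EightStateCollision/d_mu.npy')                         # state weighting under the behavior policy
state_values = np.load('Resources/EightStateCollision/state_values.npy')         # true state values

w = np.zeros(feature_rep.shape[1])  # e.g. the initial weight vector w_0 = 0
v_hat = feature_rep @ w             # approximate value of each of the eight states
rmsve = np.sqrt(np.sum(d_mu * (v_hat - state_values) ** 2))
print(rmsve)                        # with w = 0 this is the starting error shared by all algorithms
```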
<a name='how-to-run'></a>
## How to Run the Code
The code can be run in two different ways.
One way is through `Learning.py`, which can be used to run small experiments on a local computer.
The other way is through the files inside the Job directory.
We explain each of these approaches below by means of an example.
### Running on Your Local Machine
Let's take the following example: applying Off-policy TD(λ) to the Collision task.
There are multiple ways of doing this.
The first way is to open a terminal, go into the root directory of the code, and run `Learning.py` with the proper parameters:
```
python3 Learning.py --algorithm TD --task EightStateCollision --num_of_runs 50 --num_steps 20000 --environment Chain
--save_value_function True --alpha 0.01 --lmbda 0.9
```
In case any of the parameters are not specified, a default value will be used.
The default values are set in the `Job` directory, inside the `JobBuilder.py` file.
This means that the code can alternatively be run by setting all the values that an algorithm needs at the top of the `JobBuilder.py` file.
Note that not all parameters specified in the `default_params` dict are required for all algorithms. For example, the `tdrc_beta` parameter
only needs to be set for the TDRC(λ) algorithm.
Once the variables inside the `default_params` dictionary are set, the code can be run with:
```
python3 Learning.py
```
Or one can choose to specify some parameters in the `default_params` dictionary and specify the rest as command line arguments
like the following:
```
python3 Learning.py --algorithm TD --task EightStateCollision --alpha 0.01
```
### Running on Servers with Slurm Workload Managers
When parameter sweeps are necessary, the code can be run on supercomputers.
The current code supports running on servers that use Slurm workload managers, such as Compute Canada.
For example, to apply the TD algorithm to the Collision (EightStateCollision) task with various parameters,
first you need to create a json file that specifies all the parameters that you would like to run, for example:
```json
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateCollision",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
```
and then run `main.py` using python:
```
python3 main.py -f <path_to_the_json_file> -s <kind_of_submission>
```
where `kind_of_submission` refers to one of the two ways you can submit your code:
1) You can request an individual cpu for each of the algorithm instances, where an algorithm instance refers to an
algorithm with specific parameters. To request an individual cpu, run the following command:
```
python3 main.py -f <path_to_the_json_file_or_dir> -s cpu
```
When running each algorithm instance on a single cpu, you need to specify the following parameters inside
`Job/SubmitJobsTemplatesCedar.SL`:
```shell
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --mem=3G
```
where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
`#SBATCH --time=00:15:58` requires the time you want to request for each individual cpu,
and `#SBATCH --mem=xG` requires the amount of memory in place of x.
2) You can request a node, which we assume includes 40 cpus. If you request a node, the jobs you submit will run in
parallel 40 at a time, and once one job is finished, the next one in line will start running.
This process continues until either all jobs are finished running, or you run out of the time you requested for that node.
```
python3 main.py -f <path_to_the_json_file_or_dir> -s node
```
When running the jobs on nodes, you need to specify the following parameters inside `Job/SubmitJobsTemplates.SL`:
```shell
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=x
#SBATCH --ntasks-per-node=40
```
where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
`#SBATCH --time=11:58:59` requires the time you want to request for each individual node, each of which includes 40 cpus in this case,
and `#SBATCH --nodes=x` requires the number of nodes you would like to request in place of x.
If you request more than one node, your jobs will be spread across nodes, 40 on each node, and once each job finishes,
the next job in the queue will start running.
`#SBATCH --ntasks-per-node=xx` is the number of jobs you would like to run concurrently on a single node. In this case,
for example, we set it to 40.
If `path_to_the_json_file_or_dir` is a directory, then the code will walk into all the subdirectories and submit jobs for
all the parameters in the json files that it finds inside those directories sequentially.
If `path_to_the_json_file_or_dir` is a file, then the code will submit jobs for all the parameters that it finds inside that
single json file.
Note that you can create a new directory for each experiment that you would like to run, and create directories for each
of the algorithms you would like to run in that experiment.
For example, we created a directory called `FirstChain` inside the `Experiments` directory and created one directory
per algorithm inside the `FirstChain` directory for each of the algorithms and specified a json file in that directory.
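The resulting layout looks roughly like this (a sketch of the structure described above; each algorithm directory contains
one json file named after the algorithm):
```text
Experiments/
    FirstChain/
        TD/
            TD.json
        GTD/
            GTD.json
        ...
```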
It is worth noting that any parameter that is not specified in the json file will be read from the `default_params`
dictionary inside the `JobBuilder.py` file in the `Job` directory.
<a name='Plot-results'></a>
## Plotting the results
The following table shows all the parameters that we tested in the experiments:
<p align="center">
<img width="700" src="/Assets/parameters.png" />
</p>
We now explain how each figure in the paper can be reproduced.
All the figures of the paper can be reproduced using the `plot_data.py` file, once you run the Learning.py script with all the needed parameters.
If you do not have the results available, the `plot_data.py` script will return an error.
1) **Processing the data**: This step prepares the data so that it can be plotted over step sizes and as learning
curves averaged over runs.
The `process_data` script also re-runs the algorithms with their best parameters to eliminate possible maximization
bias, as explained in the paper.
This is a time-consuming step. If you do not want to do this step, simply set:
```python
PLOT_RERUN = False
```
in `Plotting/plot_params.py` and the code will ignore the re-running steps.
If you would like to eliminate the maximization bias, set:
```python
PLOT_RERUN = True
```
Finally, go to `plot_data.py` and set `func_to_run = 'process_data'`, and run the `plot_data.py` script.
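In other words, the two settings involved in this step are (as described above):
```python
# Plotting/plot_params.py
PLOT_RERUN = True        # set to False to skip the time-consuming re-runs

# plot_data.py
func_to_run = 'process_data'
```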
2) **Plotting the learned value functions**:
Go to `plot_data`, and set `func_to_run = 'plot_value_functions'` to plot
the learned value functions for some of the runs, or set `func_to_run = 'plot_all_final_value_functions'` to plot the
value functions learned by the last time step of all of the runs in one plot.
<p align="center">
<img src="/Assets/value_functions.png" />
</p>
<br>
3) **Plotting the learning curves with specific parameter values**:
Go to `plot_data`, and set `func_to_run = 'specific_learning_curves_full_bootstrap'`, and run the `plot_data.py`
script.
<br></br>
<p align="center">
<img width="450" src="/Assets/specific_learning_curves.png" />
</p>
<br>
4) **Plotting the parameter studies for step size for all algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_sensitivity_curves_for_many_lambdas'`, and run the script.
<br></br>
<p align="center">
<img src="/Assets/sensitivity_curves_of_all_algs.png" />
</p>
<br>
5) **Plotting the parameter sensitivity study of Emphatic-TD algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_emphatics_sensitivity_full_bootstrap'`, and run the script.
<br></br>
<p align="center">
<img width="550" src="/Assets/Emphatics_sensitivity.png" />
</p>
<br>
6) **Plotting the parameter sensitivity study of Gradient-TD algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_gradients_sensitivity_full_bootstrap'`, and run the script.
<br></br>
<p align="center">
<img width="850" src="/Assets/Gradients_sensitivity.png" />
</p>
<br>
from Algorithms.TD import TD
from Algorithms.GTD import GTD
from Algorithms.TDRC import TDRC
from Algorithms.GEMETD import GEMETD
from Algorithms.GTD2 import GTD2
from Algorithms.PGTD2 import PGTD2
from Algorithms.HTD import HTD
from Algorithms.ETDLB import ETDLB
from Algorithms.ETD import ETD
from Algorithms.ABTD import ABTD
from Algorithms.Vtrace import Vtrace
from Algorithms.TB import TB
from Algorithms.LSTD import LSTD
from Algorithms.LSETD import LSETD
alg_dict = {'TD': TD, 'Vtrace': Vtrace, 'GTD': GTD, 'ABTD': ABTD, 'ETD': ETD, 'TB': TB, 'GTD2': GTD2, 'HTD': HTD,
'ETDLB': ETDLB, 'PGTD2': PGTD2, 'TDRC': TDRC, 'GEMETD': GEMETD, 'LSTD': LSTD, 'LSETD': LSETD}
# alg_dict = {'TD': TD, 'GTD': GTD, 'GTD2': GTD2, 'PGTD2': PGTD2, 'HTD': HTD, 'TDRC': TDRC, 'ETD': ETD, 'ETDLB': ETDLB,
# 'TB': TB, 'Vtrace': Vtrace, 'ABTD': ABTD, 'LSTD': LSTD, 'LSETD': 'LSETD'}
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
environment_dict = {'FourRoomGridWorld': FourRoomGridWorld, 'Chain': Chain}
from Tasks.EightStateCollision import EightStateCollision
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
task_dict = {'EightStateCollision': EightStateCollision,
'LearnEightPoliciesTileCodingFeat': LearnEightPoliciesTileCodingFeat,
'HighVarianceLearnEightPoliciesTileCodingFeat': HighVarianceLearnEightPoliciesTileCodingFeat}
from abc import abstractmethod
import numpy as np
class BaseTask:
def __init__(self, **kwargs):
self.run_number = kwargs.get('run_number', 0)
self.num_steps = None
self.feature_rep = None
self.stacked_feature_rep = None # If learning more than one target policy at the same time
self.num_features = None
self.GAMMA = None
self.behavior_dist = None
self.state_values = None
self.num_policies = None
self.ABTD_xi_zero = None
self.ABTD_xi_max = None
def stack_feature_rep(self):
stacked_feature_rep = np.zeros((self.num_policies, self.feature_rep.shape[1], self.feature_rep.shape[0]))
for i in range(self.feature_rep.shape[0]):
stacked_x = np.tile(self.feature_rep[i, :], [self.num_policies, 1])
stacked_feature_rep[:, :, i] = stacked_x
return stacked_feature_rep
def get_active_policies(self, s):
...
def get_terminal_policies(self, s):
...
def generate_behavior_dist(self, total_steps):
...
@staticmethod
def num_of_policies():
...
@abstractmethod
def load_feature_rep(self):
...
@abstractmethod
def get_state_feature_rep(self, s):
...
@abstractmethod
def create_feature_rep(self):
...
@abstractmethod
def select_target_action(self, s, policy_id=0):
...
@abstractmethod
def select_behavior_action(self, s):
...
@abstractmethod
def get_pi(self, s, a):
...
@abstractmethod
def get_mu(self, s, a):
...
@abstractmethod
def load_behavior_dist(self):
return self.behavior_dist
@abstractmethod
def load_state_values(self):
return self.state_values
def __str__(self):
return f'task:{type(self).__name__}'
import numpy as np
from Environments.Chain import Chain
from Tasks.BaseTask import BaseTask
class EightStateCollision(BaseTask, Chain):
def __init__(self, **kwargs):
BaseTask.__init__(self, **kwargs)
Chain.__init__(self)
self._resource_root_path = kwargs.get('resource_root_path', 'Resources')
self.N = kwargs.get('n', 8)
self.feature_rep = self.load_feature_rep()
self.num_features = self.feature_rep.shape[1]
self.num_steps = kwargs.get('num_steps', 20000)
self.GAMMA = 0.9
self.behavior_dist = self.load_behavior_dist()
self.state_values = self.load_state_values()
self.num_policies = EightStateCollision.num_of_policies()
self.ABTD_xi_zero = 1
self.ABTD_xi_max = 2
@staticmethod
def num_of_policies():
return 1
def load_feature_rep(self):
return np.load(f'{self._resource_root_path}/{self.__class__.__name__}/feature_rep.npy')[:, :, self.run_number]
def create_feature_rep(self):
num_ones = 3
num_zeros = self.num_features - num_ones
for i in range(self.N):
random_arr = (np.array([0] * num_zeros + [1] * num_ones))
np.random.shuffle(random_arr)
self.feature_rep[i, :] = random_arr
def get_state_feature_rep(self, s):
return self.feature_rep[s, :]
def load_behavior_dist(self):
self.behavior_dist = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/d_mu.npy')
return self.behavior_dist
def load_state_values(self):
self.state_values = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/state_values.npy')
return self.state_values
def select_behavior_action(self, s):
if s < self.N / 2:
return self.RIGHT_ACTION
else:
return np.random.choice([self.RIGHT_ACTION, self.RETREAT_ACTION])
def select_target_action(self, s, policy_id=0):
return self.RIGHT_ACTION
def get_pi(self, s, a):
if a == self.RIGHT_ACTION:
return 1
else:
return 0
def get_mu(self, s, a):
if s < self.N / 2:
if a == self.RIGHT_ACTION:
return 1
else:
return 0
elif s >= self.N / 2:
return 0.5
else:
raise AssertionError
import numpy as np
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
class HighVarianceLearnEightPoliciesTileCodingFeat(LearnEightPoliciesTileCodingFeat):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.RANDOM_PROB = 0.97
def select_behavior_action(self, s):
random_num = np.random.random()
x, y = self.get_xy(s)
if x == 1 and (y == 1 or y == 8):
if random_num < self.RANDOM_PROB:
return self.ACTION_LEFT
else:
return np.random.choice([self.ACTION_UP, self.ACTION_RIGHT, self.ACTION_DOWN])
if x == 8 and (y == 1 or y == 8):
if random_num < self.RANDOM_PROB:
return self.ACTION_RIGHT
else:
return np.random.choice([self.ACTION_UP, self.ACTION_LEFT, self.ACTION_DOWN])
return np.random.choice([self.ACTION_UP, self.ACTION_DOWN, self.ACTION_LEFT, self.ACTION_RIGHT])
def get_mu(self, s, a):
x, y = self.get_xy(s)
if x == 1 and (y == 1 or y == 8):
if a == self.ACTION_LEFT:
return np.ones(self.num_policies) * self.RANDOM_PROB
# return 0.97
else:
return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0)
# return 0.01
if x == 8 and (y == 1 or y == 8):
if a == self.ACTION_RIGHT:
return np.ones(self.num_policies) * self.RANDOM_PROB
# return 0.97
else:
return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0)
# return 0.01
return super().get_mu(s, a)
import numpy as np
import random
from Environments.FourRoomGridWorld import FourRoomGridWorld
from Tasks.BaseTask import BaseTask
from utils import ImmutableDict
class LearnEightPoliciesTileCodingFeat(BaseTask, FourRoomGridWorld):
def __init__(self, **kwargs):
BaseTask.__init__(self)
FourRoomGridWorld.__init__(self)
self.feature_rep = self.load_feature_rep()
self.num_features = self.feature_rep.shape[1]
self.num_steps = kwargs.get('num_steps', 50000)
self.GAMMA = 0.9
self.behavior_dist = self.load_behavior_dist()
self.state_values = self.load_state_values()
self.ABTD_xi_zero = 1
self.ABTD_xi_max = 4
self.optimal_policies = ImmutableDict(
{
0: [
[lambda x, y: 0 <= x <= 3 and 2 <= y <= 4, [self.ACTION_DOWN, self.ACTION_RIGHT]],
[lambda x, y: 3 >= x >= 0 == y, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 4 and y == 1, [self.ACTION_RIGHT]],
[lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_DOWN]],
[lambda x, y: 4 == x and 2 <= y <= 4, [self.ACTION_DOWN]],
[lambda x, y: 4 == x and y == 0, [self.ACTION_UP]]
],
1: [
[lambda x, y: 2 <= x <= 4 and 0 <= y <= 3, [self.ACTION_LEFT, self.ACTION_UP]],
[lambda x, y: x == 0 and 0 <= y <= 3, [self.ACTION_RIGHT, self.ACTION_UP]],
[lambda x, y: x == 1 and 0 <= y <= 4, [self.ACTION_UP]],
[lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_LEFT]],
[lambda x, y: 2 <= x <= 4 and y == 4, [self.ACTION_LEFT]],
[lambda x, y: x == 0 and y == 4, [self.ACTION_RIGHT]],
],
2: [
[lambda x, y: 2 <= x <= 4 and 7 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]],
[lambda x, y: x == 0 and 7 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]],
[lambda x, y: x == 1 and 6 <= y <= 10, [self.ACTION_DOWN]],
[lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_LEFT]],
[lambda x, y: 2 <= x <= 4 and y == 6, [self.ACTION_LEFT]],
[lambda x, y: x == 0 and y == 6, [self.ACTION_RIGHT]],
],
3: [
[lambda x, y: 0 <= x <= 3 and 6 <= y <= 7, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 3 and 9 <= y <= 10, [self.ACTION_DOWN, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 4 and y == 8, [self.ACTION_RIGHT]],
[lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_UP]],
[lambda x, y: x == 4 and 6 <= y <= 7, [self.ACTION_UP]],
[lambda x, y: x == 4 and 9 <= y <= 10, [self.ACTION_DOWN]]
],
4: [
[lambda x, y: 10 >= x >= 7 >= y >= 5, [self.ACTION_LEFT, self.ACTION_UP]],
[lambda x, y: 7 <= x <= 10 and 9 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]],
[lambda x, y: 6 <= x <= 10 and y == 8, [self.ACTION_LEFT]],
[lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], [self.ACTION_UP]],
[lambda x, y: x == 6 and 5 <= y <= 7, [self.ACTION_UP]],
[lambda x, y: x == 6 and 9 <= y <= 10, [self.ACTION_DOWN]]
],
5: [
[lambda x, y: 6 <= x <= 7 and 6 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]],
[lambda x, y: 9 <= x <= 10 and 6 <= y <= 10, [self.ACTION_DOWN, self.ACTION_LEFT]],
[lambda x, y: x == 8 and 5 <= y <= 10, [self.ACTION_DOWN]],
[lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_RIGHT]],
[lambda x, y: 6 <= x <= 7 and y == 5, [self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and y == 5, [self.ACTION_LEFT]]
],
6: [
[lambda x, y: 6 <= x <= 7 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_LEFT]],
[lambda x, y: x == 8 and 0 <= y <= 3, [self.ACTION_UP]],
[lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_RIGHT]],
[lambda x, y: 6 <= x <= 7 and y == 3, [self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and y == 3, [self.ACTION_LEFT]]
],
7: [
[lambda x, y: 7 <= x <= 10 and 2 <= y <= 3, [self.ACTION_DOWN, self.ACTION_LEFT]],
[lambda x, y: 7 <= x <= 10 and y == 0, [self.ACTION_UP, self.ACTION_LEFT]],
[lambda x, y: 6 <= x <= 10 and y == 1, [self.ACTION_LEFT]],
[lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], [self.ACTION_DOWN]],
[lambda x, y: x == 6 and 2 <= y <= 3, [self.ACTION_DOWN]],
[lambda x, y: x == 6 and y == 0, [self.ACTION_UP]]
]
}
)
self.default_actions = ImmutableDict(
{
0: self.ACTION_RIGHT,
1: self.ACTION_UP,
2: self.ACTION_DOWN,
3: self.ACTION_RIGHT,
4: self.ACTION_LEFT,
5: self.ACTION_DOWN,
6: self.ACTION_UP,
7: self.ACTION_LEFT
}
)
self.policy_terminal_condition = ImmutableDict(
{
0: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1],
1: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1],
2: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1],
3: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1],
4: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1],
5: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1],
6: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1],
7: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1]
}
)
self.num_policies = LearnEightPoliciesTileCodingFeat.num_of_policies()
self.stacked_feature_rep = self.stack_feature_rep()
self._active_policies_cache = {}
@staticmethod
def num_of_policies():
return 8
def get_terminal_policies(self, s):
x, y = self.get_xy(s)
terminal_policies = np.zeros(self.num_policies)
for policy_id, condition in self.policy_terminal_condition.items():
if condition(x, y):
terminal_policies[policy_id] = 1
return terminal_policies
def get_state_index(self, x, y):
return int(y * np.sqrt(self.feature_rep.shape[0]) + x)
def get_probability(self, policy_number, s, a):
x, y = self.get_xy(s)
probability = 0.0
for condition, possible_actions in self.optimal_policies[policy_number]:
if condition(x, y):
if a in possible_actions:
probability = 1.0 / len(possible_actions)
break
return probability
def select_target_action(self, s, policy_id=0):
x, y = self.get_xy(s)
a = self.default_actions[policy_id]
for condition, possible_actions in self.optimal_policies[policy_id]:
if condition(x, y):
a = random.choice(possible_actions)
break
return a
def get_active_policies(self, s):
if s in self._active_policies_cache:
return self._active_policies_cache[s]
x, y = self.get_xy(s)
active_policy_vec = np.zeros(self.num_policies, dtype=int)
for policy_number, policy_values in self.optimal_policies.items():
for (condition, _) in policy_values:
if condition(x, y):
active_policy_vec[policy_number] = 1
break
self._active_policies_cache[s] = active_policy_vec
return active_policy_vec
def load_feature_rep(self):
return np.load(f'Resources/{self.__class__.__name__}/feature_rep.npy')
def get_state_feature_rep(self, s):
return self.feature_rep[s, :]
def create_feature_rep(self):
...
def load_behavior_dist(self):
return np.load(f'Resources/{self.__class__.__name__}/d_mu.npy')
def load_state_values(self):
return np.load(f'Resources/{self.__class__.__name__}/state_values.npy')
def select_behavior_action(self, s):
return np.random.randint(0, self.num_actions)
def get_mu(self, s, a):
return np.ones(self.num_policies) * (1.0 / self.num_actions)
def get_pi(self, s, a):
pi_vec = np.zeros(self.num_policies)
for policy_id, i in enumerate(self.get_active_policies(s)):
if i:
pi_vec[policy_id] = self.get_probability(policy_id, s, a)
return pi_vec
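# Hedged note (added comment, not in the original source): get_pi and get_mu return
# per-policy probability vectors for the same (s, a) pair; the off-policy algorithms in
# this repository presumably combine them elementwise as an importance-sampling ratio, e.g.
# rho = self.get_pi(s, a) / self.get_mu(s, a)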
def generate_behavior_dist(self, total_steps):
final_state_dist = np.zeros((self.num_policies, self.num_states))
s = self.reset()
state_visitation_count = np.zeros(self.num_states)
for step in range(total_steps):
if step % 100000 == 0:
print(step)
state_visitation_count[s] += 1
sp, r, is_terminal, _ = self.step(self.select_behavior_action(s))
s = sp
for s in range(self.num_states):
for policy_id, i in enumerate(self.get_active_policies(s)):
if i:
final_state_dist[policy_id, s] = state_visitation_count[s]
return (final_state_dist / total_steps).T
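# Rough usage sketch (assumption; the step count is borrowed from the commented-out
# HighVariance d_mu snippet near the end of this dump, and the save path mirrors
# load_behavior_dist above):
# task = LearnEightPoliciesTileCodingFeat()
# d_mu = task.generate_behavior_dist(20_000_000)
# np.save('Resources/LearnEightPoliciesTileCodingFeat/d_mu.npy', d_mu)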
import unittest
import numpy as np
from Algorithms.TD import TD
from Environments.Chain import Chain
from Tasks.EightStateCollision import EightStateCollision
class TestTD(unittest.TestCase):
def setUp(self) -> None:
params = {
#'resource_root_path': '../../Resources',
'alpha': 0.001953125,
'lmbda': 0.9,
}
self.env = Chain()
self.task = EightStateCollision(**params)
self.task.reset()
self.alg = TD(task=self.task, **params)
def tearDown(self) -> None:
...
def test_initial_w_is_zero(self):
self.assertEqual(self.alg.w.sum(), 0)
def test_initial_z_is_zero(self):
self.assertEqual(self.alg.z.sum(), 0)
def test_learn_single_policy_rmsve_after_num_steps(self):
rmsve_of_run = np.zeros((self.task.num_policies, self.task.num_steps))
np.random.seed(0)
self.alg.state = self.env.reset()
for step in range(self.task.num_steps):
rmsve_of_run[:, step], error = self.alg.compute_rmsve()
self.alg.action = self.alg.choose_behavior_action()
self.alg.next_state, r, is_terminal, info = self.env.step(self.alg.action)
self.alg.learn(self.alg.state, self.alg.next_state, r, is_terminal)
if is_terminal:
self.alg.state = self.env.reset()
self.alg.reset()
continue
self.alg.state = self.alg.next_state
self.assertTrue(abs(0.08319472840990755 - rmsve_of_run[0, -1]) <= 0.0000001)
import unittest
from Environments.Chain import Chain
class TestChain(unittest.TestCase):
def setUp(self) -> None:
self.env = Chain()
self.env.reset()
def tearDown(self) -> None:
self.env.reset()
def test_reset_initial_state_between_zero_and_three(self):
self.env.reset()
self.assertIn(self.env._state, [0, 1, 2, 3])
def test_step_retreat_move_state_to_initial_state(self):
self.env.reset()
sp, r, is_done, _ = self.env.step(self.env.RETREAT_ACTION)
self.assertEqual(is_done, True)
def test_step_right_move_state_one_step_to_right(self):
self.env.reset()
s = self.env._state
sp, r, is_done, _ = self.env.step(self.env.RIGHT_ACTION)
self.assertEqual(sp - s, 1)
import unittest
from Tasks.EightStateCollision import EightStateCollision
from Environments.Chain import Chain
class TestEightStateCollision(unittest.TestCase):
def setUp(self) -> None:
params = {
#'resource_root_path': '../../Resources'
}
self.experiment = EightStateCollision(**params)
self.experiment.reset()
def tearDown(self) -> None:
...
def test_load_feature_rep_evaluate_shape_is_(self):
feature_rep_arr = self.experiment.load_feature_rep()
self.assertEqual(feature_rep_arr.shape, (8, 6))
def test_get_state_feature_rep_state_for_all_states(self):
expected_states_feature_rep = [
[0., 0., 1., 0., 1., 1.],
[1., 1., 1., 0., 0., 0.],
[0., 1., 1., 0., 0., 1.],
[1., 0., 1., 1., 0., 0.],
[1., 1., 0., 0., 1., 0.],
[0., 1., 1., 1., 0., 0.],
[1., 1., 0., 0., 0., 1.],
[1., 0., 1., 0., 0., 1.]
]
evaluated_states_feature_rep = []
for state in range(self.experiment.N):
evaluated_states_feature_rep.append(list(self.experiment.get_state_feature_rep(state)))
self.assertListEqual(evaluated_states_feature_rep, expected_states_feature_rep)
def test_load_behavior_dist_evaluate_shape_is_(self):
behavior_dist = self.experiment.load_behavior_dist()
self.assertEqual(behavior_dist.shape, (8,))
def test_get_mu_for_right_action_in_initial_state_is_one(self):
mu = self.experiment.get_mu(0, self.experiment.RIGHT_ACTION)
self.assertEqual(mu, 1)
def test_get_mu_for_retreat_action_in_initial_state_is_zero(self):
mu = self.experiment.get_mu(0, self.experiment.RETREAT_ACTION)
self.assertEqual(mu, 0)
def test_get_mu_for_all_action_in_not_initial_state_is_one_half(self):
mu = self.experiment.get_mu(5, self.experiment.RIGHT_ACTION)
self.assertEqual(mu, 0.5)
mu = self.experiment.get_mu(5, self.experiment.RETREAT_ACTION)
self.assertEqual(mu, 0.5)
def test_get_pi_for_right_action_is_one(self):
pi = self.experiment.get_pi(0, self.experiment.RIGHT_ACTION)
self.assertEqual(pi, 1)
def test_get_pi_for_retreat_action_is_zero(self):
pi = self.experiment.get_pi(0, self.experiment.RETREAT_ACTION)
self.assertEqual(pi, 0)
import itertools
import json
import os
from collections import defaultdict
from itertools import zip_longest
from typing import List, Optional, Dict
import numpy as np
from Job.JobBuilder import default_params
from Plotting.plot_params import EXP_ATTRS
from Plotting.plot_utils import load_and_replace_large_nan_inf
from Registry.AlgRegistry import alg_dict
from utils import Configuration
def split_dict_of_list_to_dicts(dict_of_list: Dict[str, list]) -> List[Dict[str, float]]:
"""split a given dictionary of lists into list of dictionaries.
>>> split_dict_of_list_to_dicts({'alpha': [1, 2, 3], 'lambda': [4, 5], 'gamma': [6]})
[{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]
Args:
dict_of_list (Dict[str, list]): a dictionary of lists.
Returns:
List[Dict[str, float]]: list of dictionaries.
"""
keys = dict_of_list.keys()
# drop None placeholders; this assumes None values only occur for trailing keys, otherwise zip(keys, v) below would misalign keys
values = [[e for e in result if e is not None] for result in itertools.product(*dict_of_list.values())]
result = [dict(zip(keys, v)) for v in values]
return result
def group_dicts_by_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[str, List[Dict[str, float]]]:
"""
>>> group_dicts_by_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}])
{1: [{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}], 2: [{'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}], 3: [{'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]}
"""
first_key = get_first_key_of_dictionary(list_of_dicts[0])
final_grouped = defaultdict(list)
for inner_dict in list_of_dicts:
final_grouped[inner_dict[first_key]].append(inner_dict)
return dict(final_grouped)
def group_dicts_over_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[tuple, List[float]]:
"""
>>> group_dicts_over_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}])
{(('lambda', 4), ('gamma', 6)): [1, 2, 3], (('lambda', 5), ('gamma', 6)): [1, 2, 3]}
:param list_of_dicts:
:return:
"""
first_key = get_first_key_of_dictionary(list_of_dicts[0])
final_grouped = defaultdict(list)
for inner_dict in list_of_dicts:
first_value = inner_dict[first_key]
del inner_dict[first_key]
final_grouped[tuple(inner_dict.items())].append(first_value)
return dict(final_grouped)
def find_best_performance(exp_name, alg_name, second_param, auc_or_final) -> Dict[str, float]:
exp_attrs = EXP_ATTRS[exp_name](exp_name)
best_params = {}
best_perf = np.inf
all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build()
list_of_configuration = split_dict_of_list_to_dicts(all_configuration)
first_param_key = get_first_key_of_dictionary(all_configuration)
grouped_over_first = group_dicts_over_first_key(list_of_configuration)
for grouped, first_values in grouped_over_first.items():
second_param_name, second_param_value = grouped[0]
if second_param_value != second_param:
continue
grouped_params = dict(grouped)
current_params = Configuration(grouped_params)
current_params[first_param_key] = None
current_params.algorithm = alg_name
current_params.save_path = PathFactory.make_result_path(exp_name, alg_name)
current_params.rerun = False
current_configuration_over_first_full_path = DataPersister.create_full_path_file_name(f'_mean_{auc_or_final}_over_alpha', current_params,
excluded_params=[first_param_key])
current_perf = load_and_replace_large_nan_inf(
current_configuration_over_first_full_path, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_params = current_params
best_params[first_param_key] = first_values[best_perf_idx]
return best_params
def get_first_key_of_dictionary(d: dict) -> str:
return list(d.keys())[0]
class ParameterBuilder:
def __init__(self):
self.final_params_dict = dict()
def add_algorithm_params(self, configuration: Configuration):
for k in alg_dict[configuration.algorithm].related_parameters():
self.final_params_dict[k] = configuration[k]
return self
def build(self):
return self.final_params_dict
class JsonParameterBuilder:
def __init__(self):
self.final_params_dict = dict()
self.exp_name = None
self.alg_name = None
self.alg_related_params = None
def add_experiment(self, exp_name):
self.exp_name = exp_name
return self
def add_algorithm(self, alg_name):
self.alg_name = alg_name
self.alg_related_params = alg_dict[alg_name].related_parameters()
return self
def build(self) -> Dict[str, list]:
json_path = PathFactory.make_experiment_path(self.exp_name, self.alg_name)
with open(json_path) as f:
json_config = json.load(f)
for param_name in self.alg_related_params:
self.final_params_dict[param_name] = list(json_config['meta_parameters'].get(param_name, [default_params['meta_parameters'][param_name]]))
return self.final_params_dict
class PathFactory:
@staticmethod
def make_experiment_path(exp_name, alg_name):
return os.path.join(os.getcwd(), 'Experiments', exp_name, alg_name, f'{alg_name}.json')
@staticmethod
def make_result_path(exp_name, alg_name):
return os.path.join(os.getcwd(), 'Results', exp_name, alg_name)
class DataPersister:
@staticmethod
def save_result(result_arr: np.ndarray, result_name: str, configuration: Configuration, excluded_params: Optional[list] = None):
full_path_file_to_save = DataPersister.create_full_path_file_name(result_name, configuration, excluded_params)
if not os.path.exists(os.path.dirname(full_path_file_to_save)):
os.makedirs(os.path.dirname(full_path_file_to_save))
np.save(full_path_file_to_save, result_arr)
@staticmethod
def save_best_pref_over_first_param(exp_name, alg_name, auc_or_final):
all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build()
list_of_configuration = split_dict_of_list_to_dicts(all_configuration)
first_param_key = get_first_key_of_dictionary(all_configuration)
first_param_length = len(all_configuration[first_param_key])
mean_over_alpha, stderr_over_alpha = np.zeros(first_param_length), np.zeros(first_param_length)
grouped_over_first = group_dicts_over_first_key(list_of_configuration)
for grouped, first_values in grouped_over_first.items():
grouped_params = dict(grouped)
current_params = Configuration(grouped_params)
current_params[first_param_key] = None
current_params.algorithm = alg_name
current_params.save_path = PathFactory.make_result_path(exp_name, alg_name)
current_params.rerun = False
for index, first_value in enumerate(first_values):
current_params[first_param_key] = first_value
full_path_file_to_save = DataPersister.create_full_path_file_name(f'_mean_stderr_{auc_or_final}', current_params)
# perf = np.load(full_path_file_to_save)
# mean_over_alpha[index], stderr_over_alpha[index] = perf[0], perf[1]
# TODO: implement the rerun postfix here if it turns out to be needed
DataPersister.save_result(mean_over_alpha, f"_mean_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key])
DataPersister.save_result(stderr_over_alpha, f"_stderr_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key])
@staticmethod
def create_full_path_file_name(result_name: str, configuration: Configuration, excluded_params: Optional[list] = None) -> str:
params = ParameterBuilder().add_algorithm_params(configuration).build()
file_name_to_save = DataPersister.create_file_name(params, excluded_params=excluded_params)
full_path_file_to_save = os.path.join(configuration.save_path, file_name_to_save)
full_path_file_to_save = f'{full_path_file_to_save}{result_name}'
if configuration.rerun:
full_path_file_to_save = f'{full_path_file_to_save}_rerun'
return f'{full_path_file_to_save}.npy'
@staticmethod
def create_file_name(param: dict, excluded_params: Optional[list]) -> str:
if excluded_params is None:
excluded_params = []
final_str = ''
for k, v in param.items():
if k in excluded_params:
continue
if k == 'alpha' or k == 'eta':
split_str = str.split(f'{v:.10f}', '.')
else:
split_str = str.split(f'{v:.5f}', '.')
final_str += '_' + k + split_str[0] + split_str[1]
return final_str
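# Worked example (added for illustration, values are arbitrary): create_file_name encodes
# each parameter by concatenating the digits on both sides of the decimal point, so
# DataPersister.create_file_name({'alpha': 0.03125, 'lmbda': 0.9}, excluded_params=None)
# returns '_alpha00312500000_lmbda090000' (alpha and eta use 10 decimals, other keys use 5).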
import os
from Job.JobBuilder import JobBuilder
import argparse
from utils import find_all_experiment_configuration
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--directory_or_file', '-f', type=str, help='Json file path or Json files directory', required=True)
parser.add_argument('--server', '-s', type=str, help='Input server name, Cedar or Niagara', required=True)
args = parser.parse_args()
for path in find_all_experiment_configuration(args.directory_or_file):
builder = JobBuilder(json_path=os.path.join(os.getcwd(), path), server_name=args.server)
builder()
from Plotting.plot_all_sensitivities_per_alg_gradients import plot_all_sensitivities_per_alg_gradients
from Plotting.plot_all_sensitivities_per_alg_gradients_all_eta import plot_all_sensitivities_per_alg_gradients_all_eta
from Plotting.plot_best_learning_curve_over_all_params import plot_learning_curve_best_overall_params
from Plotting.plot_dist import plot_distribution, plot_dist_for_two_four_room_tasks
from Plotting.plot_all_sensitivities_per_alg_emphatics import plot_all_sensitivities_per_alg_emphatics
from Plotting.plot_learning_curve import plot_learning_curve
from Plotting.plot_learning_curves_for_all_third_params import plot_all_learning_curves_for_third
from Plotting.plot_learning_for_two_lambdas import plot_learning_curve_for_lambdas
from Plotting.plot_sensitivity import plot_sensitivity_curve
from Plotting.plot_sensitivity_for_two_lambdas import plot_sensitivity_for_lambdas
from Plotting.plot_specific_learning_curves import plot_specific_learning_curves
from Plotting.plot_waterfall import plot_waterfall_scatter
from Plotting.process_state_value_function import plot_all_final_value_functions, plot_value_functions
from process_data import process_data
func_to_run = 'hv_four_rooms_specific_learning_curves_full_bootstrap'
if 'collision' in func_to_run:
exps = ['FirstChain'] # FirstChain OR FirstFourRoom OR 1HVFourRoom
elif 'hv' in func_to_run:
exps = ['1HVFourRoom']
else:
exps = ['FirstFourRoom']
# region process data
if func_to_run == 'process_data':
exps = ['FirstChain', 'FirstFourRoom', '1HVFourRoom']
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc', 'final']
sp_list = [1.0]
process_data(exps=exps, algs=algs, auc_or_final=auc_or_final, sp_list=sp_list)
# endregion
# ====================
# ====================
# ====================
# ====================
# region Collision figures
# region learning curves
if func_to_run == 'collision_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
if 'FirstChain' in exps:
exp = 'FirstChain'
algs = ['ETD', 'TD', 'GTD', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.03125, 'lmbda': sp},
'ETD': {'alpha': 0.00390625, 'lmbda': sp},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 0.01},
'GTD': {'alpha': 0.000976562, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if 'FirstFourRoom' in exps:
exp = 'FirstFourRoom'
algs = ['LSTD', 'LSETD', 'ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.25, 'lmbda': sp},
'ETD': {'alpha': 0.00390625, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0},
'GTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if '1HVFourRoom' in exps:
exp = '1HVFourRoom'
algs = ['LSTD', 'LSETD', 'ETDLB', 'TD', 'GTD', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.25, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0},
'GTD': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'collision_learning_curves_for_all_extra_params_full_bootstrapping':
algs = ['PGTD2', 'GTD', 'LSTD']
sp_list = [0.0]
fig_size = (10, 4)
auc_or_final = ['auc']
# tp_list = [0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0]
tp_list = [0.25]
plot_all_learning_curves_for_third(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tp_list=tp_list)
if func_to_run == 'collision_learning_curve_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve_for_lambdas(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_best_learning_curves_some_algs_full_bootstrap':
sp_list = [0.0]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=1)
if func_to_run == 'collision_best_learning_curves_some_algs_medium_bootstrap':
sp_list = [0.5]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_some_algs_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_some_algs_no_bootstrap':
sp_list = [1.0]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_full_bootstrap_rerun_and_original':  # also requires PLOT_RERUN = False
# and PLOT_RERUN_AND_ORIG = True in plot_params, plus some changes in the plot_learning_curve function,
# such as setting the colors for the re-run and original plots.
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'all_algs': ['GTD']}
auc_or_final = ['final']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'collision_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
# algs = ['TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (11, 5)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (11, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('collision_best_sensitivity_curves_full_bootstrapping', 'collision_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'collision_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# ====================
# ====================
# region FOUR ROOMS FIGURES
# region learning curves
if func_to_run == 'four_rooms_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
exp = 'FirstFourRoom'
algs = ['ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.0625, 'lmbda': 0.0},
'ETD': {'alpha': 0.000488281, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.125, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 1.0},
'GTD2': {'alpha': 0.001953125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap_2':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_best_overall_params_learning_curves':
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'four_rooms_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_min_performance = False
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
plot_min_performance=plot_min_performance)
if func_to_run == 'four_rooms_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
# fig_size = (11, 5)
fig_size = (10, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('four_rooms_best_sensitivity_curves_full_bootstrapping', 'four_rooms_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'four_rooms_best_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'four_rooms_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# ====================
# ====================
# region HIGH VARIANCE FOUR ROOMS FIGURES
# region learning curves
if func_to_run == 'hv_four_rooms_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
exp = '1HVFourRoom'
algs = ['ETD', 'TD', 'GTD', 'TB']
specific_params = {
'TD': {'alpha': 0.0078125, 'lmbda': sp},
'ETD': {'alpha': 0.000244140, 'lmbda': sp},
'GTD': {'alpha': 0.000488281, 'lmbda': sp, 'eta': 16.0},
'TB': {'alpha': 0.03125, 'lmbda': 1.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap_2':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_best_overall_params_learning_curves':
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'hv_four_rooms_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
# algs = ['TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_min_performance = False
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
plot_min_performance=plot_min_performance)
if func_to_run == 'hv_four_rooms_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
# fig_size = (11, 5)
fig_size = (10, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('hv_four_rooms_sensitivity_curves_full_bootstrapping', 'hv_four_rooms_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'hv_four_rooms_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'hv_four_rooms_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# region Misc
if func_to_run == 'plot_value_functions':
plot_value_functions()
if func_to_run == 'plot_all_final_value_functions':
plot_all_final_value_functions()
if func_to_run == 'state_dist':
fig_size = (6, 4)
tasks = ['EightStateCollision', 'LearnEightPoliciesTileCodingFeat',
'HighVarianceLearnEightPoliciesTileCodingFeat']
for task in tasks:
plot_distribution(task=task, fig_size=fig_size)
if func_to_run == 'high_variance_and_normal_dist_comparison':
fig_size = (22, 4)
plot_dist_for_two_four_room_tasks(fig_size=fig_size)
# endregion
# from Plotting.process_state_value_function import plot_value_functions, plot_all_final_value_functions
# from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
# from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
# For building d_mu
# obj = HighVarianceLearnEightPoliciesTileCodingFeat()
# d_mu = (obj.generate_behavior_dist(20_000_000))
# numpy.save(os.path.join(os.getcwd(), 'Resources', 'HighVarianceLearnEightPoliciesTileCodingFeat', 'd_mu.npy'), d_mu)
import json
import os
import numpy as np
from Learning import learn
from Plotting.plot_params import EXP_ATTRS
from Plotting.plot_utils import make_params, make_current_params, load_and_replace_large_nan_inf, \
load_best_perf_json, load_best_rerun_params, make_res_path
from utils import create_name_for_save_load, Configuration
def save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=False):
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
res_path = make_res_path(alg, exp)
mean_over_alpha, stderr_over_alpha = np.zeros(len(fp_list)), np.zeros(len(fp_list))
best_fp, best_tp, best_fop = load_best_rerun_params(alg, exp, auc_or_final, sp) if rerun else (0, 0, 0)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
for i, fp in enumerate(fp_list):
current_params['alpha'] = fp
load_name = os.path.join(res_path, create_name_for_save_load(current_params))
perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}.npy")
if rerun and fp == best_fp and tp == best_tp and fop == best_fop:
perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}_rerun.npy")
mean_over_alpha[i], stderr_over_alpha[i] = perf[0], perf[1]
save_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=['alpha']))
postfix = ''
if rerun and tp == best_tp and fop == best_fop:
postfix = '_rerun'
np.save(f"{save_name}_mean_{auc_or_final}_over_alpha{postfix}", mean_over_alpha)
np.save(f"{save_name}_stderr_{auc_or_final}_over_alpha{postfix}", stderr_over_alpha)
def find_best_perf(alg, exp, auc_or_final, sp):
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
def save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp):
fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp)
exp_path = res_path.replace('Results', 'Experiments')
json_exp = os.path.join(exp_path, f"{alg}.json")
with open(json_exp, 'r') as f:
json_exp = json.load(f)
json_exp['meta_parameters'] = best_params
save_name = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(save_name, 'wt') as f:
json.dump(json_exp, f, indent=4)
def run_learning_with_best_perf(alg, exp, auc_or_final, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
best_perf_jsn = load_best_perf_json(alg, exp, sp, auc_or_final)
param_dict = best_perf_jsn['meta_parameters']
param_dict['algorithm'] = alg
param_dict['task'] = best_perf_jsn['task']
param_dict['environment'] = best_perf_jsn['environment']
param_dict['num_steps'] = best_perf_jsn['number_of_steps']
param_dict['num_of_runs'] = best_perf_jsn['number_of_runs']
param_dict['sub_sample'] = best_perf_jsn['sub_sample']
param_dict['save_path'] = res_path
param_dict['save_value_function'] = False
param_dict['rerun'] = True
param_dict['render'] = False
config = Configuration(param_dict)
learn(config)
def process_data(**kwargs):
for exp in kwargs['exps']:
for alg in kwargs['algs']:
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
print(f"\nStarted re-running {exp}, {alg} lmbda_or_zeta: {sp}, {auc_or_final} ...")
save_perf_over_alpha(alg, exp, auc_or_final, sp)
best_params = find_best_perf(alg, exp, auc_or_final, sp)
save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp)
run_learning_with_best_perf(alg, exp, auc_or_final, sp)
save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=True)
print(f"Finished re-running {exp}, {alg} {best_params}")
#matplotlib>=3.2.2
#numpy>=1.19.0
imageio>=2.9.0
pyglet>=1.5.11
scikit_image>=0.17.2
import time
import utils
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
import pyglet
from skimage.transform import resize
import numpy as np
from data_presister import DataPersister, find_best_performance
# if __name__ == "__main__":
# render_mode = 'human'
# render_mode = 'rgb'
# render_mode = 'screen'
#
# frames = []
# env = FourRoomGridWorld()
# # env = Chain()
# env.reset()
# actions = [2, 2, 0, 0, 0, 3, 3, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2,
# 2, 1, 1, 1, 1, 1, 1, 1
# , 2, 2, 2, 3, 1, 1, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3]
# actions = actions * 1
# for step in range(len(actions)):
# a = actions[step]
# next_state, r, is_terminal, info = env.step(a)
# state = next_state
# frames.append(env.render(mode=render_mode))
# if is_terminal:
# env.reset()
# utils.generate_gif(frames, 'Assets/FourRoomGridWorld.gif', size=(180, 180, 3), duration=1 / 20)
# DataPersister.save_best_pref_over_first_param(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc")
find_best_performance(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc", second_param=0.2)
import unittest
from Tests.Algorithms.TestTD import TestTD
from Tests.Environments.TestChain import TestChain
from Tests.Tasks.TestEightStateCollision import TestEightStateCollision
test_suite = unittest.TestSuite()
test_suite.addTest(unittest.makeSuite(TestChain))
test_suite.addTest(unittest.makeSuite(TestEightStateCollision))
test_suite.addTest(unittest.makeSuite(TestTD))
runner = unittest.TextTestRunner()
runner.run(test_suite)
import numpy as np
import os
def get_save_value_function_steps(num_steps):
return [int(num_steps * i) - 1 for i in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]]
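# Worked example (added for illustration): with num_steps=50000 this returns
# [499, 2499, 4999, 9999, 24999, 49999], i.e. the steps reached at 1%, 5%, 10%, 20%, 50% and 100% of training.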
def save_value_function(value_function, save_path, step, run):
save_dir = os.path.join(save_path, 'Sample_value_function')
res_path = os.path.join(save_dir, f"{step}_{run}")
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
np.save(res_path, value_function)
class Configuration(dict):
def __str__(self):
return f"{self.environment} {self.task} {self.algorithm}"
def __getattr__(self, item):
return self[item]
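# Minimal usage sketch (illustrative values, not from the original source): Configuration
# exposes its keys as attributes, e.g.
# cfg = Configuration({'environment': 'Chain', 'task': 'EightStateCollision', 'algorithm': 'TD'})
# cfg.algorithm  # -> 'TD'
# str(cfg)       # -> 'Chain EightStateCollision TD'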
def find_all_experiment_configuration(experiments_path: str, ext='.json'):
if experiments_path.endswith(ext):
yield experiments_path
for root, _, files in os.walk(experiments_path):
for file in files:
if file.endswith(ext):
yield os.path.join(root, file)
class ImmutableDict(dict):
def immutable(self):
raise TypeError("%r objects are immutable" % self.__class__.__name__)
def __setitem__(self, key, value):
self.immutable()
def __delitem__(self, key):
self.immutable()
def setdefault(self, k, default=None):
self.immutable()
def update(self, *args, **kwargs):
self.immutable()
def clear(self) -> None:
self.immutable()
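# Minimal usage sketch (illustrative): any attempt to mutate an ImmutableDict raises TypeError, e.g.
# d = ImmutableDict({'a': 1})
# d['b'] = 2  # raises TypeError: 'ImmutableDict' objects are immutable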
def create_name_for_save_load(param_dict, excluded_params=None):
if excluded_params is None:
excluded_params = []
final_str = ''
for k, v in param_dict.items():
if k in excluded_params:
continue
if k == 'alpha' or k == 'eta':
split_str = str.split(f'{v:.10f}', '.')
else:
split_str = str.split(f'{v:.5f}', '.')
final_str += '_' + k + split_str[0] + split_str[1]
return final_str
def save_result(path, name, result_array, params, rerun):
name_to_save = create_name_for_save_load(param_dict=params)
path_and_name = os.path.join(path, name_to_save)
final_name = f"{path_and_name}{name}"
if rerun:
final_name = f"{final_name}_rerun"
np.save(final_name, result_array)
def generate_gif(frames, path, size=(180, 180, 3), duration=1 / 20):
import imageio
from skimage.transform import resize
for idx, frame_idx in enumerate(frames):
frames[idx] = resize(frame_idx, size, preserve_range=True, order=0).astype(np.uint8)
imageio.mimsave(path, frames, duration=duration)