diff --git b/.gitignore a/.gitignore new file mode 100644 index 0000000..12de5b7 --- /dev/null +++ a/.gitignore @@ -0,0 +1,722 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp +# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### NotepadPP ### +# Notepad++ backups # +*.bak + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +### SublimeText ### +# Cache files for Sublime Text +*.tmlanguage.cache +*.tmPreferences.cache +*.stTheme.cache + +# Workspace files are user-specific +*.sublime-workspace + +# Project files should be checked into the repository, unless a significant +# proportion of contributors will probably not be using Sublime Text +# *.sublime-project + +# SFTP configuration file +sftp-config.json + +# Package control specific files +Package Control.last-run +Package Control.ca-list +Package Control.ca-bundle +Package Control.system-ca-bundle +Package Control.cache/ +Package Control.ca-certs/ +Package Control.merged-ca-bundle +Package Control.user-ca-bundle +oscrypto-ca-bundle.crt +bh_unicode_properties.cache + +# Sublime-github package stores a github token in this file +# https://packagecontrol.io/packages/sublime-github +GitHub.sublime-settings + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp +/.idea diff --git b/Algorithms/ABTD.py a/Algorithms/ABTD.py new file mode 100644 index 0000000..b7fea8d --- /dev/null +++ a/Algorithms/ABTD.py @@ -0,0 +1,45 @@ +from Algorithms.BaseVariableLmbda import BaseVariableLmbda +import numpy as np + + +class ABTD(BaseVariableLmbda): + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + zeta = kwargs.get('zeta') + self.old_nu = 0 + if self.task.num_policies > 1: + self.old_nu = np.zeros(self.task.num_policies) + xi_zero = self.task.ABTD_xi_zero + xi_max = self.task.ABTD_xi_max + self.xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero) + + @staticmethod + def related_parameters(): + return['alpha', 'zeta'] + + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, x_p, rho, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal) + nu = min(self.xi, 1.0 / max(pi, mu)) + self.z = x + self.gamma * self.old_nu * self.old_pi * self.z + self.w += alpha * delta * self.z + self.old_nu = nu + self.old_pi = pi + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal) + delta = rho * delta + nu = self.compute_nu_for_multiple_policies(pi, mu) + self.z = (self.gamma_vec_t * self.old_nu * self.old_pi)[:, None] * self.z + stacked_x + self.w += alpha_vec[:, None] * (delta[:, None] * self.z) + self.old_nu = nu + self.old_pi = pi + self.gamma_vec_t = self.gamma_vec_tp + + def compute_nu_for_multiple_policies(self, pi, mu): + xi_vec = np.ones(self.task.num_policies) * self.xi + max_vec = 1.0 / np.maximum.reduce([pi, mu]) + return np.minimum.reduce([max_vec, xi_vec]) + + def reset(self): + super().reset() + self.old_nu = 0 diff --git b/Algorithms/BaseGradient.py a/Algorithms/BaseGradient.py new file mode 100644 index 0000000..ca066a8 --- /dev/null +++ a/Algorithms/BaseGradient.py @@ -0,0 +1,25 @@ +import numpy as np +from Algorithms.BaseTD import BaseTD +from Tasks.BaseTask import BaseTask + + +class BaseGradient(BaseTD): + def __init__(self, task: BaseTask, **kwargs): + super().__init__(task, **kwargs) + self.v = np.zeros(self.task.num_features) + self.eta = 
kwargs.get('eta') + if self.task.num_policies > 1: + self.v = np.zeros((self.task.num_policies, self.task.num_features)) + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda', 'eta'] + + def compute_second_step_size(self): + return self.eta * self.compute_step_size() + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super(BaseGradient, self).learn_multiple_policies( + s, s_p, r, is_terminal) + return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x, self.task.stacked_feature_rep[:, :, s_p], \ + self.compute_second_step_size() * self.gamma_vec_t / self.gamma diff --git b/Algorithms/BaseLS.py a/Algorithms/BaseLS.py new file mode 100644 index 0000000..88092a2 --- /dev/null +++ a/Algorithms/BaseLS.py @@ -0,0 +1,36 @@ +import numpy as np +from numpy.linalg import pinv +from Tasks.BaseTask import BaseTask +from Algorithms.BaseTD import BaseTD + + +class BaseLS(BaseTD): + def __init__(self, task: BaseTask, **kwargs): + super(BaseLS, self).__init__(task, **kwargs) + self.A = np.zeros((self.task.num_features, self.task.num_features)) + self.b = np.zeros(self.task.num_features) + self.t = 0 + if self.task.num_policies > 1: + self.A = np.zeros((self.task.num_policies, self.task.num_features, self.task.num_features)) + self.b = np.zeros((self.task.num_policies, self.task.num_features)) + self.gamma_vec_t = np.concatenate((np.ones(2), np.zeros(6))) * self.gamma + self.t = np.zeros(self.task.num_policies) + + def learn_single_policy(self, s, s_p, r, is_terminal): + x, x_p = self.get_features(s, s_p, is_terminal) + self.t += 1 + self.A += (np.outer(self.z, (x - self.gamma * x_p)) - self.A) / self.t + self.b += (r * self.z - self.b) / self.t + self.w = np.dot(pinv(self.A), self.b) + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + _, _, x, x_p, _, _, _, stacked_x = \ + super(BaseLS, self).learn_multiple_policies(s, s_p, r, is_terminal) + for i in range(self.task.num_policies): + if self.gamma_vec_t[i] != 0.0: + self.t[i] += 1 + z = self.z[i, :] + self.A[i, :, :] += (np.outer(z, (x - self.gamma_vec_tp[i] * x_p)) - self.A[i, :, :]) / self.t[i] + self.b[i, :] += (self.r_vec[i] * z - self.b[i, :]) / self.t[i] + self.w[i, :] = np.dot(pinv(self.A[i, :, :]), self.b[i, :]) + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/BaseTD.py a/Algorithms/BaseTD.py new file mode 100644 index 0000000..9cc74e6 --- /dev/null +++ a/Algorithms/BaseTD.py @@ -0,0 +1,97 @@ +import numpy as np +from Tasks.BaseTask import BaseTask + + +class BaseTD: + def __init__(self, task: BaseTask, **kwargs): + self.task = task + self.w = np.zeros(self.task.num_features) + self.z = np.zeros(self.task.num_features) + if self.task.num_policies > 1: + self.w = np.zeros((self.task.num_policies, self.task.num_features)) + self.z = np.zeros((self.task.num_policies, self.task.num_features)) + self.gamma = self.task.GAMMA + self.alpha = kwargs['alpha'] + self.lmbda = kwargs.get('lmbda') + self.state_values = self.task.load_state_values() # This is of size num_policies * 121 + self.d_mu = self.task.load_behavior_dist() # same size as state_values + self.state, self.next_state, self.action = None, None, None + self.r_vec = np.zeros(self.task.num_policies) + self.gamma_vec_tp = np.zeros(self.task.num_policies) + self.gamma_vec_t = np.zeros(self.task.num_policies) + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda'] + + def compute_value_function(self): + return np.dot(self.w, self.task.feature_rep.T) + + def 
compute_rmsve(self): + error = self.compute_value_function() - self.state_values + error_squared = error * error + return np.sqrt(np.sum(self.d_mu * error_squared.T, 0) / np.sum(self.d_mu, 0)), error + + def compute_step_size(self): + return self.alpha + + def choose_behavior_action(self): + return self.task.select_behavior_action(self.state) + + def choose_target_action(self): + return self.task.select_target_action(self.state) + + def learn(self, s, s_p, r, is_terminal): + if self.task.num_policies == 1: + self.learn_single_policy(s, s_p, r, is_terminal) + else: + self.learn_multiple_policies(s, s_p, r, is_terminal) + + def get_features(self, s, s_p, is_terminal): + x_p = np.zeros(self.task.num_features) + if not is_terminal: + x_p = self.task.get_state_feature_rep(s_p) + x = self.task.get_state_feature_rep(s) + return x, x_p + + def get_isr(self, s): + pi = self.task.get_pi(s, self.action) + mu = self.task.get_mu(s, self.action) + rho = pi / mu + return rho + + def get_delta(self, r, x, x_p): + return r + self.gamma * np.dot(self.w, x_p) - np.dot(self.w, x) + + def learn_single_policy(self, s, s_p, r, is_terminal): + x, x_p = self.get_features(s, s_p, is_terminal) + rho = self.get_isr(s) + alpha = self.compute_step_size() + delta = self.get_delta(r, x, x_p) + self.z = rho * (self.gamma * self.lmbda * self.z + x) + return delta, alpha, x, x_p, rho + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + active_policies_vec = self.task.get_active_policies(s) + self.r_vec = np.zeros(self.task.num_policies) + if r > 0: + terminal_policies_vec = self.task.get_terminal_policies(s_p) + self.r_vec = r * terminal_policies_vec + alpha_vec = active_policies_vec * self.compute_step_size() + x = self.task.get_state_feature_rep(s) + x_p = np.zeros(self.task.num_features) + if not is_terminal: + x_p = self.task.get_state_feature_rep(s_p) + pi = self.task.get_pi(s, self.action) + mu = self.task.get_mu(s, self.action) + rho = pi / mu + self.gamma_vec_tp = self.task.get_active_policies(s_p) * self.gamma + delta = self.r_vec + self.gamma_vec_tp * np.dot(self.w, x_p) - np.dot(self.w, x) + stacked_x = self.task.stacked_feature_rep[:, :, s] + return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x + + def reset(self): + self.z = np.zeros(self.task.num_features) + + def __str__(self): + return f'agent:{type(self).__name__}' diff --git b/Algorithms/BaseVariableLmbda.py a/Algorithms/BaseVariableLmbda.py new file mode 100644 index 0000000..1ce84e1 --- /dev/null +++ a/Algorithms/BaseVariableLmbda.py @@ -0,0 +1,25 @@ +from Algorithms.BaseTD import BaseTD +from Tasks.BaseTask import BaseTask +import numpy as np + + +class BaseVariableLmbda(BaseTD): + def __init__(self, task: BaseTask, **kwargs): + super().__init__(task, **kwargs) + self.old_pi, self.old_mu = 0, 1 + if self.task.num_policies > 1: + self.old_pi, self.old_mu = np.zeros(self.task.num_policies), np.ones(self.task.num_policies) + self.old_rho = self.old_pi / self.old_mu + + def learn_single_policy(self, s, s_p, r, is_terminal): + alpha = self.compute_step_size() + pi = self.task.get_pi(s, self.action) + mu = self.task.get_mu(s, self.action) + rho = pi / mu + x, x_p = self.get_features(s, s_p, is_terminal) + delta = rho * self.get_delta(r, x, x_p) + return delta, alpha, x, x_p, rho, pi, mu + + def reset(self): + self.old_pi, self.old_mu = 0, 1 + self.old_rho = self.old_pi / self.old_mu diff --git b/Algorithms/ETD.py a/Algorithms/ETD.py new file mode 100644 index 0000000..10ab5cc --- /dev/null +++ a/Algorithms/ETD.py @@ -0,0 +1,11 @@ +from 
Algorithms.ETDLB import ETDLB + + +class ETD(ETDLB): + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + self.beta = self.task.GAMMA + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda'] diff --git b/Algorithms/ETDLB.py a/Algorithms/ETDLB.py new file mode 100644 index 0000000..54dcb15 --- /dev/null +++ a/Algorithms/ETDLB.py @@ -0,0 +1,46 @@ +from Algorithms.BaseTD import BaseTD +import numpy as np + + +class ETDLB(BaseTD): + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + self.F = 1 + self.old_rho = 0 + self.beta = kwargs.get('beta') + if self.task.num_policies > 1: + self.F = np.zeros(self.task.num_policies) + self.old_rho = np.zeros(self.task.num_policies) + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda', 'beta'] + + def learn_single_policy(self, s, s_p, r, is_terminal): + x, x_p = self.get_features(s, s_p, is_terminal) + delta = self.get_delta(r, x, x_p) + self.F = self.beta * self.old_rho * self.F + 1 + m = self.lmbda * 1 + (1 - self.lmbda) * self.F + rho = self.get_isr(s) + self.z = rho * (x * m + self.gamma * self.lmbda * self.z) + self.w += self.compute_step_size() * delta * self.z + self.old_rho = rho + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, *_, rho, _ = super().learn_multiple_policies(s, s_p, r, is_terminal) + stacked_x = self.task.stacked_feature_rep[:, :, s] + beta_vec = self.beta * self.gamma_vec_t / self.gamma + self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies) + m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None]) + self.w += (alpha_vec * delta)[:, None] * self.z + self.old_rho = rho + self.gamma_vec_t = self.gamma_vec_tp + + def reset(self): + super().reset() + self.F = 1 + self.old_rho = 0 + if self.task.num_policies > 1: + self.old_rho = np.zeros(self.task.num_policies) + self.F = np.zeros(self.task.num_policies) diff --git b/Algorithms/GEMETD.py a/Algorithms/GEMETD.py new file mode 100644 index 0000000..b6f1def --- /dev/null +++ a/Algorithms/GEMETD.py @@ -0,0 +1,53 @@ +from Algorithms.BaseTD import BaseTD +import numpy as np + + +class GEMETD(BaseTD): + """ + An ETD(0) implementation that uses GEM (aka GTD2(0) with x and x_p switched) to estimate emphasis. + """ + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + self.beta = self.task.GAMMA + self.gem_alpha = kwargs['gem_alpha'] # Step size for GEM weights. + self.gem_beta = kwargs['gem_beta'] # Regularization parameter for GEM; not needed for a fixed target policy. + self.k = np.zeros(self.task.num_features) # Auxiliary weights for GEM. + self.u = np.zeros(self.task.num_features) # Main weights for GEM. 
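+ # Note: the learn methods below use m = np.dot(self.u, x) as a parametric estimate of the expected emphasis; self.k and self.u are updated with a GTD2(0)-style rule with x and x_p swapped (see class docstring).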
+ if self.task.num_policies > 1: + self.k = np.zeros((self.task.num_policies, self.task.num_features)) + self.u = np.zeros((self.task.num_policies, self.task.num_features)) + + @staticmethod + def related_parameters(): + return ['alpha', 'gem_alpha', 'gem_beta'] + + def learn_single_policy(self, s, s_p, r, is_terminal): + x, x_p = self.get_features(s, s_p, is_terminal) + rho = self.get_isr(s) + delta_bar = 1 + rho * self.gamma * np.dot(self.u, x) - np.dot(self.u, x_p) + self.k += self.gem_alpha * (delta_bar - np.dot(self.k, x_p)) * x_p + self.u += self.gem_alpha * ((x_p - self.gamma * rho * x) * np.dot(self.k, x_p) - self.gem_beta * self.u) + delta = self.get_delta(r, x, x_p) + m = np.dot(self.u, x) # Use parametric estimate of expected emphasis. + self.w += self.alpha * m * rho * delta * x + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal) + stacked_x_p = self.task.stacked_feature_rep[:, :, s_p] + # GEM update: + gem_alpha_vec = self.task.get_active_policies(s) * self.gem_alpha + delta_bar = np.ones(self.task.num_policies) + rho * self.gamma_vec_t * np.dot(self.u, x) - np.dot(self.u, x_p) + self.k += gem_alpha_vec[:, None] * (delta_bar[:, None] - np.sum(x_p * self.k, 1)[:, None]) * stacked_x_p + self.u += gem_alpha_vec[:, None] * ((stacked_x_p - self.gamma_vec_t[:, None] * rho[:, None] * stacked_x) * np.sum(x_p * self.k, 1)[:, None] - self.gem_beta * self.u) # should self.gem_beta be a vector here? + # ETD(0) update: + m = np.dot(self.u, x) + self.w += (alpha_vec * m * rho * delta)[:, None] * stacked_x + self.gamma_vec_t = self.gamma_vec_tp + + def reset(self): + super().reset() + self.k = np.zeros(self.task.num_features) + self.u = np.zeros(self.task.num_features) + if self.task.num_policies > 1: + self.k = np.zeros((self.task.num_policies, self.task.num_features)) + self.u = np.zeros((self.task.num_policies, self.task.num_features)) diff --git b/Algorithms/GTD.py a/Algorithms/GTD.py new file mode 100644 index 0000000..18dd2e9 --- /dev/null +++ a/Algorithms/GTD.py @@ -0,0 +1,20 @@ +from Algorithms.BaseGradient import BaseGradient +import numpy as np + + +# noinspection DuplicatedCode +class GTD(BaseGradient): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal) + alpha_v = self.compute_second_step_size() + self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p) + self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies( + s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1) + self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p) + self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x) + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/GTD2.py a/Algorithms/GTD2.py new file mode 100644 index 0000000..b19b8a6 --- /dev/null +++ a/Algorithms/GTD2.py @@ -0,0 +1,21 @@ +from Algorithms.BaseGradient import BaseGradient +import numpy as np + + +class GTD2(BaseGradient): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, 
alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal) + alpha_v = self.compute_second_step_size() + self.w += alpha * (np.dot(x, self.v) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p) + self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) + + # noinspection DuplicatedCode + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies( + s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1) + self.w += alpha_vec[:, None] * ( + np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p) + self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x) + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/HTD.py a/Algorithms/HTD.py new file mode 100644 index 0000000..a95b1e6 --- /dev/null +++ a/Algorithms/HTD.py @@ -0,0 +1,37 @@ +from Algorithms.BaseGradient import BaseGradient +import numpy as np + + +class HTD(BaseGradient): + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + self.z_b = np.zeros(self.task.num_features) + if self.task.num_policies > 1: + self.z_b = np.zeros((self.task.num_policies, self.task.num_features)) + + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal) + alpha_v = self.compute_second_step_size() + self.z_b = self.gamma * self.lmbda * self.z_b + x + self.w += alpha * ((delta * self.z) + (x - self.gamma * x_p) * np.dot((self.z - self.z_b), self.v)) + self.v += alpha_v * ((delta * self.z) - (x - self.gamma * x_p) * np.dot(self.v, self.z_b)) + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies( + s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + self.z_b = self.lmbda * self.z_b * self.gamma_vec_t[:, None] + stacked_x + gamma_stacked_xp = self.gamma_vec_tp[:, None] * stacked_x_p + delta_z = delta[:, None] * self.z + self.w += alpha_vec[:, None] * ( + delta_z + (stacked_x - gamma_stacked_xp) * (np.sum((self.z - self.z_b) * self.v, 1))[:, None]) + self.v += alphav_vec[:, None] * ( + delta_z - (stacked_x - gamma_stacked_xp) * np.sum(self.v * self.z_b, 1)[:, None]) + # TODO: Should the last v be replaced by w? 
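+ # For comparison: learn_single_policy above uses np.dot(self.v, self.z_b) (i.e. v, not w) in the same term, so the vectorized update here mirrors the single-policy form.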
+ self.gamma_vec_t = self.gamma_vec_tp + + def reset(self): + super().reset() + self.z_b = np.zeros(self.task.num_features) + if self.task.num_policies > 1: + self.z_b = np.zeros((self.task.num_policies, self.task.num_features)) diff --git b/Algorithms/LSETD.py a/Algorithms/LSETD.py new file mode 100644 index 0000000..20084dc --- /dev/null +++ a/Algorithms/LSETD.py @@ -0,0 +1,45 @@ +from Algorithms.BaseLS import BaseLS +import numpy as np + + +class LSETD(BaseLS): + def __init__(self, task, **kwargs): + super(LSETD, self).__init__(task, **kwargs) + self.old_rho = 0 + self.F = 1 + self.beta = kwargs['beta'] + if self.task.num_policies > 1: + self.F = np.ones(self.task.num_policies) + self.old_rho = np.zeros(self.task.num_policies) + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda', 'beta'] + + def learn_single_policy(self, s, s_p, r, is_terminal): + self.F = self.beta * self.old_rho * self.F + 1 + m = self.lmbda + (1 - self.lmbda) * self.F + x, _ = self.get_features(s, s_p, is_terminal) + rho = self.get_isr(s) + self.z = rho * (self.gamma * self.lmbda * self.z + x * m) + super(LSETD, self).learn_single_policy(s, s_p, r, is_terminal) + self.old_rho = rho + + # noinspection DuplicatedCode + def learn_multiple_policies(self, s, s_p, r, is_terminal): + beta_vec = self.beta * self.gamma_vec_t / self.gamma + self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies) + m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F + stacked_x = self.task.stacked_feature_rep[:, :, s] + rho = self.get_isr(s) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None]) + super(LSETD, self).learn_multiple_policies(s, s_p, r, is_terminal) + self.old_rho = rho + + def reset(self): + super().reset() + self.F = 1 + self.old_rho = 0 + if self.task.num_policies > 1: + self.old_rho = np.zeros(self.task.num_policies) + self.F = np.zeros(self.task.num_policies) diff --git b/Algorithms/LSTD.py a/Algorithms/LSTD.py new file mode 100644 index 0000000..8fd983d --- /dev/null +++ a/Algorithms/LSTD.py @@ -0,0 +1,13 @@ +from Algorithms.BaseLS import BaseLS + + +class LSTD(BaseLS): + def learn_single_policy(self, s, s_p, r, is_terminal): + x, _ = self.get_features(s, s_p, is_terminal) + self.z = self.get_isr(s) * (self.gamma * self.lmbda * self.z + x) + super(LSTD, self).learn_single_policy(s, s_p, r, is_terminal) + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + x, _ = self.get_features(s, s_p, is_terminal) + self.z = self.get_isr(s)[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + x) + super(LSTD, self).learn_multiple_policies(s, s_p, r, is_terminal) diff --git b/Algorithms/PGTD2.py a/Algorithms/PGTD2.py new file mode 100644 index 0000000..ee7c265 --- /dev/null +++ a/Algorithms/PGTD2.py @@ -0,0 +1,28 @@ +from Algorithms.BaseGradient import BaseGradient +import numpy as np + + +class PGTD2(BaseGradient): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal) + alpha_v = self.compute_second_step_size() + v_mid = self.v + alpha_v * (delta * self.z - np.dot(x, self.v) * x) + w_mid = self.w + alpha * (np.dot(x, self.v) * x - (1 - self.lmbda) * self.gamma * np.dot(self.z, self.v) * x_p) + delta_mid = r + self.gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x) + self.w += alpha * (np.dot(x, v_mid) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, v_mid) * x_p) + self.v += alpha_v * (delta_mid * self.z - np.dot(x, v_mid) * x) + 
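# Note: the single-policy update above is a two-stage step: provisional v_mid and w_mid are computed first, and the final self.w and self.v updates reuse the trace with delta_mid evaluated at w_mid; learn_multiple_policies below vectorizes the same pattern. +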
+ def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies( + s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + v_mid = self.v + alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x) + phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1) + w_mid = self.w + alpha_vec[:, None] * ( + np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p) + delta_mid = self.r_vec + self.gamma_vec_tp * np.dot(w_mid, x_p) - np.dot(w_mid, x) + phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * v_mid, 1) + self.w += alpha_vec[:, None] * ( + np.sum(x * v_mid, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p) + self.v += alphav_vec[:, None] * (delta_mid[:, None] * self.z - np.sum(x * v_mid, 1)[:, None] * stacked_x) + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/TB.py a/Algorithms/TB.py new file mode 100644 index 0000000..7367ff9 --- /dev/null +++ a/Algorithms/TB.py @@ -0,0 +1,17 @@ +from Algorithms.BaseVariableLmbda import BaseVariableLmbda + + +class TB(BaseVariableLmbda): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, *_, pi, _ = super().learn_single_policy(s, s_p, r, is_terminal) + self.z = self.gamma * self.lmbda * self.old_pi * self.z + x + self.w = self.w + alpha * delta * self.z + self.old_pi = pi + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal) + delta = rho * delta + self.z = (self.gamma_vec_t * self.lmbda * self.old_pi)[:, None] * self.z + stacked_x + self.w += alpha_vec[:, None] * (delta[:, None] * self.z) + self.old_pi = pi + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/TD.py a/Algorithms/TD.py new file mode 100644 index 0000000..b647d6e --- /dev/null +++ a/Algorithms/TD.py @@ -0,0 +1,13 @@ +from Algorithms.BaseTD import BaseTD + + +class TD(BaseTD): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, *_ = super().learn_single_policy(s, s_p, r, is_terminal) + self.w += alpha * delta * self.z + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + self.w += (alpha_vec * delta)[:, None] * self.z + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/TDRC.py a/Algorithms/TDRC.py new file mode 100644 index 0000000..bd62799 --- /dev/null +++ a/Algorithms/TDRC.py @@ -0,0 +1,29 @@ +from Algorithms.BaseGradient import BaseGradient +import numpy as np + + +# noinspection DuplicatedCode +class TDRC(BaseGradient): + def __init__(self, task, **kwargs): + super().__init__(task, **kwargs) + self.tdrc_beta = kwargs['tdrc_beta'] + + @staticmethod + def related_parameters(): + return ['alpha', 'lmbda', 'eta', 'tdrc_beta'] + + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal) + alpha_v = self.compute_second_step_size() + self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p) + self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) - alpha_v * self.tdrc_beta * self.v + 
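# Note: this is the GTD/TDC-style update plus an extra -alpha_v * self.tdrc_beta * self.v term that regularizes the secondary weights toward zero; with tdrc_beta = 0 it reduces to the GTD update. +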
+ def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies( + s, s_p, r, is_terminal) + self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x) + phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1) + self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p) + self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum( + x * self.v, 1)[:, None] * stacked_x) - (alphav_vec * self.tdrc_beta)[:, None] * self.v + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Algorithms/Vtrace.py a/Algorithms/Vtrace.py new file mode 100644 index 0000000..ca1cc65 --- /dev/null +++ a/Algorithms/Vtrace.py @@ -0,0 +1,19 @@ +from Algorithms.BaseVariableLmbda import BaseVariableLmbda +import numpy as np + + +class Vtrace(BaseVariableLmbda): + def learn_single_policy(self, s, s_p, r, is_terminal): + delta, alpha, x, *_, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal) + self.z = min(self.old_rho, 1) * self.gamma * self.lmbda * self.z + x + self.w += alpha * delta * self.z + self.old_rho = pi / mu + + def learn_multiple_policies(self, s, s_p, r, is_terminal): + delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal) + delta = rho * delta + truncated_old_rho = np.minimum(self.old_rho, np.ones(self.task.num_policies)) + self.z = (truncated_old_rho * self.gamma_vec_t * self.lmbda)[:, None] * self.z + stacked_x + self.w += alpha_vec[:, None] * (delta[:, None] * self.z) + self.old_rho = rho + self.gamma_vec_t = self.gamma_vec_tp diff --git b/Assets/Emphatics_sensitivity.png a/Assets/Emphatics_sensitivity.png new file mode 100644 index 0000000..f12f1b8 Binary files /dev/null and a/Assets/Emphatics_sensitivity.png differ diff --git b/Assets/FourRoomGridWorld.gif a/Assets/FourRoomGridWorld.gif new file mode 100644 index 0000000..80901d0 Binary files /dev/null and a/Assets/FourRoomGridWorld.gif differ diff --git b/Assets/Gradients_sensitivity.png a/Assets/Gradients_sensitivity.png new file mode 100644 index 0000000..b040ab7 Binary files /dev/null and a/Assets/Gradients_sensitivity.png differ diff --git b/Assets/chain.gif a/Assets/chain.gif new file mode 100644 index 0000000..a7985f1 Binary files /dev/null and a/Assets/chain.gif differ diff --git b/Assets/eight_state_collision.png a/Assets/eight_state_collision.png new file mode 100644 index 0000000..184ce9b Binary files /dev/null and a/Assets/eight_state_collision.png differ diff --git b/Assets/learning_curves.png a/Assets/learning_curves.png new file mode 100644 index 0000000..c5bcdea Binary files /dev/null and a/Assets/learning_curves.png differ diff --git b/Assets/parameters.png a/Assets/parameters.png new file mode 100644 index 0000000..f291b79 Binary files /dev/null and a/Assets/parameters.png differ diff --git b/Assets/plots.png a/Assets/plots.png new file mode 100644 index 0000000..d7c041c Binary files /dev/null and a/Assets/plots.png differ diff --git b/Assets/rlai.png a/Assets/rlai.png new file mode 100644 index 0000000..551f907 Binary files /dev/null and a/Assets/rlai.png differ diff --git b/Assets/sensitivity_curves_of_all_algs.png a/Assets/sensitivity_curves_of_all_algs.png new file mode 100644 index 0000000..d03f7a2 Binary files /dev/null and a/Assets/sensitivity_curves_of_all_algs.png differ diff --git b/Assets/specific_learning_curves.png a/Assets/specific_learning_curves.png 
new file mode 100644 index 0000000..3747bfb Binary files /dev/null and a/Assets/specific_learning_curves.png differ diff --git b/Assets/value_functions.png a/Assets/value_functions.png new file mode 100644 index 0000000..d6f4311 Binary files /dev/null and a/Assets/value_functions.png differ diff --git b/Environments/Chain.py a/Environments/Chain.py new file mode 100644 index 0000000..13fa356 --- /dev/null +++ a/Environments/Chain.py @@ -0,0 +1,95 @@ +import numpy as np + + +class Chain: + def __init__(self, states_number: int = 8, start_state_number: int = 4, **kwargs): + assert start_state_number < states_number, "start states numbers should be less than state number" + + self._states_number = states_number + self._start_state_number = start_state_number + self._terminal = self._states_number + self._state = None + self.RIGHT_ACTION = 0 + self.RETREAT_ACTION = 1 + self.num_states = states_number + self._window = None + + def reset(self): + self._state = np.random.randint(0, self._start_state_number) + return self._state + + def step(self, action): + if action == self.RETREAT_ACTION: + return self._terminal, 0, True, {} + + next_state = self._state + 1 + if next_state == self._terminal: + return self._terminal, 1, True, {} + + self._state = next_state + return self._state, 0, False, {} + + def render(self, mode='human'): + if mode == 'human': + import sys + from Environments.utils import colorize + corridor_map = [ + str(i) if i > self._start_state_number + else colorize(str(i), "blue", highlight=False) + for i in range(self._states_number) + ] + corridor_map.append(colorize("T", "red", highlight=False)) + corridor_map[self._state] = colorize(corridor_map[self._state], "green", highlight=True) + + sys.stdout.write(f'{"|".join(corridor_map)}\n') + + if mode == "rgb" or mode == "screen": + RGB_COLORS = { + 'red': np.array([240, 52, 52]), + 'black': np.array([0, 0, 0]), + 'green': np.array([77, 181, 33]), + 'blue': np.array([29, 111, 219]), + 'purple': np.array([112, 39, 195]), + 'yellow': np.array([217, 213, 104]), + 'grey': np.array([192, 195, 196]), + 'light_grey': np.array([230, 230, 230]), + 'white': np.array([255, 255, 255]) + } + img = np.zeros((self.num_states, 1, 3), dtype=np.uint8) + img[:, 0] = RGB_COLORS['grey'] + img[:self._start_state_number - 1, 0] = RGB_COLORS['yellow'] + img[self._terminal - 1, 0] = RGB_COLORS['black'] + img[self._state - 1, 0] = RGB_COLORS['green'] + + img = np.transpose(img, (1, 0, 2)) + if mode == "screen": + from pyglet.window import Window + from pyglet.text import Label + from pyglet.gl import GLubyte + from pyglet.image import ImageData + zoom = 50 + if self._window is None: + self._window = Window(self.num_states * zoom, 1 * zoom) + + dt = np.kron(img, np.ones((zoom, zoom, 1))) + dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8')) + texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture() + self._window.clear() + self._window.switch_to() + self._window.dispatch_events() + texture.blit(0, 0) + # self._info.draw() + self._window.flip() + return np.flip(img, axis=0) + + +if __name__ == '__main__': + env = Chain() + env.reset() + for step in range(1, 1000): + action = np.random.randint(0, 2) + sp, r, terminal, _ = env.step(action=action) + env.render(mode="screen") + if terminal: + env.reset() + print('env reset') diff --git b/Environments/FourRoomGridWorld.py a/Environments/FourRoomGridWorld.py new file mode 100644 index 0000000..d78a3f7 --- /dev/null +++ a/Environments/FourRoomGridWorld.py @@ -0,0 +1,173 @@ +import numpy 
as np +# from Environments.rendering import Render +# from gym import utils +# import gym +# import sys + +BLOCK_NORMAL, BLOCK_WALL, BLOCK_HALLWAY, BLOCK_AGENT = 0, 1, 2, 3 +RGB_COLORS = { + 'red': np.array([240, 52, 52]), + 'black': np.array([0, 0, 0]), + 'green': np.array([77, 181, 33]), + 'blue': np.array([29, 111, 219]), + 'purple': np.array([112, 39, 195]), + 'yellow': np.array([217, 213, 104]), + 'grey': np.array([192, 195, 196]), + 'light_grey': np.array([230, 230, 230]), + 'white': np.array([255, 255, 255]) +} +four_room_map = [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] +] + + +class FourRoomGridWorld: + def __init__(self, stochasticity_fraction=0.0): + self._grid = np.transpose(np.flip(np.array(four_room_map, dtype=np.uint8), axis=0)[1:-1, 1:-1]) + self._max_row, self._max_col = self._grid.shape + self._normal_tiles = np.where(self._grid == BLOCK_NORMAL) + self._hallways_tiles = np.where(self._grid == BLOCK_HALLWAY) + self._walls_tiles = np.where(self._grid == BLOCK_WALL) + self.num_states = self._grid.size + + self._state = None + self.ACTION_UP, self.ACTION_DOWN, self.ACTION_RIGHT, self.ACTION_LEFT = 0, 1, 2, 3 + self.num_actions = 4 + self._stochasticity_fraction = stochasticity_fraction + self.hallways = { + 0: (5, 1), + 1: (1, 5), + 2: (5, 8), + 3: (8, 4) + } + self._window, self._info = None, None + + def reset(self): + self._state = (0, 0) + return self.get_state_index(*self._state) + + def step(self, action): + x, y = self._state + is_stochastic_selected = False + # if self._stochasticity_fraction >= np.random.uniform(): + # action_probability = [1 / (self.num_actions - 1) if i != action else 0 for i in range(self.num_actions)] + # action = np.random.choice(self.num_actions, 1, p=action_probability)[0] + # is_stochastic_selected = True + x_p, y_p = self._next(action, *self._state) + is_done = self._grid[x_p, y_p] == BLOCK_HALLWAY + reward = 1 if is_done else 0 + self._state = (x_p, y_p) + return self.get_state_index(*self._state), reward, False, { + 'x': x, 'y': y, + 'x_p': x_p, 'y_p': y_p, + 'is_stochastic_selected': is_stochastic_selected, + 'selected_action': action} + + def get_xy(self, state): + return (state % self._max_row), (state // self._max_col) + + def get_state_index(self, x, y): + return y * self._max_col + x + + def _next(self, action, x, y): + + def move(current_x, current_y, next_x, next_y): + if next_y < 0 or next_x < 0: + return current_x, current_y + if next_y >= self._max_col or next_x >= self._max_row: + return current_x, current_y + if self._grid[next_x, next_y] == BLOCK_WALL: + return current_x, current_y + return next_x, next_y + + switcher = { + self.ACTION_DOWN: lambda pox_x, pos_y: move(pox_x, pos_y, pox_x, pos_y - 1), + self.ACTION_RIGHT: lambda pox_x, pos_y: move(pox_x, pos_y, pox_x + 1, pos_y), + self.ACTION_UP: lambda pox_x, pos_y: move(pox_x, y, pox_x, pos_y + 1), + self.ACTION_LEFT: lambda pox_x, pos_y: move(pox_x, pos_y, pox_x - 1, pos_y), + } + move_func = switcher.get(action) + return move_func(x, y) + + def render(self, 
mode='human'): + import sys + from Environments.utils import colorize + color = { + BLOCK_NORMAL: lambda c: colorize(c, "white", highlight=True), + BLOCK_WALL: lambda c: colorize(c, "gray", highlight=True), + BLOCK_HALLWAY: lambda c: colorize(c, "green", highlight=True), + } + if mode == 'human': + outfile = sys.stdout + img = [ + [color[b](' ') + for x, b + in enumerate(line)] + for y, line in enumerate(four_room_map)] + img[self._max_row - self._state[1]][self._state[0] + 1] = colorize(' ', "red", + highlight=True) + for line in img: + outfile.write(f'{"".join(line)}\n') + outfile.write('\n') + if mode == "rgb" or mode == "screen": + x, y = self._state + img = np.zeros((*self._grid.shape, 3), dtype=np.uint8) + img[self._normal_tiles] = RGB_COLORS['light_grey'] + + # if render_cls is not None: + # assert render_cls is not type(Render), "render_cls should be Render class" + # img = render_cls.render(img) + + img[self._walls_tiles] = RGB_COLORS['black'] + img[self._hallways_tiles] = RGB_COLORS['green'] + img[x, y] = RGB_COLORS['red'] + + ext_img = np.zeros((self._max_row + 2, self._max_col + 2, 3), dtype=np.uint8) + ext_img[1:-1, 1:-1] = np.transpose(img, (1, 0, 2)) + if mode == "screen": + + from pyglet.window import Window + from pyglet.text import Label + from pyglet.gl import GLubyte + from pyglet.image import ImageData + zoom = 20 + if self._window is None: + self._window = Window((self._max_row + 2) * zoom, (self._max_col + 2) * zoom) + self._info = Label('Four Room Grid World', font_size=10, x=5, y=5) + # self._info.text = f'x: {x}, y: {y}' + dt = np.kron(ext_img, np.ones((zoom, zoom, 1))) + dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8')) + texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture() + self._window.clear() + self._window.switch_to() + self._window.dispatch_events() + texture.blit(0, 0) + # self._info.draw() + self._window.flip() + return np.flip(ext_img, axis=0) + + +if __name__ == '__main__': + mode = 'human' + mode = 'screen' + env = FourRoomGridWorld() + env.reset() + for step in range(1, 100): + action = np.random.randint(0, 4) + sp, r, terminal, _ = env.step(action=action) + env.render(mode=mode) + if terminal: + env.reset() + print('env reset') diff --git b/Environments/rendering.py a/Environments/rendering.py new file mode 100644 index 0000000..53196f2 --- /dev/null +++ a/Environments/rendering.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +import numpy as np + + +class Render(ABC): + @abstractmethod + def render(self, img): + raise NotImplementedError + + +class ErrorRender(Render): + def __init__(self, num_policies, num_steps): + self.num_steps = num_steps + self.num_policies = num_policies + self._error, self._max_error, self._valid_state = None, None, None + + def render(self, img): + # self.color_policy(img, 0) + self.color_policy(img, 1) + # self.color_policy(img, 2) + self.color_policy(img, 3) + # self.color_policy(img, 4) + self.color_policy(img, 5) + # self.color_policy(img, 6) + self.color_policy(img, 7) + + return img + + def add_error(self, error): + if self._max_error is None: + self._max_error = np.abs(error).reshape(8, 11, 11) + self._valid_state = np.array(self._max_error) + self._valid_state[self._valid_state != 0] = 1 + + self._error = np.abs(error).reshape(8, 11, 11) + + def color_policy(self, img, policy_number): + e = self._error[policy_number] + x = self._max_error[policy_number] + d = np.clip((230 * e / x), 10, 255) + d = d * self._valid_state[policy_number] + d = 
np.nan_to_num(d).astype(np.uint8).T + d = np.repeat(d, 3).reshape(11, 11, 3) + d[:, :, 2] = 230 + c = np.where(self._valid_state[policy_number].T == 1) + img[c] = d[c] + return img diff --git b/Environments/utils.py a/Environments/utils.py new file mode 100644 index 0000000..5dd732e --- /dev/null +++ a/Environments/utils.py @@ -0,0 +1,32 @@ +"""A set of common utilities used within the environments. These are +not intended as API functions, and will not remain stable over time. +""" + +color2num = dict( + gray=30, + red=31, + green=32, + yellow=33, + blue=34, + magenta=35, + cyan=36, + white=37, + crimson=38 +) + + +def colorize(string, color, bold=False, highlight=False): + """Return string surrounded by appropriate terminal color codes to + print colorized text. Valid colors: gray, red, green, yellow, + blue, magenta, cyan, white, crimson + """ + + attr = [] + num = color2num[color] + if highlight: + num += 10 + attr.append(str(num)) + if bold: + attr.append('1') + attrs = ';'.join(attr) + return '\x1b[%sm%s\x1b[0m' % (attrs, string) diff --git b/Experiments/1HVFourRoom/ABTD/ABTD.json a/Experiments/1HVFourRoom/ABTD/ABTD.json new file mode 100644 index 0000000..5ffcd79 --- /dev/null +++ a/Experiments/1HVFourRoom/ABTD/ABTD.json @@ -0,0 +1,17 @@ +{ + "agent": "ABTD", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "zeta": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/ETD/ETD.json a/Experiments/1HVFourRoom/ETD/ETD.json new file mode 100644 index 0000000..c4c9d6e --- /dev/null +++ a/Experiments/1HVFourRoom/ETD/ETD.json @@ -0,0 +1,17 @@ +{ + "agent": "ETD", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/ETDLB/ETDLB.json a/Experiments/1HVFourRoom/ETDLB/ETDLB.json new file mode 100644 index 0000000..3ddc98f --- /dev/null +++ a/Experiments/1HVFourRoom/ETDLB/ETDLB.json @@ -0,0 +1,20 @@ +{ + "agent": "ETDLB", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "beta": [ + 0.0, 0.2, 0.4, 0.6, 0.8, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/GTD/GTD.json a/Experiments/1HVFourRoom/GTD/GTD.json new file mode 100644 index 0000000..dfbfd86 --- /dev/null +++ 
a/Experiments/1HVFourRoom/GTD/GTD.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/GTD2/GTD2.json a/Experiments/1HVFourRoom/GTD2/GTD2.json new file mode 100644 index 0000000..996c41b --- /dev/null +++ a/Experiments/1HVFourRoom/GTD2/GTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD2", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/HTD/HTD.json a/Experiments/1HVFourRoom/HTD/HTD.json new file mode 100644 index 0000000..5a8ab6d --- /dev/null +++ a/Experiments/1HVFourRoom/HTD/HTD.json @@ -0,0 +1,20 @@ +{ + "agent": "HTD", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/PGTD2/PGTD2.json a/Experiments/1HVFourRoom/PGTD2/PGTD2.json new file mode 100644 index 0000000..947ebef --- /dev/null +++ a/Experiments/1HVFourRoom/PGTD2/PGTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "PGTD2", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/TB/TB.json a/Experiments/1HVFourRoom/TB/TB.json new file mode 100644 index 0000000..dc29a86 --- /dev/null +++ a/Experiments/1HVFourRoom/TB/TB.json @@ -0,0 +1,17 @@ +{ + "agent": "TB", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 
50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/TD/TD.json a/Experiments/1HVFourRoom/TD/TD.json new file mode 100644 index 0000000..375d2ce --- /dev/null +++ a/Experiments/1HVFourRoom/TD/TD.json @@ -0,0 +1,17 @@ +{ + "agent": "TD", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/TDRC/TDRC.json a/Experiments/1HVFourRoom/TDRC/TDRC.json new file mode 100644 index 0000000..fb42e9f --- /dev/null +++ a/Experiments/1HVFourRoom/TDRC/TDRC.json @@ -0,0 +1,23 @@ +{ + "agent": "TDRC", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ], + "tdrc_beta": [ + 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/1HVFourRoom/Vtrace/Vtrace.json a/Experiments/1HVFourRoom/Vtrace/Vtrace.json new file mode 100644 index 0000000..97c5acb --- /dev/null +++ a/Experiments/1HVFourRoom/Vtrace/Vtrace.json @@ -0,0 +1,17 @@ +{ + "agent": "Vtrace", + "environment": "FourRoomGridWorld", + "task": "HighVarianceLearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/ABTD/ABTD.json a/Experiments/FirstChain/ABTD/ABTD.json new file mode 100644 index 0000000..47fa1a8 --- /dev/null +++ a/Experiments/FirstChain/ABTD/ABTD.json @@ -0,0 +1,17 @@ +{ + "agent": "ABTD", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "zeta": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/ETD/ETD.json 
a/Experiments/FirstChain/ETD/ETD.json new file mode 100644 index 0000000..2b78335 --- /dev/null +++ a/Experiments/FirstChain/ETD/ETD.json @@ -0,0 +1,17 @@ +{ + "agent": "ETD", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/ETDLB/ETDLB.json a/Experiments/FirstChain/ETDLB/ETDLB.json new file mode 100644 index 0000000..e54e9c6 --- /dev/null +++ a/Experiments/FirstChain/ETDLB/ETDLB.json @@ -0,0 +1,20 @@ +{ + "agent": "ETDLB", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "beta": [ + 0.0, 0.2, 0.4, 0.6, 0.8, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/GTD/GTD.json a/Experiments/FirstChain/GTD/GTD.json new file mode 100644 index 0000000..92bcc81 --- /dev/null +++ a/Experiments/FirstChain/GTD/GTD.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/GTD2/GTD2.json a/Experiments/FirstChain/GTD2/GTD2.json new file mode 100644 index 0000000..0e926c5 --- /dev/null +++ a/Experiments/FirstChain/GTD2/GTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD2", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/HTD/HTD.json a/Experiments/FirstChain/HTD/HTD.json new file mode 100644 index 0000000..461ccff --- /dev/null +++ a/Experiments/FirstChain/HTD/HTD.json @@ -0,0 +1,20 @@ +{ + "agent": "HTD", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 
256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/PGTD2/PGTD2.json a/Experiments/FirstChain/PGTD2/PGTD2.json new file mode 100644 index 0000000..8c78f5c --- /dev/null +++ a/Experiments/FirstChain/PGTD2/PGTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "PGTD2", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/TB/TB.json a/Experiments/FirstChain/TB/TB.json new file mode 100644 index 0000000..d4d9d00 --- /dev/null +++ a/Experiments/FirstChain/TB/TB.json @@ -0,0 +1,17 @@ +{ + "agent": "TB", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/TD/TD.json a/Experiments/FirstChain/TD/TD.json new file mode 100644 index 0000000..623c42e --- /dev/null +++ a/Experiments/FirstChain/TD/TD.json @@ -0,0 +1,17 @@ +{ + "agent": "TD", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/TDRC/TDRC.json a/Experiments/FirstChain/TDRC/TDRC.json new file mode 100644 index 0000000..6087ab1 --- /dev/null +++ a/Experiments/FirstChain/TDRC/TDRC.json @@ -0,0 +1,23 @@ +{ + "agent": "TDRC", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ], + "tdrc_beta": [ + 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstChain/Vtrace/Vtrace.json a/Experiments/FirstChain/Vtrace/Vtrace.json new file mode 100644 index 0000000..e8b79ad --- /dev/null +++ a/Experiments/FirstChain/Vtrace/Vtrace.json @@ -0,0 +1,17 @@ +{ + "agent": "Vtrace", + "environment": "Chain", + "task": "EightStateOffPolicyRandomFeat", + "number_of_runs": 50, + "number_of_steps": 20000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 
0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/ABTD/ABTD.json a/Experiments/FirstFourRoom/ABTD/ABTD.json new file mode 100644 index 0000000..21dec1b --- /dev/null +++ a/Experiments/FirstFourRoom/ABTD/ABTD.json @@ -0,0 +1,17 @@ +{ + "agent": "ABTD", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "zeta": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/ETD/ETD.json a/Experiments/FirstFourRoom/ETD/ETD.json new file mode 100644 index 0000000..c260770 --- /dev/null +++ a/Experiments/FirstFourRoom/ETD/ETD.json @@ -0,0 +1,17 @@ +{ + "agent": "ETD", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/ETDLB/ETDLB.json a/Experiments/FirstFourRoom/ETDLB/ETDLB.json new file mode 100644 index 0000000..32c8413 --- /dev/null +++ a/Experiments/FirstFourRoom/ETDLB/ETDLB.json @@ -0,0 +1,20 @@ +{ + "agent": "ETDLB", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "beta": [ + 0.0, 0.2, 0.4, 0.6, 0.8, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/GTD/GTD.json a/Experiments/FirstFourRoom/GTD/GTD.json new file mode 100644 index 0000000..32897cc --- /dev/null +++ a/Experiments/FirstFourRoom/GTD/GTD.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/GTD2/GTD2.json a/Experiments/FirstFourRoom/GTD2/GTD2.json new file mode 100644 index 0000000..ec21488 --- /dev/null +++ a/Experiments/FirstFourRoom/GTD2/GTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "GTD2", + "environment": "FourRoomGridWorld", 
+ "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/HTD/HTD.json a/Experiments/FirstFourRoom/HTD/HTD.json new file mode 100644 index 0000000..3ebff2c --- /dev/null +++ a/Experiments/FirstFourRoom/HTD/HTD.json @@ -0,0 +1,20 @@ +{ + "agent": "HTD", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/PGTD2/PGTD2.json a/Experiments/FirstFourRoom/PGTD2/PGTD2.json new file mode 100644 index 0000000..8f7f1a7 --- /dev/null +++ a/Experiments/FirstFourRoom/PGTD2/PGTD2.json @@ -0,0 +1,20 @@ +{ + "agent": "PGTD2", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/TB/TB.json a/Experiments/FirstFourRoom/TB/TB.json new file mode 100644 index 0000000..d7bae3d --- /dev/null +++ a/Experiments/FirstFourRoom/TB/TB.json @@ -0,0 +1,17 @@ +{ + "agent": "TB", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/TD/TD.json a/Experiments/FirstFourRoom/TD/TD.json new file mode 100644 index 0000000..7ee4b6d --- /dev/null +++ a/Experiments/FirstFourRoom/TD/TD.json @@ -0,0 +1,17 @@ +{ + "agent": "TD", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 
0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/TDRC/TDRC.json a/Experiments/FirstFourRoom/TDRC/TDRC.json new file mode 100644 index 0000000..38ea260 --- /dev/null +++ a/Experiments/FirstFourRoom/TDRC/TDRC.json @@ -0,0 +1,23 @@ +{ + "agent": "TDRC", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "eta": [ + 0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ], + "tdrc_beta": [ + 1.0 + ] + } +} \ No newline at end of file diff --git b/Experiments/FirstFourRoom/Vtrace/Vtrace.json a/Experiments/FirstFourRoom/Vtrace/Vtrace.json new file mode 100644 index 0000000..7ba2682 --- /dev/null +++ a/Experiments/FirstFourRoom/Vtrace/Vtrace.json @@ -0,0 +1,17 @@ +{ + "agent": "Vtrace", + "environment": "FourRoomGridWorld", + "task": "LearnEightPoliciesTileCodingFeat", + "number_of_runs": 50, + "number_of_steps": 50000, + "sub_sample": 1, + "meta_parameters": { + "alpha": [ + 0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281, + 0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0 + ], + "lmbda": [ + 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0 + ] + } +} \ No newline at end of file diff --git b/ExportBin/ReadMe a/ExportBin/ReadMe new file mode 100644 index 0000000..f73d946 --- /dev/null +++ a/ExportBin/ReadMe @@ -0,0 +1 @@ +This directory contains all the export.dat files created when submitting jobs on Cedar. 
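The experiment JSON files above each declare a sweep over meta-parameters, and Job/Cedar_Create_Config_Template.sh below expands such a sweep into one `export ...` line per parameter combination in exports_<algorithm>.dat. As a rough, stand-alone illustration (not part of the repository) of how large those sweeps are, the grid implied by one of the configs above can be enumerated directly from the JSON:

import json
from itertools import product

def expand_grid(config_path):
    """Yield one dict per meta-parameter combination declared in a config file.

    Illustration only: the repository builds the equivalent grid with the
    nested bash loops in Job/Cedar_Create_Config_Template.sh.
    """
    with open(config_path) as f:
        meta = json.load(f)['meta_parameters']
    names = sorted(meta)  # e.g. ['alpha', 'eta', 'lmbda'] for GTD
    for values in product(*(meta[name] for name in names)):
        yield dict(zip(names, values))

# For Experiments/FirstChain/GTD/GTD.json (19 alphas x 8 etas x 3 lambdas)
# this yields 456 settings.
settings = list(expand_grid('Experiments/FirstChain/GTD/GTD.json'))
print(len(settings), settings[0])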
\ No newline at end of file diff --git b/Job/Cedar_Create_Config_Template.sh a/Job/Cedar_Create_Config_Template.sh new file mode 100644 index 0000000..6207655 --- /dev/null +++ a/Job/Cedar_Create_Config_Template.sh @@ -0,0 +1,38 @@ +#!/bin/bash +alpha=(__ALPHA__) +lmbda=(__LMBDA__) +eta=(__ETA__) +beta=(__BETA__) +zeta=(__ZETA__) +tdrc_beta=(__TDRCBETA__) +gem_alpha=(__GEMALPHA__) +gem_beta=(__GEMBETA__) +num_of_runs=__NUMOFRUNS__ +num_steps=__NUMSTEPS__ +sub_sample=__SUBSAMPLE__ +algorithm=__ALGORITHM__ +environment=__ENVIRONMENT__ +task=__TASK__ +save_path=__SAVEPATH__ + +rm -f exports_${algorithm}.dat +for A in ${alpha[@]}; do + for L in ${lmbda[@]}; do + for E in ${eta[@]}; do + for B in ${beta[@]}; do + for Z in ${zeta[@]}; do + for T in ${tdrc_beta[@]}; do + for GA in ${gem_alpha[@]}; do + for GB in ${gem_beta[@]}; do + echo export SAVE_PATH=${save_path} ENVIRONMENT=${environment} ALGORITHM=${algorithm} \ + TASK=${task} ALPHA=${A} LMBDA=${L} ETA=${E} BETA=${B} ZETA=${Z} TDRCBETA=${T} GEMALPHA=${GA} \ + GEMBETA=${GB} NUMOFRUNS=${num_of_runs} NUMSTEPS=${num_steps} SUBSAMPLE=${sub_sample} \ + >>exports_${algorithm}.dat + done + done + done + done + done + done + done +done diff --git b/Job/JobBuilder.py a/Job/JobBuilder.py new file mode 100644 index 0000000..0ed3f0b --- /dev/null +++ a/Job/JobBuilder.py @@ -0,0 +1,179 @@ +import os +import json +import numpy as np +from utils import ImmutableDict +import time + +default_params = ImmutableDict( + { + 'agent': 'GEMETD', + 'task': 'EightStateCollision', + 'environment': 'Chain', + 'exp': 'FirstChain', + # 'agent': 'HTD', + # 'task': 'LearnEightPoliciesTileCodingFeat', + # 'environment': 'FourRoomGridWorld', + # 'exp': 'FirstFourRoom', + # 'agent': 'LSTD', + # 'task': 'HighVarianceLearnEightPoliciesTileCodingFeat', + # 'environment': 'FourRoomGridWorld', + # 'exp': '1HVFourRoom', + + 'save_value_function': True, + 'sub_sample': 1, + 'num_of_runs': 3, + 'num_steps': 20_000, + 'meta_parameters': { + 'alpha': 0.001953125, + 'eta': 16.0, + 'beta': 0.9, + 'zeta': 0.9, + 'lmbda': 0.0, + 'tdrc_beta': 1.0, + 'gem_alpha': 0.1, + 'gem_beta': 0.1 + } + } +) + + +class JobBuilder: + def __init__(self, json_path, server_name): + self._path = json_path + self.server_name = server_name + with open(self._path) as f: + self._params = json.load(f) + + self._batch_params = ImmutableDict( + { + 'ALPHA': ' '.join([f'{num:.10f}' for num in self.alpha]), + 'LMBDA': ' '.join([f'{num:.5f}' for num in self.lmbda]), + 'ETA': ' '.join([f'{num:.10f}' for num in self.eta]), + 'BETA': ' '.join([f'{num:.5f}' for num in self.beta]), + 'ZETA': ' '.join([f'{num:.5f}' for num in self.zeta]), + 'TDRCBETA': ' '.join([f'{num:.5f}' for num in self.tdrc_beta]), + 'GEMALPHA': ' '.join([f'{num:.5f}' for num in self.gem_alpha]), + 'GEMBETA': ' '.join([f'{num:.5f}' for num in self.gem_beta]), + 'NUMOFRUNS': f'{self.num_of_runs}', + 'NUMSTEPS': f'{self.num_steps}', + 'SUBSAMPLE': f'{self.sub_sample}', + 'ALGORITHM': self.agent, + 'TASK': self.task, + 'ENVIRONMENT': self.environment, + 'SAVEPATH': self.save_path + }) + + @property + def tdrc_beta(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('tdrc_beta', [default_params['meta_parameters']['tdrc_beta']])) + + @property + def gem_alpha(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('gem_alpha', [default_params['meta_parameters']['gem_alpha']])) + + @property + def gem_beta(self): + parameters = self._params.get('meta_parameters') + return 
np.asarray(parameters.get('gem_beta', [default_params['meta_parameters']['gem_beta']])) + + @property + def alpha(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('alpha', [default_params['meta_parameters']['alpha']])) + + @property + def lmbda(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('lmbda', [default_params['meta_parameters']['lmbda']])) + + @property + def eta(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('eta', [default_params['meta_parameters']['eta']])) + + @property + def beta(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('beta', [default_params['meta_parameters']['beta']])) + + @property + def zeta(self): + parameters = self._params.get('meta_parameters') + return np.asarray(parameters.get('zeta', [default_params['meta_parameters']['zeta']])) + + @property + def agent(self): + return self._params.get('agent', default_params['agent']) + + @property + def task(self): + return self._params.get('task', default_params['task']) + + @property + def num_of_runs(self): + return np.asarray(self._params.get('number_of_runs', default_params['num_of_runs'])) + + @property + def num_steps(self): + return np.asarray(self._params.get('number_of_steps', default_params['num_steps'])) + + @property + def sub_sample(self): + return np.asarray(self._params.get('sub_sample', default_params['sub_sample'])) + + @property + def environment(self): + return self._params.get('environment', default_params['environment']) + + @property + def save_path(self): + return os.path.dirname(self._path).replace("/Experiments/", "/Results/") + + def create_dat_file(self): + with open('Job/Cedar_Create_Config_Template.sh', 'r') as f: + text = f.read() + for k, v in self._batch_params.items(): + text = text.replace(f'__{k}__', v) + return text + + def to_shell(self): + if self.server_name.upper() == 'NODE': + with open('Job/SubmitJobsTemplates.SL', 'r') as f: + text = f.read() + for k, v in self._batch_params.items(): + text = text.replace(f'__{k}__', v) + return text + elif self.server_name.upper() == 'CPU': + with open('Job/SubmitJobsTemplatesCedar.SL', 'r') as f: + text = f.read() + alg = self._batch_params['ALGORITHM'] + num_of_jobs = sum(1 for _ in open(f'exports_{alg}.dat')) + text = text.replace('__ALG__', self._batch_params['ALGORITHM']) + text = text.replace('__NUM_OF_JOBS__', str(num_of_jobs)) + text = text.replace('__NAME_OF_EXP__', f'{self._batch_params["TASK"]}_{self._batch_params["ALGORITHM"]}') + return text + + def run_batch(self): + if self.server_name.upper() == 'NODE': + print('Submitting the ' + self.agent + ' algorithm jobs on nodes...') + elif self.server_name.upper() == 'CPU': + print('Submitting the ' + self.agent + ' algorithm jobs on individual cpus...') + with open('Create_Configs.sh', 'wt') as f: + f.write(self.create_dat_file()) + time.sleep(1) + os.system('bash Create_Configs.sh') + with open('Submit_Jobs.SL', 'wt') as f: + f.write(self.to_shell()) + time.sleep(1) + os.system('sbatch Submit_Jobs.SL') + time.sleep(1) + os.remove('Submit_Jobs.SL') + if self.server_name.upper() == 'CPU': + os.remove('Create_Configs.sh') + # alg = self._batch_params['ALGORITHM'] + # os.remove(f'exports_{alg}.dat') + + def __call__(self): + return self.run_batch() diff --git b/Job/SubmitJobsTemplates.SL a/Job/SubmitJobsTemplates.SL new file mode 100644 index 0000000..dcfe527 --- /dev/null +++ a/Job/SubmitJobsTemplates.SL @@ -0,0 
+1,51 @@ +#!/bin/bash +# SLURM submission script for submitting multiple serial jobs on Niagara +# +#SBATCH --account=xxx +#SBATCH --time=11:58:59 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=40 +#SBATCH --job-name __TASK_____ALGORITHM__ + + +alpha=(__ALPHA__) +lmbda=(__LMBDA__) +eta=(__ETA__) +beta=(__BETA__) +zeta=(__ZETA__) +tdrc_beta=(__TDRCBETA__) +gem_alpha=(__GEMALPHA__) +gem_beta=(__GEMBETA__) +num_of_runs=__NUMOFRUNS__ +num_steps=__NUMSTEPS__ +sub_sample=__SUBSAMPLE__ +algorithm=__ALGORITHM__ +environment=__ENVIRONMENT__ +task=__TASK__ +save_path=__SAVEPATH__ + +source ~/RLENV/bin/activate +module load NiaEnv/2019b +module load gnu-parallel +module load python + +cd $SLURM_SUBMIT_DIR || exit +export OMP_NUM_THREADS=1 + +echo "The number of available cores is echo $NCORES" +echo "Current working directory is $(pwd)" +echo "Running on hostname $(hostname)" +echo "Starting run at: $(date)" + +HOSTS=$(scontrol show hostnames $SLURM_NODELIST | tr '\n' ,) +NCORES=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE)) + + +parallel --env OMP_NUM_THREADS,PATH,LD_LIBRARY_PATH --joblog slurm-$SLURM_JOBID.log -j $NCORES -S $HOSTS --wd $PWD \ +python Learning.py ::: -sp ::: ${save_path} ::: -e ::: ${environment} ::: -alg ::: ${algorithm} ::: -t ::: ${task[@]} \ +::: -a ::: ${alpha[@]} ::: -nr ::: ${num_of_runs} ::: -ns ::: ${num_steps} ::: -et ::: ${eta[@]} \ +::: -l ::: ${lmbda[@]} ::: -z ::: ${zeta[@]} ::: -tb ::: ${tdrc_beta[@]} ::: -b ::: ${beta[@]} ::: \ +-ga ::: ${gem_alpha[@]} ::: -gb ::: ${gem_beta[@]} ::: -ss ::: ${sub_sample} + + +echo "Program test finished with exit code $? at: $(date)" diff --git b/Job/SubmitJobsTemplatesCedar.SL a/Job/SubmitJobsTemplatesCedar.SL new file mode 100644 index 0000000..1b2b033 --- /dev/null +++ a/Job/SubmitJobsTemplatesCedar.SL @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --account=xxx +#SBATCH --time=00:15:58 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=3G +#SBATCH --array=1-__NUM_OF_JOBS__ +#SBATCH --job-name __NAME_OF_EXP__ + +alg=__ALG__ +source ~/RLENV/bin/activate +`sed -n "${SLURM_ARRAY_TASK_ID}p" 0)[0] + + +def get_active_d_mu(task, d_mu, active_states, policy_no=0): + if task == 'EightStateCollision': + return d_mu + return d_mu[active_states, policy_no].squeeze() + + +def plot_distribution(**kwargs): + task = kwargs['task'] + d_mu = load_d_mu(task) + state_values = load_state_values(task) + for policy_no in range(state_values.shape[0]): + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + active_states = find_active_states(task, d_mu, state_values, policy_no) + active_d_mu = get_active_d_mu(task, d_mu, active_states, policy_no) + plot_d_mu(ax, active_d_mu, active_states) + plt.show() + if task == 'EightStateCollision': + break + + +def plot_dist_for_two_four_room_tasks(**kwargs): + task1 = 'LearnEightPoliciesTileCodingFeat' + task2 = 'HighVarianceLearnEightPoliciesTileCodingFeat' + save_dir = os.path.join('pdf_plots', 'Misc', 'CompareDistsFR') + d_mu1 = load_d_mu(task1) + d_mu2 = load_d_mu(task2) + state_values1 = load_state_values(task1) + state_values2 = load_state_values(task2) + for policy_no in range(state_values1.shape[0]): + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + active_states = find_active_states(task1, d_mu1, state_values1, policy_no) + active_d_mu = get_active_d_mu(task1, d_mu1, active_states, policy_no) + plot_d_mu(ax, active_d_mu, active_states) + active_states = find_active_states(task2, d_mu2, state_values2, policy_no) + active_d_mu = get_active_d_mu(task2, d_mu2, active_states, policy_no) + plot_d_mu(ax, active_d_mu, 
active_states) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + fig.savefig(os.path.join(save_dir, f"dist_policy_{policy_no}.pdf"), + format='pdf', dpi=1000, bbox_inches='tight') + plt.show() + + diff --git b/Plotting/plot_learning_curve.py a/Plotting/plot_learning_curve.py new file mode 100644 index 0000000..b65150b --- /dev/null +++ a/Plotting/plot_learning_curve.py @@ -0,0 +1,113 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pylab +from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \ + PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX +from Plotting.plot_utils import load_best_rerun_params_dict +from utils import create_name_for_save_load + + +def load_data(alg, exp, best_params, postfix=''): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + generic_name = create_name_for_save_load(best_params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy") + mean_lc = np.load(load_file_name) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy") + stderr_lc = np.load(load_file_name) + return mean_lc, stderr_lc + + +def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False, + smoothing_window=1): + zoomed_in = True if is_smoothed else False + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha'])) + color = ALG_COLORS[alg] + # if alg == 'TD': + # color = 'grey' + # alpha = 0.7 + if is_smoothed: + mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid') + mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid') + ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha) + ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2, + color=color, alpha=0.1*alpha) + # ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_xlim(exp_attrs.x_lim) + ax.set_ylim(exp_attrs.y_lim) + if zoomed_in: + ax.set_ylim([0.0, 0.4]) + else: + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.xaxis.set_ticks(exp_attrs.x_axis_ticks) + ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + +def get_ls_rmsve(alg, exp, sp): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + params = {'alpha': 0.01, 'lmbda': sp} + if alg == 'LSETD': + params['beta'] = 0.9 + generic_name = create_name_for_save_load(params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy") + return np.load(load_file_name) + + +def plot_ls_solution(ax, ls_rmsve, alg, sp): + lbl = f"{alg} $\\lambda=$ {sp}" + x = np.arange(ls_rmsve.shape[0]) + y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0]) + ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':') + # ax.legend() + + +def plot_learning_curve(**kwargs): + is_smoothed = True if 'is_smoothed' in kwargs else False + smoothing_window = kwargs.get('smoothing_window', 1) + for exp in kwargs['exps']: + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: 
+ for sp in kwargs['sp_list']: + save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final) + for alg_names in kwargs['alg_groups'].values(): + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + for alg in alg_names: + if alg in ['LSTD', 'LSETD']: + ls_rmsve = get_ls_rmsve(alg, exp, sp) + plot_ls_solution(ax, ls_rmsve, alg, sp) + continue + prefix = RERUN_POSTFIX if PLOT_RERUN else '' + current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp) + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=False, + is_smoothed=is_smoothed, smoothing_window=smoothing_window) + if PLOT_RERUN_AND_ORIG: + prefix = RERUN_POSTFIX + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=True, + is_smoothed=is_smoothed, smoothing_window=smoothing_window) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + pylab.gca().set_rasterized(True) + if PLOT_RERUN_AND_ORIG: + prefix = '_rerun_and_original' + elif PLOT_RERUN: + prefix = RERUN_POSTFIX + else: + prefix = '' + fig.savefig(os.path.join(save_dir, + f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + plt.show() + plt.close(fig) diff --git b/Plotting/plot_learning_curves_for_all_third_params.py a/Plotting/plot_learning_curves_for_all_third_params.py new file mode 100644 index 0000000..7fc7860 --- /dev/null +++ a/Plotting/plot_learning_curves_for_all_third_params.py @@ -0,0 +1,123 @@ +import os + +import matplotlib.pyplot as plt +import numpy as np +import pylab + +from Plotting.plot_params import ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN_AND_ORIG +from Plotting.plot_utils import make_params, get_alphas, make_current_params +from utils import create_name_for_save_load + + +def load_data(alg, exp, best_params, postfix=''): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + generic_name = create_name_for_save_load(best_params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy") + mean_lc = np.load(load_file_name) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy") + stderr_lc = np.load(load_file_name) + return mean_lc, stderr_lc + + +def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False): + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha'])) + color = ALG_COLORS[alg] + ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha) + ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2, + color=color, alpha=0.1*alpha) + # ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_xlim(exp_attrs.x_lim) + ax.set_ylim(exp_attrs.y_lim) + ax.xaxis.set_ticks(exp_attrs.x_axis_ticks) + ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +def get_ls_rmsve(alg, exp, sp): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + params = {'alpha': 0.01, 'lmbda': sp} 
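Plotting/plot_learning_curve.py above optionally smooths each learning curve with a moving average: np.convolve against a box kernel of length smoothing_window in 'valid' mode, which also shortens the plotted series by smoothing_window - 1 points. A self-contained sketch of that smoothing step follows (the array contents are made up for illustration):

import numpy as np

def smooth(curve, smoothing_window=5):
    # Box-kernel moving average, as in plot_data above; 'valid' mode drops the
    # partial windows at both ends, so the result has
    # len(curve) - smoothing_window + 1 entries.
    kernel = np.ones(smoothing_window) / smoothing_window
    return np.convolve(curve, kernel, mode='valid')

rmsve_curve = np.array([0.80, 0.70, 0.65, 0.60, 0.58, 0.55, 0.54, 0.52])  # made-up values
print(smooth(rmsve_curve, smoothing_window=3))  # 6 entries, each a 3-point mean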
+ if alg == 'LSETD': + params['beta'] = 0.9 + generic_name = create_name_for_save_load(params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy") + return np.load(load_file_name) + + +def plot_ls_solution(ax, ls_rmsve, alg, sp): + lbl = f"{alg} $\\lambda=$ {sp}" + x = np.arange(ls_rmsve.shape[0]) + y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0]) + ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--') + # ax.legend() + + +def load_specific_params_dict(alg, exp, sp, tp): + if alg == 'TD': + return {'alpha': 0.25, 'lmbda': sp} + if alg == 'ETD': + return {'alpha': 0.00390625, 'lmbda': sp} + if alg == 'ETDLB': + return {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2} + if alg == 'TDRC': + return {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0} + if alg == 'GTD': + return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp} + if alg == 'PGTD2': + return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp} + + +def load_sample_params_dict(alg, exp, sp): + fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp) + if alg in ['TD', 'ETD', 'TB', 'Vtrace']: + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp} + if alg == 'ABTD': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp} + if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']: + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'eta': tp_list[np.random.randint(0, len(tp_list))]} + if alg == 'ETDLB': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'beta': tp_list[np.random.randint(0, len(tp_list))]} + if alg == 'TDRC': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'eta': tp_list[np.random.randint(0, len(tp_list))], + 'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]} + + +def plot_all_learning_curves_for_third(**kwargs): + for exp in kwargs['exps']: + prefix = '' + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: + for sp in kwargs['sp_list']: + save_dir = os.path.join('pdf_plots', 'all_third_learning_curves', auc_or_final) + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + for alg in kwargs['algs']: + if alg in ['LSTD', 'LSETD']: + ls_rmsve = get_ls_rmsve(alg, exp, sp) + plot_ls_solution(ax, ls_rmsve, alg, sp) + continue + for tp in kwargs['tp_list']: + for fp in get_alphas(alg, exp): + for fop in [1.0]: + current_params = make_current_params(alg, sp, tp, fop, fp) + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + pylab.gca().set_rasterized(True) + fig.savefig(os.path.join(save_dir, + f"{prefix}_learning_curve_{'_'.join(kwargs['algs'])}{exp}Lmbda{sp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + plt.show() + plt.close(fig) diff --git b/Plotting/plot_learning_for_two_lambdas.py a/Plotting/plot_learning_for_two_lambdas.py new file mode 100644 index 0000000..176ff21 --- /dev/null +++ a/Plotting/plot_learning_for_two_lambdas.py @@ -0,0 +1,82 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pylab +from Plotting.plot_params import ALG_GROUPS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, RERUN_POSTFIX, \ + PLOT_RERUN_AND_ORIG +from Plotting.plot_utils import load_best_rerun_params_dict +from utils import create_name_for_save_load + + +# noinspection DuplicatedCode +def load_data(alg, exp, best_params, 
postfix=''): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + generic_name = create_name_for_save_load(best_params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy") + mean_lc = np.load(load_file_name) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy") + stderr_lc = np.load(load_file_name) + return mean_lc, stderr_lc + + +# noinspection DuplicatedCode +def plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, second_time=False): + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + color = 'blue' if sp else 'red' + lbl = (alg + r' $\lambda=$ ' + str(sp)) + ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha) + ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2, + color=color, alpha=0.1*alpha) + ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_xlim(exp_attrs.x_lim) + ax.set_ylim(exp_attrs.y_lim) + ax.xaxis.set_ticks(exp_attrs.x_axis_ticks) + ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.tick_params(axis='x', which='major', labelsize=exp_attrs.size_of_labels) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + + +# noinspection DuplicatedCode +def plot_learning_curve_for_lambdas(**kwargs): + for exp in kwargs['exps']: + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: + for alg_names in kwargs['alg_groups'].values(): + for alg in alg_names: + if alg in ['LSETD', 'LSTD']: + continue + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + save_dir = os.path.join('pdf_plots', 'learning_curves_for_lambdas', auc_or_final) + for sp in kwargs['sp_list']: + prefix = RERUN_POSTFIX if PLOT_RERUN else '' + current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp) + print(alg, current_params) + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs) + if PLOT_RERUN_AND_ORIG: + prefix = RERUN_POSTFIX + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, True) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + pylab.gca().set_rasterized(True) + if PLOT_RERUN_AND_ORIG: + prefix = '_rerun_and_original' + elif PLOT_RERUN: + prefix = RERUN_POSTFIX + else: + prefix = '' + fig.savefig(os.path.join(save_dir, + f"{prefix}_learning_curve_{alg}{exp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + # plt.show() + plt.close(fig) diff --git b/Plotting/plot_params.py a/Plotting/plot_params.py new file mode 100644 index 0000000..2a5627a --- /dev/null +++ a/Plotting/plot_params.py @@ -0,0 +1,41 @@ +from Plotting.plot_utils import FirstChainAttr, FirstFourRoomAttr, HVFirstFourRoomAttr +from Registry.AlgRegistry import alg_dict + + +PLOT_RERUN = True +PLOT_RERUN_AND_ORIG = False +if PLOT_RERUN and PLOT_RERUN_AND_ORIG: + PLOT_RERUN_AND_ORIG = False +RERUN_POSTFIX = '_rerun' +DEBUG_MODE = True + +# noinspection SpellCheckingInspection +COLORS = ['#000000', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", + "#17becf"] +ALG_COLORS = {alg_name: color for alg_name, color in zip(alg_dict.keys(), COLORS)} 
+ALG_COLORS['LSTD'] = ALG_COLORS['TD'] +ALG_COLORS['LSETD'] = ALG_COLORS['ETD'] +ALG_GROUPS = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD']} +EXPS = ['1HVFourRoom', 'FirstFourRoom', 'FirstChain'] +ALGS = [key for key in alg_dict.keys()] +ALGS.remove('LSTD') +ALGS.remove('LSETD') +# ALGS.remove('TDRC') +ALL_ALGS = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', 'LSTD', 'LSETD'] +# ALL_ALGS = ['TD', 'Vtrace', 'TB', 'ABTD'] +LMBDA_AND_ZETA = [0.0, 0.9] +AUC_AND_FINAL = ['auc', 'final'] +EXP_ATTRS = {'FirstChain': FirstChainAttr, 'FirstFourRoom': FirstFourRoomAttr, '1HVFourRoom': HVFirstFourRoomAttr} + +if DEBUG_MODE: + EXPS = ['FirstFourRoom', '1HVFourRoom'] + # ALGS = ['GTD'] + # ALL_ALGS.remove('ETDLB') + # ALL_ALGS.remove('LSTD') + # ALL_ALGS.remove('LSETD') + # LMBDA_AND_ZETA = [0.9] + AUC_AND_FINAL = ['final'] + # ALG_GROUPS = {'main_algs': ALL_ALGS} diff --git b/Plotting/plot_sensitivity.py a/Plotting/plot_sensitivity.py new file mode 100644 index 0000000..7227e60 --- /dev/null +++ a/Plotting/plot_sensitivity.py @@ -0,0 +1,94 @@ +import os +import matplotlib.pyplot as plt +import numpy as np + +from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \ + PLOT_RERUN_AND_ORIG, RERUN_POSTFIX +from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas +from utils import create_name_for_save_load + + +def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''): + res_path = make_res_path(alg, exp) + load_file_name = os.path.join(res_path, create_name_for_save_load( + best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy') + performance_over_alpha = np.load(load_file_name) + performance_over_alpha = replace_large_nan_inf( + performance_over_alpha, large=exp_attrs.learning_starting_point, + replace_with=exp_attrs.over_limit_replacement) + stderr_load_file_name = os.path.join( + res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) + + f'_stderr_{auc_or_final}_over_alpha{postfix}.npy') + std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name) + std_err_of_best_perf_over_alpha = replace_large_nan_inf( + std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0) + return performance_over_alpha, std_err_of_best_perf_over_alpha + + +# noinspection DuplicatedCode +def plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, second_time=False): + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + lbl = f'{alg}' + ax.set_xscale('log', basex=2) + color = ALG_COLORS[alg] + if alg == 'TD': + color = 'grey' + alpha=0.7 + ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color, + linewidth=2, markersize=5, alpha=alpha) + ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color, + mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha) + # ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_ylim(exp_attrs.y_lim) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + 
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log) + ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25) + plt.xticks(fontsize=25) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +def plot_sensitivity_curve(**kwargs): + for exp in kwargs['exps']: + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: + for sp in kwargs['sp_list']: + save_dir = os.path.join('pdf_plots', 'sensitivity_curves', auc_or_final) + for alg_names in kwargs['alg_groups'].values(): + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + for alg in alg_names: + if alg in ['LSTD', 'LSETD']: + continue + postfix = RERUN_POSTFIX if PLOT_RERUN else '' + best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp) + alphas = get_alphas(alg, exp) + best_performance, stderr = load_best_performance_over_alpha( + alg, exp, auc_or_final, best_params, exp_attrs, postfix) + plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs) + if PLOT_RERUN_AND_ORIG: + postfix = RERUN_POSTFIX + best_performance, stderr = load_best_performance_over_alpha( + alg, exp, auc_or_final, best_params, exp_attrs, postfix) + plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, True) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + if PLOT_RERUN_AND_ORIG: + prefix = '_rerun_and_original' + elif PLOT_RERUN: + prefix = RERUN_POSTFIX + else: + prefix = '' + fig.savefig(os.path.join(save_dir, + f"{prefix}_sensitivity_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"), + format='pdf', dpi=1000, bbox_inches='tight') + plt.show() + print(exp, alg_names, auc_or_final, sp) diff --git b/Plotting/plot_sensitivity_for_two_lambdas.py a/Plotting/plot_sensitivity_for_two_lambdas.py new file mode 100644 index 0000000..3b88bff --- /dev/null +++ a/Plotting/plot_sensitivity_for_two_lambdas.py @@ -0,0 +1,103 @@ +import os +import matplotlib.pyplot as plt +import numpy as np + +from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN, PLOT_RERUN_AND_ORIG, RERUN_POSTFIX, ALGS +from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas +from utils import create_name_for_save_load + + +def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''): + res_path = make_res_path(alg, exp) + load_file_name = os.path.join(res_path, create_name_for_save_load( + best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy') + performance_over_alpha = np.load(load_file_name) + performance_over_alpha = replace_large_nan_inf( + performance_over_alpha, large=exp_attrs.learning_starting_point, + replace_with=exp_attrs.over_limit_replacement) + stderr_load_file_name = os.path.join( + res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) + + f'_stderr_{auc_or_final}_over_alpha{postfix}.npy') + std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name) + std_err_of_best_perf_over_alpha = replace_large_nan_inf( + std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0) + return performance_over_alpha, std_err_of_best_perf_over_alpha + + +# noinspection DuplicatedCode +def plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, second_time=False): + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + lbl = f'{alg}' + ax.set_xscale('log', basex=2) + color = 'blue' if sp else 'red' + 
if sp not in [0.0, 1.0]: + alpha = 0.3 + color = 'grey' + ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color, + linewidth=2, markersize=5, alpha=alpha) + ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color, + mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha) + # ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_ylim(exp_attrs.y_lim) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log) + # ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25) + # plt.xticks(fontsize=25) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +def plot_min(ax, min_performance): + print(min_performance) + ax.plot([pow(2, -3), pow(2, -2)], [min_performance, min_performance], linewidth=0.2, alpha=0.2) + # ax.axhline(y=min_performance, xmin=pow(2, -3), xmax=pow(2, -2)) + + +def plot_sensitivity_for_lambdas(**kwargs): + for exp in kwargs['exps']: + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: + save_dir = os.path.join('pdf_plots', 'sensitivity_curves_for_lambdas', exp, auc_or_final) + for alg in kwargs['algs']: + min_performance = 1_000 + fig, ax = plt.subplots(figsize=kwargs['fig_size']) + for sp in kwargs['sp_list']: + if alg in ['LSTD', 'LSETD']: + continue + postfix = RERUN_POSTFIX if PLOT_RERUN else '' + best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp) + alphas = get_alphas(alg, exp) + best_performance, stderr = load_best_performance_over_alpha( + alg, exp, auc_or_final, best_params, exp_attrs, postfix) + plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs) + if PLOT_RERUN_AND_ORIG: + postfix = RERUN_POSTFIX + best_performance, stderr = load_best_performance_over_alpha( + alg, exp, auc_or_final, best_params, exp_attrs, postfix) + plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, True) + if min(best_performance) < min_performance: + min_performance = min(best_performance) + if kwargs.get('plot_min_performance', False): + plot_min(ax, min_performance) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + if PLOT_RERUN_AND_ORIG: + prefix = '_rerun_and_original' + elif PLOT_RERUN: + prefix = RERUN_POSTFIX + else: + prefix = '' + fig.savefig(os.path.join(save_dir, + f"{prefix}_sensitivity_curve_{alg}{exp}.pdf"), + format='pdf', dpi=1000, bbox_inches='tight') + plt.show() + print(exp, alg, auc_or_final, sp) diff --git b/Plotting/plot_specific_learning_curves.py a/Plotting/plot_specific_learning_curves.py new file mode 100644 index 0000000..f3f1bc0 --- /dev/null +++ a/Plotting/plot_specific_learning_curves.py @@ -0,0 +1,121 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pylab +from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \ + PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX, ALGS, ALL_ALGS +from Plotting.plot_utils import load_best_rerun_params_dict, make_params +from utils import create_name_for_save_load + + +def load_data(alg, exp, best_params, postfix=''): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + generic_name = create_name_for_save_load(best_params) + load_file_name = os.path.join(res_path, 
f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy") + mean_lc = np.load(load_file_name) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy") + stderr_lc = np.load(load_file_name) + return mean_lc, stderr_lc + + +def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, flag=False): + alpha = 1.0 + if PLOT_RERUN_AND_ORIG: + alpha = 1.0 if second_time else 0.5 + lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha'])) + color = ALG_COLORS[alg] + if alg == 'TDRC': + alpha = 0.6 + if flag: + color = 'green' + ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha) + ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2, + color=color, alpha=0.1*alpha) + # ax.legend() + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_xlim(exp_attrs.x_lim) + ax.set_ylim(exp_attrs.y_lim) + ax.xaxis.set_ticks(exp_attrs.x_axis_ticks) + ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +def get_ls_rmsve(alg, exp, sp): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + params = {'alpha': 0.01, 'lmbda': sp} + if alg == 'LSETD': + params['beta'] = 0.9 + generic_name = create_name_for_save_load(params) + load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy") + return np.load(load_file_name) + + +def plot_ls_solution(ax, ls_rmsve, alg, sp): + lbl = f"{alg} $\\lambda=$ {sp}" + x = np.arange(ls_rmsve.shape[0]) + y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0]) + ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--') + # ax.legend() + + +def load_sample_params_dict(alg, exp, sp): + fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp) + if alg in ['TD', 'ETD', 'TB', 'Vtrace']: + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp} + if alg == 'ABTD': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp} + if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']: + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'eta': tp_list[np.random.randint(0, len(tp_list))]} + if alg == 'ETDLB': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'beta': tp_list[np.random.randint(0, len(tp_list))]} + if alg == 'TDRC': + return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp, + 'eta': tp_list[np.random.randint(0, len(tp_list))], + 'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]} + + +def plot_specific_learning_curves(**kwargs): + specific_params = kwargs['specific_params'] + exp = kwargs['exp'] + prefix = '' + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in AUC_AND_FINAL: + sp = kwargs['sp'] + save_dir = os.path.join('pdf_plots', 'specific_learning_curves', auc_or_final) + fig, ax = plt.subplots(figsize=(10, 4)) + for alg in kwargs['algs']: + flag = False + if alg in ['LSTD', 'LSETD']: + ls_rmsve = get_ls_rmsve(alg, exp, sp) + plot_ls_solution(ax, ls_rmsve, alg, sp) + continue + print(alg, exp, sp) + if alg == 'PGTD22': + flag = True + alg = 'PGTD2' + current_params = specific_params[alg] + current_params['eta'] = 1.0 + 
current_params['alpha'] = 0.03125 + else: + current_params = specific_params[alg] + print(current_params) + mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix) + plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, False, flag) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + pylab.gca().set_rasterized(True) + fig.savefig(os.path.join(save_dir, + f"{prefix}_learning_curve_{'_'.join(ALGS)}{exp}Lmbda{sp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + plt.show() + plt.close(fig) diff --git b/Plotting/plot_utils.py a/Plotting/plot_utils.py new file mode 100644 index 0000000..897ade8 --- /dev/null +++ a/Plotting/plot_utils.py @@ -0,0 +1,176 @@ +import argparse +import json +import numpy as np +import os +from Job.JobBuilder import default_params +from Registry.AlgRegistry import alg_dict +from utils import create_name_for_save_load + + +def make_res_path(alg, exp): + return os.path.join(os.getcwd(), 'Results', exp, alg) + + +def make_exp_path(alg, exp): + return os.path.join(os.getcwd(), 'Experiments', exp, alg) + + +def load_best_rerun_params_dict(alg, exp, auc_or_final, sp): + res_path = make_res_path(alg, exp) + with open(os.path.join(res_path, f"{auc_or_final}_{sp}.json")) as f: + return json.load(f)['meta_parameters'] + + +def get_alphas(alg, exp): + exp_path = make_exp_path(alg, exp) + exp_path = os.path.join(exp_path, f"{alg}.json") + with open(exp_path) as f: + jsn_content = json.load(f) + return jsn_content['meta_parameters']['alpha'] + + +def load_best_rerun_params(alg, exp, auc_or_final, sp): + best_res_dict = load_best_rerun_params_dict(alg, exp, auc_or_final, sp) + best_fp = best_res_dict.get('alpha', 0) + best_tp = best_res_dict.get('eta', best_res_dict.get('beta', 0)) + best_fop = best_res_dict.get('tdrc_beta', 0) + return best_fp, best_tp, best_fop + + +def make_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--exp_name', '-n', type=str, default='1HVFourRoom') + # 1HVFourRoom or FirstFourRoom or FirstChain + return parser.parse_args() + + +def rename_best_old_result(res_path, params_dict, file_name): + name_to_save = create_name_for_save_load(param_dict=params_dict) + path_and_name = os.path.join(res_path, name_to_save) + file_name = path_and_name + file_name + os.rename(file_name + '.npy', file_name + '_old.npy') + + +def load_best_perf_json(alg, exp, sp, auc_or_final): + res_path = make_res_path(alg, exp) + res_path = os.path.join(res_path, f"{auc_or_final}_{sp}.json") + with open(res_path, 'r') as f: + return json.load(f) + + +def load_exp_json_file(alg, exp): + res_path = make_res_path(alg, exp) + exp_path = make_exp_path(alg, exp) + exp_path = os.path.join(exp_path, f'{alg}.json') + with open(exp_path) as f: + return json.load(f), res_path + + +def make_params(alg_name, exp_name): + params = dict() + alg_param_names = alg_dict[alg_name].related_parameters() + json_content, res_path = load_exp_json_file(alg_name, exp_name) + json_exp_params = json_content.get('meta_parameters') + for param in alg_param_names: + params[param] = json_exp_params.get(param, default_params['meta_parameters'][param]) + if not isinstance(params[param], list): + params[param] = list([params[param]]) + fp_list = params.get('alpha', params['alpha']) + tp_list = [0.0] + fop_list = [0.0] + if 'lmbda' in params: + sp_list = params['lmbda'] + else: + sp_list = params['zeta'] + if 'eta' in params: + tp_list = params['eta'] + elif 'beta' in params: + tp_list = params['beta'] + if 'tdrc_beta' in params: + fop_list = 
params['tdrc_beta'] + if alg_name == 'TDRC': + tp_list, fop_list = [1.0], [1.0] + return fp_list, sp_list, tp_list, fop_list, res_path + + +def make_current_params(alg_name, sp, tp, fop, fp=0): + current_params = {'alpha': fp} + alg_param_names = alg_dict[alg_name].related_parameters() + if 'lmbda' in alg_param_names: + current_params['lmbda'] = sp + else: + current_params['zeta'] = sp + if 'eta' in alg_param_names: + current_params['eta'] = tp + elif 'beta' in alg_param_names: + current_params['beta'] = tp + if 'tdrc_beta' in alg_param_names: + current_params['tdrc_beta'] = fop + return current_params + + +def get_alg_names(exp_name): + path = os.path.join(os.getcwd(), 'Experiments', exp_name) + alg_names = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))] + return alg_names + + +def load_sample_json_for_exp(exp): + alg = get_alg_names(exp)[0] + exp_path = make_exp_path(alg, exp) + exp_path = os.path.join(exp_path, f'{alg}.json') + if not os.path.exists(exp_path): + print('No algorithms exist in the experiment directory...') + raise FileExistsError + with open(exp_path) as f: + json_exp_params = json.load(f) + return json_exp_params + + +def load_and_replace_large_nan_inf(load_file_name, large, replace_with): + current_perf = np.load(load_file_name) + return replace_large_nan_inf(current_perf, large=large, replace_with=replace_with) + + +class FirstChainAttr: + def __init__(self, exp_name): + json_exp_params = load_sample_json_for_exp(exp_name) + self.size_of_labels = 25 + self.y_lim = [0.0, 0.8] + self.x_lim = [0.0, json_exp_params['number_of_steps']] + self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7] + self.x_axis_ticks = [0.0, 5000, 10000, 15000, 20000] + self.x_tick_labels = [0, '5', '10', '15', '20'] + self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)] + self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1] + self.over_limit_replacement = 2.0 + self.over_limit_waterfall = 0.79 + self.learning_starting_point = 0.68910 + self.ok_error = 0.4 + + +class FirstFourRoomAttr: + def __init__(self, exp_name): + json_exp_params = load_sample_json_for_exp(exp_name) + self.size_of_labels = 25 + self.y_lim = [0.0, 0.8] + self.x_lim = [0.0, json_exp_params['number_of_steps']] + self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7] + self.x_axis_ticks = [0.0, 10000, 20000, 30000, 40000, 50000] + self.x_tick_labels = [0, '10', '20', '30', '40', '50'] + self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)] + self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1] + self.over_limit_replacement = 2.0 + self.over_limit_waterfall = 0.79 + self.learning_starting_point = 0.72672 + self.ok_error = 0.4 + + +class HVFirstFourRoomAttr(FirstFourRoomAttr): + def __init__(self, exp_name): + super(HVFirstFourRoomAttr, self).__init__(exp_name) + + +def replace_large_nan_inf(arr, large=1.0, replace_with=2.0): + arr[np.isnan(arr)], arr[np.isinf(arr)], arr[arr > large] = replace_with, replace_with, replace_with + return arr diff --git b/Plotting/plot_waterfall.py a/Plotting/plot_waterfall.py new file mode 100644 index 0000000..9553c9f --- /dev/null +++ a/Plotting/plot_waterfall.py @@ -0,0 +1,93 @@ +import os +import matplotlib.pyplot as plt +import numpy as np + +from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \ + RERUN_POSTFIX +from Plotting.plot_utils import make_current_params, replace_large_nan_inf, make_params +from utils import create_name_for_save_load + 
+np.random.seed(0) +def load_all_performances(alg, exp, auc_or_final, sp, exp_attrs): + fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp) + all_performance = np.zeros((len(fp_list), len(tp_list), len(fop_list))) + for i, fop in enumerate(fop_list): + for j, tp in enumerate(tp_list): + current_params = make_current_params(alg, sp, tp, fop) + load_file_name = os.path.join(res_path, create_name_for_save_load( + current_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha.npy') + + if PLOT_RERUN and auc_or_final == 'auc': + load_file_name_rerun = load_file_name.replace('.npy', f"{RERUN_POSTFIX}.npy") + if os.path.isfile(load_file_name_rerun): + load_file_name = load_file_name_rerun + + performance = np.load(load_file_name) + performance = replace_large_nan_inf(performance, large=exp_attrs.learning_starting_point, + replace_with=exp_attrs.over_limit_waterfall) + all_performance[:, j, i] = performance + return all_performance + + +def plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs): + global ticker, x_axis_names, x_axis_ticks + performance_to_plot = np.array(all_performance.flatten()) + percentage_overflowed = round((performance_to_plot > exp_attrs.learning_starting_point).sum() / + performance_to_plot.size, 2) + ok_percentage = round((performance_to_plot < exp_attrs.ok_error).sum() / + performance_to_plot.size, 2) + print(alg, 'percentage_overflowed', percentage_overflowed) + # print(alg, 'OK_percentage', ok_percentage) + color = ALG_COLORS[alg] + ax.scatter([(ticker + 1)] * performance_to_plot.shape[0] + np.random.uniform( + -0.25, 0.25, performance_to_plot.shape[0]), performance_to_plot, marker='o', + facecolors='none', color=color) + x_axis_ticks.append(ticker + 1) + ticker = (ticker + 1) % len(alg_names) + ax.tick_params( + axis='x', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + bottom=False, # ticks along the bottom edge are off + top=False, # ticks along the top edge are off + labelbottom=True) # labels along the bottom edge are off + x_axis_names.append(f'{alg}_{percentage_overflowed}') + ax.xaxis.set_ticks(x_axis_ticks) + ax.set_xticklabels(x_axis_names) + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels) + ax.set_ylim(exp_attrs.y_lim) + ax.yaxis.set_ticks(exp_attrs.y_axis_ticks) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +ticker, x_axis_names, x_axis_ticks = 0.0, [''], [0] + + +def plot_waterfall_scatter(**kwargs): + for exp in kwargs['exps']: + exp_attrs = EXP_ATTRS[exp](exp) + for auc_or_final in kwargs['auc_or_final']: + for sp in kwargs['sp_list']: + save_dir = os.path.join('pdf_plots', 'waterfalls', auc_or_final) + for alg_names in kwargs['alg_groups'].values(): + global ticker, x_axis_names, x_axis_ticks + ticker, x_axis_names, x_axis_ticks = -0.5, [''], [0] + fig, ax = plt.subplots(kwargs['fig_size']) + for alg in alg_names: + if alg in ['LSTD', 'LSETD']: + continue + all_performance = load_all_performances(alg, exp, auc_or_final, sp, exp_attrs) + plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs) + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + prefix = RERUN_POSTFIX if PLOT_RERUN else '' + fig.savefig(os.path.join(save_dir, + f"{prefix}_waterfall_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"), + format='pdf', dpi=1000, 
bbox_inches='tight') + plt.show() + print(exp, alg_names, auc_or_final, sp) diff --git b/Plotting/process_state_value_function.py a/Plotting/process_state_value_function.py new file mode 100644 index 0000000..00f8fe7 --- /dev/null +++ a/Plotting/process_state_value_function.py @@ -0,0 +1,98 @@ +import os +import numpy as np +import matplotlib.pyplot as plt + + +class ValueFunctionProcessor: + def __init__(self, exp, alg): + result_dir = os.path.join(os.getcwd(), 'Results', exp, alg, 'Sample_value_function') + self.all_value_functions = dict() + self.all_value_functions_of_last_step = dict() + for value_function_name in os.listdir(result_dir): + value_function = np.load(os.path.join(result_dir, value_function_name)) + step, run_num = (int(i) for i in value_function_name.replace('.npy', '').split('_')) + self.all_value_functions[(step, run_num)] = value_function + if (step == 19999 and exp == 'FirstChain') or (step == 49999 and exp == 'FirstFourRoom') or ( + step == 49999 and exp == '1HVFourRoom'): + self.all_value_functions_of_last_step[run_num] = value_function + + def get_value_function_by_step_and_run(self, step, run): + return self.all_value_functions[(step, run)] + + def get_value_function_for_last_step(self, run): + return self.all_value_functions_of_last_step[run] + + +# STEPS = [199, 999, 1999, 3999, 9999, 19999] +STEPS = [199, 1999, 19999] +# STEPS = [19999] +RUNS = [0, 10, 15, 20, 30, 45] +# RUNS = list(range(50)) +EXPS = ['FirstChain'] # FirstChain or FirstFourRoom or 1HVFourRoom +ALGS = ['TD'] +TASK = 'EightStateCollision' + + +def plot_value_function(ax, value_function, step=0, run=0, is_last_step=False): + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.set_ylim(0, 1.0) + label = f"{step}_{run}" + line_style = '-' + line_width = 4 + alpha = 1.0 + color = 'blue' + if not step: + line_style = '--' + if not step and is_last_step: + line_style = '-' + if is_last_step: + line_width = 2 + alpha = 0.2 + color = 'red' + ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha, color=color) + else: + ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha) + ax.set_yticklabels([]) + ax.set_xticklabels([]) + ax.spines['left'].set_linewidth(2) + ax.spines['bottom'].set_linewidth(2) + + +def plot_value_functions(): + for exp in EXPS: + save_dir = os.path.join('pdf_plots', 'value_functions') + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy')) + for alg in ALGS: + value_processor = ValueFunctionProcessor(exp, alg) + for run in RUNS: + fig, ax = plt.subplots(figsize=(8, 3)) + for step in STEPS: + value_function = value_processor.get_value_function_by_step_and_run(step, run) + plot_value_function(ax, value_function, step, run) + plot_value_function(ax, true_value_function) + fig.savefig(os.path.join(save_dir, f"{run}_value_function_{alg}_{exp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + plt.show() + + +def plot_all_final_value_functions(): + for exp in EXPS: + save_dir = os.path.join('pdf_plots', 'value_functions', 'asymptotic_value_functions') + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy')) + for alg in ALGS: + value_processor = ValueFunctionProcessor(exp, alg) + fig, ax = 
plt.subplots(figsize=(8, 3)) + for run in range(50): + value_function = value_processor.get_value_function_for_last_step(run) + plot_value_function(ax, value_function, is_last_step=True) + plot_value_function(ax, true_value_function) + fig.savefig(os.path.join(save_dir, f"value_function_{alg}_{exp}.pdf"), + format='pdf', dpi=200, bbox_inches='tight') + plt.show() diff --git b/README.md a/README.md new file mode 100644 index 0000000..6b673b1 --- /dev/null +++ a/README.md @@ -0,0 +1,563 @@ + +

+ +

+
+
+ :steam_locomotive::train::train::train::train::train: +
+

An Empirical Comparison of Off-policy Prediction Learning Algorithms on the Collision Task

+ +This repository contains the code for the paper "An Empirical Comparison of Off-policy Prediction Learning Algorithms on the Collision Task". +
+ + +

+ + +

+
+## Table of Contents
+- **[Specification of Dependencies](#specifications)**
+- **[Algorithms](#algorithms)**
+  - **TD**: [Off-policy TD](#td)
+  - **Gradient-TD family** : [GTD](#gtd) , [GTD2](#gtd2), [HTD](#htd), [PGTD2](#pgtd2), [TDRC](#tdrc)
+  - **Emphatic-TD family** : [Emphatic TD](#etd), [Emphatic TDβ](#etdb)
+  - **Variable-λ family** : [TB](#tb), [Vtrace](#vtrace), [ABTD](#abtd)
+  - **[Algorithm Glossary](#glossary)**
+- **[Environments](#environment)** : [Chain](#chain), [Four Room Grid World](#four_room_grid_world)
+- **[How to run the code](#how-to-run)**: [Learning.py](#learning.py), [Job Builder](#job_builder)
+- **[Plotting the results](#Plot-results)**
+
+
+## Specification of Dependencies
+This code requires Python 3.5 or above. All packages required for running the code are listed in the `requirements.txt`
+file. To install these dependencies, run the following command if your pip is set to `python3.x`:
+```text
+pip install -r requirements.txt
+```
+otherwise, run:
+```text
+pip3 install -r requirements.txt
+```
+
+
+
+
+
+## Algorithms
+Algorithms are used to find a weight vector, [**w**](#var_w), such that the dot product of [**w**](#var_w) and the feature vector
+approximates the value function.
+
+
+
+### Off-policy TD
+
+**Paper** [Off-Policy Temporal-Difference Learning with Function Approximation](
+https://www.cs.mcgill.ca/~dprecup/publications/PSD-01.pdf)
+**Authors** Doina Precup, Richard S. Sutton, Sanjoy Dasgupta
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +w += alpha * delta * z +``` + +### Gradient-TD algorithms + +#### GTD/TDC + +**Paper** [Off-Policy Temporal-Difference Learning with Function Approximation]( +http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)
+**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri, +Eric Wiewiora
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p) +v += alpha_v * (delta * z - np.dot(x, v) * x) +``` + + +#### GTD2 + +**Paper** [Off-Policy Temporal-Difference Learning with Function Approximation]( +http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)
+**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri, +Eric Wiewiora
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +w += alpha * (np.dot(x, v) * x - gamma * (1 - lmbda) * np.dot(z, v) * x_p) +v += alpha_v * (delta * z - np.dot(x, v) * x) +``` + + +#### HTD + +**Paper** [Investigating Practical Linear Temporal Difference Learning]( +https://arxiv.org/pdf/1602.08771.pdf)
+**Authors** Adam White, Martha White
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +z_b = gamma * lmbda * z_b + x +w += alpha * ((delta * z) + (x - gamma * x_p) * np.dot((z - z_b), v)) +v += alpha_v * ((delta * z) - (x - gamma * x_p) * np.dot(v, z_b)) +``` + + +#### Proximal GTD2 + +**Paper** [Proximal Gradient Temporal Difference Learning: Stable Reinforcement Learning with Polynomial Sample Complexity]( +https://arxiv.org/pdf/2006.03976.pdf)
+**Authors** Bo Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji Liu, Sridhar Mahadevan, Marek Petrik
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +v_mid = v + alpha_v * (delta * z - np.dot(x, v) * x) +w_mid = w + alpha * (np.dot(x, v) * x - (1 - lmbda) * gamma * np.dot(z, v) * x_p) +delta_mid = r + gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x) +w += alpha * (np.dot(x, v_mid) * x - gamma * (1 - lmbda) * np.dot(z, v_mid) * x_p) +v += alpha_v * (delta_mid * z - np.dot(x, v_mid) * x) +``` + + +#### TDRC + +**Paper** [Gradient Temporal-Difference Learning with Regularized Corrections]( +http://proceedings.mlr.press/v119/ghiassian20a/ghiassian20a.pdf)
+**Authors** Sina Ghiassian, Andrew Patterson, Shivam Garg, Dhawal Gupta, Adam White, Martha White
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = rho * (gamma * lmbda * z + x) +w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p) +v += alpha_v * (delta * z - np.dot(x, v) * x) - alpha_v * tdrc_beta * v +``` + +### Emphatic-TD algorithms + + +#### Emphatic TD + +**Paper** [An Emphatic Approach to the Problem of Off-policy Temporal-Difference Learning]( +https://jmlr.org/papers/volume17/14-488/14-488.pdf)
+**Authors** Richard S. Sutton, A. Rupam Mahmood, Martha White
+
+```python
+delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
+F = gamma * old_rho * F + 1
+m = lmbda * 1 + (1 - lmbda) * F
+z = rho * (x * m + gamma * lmbda * z)
+w += alpha * delta * z
+```
+
+
+#### Emphatic TDβ
+
+**Paper** [Generalized Emphatic Temporal Difference Learning: Bias-Variance Analysis](
+https://ojs.aaai.org/index.php/AAAI/article/view/10227/10086)
+**Authors** Assaf Hallak, Aviv Tamar, Remi Munos, Shie Mannor
+
+```python
+delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
+F = beta * old_rho * F + 1
+m = lmbda * 1 + (1 - lmbda) * F
+z = rho * (x * m + gamma * lmbda * z)
+w += alpha * delta * z
+```
+
+
+### Variable-λ algorithms
+
+
+#### Tree backup / Tree backup for prediction
+
+**Paper** [Eligibility Traces for Off-Policy Policy Evaluation](
+https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1079&=&context=cs_faculty_pubs&=&sei-redir=1&referer=https%253A%252F%252Fscholar.google.com%252Fscholar%253Fhl%253Den%2526as_sdt%253D0%25252C5%2526q%253Dtree%252Bbackup%252Balgorithm%252Bdoina%252Bprecup%2526btnG%253D#search=%22tree%20backup%20algorithm%20doina%20precup%22)
+**Authors** Doina Precup, Richard S. Sutton, Satinder Singh
+ +The algorithm pseudo-code described below is the prediction variant of the original Tree backup algorithm proposed by +Precup, Sutton, and Singh (2000). The prediction variant of the algorithm used here is first derived in the current paper. +```python +delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x)) +z = gamma * lmbda * old_pi * z + x +w = w + alpha * delta * z +``` + + +#### Vtrace (simplified) + +**Paper** [IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures] +(http://proceedings.mlr.press/v80/espeholt18a/espeholt18a.pdf)
+**Authors** Lasse Espeholt, Hubert Soyer, Remi Munos, Karen Simonyan, Volodymyr Mnih, Tom Ward, Yotam Doron, Vlad Firoiu, Tim Harley, Iain Dunning, Shane Legg, Koray Kavukcuoglu
+ +```python +delta = r + gamma * np.dot(w, x_p) - np.dot(w, x) +z = min(1, rho) * (gamma * lmbda * z + x) +w += alpha * delta * z +``` + + +#### ABQ/ABTD + +**Paper** [Multi-step Off-policy Learning Without Importance Sampling Ratios]( +https://arxiv.org/pdf/1702.03006)
+**Authors** A. Rupam Mahmood, Huizhen Yu, Richard S. Sutton
+ +The algorithm pseudo-code described below is the prediction variant of the original Tree backup algorithm proposed by +Mahmood, Sutton, and Yu (2017). The prediction variant of the algorithm used here is first derived in the current paper. +This algorithm first needs to compute the following: +```python +xi_zero = 1 +xi_max = 2 +xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero) +``` +`xi_zero` and `xi_max` are specifically computed here for the Collision problem. +To see how these are computed for the task see the original paper referenced above. + +```python +nu = min(xi, 1.0 / max(pi, mu)) +delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x)) +nu = min(xi, 1.0 / max(pi, mu)) +z = x + gamma * old_nu * old_pi * z +w += alpha * delta * z +``` + + +### Algorithm Glossary +Here, we briefly explain all the symbols and variables names that we use in our implementation. + +#### meta-parameters +- Common parameters of all algorithms: + - alpha (α): is the step size that defines how much the weight vector [**w**](#var_w) is updated at each time step. + - lambda (λ): is the bootstrapping parameter. +- Common parameters of Gradient-TD algorithms: + - alpha_v (αv): is the second step size that defines how much the second weight vector [**v**](#var_v) is + updated at each time step. +- beta (β): is the parameter used by the [**ETDβ**](#etdb) algorithm that defines how much the product of importance sampling ratios +from the past affects the current update. +- tdrc_beta (tdrcβ): is the regularization parameter of the [**TDRC**](#tdrc) algorithms. This parameter is often set to 1. +- zeta (ζ): is only used in the [**ABTD**](#abtd) algorithm. It is similar to the bootstrapping parameter of other algorithms. + +#### Algorithms variables + +- **w**: is the main weight vector being learned. ```init: w=0```. + +- **v**: is the secondary weight vector learned by Gradient-TD algorithms. ```init: v=0```. + +- **z**: is the eligibility trace vector. ```init: z=0```. + +- **zb**: is the extra eligibility trace vector used by [**HTD**](#htd). ```init: z_b=0```. + +- delta (𝛿): is the td-error, which in the full bootstrapping case, is equal to the reward plus the value of the next + state minus the value of the current state. + +- s: is the current state (scalar). + +- **x**: is the feature vector of the current state. + +- s_p: is the next state (scalar). + +- **x_p**: is the feature vector of the next state. + +- r: is the reward. + +- rho (ρ): is the importance sampling ratio, which is equal to the probability of taking an action under the target policy + divided by the probability of taking the same action under the behavior policy. + +- old_rho (oldρ): is the importance sampling ratio at the previous time step. + +- pi (π): is the probability of taking an action under the target policy at the current time step. + +- old_pi (oldπ): is the probability of taking an action under the target policy in the previous time step. The variable + π itself is the probability of taking action under the target policy at the current time step. + +- F : is the follow-on trace used by [Emphatic-TD](#etd) algorithms. + +- m : is the emphasis used by [Emphatic-TD](#etd) algorithms. + +- nu (ν): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation. + +- xi (ψ): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation. 
+
+- mu (μ): is the probability of taking an action under the behavior policy at the current time step.
+
+- old_mu (oldμ): is the probability of taking an action under the behavior policy at the previous time step.
+- gamma (γ): is the discount factor parameter.
+
+
+## Environment
+At the heart of an environment is an MDP.
+The MDP defines the states, actions, rewards, transition probability matrix, and the discount factor.
+
+
+### Chain Environment and the Collision Task
+
+

+ +

+
+An MDP with eight states is at the heart of the task.
+The agent starts in one of the four leftmost states with equal probability.
+One action is available in the four leftmost states: forward. Two actions are available in the four rightmost states:
+forward and turn. By taking the forward action, the agent transitions one state to the right and by taking the turn
+action, it moves away from the wall and transitions to one of the four leftmost states equiprobably. Rewards are all
+zero except for taking forward in state 8 for which a +1 is emitted. The termination function (discount factor) returns
+0.9 for all transitions except for taking turn in any state or taking forward in state 8, for which the termination
+function returns zero.
+
+```python
+env = Chain()
+env.reset()  # returns to one of the four leftmost states with equal probability.
+for step in range(1, 1000):
+    action = np.random.randint(0, 2)  # forward=0, turn=1
+    sp, r, is_wall = env.step(action=action)
+    if is_wall:
+        env.reset()
+```
+
+We applied eleven algorithms to the Collision task: Off-policy TD(λ), GTD(λ), GTD2(λ), HTD(λ), Proximal GTD2(λ), TDRC(λ)
+, ETD(λ), ETD(λ,β), Tree Backup(λ), Vtrace(λ), ABTD(ζ). The target policy was π(forward|·) = 1.0. The behavior policy
+was b(forward|·) = 1.0 for the four leftmost states and b(forward|·) = 0.5, b(retreat|·) = 0.5 for the four rightmost
+states. Each algorithm was applied to the task with a range of parameters. We refer to an algorithm with a specific
+parameter setting as an instance of that algorithm. Each algorithm instance was applied to the Collision task for
+20,000 time steps, which we call a run. We repeated the 20,000 time steps for fifty runs. All instances of all
+algorithms experienced the same fifty trajectories.
+
+Linear function approximation was used to approximate the true value function. Each state was represented by a six
+dimensional binary feature vector. The feature representation of each state had exactly three zeros and three ones.
+The locations of the zeros and ones were selected randomly. This was repeated once at the beginning of each run,
+meaning that the representation for each run is most probably different from other runs. At the beginning of each run
+we set **w**0 = **0** and thus the error would be the same for all algorithms at the beginning of the runs.
+
+#### Feature representation
+The feature representation for the Collision task is an array of size `8, 6, 50`, where 8 corresponds to the number of
+states, 6 corresponds to the number of features for each state, and 50 corresponds to the number of runs.
+The feature representations used for the set of results presented here and in the paper are saved in:
+```
+Resources/EightStateCollision/feature_rep.npy
+```
+Note that the feature representation for each run is different in the Collision task.
+For example, the feature representation for the first run is:
+```
+array([[0., 0., 1., 0., 1., 1.],
+       [1., 1., 1., 0., 0., 0.],
+       [0., 1., 1., 0., 0., 1.],
+       [1., 0., 1., 1., 0., 0.],
+       [1., 1., 0., 0., 1., 0.],
+       [0., 1., 1., 1., 0., 0.],
+       [1., 1., 0., 0., 0., 1.],
+       [1., 0., 1., 0., 0., 1.]])
+```
+
+#### State distribution induced by the behavior policy
+To compute an approximation of the mean squared value error at each time step, weighting induced by the behavior policy
+was approximated by following the behavior policy for 20,000,000 time steps and computing the fraction of time spent in
+each state.
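+The snippet below is a minimal sketch of that estimation (run for far fewer steps than the 20,000,000 used to produce the saved file). It assumes the `Chain` interface used in the example above; the helper `estimate_d_mu` is not part of the repository, and the exact return signature of `step()` may differ slightly in the released code, so only the first three returned values are used.
+```python
+import numpy as np
+from Environments.Chain import Chain
+
+
+def estimate_d_mu(num_steps=100_000, num_states=8, seed=0):
+    # Follow the behavior policy (forward in the four leftmost states,
+    # forward/turn with equal probability in the four rightmost states)
+    # and count how often each state is visited.
+    np.random.seed(seed)
+    env = Chain()
+    s = env.reset()
+    visits = np.zeros(num_states)
+    for _ in range(num_steps):
+        visits[s] += 1
+        if s < num_states // 2:
+            action = env.RIGHT_ACTION          # forward
+        else:
+            action = np.random.choice([env.RIGHT_ACTION, env.RETREAT_ACTION])  # forward or turn
+        sp, r, is_wall = env.step(action=action)[:3]
+        s = env.reset() if is_wall else sp
+    return visits / num_steps  # fraction of time spent in each state
+```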
The resulting distribution is saved in:
+```
+Resources/EightStateCollision/d_mu.npy
+```
+`d_mu.npy` is a one-dimensional numpy array of size `8`:
+```
+array([0.05715078, 0.1142799 , 0.17142456, 0.22856842, 0.22856842, 0.11428067, 0.05715311, 0.02857415])
+```
+
+#### True state values
+To compute an approximation of the mean squared value error at each time step, we need the true state values.
+Luckily, for the Collision task, these values are easy to compute.
+We computed these true values by following the target policy from each state to the wall once. (Under the target policy
+the only reward is the +1 on the final forward transition from state 8 and the discount is 0.9, so the value of state i,
+counting from 1, is 0.9^(8-i).)
+The resulting values are saved in:
+```
+Resources/EightStateCollision/state_values.npy
+```
+`state_values.npy` is a one-dimensional numpy array of size `8`:
+```
+array([0.4782969, 0.531441, 0.59049, 0.6561, 0.729, 0.81, 0.9, 1])
+```
+
+
+
+
+## How to Run the Code
+The code can be run in two different ways.
+One way is through `Learning.py`, which can be used to run small experiments on a local computer.
+The other way is through the files inside the Job directory.
+We explain each of these approaches below by means of an example.
+
+### Running on Your Local Machine
+Let's take the following example: applying Off-policy TD(λ) to the Collision task.
+There are multiple ways of doing this.
+The first way is to open a terminal, go to the root directory of the code, and run `Learning.py` with the proper parameters:
+```
+python3 Learning.py --algorithm TD --task EightStateCollision --num_of_runs 50 --num_steps 20000 --environment Chain
+--save_value_function True --alpha 0.01 --lmbda 0.9
+```
+In case any of the parameters are not specified, a default value will be used.
+The default values are set in the `Job` directory, inside the `JobBuilder.py` file.
+This means the code can alternatively be run by setting all the necessary values that an algorithm needs at the top of the `JobBuilder.py` file.
+Note that not all parameters specified in the `default_params` dict are required for all algorithms. For example, the `tdrc_beta` parameter is only
+required to be set for the TDRC(λ) algorithm.
+Once the variables inside the `default_params` dictionary are set, the code can be run:
+```
+python3 Learning.py
+```
+Or one can choose to specify some parameters in the `default_params` dictionary and specify the rest as command line arguments
+like the following:
+```
+python3 Learning.py --algorithm TD --task EightStateCollision --alpha 0.01
+```
+
+### Running on Servers with Slurm Workload Managers
+When parameter sweeps are necessary, the code can be run on supercomputers.
+The current code supports running on servers that use Slurm workload managers, such as Compute Canada.
+For example, to apply the TD algorithm to the Collision (EightStateCollision) task with various parameters,
+first you need to create a json file that specifies all the parameters that you would like to run, for example:
+```json
+{
+  "agent": "TD",
+  "environment": "Chain",
+  "task": "EightStateCollision",
+  "number_of_runs": 50,
+  "number_of_steps": 20000,
+  "sub_sample": 1,
+  "meta_parameters": {
+    "alpha": [
+      0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
+      0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
+    ],
+    "lmbda": [
+      0.1, 0.2, 0.3
+    ]
+  }
+}
+```
+and then run `main.py` using Python:
+```
+python3 main.py -f <path_to_the_json_file_or_dir> -s <kind_of_submission>
+```
+where `kind_of_submission` refers to one of the two ways you can submit your code:
+1) You can request an individual cpu for each of the algorithm instances, where an algorithm instance refers to an
+algorithm with specific parameters. To request an individual cpu, run the following command:
+```
+python3 main.py -f <path_to_the_json_file_or_dir> -s cpu
+```
+When running each algorithm instance on a single cpu, you need to specify the following parameters inside
+`Job/SubmitJobsTemplatesCedar.SL`:
+```shell
+#SBATCH --account=xxx
+#SBATCH --time=00:15:58
+#SBATCH --mem=3G
+```
+where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
+`#SBATCH --time=00:15:58` requires the time you want to request for each individual cpu,
+and `#SBATCH --mem=xG` requires the amount of memory in place of x.
+
+2) You can request a node, which we assume includes 40 cpus. If you request a node, the jobs you submit will run in
+parallel 40 at a time, and once one job is finished, the next one in line will start running.
+This process continues until either all jobs are finished running, or you run out of the time you requested for that node.
+```
+python3 main.py -f <path_to_the_json_file_or_dir> -s node
+```
+When running the jobs on nodes, you need to specify the following parameters inside `Job/SubmitJobsTemplates.SL`:
+```shell
+#SBATCH --account=xxx
+#SBATCH --time=11:58:59
+#SBATCH --nodes=x
+#SBATCH --ntasks-per-node=40
+```
+where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
+`#SBATCH --time=11:58:59` requires the time you want to request for each individual node, each of which includes 40 cpus in this case,
+and `#SBATCH --nodes=x` requires the number of nodes you would like to request in place of x.
+If you request more than one node, your jobs will be spread across nodes, 40 on each node, and once each job finishes,
+the next job in the queue will start running.
+`#SBATCH --ntasks-per-node=xx` is the number of jobs you would like to run concurrently on a single node. In this case,
+for example, we set it to 40.
+
+If `path_to_the_json_file_or_dir` is a directory, then the code will walk into all the subdirectories and submit jobs for
+all the parameters in the json files that it finds inside those directories sequentially.
+If `path_to_the_json_file_or_dir` is a file, then the code will submit jobs for all the parameters that it finds inside that
+single json file.
+Note that you can create a new directory for each experiment that you would like to run, and create directories for each
+of the algorithms you would like to run in that experiment.
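+Based on the path conventions used elsewhere in the code (`Experiments/<experiment_name>/<algorithm_name>/<algorithm_name>.json`), such a layout could look like the sketch below; the particular algorithm directories shown are only an illustration:
+```
+Experiments/
+    FirstChain/
+        TD/
+            TD.json
+        GTD/
+            GTD.json
+        ETD/
+            ETD.json
+```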
+For example, we created a directory called `FirstChain` inside the `Experiments` directory and created one directory
+per algorithm inside the `FirstChain` directory, with that algorithm's json file inside it.
+It is worth noting that any parameter that is not specified in the json file will be read from the `default_params`
+dictionary in `Job/JobBuilder.py`.
+
+
+## Plotting the results
+The following table shows all the parameters that we tested in the experiments:
+

+ +

+
+We now explain how each figure in the paper can be reproduced.
+All the figures of the paper can be reproduced using the `plot_data.py` file, once you have run the `Learning.py` script with all the needed parameters.
+If you do not have the results available, the `plot_data.py` script will return an error.
+
+1) **Processing the data**: This step prepares the data so that it can be plotted over step sizes and also so
+   that it can be plotted as learning curves averaged over runs.
+   The `process_data` script also re-runs the algorithms with their best parameters to eliminate possible maximization
+   bias, as explained in the paper.
+   This is a time-consuming step. If you prefer to skip it, simply set:
+   ```python
+   PLOT_RERUN = False
+   ```
+   in `Plotting/plot_params.py` and the code will ignore the re-running steps.
+   If you would like to eliminate the maximization bias, set:
+   ```python
+   PLOT_RERUN = True
+   ```
+   Finally, go to `plot_data.py`, set `func_to_run = 'process_data'`, and run the `plot_data.py` script.
+
+2) **Plotting the learned value functions**:
+   Go to `plot_data`, and set `func_to_run = 'plot_value_functions'` to plot
+   the learned value functions for some of the runs, and set `func_to_run = 'plot_all_final_value_functions'` to plot the
+   value function learned by the last time step of all of the runs in one plot.
+
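+A minimal pass over steps 1 and 2 above, assuming `plot_data.py` sits at the repository root as the text suggests, amounts to editing two variables and re-running the script (a sketch, not an exact transcript of the repository files):
+```python
+# In Plotting/plot_params.py: skip the costly re-running step from step 1
+PLOT_RERUN = False
+
+# In plot_data.py: choose which figure to produce, then run `python3 plot_data.py`
+func_to_run = 'process_data'            # step 1: prepare the data
+# func_to_run = 'plot_value_functions'  # step 2: learned value functions
+```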

+ +

+
+ +3) **Plotting the learning curves with specific parameter values**: + Go to `plot_data`, and set `func_to_run = 'specific_learning_curves_full_bootstrap'`, and run the `plot_data.py` + script. +

+

+ +

+
+ +4) **Plotting the parameter studies for step size for all algorithms**: + Go to `plot_data`, and set `func_to_run = 'collision_sensitivity_curves_for_many_lambdas'`, and run the script. +

+

+ +

+
+ +5) **Plotting the parameter sensitivity study of Emphatic-TD algorithms**: + Go to `plot_data`, and set `func_to_run = 'collision_emphatics_sensitivity_full_bootstrap'`, and run the script. +

+

+ +

+
+ +6) **Plotting the parameter sensitivity study of Gradient-TD algorithms**: + Go to `plot_data`, and set `func_to_run = 'collision_gradients_sensitivity_full_bootstrap'`, and run the script. +

+

+ +

+
diff --git b/Registry/AlgRegistry.py a/Registry/AlgRegistry.py new file mode 100644 index 0000000..860843c --- /dev/null +++ a/Registry/AlgRegistry.py @@ -0,0 +1,18 @@ +from Algorithms.TD import TD +from Algorithms.GTD import GTD +from Algorithms.TDRC import TDRC +from Algorithms.GEMETD import GEMETD +from Algorithms.GTD2 import GTD2 +from Algorithms.PGTD2 import PGTD2 +from Algorithms.HTD import HTD +from Algorithms.ETDLB import ETDLB +from Algorithms.ETD import ETD +from Algorithms.ABTD import ABTD +from Algorithms.Vtrace import Vtrace +from Algorithms.TB import TB +from Algorithms.LSTD import LSTD +from Algorithms.LSETD import LSETD +alg_dict = {'TD': TD, 'Vtrace': Vtrace, 'GTD': GTD, 'ABTD': ABTD, 'ETD': ETD, 'TB': TB, 'GTD2': GTD2, 'HTD': HTD, + 'ETDLB': ETDLB, 'PGTD2': PGTD2, 'TDRC': TDRC, 'GEMETD': GEMETD, 'LSTD': LSTD, 'LSETD': LSETD} +# alg_dict = {'TD': TD, 'GTD': GTD, 'GTD2': GTD2, 'PGTD2': PGTD2, 'HTD': HTD, 'TDRC': TDRC, 'ETD': ETD, 'ETDLB': ETDLB, +# 'TB': TB, 'Vtrace': Vtrace, 'ABTD': ABTD, 'LSTD': LSTD, 'LSETD': 'LSETD'} diff --git b/Registry/EnvRegistry.py a/Registry/EnvRegistry.py new file mode 100644 index 0000000..513a0d5 --- /dev/null +++ a/Registry/EnvRegistry.py @@ -0,0 +1,3 @@ +from Environments.Chain import Chain +from Environments.FourRoomGridWorld import FourRoomGridWorld +environment_dict = {'FourRoomGridWorld': FourRoomGridWorld, 'Chain': Chain} diff --git b/Registry/TaskRegistry.py a/Registry/TaskRegistry.py new file mode 100644 index 0000000..a828297 --- /dev/null +++ a/Registry/TaskRegistry.py @@ -0,0 +1,6 @@ +from Tasks.EightStateCollision import EightStateCollision +from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat +from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat +task_dict = {'EightStateCollision': EightStateCollision, + 'LearnEightPoliciesTileCodingFeat': LearnEightPoliciesTileCodingFeat, + 'HighVarianceLearnEightPoliciesTileCodingFeat': HighVarianceLearnEightPoliciesTileCodingFeat} diff --git b/Resources/EightStateCollision/d_mu.npy a/Resources/EightStateCollision/d_mu.npy new file mode 100644 index 0000000..79bb2af Binary files /dev/null and a/Resources/EightStateCollision/d_mu.npy differ diff --git b/Resources/EightStateCollision/feature_rep.npy a/Resources/EightStateCollision/feature_rep.npy new file mode 100644 index 0000000..0296432 Binary files /dev/null and a/Resources/EightStateCollision/feature_rep.npy differ diff --git b/Resources/EightStateCollision/state_values.npy a/Resources/EightStateCollision/state_values.npy new file mode 100644 index 0000000..948e9fb Binary files /dev/null and a/Resources/EightStateCollision/state_values.npy differ diff --git b/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/d_mu.npy a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/d_mu.npy new file mode 100644 index 0000000..a316e8f Binary files /dev/null and a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/d_mu.npy differ diff --git b/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/feature_rep.npy a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/feature_rep.npy new file mode 100644 index 0000000..09e3ec2 Binary files /dev/null and a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/feature_rep.npy differ diff --git b/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/state_values.npy a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/state_values.npy new file mode 100644 index 0000000..b539dc0 Binary files /dev/null and 
a/Resources/HighVarianceLearnEightPoliciesTileCodingFeat/state_values.npy differ diff --git b/Resources/LearnEightPoliciesTileCodingFeat/d_mu.npy a/Resources/LearnEightPoliciesTileCodingFeat/d_mu.npy new file mode 100644 index 0000000..9a09131 Binary files /dev/null and a/Resources/LearnEightPoliciesTileCodingFeat/d_mu.npy differ diff --git b/Resources/LearnEightPoliciesTileCodingFeat/feature_rep.npy a/Resources/LearnEightPoliciesTileCodingFeat/feature_rep.npy new file mode 100644 index 0000000..09e3ec2 Binary files /dev/null and a/Resources/LearnEightPoliciesTileCodingFeat/feature_rep.npy differ diff --git b/Resources/LearnEightPoliciesTileCodingFeat/state_values.npy a/Resources/LearnEightPoliciesTileCodingFeat/state_values.npy new file mode 100644 index 0000000..d29743e Binary files /dev/null and a/Resources/LearnEightPoliciesTileCodingFeat/state_values.npy differ diff --git b/Tasks/BaseTask.py a/Tasks/BaseTask.py new file mode 100644 index 0000000..c6f9b73 --- /dev/null +++ a/Tasks/BaseTask.py @@ -0,0 +1,76 @@ +from abc import abstractmethod +import numpy as np + + +class BaseTask: + def __init__(self, **kwargs): + self.run_number = kwargs.get('run_number', 0) + self.num_steps = None + self.feature_rep = None + self.stacked_feature_rep = None # If learning more than one target policy at the same time + self.num_features = None + self.GAMMA = None + self.behavior_dist = None + self.state_values = None + self.num_policies = None + self.ABTD_xi_zero = None + self.ABTD_xi_max = None + + def stack_feature_rep(self): + stacked_feature_rep = np.zeros((self.num_policies, self.feature_rep.shape[1], self.feature_rep.shape[0])) + for i in range(self.feature_rep.shape[0]): + stacked_x = np.tile(self.feature_rep[i, :], [self.num_policies, 1]) + stacked_feature_rep[:, :, i] = stacked_x + return stacked_feature_rep + + def get_active_policies(self, s): + ... + + def get_terminal_policies(self, s): + ... + + def generate_behavior_dist(self, total_steps): + ... + + @staticmethod + def num_of_policies(): + ... + + @abstractmethod + def load_feature_rep(self): + ... + + @abstractmethod + def get_state_feature_rep(self, s): + ... + + @abstractmethod + def create_feature_rep(self): + ... + + @abstractmethod + def select_target_action(self, s, policy_id=0): + ... + + @abstractmethod + def select_behavior_action(self, s): + ... + + @abstractmethod + def get_pi(self, s, a): + ... + + @abstractmethod + def get_mu(self, s, a): + ... 
+ + @abstractmethod + def load_behavior_dist(self): + return self.behavior_dist + + @abstractmethod + def load_state_values(self): + return self.state_values + + def __str__(self): + return f'task:{type(self).__name__}' diff --git b/Tasks/EightStateCollision.py a/Tasks/EightStateCollision.py new file mode 100644 index 0000000..60c6f65 --- /dev/null +++ a/Tasks/EightStateCollision.py @@ -0,0 +1,75 @@ +import numpy as np + +from Environments.Chain import Chain +from Tasks.BaseTask import BaseTask + + +class EightStateCollision(BaseTask, Chain): + + def __init__(self, **kwargs): + BaseTask.__init__(self, **kwargs) + Chain.__init__(self) + self._resource_root_path = kwargs.get('resource_root_path', 'Resources') + + self.N = kwargs.get('n', 8) + self.feature_rep = self.load_feature_rep() + self.num_features = self.feature_rep.shape[1] + self.num_steps = kwargs.get('num_steps', 20000) + self.GAMMA = 0.9 + self.behavior_dist = self.load_behavior_dist() + self.state_values = self.load_state_values() + self.num_policies = EightStateCollision.num_of_policies() + self.ABTD_xi_zero = 1 + self.ABTD_xi_max = 2 + + @staticmethod + def num_of_policies(): + return 1 + + def load_feature_rep(self): + return np.load(f'{self._resource_root_path}/{self.__class__.__name__}/feature_rep.npy')[:, :, self.run_number] + + def create_feature_rep(self): + num_ones = 3 + num_zeros = self.num_features - num_ones + for i in range(self.N): + random_arr = (np.array([0] * num_zeros + [1] * num_ones)) + np.random.shuffle(random_arr) + self.feature_rep[i, :] = random_arr + + def get_state_feature_rep(self, s): + return self.feature_rep[s, :] + + def load_behavior_dist(self): + self.behavior_dist = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/d_mu.npy') + return self.behavior_dist + + def load_state_values(self): + self.state_values = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/state_values.npy') + return self.state_values + + def select_behavior_action(self, s): + if s < self.N / 2: + return self.RIGHT_ACTION + else: + return np.random.choice([self.RIGHT_ACTION, self.RETREAT_ACTION]) + + def select_target_action(self, s, policy_id=0): + return self.RIGHT_ACTION + + def get_pi(self, s, a): + if a == self.RIGHT_ACTION: + return 1 + else: + return 0 + + def get_mu(self, s, a): + if s < self.N / 2: + if a == self.RIGHT_ACTION: + return 1 + else: + return 0 + elif s >= self.N / 2: + return 0.5 + else: + raise AssertionError diff --git b/Tasks/HighVarianceLearnEightPoliciesTileCodingFeat.py a/Tasks/HighVarianceLearnEightPoliciesTileCodingFeat.py new file mode 100644 index 0000000..315bf28 --- /dev/null +++ a/Tasks/HighVarianceLearnEightPoliciesTileCodingFeat.py @@ -0,0 +1,42 @@ +import numpy as np +from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat + + +class HighVarianceLearnEightPoliciesTileCodingFeat(LearnEightPoliciesTileCodingFeat): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.RANDOM_PROB = 0.97 + + def select_behavior_action(self, s): + random_num = np.random.random() + x, y = self.get_xy(s) + if x == 1 and (y == 1 or y == 8): + if random_num < self.RANDOM_PROB: + return self.ACTION_LEFT + else: + return np.random.choice([self.ACTION_UP, self.ACTION_RIGHT, self.ACTION_DOWN]) + if x == 8 and (y == 1 or y == 8): + if random_num < self.RANDOM_PROB: + return self.ACTION_RIGHT + else: + return np.random.choice([self.ACTION_UP, self.ACTION_LEFT, self.ACTION_DOWN]) + return np.random.choice([self.ACTION_UP, self.ACTION_DOWN, 
self.ACTION_LEFT, self.ACTION_RIGHT]) + + def get_mu(self, s, a): + x, y = self.get_xy(s) + if x == 1 and (y == 1 or y == 8): + if a == self.ACTION_LEFT: + return np.ones(self.num_policies) * self.RANDOM_PROB + # return 0.97 + else: + return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0) + # return 0.01 + if x == 8 and (y == 1 or y == 8): + if a == self.ACTION_RIGHT: + return np.ones(self.num_policies) * self.RANDOM_PROB + # return 0.97 + else: + return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0) + # return 0.01 + + return super().get_mu(s, a) diff --git b/Tasks/LearnEightPoliciesTileCodingFeat.py a/Tasks/LearnEightPoliciesTileCodingFeat.py new file mode 100644 index 0000000..9953efe --- /dev/null +++ a/Tasks/LearnEightPoliciesTileCodingFeat.py @@ -0,0 +1,207 @@ +import numpy as np +import random + +from Environments.FourRoomGridWorld import FourRoomGridWorld +from Tasks.BaseTask import BaseTask +from utils import ImmutableDict + + +class LearnEightPoliciesTileCodingFeat(BaseTask, FourRoomGridWorld): + def __init__(self, **kwargs): + BaseTask.__init__(self) + FourRoomGridWorld.__init__(self) + self.feature_rep = self.load_feature_rep() + self.num_features = self.feature_rep.shape[1] + self.num_steps = kwargs.get('num_steps', 50000) + self.GAMMA = 0.9 + self.behavior_dist = self.load_behavior_dist() + self.state_values = self.load_state_values() + self.ABTD_xi_zero = 1 + self.ABTD_xi_max = 4 + + self.optimal_policies = ImmutableDict( + { + 0: [ + [lambda x, y: 0 <= x <= 3 and 2 <= y <= 4, [self.ACTION_DOWN, self.ACTION_RIGHT]], + [lambda x, y: 3 >= x >= 0 == y, [self.ACTION_UP, self.ACTION_RIGHT]], + [lambda x, y: 0 <= x <= 4 and y == 1, [self.ACTION_RIGHT]], + [lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_DOWN]], + [lambda x, y: 4 == x and 2 <= y <= 4, [self.ACTION_DOWN]], + [lambda x, y: 4 == x and y == 0, [self.ACTION_UP]] + ], + 1: [ + [lambda x, y: 2 <= x <= 4 and 0 <= y <= 3, [self.ACTION_LEFT, self.ACTION_UP]], + [lambda x, y: x == 0 and 0 <= y <= 3, [self.ACTION_RIGHT, self.ACTION_UP]], + [lambda x, y: x == 1 and 0 <= y <= 4, [self.ACTION_UP]], + [lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_LEFT]], + [lambda x, y: 2 <= x <= 4 and y == 4, [self.ACTION_LEFT]], + [lambda x, y: x == 0 and y == 4, [self.ACTION_RIGHT]], + ], + 2: [ + [lambda x, y: 2 <= x <= 4 and 7 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]], + [lambda x, y: x == 0 and 7 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]], + [lambda x, y: x == 1 and 6 <= y <= 10, [self.ACTION_DOWN]], + [lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_LEFT]], + [lambda x, y: 2 <= x <= 4 and y == 6, [self.ACTION_LEFT]], + [lambda x, y: x == 0 and y == 6, [self.ACTION_RIGHT]], + ], + 3: [ + [lambda x, y: 0 <= x <= 3 and 6 <= y <= 7, [self.ACTION_UP, self.ACTION_RIGHT]], + [lambda x, y: 0 <= x <= 3 and 9 <= y <= 10, [self.ACTION_DOWN, self.ACTION_RIGHT]], + [lambda x, y: 0 <= x <= 4 and y == 8, [self.ACTION_RIGHT]], + [lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_UP]], + [lambda x, y: x == 4 and 6 <= y <= 7, [self.ACTION_UP]], + [lambda x, y: x == 4 and 9 <= y <= 10, [self.ACTION_DOWN]] + ], + 4: [ + [lambda x, y: 10 >= x >= 7 >= y >= 5, [self.ACTION_LEFT, self.ACTION_UP]], + [lambda x, y: 7 <= x <= 10 and 9 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]], + [lambda x, y: 6 <= x <= 10 and y == 8, [self.ACTION_LEFT]], + [lambda x, y: x == self.hallways[3][0] and y == 
self.hallways[3][1], [self.ACTION_UP]], + [lambda x, y: x == 6 and 5 <= y <= 7, [self.ACTION_UP]], + [lambda x, y: x == 6 and 9 <= y <= 10, [self.ACTION_DOWN]] + ], + 5: [ + [lambda x, y: 6 <= x <= 7 and 6 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]], + [lambda x, y: 9 <= x <= 10 and 6 <= y <= 10, [self.ACTION_DOWN, self.ACTION_LEFT]], + [lambda x, y: x == 8 and 5 <= y <= 10, [self.ACTION_DOWN]], + [lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_RIGHT]], + [lambda x, y: 6 <= x <= 7 and y == 5, [self.ACTION_RIGHT]], + [lambda x, y: 9 <= x <= 10 and y == 5, [self.ACTION_LEFT]] + ], + 6: [ + [lambda x, y: 6 <= x <= 7 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_RIGHT]], + [lambda x, y: 9 <= x <= 10 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_LEFT]], + [lambda x, y: x == 8 and 0 <= y <= 3, [self.ACTION_UP]], + [lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_RIGHT]], + [lambda x, y: 6 <= x <= 7 and y == 3, [self.ACTION_RIGHT]], + [lambda x, y: 9 <= x <= 10 and y == 3, [self.ACTION_LEFT]] + ], + 7: [ + [lambda x, y: 7 <= x <= 10 and 2 <= y <= 3, [self.ACTION_DOWN, self.ACTION_LEFT]], + [lambda x, y: 7 <= x <= 10 and y == 0, [self.ACTION_UP, self.ACTION_LEFT]], + [lambda x, y: 6 <= x <= 10 and y == 1, [self.ACTION_LEFT]], + [lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], [self.ACTION_DOWN]], + [lambda x, y: x == 6 and 2 <= y <= 3, [self.ACTION_DOWN]], + [lambda x, y: x == 6 and y == 0, [self.ACTION_UP]] + ] + } + ) + self.default_actions = ImmutableDict( + { + 0: self.ACTION_RIGHT, + 1: self.ACTION_UP, + 2: self.ACTION_DOWN, + 3: self.ACTION_RIGHT, + 4: self.ACTION_LEFT, + 5: self.ACTION_DOWN, + 6: self.ACTION_UP, + 7: self.ACTION_LEFT + } + ) + self.policy_terminal_condition = ImmutableDict( + { + 0: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], + 1: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], + 2: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], + 3: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], + 4: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], + 5: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], + 6: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], + 7: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1] + } + ) + self.num_policies = LearnEightPoliciesTileCodingFeat.num_of_policies() + self.stacked_feature_rep = self.stack_feature_rep() + self._active_policies_cache = {} + + @staticmethod + def num_of_policies(): + return 8 + + def get_terminal_policies(self, s): + x, y = self.get_xy(s) + terminal_policies = np.zeros(self.num_policies) + for policy_id, condition in self.policy_terminal_condition.items(): + if condition(x, y): + terminal_policies[policy_id] = 1 + return terminal_policies + + def get_state_index(self, x, y): + return int(y * np.sqrt(self.feature_rep.shape[0]) + x) + + def get_probability(self, policy_number, s, a): + x, y = self.get_xy(s) + probability = 0.0 + for condition, possible_actions in self.optimal_policies[policy_number]: + if condition(x, y): + if a in possible_actions: + probability = 1.0 / len(possible_actions) + break + return probability + + def select_target_action(self, s, policy_id=0): + x, y = self.get_xy(s) + a = self.default_actions[policy_id] + for condition, possible_actions in self.optimal_policies[policy_id]: + if condition(x, y): + a = random.choice(possible_actions) + break + return a + + def 
get_active_policies(self, s): + if s in self._active_policies_cache: + return self._active_policies_cache[s] + x, y = self.get_xy(s) + active_policy_vec = np.zeros(self.num_policies, dtype=int) + for policy_number, policy_values in self.optimal_policies.items(): + for (condition, _) in policy_values: + if condition(x, y): + active_policy_vec[policy_number] = 1 + break + self._active_policies_cache[s] = active_policy_vec + return active_policy_vec + + def load_feature_rep(self): + return np.load(f'Resources/{self.__class__.__name__}/feature_rep.npy') + + def get_state_feature_rep(self, s): + return self.feature_rep[s, :] + + def create_feature_rep(self): + ... + + def load_behavior_dist(self): + return np.load(f'Resources/{self.__class__.__name__}/d_mu.npy') + + def load_state_values(self): + return np.load(f'Resources/{self.__class__.__name__}/state_values.npy') + + def select_behavior_action(self, s): + return np.random.randint(0, self.num_actions) + + def get_mu(self, s, a): + return np.ones(self.num_policies) * (1.0 / self.num_actions) + + def get_pi(self, s, a): + pi_vec = np.zeros(self.num_policies) + for policy_id, i in enumerate(self.get_active_policies(s)): + if i: + pi_vec[policy_id] = self.get_probability(policy_id, s, a) + return pi_vec + + def generate_behavior_dist(self, total_steps): + final_state_dist = np.zeros((self.num_policies, self.num_states)) + s = self.reset() + state_visitation_count = np.zeros(self.num_states) + for step in range(total_steps): + if step % 100000 == 0: + print(step) + state_visitation_count[s] += 1 + sp, r, is_terminal, _ = self.step(self.select_behavior_action(s)) + s = sp + for s in range(self.num_states): + for policy_id, i in enumerate(self.get_active_policies(s)): + if i: + final_state_dist[policy_id, s] = state_visitation_count[s] + return (final_state_dist / total_steps).T diff --git b/Tests/Algorithms/TestTD.py a/Tests/Algorithms/TestTD.py new file mode 100644 index 0000000..c179b70 --- /dev/null +++ a/Tests/Algorithms/TestTD.py @@ -0,0 +1,46 @@ +import unittest +import numpy as np + +from Algorithms.TD import TD +from Environments.Chain import Chain +from Tasks.EightStateCollision import EightStateCollision + + +class TestTD(unittest.TestCase): + def setUp(self) -> None: + params = { + #'resource_root_path': '../../Resources', + 'alpha': 0.001953125, + 'lmbda': 0.9, + } + self.env = Chain() + self.task = EightStateCollision(**params) + self.task.reset() + + self.alg = TD(task=self.task, **params) + + def tearDown(self) -> None: + ... 
+ + def test_initial_w_is_zero(self): + self.assertEqual(self.alg.w.sum(), 0) + + def test_initial_z_is_zero(self): + self.assertEqual(self.alg.z.sum(), 0) + + def test_learn_single_policy_rmsve_after_num_steps(self): + rmsve_of_run = np.zeros((self.task.num_policies, self.task.num_steps)) + np.random.seed(0) + + self.alg.state = self.env.reset() + for step in range(self.task.num_steps): + rmsve_of_run[:, step], error = self.alg.compute_rmsve() + self.alg.action = self.alg.choose_behavior_action() + self.alg.next_state, r, is_terminal, info = self.env.step(self.alg.action) + self.alg.learn(self.alg.state, self.alg.next_state, r, is_terminal) + if is_terminal: + self.alg.state = self.env.reset() + self.alg.reset() + continue + self.alg.state = self.alg.next_state + self.assertTrue(abs(0.08319472840990755 - rmsve_of_run[0, -1]) <= 0.0000001) diff --git b/Tests/Environments/TestChain.py a/Tests/Environments/TestChain.py new file mode 100644 index 0000000..1eba67b --- /dev/null +++ a/Tests/Environments/TestChain.py @@ -0,0 +1,27 @@ +import unittest + +from Environments.Chain import Chain + + +class TestChain(unittest.TestCase): + def setUp(self) -> None: + self.env = Chain() + self.env.reset() + + def tearDown(self) -> None: + self.env.reset() + + def test_rest_initial_state_between_zero_three(self): + self.env.reset() + self.assertIn(self.env._state, [0, 1, 2, 3]) + + def test_step_retreat_move_state_to_initial_state(self): + self.env.reset() + sp, r, is_done, _ = self.env.step(self.env.RETREAT_ACTION) + self.assertEqual(is_done, True) + + def test_step_right_move_state_one_step_to_right(self): + self.env.reset() + s = self.env._state + sp, r, is_done, _ = self.env.step(self.env.RIGHT_ACTION) + self.assertEqual(sp - s, 1) diff --git b/Tests/Tasks/TestEightStateCollision.py a/Tests/Tasks/TestEightStateCollision.py new file mode 100644 index 0000000..252d847 --- /dev/null +++ a/Tests/Tasks/TestEightStateCollision.py @@ -0,0 +1,61 @@ +import unittest +from Tasks.EightStateCollision import EightStateCollision +from Environments.Chain import Chain + + +class TestEightStateCollision(unittest.TestCase): + def setUp(self) -> None: + params = { + #'resource_root_path': '../../Resources' + } + self.experiment = EightStateCollision(**params) + self.experiment.reset() + + def tearDown(self) -> None: + ... + + def test_load_feature_rep_evaluate_shape_is_(self): + feature_rep_arr = self.experiment.load_feature_rep() + self.assertEqual(feature_rep_arr.shape, (8, 6)) + + def test_get_state_feature_rep_state_for_all_states(self): + expected_states_feature_rep = [ + [0., 0., 1., 0., 1., 1.], + [1., 1., 1., 0., 0., 0.], + [0., 1., 1., 0., 0., 1.], + [1., 0., 1., 1., 0., 0.], + [1., 1., 0., 0., 1., 0.], + [0., 1., 1., 1., 0., 0.], + [1., 1., 0., 0., 0., 1.], + [1., 0., 1., 0., 0., 1.] 
+ ] + evaluated_states_feature_rep = [] + for state in range(self.experiment.N): + evaluated_states_feature_rep.append(list(self.experiment.get_state_feature_rep(state))) + self.assertListEqual(evaluated_states_feature_rep, expected_states_feature_rep) + + def test_load_behavior_dist_evaluate_shape_is_(self): + behavior_dist = self.experiment.load_behavior_dist() + self.assertEqual(behavior_dist.shape, (8,)) + + def test_get_mu_for_right_action_in_initial_state_is_one(self): + mu = self.experiment.get_mu(0, self.experiment.RIGHT_ACTION) + self.assertEqual(mu, 1) + + def test_get_mu_for_retreat_action_in_initial_state_is_zero(self): + mu = self.experiment.get_mu(0, self.experiment.RETREAT_ACTION) + self.assertEqual(mu, 0) + + def test_get_mu_for_all_action_in_not_initial_state_is_one_half(self): + mu = self.experiment.get_mu(5, self.experiment.RIGHT_ACTION) + self.assertEqual(mu, 0.5) + mu = self.experiment.get_mu(5, self.experiment.RETREAT_ACTION) + self.assertEqual(mu, 0.5) + + def test_get_pi_for_right_action_is_one(self): + pi = self.experiment.get_pi(0, self.experiment.RIGHT_ACTION) + self.assertEqual(pi, 1) + + def test_get_pi_for_retreat_action_is_one(self): + pi = self.experiment.get_pi(0, self.experiment.RETREAT_ACTION) + self.assertEqual(pi, 0) diff --git b/data_presister.py a/data_presister.py new file mode 100644 index 0000000..59abdfc --- /dev/null +++ a/data_presister.py @@ -0,0 +1,222 @@ +import itertools +import json +import os +from collections import defaultdict +from itertools import zip_longest +from typing import List, Optional, Dict + +import numpy as np + +from Job.JobBuilder import default_params +from Plotting.plot_params import EXP_ATTRS +from Plotting.plot_utils import load_and_replace_large_nan_inf +from Registry.AlgRegistry import alg_dict +from utils import Configuration + + +def split_dict_of_list_to_dicts(dict_of_list: Dict[str, list]) -> List[Dict[str, float]]: + """split a given dictionary of lists into list of dictionaries. + + >>> split_dict_of_list_to_dicts({'alpha': [1, 2, 3], 'lambda': [4, 5], 'gamma': [6]}) + [{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}] + + Args: + dict_of_list (Dict[str, list]): a dictionary of lists. + + Returns: + List[Dict[str, float]]: list of dictionaries. 
+ + + """ + keys = dict_of_list.keys() + values = [[e for e in result if e is not None] for result in itertools.product(*dict_of_list.values())] + result = [dict(zip(keys, v)) for v in values] + return result + + +def group_dicts_by_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[str, List[Dict[str, float]]]: + """ + >>> group_dicts_by_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]) + {1: [{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}], 2: [{'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}], 3: [{'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]} + + """ + first_key = get_first_key_of_dictionary(list_of_dicts[0]) + final_grouped = defaultdict(list) + for inner_dict in list_of_dicts: + final_grouped[inner_dict[first_key]].append(inner_dict) + + return dict(final_grouped) + + +def group_dicts_over_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[tuple, List[float]]: + """ + >>> group_dicts_over_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]) + {(('lambda', 4), ('gamma', 6)): [1, 2, 3], (('lambda', 5), ('gamma', 6)): [1, 2, 3]} + + :param list_of_dicts: + :return: + """ + first_key = get_first_key_of_dictionary(list_of_dicts[0]) + final_grouped = defaultdict(list) + for inner_dict in list_of_dicts: + first_value = inner_dict[first_key] + del inner_dict[first_key] + final_grouped[tuple(inner_dict.items())].append(first_value) + + return dict(final_grouped) + + +def find_best_performance(exp_name, alg_name, second_param, auc_or_final) -> Dict[str, float]: + exp_attrs = EXP_ATTRS[exp_name](exp_name) + best_params = {} + best_perf = np.inf + + all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build() + list_of_configuration = split_dict_of_list_to_dicts(all_configuration) + first_param_key = get_first_key_of_dictionary(all_configuration) + + grouped_over_first = group_dicts_over_first_key(list_of_configuration) + + for grouped, first_values in grouped_over_first.items(): + second_param_name, second_param_value = grouped[0] + if second_param_value != second_param: + continue + grouped_params = dict(grouped) + current_params = Configuration(grouped_params) + current_params[first_param_key] = None + current_params.algorithm = alg_name + current_params.save_path = PathFactory.make_result_path(exp_name, alg_name) + current_params.rerun = False + + current_configuration_over_first_full_path = DataPersister.create_full_path_file_name(f'_mean_{auc_or_final}_over_alpha', current_params, + excluded_params=[first_param_key]) + + current_perf = load_and_replace_large_nan_inf( + current_configuration_over_first_full_path, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement) + + min_perf = min(current_perf) + if min_perf < best_perf: + best_perf = min_perf + best_perf_idx = int(np.nanargmin(current_perf)) + best_params = current_params + best_params[first_param_key] = first_values[best_perf_idx] + + return best_params + + +def get_first_key_of_dictionary(d: dict) -> str: + return list(d.keys())[0] + + +class ParameterBuilder: + def 
__init__(self): + self.final_params_dict = dict() + + def add_algorithm_params(self, configuration: Configuration): + for k in alg_dict[configuration.algorithm].related_parameters(): + self.final_params_dict[k] = configuration[k] + return self + + def build(self): + return self.final_params_dict + + +class JsonParameterBuilder: + def __init__(self): + self.final_params_dict = dict() + self.exp_name = None + self.alg_name = None + self.alg_related_params = None + + def add_experiment(self, exp_name): + self.exp_name = exp_name + return self + + def add_algorithm(self, alg_name): + self.alg_name = alg_name + self.alg_related_params = alg_dict[alg_name].related_parameters() + return self + + def build(self) -> Dict[str, list]: + json_path = PathFactory.make_experiment_path(self.exp_name, self.alg_name) + + with open(json_path) as f: + json_config = json.load(f) + + for param_name in self.alg_related_params: + self.final_params_dict[param_name] = list(json_config['meta_parameters'].get(param_name, [default_params['meta_parameters'][param_name]])) + + return self.final_params_dict + + +class PathFactory: + @staticmethod + def make_experiment_path(exp_name, alg_name): + return os.path.join(os.getcwd(), 'Experiments', exp_name, alg_name, f'{alg_name}.json') + + @staticmethod + def make_result_path(exp_name, alg_name): + return os.path.join(os.getcwd(), 'Results', exp_name, alg_name) + + +class DataPersister: + + @staticmethod + def save_result(result_arr: np.ndarray, result_name: str, configuration: Configuration, excluded_params: Optional[list] = None): + full_path_file_to_save = DataPersister.create_full_path_file_name(result_name, configuration, excluded_params) + if not os.path.exists(os.path.dirname(full_path_file_to_save)): + os.makedirs(os.path.dirname(full_path_file_to_save)) + np.save(full_path_file_to_save, result_arr) + + @staticmethod + def save_best_pref_over_first_param(exp_name, alg_name, auc_or_final): + all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build() + list_of_configuration = split_dict_of_list_to_dicts(all_configuration) + first_param_key = get_first_key_of_dictionary(all_configuration) + first_param_length = len(all_configuration[first_param_key]) + mean_over_alpha, stderr_over_alpha = np.zeros(first_param_length), np.zeros(first_param_length) + + grouped_over_first = group_dicts_over_first_key(list_of_configuration) + + for grouped, first_values in grouped_over_first.items(): + grouped_params = dict(grouped) + current_params = Configuration(grouped_params) + current_params[first_param_key] = None + current_params.algorithm = alg_name + current_params.save_path = PathFactory.make_result_path(exp_name, alg_name) + current_params.rerun = False + + for index, first_value in enumerate(first_values): + current_params[first_param_key] = first_value + full_path_file_to_save = DataPersister.create_full_path_file_name(f'_mean_stderr_{auc_or_final}', current_params) + # perf = np.load(full_path_file_to_save) + # mean_over_alpha[index], stderr_over_alpha[index] = perf[0], perf[1] + + # TODO: in case the rerun postfix is needed it should implement + DataPersister.save_result(mean_over_alpha, f"_mean_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key]) + DataPersister.save_result(stderr_over_alpha, f"_stderr_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key]) + + @staticmethod + def create_full_path_file_name(result_name: str, configuration: Configuration, excluded_params: Optional[list] 
= None) -> str: + params = ParameterBuilder().add_algorithm_params(configuration).build() + file_name_to_save = DataPersister.create_file_name(params, excluded_params=excluded_params) + full_path_file_to_save = os.path.join(configuration.save_path, file_name_to_save) + full_path_file_to_save = f'{full_path_file_to_save}{result_name}' + if configuration.rerun: + full_path_file_to_save = f'{full_path_file_to_save}_rerun' + return f'{full_path_file_to_save}.npy' + + @staticmethod + def create_file_name(param: dict, excluded_params: Optional[list]) -> str: + if excluded_params is None: + excluded_params = [] + final_str = '' + for k, v in param.items(): + if k in excluded_params: + continue + if k == 'alpha' or k == 'eta': + split_str = str.split(f'{v:.10f}', '.') + else: + split_str = str.split(f'{v:.5f}', '.') + final_str += '_' + k + split_str[0] + split_str[1] + return final_str diff --git b/main.py a/main.py new file mode 100644 index 0000000..28f2460 --- /dev/null +++ a/main.py @@ -0,0 +1,15 @@ +import os +from Job.JobBuilder import JobBuilder +import argparse +from utils import find_all_experiment_configuration + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--directory_or_file', '-f', type=str, help='Json file path or Json files directory', required=True) + parser.add_argument('--server', '-s', type=str, help='Input server name, Cedar or Niagara', required=True) + args = parser.parse_args() + for path in find_all_experiment_configuration(args.directory_or_file): + builder = JobBuilder(json_path=os.path.join(os.getcwd(), path), server_name=args.server) + builder() diff --git b/plot_data.py a/plot_data.py new file mode 100644 index 0000000..a253c4c --- /dev/null +++ a/plot_data.py @@ -0,0 +1,479 @@ +from Plotting.plot_all_sensitivities_per_alg_gradients import plot_all_sensitivities_per_alg_gradients +from Plotting.plot_all_sensitivities_per_alg_gradients_all_eta import plot_all_sensitivities_per_alg_gradients_all_eta +from Plotting.plot_best_learning_curve_over_all_params import plot_learning_curve_best_overall_params +from Plotting.plot_dist import plot_distribution, plot_dist_for_two_four_room_tasks +from Plotting.plot_all_sensitivities_per_alg_emphatics import plot_all_sensitivities_per_alg_emphatics +from Plotting.plot_learning_curve import plot_learning_curve +from Plotting.plot_learning_curves_for_all_third_params import plot_all_learning_curves_for_third +from Plotting.plot_learning_for_two_lambdas import plot_learning_curve_for_lambdas +from Plotting.plot_sensitivity import plot_sensitivity_curve +from Plotting.plot_sensitivity_for_two_lambdas import plot_sensitivity_for_lambdas +from Plotting.plot_specific_learning_curves import plot_specific_learning_curves +from Plotting.plot_waterfall import plot_waterfall_scatter +from Plotting.process_state_value_function import plot_all_final_value_functions, plot_value_functions +from process_data import process_data + +func_to_run = 'hv_four_rooms_specific_learning_curves_full_bootstrap' +if 'collision' in func_to_run: + exps = ['FirstChain'] # FirstChain OR FirstFourRoom OR 1HVFourRoom +elif 'hv' in func_to_run: + exps = ['1HVFourRoom'] +else: + exps = ['FirstFourRoom'] + +# region process data +if func_to_run == 'process_data': + exps = ['FirstChain', 'FirstFourRoom', '1HVFourRoom'] + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc', 'final'] + sp_list = [1.0] + process_data(exps=exps, algs=algs, 
auc_or_final=auc_or_final, sp_list=sp_list) +# endregion + + +# ==================== +# ==================== + +# ==================== +# ==================== + + +# region Collision figures +# region learning curves +if func_to_run == 'collision_specific_learning_curves_full_bootstrap': + auc_or_final = ['auc'] + fig_size = (10, 4) + sp = 0.0 + if 'FirstChain' in exps: + exp = 'FirstChain' + algs = ['ETD', 'TD', 'GTD', 'TDRC', 'PGTD2'] + specific_params = { + 'TD': {'alpha': 0.03125, 'lmbda': sp}, + 'ETD': {'alpha': 0.00390625, 'lmbda': sp}, + 'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 0.01}, + 'GTD': {'alpha': 0.000976562, 'lmbda': sp, 'eta': 16.0}, + 'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0} + } + plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final, + specific_params=specific_params) + if 'FirstFourRoom' in exps: + exp = 'FirstFourRoom' + algs = ['LSTD', 'LSETD', 'ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2'] + specific_params = { + 'TD': {'alpha': 0.25, 'lmbda': sp}, + 'ETD': {'alpha': 0.00390625, 'lmbda': sp}, + 'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}, + 'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}, + 'GTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}, + 'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0} + } + plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final, + specific_params=specific_params) + + if '1HVFourRoom' in exps: + exp = '1HVFourRoom' + algs = ['LSTD', 'LSETD', 'ETDLB', 'TD', 'GTD', 'TDRC', 'PGTD2'] + specific_params = { + 'TD': {'alpha': 0.25, 'lmbda': sp}, + 'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}, + 'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}, + 'GTD': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}, + 'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0} + } + plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final, + specific_params=specific_params) +if func_to_run == 'collision_learning_curves_for_all_extra_params_full_bootstrapping': + algs = ['PGTD2', 'GTD', 'LSTD'] + sp_list = [0.0] + fig_size = (10, 4) + auc_or_final = ['auc'] + # tp_list = [0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0] + tp_list = [0.25] + plot_all_learning_curves_for_third(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final, tp_list=tp_list) +if func_to_run == 'collision_learning_curve_for_two_lambdas': + sp_list = [0.0, 0.9] + fig_size = (6, 4) + alg_groups = {'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + plot_learning_curve_for_lambdas(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'collision_best_learning_curves_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', + 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'collision_best_learning_curves_some_algs_full_bootstrap': + sp_list = 
[0.0] + fig_size = (6, 4) + alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + is_smoothed=True, smoothing_window=1) +if func_to_run == 'collision_best_learning_curves_some_algs_medium_bootstrap': + sp_list = [0.5] + fig_size = (6, 4) + alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + is_smoothed=True, smoothing_window=500) +if func_to_run == 'collision_best_learning_curves_some_algs_minimal_bootstrap': + sp_list = [0.9] + fig_size = (6, 4) + alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + is_smoothed=True, smoothing_window=500) +if func_to_run == 'collision_best_learning_curves_some_algs_no_bootstrap': + sp_list = [1.0] + fig_size = (6, 4) + alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + is_smoothed=True, smoothing_window=500) +if func_to_run == 'collision_best_learning_curves_full_bootstrap_rerun_and_original': # also need to set PLOT_RERUN = False + # and PLOT_RERUN_AND_ORIG = True in plot_params. Also some changes are necessary in the plot_learning_curve function + # like setting the colors and stuff for the re-run and original plots. + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'all_algs': ['GTD']} + auc_or_final = ['final'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +# endregion +# region sensitivity curves +if func_to_run == 'collision_sensitivity_curves_for_many_lambdas': + sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0] + fig_size = (10, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + # algs = ['TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'collision_emphatics_sensitivity_full_bootstrap': + sp_list = [0.0] + fig_size = (11, 5) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'collision_gradients_sensitivity_full_bootstrap': + sp_list = [0.0] + fig_size = (11, 4) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'collision_gradients_sensitivity_full_bootstrap_all_eta': + sp_list = [0.0] + fig_size = (10, 6) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'collision_TDRC_all_eta_one_beta': + sp_list = [0.0] + tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot. 
+ fig_size = (10, 6) + algs = ['TDRC'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final, tdrc_beta=tdrc_beta) +if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping' or func_to_run == 'collision_waterfall_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'], + 'emphatics': ['ETD', 'ETDLB'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping': + plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) + elif func_to_run == 'collision_waterfall_full_bootstrap': + plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'collision_emphatics_sensitivity_minimal_bootstrap': + sp_list = [0.9] + fig_size = (6, 4) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'collision_sensitivity_curves_for_two_lambdas': + sp_list = [0.0, 0.9] + fig_size = (6, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +# endregion + +# endregion + + +# ==================== +# ==================== + + +# region FOUR ROOMS FIGURES +# region learning curves +if func_to_run == 'four_rooms_specific_learning_curves_full_bootstrap': + auc_or_final = ['auc'] + fig_size = (10, 4) + sp = 0.0 + exp = 'FirstFourRoom' + algs = ['ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2'] + specific_params = { + 'TD': {'alpha': 0.0625, 'lmbda': 0.0}, + 'ETD': {'alpha': 0.000488281, 'lmbda': sp}, + 'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}, + 'TDRC': {'alpha': 0.125, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 1.0}, + 'GTD2': {'alpha': 0.001953125, 'lmbda': sp, 'eta': 16.0}, + 'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0} + } + plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final, + specific_params=specific_params) +if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', + 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap_2': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_best_overall_params_learning_curves': + fig_size = (10, 4) + alg_groups = 
{'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', + 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final) +# endregion + +# region sensitivity curves +if func_to_run == 'four_rooms_sensitivity_curves_for_many_lambdas': + sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0] + fig_size = (10, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_min_performance = False + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + plot_min_performance=plot_min_performance) +if func_to_run == 'four_rooms_emphatics_sensitivity_full_bootstrap': + sp_list = [0.0] + # fig_size = (11, 5) + fig_size = (10, 4) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap_all_eta': + sp_list = [0.0] + fig_size = (10, 6) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_TDRC_all_eta_one_beta': + sp_list = [0.0] + tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot. 
+ fig_size = (10, 6) + algs = ['TDRC'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final, tdrc_beta=tdrc_beta) +if func_to_run == 'four_rooms_best_sensitivity_curves_full_bootstrapping' or func_to_run == 'collision_waterfall_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'], + 'emphatics': ['ETD', 'ETDLB'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping': + plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) + elif func_to_run == 'collision_waterfall_full_bootstrap': + plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_emphatics_sensitivity_minimal_bootstrap': + sp_list = [0.9] + fig_size = (6, 4) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'four_rooms_sensitivity_curves_for_two_lambdas': + sp_list = [0.0, 0.9] + fig_size = (6, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +# endregion +# endregion + + +# ==================== +# ==================== + + +# region HIGH VARIANCE FOUR ROOMS FIGURES +# region learning curves +if func_to_run == 'hv_four_rooms_specific_learning_curves_full_bootstrap': + auc_or_final = ['auc'] + fig_size = (10, 4) + sp = 0.0 + exp = '1HVFourRoom' + algs = ['ETD', 'TD', 'GTD', 'TB'] + specific_params = { + 'TD': {'alpha': 0.0078125, 'lmbda': sp}, + 'ETD': {'alpha': 0.000244140, 'lmbda': sp}, + 'GTD': {'alpha': 0.000488281, 'lmbda': sp, 'eta': 16.0}, + 'TB': {'alpha': 0.03125, 'lmbda': 1.0} + } + plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final, + specific_params=specific_params) +if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', + 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap_2': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_best_overall_params_learning_curves': + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'], + 'emphatics': ['ETD', 
'ETDLB', 'LSETD'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', + 'LSTD', 'LSETD']} + auc_or_final = ['auc'] + plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final) +# endregion + +# region sensitivity curves +if func_to_run == 'hv_four_rooms_sensitivity_curves_for_many_lambdas': + sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0] + fig_size = (10, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + # algs = ['TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_min_performance = False + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final, + plot_min_performance=plot_min_performance) +if func_to_run == 'hv_four_rooms_emphatics_sensitivity_full_bootstrap': + sp_list = [0.0] + # fig_size = (11, 5) + fig_size = (10, 4) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap_all_eta': + sp_list = [0.0] + fig_size = (10, 6) + algs = ['GTD', 'GTD2', 'PGTD2', 'HTD'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_TDRC_all_eta_one_beta': + sp_list = [0.0] + tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot. 
+ fig_size = (10, 6) + algs = ['TDRC'] + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final, tdrc_beta=tdrc_beta) +if func_to_run == 'hv_four_rooms_sensitivity_curves_full_bootstrapping' or func_to_run == 'collision_waterfall_full_bootstrap': + sp_list = [0.0] + fig_size = (10, 4) + alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'], + 'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'], + 'emphatics': ['ETD', 'ETDLB'], + 'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'], + 'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']} + auc_or_final = ['auc'] + if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping': + plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) + elif func_to_run == 'collision_waterfall_full_bootstrap': + plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_emphatics_sensitivity_minimal_bootstrap': + sp_list = [0.9] + fig_size = (6, 4) + auc_or_final = ['auc'] + plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final) +if func_to_run == 'hv_four_rooms_sensitivity_curves_for_two_lambdas': + sp_list = [0.0, 0.9] + fig_size = (6, 4) + algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD'] + auc_or_final = ['auc'] + plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, + auc_or_final=auc_or_final) +# endregion + +# endregion + + +# region Misc +if func_to_run == 'plot_value_functions': + plot_value_functions() +if func_to_run == 'plot_all_final_value_functions': + plot_all_final_value_functions() +if func_to_run == 'state_dist': + fig_size = (6, 4) + tasks = ['EightStateCollision', 'LearnEightPoliciesTileCodingFeat', + 'HighVarianceLearnEightPoliciesTileCodingFeat'] + for task in tasks: + plot_distribution(task=task, fig_size=fig_size) +if func_to_run == 'high_variance_and_normal_dist_comparison': + fig_size = (22, 4) + plot_dist_for_two_four_room_tasks(fig_size=fig_size) +# endregion + + +# from Plotting.process_state_value_function import plot_value_functions, plot_all_final_value_functions +# from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat +# from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat +# For building d_mu +# obj = HighVarianceLearnEightPoliciesTileCodingFeat() +# d_mu = (obj.generate_behavior_dist(20_000_000)) +# numpy.save(os.path.join(os.getcwd(), 'Resources', 'HighVarianceLearnEightPoliciesTileCodingFeat', 'd_mu.npy'), d_mu) diff --git b/process_data.py a/process_data.py new file mode 100644 index 0000000..02845a0 --- /dev/null +++ a/process_data.py @@ -0,0 +1,101 @@ +import json +import os + +import numpy as np + +from Learning import learn +from Plotting.plot_params import EXP_ATTRS +from Plotting.plot_utils import make_params, make_current_params, load_and_replace_large_nan_inf, \ + load_best_perf_json, load_best_rerun_params, make_res_path +from utils import create_name_for_save_load, Configuration + + +def save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=False): + fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp) + res_path = make_res_path(alg, exp) + mean_over_alpha, stderr_over_alpha = 
np.zeros(len(fp_list)), np.zeros(len(fp_list)) + best_fp, best_tp, best_fop = load_best_rerun_params(alg, exp, auc_or_final, sp) if rerun else (0, 0, 0) + for tp in tp_list: + for fop in fop_list: + current_params = make_current_params(alg, sp, tp, fop) + for i, fp in enumerate(fp_list): + current_params['alpha'] = fp + load_name = os.path.join(res_path, create_name_for_save_load(current_params)) + perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}.npy") + if rerun and fp == best_fp and tp == best_tp and fop == best_fop: + perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}_rerun.npy") + + mean_over_alpha[i], stderr_over_alpha[i] = perf[0], perf[1] + + save_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=['alpha'])) + postfix = '' + if rerun and tp == best_tp and fop == best_fop: + postfix = '_rerun' + np.save(f"{save_name}_mean_{auc_or_final}_over_alpha{postfix}", mean_over_alpha) + np.save(f"{save_name}_stderr_{auc_or_final}_over_alpha{postfix}", stderr_over_alpha) + + +def find_best_perf(alg, exp, auc_or_final, sp): + exp_attrs = EXP_ATTRS[exp](exp) + fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp) + best_params = {} + best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf + for fop in fop_list: + for tp in tp_list: + current_params = make_current_params(alg, sp, tp, fop) + load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[ + 'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy') + current_perf = load_and_replace_large_nan_inf( + load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement) + min_perf = min(current_perf) + if min_perf < best_perf: + best_perf = min_perf + best_perf_idx = int(np.nanargmin(current_perf)) + best_fp = fp_list[best_perf_idx] + best_params = current_params + best_params['alpha'] = best_fp + return best_params + + +def save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp): + fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp) + exp_path = res_path.replace('Results', 'Experiments') + json_exp = os.path.join(exp_path, f"{alg}.json") + with open(json_exp, 'r') as f: + json_exp = json.load(f) + json_exp['meta_parameters'] = best_params + save_name = os.path.join(res_path, f"{auc_or_final}_{sp}.json") + with open(save_name, 'wt') as f: + json.dump(json_exp, f, indent=4) + + +def run_learning_with_best_perf(alg, exp, auc_or_final, sp): + res_path = os.path.join(os.getcwd(), 'Results', exp, alg) + best_perf_jsn = load_best_perf_json(alg, exp, sp, auc_or_final) + param_dict = best_perf_jsn['meta_parameters'] + param_dict['algorithm'] = alg + param_dict['task'] = best_perf_jsn['task'] + param_dict['environment'] = best_perf_jsn['environment'] + param_dict['num_steps'] = best_perf_jsn['number_of_steps'] + param_dict['num_of_runs'] = best_perf_jsn['number_of_runs'] + param_dict['sub_sample'] = best_perf_jsn['sub_sample'] + param_dict['save_path'] = res_path + param_dict['save_value_function'] = False + param_dict['rerun'] = True + param_dict['render'] = False + config = Configuration(param_dict) + learn(config) + + +def process_data(**kwargs): + for exp in kwargs['exps']: + for alg in kwargs['algs']: + for auc_or_final in kwargs['auc_or_final']: + for sp in kwargs['sp_list']: + print(f"\nStarted re-running {exp}, {alg} lmbda_or_zeta: {sp}, {auc_or_final} ...") + save_perf_over_alpha(alg, exp, auc_or_final, sp) + best_params = find_best_perf(alg, exp, auc_or_final, sp) + 
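+ # persist the best meta-parameters to json, re-run learning with them, then aggregate the rerun results over alpha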
save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp) + run_learning_with_best_perf(alg, exp, auc_or_final, sp) + save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=True) + print(f"Finished re-running {exp}, {alg} {best_params}") diff --git b/requirements.txt a/requirements.txt new file mode 100644 index 0000000..596d5fc --- /dev/null +++ a/requirements.txt @@ -0,0 +1,5 @@ +#matplotlib>=3.2.2 +#numpy>=1.19.0 +imageio>=2.9.0 +pyglet>=1.5.11 +scikit_image>=0.17.2 diff --git b/test.py a/test.py new file mode 100644 index 0000000..e364d3e --- /dev/null +++ a/test.py @@ -0,0 +1,37 @@ +import time + +import utils +from Environments.Chain import Chain +from Environments.FourRoomGridWorld import FourRoomGridWorld +from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat +import pyglet +from skimage.transform import resize +import numpy as np + +from data_presister import DataPersister, find_best_performance + +# if __name__ == "__main__": +# render_mode = 'human' +# render_mode = 'rgb' +# render_mode = 'screen' +# +# frames = [] +# env = FourRoomGridWorld() +# # env = Chain() +# env.reset() +# actions = [2, 2, 0, 0, 0, 3, 3, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2, +# 2, 1, 1, 1, 1, 1, 1, 1 +# , 2, 2, 2, 3, 1, 1, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3] +# actions = actions * 1 +# for step in range(len(actions)): +# a = actions[step] +# next_state, r, is_terminal, info = env.step(a) +# state = next_state +# frames.append(env.render(mode=render_mode)) +# if is_terminal: +# env.reset() +# utils.generate_gif(frames, 'Assets/FourRoomGridWorld.gif', size=(180, 180, 3), duration=1 / 20) + +# DataPersister.save_best_pref_over_first_param(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc") + +find_best_performance(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc", second_param=0.2) diff --git b/unittest_suite.py a/unittest_suite.py new file mode 100644 index 0000000..0fe9355 --- /dev/null +++ a/unittest_suite.py @@ -0,0 +1,11 @@ +import unittest +from Tests.Algorithms.TestTD import TestTD +from Tests.Environments.TestChain import TestChain +from Tests.Tasks.TestEightStateCollision import TestEightStateCollision + +test_suite = unittest.TestSuite() +test_suite.addTest(unittest.makeSuite(TestChain)) +test_suite.addTest(unittest.makeSuite(TestEightStateCollision)) +test_suite.addTest(unittest.makeSuite(TestTD)) +runner = unittest.TextTestRunner() +runner.run(test_suite) diff --git b/utils.py a/utils.py new file mode 100644 index 0000000..8f2f288 --- /dev/null +++ a/utils.py @@ -0,0 +1,83 @@ +import numpy as np +import os + + +def get_save_value_function_steps(num_steps): + return [int(num_steps * i) - 1 for i in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]] + + +def save_value_function(value_function, save_path, step, run): + save_dir = os.path.join(save_path, 'Sample_value_function') + res_path = os.path.join(save_dir, f"{step}_{run}") + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + np.save(res_path, value_function) + + +class Configuration(dict): + def __str__(self): + return f"{self.environment} {self.task} {self.algorithm}" + + def __getattr__(self, item): + return self[item] + + +def find_all_experiment_configuration(experiments_path: str, ext='.json'): + if experiments_path.endswith(ext): + yield experiments_path + for root, _, files in os.walk(experiments_path): + for file in files: + if file.endswith(ext): + yield os.path.join(root, file) + + +class ImmutableDict(dict): + def 
immutable(self): + raise TypeError("%r objects are immutable" % self.__class__.__name__) + + def __setitem__(self, key, value): + self.immutable() + + def __delitem__(self, key): + self.immutable() + + def setdefault(self, k, default=None): + self.immutable() + + def update(self, __m, **kwargs): + self.immutable() + + def clear(self) -> None: + self.immutable() + + +def create_name_for_save_load(param_dict, excluded_params=None): + if excluded_params is None: + excluded_params = [] + final_str = '' + for k, v in param_dict.items(): + if k in excluded_params: + continue + if k == 'alpha' or k == 'eta': + split_str = str.split(f'{v:.10f}', '.') + else: + split_str = str.split(f'{v:.5f}', '.') + final_str += '_' + k + split_str[0] + split_str[1] + return final_str + + +def save_result(path, name, result_array, params, rerun): + name_to_save = create_name_for_save_load(param_dict=params) + path_and_name = os.path.join(path, name_to_save) + final_name = f"{path_and_name}{name}" + if rerun: + final_name = f"{final_name}_rerun" + np.save(final_name, result_array) + + +def generate_gif(frames, path, size=(180, 180, 3), duration=1 / 20): + import imageio + from skimage.transform import resize + for idx, frame_idx in enumerate(frames): + frames[idx] = resize(frame_idx, size, preserve_range=True, order=0).astype(np.uint8) + imageio.mimsave(path, frames, duration=duration)
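Note on the result-file naming convention: create_name_for_save_load in utils.py (mirrored by DataPersister.create_file_name in data_presister.py) builds the suffix that process_data.py later uses when loading and saving the per-alpha result arrays. A minimal sketch of the convention follows; the parameter values are arbitrary examples and not taken from the diff above.

    from utils import create_name_for_save_load

    # 'alpha' and 'eta' are formatted with 10 decimal places, every other
    # meta-parameter with 5, and the decimal point is dropped.
    suffix = create_name_for_save_load({'alpha': 0.03125, 'lmbda': 0.9, 'eta': 16.0})
    print(suffix)  # -> _alpha00312500000_lmbda090000_eta160000000000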