Commit 3d352cbd by GongYu

TDAlgorithms_IEEE24

# Created by https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### NotepadPP ###
# Notepad++ backups #
*.bak
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
### SublimeText ###
# Cache files for Sublime Text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache
# Workspace files are user-specific
*.sublime-workspace
# Project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using Sublime Text
# *.sublime-project
# SFTP configuration file
sftp-config.json
# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
Package Control.merged-ca-bundle
Package Control.user-ca-bundle
oscrypto-ca-bundle.crt
bh_unicode_properties.cache
# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings
### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*[.json, .xml, .info]
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,pycharm,sublimetext,vim,visualstudio,notepadpp
/.idea
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
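# ABTD(zeta): action-dependent bootstrapping TD for off-policy prediction (cf. ABQ, Mahmood, Yu & Sutton, 2017).
# The trace is decayed by nu = min(xi, 1 / max(pi, mu)) instead of an importance-sampling ratio, with xi
# interpolated between the task's ABTD_xi_zero and ABTD_xi_max by the meta-parameter zeta.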
class ABTD(BaseVariableLmbda):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
zeta = kwargs.get('zeta')
self.old_nu = 0
if self.task.num_policies > 1:
self.old_nu = np.zeros(self.task.num_policies)
xi_zero = self.task.ABTD_xi_zero
xi_max = self.task.ABTD_xi_max
self.xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
@staticmethod
def related_parameters():
return ['alpha', 'zeta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, rho, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
nu = min(self.xi, 1.0 / max(pi, mu))
self.z = x + self.gamma * self.old_nu * self.old_pi * self.z
self.w += alpha * delta * self.z
self.old_nu = nu
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
nu = self.compute_nu_for_multiple_policies(pi, mu)
self.z = (self.gamma_vec_t * self.old_nu * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_nu = nu
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
def compute_nu_for_multiple_policies(self, pi, mu):
xi_vec = np.ones(self.task.num_policies) * self.xi
max_vec = 1.0 / np.maximum.reduce([pi, mu])
return np.minimum.reduce([max_vec, xi_vec])
def reset(self):
super().reset()
self.old_nu = 0
import numpy as np
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
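# Shared base for the gradient-TD family (GTD, GTD2, PGTD2, HTD, TDRC): adds the secondary weight vector v
# and a second step size computed as eta * alpha.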
class BaseGradient(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.v = np.zeros(self.task.num_features)
self.eta = kwargs.get('eta')
if self.task.num_policies > 1:
self.v = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta']
def compute_second_step_size(self):
return self.eta * self.compute_step_size()
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super(BaseGradient, self).learn_multiple_policies(
s, s_p, r, is_terminal)
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x, self.task.stacked_feature_rep[:, :, s_p], \
self.compute_second_step_size() * self.gamma_vec_t / self.gamma
import numpy as np
from numpy.linalg import pinv
from Tasks.BaseTask import BaseTask
from Algorithms.BaseTD import BaseTD
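# Shared base for the least-squares methods (LSTD, LSETD): maintains running averages of the matrix A and
# vector b and re-solves w = pinv(A) b after every transition.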
class BaseLS(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super(BaseLS, self).__init__(task, **kwargs)
self.A = np.zeros((self.task.num_features, self.task.num_features))
self.b = np.zeros(self.task.num_features)
self.t = 0
if self.task.num_policies > 1:
self.A = np.zeros((self.task.num_policies, self.task.num_features, self.task.num_features))
self.b = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma_vec_t = np.concatenate((np.ones(2), np.zeros(6))) * self.gamma
self.t = np.zeros(self.task.num_policies)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
self.t += 1
self.A += (np.outer(self.z, (x - self.gamma * x_p)) - self.A) / self.t
self.b += (r * self.z - self.b) / self.t
self.w = np.dot(pinv(self.A), self.b)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
_, _, x, x_p, _, _, _, stacked_x = \
super(BaseLS, self).learn_multiple_policies(s, s_p, r, is_terminal)
for i in range(self.task.num_policies):
if self.gamma_vec_t[i] != 0.0:
self.t[i] += 1
z = self.z[i, :]
self.A[i, :, :] += (np.outer(z, (x - self.gamma_vec_tp[i] * x_p)) - self.A[i, :, :]) / self.t[i]
self.b[i, :] += (self.r_vec[i] * z - self.b[i, :]) / self.t[i]
self.w[i, :] = np.dot(pinv(self.A[i, :, :]), self.b[i, :])
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
from Tasks.BaseTask import BaseTask
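# Common machinery for all agents: weights, eligibility traces, step sizes, feature lookup, importance-sampling
# ratios, and RMSVE computed against the precomputed state values under the behavior distribution d_mu.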
class BaseTD:
def __init__(self, task: BaseTask, **kwargs):
self.task = task
self.w = np.zeros(self.task.num_features)
self.z = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.w = np.zeros((self.task.num_policies, self.task.num_features))
self.z = np.zeros((self.task.num_policies, self.task.num_features))
self.gamma = self.task.GAMMA
self.alpha = kwargs['alpha']
self.lmbda = kwargs.get('lmbda')
self.state_values = self.task.load_state_values() # This is of size num_policies * 121
self.d_mu = self.task.load_behavior_dist() # same size as state_values
self.state, self.next_state, self.action = None, None, None
self.r_vec = np.zeros(self.task.num_policies)
self.gamma_vec_tp = np.zeros(self.task.num_policies)
self.gamma_vec_t = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
def compute_value_function(self):
return np.dot(self.w, self.task.feature_rep.T)
def compute_rmsve(self):
error = self.compute_value_function() - self.state_values
error_squared = error * error
return np.sqrt(np.sum(self.d_mu * error_squared.T, 0) / np.sum(self.d_mu, 0)), error
def compute_step_size(self):
return self.alpha
def choose_behavior_action(self):
return self.task.select_behavior_action(self.state)
def choose_target_action(self):
return self.task.select_target_action(self.state)
def learn(self, s, s_p, r, is_terminal):
if self.task.num_policies == 1:
self.learn_single_policy(s, s_p, r, is_terminal)
else:
self.learn_multiple_policies(s, s_p, r, is_terminal)
def get_features(self, s, s_p, is_terminal):
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
x = self.task.get_state_feature_rep(s)
return x, x_p
def get_isr(self, s):
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
return rho
def get_delta(self, r, x, x_p):
return r + self.gamma * np.dot(self.w, x_p) - np.dot(self.w, x)
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
alpha = self.compute_step_size()
delta = self.get_delta(r, x, x_p)
self.z = rho * (self.gamma * self.lmbda * self.z + x)
return delta, alpha, x, x_p, rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
active_policies_vec = self.task.get_active_policies(s)
self.r_vec = np.zeros(self.task.num_policies)
if r > 0:
terminal_policies_vec = self.task.get_terminal_policies(s_p)
self.r_vec = r * terminal_policies_vec
alpha_vec = active_policies_vec * self.compute_step_size()
x = self.task.get_state_feature_rep(s)
x_p = np.zeros(self.task.num_features)
if not is_terminal:
x_p = self.task.get_state_feature_rep(s_p)
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
self.gamma_vec_tp = self.task.get_active_policies(s_p) * self.gamma
delta = self.r_vec + self.gamma_vec_tp * np.dot(self.w, x_p) - np.dot(self.w, x)
stacked_x = self.task.stacked_feature_rep[:, :, s]
return delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x
def reset(self):
self.z = np.zeros(self.task.num_features)
def __str__(self):
return f'agent:{type(self).__name__}'
from Algorithms.BaseTD import BaseTD
from Tasks.BaseTask import BaseTask
import numpy as np
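# Base for algorithms whose bootstrapping varies with the action probabilities (ABTD, TB, Vtrace);
# it carries the previous step's pi, mu, and rho across transitions.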
class BaseVariableLmbda(BaseTD):
def __init__(self, task: BaseTask, **kwargs):
super().__init__(task, **kwargs)
self.old_pi, self.old_mu = 0, 1
if self.task.num_policies > 1:
self.old_pi, self.old_mu = np.zeros(self.task.num_policies), np.ones(self.task.num_policies)
self.old_rho = self.old_pi / self.old_mu
def learn_single_policy(self, s, s_p, r, is_terminal):
alpha = self.compute_step_size()
pi = self.task.get_pi(s, self.action)
mu = self.task.get_mu(s, self.action)
rho = pi / mu
x, x_p = self.get_features(s, s_p, is_terminal)
delta = rho * self.get_delta(r, x, x_p)
return delta, alpha, x, x_p, rho, pi, mu
def reset(self):
self.old_pi, self.old_mu = 0, 1
self.old_rho = self.old_pi / self.old_mu
from Algorithms.ETDLB import ETDLB
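# Emphatic TD(lambda): the special case of ETD(lambda, beta) with beta fixed to gamma.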
class ETD(ETDLB):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
@staticmethod
def related_parameters():
return ['alpha', 'lmbda']
from Algorithms.BaseTD import BaseTD
import numpy as np
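# ETD(lambda, beta) (cf. Hallak et al., 2016): maintains the followon trace F and the emphasis
# m = lambda + (1 - lambda) * F, and scales the emphasized trace by the importance-sampling ratio.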
class ETDLB(BaseTD):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.F = 1
self.old_rho = 0
self.beta = kwargs.get('beta')
if self.task.num_policies > 1:
self.F = np.zeros(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
delta = self.get_delta(r, x, x_p)
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda * 1 + (1 - self.lmbda) * self.F
rho = self.get_isr(s)
self.z = rho * (x * m + self.gamma * self.lmbda * self.z)
self.w += self.compute_step_size() * delta * self.z
self.old_rho = rho
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, _ = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x = self.task.stacked_feature_rep[:, :, s]
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
self.w += (alpha_vec * delta)[:, None] * self.z
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseTD import BaseTD
import numpy as np
class GEMETD(BaseTD):
"""
An ETD(0) implementation that uses GEM (aka GTD2(0) with x and x_p switched) to estimate emphasis.
"""
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.beta = self.task.GAMMA
self.gem_alpha = kwargs['gem_alpha'] # Step size for GEM weights.
self.gem_beta = kwargs['gem_beta'] # Regularization parameter for GEM; not needed for a fixed target policy.
self.k = np.zeros(self.task.num_features) # Auxiliary weights for GEM.
self.u = np.zeros(self.task.num_features) # Main weights for GEM.
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
@staticmethod
def related_parameters():
return ['alpha', 'gem_alpha', 'gem_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
x, x_p = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
delta_bar = 1 + rho * self.gamma * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += self.gem_alpha * (delta_bar - np.dot(self.k, x_p)) * x_p
self.u += self.gem_alpha * ((x_p - self.gamma * rho * x) * np.dot(self.k, x_p) - self.gem_beta * self.u)
delta = self.get_delta(r, x, x_p)
m = np.dot(self.u, x) # Use parametric estimate of expected emphasis.
self.w += self.alpha * m * rho * delta * x
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
stacked_x_p = self.task.stacked_feature_rep[:, :, s_p]
# GEM update:
gem_alpha_vec = self.task.get_active_policies(s) * self.gem_alpha
delta_bar = np.ones(self.task.num_policies) + rho * self.gamma_vec_t * np.dot(self.u, x) - np.dot(self.u, x_p)
self.k += gem_alpha_vec[:, None] * (delta_bar[:, None] - np.sum(x_p * self.k, 1)[:, None]) * stacked_x_p
self.u += gem_alpha_vec[:, None] * ((stacked_x_p - self.gamma_vec_t[:, None] * rho[:, None] * stacked_x) * np.sum(x_p * self.k, 1)[:, None] - self.gem_beta * self.u) # should self.gem_beta be a vector here?
# ETD(0) update:
m = np.dot(self.u, x)
self.w += (alpha_vec * m * rho * delta)[:, None] * stacked_x
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.k = np.zeros(self.task.num_features)
self.u = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.k = np.zeros((self.task.num_policies, self.task.num_features))
self.u = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseGradient import BaseGradient
import numpy as np
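# GTD(lambda), also known as TDC(lambda) (cf. Sutton et al., 2009; Maei, 2011): the TD update on w with a
# gradient-correction term, plus a secondary update on v.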
# noinspection DuplicatedCode
class GTD(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
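# GTD2(lambda) (cf. Sutton et al., 2009): like GTD, but the main update uses np.dot(x, v) * x in place of delta * z.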
class GTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (np.dot(x, self.v) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x)
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
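# HTD(lambda), a hybrid of TD and GTD: keeps an additional on-policy trace z_b (no importance-sampling ratio)
# and uses the difference between the two traces in the corrections to w and v.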
class HTD(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.z_b = self.gamma * self.lmbda * self.z_b + x
self.w += alpha * ((delta * self.z) + (x - self.gamma * x_p) * np.dot((self.z - self.z_b), self.v))
self.v += alpha_v * ((delta * self.z) - (x - self.gamma * x_p) * np.dot(self.v, self.z_b))
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.z_b = self.lmbda * self.z_b * self.gamma_vec_t[:, None] + stacked_x
gamma_stacked_xp = self.gamma_vec_tp[:, None] * stacked_x_p
delta_z = delta[:, None] * self.z
self.w += alpha_vec[:, None] * (
delta_z + (stacked_x - gamma_stacked_xp) * (np.sum((self.z - self.z_b) * self.v, 1))[:, None])
self.v += alphav_vec[:, None] * (
delta_z - (stacked_x - gamma_stacked_xp) * np.sum(self.v * self.z_b, 1)[:, None])
# TODO: Should the last v be replaced by w?
self.gamma_vec_t = self.gamma_vec_tp
def reset(self):
super().reset()
self.z_b = np.zeros(self.task.num_features)
if self.task.num_policies > 1:
self.z_b = np.zeros((self.task.num_policies, self.task.num_features))
from Algorithms.BaseLS import BaseLS
import numpy as np
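# Least-squares emphatic TD(lambda, beta): forms the emphatic trace (followon F, emphasis m) and then reuses
# the incremental least-squares solve from BaseLS.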
class LSETD(BaseLS):
def __init__(self, task, **kwargs):
super(LSETD, self).__init__(task, **kwargs)
self.old_rho = 0
self.F = 1
self.beta = kwargs['beta']
if self.task.num_policies > 1:
self.F = np.ones(self.task.num_policies)
self.old_rho = np.zeros(self.task.num_policies)
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
self.F = self.beta * self.old_rho * self.F + 1
m = self.lmbda + (1 - self.lmbda) * self.F
x, _ = self.get_features(s, s_p, is_terminal)
rho = self.get_isr(s)
self.z = rho * (self.gamma * self.lmbda * self.z + x * m)
super(LSETD, self).learn_single_policy(s, s_p, r, is_terminal)
self.old_rho = rho
# noinspection DuplicatedCode
def learn_multiple_policies(self, s, s_p, r, is_terminal):
beta_vec = self.beta * self.gamma_vec_t / self.gamma
self.F = beta_vec * self.old_rho * self.F + np.ones(self.task.num_policies)
m = self.lmbda * np.ones(self.task.num_policies) + (1 - self.lmbda) * self.F
stacked_x = self.task.stacked_feature_rep[:, :, s]
rho = self.get_isr(s)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x * m[:, None])
super(LSETD, self).learn_multiple_policies(s, s_p, r, is_terminal)
self.old_rho = rho
def reset(self):
super().reset()
self.F = 1
self.old_rho = 0
if self.task.num_policies > 1:
self.old_rho = np.zeros(self.task.num_policies)
self.F = np.zeros(self.task.num_policies)
from Algorithms.BaseLS import BaseLS
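# Off-policy LSTD(lambda): accumulates the importance-sampling-weighted trace and solves for w with BaseLS.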
class LSTD(BaseLS):
def learn_single_policy(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s) * (self.gamma * self.lmbda * self.z + x)
super(LSTD, self).learn_single_policy(s, s_p, r, is_terminal)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
x, _ = self.get_features(s, s_p, is_terminal)
self.z = self.get_isr(s)[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + x)
super(LSTD, self).learn_multiple_policies(s, s_p, r, is_terminal)
from Algorithms.BaseGradient import BaseGradient
import numpy as np
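# PGTD2: a two-step, extragradient-style variant of GTD2 that first forms intermediate weights w_mid and v_mid
# and then applies the final update using them (cf. proximal gradient-TD methods).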
class PGTD2(BaseGradient):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
v_mid = self.v + alpha_v * (delta * self.z - np.dot(x, self.v) * x)
w_mid = self.w + alpha * (np.dot(x, self.v) * x - (1 - self.lmbda) * self.gamma * np.dot(self.z, self.v) * x_p)
delta_mid = r + self.gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
self.w += alpha * (np.dot(x, v_mid) * x - self.gamma * (1 - self.lmbda) * np.dot(self.z, v_mid) * x_p)
self.v += alpha_v * (delta_mid * self.z - np.dot(x, v_mid) * x)
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
v_mid = self.v + alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(x * self.v, 1)[:, None] * stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
w_mid = self.w + alpha_vec[:, None] * (
np.sum(x * self.v, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
delta_mid = self.r_vec + self.gamma_vec_tp * np.dot(w_mid, x_p) - np.dot(w_mid, x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * v_mid, 1)
self.w += alpha_vec[:, None] * (
np.sum(x * v_mid, 1)[:, None] * stacked_x - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta_mid[:, None] * self.z - np.sum(x * v_mid, 1)[:, None] * stacked_x)
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
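# TB(lambda), Tree Backup (cf. Precup, Sutton & Singh, 2000): the trace is decayed by the target probability of
# the previous action rather than by an importance-sampling ratio.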
class TB(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, _ = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = self.gamma * self.lmbda * self.old_pi * self.z + x
self.w = self.w + alpha * delta * self.z
self.old_pi = pi
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
self.z = (self.gamma_vec_t * self.lmbda * self.old_pi)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_pi = pi
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseTD import BaseTD
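# Off-policy TD(lambda) with importance-sampling-weighted eligibility traces (the per-step quantities come from BaseTD).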
class TD(BaseTD):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, *_ = super().learn_single_policy(s, s_p, r, is_terminal)
self.w += alpha * delta * self.z
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, *_, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
self.w += (alpha_vec * delta)[:, None] * self.z
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseGradient import BaseGradient
import numpy as np
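# TDRC(lambda), TD with regularized corrections (cf. Ghiassian et al., 2020): the GTD/TDC update with an extra
# L2 penalty of strength tdrc_beta on the secondary weights v.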
# noinspection DuplicatedCode
class TDRC(BaseGradient):
def __init__(self, task, **kwargs):
super().__init__(task, **kwargs)
self.tdrc_beta = kwargs['tdrc_beta']
@staticmethod
def related_parameters():
return ['alpha', 'lmbda', 'eta', 'tdrc_beta']
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, x_p, _ = super().learn_single_policy(s, s_p, r, is_terminal)
alpha_v = self.compute_second_step_size()
self.w += alpha * (delta * self.z - self.gamma * (1 - self.lmbda) * np.dot(self.z, self.v) * x_p)
self.v += alpha_v * (delta * self.z - np.dot(x, self.v) * x) - alpha_v * self.tdrc_beta * self.v
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, *_, rho, stacked_x, stacked_x_p, alphav_vec = super().learn_multiple_policies(
s, s_p, r, is_terminal)
self.z = rho[:, None] * (self.lmbda * self.z * self.gamma_vec_t[:, None] + stacked_x)
phi_prime_multiplier = (1 - self.lmbda) * self.gamma_vec_tp * np.sum(self.z * self.v, 1)
self.w += alpha_vec[:, None] * (delta[:, None] * self.z - phi_prime_multiplier[:, None] * stacked_x_p)
self.v += alphav_vec[:, None] * (delta[:, None] * self.z - np.sum(
x * self.v, 1)[:, None] * stacked_x) - (alphav_vec * self.tdrc_beta)[:, None] * self.v
self.gamma_vec_t = self.gamma_vec_tp
from Algorithms.BaseVariableLmbda import BaseVariableLmbda
import numpy as np
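# Vtrace (cf. Espeholt et al., 2018): like off-policy TD(lambda), but the trace is decayed by the previous
# importance-sampling ratio clipped at 1.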
class Vtrace(BaseVariableLmbda):
def learn_single_policy(self, s, s_p, r, is_terminal):
delta, alpha, x, *_, pi, mu = super().learn_single_policy(s, s_p, r, is_terminal)
self.z = min(self.old_rho, 1) * self.gamma * self.lmbda * self.z + x
self.w += alpha * delta * self.z
self.old_rho = pi / mu
def learn_multiple_policies(self, s, s_p, r, is_terminal):
delta, alpha_vec, x, x_p, pi, mu, rho, stacked_x = super().learn_multiple_policies(s, s_p, r, is_terminal)
delta = rho * delta
truncated_old_rho = np.minimum(self.old_rho, np.ones(self.task.num_policies))
self.z = (truncated_old_rho * self.gamma_vec_t * self.lmbda)[:, None] * self.z + stacked_x
self.w += alpha_vec[:, None] * (delta[:, None] * self.z)
self.old_rho = rho
self.gamma_vec_t = self.gamma_vec_tp
import numpy as np
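# A small corridor environment: the agent starts uniformly in one of the first start_state_number states,
# RIGHT moves one state toward the terminal state (reward 1 on arrival), and RETREAT ends the episode with reward 0.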
class Chain:
def __init__(self, states_number: int = 8, start_state_number: int = 4, **kwargs):
assert start_state_number < states_number, "start state number must be less than the number of states"
self._states_number = states_number
self._start_state_number = start_state_number
self._terminal = self._states_number
self._state = None
self.RIGHT_ACTION = 0
self.RETREAT_ACTION = 1
self.num_states = states_number
self._window = None
def reset(self):
self._state = np.random.randint(0, self._start_state_number)
return self._state
def step(self, action):
if action == self.RETREAT_ACTION:
return self._terminal, 0, True, {}
next_state = self._state + 1
if next_state == self._terminal:
return self._terminal, 1, True, {}
self._state = next_state
return self._state, 0, False, {}
def render(self, mode='human'):
if mode == 'human':
import sys
from Environments.utils import colorize
corridor_map = [
str(i) if i > self._start_state_number
else colorize(str(i), "blue", highlight=False)
for i in range(self._states_number)
]
corridor_map.append(colorize("T", "red", highlight=False))
corridor_map[self._state] = colorize(corridor_map[self._state], "green", highlight=True)
sys.stdout.write(f'{"|".join(corridor_map)}\n')
if mode == "rgb" or mode == "screen":
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
img = np.zeros((self.num_states, 1, 3), dtype=np.uint8)
img[:, 0] = RGB_COLORS['grey']
img[:self._start_state_number - 1, 0] = RGB_COLORS['yellow']
img[self._terminal - 1, 0] = RGB_COLORS['black']
img[self._state - 1, 0] = RGB_COLORS['green']
img = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 50
if self._window is None:
self._window = Window(self.num_states * zoom, 1 * zoom)
dt = np.kron(img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(img, axis=0)
if __name__ == '__main__':
env = Chain()
env.reset()
for step in range(1, 1000):
action = np.random.randint(0, 2)
sp, r, terminal, _ = env.step(action=action)
env.render(mode="screen")
if terminal:
env.reset()
print('env reset')
import numpy as np
# from Environments.rendering import Render
# from gym import utils
# import gym
# import sys
BLOCK_NORMAL, BLOCK_WALL, BLOCK_HALLWAY, BLOCK_AGENT = 0, 1, 2, 3
RGB_COLORS = {
'red': np.array([240, 52, 52]),
'black': np.array([0, 0, 0]),
'green': np.array([77, 181, 33]),
'blue': np.array([29, 111, 219]),
'purple': np.array([112, 39, 195]),
'yellow': np.array([217, 213, 104]),
'grey': np.array([192, 195, 196]),
'light_grey': np.array([230, 230, 230]),
'white': np.array([255, 255, 255])
}
four_room_map = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
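# The classic four-room gridworld (cf. Sutton, Precup & Singh, 1999), built from four_room_map above
# (0: open cell, 1: wall, 2: hallway). Entering a hallway cell yields reward 1; the environment itself
# never signals termination.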
class FourRoomGridWorld:
def __init__(self, stochasticity_fraction=0.0):
self._grid = np.transpose(np.flip(np.array(four_room_map, dtype=np.uint8), axis=0)[1:-1, 1:-1])
self._max_row, self._max_col = self._grid.shape
self._normal_tiles = np.where(self._grid == BLOCK_NORMAL)
self._hallways_tiles = np.where(self._grid == BLOCK_HALLWAY)
self._walls_tiles = np.where(self._grid == BLOCK_WALL)
self.num_states = self._grid.size
self._state = None
self.ACTION_UP, self.ACTION_DOWN, self.ACTION_RIGHT, self.ACTION_LEFT = 0, 1, 2, 3
self.num_actions = 4
self._stochasticity_fraction = stochasticity_fraction
self.hallways = {
0: (5, 1),
1: (1, 5),
2: (5, 8),
3: (8, 4)
}
self._window, self._info = None, None
def reset(self):
self._state = (0, 0)
return self.get_state_index(*self._state)
def step(self, action):
x, y = self._state
is_stochastic_selected = False
# if self._stochasticity_fraction >= np.random.uniform():
# action_probability = [1 / (self.num_actions - 1) if i != action else 0 for i in range(self.num_actions)]
# action = np.random.choice(self.num_actions, 1, p=action_probability)[0]
# is_stochastic_selected = True
x_p, y_p = self._next(action, *self._state)
is_done = self._grid[x_p, y_p] == BLOCK_HALLWAY
reward = 1 if is_done else 0
self._state = (x_p, y_p)
return self.get_state_index(*self._state), reward, False, {
'x': x, 'y': y,
'x_p': x_p, 'y_p': y_p,
'is_stochastic_selected': is_stochastic_selected,
'selected_action': action}
def get_xy(self, state):
return (state % self._max_row), (state // self._max_col)
def get_state_index(self, x, y):
return y * self._max_col + x
def _next(self, action, x, y):
def move(current_x, current_y, next_x, next_y):
if next_y < 0 or next_x < 0:
return current_x, current_y
if next_y >= self._max_col or next_x >= self._max_row:
return current_x, current_y
if self._grid[next_x, next_y] == BLOCK_WALL:
return current_x, current_y
return next_x, next_y
switcher = {
self.ACTION_DOWN: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y - 1),
self.ACTION_RIGHT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x + 1, pos_y),
self.ACTION_UP: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x, pos_y + 1),
self.ACTION_LEFT: lambda pos_x, pos_y: move(pos_x, pos_y, pos_x - 1, pos_y),
}
move_func = switcher.get(action)
return move_func(x, y)
def render(self, mode='human'):
import sys
from Environments.utils import colorize
color = {
BLOCK_NORMAL: lambda c: colorize(c, "white", highlight=True),
BLOCK_WALL: lambda c: colorize(c, "gray", highlight=True),
BLOCK_HALLWAY: lambda c: colorize(c, "green", highlight=True),
}
if mode == 'human':
outfile = sys.stdout
img = [
[color[b](' ')
for x, b
in enumerate(line)]
for y, line in enumerate(four_room_map)]
img[self._max_row - self._state[1]][self._state[0] + 1] = colorize(' ', "red",
highlight=True)
for line in img:
outfile.write(f'{"".join(line)}\n')
outfile.write('\n')
if mode == "rgb" or mode == "screen":
x, y = self._state
img = np.zeros((*self._grid.shape, 3), dtype=np.uint8)
img[self._normal_tiles] = RGB_COLORS['light_grey']
# if render_cls is not None:
# assert render_cls is not type(Render), "render_cls should be Render class"
# img = render_cls.render(img)
img[self._walls_tiles] = RGB_COLORS['black']
img[self._hallways_tiles] = RGB_COLORS['green']
img[x, y] = RGB_COLORS['red']
ext_img = np.zeros((self._max_row + 2, self._max_col + 2, 3), dtype=np.uint8)
ext_img[1:-1, 1:-1] = np.transpose(img, (1, 0, 2))
if mode == "screen":
from pyglet.window import Window
from pyglet.text import Label
from pyglet.gl import GLubyte
from pyglet.image import ImageData
zoom = 20
if self._window is None:
self._window = Window((self._max_row + 2) * zoom, (self._max_col + 2) * zoom)
self._info = Label('Four Room Grid World', font_size=10, x=5, y=5)
# self._info.text = f'x: {x}, y: {y}'
dt = np.kron(ext_img, np.ones((zoom, zoom, 1)))
dt = (GLubyte * dt.size)(*dt.flatten().astype('uint8'))
texture = ImageData(self._window.width, self._window.height, 'RGB', dt).get_texture()
self._window.clear()
self._window.switch_to()
self._window.dispatch_events()
texture.blit(0, 0)
# self._info.draw()
self._window.flip()
return np.flip(ext_img, axis=0)
if __name__ == '__main__':
mode = 'human'
mode = 'screen'
env = FourRoomGridWorld()
env.reset()
for step in range(1, 100):
action = np.random.randint(0, 4)
sp, r, terminal, _ = env.step(action=action)
env.render(mode=mode)
if terminal:
env.reset()
print('env reset')
from abc import ABC, abstractmethod
import numpy as np
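# Rendering helpers. ErrorRender overlays each policy's absolute value error (normalized by the first error
# recorded) onto the grid image produced by the environment.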
class Render(ABC):
@abstractmethod
def render(self, img):
raise NotImplementedError
class ErrorRender(Render):
def __init__(self, num_policies, num_steps):
self.num_steps = num_steps
self.num_policies = num_policies
self._error, self._max_error, self._valid_state = None, None, None
def render(self, img):
# self.color_policy(img, 0)
self.color_policy(img, 1)
# self.color_policy(img, 2)
self.color_policy(img, 3)
# self.color_policy(img, 4)
self.color_policy(img, 5)
# self.color_policy(img, 6)
self.color_policy(img, 7)
return img
def add_error(self, error):
if self._max_error is None:
self._max_error = np.abs(error).reshape(8, 11, 11)
self._valid_state = np.array(self._max_error)
self._valid_state[self._valid_state != 0] = 1
self._error = np.abs(error).reshape(8, 11, 11)
def color_policy(self, img, policy_number):
e = self._error[policy_number]
x = self._max_error[policy_number]
d = np.clip((230 * e / x), 10, 255)
d = d * self._valid_state[policy_number]
d = np.nan_to_num(d).astype(np.uint8).T
d = np.repeat(d, 3).reshape(11, 11, 3)
d[:, :, 2] = 230
c = np.where(self._valid_state[policy_number].T == 1)
img[c] = d[c]
return img
"""A set of common utilities used within the environments. These are
not intended as API functions, and will not remain stable over time.
"""
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""Return string surrounded by appropriate terminal color codes to
print colorized text. Valid colors: gray, red, green, yellow,
blue, magenta, cyan, white, crimson
"""
attr = []
num = color2num[color]
if highlight:
num += 10
attr.append(str(num))
if bold:
attr.append('1')
attrs = ';'.join(attr)
return '\x1b[%sm%s\x1b[0m' % (attrs, string)
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "HighVarianceLearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
{
"agent": "ABTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "ETD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "ETDLB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "GTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "GTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "HTD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "PGTD2",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TB",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
{
"agent": "TDRC",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3
],
"tdrc_beta": [
1.0
]
}
}
{
"agent": "Vtrace",
"environment": "Chain",
"task": "EightStateOffPolicyRandomFeat",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
\ No newline at end of file
{
"agent": "ABTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"zeta": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "ETDLB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"beta": [
0.0, 0.2, 0.4, 0.6, 0.8, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "GTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "HTD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "PGTD2",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TB",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TD",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
{
"agent": "TDRC",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"eta": [
0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
],
"tdrc_beta": [
1.0
]
}
}
\ No newline at end of file
{
"agent": "Vtrace",
"environment": "FourRoomGridWorld",
"task": "LearnEightPoliciesTileCodingFeat",
"number_of_runs": 50,
"number_of_steps": 50000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0
]
}
}
\ No newline at end of file
This directory contains the exports_<algorithm>.dat files created when submitting jobs on Cedar.
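Each line of an exports_<algorithm>.dat file is a single export statement that fixes one combination of meta-parameters for one SLURM array task, for example (illustrative values only):
export SAVE_PATH=<results_dir> ENVIRONMENT=Chain ALGORITHM=TD TASK=EightStateCollision ALPHA=0.25 LMBDA=0.0 ETA=1.0 BETA=0.9 ZETA=0.9 TDRCBETA=1.0 GEMALPHA=0.1 GEMBETA=0.1 NUMOFRUNS=50 NUMSTEPS=20000 SUBSAMPLE=1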
\ No newline at end of file
#!/bin/bash
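# Template filled in by JobBuilder: each __PLACEHOLDER__ below is replaced by a space-separated
# list of meta-parameter values, and the nested loops write one export line per combination
# (Cartesian product) into exports_<algorithm>.dat for the Cedar array job.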
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
rm -f exports_${algorithm}.dat
for A in ${alpha[@]}; do
for L in ${lmbda[@]}; do
for E in ${eta[@]}; do
for B in ${beta[@]}; do
for Z in ${zeta[@]}; do
for T in ${tdrc_beta[@]}; do
for GA in ${gem_alpha[@]}; do
for GB in ${gem_beta[@]}; do
echo export SAVE_PATH=${save_path} ENVIRONMENT=${environment} ALGORITHM=${algorithm} \
TASK=${task} ALPHA=${A} LMBDA=${L} ETA=${E} BETA=${B} ZETA=${Z} TDRCBETA=${T} GEMALPHA=${GA} \
GEMBETA=${GB} NUMOFRUNS=${num_of_runs} NUMSTEPS=${num_steps} SUBSAMPLE=${sub_sample} \
>>exports_${algorithm}.dat
done
done
done
done
done
done
done
done
import os
import json
import numpy as np
from utils import ImmutableDict
import time
default_params = ImmutableDict(
{
'agent': 'GEMETD',
'task': 'EightStateCollision',
'environment': 'Chain',
'exp': 'FirstChain',
# 'agent': 'HTD',
# 'task': 'LearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': 'FirstFourRoom',
# 'agent': 'LSTD',
# 'task': 'HighVarianceLearnEightPoliciesTileCodingFeat',
# 'environment': 'FourRoomGridWorld',
# 'exp': '1HVFourRoom',
'save_value_function': True,
'sub_sample': 1,
'num_of_runs': 3,
'num_steps': 20_000,
'meta_parameters': {
'alpha': 0.001953125,
'eta': 16.0,
'beta': 0.9,
'zeta': 0.9,
'lmbda': 0.0,
'tdrc_beta': 1.0,
'gem_alpha': 0.1,
'gem_beta': 0.1
}
}
)
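# JobBuilder loads an experiment's .json description, falls back to default_params for any missing
# entry, and substitutes the resulting values into the shell/SLURM templates before submitting the
# jobs (full-node Niagara jobs for 'NODE', Cedar array jobs for 'CPU').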
class JobBuilder:
def __init__(self, json_path, server_name):
self._path = json_path
self.server_name = server_name
with open(self._path) as f:
self._params = json.load(f)
self._batch_params = ImmutableDict(
{
'ALPHA': ' '.join([f'{num:.10f}' for num in self.alpha]),
'LMBDA': ' '.join([f'{num:.5f}' for num in self.lmbda]),
'ETA': ' '.join([f'{num:.10f}' for num in self.eta]),
'BETA': ' '.join([f'{num:.5f}' for num in self.beta]),
'ZETA': ' '.join([f'{num:.5f}' for num in self.zeta]),
'TDRCBETA': ' '.join([f'{num:.5f}' for num in self.tdrc_beta]),
'GEMALPHA': ' '.join([f'{num:.5f}' for num in self.gem_alpha]),
'GEMBETA': ' '.join([f'{num:.5f}' for num in self.gem_beta]),
'NUMOFRUNS': f'{self.num_of_runs}',
'NUMSTEPS': f'{self.num_steps}',
'SUBSAMPLE': f'{self.sub_sample}',
'ALGORITHM': self.agent,
'TASK': self.task,
'ENVIRONMENT': self.environment,
'SAVEPATH': self.save_path
})
@property
def tdrc_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('tdrc_beta', [default_params['meta_parameters']['tdrc_beta']]))
@property
def gem_alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_alpha', [default_params['meta_parameters']['gem_alpha']]))
@property
def gem_beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('gem_beta', [default_params['meta_parameters']['gem_beta']]))
@property
def alpha(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('alpha', [default_params['meta_parameters']['alpha']]))
@property
def lmbda(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('lmbda', [default_params['meta_parameters']['lmbda']]))
@property
def eta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('eta', [default_params['meta_parameters']['eta']]))
@property
def beta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('beta', [default_params['meta_parameters']['beta']]))
@property
def zeta(self):
parameters = self._params.get('meta_parameters')
return np.asarray(parameters.get('zeta', [default_params['meta_parameters']['zeta']]))
@property
def agent(self):
return self._params.get('agent', default_params['agent'])
@property
def task(self):
return self._params.get('task', default_params['task'])
@property
def num_of_runs(self):
return np.asarray(self._params.get('number_of_runs', default_params['num_of_runs']))
@property
def num_steps(self):
return np.asarray(self._params.get('number_of_steps', default_params['num_steps']))
@property
def sub_sample(self):
return np.asarray(self._params.get('sub_sample', default_params['sub_sample']))
@property
def environment(self):
return self._params.get('environment', default_params['environment'])
@property
def save_path(self):
return os.path.dirname(self._path).replace("/Experiments/", "/Results/")
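# Render the Cedar config-generation template, replacing every __KEY__ token with its batch value.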
def create_dat_file(self):
with open('Job/Cedar_Create_Config_Template.sh', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
def to_shell(self):
if self.server_name.upper() == 'NODE':
with open('Job/SubmitJobsTemplates.SL', 'r') as f:
text = f.read()
for k, v in self._batch_params.items():
text = text.replace(f'__{k}__', v)
return text
elif self.server_name.upper() == 'CPU':
with open('Job/SubmitJobsTemplatesCedar.SL', 'r') as f:
text = f.read()
alg = self._batch_params['ALGORITHM']
num_of_jobs = sum(1 for _ in open(f'exports_{alg}.dat'))
text = text.replace('__ALG__', self._batch_params['ALGORITHM'])
text = text.replace('__NUM_OF_JOBS__', str(num_of_jobs))
text = text.replace('__NAME_OF_EXP__', f'{self._batch_params["TASK"]}_{self._batch_params["ALGORITHM"]}')
return text
def run_batch(self):
if self.server_name.upper() == 'NODE':
print('Submitting the ' + self.agent + ' algorithm jobs on nodes...')
elif self.server_name.upper() == 'CPU':
print('Submitting the ' + self.agent + ' algorithm jobs on individual cpus...')
with open('Create_Configs.sh', 'wt') as f:
f.write(self.create_dat_file())
time.sleep(1)
os.system('bash Create_Configs.sh')
with open('Submit_Jobs.SL', 'wt') as f:
f.write(self.to_shell())
time.sleep(1)
os.system('sbatch Submit_Jobs.SL')
time.sleep(1)
os.remove('Submit_Jobs.SL')
if self.server_name.upper() == 'CPU':
os.remove('Create_Configs.sh')
# alg = self._batch_params['ALGORITHM']
# os.remove(f'exports_{alg}.dat')
def __call__(self):
return self.run_batch()
#!/bin/bash
# SLURM submission script for submitting multiple serial jobs on Niagara
#
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=40
#SBATCH --job-name __TASK_____ALGORITHM__
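# Meta-parameter sweeps; each __PLACEHOLDER__ is replaced by JobBuilder with a space-separated list of values.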
alpha=(__ALPHA__)
lmbda=(__LMBDA__)
eta=(__ETA__)
beta=(__BETA__)
zeta=(__ZETA__)
tdrc_beta=(__TDRCBETA__)
gem_alpha=(__GEMALPHA__)
gem_beta=(__GEMBETA__)
num_of_runs=__NUMOFRUNS__
num_steps=__NUMSTEPS__
sub_sample=__SUBSAMPLE__
algorithm=__ALGORITHM__
environment=__ENVIRONMENT__
task=__TASK__
save_path=__SAVEPATH__
source ~/RLENV/bin/activate
module load NiaEnv/2019b
module load gnu-parallel
module load python
cd $SLURM_SUBMIT_DIR || exit
export OMP_NUM_THREADS=1
echo "The number of available cores is echo $NCORES"
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo "Starting run at: $(date)"
HOSTS=$(scontrol show hostnames $SLURM_NODELIST | tr '\n' ,)
NCORES=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
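# Launch one Learning.py process per combination of the meta-parameter lists, spread over all allocated cores.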
parallel --env OMP_NUM_THREADS,PATH,LD_LIBRARY_PATH --joblog slurm-$SLURM_JOBID.log -j $NCORES -S $HOSTS --wd $PWD \
python Learning.py ::: -sp ::: ${save_path} ::: -e ::: ${environment} ::: -alg ::: ${algorithm} ::: -t ::: ${task[@]} \
::: -a ::: ${alpha[@]} ::: -nr ::: ${num_of_runs} ::: -ns ::: ${num_steps} ::: -et ::: ${eta[@]} \
::: -l ::: ${lmbda[@]} ::: -z ::: ${zeta[@]} ::: -tb ::: ${tdrc_beta[@]} ::: -b ::: ${beta[@]} ::: \
-ga ::: ${gem_alpha[@]} ::: -gb ::: ${gem_beta[@]} ::: -ss ::: ${sub_sample}
echo "Program test finished with exit code $? at: $(date)"
#!/bin/bash
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --cpus-per-task=1
#SBATCH --mem=3G
#SBATCH --array=1-__NUM_OF_JOBS__
#SBATCH --job-name __NAME_OF_EXP__
alg=__ALG__
source ~/RLENV/bin/activate
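# Evaluate line ${SLURM_ARRAY_TASK_ID} of exports_${alg}.dat, exporting this array task's hyper-parameters.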
`sed -n "${SLURM_ARRAY_TASK_ID}p" <exports_${alg}.dat`
echo ${SLURM_ARRAY_TASK_ID} $ALPHA $LMBDA $ETA $BETA $ZETA $TDRCBETA $GEMALPHA $GEMBETA $NUMOFRUNS $NUMSTEPS $SUBSAMPLE
echo "Current working directory is $(pwd)"
echo "Running on hostname $(hostname)"
echo
echo "Starting run at: $(date)"
python Learning.py \
-a $ALPHA -l $LMBDA -et $ETA -b $BETA -z $ZETA -tb $TDRCBETA -ga $GEMALPHA -gb $GEMBETA -alg $ALGORITHM -t $TASK \
-nr $NUMOFRUNS -e $ENVIRONMENT -sp $SAVE_PATH -ns $NUMSTEPS -ss $SUBSAMPLE
echo "Program test finished with exit code $? at: $(date)"
import os
import numpy as np
import argparse
from data_presister import DataPersister, ParameterBuilder
from utils import save_result, Configuration, save_value_function, get_save_value_function_steps
from Registry.AlgRegistry import alg_dict
from Registry.EnvRegistry import environment_dict
from Registry.TaskRegistry import task_dict
from Job.JobBuilder import default_params
from Environments.rendering import ErrorRender
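# Run config.num_of_runs independent runs of the selected algorithm on the selected task, record the
# RMSVE of every target policy at every step, and save mean/stderr learning curves plus AUC and final
# performance summaries.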
def learn(config: Configuration):
params = ParameterBuilder().add_algorithm_params(config).build()
if not os.path.exists(config.save_path):
os.makedirs(config.save_path, exist_ok=True)
env = environment_dict[config.environment]()
rmsve = np.zeros((task_dict[config.task].num_of_policies(), config.num_steps, config.num_of_runs))
for run in range(config.num_of_runs):
random_seed = (run + config.num_of_runs) if config.rerun else run
np.random.seed(random_seed)
task = task_dict[config.task](run_number=run, num_steps=config.num_steps)
agent = alg_dict[config.algorithm](task, **params)
rmsve_of_run = np.zeros((task.num_policies, task.num_steps))
agent.state = env.reset()
error_render = ErrorRender(task.num_policies, task.num_steps)
for step in range(task.num_steps):
rmsve_of_run[:, step], error = agent.compute_rmsve()
if config.render:
error_render.add_error(error)
agent.action = agent.choose_behavior_action()
agent.next_state, r, is_terminal, info = env.step(agent.action)
agent.learn(agent.state, agent.next_state, r, is_terminal)
if config.render:
env.render(mode='screen', render_cls=error_render)
if config.save_value_function and (step in get_save_value_function_steps(task.num_steps)):
save_value_function(agent.compute_value_function(), config.save_path, step, run)
if is_terminal:
agent.state = env.reset()
agent.reset()
continue
agent.state = agent.next_state
print(np.mean(rmsve_of_run, axis=0))
rmsve[:, :, run] = rmsve_of_run
rmsve_of_runs = np.transpose(np.mean(rmsve, axis=0)) # Average over all policies.
# _RMSVE_mean_over_runs
DataPersister.save_result(np.mean(rmsve_of_runs, axis=0), '_RMSVE_mean_over_runs', config)
save_result(config.save_path, '_RMSVE_mean_over_runs', np.mean(rmsve_of_runs, axis=0), params, config.rerun)
# _RMSVE_stderr_over_runs
DataPersister.save_result(np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), '_RMSVE_stderr_over_runs', config)
save_result(config.save_path, '_RMSVE_stderr_over_runs',
np.std(rmsve_of_runs, axis=0, ddof=1) / np.sqrt(config.num_of_runs), params, config.rerun)
# _mean_stderr_final
final_errors_mean_over_steps = np.mean(rmsve_of_runs[:, config.num_steps - int(0.01 * config.num_steps) - 1:],
axis=1)
DataPersister.save_result(np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), '_mean_stderr_final', config)
save_result(config.save_path, '_mean_stderr_final',
np.array([np.mean(final_errors_mean_over_steps), np.std(final_errors_mean_over_steps, ddof=1) /
np.sqrt(config.num_of_runs)]), params, config.rerun)
# _mean_stderr_auc
auc_mean_over_steps = np.mean(rmsve_of_runs, axis=1)
DataPersister.save_result(np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), '_mean_stderr_auc', config)
save_result(config.save_path, '_mean_stderr_auc',
np.array([np.mean(auc_mean_over_steps),
np.std(auc_mean_over_steps, ddof=1) / np.sqrt(config.num_of_runs)]), params, config.rerun)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', '-a', type=float, default=default_params['meta_parameters']['alpha'])
parser.add_argument('--lmbda', '-l', type=float, default=default_params['meta_parameters']['lmbda'])
parser.add_argument('--eta', '-et', type=float, default=default_params['meta_parameters']['eta'])
parser.add_argument('--beta', '-b', type=float, default=default_params['meta_parameters']['beta'])
parser.add_argument('--zeta', '-z', type=float, default=default_params['meta_parameters']['zeta'])
parser.add_argument('--tdrc_beta', '-tb', type=float, default=default_params['meta_parameters']['tdrc_beta'])
parser.add_argument('--gem_alpha', '-ga', type=float, default=default_params['meta_parameters']['gem_alpha'])
parser.add_argument('--gem_beta', '-gb', type=float, default=default_params['meta_parameters']['gem_beta'])
parser.add_argument('--algorithm', '-alg', type=str, default=default_params['agent'])
parser.add_argument('--task', '-t', type=str, default=default_params['task'])
parser.add_argument('--num_of_runs', '-nr', type=int, default=default_params['num_of_runs'])
parser.add_argument('--num_steps', '-ns', type=int, default=default_params['num_steps'])
parser.add_argument('--sub_sample', '-ss', type=int, default=default_params['sub_sample'])
parser.add_argument('--environment', '-e', type=str, default=default_params['environment'])
parser.add_argument('--save_path', '-sp', type=str, default='-')
parser.add_argument('--rerun', '-rrn', type=bool, default=False)
parser.add_argument('--render', '-rndr', type=bool, default=False)
parser.add_argument('--save_value_function', '-svf', type=bool, default=default_params['save_value_function'])
args = parser.parse_args()
if args.save_path == '-':
args.save_path = os.path.join(os.getcwd(), 'Results', default_params['exp'], args.algorithm)
learn(config=Configuration(vars(args)))
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXP_ATTRS, AUC_AND_FINAL
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
plot_alpha = 1.0
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global plot_alpha
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
if alg == 'ETD':
color = 'red'
elif alg == 'ETDLB':
color = 'grey'
plot_alpha -= 0.1
else:
color = 'black'
ax.plot(alphas, performance, label=lbl, linestyle='-', marker='o',
linewidth=2, markersize=5, color=color, alpha=plot_alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=plot_alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
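# Overlay the step-size sensitivity of ETD and of every ETDLB parameter setting on one axis per
# experiment, performance measure, and lambda, and save the figure as a PDF.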
def plot_all_sensitivities_per_alg_emphatics(**kwargs):
global plot_alpha
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
plot_alpha = 1.0
alg = 'ETD'
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
current_params = make_current_params(alg, sp, 0, 0)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, 0, performance, stderr, exp_attrs)
alg = 'ETDLB'
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf',
'orange', '#8c564b', '#e377c2', '#2ca02c',
'#bcbd22', '#d62728']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
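# Same as plot_sensitivity, but draws a second, reference algorithm with a dashed line on the same axis.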
def plot_extra_alg_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter - 1]
alpha = 1.0
if alg == 'TDRC':
color = ALG_COLORS[alg]
alpha = 1.0
linestyle = '--'
# if alg == 'GTD2':
# linestyle = '-'
# alpha=1.0
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=3, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=3, markersize=5,
color=color, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_all_sensitivities_per_alg_gradients(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
for tp in tp_list:
if COUNTER % 2 == 0:
COUNTER += 1
continue
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD2':
extra_alg = 'GTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'PGTD2':
extra_alg = 'GTD2'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'GTD':
extra_alg = 'HTD'
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if alg == 'HTD':
extra_alg = 'TDRC'
current_params['eta'] = 1.0
current_params['tdrc_beta'] = 1.0
performance, stderr = load_performance_over_alpha(
extra_alg, exp, current_params, auc_or_final, exp_attrs)
plot_extra_alg_sensitivity(
ax, extra_alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import os
import numpy as np
import json
import matplotlib.pyplot as plt
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, ALG_COLORS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, make_exp_path, make_params, make_current_params
from utils import create_name_for_save_load
new_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#17becf', 'orange', '#8c564b', '#e377c2', '#2ca02c','#bcbd22',
'#d62728', 'black', 'cyan']
color_counter = 1
def load_performance_over_alpha(alg, exp, params, auc_or_final, exp_attrs):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
params, excluded_params=['alpha']) + f"_mean_{auc_or_final}_over_alpha.npy")
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
def plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs):
global color_counter
lbl = f'{alg}_{tp}'
ax.set_xscale('log', basex=2)
color = new_colors[color_counter]
linestyle = '-'
alpha = 1.0
# if alg == 'PGTD2':
# linestyle = '--'
# alpha = 0.5
ax.plot(alphas, performance, label=lbl, linestyle=linestyle, marker='o',
linewidth=2, markersize=5, color=color, alpha=alpha)
ax.errorbar(alphas, performance, yerr=stderr, linestyle='', elinewidth=2, markersize=5,
color=color, alpha=alpha)
color_counter = color_counter + 1
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.set_ylim([0.1, 0.8])
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
COUNTER = 0
def plot_all_sensitivities_per_alg_gradients_all_eta(**kwargs):
global color_counter, COUNTER
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
for alg in kwargs['algs']:
color_counter = 4
save_dir = os.path.join('pdf_plots', 'AllThirds', exp, f'Lmbda{sp}_{auc_or_final}')
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
if alg == 'TDRC':
_, _, tp_list, _, _ = make_params('GTD', exp)
fop_list = kwargs['tdrc_beta']
for tp in tp_list:
COUNTER += 1
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
alphas = get_alphas(alg, exp)
performance, stderr = load_performance_over_alpha(
alg, exp, current_params, auc_or_final, exp_attrs)
plot_sensitivity(ax, alg, exp, alphas, sp, tp, performance, stderr, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if alg == 'TDRC':
fig.savefig(
os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta_beta_{kwargs['tdrc_beta']}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
else:
fig.savefig(os.path.join(save_dir, f"sensitivity_{alg}_{exp}_all_eta.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict, make_current_params, make_params, load_and_replace_large_nan_inf
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = True if is_smoothed else False
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
print(alg)
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']) + r' $\lambda=$ ' +
str(best_params.get('lmbda', best_params.get('zeta', 0))))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def find_best_perf(alg, exp, auc_or_final):
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
for sp in sp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
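# Plot learning curves using, for each algorithm, the overall best meta-parameter setting returned by find_best_perf.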
def plot_learning_curve_best_overall_params(**kwargs):
is_smoothed = kwargs.get('is_smoothed', False)
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
# ls_rmsve = get_ls_rmsve(alg, exp, sp)
# plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = find_best_perf(alg, exp, auc_or_final)
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, best_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}AllLmbda.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import numpy as np
import matplotlib.pyplot as plt
def load_d_mu(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'd_mu.npy'))
def load_state_values(task):
return np.load(os.path.join(os.getcwd(), 'Resources', task, 'state_values.npy'))
def plot_d_mu(ax, d_mu, active_states):
ax.plot(d_mu, linewidth=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
x_labels = list(active_states)
x_ticks = [x for x in range(len(x_labels))]
ax.xaxis.set_ticks(x_ticks)
ax.set_xticklabels(x_labels)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ax.yaxis.set_ticks([0, 0.005, 0.01, 0.015, 0.02, 0.025])
ax.set_ylim([0.00, 0.025])
ax.set_yticklabels([])
# ax.set_xticklabels([])
def find_active_states(task, d_mu, state_values, policy_no=0):
if task == 'EightStateCollision':
return [x for x in range(d_mu.shape[0])]
return np.where(state_values[policy_no] > 0)[0]
def get_active_d_mu(task, d_mu, active_states, policy_no=0):
if task == 'EightStateCollision':
return d_mu
return d_mu[active_states, policy_no].squeeze()
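# Plot the state-visitation distribution d_mu over each policy's active states (all states for EightStateCollision).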
def plot_distribution(**kwargs):
task = kwargs['task']
d_mu = load_d_mu(task)
state_values = load_state_values(task)
for policy_no in range(state_values.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task, d_mu, state_values, policy_no)
active_d_mu = get_active_d_mu(task, d_mu, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
plt.show()
if task == 'EightStateCollision':
break
def plot_dist_for_two_four_room_tasks(**kwargs):
task1 = 'LearnEightPoliciesTileCodingFeat'
task2 = 'HighVarianceLearnEightPoliciesTileCodingFeat'
save_dir = os.path.join('pdf_plots', 'Misc', 'CompareDistsFR')
d_mu1 = load_d_mu(task1)
d_mu2 = load_d_mu(task2)
state_values1 = load_state_values(task1)
state_values2 = load_state_values(task2)
for policy_no in range(state_values1.shape[0]):
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
active_states = find_active_states(task1, d_mu1, state_values1, policy_no)
active_d_mu = get_active_d_mu(task1, d_mu1, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
active_states = find_active_states(task2, d_mu2, state_values2, policy_no)
active_d_mu = get_active_d_mu(task2, d_mu2, active_states, policy_no)
plot_d_mu(ax, active_d_mu, active_states)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
fig.savefig(os.path.join(save_dir, f"dist_policy_{policy_no}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, is_smoothed=False,
smoothing_window=1):
zoomed_in = True if is_smoothed else False
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
# if alg == 'TD':
# color = 'grey'
# alpha = 0.7
if is_smoothed:
mean_lc = np.convolve(mean_lc, np.ones(smoothing_window)/smoothing_window, mode='valid')
mean_stderr = np.convolve(mean_stderr, np.ones(smoothing_window)/smoothing_window, mode='valid')
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
if zoomed_in:
ax.set_ylim([0.0, 0.4])
else:
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle=':')
# ax.legend()
def plot_learning_curve(**kwargs):
is_smoothed = kwargs.get('is_smoothed', False)
smoothing_window = kwargs.get('smoothing_window', 1)
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'learning_curves', exp, auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=False,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, second_time=True,
is_smoothed=is_smoothed, smoothing_window=smoothing_window)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import os
import matplotlib.pyplot as plt
import numpy as np
import pylab
from Plotting.plot_params import ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import make_params, get_alphas, make_current_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_specific_params_dict(alg, exp, sp, tp):
if alg == 'TD':
return {'alpha': 0.25, 'lmbda': sp}
if alg == 'ETD':
return {'alpha': 0.00390625, 'lmbda': sp}
if alg == 'ETDLB':
return {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2}
if alg == 'TDRC':
return {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0}
if alg == 'GTD':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
if alg == 'PGTD2':
return {'alpha': 0.0078125, 'lmbda': sp, 'eta': tp}
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_all_learning_curves_for_third(**kwargs):
for exp in kwargs['exps']:
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'all_third_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in kwargs['algs']:
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
for tp in kwargs['tp_list']:
for fp in get_alphas(alg, exp):
for fop in [1.0]:
current_params = make_current_params(alg, sp, tp, fop, fp)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(kwargs['algs'])}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, RERUN_POSTFIX, \
PLOT_RERUN_AND_ORIG
from Plotting.plot_utils import load_best_rerun_params_dict
from utils import create_name_for_save_load
# noinspection DuplicatedCode
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
# noinspection DuplicatedCode
def plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
color = 'blue' if sp else 'red'
lbl = (alg + r' $\lambda=$ ' + str(sp))
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.tick_params(axis='x', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
# noinspection DuplicatedCode
def plot_learning_curve_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for alg_names in kwargs['alg_groups'].values():
for alg in alg_names:
if alg in ['LSETD', 'LSTD']:
continue
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
save_dir = os.path.join('pdf_plots', 'learning_curves_for_lambdas', auc_or_final)
for sp in kwargs['sp_list']:
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
current_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
print(alg, current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs)
if PLOT_RERUN_AND_ORIG:
prefix = RERUN_POSTFIX
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, sp, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{alg}{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
# plt.show()
plt.close(fig)
from Plotting.plot_utils import FirstChainAttr, FirstFourRoomAttr, HVFirstFourRoomAttr
from Registry.AlgRegistry import alg_dict
PLOT_RERUN = True
PLOT_RERUN_AND_ORIG = False
if PLOT_RERUN and PLOT_RERUN_AND_ORIG:
PLOT_RERUN_AND_ORIG = False
RERUN_POSTFIX = '_rerun'
DEBUG_MODE = True
# noinspection SpellCheckingInspection
COLORS = ['#000000', "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22",
"#17becf"]
ALG_COLORS = {alg_name: color for alg_name, color in zip(alg_dict.keys(), COLORS)}
ALG_COLORS['LSTD'] = ALG_COLORS['TD']
ALG_COLORS['LSETD'] = ALG_COLORS['ETD']
ALG_GROUPS = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD']}
EXPS = ['1HVFourRoom', 'FirstFourRoom', 'FirstChain']
ALGS = [key for key in alg_dict.keys()]
ALGS.remove('LSTD')
ALGS.remove('LSETD')
# ALGS.remove('TDRC')
ALL_ALGS = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD', 'LSTD', 'LSETD']
# ALL_ALGS = ['TD', 'Vtrace', 'TB', 'ABTD']
LMBDA_AND_ZETA = [0.0, 0.9]
AUC_AND_FINAL = ['auc', 'final']
EXP_ATTRS = {'FirstChain': FirstChainAttr, 'FirstFourRoom': FirstFourRoomAttr, '1HVFourRoom': HVFirstFourRoomAttr}
if DEBUG_MODE:
EXPS = ['FirstFourRoom', '1HVFourRoom']
# ALGS = ['GTD']
# ALL_ALGS.remove('ETDLB')
# ALL_ALGS.remove('LSTD')
# ALL_ALGS.remove('LSETD')
# LMBDA_AND_ZETA = [0.9]
AUC_AND_FINAL = ['final']
# ALG_GROUPS = {'main_algs': ALL_ALGS}
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
PLOT_RERUN_AND_ORIG, RERUN_POSTFIX
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = ALG_COLORS[alg]
if alg == 'TD':
color = 'grey'
alpha=0.7
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_sensitivity_curve(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, best_performance, stderr, exp_attrs, True)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, EXP_ATTRS, AUC_AND_FINAL, PLOT_RERUN, PLOT_RERUN_AND_ORIG, RERUN_POSTFIX, ALGS
from Plotting.plot_utils import replace_large_nan_inf, make_res_path, load_best_rerun_params_dict, get_alphas
from utils import create_name_for_save_load
def load_best_performance_over_alpha(alg, exp, auc_or_final, best_params, exp_attrs, postfix=''):
res_path = make_res_path(alg, exp)
load_file_name = os.path.join(res_path, create_name_for_save_load(
best_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha{postfix}.npy')
performance_over_alpha = np.load(load_file_name)
performance_over_alpha = replace_large_nan_inf(
performance_over_alpha, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_replacement)
stderr_load_file_name = os.path.join(
res_path, create_name_for_save_load(best_params, excluded_params=['alpha']) +
f'_stderr_{auc_or_final}_over_alpha{postfix}.npy')
std_err_of_best_perf_over_alpha = np.load(stderr_load_file_name)
std_err_of_best_perf_over_alpha = replace_large_nan_inf(
std_err_of_best_perf_over_alpha, large=exp_attrs.learning_starting_point, replace_with=0.0)
return performance_over_alpha, std_err_of_best_perf_over_alpha
# noinspection DuplicatedCode
def plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, second_time=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = f'{alg}'
ax.set_xscale('log', basex=2)
color = 'blue' if sp else 'red'
if sp not in [0.0, 1.0]:
alpha = 0.3
color = 'grey'
ax.plot(alphas, best_performance, label=lbl, linestyle='-', marker='o', color=color,
linewidth=2, markersize=5, alpha=alpha)
ax.errorbar(alphas, best_performance, yerr=stderr, ecolor=color, mfc=color,
mec=color, linestyle='', elinewidth=2, markersize=5, alpha=alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks_log)
# ax.set_xticklabels(exp_attrs.x_axis_tick_labels_log, fontsize=25)
# plt.xticks(fontsize=25)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_min(ax, min_performance):
print(min_performance)
ax.plot([pow(2, -3), pow(2, -2)], [min_performance, min_performance], linewidth=0.2, alpha=0.2)
# ax.axhline(y=min_performance, xmin=pow(2, -3), xmax=pow(2, -2))
def plot_sensitivity_for_lambdas(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
save_dir = os.path.join('pdf_plots', 'sensitivity_curves_for_lambdas', exp, auc_or_final)
for alg in kwargs['algs']:
min_performance = 1_000
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for sp in kwargs['sp_list']:
if alg in ['LSTD', 'LSETD']:
continue
postfix = RERUN_POSTFIX if PLOT_RERUN else ''
best_params = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
alphas = get_alphas(alg, exp)
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs)
if PLOT_RERUN_AND_ORIG:
postfix = RERUN_POSTFIX
best_performance, stderr = load_best_performance_over_alpha(
alg, exp, auc_or_final, best_params, exp_attrs, postfix)
plot_sensitivity(ax, alg, alphas, sp, best_performance, stderr, exp_attrs, True)
if min(best_performance) < min_performance:
min_performance = min(best_performance)
if kwargs.get('plot_min_performance', False):
plot_min(ax, min_performance)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
if PLOT_RERUN_AND_ORIG:
prefix = '_rerun_and_original'
elif PLOT_RERUN:
prefix = RERUN_POSTFIX
else:
prefix = ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_sensitivity_curve_{alg}{exp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg, auc_or_final, sp)
import matplotlib.pyplot as plt
import numpy as np
import os
import pylab
from Plotting.plot_params import ALG_GROUPS, ALG_COLORS, EXP_ATTRS, EXPS, AUC_AND_FINAL, LMBDA_AND_ZETA, \
PLOT_RERUN_AND_ORIG, PLOT_RERUN, RERUN_POSTFIX, ALGS, ALL_ALGS
from Plotting.plot_utils import load_best_rerun_params_dict, make_params
from utils import create_name_for_save_load
def load_data(alg, exp, best_params, postfix=''):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
generic_name = create_name_for_save_load(best_params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs{postfix}.npy")
mean_lc = np.load(load_file_name)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_stderr_over_runs{postfix}.npy")
stderr_lc = np.load(load_file_name)
return mean_lc, stderr_lc
def plot_data(ax, alg, mean_lc, mean_stderr, best_params, exp_attrs, second_time=False, flag=False):
alpha = 1.0
if PLOT_RERUN_AND_ORIG:
alpha = 1.0 if second_time else 0.5
lbl = (alg + r'$\alpha=$ ' + str(best_params['alpha']))
color = ALG_COLORS[alg]
if alg == 'TDRC':
alpha = 0.6
if flag:
color = 'green'
ax.plot(np.arange(mean_lc.shape[0]), mean_lc, label=lbl, linewidth=1.0, color=color, alpha=alpha)
ax.fill_between(np.arange(mean_lc.shape[0]), mean_lc - mean_stderr / 2, mean_lc + mean_stderr / 2,
color=color, alpha=0.1*alpha)
# ax.legend()
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim(exp_attrs.x_lim)
ax.set_ylim(exp_attrs.y_lim)
ax.xaxis.set_ticks(exp_attrs.x_axis_ticks)
ax.set_xticklabels(exp_attrs.x_tick_labels, fontsize=25)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def get_ls_rmsve(alg, exp, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
params = {'alpha': 0.01, 'lmbda': sp}
if alg == 'LSETD':
params['beta'] = 0.9
generic_name = create_name_for_save_load(params)
load_file_name = os.path.join(res_path, f"{generic_name}_RMSVE_mean_over_runs.npy")
return np.load(load_file_name)
def plot_ls_solution(ax, ls_rmsve, alg, sp):
lbl = f"{alg} $\\lambda=$ {sp}"
x = np.arange(ls_rmsve.shape[0])
y = ls_rmsve[-1] * np.ones(ls_rmsve.shape[0])
ax.plot(x, y, label=lbl, linewidth=1.0, color=ALG_COLORS[alg], linestyle='--')
# ax.legend()
def load_sample_params_dict(alg, exp, sp):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
if alg in ['TD', 'ETD', 'TB', 'Vtrace']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp}
if alg == 'ABTD':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'zeta': sp}
if alg in ['GTD', 'GTD2', 'PGTD2', 'HTD']:
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'ETDLB':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'beta': tp_list[np.random.randint(0, len(tp_list))]}
if alg == 'TDRC':
return {'alpha': fp_list[np.random.randint(0, len(fp_list))], 'lmbda': sp,
'eta': tp_list[np.random.randint(0, len(tp_list))],
'tdrc_beta': fop_list[np.random.randint(0, len(fop_list))]}
def plot_specific_learning_curves(**kwargs):
specific_params = kwargs['specific_params']
exp = kwargs['exp']
prefix = ''
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in AUC_AND_FINAL:
sp = kwargs['sp']
save_dir = os.path.join('pdf_plots', 'specific_learning_curves', auc_or_final)
fig, ax = plt.subplots(figsize=(10, 4))
for alg in kwargs['algs']:
flag = False
if alg in ['LSTD', 'LSETD']:
ls_rmsve = get_ls_rmsve(alg, exp, sp)
plot_ls_solution(ax, ls_rmsve, alg, sp)
continue
print(alg, exp, sp)
if alg == 'PGTD22':
flag = True
alg = 'PGTD2'
current_params = specific_params[alg]
current_params['eta'] = 1.0
current_params['alpha'] = 0.03125
else:
current_params = specific_params[alg]
print(current_params)
mean_lc, mean_stderr = load_data(alg, exp, current_params, prefix)
plot_data(ax, alg, mean_lc, mean_stderr, current_params, exp_attrs, False, flag)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
pylab.gca().set_rasterized(True)
fig.savefig(os.path.join(save_dir,
f"{prefix}_learning_curve_{'_'.join(ALGS)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
plt.close(fig)
import argparse
import json
import numpy as np
import os
from Job.JobBuilder import default_params
from Registry.AlgRegistry import alg_dict
from utils import create_name_for_save_load
def make_res_path(alg, exp):
return os.path.join(os.getcwd(), 'Results', exp, alg)
def make_exp_path(alg, exp):
return os.path.join(os.getcwd(), 'Experiments', exp, alg)
def load_best_rerun_params_dict(alg, exp, auc_or_final, sp):
res_path = make_res_path(alg, exp)
with open(os.path.join(res_path, f"{auc_or_final}_{sp}.json")) as f:
return json.load(f)['meta_parameters']
def get_alphas(alg, exp):
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f"{alg}.json")
with open(exp_path) as f:
jsn_content = json.load(f)
return jsn_content['meta_parameters']['alpha']
def load_best_rerun_params(alg, exp, auc_or_final, sp):
best_res_dict = load_best_rerun_params_dict(alg, exp, auc_or_final, sp)
best_fp = best_res_dict.get('alpha', 0)
best_tp = best_res_dict.get('eta', best_res_dict.get('beta', 0))
best_fop = best_res_dict.get('tdrc_beta', 0)
return best_fp, best_tp, best_fop
def make_args():
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', '-n', type=str, default='1HVFourRoom')
# 1HVFourRoom or FirstFourRoom or FirstChain
return parser.parse_args()
def rename_best_old_result(res_path, params_dict, file_name):
name_to_save = create_name_for_save_load(param_dict=params_dict)
path_and_name = os.path.join(res_path, name_to_save)
file_name = path_and_name + file_name
os.rename(file_name + '.npy', file_name + '_old.npy')
def load_best_perf_json(alg, exp, sp, auc_or_final):
res_path = make_res_path(alg, exp)
res_path = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(res_path, 'r') as f:
return json.load(f)
def load_exp_json_file(alg, exp):
res_path = make_res_path(alg, exp)
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
with open(exp_path) as f:
return json.load(f), res_path
def make_params(alg_name, exp_name):
params = dict()
alg_param_names = alg_dict[alg_name].related_parameters()
json_content, res_path = load_exp_json_file(alg_name, exp_name)
json_exp_params = json_content.get('meta_parameters')
for param in alg_param_names:
params[param] = json_exp_params.get(param, default_params['meta_parameters'][param])
if not isinstance(params[param], list):
params[param] = list([params[param]])
fp_list = params.get('alpha', params['alpha'])
tp_list = [0.0]
fop_list = [0.0]
if 'lmbda' in params:
sp_list = params['lmbda']
else:
sp_list = params['zeta']
if 'eta' in params:
tp_list = params['eta']
elif 'beta' in params:
tp_list = params['beta']
if 'tdrc_beta' in params:
fop_list = params['tdrc_beta']
if alg_name == 'TDRC':
tp_list, fop_list = [1.0], [1.0]
return fp_list, sp_list, tp_list, fop_list, res_path
def make_current_params(alg_name, sp, tp, fop, fp=0):
current_params = {'alpha': fp}
alg_param_names = alg_dict[alg_name].related_parameters()
if 'lmbda' in alg_param_names:
current_params['lmbda'] = sp
else:
current_params['zeta'] = sp
if 'eta' in alg_param_names:
current_params['eta'] = tp
elif 'beta' in alg_param_names:
current_params['beta'] = tp
if 'tdrc_beta' in alg_param_names:
current_params['tdrc_beta'] = fop
return current_params
def get_alg_names(exp_name):
path = os.path.join(os.getcwd(), 'Experiments', exp_name)
alg_names = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
return alg_names
def load_sample_json_for_exp(exp):
alg = get_alg_names(exp)[0]
exp_path = make_exp_path(alg, exp)
exp_path = os.path.join(exp_path, f'{alg}.json')
if not os.path.exists(exp_path):
print('No algorithms exist in the experiment directory...')
raise FileExistsError
with open(exp_path) as f:
json_exp_params = json.load(f)
return json_exp_params
def load_and_replace_large_nan_inf(load_file_name, large, replace_with):
current_perf = np.load(load_file_name)
return replace_large_nan_inf(current_perf, large=large, replace_with=replace_with)
class FirstChainAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 5000, 10000, 15000, 20000]
self.x_tick_labels = [0, '5', '10', '15', '20']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.68910
self.ok_error = 0.4
class FirstFourRoomAttr:
def __init__(self, exp_name):
json_exp_params = load_sample_json_for_exp(exp_name)
self.size_of_labels = 25
self.y_lim = [0.0, 0.8]
self.x_lim = [0.0, json_exp_params['number_of_steps']]
self.y_axis_ticks = [0.1, 0.3, 0.5, 0.7]
self.x_axis_ticks = [0.0, 10000, 20000, 30000, 40000, 50000]
self.x_tick_labels = [0, '10', '20', '30', '40', '50']
self.x_axis_ticks_log = [pow(2, -18), pow(2, -14), pow(2, -10), pow(2, -6), pow(2, -2)]
self.x_axis_tick_labels_log = [-16, -13, -10, -7, -4, -1]
self.over_limit_replacement = 2.0
self.over_limit_waterfall = 0.79
self.learning_starting_point = 0.72672
self.ok_error = 0.4
class HVFirstFourRoomAttr(FirstFourRoomAttr):
def __init__(self, exp_name):
super(HVFirstFourRoomAttr, self).__init__(exp_name)
def replace_large_nan_inf(arr, large=1.0, replace_with=2.0):
arr[np.isnan(arr)], arr[np.isinf(arr)], arr[arr > large] = replace_with, replace_with, replace_with
return arr
import os
import matplotlib.pyplot as plt
import numpy as np
from Plotting.plot_params import EXPS, ALG_GROUPS, ALG_COLORS, EXP_ATTRS, AUC_AND_FINAL, LMBDA_AND_ZETA, PLOT_RERUN, \
RERUN_POSTFIX
from Plotting.plot_utils import make_current_params, replace_large_nan_inf, make_params
from utils import create_name_for_save_load
np.random.seed(0)
def load_all_performances(alg, exp, auc_or_final, sp, exp_attrs):
fp_list, sp_list, tp_list, fop_list, res_path = make_params(alg, exp)
all_performance = np.zeros((len(fp_list), len(tp_list), len(fop_list)))
for i, fop in enumerate(fop_list):
for j, tp in enumerate(tp_list):
current_params = make_current_params(alg, sp, tp, fop)
load_file_name = os.path.join(res_path, create_name_for_save_load(
current_params, excluded_params=['alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
if PLOT_RERUN and auc_or_final == 'auc':
load_file_name_rerun = load_file_name.replace('.npy', f"{RERUN_POSTFIX}.npy")
if os.path.isfile(load_file_name_rerun):
load_file_name = load_file_name_rerun
performance = np.load(load_file_name)
performance = replace_large_nan_inf(performance, large=exp_attrs.learning_starting_point,
replace_with=exp_attrs.over_limit_waterfall)
all_performance[:, j, i] = performance
return all_performance
def plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs):
global ticker, x_axis_names, x_axis_ticks
performance_to_plot = np.array(all_performance.flatten())
percentage_overflowed = round((performance_to_plot > exp_attrs.learning_starting_point).sum() /
performance_to_plot.size, 2)
ok_percentage = round((performance_to_plot < exp_attrs.ok_error).sum() /
performance_to_plot.size, 2)
print(alg, 'percentage_overflowed', percentage_overflowed)
# print(alg, 'OK_percentage', ok_percentage)
color = ALG_COLORS[alg]
ax.scatter([(ticker + 1)] * performance_to_plot.shape[0] + np.random.uniform(
-0.25, 0.25, performance_to_plot.shape[0]), performance_to_plot, marker='o',
facecolors='none', color=color)
x_axis_ticks.append(ticker + 1)
ticker = (ticker + 1) % len(alg_names)
ax.tick_params(
axis='x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom=False, # ticks along the bottom edge are off
top=False, # ticks along the top edge are off
labelbottom=True) # labels along the bottom edge are off
x_axis_names.append(f'{alg}_{percentage_overflowed}')
ax.xaxis.set_ticks(x_axis_ticks)
ax.set_xticklabels(x_axis_names)
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='y', which='major', labelsize=exp_attrs.size_of_labels)
ax.set_ylim(exp_attrs.y_lim)
ax.yaxis.set_ticks(exp_attrs.y_axis_ticks)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
ticker, x_axis_names, x_axis_ticks = 0.0, [''], [0]
def plot_waterfall_scatter(**kwargs):
for exp in kwargs['exps']:
exp_attrs = EXP_ATTRS[exp](exp)
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
save_dir = os.path.join('pdf_plots', 'waterfalls', auc_or_final)
for alg_names in kwargs['alg_groups'].values():
global ticker, x_axis_names, x_axis_ticks
ticker, x_axis_names, x_axis_ticks = -0.5, [''], [0]
fig, ax = plt.subplots(figsize=kwargs['fig_size'])
for alg in alg_names:
if alg in ['LSTD', 'LSETD']:
continue
all_performance = load_all_performances(alg, exp, auc_or_final, sp, exp_attrs)
plot_waterfall(ax, alg, all_performance, alg_names, exp_attrs)
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
prefix = RERUN_POSTFIX if PLOT_RERUN else ''
fig.savefig(os.path.join(save_dir,
f"{prefix}_waterfall_curve_{'_'.join(alg_names)}{exp}Lmbda{sp}.pdf"),
format='pdf', dpi=1000, bbox_inches='tight')
plt.show()
print(exp, alg_names, auc_or_final, sp)
import os
import numpy as np
import matplotlib.pyplot as plt
class ValueFunctionProcessor:
def __init__(self, exp, alg):
result_dir = os.path.join(os.getcwd(), 'Results', exp, alg, 'Sample_value_function')
self.all_value_functions = dict()
self.all_value_functions_of_last_step = dict()
for value_function_name in os.listdir(result_dir):
value_function = np.load(os.path.join(result_dir, value_function_name))
step, run_num = (int(i) for i in value_function_name.replace('.npy', '').split('_'))
self.all_value_functions[(step, run_num)] = value_function
if (step == 19999 and exp == 'FirstChain') or (step == 49999 and exp == 'FirstFourRoom') or (
step == 49999 and exp == '1HVFourRoom'):
self.all_value_functions_of_last_step[run_num] = value_function
def get_value_function_by_step_and_run(self, step, run):
return self.all_value_functions[(step, run)]
def get_value_function_for_last_step(self, run):
return self.all_value_functions_of_last_step[run]
# STEPS = [199, 999, 1999, 3999, 9999, 19999]
STEPS = [199, 1999, 19999]
# STEPS = [19999]
RUNS = [0, 10, 15, 20, 30, 45]
# RUNS = list(range(50))
EXPS = ['FirstChain'] # FirstChain or FirstFourRoom or 1HVFourRoom
ALGS = ['TD']
TASK = 'EightStateCollision'
def plot_value_function(ax, value_function, step=0, run=0, is_last_step=False):
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(0, 1.0)
label = f"{step}_{run}"
line_style = '-'
line_width = 4
alpha = 1.0
color = 'blue'
if not step:
line_style = '--'
if not step and is_last_step:
line_style = '-'
if is_last_step:
line_width = 2
alpha = 0.2
color = 'red'
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha, color=color)
else:
ax.plot(value_function, label=label, linewidth=line_width, linestyle=line_style, alpha=alpha)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.spines['left'].set_linewidth(2)
ax.spines['bottom'].set_linewidth(2)
def plot_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
for run in RUNS:
fig, ax = plt.subplots(figsize=(8, 3))
for step in STEPS:
value_function = value_processor.get_value_function_by_step_and_run(step, run)
plot_value_function(ax, value_function, step, run)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"{run}_value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
def plot_all_final_value_functions():
for exp in EXPS:
save_dir = os.path.join('pdf_plots', 'value_functions', 'asymptotic_value_functions')
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
true_value_function = np.load(os.path.join(os.getcwd(), 'Resources', TASK, 'state_values.npy'))
for alg in ALGS:
value_processor = ValueFunctionProcessor(exp, alg)
fig, ax = plt.subplots(figsize=(8, 3))
for run in range(50):
value_function = value_processor.get_value_function_for_last_step(run)
plot_value_function(ax, value_function, is_last_step=True)
plot_value_function(ax, true_value_function)
fig.savefig(os.path.join(save_dir, f"value_function_{alg}_{exp}.pdf"),
format='pdf', dpi=200, bbox_inches='tight')
plt.show()
<p align="center">
<img width="100" src="/Assets/rlai.png" />
</p>
<br>
<div align="center">
:steam_locomotive::train::train::train::train::train:
</div>
<h2 align=center>An Empirical Comparison of Off-policy Prediction Learning Algorithms on the Collision Task</h2>
This repository includes the code for the "empirical off-policy" paper.
<br>
<p align="center">
<img src="/Assets/FourRoomGridWorld.gif" />
<img src="/Assets/chain.gif" />
</p>
## Table of Contents
- **[Specification of Dependencies](#specifications)**
- **[Algorithms](#algorithms)**
- **TD**: [Off-policy TD](#td)
- **Gradient-TD family** : [GTD](#gtd), [GTD2](#gtd2), [HTD](#htd), [PGTD2](#pgtd2), [TDRC](#tdrc)
- **Emphatic-TD family** : [Emphatic TD](#etd), [Emphatic TDβ](#etdb)
- **Variable-λ family** : [TB](#tb), [Vtrace](#vtrace), [ABTD](#abtd)
- **[Algorithm Glossary](#glossary)**
- **[Environments](#environment)** : [Chain](#chain_env), [Four Room Grid World](#four_room_grid_world)
- **[How to run the code](#how-to-run)**: [Learning.py](#learning.py), [Job Builder](#job_builder)
- **[Plotting the results](#Plot-results)**
<a name='specifications'></a>
## Specification of Dependencies
This code requires Python 3.5 or above. The packages required for running the code are all listed in the `requirements.txt`
file. To install these dependencies, run the following command if your `pip` is set to `python3.x`:
```text
pip install -r requirements.txt
```
otherwise, run:
```text
pip3 install -r requirements.txt
```
<a name='algorithms'></a>
## Algorithms
Each algorithm learns a weight vector, [**w**](#var_w), such that the dot product of [**w**](#var_w) and a state's feature
vector approximates the value of that state.
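For instance, with linear function approximation the estimated value of a state is simply the inner product of the learned
weights and that state's feature vector. A minimal sketch (the variable names follow the [glossary](#glossary) below):
```python
import numpy as np

w = np.zeros(6)                          # learned weight vector (six features per state in the Collision task)
x = np.array([0., 0., 1., 0., 1., 1.])   # binary feature vector of one state
v_hat = np.dot(w, x)                     # approximate value of that state
```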
<a name='td'></a>
### Off-policy TD
**Paper** [Off-Policy Temporal-Difference Learning with Function Approximation](
https://www.cs.mcgill.ca/~dprecup/publications/PSD-01.pdf)<br>
**Authors** Doina Precup, Richard S. Sutton, Sanjoy Dasgupta<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * delta * z
```
### Gradient-TD algorithms
<a name='gtd'></a>
#### GTD/TDC
**Paper** [Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation](
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)<br>
**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri,
Eric Wiewiora<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x)
```
<a name='gtd2'></a>
#### GTD2
**Paper** [Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation](
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.6170&rep=rep1&type=pdf)<br>
**Authors** Richard S. Sutton, Hamid Reza Maei, Doina Precup, Shalabh Bhatnagar, David Silver, Csaba Szepesvàri,
Eric Wiewiora<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (np.dot(x, v) * x - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x)
```
<a name='htd'></a>
#### HTD
**Paper** [Investigating Practical Linear Temporal Difference Learning](
https://arxiv.org/pdf/1602.08771.pdf)<br>
**Authors** Adam White, Martha White<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
z_b = gamma * lmbda * z_b + x
w += alpha * ((delta * z) + (x - gamma * x_p) * np.dot((z - z_b), v))
v += alpha_v * ((delta * z) - (x - gamma * x_p) * np.dot(v, z_b))
```
<a name='pgtd2'></a>
#### Proximal GTD2
**Paper** [Proximal Gradient Temporal Difference Learning: Stable Reinforcement Learning with Polynomial Sample Complexity](
https://arxiv.org/pdf/2006.03976.pdf)<br>
**Authors** Bo Liu, Ian Gemp, Mohammad Ghavamzadeh, Ji Liu, Sridhar Mahadevan, Marek Petrik<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
v_mid = v + alpha_v * (delta * z - np.dot(x, v) * x)
w_mid = w + alpha * (np.dot(x, v) * x - (1 - lmbda) * gamma * np.dot(z, v) * x_p)
delta_mid = r + gamma * np.dot(w_mid, x_p) - np.dot(w_mid, x)
w += alpha * (np.dot(x, v_mid) * x - gamma * (1 - lmbda) * np.dot(z, v_mid) * x_p)
v += alpha_v * (delta_mid * z - np.dot(x, v_mid) * x)
```
<a name='tdrc'></a>
#### TDRC
**Paper** [Gradient Temporal-Difference Learning with Regularized Corrections](
http://proceedings.mlr.press/v119/ghiassian20a/ghiassian20a.pdf)<br>
**Authors** Sina Ghiassian, Andrew Patterson, Shivam Garg, Dhawal Gupta, Adam White, Martha White <br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = rho * (gamma * lmbda * z + x)
w += alpha * (delta * z - gamma * (1 - lmbda) * np.dot(z, v) * x_p)
v += alpha_v * (delta * z - np.dot(x, v) * x) - alpha_v * tdrc_beta * v
```
### Emphatic-TD algorithms
<a name='etd'></a>
#### Emphatic TD
**Paper** [An Emphatic Approach to the Problem of Off-policy Temporal-Difference Learning](
https://jmlr.org/papers/volume17/14-488/14-488.pdf)<br>
**Authors** Richard S. Sutton, A. Rupam Mahmood, Martha White<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
F = gamma * old_rho * F + 1
m = lmbda * 1 + (1 - lmbda) * F
z = rho * (x * m + gamma * lmbda * z)
w += alpha * delta * z
```
<a name='etdb'></a>
#### Emphatic TDβ
**Paper** [Generalized Emphatic Temporal Difference Learning: Bias-Variance Analysis](
https://ojs.aaai.org/index.php/AAAI/article/view/10227/10086)<br>
**Authors** Assaf Hallak, Aviv Tamar, Remi Munos, Shie Mannor<br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
F = beta * old_rho * F + 1
m = lmbda * 1 + (1 - lmbda) * F
z = rho * (x * m + gamma * lmbda * z)
w += alpha * delta * z
```
### Variable-λ algorithms
<a name='tb'></a>
#### Tree backup/ Tree backup for prediction
**Paper** [Eligibility Traces for Off-Policy Policy Evaluation](
https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1079&=&context=cs_faculty_pubs&=&sei-redir=1&referer=https%253A%252F%252Fscholar.google.com%252Fscholar%253Fhl%253Den%2526as_sdt%253D0%25252C5%2526q%253Dtree%252Bbackup%252Balgorithm%252Bdoina%252Bprecup%2526btnG%253D#search=%22tree%20backup%20algorithm%20doina%20precup%22)<br>
**Authors** Doina Precup, Richard S. Sutton, Satinder Singh<br>
The algorithm pseudo-code described below is the prediction variant of the original Tree backup algorithm proposed by
Precup, Sutton, and Singh (2000). The prediction variant of the algorithm used here is first derived in the current paper.
```python
delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x))
z = gamma * lmbda * old_pi * z + x
w = w + alpha * delta * z
```
<a name='vtrace'></a>
#### Vtrace (simplified)
**Paper** [IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures](
http://proceedings.mlr.press/v80/espeholt18a/espeholt18a.pdf)<br>
**Authors** Lasse Espeholt, Hubert Soyer, Remi Munos, Karen Simonyan, Volodymyr Mnih, Tom Ward, Yotam Doron, Vlad Firoiu, Tim Harley, Iain Dunning, Shane Legg, Koray Kavukcuoglu <br>
```python
delta = r + gamma * np.dot(w, x_p) - np.dot(w, x)
z = min(1, rho) * (gamma * lmbda * z + x)
w += alpha * delta * z
```
<a name='abtd'></a>
#### ABQ/ABTD
**Paper** [Multi-step Off-policy Learning Without Importance Sampling Ratios](
https://arxiv.org/pdf/1702.03006)<br>
**Authors** A. Rupam Mahmood, Huizhen Yu, Richard S. Sutton <br>
The algorithm pseudo-code described below is the prediction variant of the original ABQ algorithm proposed by
Mahmood, Yu, and Sutton (2017). The prediction variant of the algorithm used here is first derived in the current paper.
This algorithm first needs to compute the following:
```python
xi_zero = 1
xi_max = 2
xi = 2 * zeta * xi_zero + max(0, 2 * zeta - 1) * (xi_max - 2 * xi_zero)
```
`xi_zero` and `xi_max` are specifically computed here for the Collision problem.
To see how these are computed for the task see the original paper referenced above.
```python
nu = min(xi, 1.0 / max(pi, mu))
delta = rho * (r + gamma * np.dot(w, x_p) - np.dot(w, x))
z = x + gamma * old_nu * old_pi * z
w += alpha * delta * z
```
<a name='glossary'></a>
### Algorithm Glossary
Here, we briefly explain all the symbols and variable names that we use in our implementation.
#### meta-parameters
- Common parameters of all algorithms:
- alpha (α): is the step size that defines how much the weight vector [**w**](#var_w) is updated at each time step.
- lambda (λ): is the bootstrapping parameter.
- Common parameters of Gradient-TD algorithms:
- alpha_v (α<sub>v</sub>): is the second step size that defines how much the second weight vector [**v**](#var_v) is
updated at each time step.
- beta (β): is the parameter used by the [**ETDβ**](#etdb) algorithm that defines how much the product of importance sampling ratios
from the past affects the current update.
- tdrc_beta (tdrc<sub>β</sub>): is the regularization parameter of the [**TDRC**](#tdrc) algorithms. This parameter is often set to 1.
- zeta (ζ): is only used in the [**ABTD**](#abtd) algorithm. It is similar to the bootstrapping parameter of other algorithms.
#### Algorithms variables
<a name='var_w'></a>
- **w**: is the main weight vector being learned. ```init: w=0```.
<a name='var_v'></a>
- **v**: is the secondary weight vector learned by Gradient-TD algorithms. ```init: v=0```.
<a name='var_z'></a>
- **z**: is the eligibility trace vector. ```init: z=0```.
<a name='var_zb'></a>
- **z<sub>b</sub>**: is the extra eligibility trace vector used by [**HTD**](#htd). ```init: z_b=0```.
<a name='var_delta'></a>
- delta (𝛿): is the td-error, which, in the full bootstrapping case, is equal to the reward plus the discounted value of
the next state minus the value of the current state.
<a name='var_s'></a>
- s: is the current state (scalar).
<a name='var_x'></a>
- **x**: is the feature vector of the current state.
<a name='var_s_p'></a>
- s_p: is the next state (scalar).
<a name='var_x_p'></a>
- **x_p**: is the feature vector of the next state.
<a name='var_r'></a>
- r: is the reward.
<a name='var_rho'></a>
- rho (ρ): is the importance sampling ratio, which is equal to the probability of taking an action under the target policy
divided by the probability of taking the same action under the behavior policy.
<a name='var_oldrho'></a>
- old_rho (oldρ): is the importance sampling ratio at the previous time step.
<a name='var_pi'></a>
- pi (π): is the probability of taking an action under the target policy at the current time step.
<a name='var_oldpi'></a>
- old_pi (oldπ): is the probability of taking an action under the target policy at the previous time step.
<a name='var_F'></a>
- F : is the follow-on trace used by [Emphatic-TD](#etd) algorithms.
<a name='var_m'></a>
- m : is the emphasis used by [Emphatic-TD](#etd) algorithms.
<a name='var_nu'></a>
- nu (ν): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation.
<a name='var_si'></a>
- xi (ψ): Variable used by the ABQ/ABTD algorithm. Please refer to the [original paper](https://arxiv.org/pdf/1702.03006) for explanation.
<a name='var_mu'></a>
- mu (μ): is the probability of taking an action under the behavior policy at the current time step.
<a name='var_oldmu'></a>
- old_mu (oldμ): is the probability of taking an action under the behavior policy at the previous time step.
- gamma (γ): is the discount factor parameter.
<a name='environment'></a>
## Environment
At the heart of an environment is an MDP.
The MDP defines the states, actions, rewards, transition probability matrix, and the discount factor.
<a name="chain_env"></a>
### Chain Environment and the Collision Task
<br>
<p align="center">
<img width="800" src="/Assets/eight_state_collision.png" />
</p>
<br>
An MDP with eight states is at the heart of the task.
The agent starts in one of the four leftmost states with equal probability.
One action is available in the four leftmost states: forward. Two actions are available in the four rightmost states:
forward and turn. By taking the forward action, the agent transitions one state to the right, and by taking the turn
action, it moves away from the wall and transitions to one of the four leftmost states equiprobably. Rewards are all
zero except for taking forward in state 8, for which a +1 is emitted. The termination function (discount factor) returns
0.9 for all transitions except for taking turn in any state or taking forward in state 8, for which it returns zero.
```python
import numpy as np

from Environments.Chain import Chain

env = Chain()
env.reset() # returns to one of the four leftmost states with equal probability.
for step in range(1, 1000):
action = np.random.randint(0, 2) # forward=0, turn=1
sp, r, is_wall = env.step(action=action)
if is_wall:
env.reset()
```
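The termination (discounting) behavior described above can be summarized in a small helper. This is only an illustrative
sketch under the conventions of the snippet above (forward=0, turn=1, states indexed 0 to 7); the task class implements it
through its `GAMMA` attribute and the environment's wall signal:
```python
def termination(s, action, forward=0, turn=1, last_state=7):
    # 0.9 for ordinary transitions; 0 when turning anywhere or when taking
    # forward from state 8 (index 7), i.e. when the agent hits the wall.
    if action == turn or (action == forward and s == last_state):
        return 0.0
    return 0.9
```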
We applied eleven algorithms to the Collision task: Off-policy TD(λ), GTD(λ), GTD2(λ), HTD(λ), Proximal GTD2(λ), TDRC(λ)
, ETD(λ), ETD(λ,β), Tree Backup(λ), Vtrace(λ), ABTD(ζ). The target policy was π(forward|·) = 1.0. The behavior policy
was b(forward|·) = 1.0 for the four leftmost states and b(forward|·) = 0.5, b(turn|·) = 0.5 for the four rightmost
states. Each algorithm was applied to the task with a range of parameters. We refer to an algorithm with a specific
parameter setting as an instance of that algorithm. Each algorithm instance was applied to the Collision task for
20,000 time steps, which we call a run. We repeated the 20,000 time steps for fifty runs. All instances of all
algorithms experienced the same fifty trajectories.
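With these policies the importance sampling ratio has a simple closed form. The sketch below is only for illustration (the
repository computes it from the task's `get_pi` and `get_mu` methods), assuming states are indexed 0 to 7 and forward=0:
```python
def rho(s, action, forward=0):
    # pi(a|s) / b(a|s): the target policy always takes forward; the behavior
    # policy takes forward with probability 1 in the four leftmost states and
    # with probability 0.5 in the four rightmost states.
    pi = 1.0 if action == forward else 0.0
    b = 1.0 if s < 4 else 0.5
    return pi / b
```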
Linear function approximation was used to approximate the true value function. Each state was represented by a
six-dimensional binary feature vector. The feature representation of each state had exactly three zeros and three ones.
The locations of the zeros and ones were selected randomly. This selection was repeated at the beginning of each run,
so the representation used in one run most probably differs from the other runs. At the beginning of each run we set
**w**<sub>0</sub> = **0**, and thus the error is the same for all algorithms at the beginning of the runs.
#### Feature representation
The feature representation for the Collision task is an array of size `8, 6, 50`, where 8 corresponds to the number of
states, 6 corresponds to the number of features for each state, and 50 corresponds to the number of runs.
The feature representations used for the set of results presented here and in the paper are saved in:
```
Resources/EightStateCollision/feature_rep.npy
```
Note that the feature representation for each run is different in the Collision task.
For example, the feature representation for the first run is:
```
array([[0., 0., 1., 0., 1., 1.],
[1., 1., 1., 0., 0., 0.],
[0., 1., 1., 0., 0., 1.],
[1., 0., 1., 1., 0., 0.],
[1., 1., 0., 0., 1., 0.],
[0., 1., 1., 1., 0., 0.],
[1., 1., 0., 0., 0., 1.],
[1., 0., 1., 0., 0., 1.]])
```
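To get the features of a particular run, index the last dimension of the array. A minimal sketch:
```python
import numpy as np

features = np.load('Resources/EightStateCollision/feature_rep.npy')  # shape (8, 6, 50)
run_features = features[:, :, 0]  # the 8 x 6 binary feature matrix used by the first run
```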
#### State distribution induced by the behavior policy
To compute an approximation of the mean squared value error at each time step, the state weighting induced by the
behavior policy was approximated by following the behavior policy for 20,000,000 time steps and computing the fraction
of time spent in each state. The resulting distribution is saved in:
```
Resources/EightStateCollision/d_mu.npy
```
`d_mu.npy` is a one dimensional numpy array of size `8`:
```
array([0.05715078, 0.1142799 , 0.17142456, 0.22856842, 0.22856842, 0.11428067, 0.05715311, 0.02857415])
```
#### True state values
To compute an approximation of the mean squared value error at each time step, we need the true state values.
Luckily, for the Collision task, these values are easy to compute.
We computed these true values by following the target policy from each state to the wall once.
The resulting values are saved in:
```
Resources/EightStateCollision/state_values.npy
```
`state_values.npy` is a one dimensional numpy array of size `8`:
```
array([0.4782969, 0.531441, 0.59049, 0.6561, 0.729, 0.81, 0.9, 1])
```
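Putting the three resource files together, the root mean squared value error for a given weight vector can be computed as
below. This is only a sketch of the error measure described above, not the repository's own evaluation code; `w` stands for
whatever weights a learning algorithm currently holds:
```python
import numpy as np

feature_rep = np.load('Resources/EightStateCollision/feature_rep.npy')[:, :, 0]  # features of the first run
d_mu = np.load('Resources/EightStateCollision/d_mu.npy')                         # state weighting under the behavior policy
state_values = np.load('Resources/EightStateCollision/state_values.npy')         # true state values

w = np.zeros(feature_rep.shape[1])  # e.g. the initial weight vector w_0 = 0
v_hat = feature_rep @ w             # approximate value of each of the eight states
rmsve = np.sqrt(np.sum(d_mu * (v_hat - state_values) ** 2))
print(rmsve)                        # with w = 0 this is the starting error shared by all algorithms
```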
<a name='how-to-run'></a>
## How to Run the Code
The code can be run in two different ways.
One way is through `Learning.py`, which can be used to run small experiments on a local computer.
The other way is through the files inside the Job directory.
We explain each of these approaches below by means of an example.
### Running on Your Local Machine
Let's take the following example: applying Off-policy TD(λ) to the Collision task.
There are multiple ways of doing this.
The first way is to open a terminal, go into the root directory of the code, and run `Learning.py` with the proper parameters:
```
python3 Learning.py --algorithm TD --task EightStateCollision --num_of_runs 50 --num_steps 20000 --environment Chain
--save_value_function True --alpha 0.01 --lmbda 0.9
```
In case any of the parameters are not specified, a default value will be used.
The default values are set in the `Job` directory, inside the `JobBuilder.py` file.
This means that the code can alternatively be run by setting all the values that an algorithm needs at the top of the `JobBuilder.py` file.
Note that not all parameters specified in the `default_params` dict are required for all algorithms. For example, the `tdrc_beta` parameter
only needs to be set for the TDRC(λ) algorithm.
Once the variables inside the `default_params` dictionary are set, the code can be run with:
```
python3 Learning.py
```
Or one can choose to specify some parameters in the `default_params` dictionary and specify the rest as command line arguments
like the following:
```
python3 Learning.py --algorithm TD --task EightStateCollision --alpha 0.01
```
### Running on Servers with Slurm Workload Managers
When parameter sweeps are necessary, the code can be run on supercomputers.
The current code supports running on servers that use Slurm workload managers, such as Compute Canada.
For example, to apply the TD algorithm to the Collision (EightStateCollision) task with various parameters,
first you need to create a json file that specifies all the parameters that you would like to run, for example:
```json
{
"agent": "TD",
"environment": "Chain",
"task": "EightStateCollision",
"number_of_runs": 50,
"number_of_steps": 20000,
"sub_sample": 1,
"meta_parameters": {
"alpha": [
0.000003814, 0.000007629, 0.000015258, 0.000030517, 0.000061035, 0.000122070, 0.000244140, 0.000488281,
0.000976562, 0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0
],
"lmbda": [
0.1, 0.2, 0.3
]
}
}
```
and then run `main.py` using python:
```
python3 main.py -f <path_to_the_json_file> -s <kind_of_submission>
```
where `kind_of_submission` refers to one of the two ways you can submit your code:
1) You can request an individual cpu for each of the algorithm instances, where an algorithm instance refers to an
algorithm with specific parameters. To request an individual cpu, run the following command:
```
python3 main.py -f <path_to_the_json_file_or_dir> -s cpu
```
When running each algorithm instance on a single cpu, you need to specify the following parameters inside
`Job/SubmitJobsTemplatesCedar.SL`:
```shell
#SBATCH --account=xxx
#SBATCH --time=00:15:58
#SBATCH --mem=3G
```
where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
`#SBATCH --time=00:15:58` requires the time you want to request for each individual cpu,
and `#SBATCH --mem=xG` requires the amount of memory in place of x.
2) You can request a node, which we assume includes 40 cpus. If you request a node, the jobs you submit will run in
parallel 40 at a time, and once one job is finished, the next one in line will start running.
This process continues until either all jobs are finished running, or you run out of the time you requested for that node.
```
python3 main.py -f <path_to_the_json_file_or_dir> -s node
```
When running the jobs on nodes, you need to specify the following parameters inside `Job/SubmitJobsTemplates.SL`:
```shell
#SBATCH --account=xxx
#SBATCH --time=11:58:59
#SBATCH --nodes=x
#SBATCH --ntasks-per-node=40
```
where `#SBATCH --account=xxx` requires the account you are using in place of `xxx`,
`#SBATCH --time=11:58:59` requires the time you want to request for each individual node, each of which includes 40 cpus in this case,
and `#SBATCH --nodes=x` requires the number of nodes you would like to request in place of x.
If you request more than one node, your jobs will be spread across nodes, 40 on each node, and once each job finishes,
the next job in the queue will start running.
`#SBATCH --ntasks-per-node=xx` is the number of jobs you would like to run concurrently on a single node. In this case,
for example, we set it to 40.
If `path_to_the_json_file_or_dir` is a directory, then the code will walk into all the subdirectories and submit jobs for
all the parameters in the json files that it finds inside those directories sequentially.
If `path_to_the_json_file_or_dir` is a file, then the code will submit jobs for all the parameters that it finds inside that
single json file.
Note that you can create a new directory for each experiment that you would like to run, and create directories for each
of the algorithms you would like to run in that experiment.
For example, we created a directory called `FirstChain` inside the `Experiments` directory and created one directory
per algorithm inside the `FirstChain` directory for each of the algorithms and specified a json file in that directory.
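The resulting layout looks roughly like this (a sketch of the structure described above; each algorithm directory contains
one json file named after the algorithm):
```text
Experiments/
    FirstChain/
        TD/
            TD.json
        GTD/
            GTD.json
        ...
```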
It is worth noting that any parameter that is not specified in the json file will be read from the `default_params`
dictionary inside the `JobBuilder.py` file in the `Job` directory.
<a name='Plot-results'></a>
## Plotting the results
The following table shows all the parameters that we tested in the experiments:
<p align="center">
<img width="700" src="/Assets/parameters.png" />
</p>
We now explain how each figure in the paper can be reproduced.
All the figures of the paper can be reproduced using the `plot_data.py` file, once you run the Learning.py script with all the needed parameters.
If you do not have the results available, the `plot_data.py` script will return an error.
1) **Processing the data**: This step prepares the data so that it can be plotted over step sizes and as learning
curves averaged over runs.
The `process_data` script also re-runs the algorithms with their best parameters to eliminate possible maximization
bias, as explained in the paper.
This is a time-consuming step. If you do not want to do this step, simply set:
```python
PLOT_RERUN = False
```
in `Plotting/plot_params.py` and the code will ignore the re-running steps.
If you would like to eliminate the maximization bias, set:
```python
PLOT_RERUN = True
```
Finally, go to `plot_data.py` and set `func_to_run = 'process_data'`, and run the `plot_data.py` script.
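In other words, the two settings involved in this step are (as described above):
```python
# Plotting/plot_params.py
PLOT_RERUN = True        # set to False to skip the time-consuming re-runs

# plot_data.py
func_to_run = 'process_data'
```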
2) **Plotting the learned value functions**:
Go to `plot_data`, and set `func_to_run = 'plot_value_functions'` to plot
the learned value functions for some of the runs, or set `func_to_run = 'plot_all_final_value_functions'` to plot the
value functions learned by the last time step of all of the runs in one plot.
<p align="center">
<img src="/Assets/value_functions.png" />
</p>
<br>
3) **Plotting the learning curves with specific parameter values**:
Go to `plot_data`, and set `func_to_run = 'specific_learning_curves_full_bootstrap'`, and run the `plot_data.py`
script.
<br></br>
<p align="center">
<img width="450" src="/Assets/specific_learning_curves.png" />
</p>
<br>
4) **Plotting the parameter studies for step size for all algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_sensitivity_curves_for_many_lambdas'`, and run the script.
<br></br>
<p align="center">
<img src="/Assets/sensitivity_curves_of_all_algs.png" />
</p>
<br>
5) **Plotting the parameter sensitivity study of Emphatic-TD algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_emphatics_sensitivity_full_bootstrap'`, and run the script.
<br></br>
<p align="center">
<img width="550" src="/Assets/Emphatics_sensitivity.png" />
</p>
<br>
6) **Plotting the parameter sensitivity study of Gradient-TD algorithms**:
Go to `plot_data`, and set `func_to_run = 'collision_gradients_sensitivity_full_bootstrap'`, and run the script.
<br></br>
<p align="center">
<img width="850" src="/Assets/Gradients_sensitivity.png" />
</p>
<br>
from Algorithms.TD import TD
from Algorithms.GTD import GTD
from Algorithms.TDRC import TDRC
from Algorithms.GEMETD import GEMETD
from Algorithms.GTD2 import GTD2
from Algorithms.PGTD2 import PGTD2
from Algorithms.HTD import HTD
from Algorithms.ETDLB import ETDLB
from Algorithms.ETD import ETD
from Algorithms.ABTD import ABTD
from Algorithms.Vtrace import Vtrace
from Algorithms.TB import TB
from Algorithms.LSTD import LSTD
from Algorithms.LSETD import LSETD
alg_dict = {'TD': TD, 'Vtrace': Vtrace, 'GTD': GTD, 'ABTD': ABTD, 'ETD': ETD, 'TB': TB, 'GTD2': GTD2, 'HTD': HTD,
'ETDLB': ETDLB, 'PGTD2': PGTD2, 'TDRC': TDRC, 'GEMETD': GEMETD, 'LSTD': LSTD, 'LSETD': LSETD}
# alg_dict = {'TD': TD, 'GTD': GTD, 'GTD2': GTD2, 'PGTD2': PGTD2, 'HTD': HTD, 'TDRC': TDRC, 'ETD': ETD, 'ETDLB': ETDLB,
# 'TB': TB, 'Vtrace': Vtrace, 'ABTD': ABTD, 'LSTD': LSTD, 'LSETD': 'LSETD'}
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
environment_dict = {'FourRoomGridWorld': FourRoomGridWorld, 'Chain': Chain}
from Tasks.EightStateCollision import EightStateCollision
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
task_dict = {'EightStateCollision': EightStateCollision,
'LearnEightPoliciesTileCodingFeat': LearnEightPoliciesTileCodingFeat,
'HighVarianceLearnEightPoliciesTileCodingFeat': HighVarianceLearnEightPoliciesTileCodingFeat}
from abc import abstractmethod
import numpy as np
class BaseTask:
def __init__(self, **kwargs):
self.run_number = kwargs.get('run_number', 0)
self.num_steps = None
self.feature_rep = None
self.stacked_feature_rep = None # If learning more than one target policy at the same time
self.num_features = None
self.GAMMA = None
self.behavior_dist = None
self.state_values = None
self.num_policies = None
self.ABTD_xi_zero = None
self.ABTD_xi_max = None
def stack_feature_rep(self):
stacked_feature_rep = np.zeros((self.num_policies, self.feature_rep.shape[1], self.feature_rep.shape[0]))
for i in range(self.feature_rep.shape[0]):
stacked_x = np.tile(self.feature_rep[i, :], [self.num_policies, 1])
stacked_feature_rep[:, :, i] = stacked_x
return stacked_feature_rep
def get_active_policies(self, s):
...
def get_terminal_policies(self, s):
...
def generate_behavior_dist(self, total_steps):
...
@staticmethod
def num_of_policies():
...
@abstractmethod
def load_feature_rep(self):
...
@abstractmethod
def get_state_feature_rep(self, s):
...
@abstractmethod
def create_feature_rep(self):
...
@abstractmethod
def select_target_action(self, s, policy_id=0):
...
@abstractmethod
def select_behavior_action(self, s):
...
@abstractmethod
def get_pi(self, s, a):
...
@abstractmethod
def get_mu(self, s, a):
...
@abstractmethod
def load_behavior_dist(self):
return self.behavior_dist
@abstractmethod
def load_state_values(self):
return self.state_values
def __str__(self):
return f'task:{type(self).__name__}'
import numpy as np
from Environments.Chain import Chain
from Tasks.BaseTask import BaseTask
class EightStateCollision(BaseTask, Chain):
def __init__(self, **kwargs):
BaseTask.__init__(self, **kwargs)
Chain.__init__(self)
self._resource_root_path = kwargs.get('resource_root_path', 'Resources')
self.N = kwargs.get('n', 8)
self.feature_rep = self.load_feature_rep()
self.num_features = self.feature_rep.shape[1]
self.num_steps = kwargs.get('num_steps', 20000)
self.GAMMA = 0.9
self.behavior_dist = self.load_behavior_dist()
self.state_values = self.load_state_values()
self.num_policies = EightStateCollision.num_of_policies()
self.ABTD_xi_zero = 1
self.ABTD_xi_max = 2
@staticmethod
def num_of_policies():
return 1
def load_feature_rep(self):
return np.load(f'{self._resource_root_path}/{self.__class__.__name__}/feature_rep.npy')[:, :, self.run_number]
def create_feature_rep(self):
num_ones = 3
num_zeros = self.num_features - num_ones
for i in range(self.N):
random_arr = (np.array([0] * num_zeros + [1] * num_ones))
np.random.shuffle(random_arr)
self.feature_rep[i, :] = random_arr
def get_state_feature_rep(self, s):
return self.feature_rep[s, :]
def load_behavior_dist(self):
self.behavior_dist = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/d_mu.npy')
return self.behavior_dist
def load_state_values(self):
self.state_values = np.load(f'{self._resource_root_path}/{self.__class__.__name__}/state_values.npy')
return self.state_values
def select_behavior_action(self, s):
if s < self.N / 2:
return self.RIGHT_ACTION
else:
return np.random.choice([self.RIGHT_ACTION, self.RETREAT_ACTION])
def select_target_action(self, s, policy_id=0):
return self.RIGHT_ACTION
def get_pi(self, s, a):
if a == self.RIGHT_ACTION:
return 1
else:
return 0
def get_mu(self, s, a):
if s < self.N / 2:
if a == self.RIGHT_ACTION:
return 1
else:
return 0
elif s >= self.N / 2:
return 0.5
else:
raise AssertionError
import numpy as np
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
class HighVarianceLearnEightPoliciesTileCodingFeat(LearnEightPoliciesTileCodingFeat):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.RANDOM_PROB = 0.97
def select_behavior_action(self, s):
random_num = np.random.random()
x, y = self.get_xy(s)
if x == 1 and (y == 1 or y == 8):
if random_num < self.RANDOM_PROB:
return self.ACTION_LEFT
else:
return np.random.choice([self.ACTION_UP, self.ACTION_RIGHT, self.ACTION_DOWN])
if x == 8 and (y == 1 or y == 8):
if random_num < self.RANDOM_PROB:
return self.ACTION_RIGHT
else:
return np.random.choice([self.ACTION_UP, self.ACTION_LEFT, self.ACTION_DOWN])
return np.random.choice([self.ACTION_UP, self.ACTION_DOWN, self.ACTION_LEFT, self.ACTION_RIGHT])
def get_mu(self, s, a):
x, y = self.get_xy(s)
if x == 1 and (y == 1 or y == 8):
if a == self.ACTION_LEFT:
return np.ones(self.num_policies) * self.RANDOM_PROB
# return 0.97
else:
return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0)
# return 0.01
if x == 8 and (y == 1 or y == 8):
if a == self.ACTION_RIGHT:
return np.ones(self.num_policies) * self.RANDOM_PROB
# return 0.97
else:
return np.ones(self.num_policies) * ((1 - self.RANDOM_PROB) / 3.0)
# return 0.01
return super().get_mu(s, a)
import numpy as np
import random
from Environments.FourRoomGridWorld import FourRoomGridWorld
from Tasks.BaseTask import BaseTask
from utils import ImmutableDict
class LearnEightPoliciesTileCodingFeat(BaseTask, FourRoomGridWorld):
def __init__(self, **kwargs):
BaseTask.__init__(self)
FourRoomGridWorld.__init__(self)
self.feature_rep = self.load_feature_rep()
self.num_features = self.feature_rep.shape[1]
self.num_steps = kwargs.get('num_steps', 50000)
self.GAMMA = 0.9
self.behavior_dist = self.load_behavior_dist()
self.state_values = self.load_state_values()
self.ABTD_xi_zero = 1
self.ABTD_xi_max = 4
self.optimal_policies = ImmutableDict(
{
0: [
[lambda x, y: 0 <= x <= 3 and 2 <= y <= 4, [self.ACTION_DOWN, self.ACTION_RIGHT]],
[lambda x, y: 3 >= x >= 0 == y, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 4 and y == 1, [self.ACTION_RIGHT]],
[lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_DOWN]],
[lambda x, y: 4 == x and 2 <= y <= 4, [self.ACTION_DOWN]],
[lambda x, y: 4 == x and y == 0, [self.ACTION_UP]]
],
1: [
[lambda x, y: 2 <= x <= 4 and 0 <= y <= 3, [self.ACTION_LEFT, self.ACTION_UP]],
[lambda x, y: x == 0 and 0 <= y <= 3, [self.ACTION_RIGHT, self.ACTION_UP]],
[lambda x, y: x == 1 and 0 <= y <= 4, [self.ACTION_UP]],
[lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_LEFT]],
[lambda x, y: 2 <= x <= 4 and y == 4, [self.ACTION_LEFT]],
[lambda x, y: x == 0 and y == 4, [self.ACTION_RIGHT]],
],
2: [
[lambda x, y: 2 <= x <= 4 and 7 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]],
[lambda x, y: x == 0 and 7 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]],
[lambda x, y: x == 1 and 6 <= y <= 10, [self.ACTION_DOWN]],
[lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_LEFT]],
[lambda x, y: 2 <= x <= 4 and y == 6, [self.ACTION_LEFT]],
[lambda x, y: x == 0 and y == 6, [self.ACTION_RIGHT]],
],
3: [
[lambda x, y: 0 <= x <= 3 and 6 <= y <= 7, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 3 and 9 <= y <= 10, [self.ACTION_DOWN, self.ACTION_RIGHT]],
[lambda x, y: 0 <= x <= 4 and y == 8, [self.ACTION_RIGHT]],
[lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1], [self.ACTION_UP]],
[lambda x, y: x == 4 and 6 <= y <= 7, [self.ACTION_UP]],
[lambda x, y: x == 4 and 9 <= y <= 10, [self.ACTION_DOWN]]
],
4: [
[lambda x, y: 10 >= x >= 7 >= y >= 5, [self.ACTION_LEFT, self.ACTION_UP]],
[lambda x, y: 7 <= x <= 10 and 9 <= y <= 10, [self.ACTION_LEFT, self.ACTION_DOWN]],
[lambda x, y: 6 <= x <= 10 and y == 8, [self.ACTION_LEFT]],
[lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], [self.ACTION_UP]],
[lambda x, y: x == 6 and 5 <= y <= 7, [self.ACTION_UP]],
[lambda x, y: x == 6 and 9 <= y <= 10, [self.ACTION_DOWN]]
],
5: [
[lambda x, y: 6 <= x <= 7 and 6 <= y <= 10, [self.ACTION_RIGHT, self.ACTION_DOWN]],
[lambda x, y: 9 <= x <= 10 and 6 <= y <= 10, [self.ACTION_DOWN, self.ACTION_LEFT]],
[lambda x, y: x == 8 and 5 <= y <= 10, [self.ACTION_DOWN]],
[lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1], [self.ACTION_RIGHT]],
[lambda x, y: 6 <= x <= 7 and y == 5, [self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and y == 5, [self.ACTION_LEFT]]
],
6: [
[lambda x, y: 6 <= x <= 7 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and 0 <= y <= 2, [self.ACTION_UP, self.ACTION_LEFT]],
[lambda x, y: x == 8 and 0 <= y <= 3, [self.ACTION_UP]],
[lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1], [self.ACTION_RIGHT]],
[lambda x, y: 6 <= x <= 7 and y == 3, [self.ACTION_RIGHT]],
[lambda x, y: 9 <= x <= 10 and y == 3, [self.ACTION_LEFT]]
],
7: [
[lambda x, y: 7 <= x <= 10 and 2 <= y <= 3, [self.ACTION_DOWN, self.ACTION_LEFT]],
[lambda x, y: 7 <= x <= 10 and y == 0, [self.ACTION_UP, self.ACTION_LEFT]],
[lambda x, y: 6 <= x <= 10 and y == 1, [self.ACTION_LEFT]],
[lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1], [self.ACTION_DOWN]],
[lambda x, y: x == 6 and 2 <= y <= 3, [self.ACTION_DOWN]],
[lambda x, y: x == 6 and y == 0, [self.ACTION_UP]]
]
}
)
self.default_actions = ImmutableDict(
{
0: self.ACTION_RIGHT,
1: self.ACTION_UP,
2: self.ACTION_DOWN,
3: self.ACTION_RIGHT,
4: self.ACTION_LEFT,
5: self.ACTION_DOWN,
6: self.ACTION_UP,
7: self.ACTION_LEFT
}
)
self.policy_terminal_condition = ImmutableDict(
{
0: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1],
1: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1],
2: lambda x, y: x == self.hallways[1][0] and y == self.hallways[1][1],
3: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1],
4: lambda x, y: x == self.hallways[2][0] and y == self.hallways[2][1],
5: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1],
6: lambda x, y: x == self.hallways[3][0] and y == self.hallways[3][1],
7: lambda x, y: x == self.hallways[0][0] and y == self.hallways[0][1]
}
)
self.num_policies = LearnEightPoliciesTileCodingFeat.num_of_policies()
self.stacked_feature_rep = self.stack_feature_rep()
self._active_policies_cache = {}
@staticmethod
def num_of_policies():
return 8
def get_terminal_policies(self, s):
x, y = self.get_xy(s)
terminal_policies = np.zeros(self.num_policies)
for policy_id, condition in self.policy_terminal_condition.items():
if condition(x, y):
terminal_policies[policy_id] = 1
return terminal_policies
def get_state_index(self, x, y):
return int(y * np.sqrt(self.feature_rep.shape[0]) + x)
def get_probability(self, policy_number, s, a):
x, y = self.get_xy(s)
probability = 0.0
for condition, possible_actions in self.optimal_policies[policy_number]:
if condition(x, y):
if a in possible_actions:
probability = 1.0 / len(possible_actions)
break
return probability
def select_target_action(self, s, policy_id=0):
x, y = self.get_xy(s)
a = self.default_actions[policy_id]
for condition, possible_actions in self.optimal_policies[policy_id]:
if condition(x, y):
a = random.choice(possible_actions)
break
return a
def get_active_policies(self, s):
if s in self._active_policies_cache:
return self._active_policies_cache[s]
x, y = self.get_xy(s)
active_policy_vec = np.zeros(self.num_policies, dtype=int)
for policy_number, policy_values in self.optimal_policies.items():
for (condition, _) in policy_values:
if condition(x, y):
active_policy_vec[policy_number] = 1
break
self._active_policies_cache[s] = active_policy_vec
return active_policy_vec
def load_feature_rep(self):
return np.load(f'Resources/{self.__class__.__name__}/feature_rep.npy')
def get_state_feature_rep(self, s):
return self.feature_rep[s, :]
def create_feature_rep(self):
...
def load_behavior_dist(self):
return np.load(f'Resources/{self.__class__.__name__}/d_mu.npy')
def load_state_values(self):
return np.load(f'Resources/{self.__class__.__name__}/state_values.npy')
def select_behavior_action(self, s):
return np.random.randint(0, self.num_actions)
def get_mu(self, s, a):
return np.ones(self.num_policies) * (1.0 / self.num_actions)
def get_pi(self, s, a):
pi_vec = np.zeros(self.num_policies)
for policy_id, i in enumerate(self.get_active_policies(s)):
if i:
pi_vec[policy_id] = self.get_probability(policy_id, s, a)
return pi_vec
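# Hedged note (added comment, not in the original source): get_pi and get_mu return
# per-policy probability vectors for the same (s, a) pair; the off-policy algorithms in
# this repository presumably combine them elementwise as an importance-sampling ratio, e.g.
# rho = self.get_pi(s, a) / self.get_mu(s, a)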
def generate_behavior_dist(self, total_steps):
final_state_dist = np.zeros((self.num_policies, self.num_states))
s = self.reset()
state_visitation_count = np.zeros(self.num_states)
for step in range(total_steps):
if step % 100000 == 0:
print(step)
state_visitation_count[s] += 1
sp, r, is_terminal, _ = self.step(self.select_behavior_action(s))
s = sp
for s in range(self.num_states):
for policy_id, i in enumerate(self.get_active_policies(s)):
if i:
final_state_dist[policy_id, s] = state_visitation_count[s]
return (final_state_dist / total_steps).T
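# Rough usage sketch (assumption; the step count is borrowed from the commented-out
# HighVariance d_mu snippet near the end of this dump, and the save path mirrors
# load_behavior_dist above):
# task = LearnEightPoliciesTileCodingFeat()
# d_mu = task.generate_behavior_dist(20_000_000)
# np.save('Resources/LearnEightPoliciesTileCodingFeat/d_mu.npy', d_mu)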
import unittest
import numpy as np
from Algorithms.TD import TD
from Environments.Chain import Chain
from Tasks.EightStateCollision import EightStateCollision
class TestTD(unittest.TestCase):
def setUp(self) -> None:
params = {
#'resource_root_path': '../../Resources',
'alpha': 0.001953125,
'lmbda': 0.9,
}
self.env = Chain()
self.task = EightStateCollision(**params)
self.task.reset()
self.alg = TD(task=self.task, **params)
def tearDown(self) -> None:
...
def test_initial_w_is_zero(self):
self.assertEqual(self.alg.w.sum(), 0)
def test_initial_z_is_zero(self):
self.assertEqual(self.alg.z.sum(), 0)
def test_learn_single_policy_rmsve_after_num_steps(self):
rmsve_of_run = np.zeros((self.task.num_policies, self.task.num_steps))
np.random.seed(0)
self.alg.state = self.env.reset()
for step in range(self.task.num_steps):
rmsve_of_run[:, step], error = self.alg.compute_rmsve()
self.alg.action = self.alg.choose_behavior_action()
self.alg.next_state, r, is_terminal, info = self.env.step(self.alg.action)
self.alg.learn(self.alg.state, self.alg.next_state, r, is_terminal)
if is_terminal:
self.alg.state = self.env.reset()
self.alg.reset()
continue
self.alg.state = self.alg.next_state
self.assertTrue(abs(0.08319472840990755 - rmsve_of_run[0, -1]) <= 0.0000001)
import unittest
from Environments.Chain import Chain
class TestChain(unittest.TestCase):
def setUp(self) -> None:
self.env = Chain()
self.env.reset()
def tearDown(self) -> None:
self.env.reset()
def test_reset_initial_state_between_zero_and_three(self):
self.env.reset()
self.assertIn(self.env._state, [0, 1, 2, 3])
def test_step_retreat_move_state_to_initial_state(self):
self.env.reset()
sp, r, is_done, _ = self.env.step(self.env.RETREAT_ACTION)
self.assertEqual(is_done, True)
def test_step_right_move_state_one_step_to_right(self):
self.env.reset()
s = self.env._state
sp, r, is_done, _ = self.env.step(self.env.RIGHT_ACTION)
self.assertEqual(sp - s, 1)
import unittest
from Tasks.EightStateCollision import EightStateCollision
from Environments.Chain import Chain
class TestEightStateCollision(unittest.TestCase):
def setUp(self) -> None:
params = {
#'resource_root_path': '../../Resources'
}
self.experiment = EightStateCollision(**params)
self.experiment.reset()
def tearDown(self) -> None:
...
def test_load_feature_rep_evaluate_shape_is_(self):
feature_rep_arr = self.experiment.load_feature_rep()
self.assertEqual(feature_rep_arr.shape, (8, 6))
def test_get_state_feature_rep_state_for_all_states(self):
expected_states_feature_rep = [
[0., 0., 1., 0., 1., 1.],
[1., 1., 1., 0., 0., 0.],
[0., 1., 1., 0., 0., 1.],
[1., 0., 1., 1., 0., 0.],
[1., 1., 0., 0., 1., 0.],
[0., 1., 1., 1., 0., 0.],
[1., 1., 0., 0., 0., 1.],
[1., 0., 1., 0., 0., 1.]
]
evaluated_states_feature_rep = []
for state in range(self.experiment.N):
evaluated_states_feature_rep.append(list(self.experiment.get_state_feature_rep(state)))
self.assertListEqual(evaluated_states_feature_rep, expected_states_feature_rep)
def test_load_behavior_dist_evaluate_shape_is_(self):
behavior_dist = self.experiment.load_behavior_dist()
self.assertEqual(behavior_dist.shape, (8,))
def test_get_mu_for_right_action_in_initial_state_is_one(self):
mu = self.experiment.get_mu(0, self.experiment.RIGHT_ACTION)
self.assertEqual(mu, 1)
def test_get_mu_for_retreat_action_in_initial_state_is_zero(self):
mu = self.experiment.get_mu(0, self.experiment.RETREAT_ACTION)
self.assertEqual(mu, 0)
def test_get_mu_for_all_action_in_not_initial_state_is_one_half(self):
mu = self.experiment.get_mu(5, self.experiment.RIGHT_ACTION)
self.assertEqual(mu, 0.5)
mu = self.experiment.get_mu(5, self.experiment.RETREAT_ACTION)
self.assertEqual(mu, 0.5)
def test_get_pi_for_right_action_is_one(self):
pi = self.experiment.get_pi(0, self.experiment.RIGHT_ACTION)
self.assertEqual(pi, 1)
def test_get_pi_for_retreat_action_is_zero(self):
pi = self.experiment.get_pi(0, self.experiment.RETREAT_ACTION)
self.assertEqual(pi, 0)
import itertools
import json
import os
from collections import defaultdict
from itertools import zip_longest
from typing import List, Optional, Dict
import numpy as np
from Job.JobBuilder import default_params
from Plotting.plot_params import EXP_ATTRS
from Plotting.plot_utils import load_and_replace_large_nan_inf
from Registry.AlgRegistry import alg_dict
from utils import Configuration
def split_dict_of_list_to_dicts(dict_of_list: Dict[str, list]) -> List[Dict[str, float]]:
"""split a given dictionary of lists into list of dictionaries.
>>> split_dict_of_list_to_dicts({'alpha': [1, 2, 3], 'lambda': [4, 5], 'gamma': [6]})
[{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]
Args:
dict_of_list (Dict[str, list]): a dictionary of lists.
Returns:
List[Dict[str, float]]: list of dictionaries.
"""
keys = dict_of_list.keys()
# drop None placeholders; this assumes None values only occur for trailing keys, otherwise zip(keys, v) below would misalign keys
values = [[e for e in result if e is not None] for result in itertools.product(*dict_of_list.values())]
result = [dict(zip(keys, v)) for v in values]
return result
def group_dicts_by_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[str, List[Dict[str, float]]]:
"""
>>> group_dicts_by_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}])
{1: [{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}], 2: [{'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}], 3: [{'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}]}
"""
first_key = get_first_key_of_dictionary(list_of_dicts[0])
final_grouped = defaultdict(list)
for inner_dict in list_of_dicts:
final_grouped[inner_dict[first_key]].append(inner_dict)
return dict(final_grouped)
def group_dicts_over_first_key(list_of_dicts: List[Dict[str, float]]) -> Dict[tuple, List[float]]:
"""
>>> group_dicts_over_first_key([{'alpha': 1, 'lambda': 4, 'gamma': 6}, {'alpha': 1, 'lambda': 5, 'gamma': 6}, {'alpha': 2, 'lambda': 4, 'gamma': 6}, {'alpha': 2, 'lambda': 5, 'gamma': 6}, {'alpha': 3, 'lambda': 4, 'gamma': 6}, {'alpha': 3, 'lambda': 5, 'gamma': 6}])
{(('lambda', 4), ('gamma', 6)): [1, 2, 3], (('lambda', 5), ('gamma', 6)): [1, 2, 3]}
:param list_of_dicts:
:return:
"""
first_key = get_first_key_of_dictionary(list_of_dicts[0])
final_grouped = defaultdict(list)
for inner_dict in list_of_dicts:
first_value = inner_dict[first_key]
del inner_dict[first_key]
final_grouped[tuple(inner_dict.items())].append(first_value)
return dict(final_grouped)
def find_best_performance(exp_name, alg_name, second_param, auc_or_final) -> Dict[str, float]:
exp_attrs = EXP_ATTRS[exp_name](exp_name)
best_params = {}
best_perf = np.inf
all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build()
list_of_configuration = split_dict_of_list_to_dicts(all_configuration)
first_param_key = get_first_key_of_dictionary(all_configuration)
grouped_over_first = group_dicts_over_first_key(list_of_configuration)
for grouped, first_values in grouped_over_first.items():
second_param_name, second_param_value = grouped[0]
if second_param_value != second_param:
continue
grouped_params = dict(grouped)
current_params = Configuration(grouped_params)
current_params[first_param_key] = None
current_params.algorithm = alg_name
current_params.save_path = PathFactory.make_result_path(exp_name, alg_name)
current_params.rerun = False
current_configuration_over_first_full_path = DataPersister.create_full_path_file_name(f'_mean_{auc_or_final}_over_alpha', current_params,
excluded_params=[first_param_key])
current_perf = load_and_replace_large_nan_inf(
current_configuration_over_first_full_path, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_params = current_params
best_params[first_param_key] = first_values[best_perf_idx]
return best_params
def get_first_key_of_dictionary(d: dict) -> str:
return list(d.keys())[0]
class ParameterBuilder:
def __init__(self):
self.final_params_dict = dict()
def add_algorithm_params(self, configuration: Configuration):
for k in alg_dict[configuration.algorithm].related_parameters():
self.final_params_dict[k] = configuration[k]
return self
def build(self):
return self.final_params_dict
class JsonParameterBuilder:
def __init__(self):
self.final_params_dict = dict()
self.exp_name = None
self.alg_name = None
self.alg_related_params = None
def add_experiment(self, exp_name):
self.exp_name = exp_name
return self
def add_algorithm(self, alg_name):
self.alg_name = alg_name
self.alg_related_params = alg_dict[alg_name].related_parameters()
return self
def build(self) -> Dict[str, list]:
json_path = PathFactory.make_experiment_path(self.exp_name, self.alg_name)
with open(json_path) as f:
json_config = json.load(f)
for param_name in self.alg_related_params:
self.final_params_dict[param_name] = list(json_config['meta_parameters'].get(param_name, [default_params['meta_parameters'][param_name]]))
return self.final_params_dict
class PathFactory:
@staticmethod
def make_experiment_path(exp_name, alg_name):
return os.path.join(os.getcwd(), 'Experiments', exp_name, alg_name, f'{alg_name}.json')
@staticmethod
def make_result_path(exp_name, alg_name):
return os.path.join(os.getcwd(), 'Results', exp_name, alg_name)
class DataPersister:
@staticmethod
def save_result(result_arr: np.ndarray, result_name: str, configuration: Configuration, excluded_params: Optional[list] = None):
full_path_file_to_save = DataPersister.create_full_path_file_name(result_name, configuration, excluded_params)
if not os.path.exists(os.path.dirname(full_path_file_to_save)):
os.makedirs(os.path.dirname(full_path_file_to_save))
np.save(full_path_file_to_save, result_arr)
@staticmethod
def save_best_pref_over_first_param(exp_name, alg_name, auc_or_final):
all_configuration = JsonParameterBuilder().add_experiment(exp_name).add_algorithm(alg_name).build()
list_of_configuration = split_dict_of_list_to_dicts(all_configuration)
first_param_key = get_first_key_of_dictionary(all_configuration)
first_param_length = len(all_configuration[first_param_key])
mean_over_alpha, stderr_over_alpha = np.zeros(first_param_length), np.zeros(first_param_length)
grouped_over_first = group_dicts_over_first_key(list_of_configuration)
for grouped, first_values in grouped_over_first.items():
grouped_params = dict(grouped)
current_params = Configuration(grouped_params)
current_params[first_param_key] = None
current_params.algorithm = alg_name
current_params.save_path = PathFactory.make_result_path(exp_name, alg_name)
current_params.rerun = False
for index, first_value in enumerate(first_values):
current_params[first_param_key] = first_value
full_path_file_to_save = DataPersister.create_full_path_file_name(f'_mean_stderr_{auc_or_final}', current_params)
# perf = np.load(full_path_file_to_save)
# mean_over_alpha[index], stderr_over_alpha[index] = perf[0], perf[1]
# TODO: implement the rerun postfix here if it turns out to be needed
DataPersister.save_result(mean_over_alpha, f"_mean_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key])
DataPersister.save_result(stderr_over_alpha, f"_stderr_{auc_or_final}_over_alpha", current_params, excluded_params=[first_param_key])
@staticmethod
def create_full_path_file_name(result_name: str, configuration: Configuration, excluded_params: Optional[list] = None) -> str:
params = ParameterBuilder().add_algorithm_params(configuration).build()
file_name_to_save = DataPersister.create_file_name(params, excluded_params=excluded_params)
full_path_file_to_save = os.path.join(configuration.save_path, file_name_to_save)
full_path_file_to_save = f'{full_path_file_to_save}{result_name}'
if configuration.rerun:
full_path_file_to_save = f'{full_path_file_to_save}_rerun'
return f'{full_path_file_to_save}.npy'
@staticmethod
def create_file_name(param: dict, excluded_params: Optional[list]) -> str:
if excluded_params is None:
excluded_params = []
final_str = ''
for k, v in param.items():
if k in excluded_params:
continue
if k == 'alpha' or k == 'eta':
split_str = str.split(f'{v:.10f}', '.')
else:
split_str = str.split(f'{v:.5f}', '.')
final_str += '_' + k + split_str[0] + split_str[1]
return final_str
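# Worked example (added for illustration, values are arbitrary): create_file_name encodes
# each parameter by concatenating the digits on both sides of the decimal point, so
# DataPersister.create_file_name({'alpha': 0.03125, 'lmbda': 0.9}, excluded_params=None)
# returns '_alpha00312500000_lmbda090000' (alpha and eta use 10 decimals, other keys use 5).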
import os
from Job.JobBuilder import JobBuilder
import argparse
from utils import find_all_experiment_configuration
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--directory_or_file', '-f', type=str, help='Json file path or Json files directory', required=True)
parser.add_argument('--server', '-s', type=str, help='Input server name, Cedar or Niagara', required=True)
args = parser.parse_args()
for path in find_all_experiment_configuration(args.directory_or_file):
builder = JobBuilder(json_path=os.path.join(os.getcwd(), path), server_name=args.server)
builder()
from Plotting.plot_all_sensitivities_per_alg_gradients import plot_all_sensitivities_per_alg_gradients
from Plotting.plot_all_sensitivities_per_alg_gradients_all_eta import plot_all_sensitivities_per_alg_gradients_all_eta
from Plotting.plot_best_learning_curve_over_all_params import plot_learning_curve_best_overall_params
from Plotting.plot_dist import plot_distribution, plot_dist_for_two_four_room_tasks
from Plotting.plot_all_sensitivities_per_alg_emphatics import plot_all_sensitivities_per_alg_emphatics
from Plotting.plot_learning_curve import plot_learning_curve
from Plotting.plot_learning_curves_for_all_third_params import plot_all_learning_curves_for_third
from Plotting.plot_learning_for_two_lambdas import plot_learning_curve_for_lambdas
from Plotting.plot_sensitivity import plot_sensitivity_curve
from Plotting.plot_sensitivity_for_two_lambdas import plot_sensitivity_for_lambdas
from Plotting.plot_specific_learning_curves import plot_specific_learning_curves
from Plotting.plot_waterfall import plot_waterfall_scatter
from Plotting.process_state_value_function import plot_all_final_value_functions, plot_value_functions
from process_data import process_data
func_to_run = 'hv_four_rooms_specific_learning_curves_full_bootstrap'
if 'collision' in func_to_run:
exps = ['FirstChain'] # FirstChain OR FirstFourRoom OR 1HVFourRoom
elif 'hv' in func_to_run:
exps = ['1HVFourRoom']
else:
exps = ['FirstFourRoom']
# region process data
if func_to_run == 'process_data':
exps = ['FirstChain', 'FirstFourRoom', '1HVFourRoom']
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc', 'final']
sp_list = [1.0]
process_data(exps=exps, algs=algs, auc_or_final=auc_or_final, sp_list=sp_list)
# endregion
# ====================
# ====================
# ====================
# ====================
# region Collision figures
# region learning curves
if func_to_run == 'collision_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
if 'FirstChain' in exps:
exp = 'FirstChain'
algs = ['ETD', 'TD', 'GTD', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.03125, 'lmbda': sp},
'ETD': {'alpha': 0.00390625, 'lmbda': sp},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 0.01},
'GTD': {'alpha': 0.000976562, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if 'FirstFourRoom' in exps:
exp = 'FirstFourRoom'
algs = ['LSTD', 'LSETD', 'ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.25, 'lmbda': sp},
'ETD': {'alpha': 0.00390625, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0},
'GTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if '1HVFourRoom' in exps:
exp = '1HVFourRoom'
algs = ['LSTD', 'LSETD', 'ETDLB', 'TD', 'GTD', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.25, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.0625, 'lmbda': sp, 'eta': 1.0, 'tdrc_beta': 1.0},
'GTD': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'collision_learning_curves_for_all_extra_params_full_bootstrapping':
algs = ['PGTD2', 'GTD', 'LSTD']
sp_list = [0.0]
fig_size = (10, 4)
auc_or_final = ['auc']
# tp_list = [0.015625, 0.0625, 0.25, 1.0, 4.0, 16.0, 64.0, 256.0]
tp_list = [0.25]
plot_all_learning_curves_for_third(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tp_list=tp_list)
if func_to_run == 'collision_learning_curve_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve_for_lambdas(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_best_learning_curves_some_algs_full_bootstrap':
sp_list = [0.0]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=1)
if func_to_run == 'collision_best_learning_curves_some_algs_medium_bootstrap':
sp_list = [0.5]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_some_algs_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_some_algs_no_bootstrap':
sp_list = [1.0]
fig_size = (6, 4)
alg_groups = {'all_algs': ['TD', 'PGTD2', 'HTD', 'ETD', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
is_smoothed=True, smoothing_window=500)
if func_to_run == 'collision_best_learning_curves_full_bootstrap_rerun_and_original':  # also requires PLOT_RERUN = False
# and PLOT_RERUN_AND_ORIG = True in plot_params, plus some changes in the plot_learning_curve function,
# such as setting the colors for the re-run and original plots.
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'all_algs': ['GTD']}
auc_or_final = ['final']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'collision_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
# algs = ['TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (11, 5)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (11, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('collision_best_sensitivity_curves_full_bootstrapping', 'collision_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'collision_best_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'collision_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'collision_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'collision_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# ====================
# ====================
# region FOUR ROOMS FIGURES
# region learning curves
if func_to_run == 'four_rooms_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
exp = 'FirstFourRoom'
algs = ['ETD', 'TD', 'GTD2', 'TDRC', 'PGTD2']
specific_params = {
'TD': {'alpha': 0.0625, 'lmbda': 0.0},
'ETD': {'alpha': 0.000488281, 'lmbda': sp},
'ETDLB': {'alpha': 0.000488281, 'lmbda': sp, 'beta': 0.2},
'TDRC': {'alpha': 0.125, 'lmbda': sp, 'eta': 4.0, 'tdrc_beta': 1.0},
'GTD2': {'alpha': 0.001953125, 'lmbda': sp, 'eta': 16.0},
'PGTD2': {'alpha': 0.0078125, 'lmbda': sp, 'eta': 16.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_best_learning_curves_full_bootstrap_2':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_best_overall_params_learning_curves':
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'four_rooms_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_min_performance = False
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
plot_min_performance=plot_min_performance)
if func_to_run == 'four_rooms_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
# fig_size = (11, 5)
fig_size = (10, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('four_rooms_best_sensitivity_curves_full_bootstrapping', 'four_rooms_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'four_rooms_best_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'four_rooms_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'four_rooms_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# ====================
# ====================
# region HIGH VARIANCE FOUR ROOMS FIGURES
# region learning curves
if func_to_run == 'hv_four_rooms_specific_learning_curves_full_bootstrap':
auc_or_final = ['auc']
fig_size = (10, 4)
sp = 0.0
exp = '1HVFourRoom'
algs = ['ETD', 'TD', 'GTD', 'TB']
specific_params = {
'TD': {'alpha': 0.0078125, 'lmbda': sp},
'ETD': {'alpha': 0.000244140, 'lmbda': sp},
'GTD': {'alpha': 0.000488281, 'lmbda': sp, 'eta': 16.0},
'TB': {'alpha': 0.03125, 'lmbda': 1.0}
}
plot_specific_learning_curves(exp=exp, algs=algs, sp=sp, fig_size=fig_size, auc_or_final=auc_or_final,
specific_params=specific_params)
if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_best_learning_curves_full_bootstrap_2':
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['ETD', 'ETDLB', 'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_best_overall_params_learning_curves':
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD', 'LSTD', 'LSETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC', 'LSTD'],
'emphatics': ['ETD', 'ETDLB', 'LSETD'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD', 'LSTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD',
'LSTD', 'LSETD']}
auc_or_final = ['auc']
plot_learning_curve_best_overall_params(exps=exps, alg_groups=alg_groups, fig_size=fig_size, auc_or_final=auc_or_final)
# endregion
# region sensitivity curves
if func_to_run == 'hv_four_rooms_sensitivity_curves_for_many_lambdas':
sp_list = [0.0, 0.1, 0.2, 0.3, 0.5, 0.75, 0.875, 0.9375, 0.96875, 0.984375, 1.0]
fig_size = (10, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
# algs = ['TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_min_performance = False
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final,
plot_min_performance=plot_min_performance)
if func_to_run == 'hv_four_rooms_emphatics_sensitivity_full_bootstrap':
sp_list = [0.0]
# fig_size = (11, 5)
fig_size = (10, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap':
sp_list = [0.0]
fig_size = (10, 4)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_gradients_sensitivity_full_bootstrap_all_eta':
sp_list = [0.0]
fig_size = (10, 6)
algs = ['GTD', 'GTD2', 'PGTD2', 'HTD']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_TDRC_all_eta_one_beta':
sp_list = [0.0]
tdrc_beta = [0.01] # possible values are 0.1, 0.01, 1.0. Set them separately to plot.
fig_size = (10, 6)
algs = ['TDRC']
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_gradients_all_eta(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final, tdrc_beta=tdrc_beta)
if func_to_run in ('hv_four_rooms_sensitivity_curves_full_bootstrapping', 'hv_four_rooms_waterfall_full_bootstrap'):
sp_list = [0.0]
fig_size = (10, 4)
alg_groups = {'main_algs': ['TD', 'GTD', 'ETD'],
'gradients': ['GTD', 'GTD2', 'HTD', 'PGTD2', 'TDRC'],
'emphatics': ['ETD', 'ETDLB'],
'fast_algs': ['TD', 'TB', 'Vtrace', 'ABTD'],
'all_algs': ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']}
auc_or_final = ['auc']
if func_to_run == 'hv_four_rooms_sensitivity_curves_full_bootstrapping':
plot_sensitivity_curve(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
elif func_to_run == 'hv_four_rooms_waterfall_full_bootstrap':
plot_waterfall_scatter(exps=exps, alg_groups=alg_groups, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_emphatics_sensitivity_minimal_bootstrap':
sp_list = [0.9]
fig_size = (6, 4)
auc_or_final = ['auc']
plot_all_sensitivities_per_alg_emphatics(exps=exps, sp_list=sp_list, fig_size=fig_size, auc_or_final=auc_or_final)
if func_to_run == 'hv_four_rooms_sensitivity_curves_for_two_lambdas':
sp_list = [0.0, 0.9]
fig_size = (6, 4)
algs = ['TD', 'GTD', 'GTD2', 'PGTD2', 'HTD', 'TDRC', 'ETD', 'ETDLB', 'TB', 'Vtrace', 'ABTD']
auc_or_final = ['auc']
plot_sensitivity_for_lambdas(exps=exps, algs=algs, sp_list=sp_list, fig_size=fig_size,
auc_or_final=auc_or_final)
# endregion
# endregion
# region Misc
if func_to_run == 'plot_value_functions':
plot_value_functions()
if func_to_run == 'plot_all_final_value_functions':
plot_all_final_value_functions()
if func_to_run == 'state_dist':
fig_size = (6, 4)
tasks = ['EightStateCollision', 'LearnEightPoliciesTileCodingFeat',
'HighVarianceLearnEightPoliciesTileCodingFeat']
for task in tasks:
plot_distribution(task=task, fig_size=fig_size)
if func_to_run == 'high_variance_and_normal_dist_comparison':
fig_size = (22, 4)
plot_dist_for_two_four_room_tasks(fig_size=fig_size)
# endregion
# from Plotting.process_state_value_function import plot_value_functions, plot_all_final_value_functions
# from Tasks.HighVarianceLearnEightPoliciesTileCodingFeat import HighVarianceLearnEightPoliciesTileCodingFeat
# from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
# For building d_mu
# obj = HighVarianceLearnEightPoliciesTileCodingFeat()
# d_mu = (obj.generate_behavior_dist(20_000_000))
# numpy.save(os.path.join(os.getcwd(), 'Resources', 'HighVarianceLearnEightPoliciesTileCodingFeat', 'd_mu.npy'), d_mu)
import json
import os
import numpy as np
from Learning import learn
from Plotting.plot_params import EXP_ATTRS
from Plotting.plot_utils import make_params, make_current_params, load_and_replace_large_nan_inf, \
load_best_perf_json, load_best_rerun_params, make_res_path
from utils import create_name_for_save_load, Configuration
def save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=False):
fp_list, sp_list, tp_list, fop_list, _ = make_params(alg, exp)
res_path = make_res_path(alg, exp)
mean_over_alpha, stderr_over_alpha = np.zeros(len(fp_list)), np.zeros(len(fp_list))
best_fp, best_tp, best_fop = load_best_rerun_params(alg, exp, auc_or_final, sp) if rerun else (0, 0, 0)
for tp in tp_list:
for fop in fop_list:
current_params = make_current_params(alg, sp, tp, fop)
for i, fp in enumerate(fp_list):
current_params['alpha'] = fp
load_name = os.path.join(res_path, create_name_for_save_load(current_params))
perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}.npy")
if rerun and fp == best_fp and tp == best_tp and fop == best_fop:
perf = np.load(f"{load_name}_mean_stderr_{auc_or_final}_rerun.npy")
mean_over_alpha[i], stderr_over_alpha[i] = perf[0], perf[1]
save_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=['alpha']))
postfix = ''
if rerun and tp == best_tp and fop == best_fop:
postfix = '_rerun'
np.save(f"{save_name}_mean_{auc_or_final}_over_alpha{postfix}", mean_over_alpha)
np.save(f"{save_name}_stderr_{auc_or_final}_over_alpha{postfix}", stderr_over_alpha)
def find_best_perf(alg, exp, auc_or_final, sp):
exp_attrs = EXP_ATTRS[exp](exp)
fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp)
best_params = {}
best_perf, best_fp, best_sp, best_tp, best_fop = np.inf, np.inf, np.inf, np.inf, np.inf
for fop in fop_list:
for tp in tp_list:
current_params = make_current_params(alg, sp, tp, fop)
load_name = os.path.join(res_path, create_name_for_save_load(current_params, excluded_params=[
'alpha']) + f'_mean_{auc_or_final}_over_alpha.npy')
current_perf = load_and_replace_large_nan_inf(
load_name, large=exp_attrs.learning_starting_point, replace_with=exp_attrs.over_limit_replacement)
min_perf = min(current_perf)
if min_perf < best_perf:
best_perf = min_perf
best_perf_idx = int(np.nanargmin(current_perf))
best_fp = fp_list[best_perf_idx]
best_params = current_params
best_params['alpha'] = best_fp
return best_params
def save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp):
fp_list, _, tp_list, fop_list, res_path = make_params(alg, exp)
exp_path = res_path.replace('Results', 'Experiments')
json_exp = os.path.join(exp_path, f"{alg}.json")
with open(json_exp, 'r') as f:
json_exp = json.load(f)
json_exp['meta_parameters'] = best_params
save_name = os.path.join(res_path, f"{auc_or_final}_{sp}.json")
with open(save_name, 'wt') as f:
json.dump(json_exp, f, indent=4)
def run_learning_with_best_perf(alg, exp, auc_or_final, sp):
res_path = os.path.join(os.getcwd(), 'Results', exp, alg)
best_perf_jsn = load_best_perf_json(alg, exp, sp, auc_or_final)
param_dict = best_perf_jsn['meta_parameters']
param_dict['algorithm'] = alg
param_dict['task'] = best_perf_jsn['task']
param_dict['environment'] = best_perf_jsn['environment']
param_dict['num_steps'] = best_perf_jsn['number_of_steps']
param_dict['num_of_runs'] = best_perf_jsn['number_of_runs']
param_dict['sub_sample'] = best_perf_jsn['sub_sample']
param_dict['save_path'] = res_path
param_dict['save_value_function'] = False
param_dict['rerun'] = True
param_dict['render'] = False
config = Configuration(param_dict)
learn(config)
def process_data(**kwargs):
for exp in kwargs['exps']:
for alg in kwargs['algs']:
for auc_or_final in kwargs['auc_or_final']:
for sp in kwargs['sp_list']:
print(f"\nStarted re-running {exp}, {alg} lmbda_or_zeta: {sp}, {auc_or_final} ...")
save_perf_over_alpha(alg, exp, auc_or_final, sp)
best_params = find_best_perf(alg, exp, auc_or_final, sp)
save_best_perf_in_json(alg, exp, best_params, auc_or_final, sp)
run_learning_with_best_perf(alg, exp, auc_or_final, sp)
save_perf_over_alpha(alg, exp, auc_or_final, sp, rerun=True)
print(f"Finished re-running {exp}, {alg} {best_params}")
#matplotlib>=3.2.2
#numpy>=1.19.0
imageio>=2.9.0
pyglet>=1.5.11
scikit_image>=0.17.2
import time
import utils
from Environments.Chain import Chain
from Environments.FourRoomGridWorld import FourRoomGridWorld
from Tasks.LearnEightPoliciesTileCodingFeat import LearnEightPoliciesTileCodingFeat
import pyglet
from skimage.transform import resize
import numpy as np
from data_presister import DataPersister, find_best_performance
# if __name__ == "__main__":
# render_mode = 'human'
# render_mode = 'rgb'
# render_mode = 'screen'
#
# frames = []
# env = FourRoomGridWorld()
# # env = Chain()
# env.reset()
# actions = [2, 2, 0, 0, 0, 3, 3, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 1, 1, 2, 2, 2, 0, 0, 2, 2, 2, 2,
# 2, 1, 1, 1, 1, 1, 1, 1
# , 2, 2, 2, 3, 1, 1, 3, 3, 3, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3]
# actions = actions * 1
# for step in range(len(actions)):
# a = actions[step]
# next_state, r, is_terminal, info = env.step(a)
# state = next_state
# frames.append(env.render(mode=render_mode))
# if is_terminal:
# env.reset()
# utils.generate_gif(frames, 'Assets/FourRoomGridWorld.gif', size=(180, 180, 3), duration=1 / 20)
# DataPersister.save_best_pref_over_first_param(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc")
find_best_performance(exp_name="FirstChain", alg_name="HTD", auc_or_final="auc", second_param=0.2)
import unittest
from Tests.Algorithms.TestTD import TestTD
from Tests.Environments.TestChain import TestChain
from Tests.Tasks.TestEightStateCollision import TestEightStateCollision
test_suite = unittest.TestSuite()
test_suite.addTest(unittest.makeSuite(TestChain))
test_suite.addTest(unittest.makeSuite(TestEightStateCollision))
test_suite.addTest(unittest.makeSuite(TestTD))
runner = unittest.TextTestRunner()
runner.run(test_suite)
import numpy as np
import os
def get_save_value_function_steps(num_steps):
return [int(num_steps * i) - 1 for i in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]]
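# Worked example (added for illustration): with num_steps=50000 this returns
# [499, 2499, 4999, 9999, 24999, 49999], i.e. the steps reached at 1%, 5%, 10%, 20%, 50% and 100% of training.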
def save_value_function(value_function, save_path, step, run):
save_dir = os.path.join(save_path, 'Sample_value_function')
res_path = os.path.join(save_dir, f"{step}_{run}")
if not os.path.exists(save_dir):
os.makedirs(save_dir, exist_ok=True)
np.save(res_path, value_function)
class Configuration(dict):
def __str__(self):
return f"{self.environment} {self.task} {self.algorithm}"
def __getattr__(self, item):
return self[item]
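# Minimal usage sketch (illustrative values, not from the original source): Configuration
# exposes its keys as attributes, e.g.
# cfg = Configuration({'environment': 'Chain', 'task': 'EightStateCollision', 'algorithm': 'TD'})
# cfg.algorithm  # -> 'TD'
# str(cfg)       # -> 'Chain EightStateCollision TD'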
def find_all_experiment_configuration(experiments_path: str, ext='.json'):
if experiments_path.endswith(ext):
yield experiments_path
for root, _, files in os.walk(experiments_path):
for file in files:
if file.endswith(ext):
yield os.path.join(root, file)
class ImmutableDict(dict):
def immutable(self):
raise TypeError("%r objects are immutable" % self.__class__.__name__)
def __setitem__(self, key, value):
self.immutable()
def __delitem__(self, key):
self.immutable()
def setdefault(self, k, default=None):
self.immutable()
def update(self, *args, **kwargs):
self.immutable()
def clear(self) -> None:
self.immutable()
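# Minimal usage sketch (illustrative): any attempt to mutate an ImmutableDict raises TypeError, e.g.
# d = ImmutableDict({'a': 1})
# d['b'] = 2  # raises TypeError: 'ImmutableDict' objects are immutable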
def create_name_for_save_load(param_dict, excluded_params=None):
if excluded_params is None:
excluded_params = []
final_str = ''
for k, v in param_dict.items():
if k in excluded_params:
continue
if k == 'alpha' or k == 'eta':
split_str = str.split(f'{v:.10f}', '.')
else:
split_str = str.split(f'{v:.5f}', '.')
final_str += '_' + k + split_str[0] + split_str[1]
return final_str
def save_result(path, name, result_array, params, rerun):
name_to_save = create_name_for_save_load(param_dict=params)
path_and_name = os.path.join(path, name_to_save)
final_name = f"{path_and_name}{name}"
if rerun:
final_name = f"{final_name}_rerun"
np.save(final_name, result_array)
def generate_gif(frames, path, size=(180, 180, 3), duration=1 / 20):
import imageio
from skimage.transform import resize
for idx, frame_idx in enumerate(frames):
frames[idx] = resize(frame_idx, size, preserve_range=True, order=0).astype(np.uint8)
imageio.mimsave(path, frames, duration=duration)