From 81602ce99a8130cf731b6960b880d6a911d993cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=B2=E6=BA=90?= <48008469+puyuan1996@users.noreply.github.com> Date: Fri, 26 Nov 2021 20:08:56 +0800 Subject: [PATCH] polish(pu): add loss statistics and polish r2d3 pong config (#126) * fix(pu): fix adam weight decay bug * feature(pu): add pitfall offppo config * feature(pu): add qbert spaceinvaders pitfall r2d3 config * fix(pu): fix expert offfppo config in r2d3 * fix(pu): fix pong connfig * polish(pu): add loss statistics * fix(pu): fix loss statistics bug * polish(pu): polish pong r2d3 config * polish(pu): polish r2d3 pong and lunarlander config * polish(pu): delete unused files --- README.md | 19 +++++----- ding/policy/r2d3.py | 24 +++++++++++-- ding/rl_utils/td.py | 2 +- .../config/serial/pong/pong_r2d2_config.py | 4 +-- ...ig.py => pong_r2d3_offppoexpert_config.py} | 19 +++++----- .../pong/pong_r2d3_r2d2expert_config.py | 36 ++++++++----------- ...y => lunarlander_r2d3_ppoexpert_config.py} | 27 +++++++------- .../lunarlander_r2d3_r2d2expert_config.py | 25 ++++++------- 8 files changed, 85 insertions(+), 71 deletions(-) rename dizoo/atari/config/serial/pong/{pong_r2d3_config.py => pong_r2d3_offppoexpert_config.py} (92%) rename dizoo/box2d/lunarlander/config/{lunarlander_r2d3_config.py => lunarlander_r2d3_ppoexpert_config.py} (86%) diff --git a/README.md b/README.md index 5990406..2552ae2 100644 --- a/README.md +++ b/README.md @@ -123,15 +123,16 @@ ding -m serial -e cartpole -p dqn -s 0 | 23 | [GAIL](https://arxiv.org/pdf/1606.03476.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [reward_model/gail](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/gail_irl_model.py) | ding -m serial_gail -c cartpole_dqn_gail_config.py -s 0 | | 24 | [SQIL](https://arxiv.org/pdf/1905.11108.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [entry/sqil](https://github.com/opendilab/DI-engine/blob/main/ding/entry/serial_entry_sqil.py) | ding -m serial_sqil -c cartpole_sqil_config.py -s 0 | | 25 | [DQFD](https://arxiv.org/pdf/1704.03732.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [policy/dqfd](https://github.com/opendilab/DI-engine/blob/main/ding/policy/dqfd.py) | ding -m serial_dqfd -c cartpole_dqfd_config.py -s 0 | -| 26 | [GCL](https://arxiv.org/pdf/1603.00448.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [reward_model/guided_cost](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/guided_cost_reward_model.py) | python3 lunarlander_gcl_config.py -| 27 | [HER](https://arxiv.org/pdf/1707.01495.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/her](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/her_reward_model.py) | python3 -u bitflip_her_dqn.py | -| 28 | [RND](https://arxiv.org/abs/1810.12894) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/rnd](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/rnd_reward_model.py) | python3 -u cartpole_ppo_rnd_main.py | -| 29 | [ICM](https://arxiv.org/pdf/1705.05363.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/icm](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/icm_reward_model.py) | python3 -u cartpole_ppo_icm_config.py | -| 30 | [CQL](https://arxiv.org/pdf/2006.04779.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/cql.py) | python3 -u d4rl_cql_main.py | -| 31 | 
[TD3BC](https://arxiv.org/pdf/2106.06860.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/td3_bc](https://github.com/opendilab/DI-engine/blob/main/ding/policy/td3_bc.py) | python3 -u mujoco_td3_bc_main.py | -| 32 | [MBPO](https://arxiv.org/pdf/1906.08253.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [model/template/model_based/mbpo](https://github.com/opendilab/DI-engine/blob/main/ding/model/template/model_based/mbpo.py) | python3 -u sac_halfcheetah_mopo_default_config.py | -| 33 | [PER](https://arxiv.org/pdf/1511.05952.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [worker/replay_buffer](https://github.com/opendilab/DI-engine/blob/main/ding/worker/replay_buffer/advanced_buffer.py) | `rainbow demo` | -| 34 | [GAE](https://arxiv.org/pdf/1506.02438.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [rl_utils/gae](https://github.com/opendilab/DI-engine/blob/main/ding/rl_utils/gae.py) | `ppo demo` | +| 26 | [R2D3](https://arxiv.org/pdf/1909.01387.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [policy/r2d3](https://github.com/opendilab/DI-engine/blob/main/ding/policy/r2d3.py) | python3 -u pong_r2d3_r2d2expert_config.py | +| 27 | [GCL](https://arxiv.org/pdf/1603.00448.pdf) | ![IL](https://img.shields.io/badge/-IL-purple) | [reward_model/guided_cost](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/guided_cost_reward_model.py) | python3 lunarlander_gcl_config.py +| 28 | [HER](https://arxiv.org/pdf/1707.01495.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/her](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/her_reward_model.py) | python3 -u bitflip_her_dqn.py | +| 29 | [RND](https://arxiv.org/abs/1810.12894) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/rnd](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/rnd_reward_model.py) | python3 -u cartpole_ppo_rnd_main.py | +| 30 | [ICM](https://arxiv.org/pdf/1705.05363.pdf) | ![exp](https://img.shields.io/badge/-exploration-orange) | [reward_model/icm](https://github.com/opendilab/DI-engine/blob/main/ding/reward_model/icm_reward_model.py) | python3 -u cartpole_ppo_icm_config.py | +| 31 | [CQL](https://arxiv.org/pdf/2006.04779.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/cql.py) | python3 -u d4rl_cql_main.py | +| 32 | [TD3BC](https://arxiv.org/pdf/2106.06860.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [policy/td3_bc](https://github.com/opendilab/DI-engine/blob/main/ding/policy/td3_bc.py) | python3 -u mujoco_td3_bc_main.py | +| 33 | [MBPO](https://arxiv.org/pdf/1906.08253.pdf) | ![mbrl](https://img.shields.io/badge/-ModelBasedRL-lightblue) | [model/template/model_based/mbpo](https://github.com/opendilab/DI-engine/blob/main/ding/model/template/model_based/mbpo.py) | python3 -u sac_halfcheetah_mopo_default_config.py | +| 34 | [PER](https://arxiv.org/pdf/1511.05952.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [worker/replay_buffer](https://github.com/opendilab/DI-engine/blob/main/ding/worker/replay_buffer/advanced_buffer.py) | `rainbow demo` | +| 35 | [GAE](https://arxiv.org/pdf/1506.02438.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [rl_utils/gae](https://github.com/opendilab/DI-engine/blob/main/ding/rl_utils/gae.py) | `ppo demo` | 
![discrete](https://img.shields.io/badge/-discrete-brightgreen) means discrete action space, which is only label in normal DRL algorithms (1-16) diff --git a/ding/policy/r2d3.py b/ding/policy/r2d3.py index e824745..3b07297 100644 --- a/ding/policy/r2d3.py +++ b/ding/policy/r2d3.py @@ -148,7 +148,9 @@ class R2D3Policy(Policy): self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight - self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate, weight_decay=self.lambda3) + self._optimizer = Adam( + self._model.parameters(), lr=self._cfg.learn.learning_rate, weight_decay=self.lambda3, optim_type='adamw' + ) self._gamma = self._cfg.discount_factor self._nstep = self._cfg.nstep self._burnin_step = self._cfg.burnin_step @@ -316,6 +318,9 @@ class R2D3Policy(Policy): # T, B, nstep -> T, nstep, B reward = reward.permute(0, 2, 1).contiguous() loss = [] + loss_nstep = [] + loss_1step = [] + loss_sl = [] td_error = [] for t in range(self._unroll_len_add_burnin_step - self._burnin_step - self._nstep): # here t=0 means timestep in the original sample sequence, we minus self._nstep @@ -335,7 +340,7 @@ class R2D3Policy(Policy): ) if self._value_rescale: - l, e = dqfd_nstep_td_error_with_rescale( + l, e, loss_statistics = dqfd_nstep_td_error_with_rescale( td_data, self._gamma, self.lambda1, @@ -348,6 +353,10 @@ class R2D3Policy(Policy): ) loss.append(l) td_error.append(e.abs()) + # loss statistics for debugging + loss_nstep.append(loss_statistics[0]) + loss_1step.append(loss_statistics[1]) + loss_sl.append(loss_statistics[2]) else: l, e = dqfd_nstep_td_error( @@ -365,6 +374,10 @@ class R2D3Policy(Policy): td_error.append(e.abs()) loss = sum(loss) / (len(loss) + 1e-8) + # loss statistics for debugging + loss_nstep = sum(loss_nstep) / (len(loss_nstep) + 1e-8) + loss_1step = sum(loss_1step) / (len(loss_1step) + 1e-8) + loss_sl = sum(loss_sl) / (len(loss_sl) + 1e-8) # using the mixture of max and mean absolute n-step TD-errors as the priority of the sequence td_error_per_sample = 0.9 * torch.max( @@ -388,6 +401,10 @@ class R2D3Policy(Policy): return { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), + # loss statistics for debugging + 'nstep_loss': loss_nstep.item(), + '1step_loss': loss_1step.item(), + 'sl_loss': loss_sl.item(), 'priority': td_error_per_sample.abs().tolist(), # the first timestep in the sequence, may not be the start of episode 'q_s_taken-a_t0': q_s_a_t0.mean().item(), @@ -541,5 +558,6 @@ class R2D3Policy(Policy): def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + [ - 'total_loss', 'priority', 'q_s_taken-a_t0', 'target_q_s_max-a_t0', 'q_s_a-mean_t0' + 'total_loss', 'nstep_loss', '1step_loss', 'sl_loss', 'priority', 'q_s_taken-a_t0', 'target_q_s_max-a_t0', + 'q_s_a-mean_t0' ] diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 3672281..16755ed 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -739,7 +739,7 @@ def dqfd_nstep_td_error_with_rescale( lambda_supervised_loss * JE ) * weight ).mean(), lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample + - lambda_supervised_loss * JE + lambda_supervised_loss * JE, (td_error_per_sample.mean(), td_error_one_step_per_sample.mean(), JE.mean()) ) diff --git a/dizoo/atari/config/serial/pong/pong_r2d2_config.py b/dizoo/atari/config/serial/pong/pong_r2d2_config.py index 3265507..474f064 100644 --- a/dizoo/atari/config/serial/pong/pong_r2d2_config.py +++ 
b/dizoo/atari/config/serial/pong/pong_r2d2_config.py @@ -4,7 +4,7 @@ from ding.entry import serial_pipeline collector_env_num = 8 evaluator_env_num = 5 pong_r2d2_config = dict( - exp_name='debug_pong_r2d2_n5_bs2_ul40', + exp_name='debug_pong_r2d2_n5_bs2_ul40_rbs1e4_seed0', env=dict( collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, @@ -55,7 +55,7 @@ pong_r2d2_config = dict( decay=1e5, ), replay_buffer=dict( - replay_buffer_size=20000, # TODO(pu) + replay_buffer_size=10000, # TODO(pu) # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.6, # (Float type) How much correction is used: 0 means no correction while 1 means full correction diff --git a/dizoo/atari/config/serial/pong/pong_r2d3_config.py b/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py similarity index 92% rename from dizoo/atari/config/serial/pong/pong_r2d3_config.py rename to dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py index aa8b238..35c998a 100644 --- a/dizoo/atari/config/serial/pong/pong_r2d3_config.py +++ b/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py @@ -6,10 +6,10 @@ module_path = os.path.dirname(__file__) collector_env_num = 8 evaluator_env_num = 5 -expert_replay_buffer_size=1 #TODO 1000 +expert_replay_buffer_size=1000 #TODO 1000 """agent config""" pong_r2d3_config = dict( - exp_name='debug_pong_r2d3_k0_pho0', + exp_name='debug_pong_r2d3_offppoexpert_k0_pho1-256_rbs2e4', env=dict( # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -50,10 +50,9 @@ pong_r2d3_config = dict( target_update_theta=0.001, # DQFD related parameters lambda1=1.0, # n-step return - lambda2=0, # 1.0, # supervised loss - lambda3=1e-5, # L2 - lambda_one_step_td=0, # 1-step return - + lambda2=1, # 1.0, # supervised loss + lambda3=1e-5, # 1e-5, # L2 it's very important to set Adam optimizer optim_type='adamw'. + lambda_one_step_td=1, # 1-step return margin_function=0.8, # margin function in JE, here we implement this as a constant per_train_iter_k=0, # TODO(pu) ), @@ -63,15 +62,15 @@ pong_r2d3_config = dict( env_num=collector_env_num, # The hyperparameter pho, the demo ratio, control the propotion of data coming\ # from expert demonstrations versus from the agent's own experience. - pho=0, # 1/256, #TODO(pu), 0.25, + pho=1/256, # 1/256, #TODO(pu), 0.25, ), eval=dict(env_num=evaluator_env_num, ), other=dict( eps=dict( type='exp', start=0.95, - end=0.1, - decay=100000, + end=0.05, + decay=1e5, ), replay_buffer=dict( replay_buffer_size=20000, # TODO(pu) sequence_length 42 10000 obs need 11GB memory, if rbs=20000, at least 140gb @@ -99,7 +98,7 @@ create_config = pong_r2d3_create_config """export config""" expert_pong_r2d3_config = dict( - exp_name='debug_pong_r2d3', + exp_name='expert_pong_r2d3_ppoexpert_k0_pho1-256_rbs2e4', env=dict( # Whether to use shared memory. 
Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), diff --git a/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py b/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py index d4b3730..d7c3ef1 100644 --- a/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py +++ b/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py @@ -1,16 +1,15 @@ from easydict import EasyDict - from ding.entry import serial_pipeline_r2d3 import os module_path = os.path.dirname(__file__) collector_env_num = 8 evaluator_env_num = 5 -expert_replay_buffer_size=1 #TODO 1000 +expert_replay_buffer_size=int(5e3) """agent config""" pong_r2d3_config = dict( - exp_name='debug_pong_r2d3_r2d2expert_k0_pho0_no1td_nosl', + exp_name='debug_pong_r2d3_r2d2expert_k0_pho1-4_rbs2e4_ds5e3', env=dict( # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -46,14 +45,14 @@ pong_r2d3_config = dict( # in most environments value_rescale=True, update_per_collect=8, - batch_size=64, + batch_size=64, # TODO(pu) learning_rate=0.0005, target_update_theta=0.001, # DQFD related parameters lambda1=1.0, # n-step return - lambda2=0, # supervised loss - lambda3=1e-5, # L2 - lambda_one_step_td=0, # 1-step return + lambda2=1.0, # supervised loss + lambda3=1e-5, # L2 it's very important to set Adam optimizer optim_type='adamw'. + lambda_one_step_td=1.0, # 1-step return margin_function=0.8, # margin function in JE, here we implement this as a constant per_train_iter_k=0, # TODO(pu) ), @@ -63,18 +62,18 @@ pong_r2d3_config = dict( env_num=collector_env_num, # The hyperparameter pho, the demo ratio, control the propotion of data coming\ # from expert demonstrations versus from the agent's own experience. - pho=0, #TODO(pu), 0.25, + pho=1/4, # TODO(pu) ), eval=dict(env_num=evaluator_env_num, ), other=dict( eps=dict( type='exp', start=0.95, - end=0.1, + end=0.05, decay=100000, ), replay_buffer=dict( - replay_buffer_size=20000, # TODO(pu) sequence_length 42 10000 obs need 11GB memory, if rbs=20000, at least 140gb + replay_buffer_size=int(2e4), # TODO(pu) # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.6, # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -96,10 +95,9 @@ pong_r2d3_create_config = dict( pong_r2d3_create_config = EasyDict(pong_r2d3_create_config) create_config = pong_r2d3_create_config - """export config""" expert_pong_r2d3_config = dict( - # exp_name='debug_pong_r2d3', + exp_name='expert_pong_r2d3_r2d2expert_k0_pho1-4_rbs1e4_ds5e3', env=dict( # Whether to use shared memory. 
Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -119,22 +117,18 @@ expert_pong_r2d3_config = dict( obs_shape=[4, 84, 84], action_shape=6, # encoder_hidden_size_list=[64, 64, 128], # ppo expert policy - encoder_hidden_size_list=[128, 128, 512], # r2d2 - # actor_head_hidden_size=128, - # critic_head_hidden_size=128, + encoder_hidden_size_list=[128, 128, 512], # r2d2 expert policy ), discount_factor=0.997, - burnin_step=20, + burnin_step=2, nstep=5, learn=dict( - expert_replay_buffer_size=expert_replay_buffer_size, # TODO(pu) + expert_replay_buffer_size=expert_replay_buffer_size, ), collect=dict( # NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF each_iter_n_sample=32, # Users should add their own path here (path should lead to a well-trained model) - # demonstration_info_path='dizoo/atari/config/serial/pong/demo_path/ppo-off_iteration_16127.pth.tar', - # demonstration_info_path=module_path + '/demo_path/ppo-off_iteration_16127.pth.tar', # demonstration_info_path=module_path + '/demo_path/ppo-off_ckpt_best.pth.tar', demonstration_info_path=module_path + '/demo_path/r2d2_iteration_15000.pth.tar', # Cut trajectories into pieces with length "unroll_len". should set as self._unroll_len_add_burnin_step of r2d2 @@ -144,7 +138,7 @@ expert_pong_r2d3_config = dict( eval=dict(env_num=evaluator_env_num, ), other=dict( replay_buffer=dict( - replay_buffer_size=expert_replay_buffer_size, # TODO(pu) + replay_buffer_size=expert_replay_buffer_size, # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.6, # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -161,7 +155,7 @@ expert_pong_r2d3_create_config = dict( import_names=['dizoo.atari.envs.atari_env'], ), env_manager=dict(type='base'), - policy=dict(type='r2d2_collect_traj'), + policy=dict(type='r2d2_collect_traj'), # this policy is designed to collect r2d2 expert traj for r2d3 ) expert_pong_r2d3_create_config = EasyDict(expert_pong_r2d3_create_config) expert_create_config = expert_pong_r2d3_create_config diff --git a/dizoo/box2d/lunarlander/config/lunarlander_r2d3_config.py b/dizoo/box2d/lunarlander/config/lunarlander_r2d3_ppoexpert_config.py similarity index 86% rename from dizoo/box2d/lunarlander/config/lunarlander_r2d3_config.py rename to dizoo/box2d/lunarlander/config/lunarlander_r2d3_ppoexpert_config.py index e8eacd2..0cd1f35 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_r2d3_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_r2d3_ppoexpert_config.py @@ -8,10 +8,11 @@ module_path = os.path.dirname(__file__) collector_env_num = 8 evaluator_env_num = 5 +expert_replay_buffer_size=int(5e3) """agent config""" lunarlander_r2d3_config = dict( - exp_name='debug_lunarlander_r2d3_k0_pho0', + exp_name='debug_lunarlander_r2d3_ppoexpert_k100_pho1-4_rbs1e5_ds5e3', env=dict( # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -45,13 +46,14 @@ lunarlander_r2d3_config = dict( # in most environments value_rescale=True, update_per_collect=8, - batch_size=64, #32, # TODO(pu) + batch_size=64, # TODO(pu) learning_rate=0.0005, target_update_theta=0.001, # DQFD related parameters lambda1=1.0, # n-step return lambda2=1.0, # supervised loss - lambda3=1e-5, # L2 + lambda3=1e-5, # L2 it's very important to set Adam optimizer optim_type='adamw'. 
+ lambda_one_step_td=1, # 1-step return margin_function=0.8, # margin function in JE, here we implement this as a constant per_train_iter_k=0, # TODO(pu) ), @@ -59,19 +61,19 @@ lunarlander_r2d3_config = dict( # NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF each_iter_n_sample=32, env_num=collector_env_num, - # The hyperparameter pho, the demo ratio, control the propotion of data coming\ + # The hyperparameter pho, the demo ratio, control the propotion of data coming # from expert demonstrations versus from the agent's own experience. - pho=0, # TODO(pu) 0.25 + pho=1/4., # TODO(pu) ), eval=dict(env_num=evaluator_env_num, ), other=dict( eps=dict( type='exp', start=0.95, - end=0.1, + end=0.05, decay=100000, ), - replay_buffer=dict(replay_buffer_size=10000, + replay_buffer=dict(replay_buffer_size=int(1e5), # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.6, # priority exponent default=0.6 # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -96,7 +98,7 @@ create_config = lunarlander_r2d3_create_config """export config""" expert_lunarlander_r2d3_config = dict( - exp_name='debug_lunarlander_r2d3', + exp_name='expert_lunarlander_r2d3_ppoexpert_k0_pho1-4_rbs1e5_ds5e3', env=dict( # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -119,12 +121,11 @@ expert_lunarlander_r2d3_config = dict( burnin_step=2, nstep=5, learn=dict( - expert_replay_buffer_size=1000, # 10000, TODO(pu) + expert_replay_buffer_size=expert_replay_buffer_size, ), collect=dict( - # n_sample=32, # NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF + # n_sample=32, NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF # Users should add their own path here (path should lead to a well-trained model) - # demonstration_info_path='dizoo/box2d/lunarlander/config/demo_path/ppo-off_iteration_12948.pth.tar', demonstration_info_path=module_path + '/demo_path/ppo-off_iteration_12948.pth.tar', # Cut trajectories into pieces with length "unroll_len". 
should set as self._unroll_len_add_burnin_step of r2d2 unroll_len=42, # TODO(pu) should equals self._unroll_len_add_burnin_step in r2d2 policy @@ -132,7 +133,7 @@ expert_lunarlander_r2d3_config = dict( ), eval=dict(env_num=evaluator_env_num, ), other=dict( - replay_buffer=dict(replay_buffer_size=1000, # 10000,8 TODO(pu) + replay_buffer=dict(replay_buffer_size=expert_replay_buffer_size, # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.9, # priority exponent default=0.6 # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -149,7 +150,7 @@ expert_lunarlander_r2d3_create_config = dict( import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], ), env_manager=dict(type='base'), - policy=dict(type='ppo_offpolicy_collect_traj'), # NOTE + policy=dict(type='ppo_offpolicy_collect_traj'), # this policy is designed to collect off-ppo expert traj for r2d3 ) expert_lunarlander_r2d3_create_config = EasyDict(expert_lunarlander_r2d3_create_config) expert_create_config = expert_lunarlander_r2d3_create_config diff --git a/dizoo/box2d/lunarlander/config/lunarlander_r2d3_r2d2expert_config.py b/dizoo/box2d/lunarlander/config/lunarlander_r2d3_r2d2expert_config.py index 73c4a07..1868080 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_r2d3_r2d2expert_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_r2d3_r2d2expert_config.py @@ -8,10 +8,11 @@ module_path = os.path.dirname(__file__) collector_env_num = 8 evaluator_env_num = 5 +expert_replay_buffer_size=int(5e3) """agent config""" lunarlander_r2d3_config = dict( - exp_name='debug_lunarlander_r2d3_r2d2expert_k0_pho0', + exp_name='debug_lunarlander_r2d3_r2d2expert_k0_pho1-4_rbs1e4_ds5e3', env=dict( # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -51,7 +52,8 @@ lunarlander_r2d3_config = dict( # DQFD related parameters lambda1=1.0, # n-step return lambda2=1.0, # supervised loss - lambda3=1e-5, # L2 + lambda3=1e-5, # L2 it's very important to set Adam optimizer optim_type='adamw'. + lambda_one_step_td=1, # 1-step return margin_function=0.8, # margin function in JE, here we implement this as a constant per_train_iter_k=0, # TODO(pu) ), @@ -61,7 +63,7 @@ lunarlander_r2d3_config = dict( env_num=collector_env_num, # The hyperparameter pho, the demo ratio, control the propotion of data coming\ # from expert demonstrations versus from the agent's own experience. - pho=0, # TODO(pu) 0.25 + pho=1/4, # TODO(pu) ), eval=dict(env_num=evaluator_env_num, ), other=dict( @@ -71,7 +73,7 @@ lunarlander_r2d3_config = dict( end=0.1, decay=100000, ), - replay_buffer=dict(replay_buffer_size=10000, + replay_buffer=dict(replay_buffer_size=int(1e4), # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.6, # priority exponent default=0.6 # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -93,10 +95,10 @@ lunarlander_r2d3_create_config = dict( lunarlander_r2d3_create_config = EasyDict(lunarlander_r2d3_create_config) create_config = lunarlander_r2d3_create_config -"""export config""" +"""export config""" expert_lunarlander_r2d3_config = dict( - # exp_name='debug_lunarlander_r2d3', + exp_name='expert_lunarlander_r2d3_r2d2expert_k0_pho1-4_ds5e3', env=dict( # Whether to use shared memory. 
Only effective if "env_manager_type" is 'subprocess' manager=dict(shared_memory=True, force_reproducibility=True), @@ -120,22 +122,21 @@ expert_lunarlander_r2d3_config = dict( burnin_step=2, nstep=5, learn=dict( - expert_replay_buffer_size=1000, + expert_replay_buffer_size=expert_replay_buffer_size, ), collect=dict( - # n_sample=32, NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF + # NOTE it is important that don't include key n_sample here, to make sure self._traj_len=INF + each_iter_n_sample=32, # Users should add their own path here (path should lead to a well-trained model) - # demonstration_info_path='dizoo/box2d/lunarlander/config/demo_path/ppo-off_iteration_12948.pth.tar', # demonstration_info_path=module_path + '/demo_path/ppo-off_iteration_12948.pth.tar', demonstration_info_path=module_path + '/demo_path/r2d2_iteration_13000.pth.tar', - # Cut trajectories into pieces with length "unroll_len". should set as self._unroll_len_add_burnin_step of r2d2 unroll_len=40, # TODO(pu): if in ppo_offpolicy, this key should equals self._unroll_len_add_burnin_step in r2d2 policy env_num=collector_env_num, ), eval=dict(env_num=evaluator_env_num, ), other=dict( - replay_buffer=dict(replay_buffer_size=1000, + replay_buffer=dict(replay_buffer_size=expert_replay_buffer_size, # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization alpha=0.9, # priority exponent default=0.6 # (Float type) How much correction is used: 0 means no correction while 1 means full correction @@ -152,7 +153,7 @@ expert_lunarlander_r2d3_create_config = dict( import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], ), env_manager=dict(type='base'), - policy=dict(type='r2d2_collect_traj'), # NOTE + policy=dict(type='r2d2_collect_traj'), # this policy is designed to collect r2d2 expert traj for r2d3 ) expert_lunarlander_r2d3_create_config = EasyDict(expert_lunarlander_r2d3_create_config) expert_create_config = expert_lunarlander_r2d3_create_config -- GitLab
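A note on the optimizer change in `ding/policy/r2d3.py` above: the L2 coefficient `lambda3` is passed to DI-engine's `Adam` wrapper as `weight_decay`, and this patch additionally sets `optim_type='adamw'` (the "fix adam weight decay bug" commit). The sketch below contrasts coupled and decoupled weight decay using plain PyTorch optimizers; it is only an illustration of why the switch matters and does not reproduce DI-engine's wrapper.

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
lambda3 = 1e-5  # the L2 coefficient used in the configs above

# Plain Adam: weight_decay is folded into the gradient before the adaptive
# moment estimates, so the effective regularization is scaled by the
# per-parameter step size instead of acting as true weight decay.
adam = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=lambda3)

# AdamW: the decay is decoupled and applied directly to the weights after the
# adaptive update (Loshchilov & Hutter). The `optim_type='adamw'` argument added
# to DI-engine's Adam wrapper in this patch presumably selects this behavior,
# which is what the config comments about lambda3 and 'adamw' refer to.
adamw = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=lambda3)
```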
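The new `nstep_loss`, `1step_loss` and `sl_loss` entries logged by `_monitor_vars_learn` come from the extra statistics tuple that `dqfd_nstep_td_error_with_rescale` now returns. The sketch below mirrors that return structure and the per-timestep averaging in `r2d3.py`, with dummy per-sample losses standing in for the real n-step TD, 1-step TD, and margin terms (the actual function also builds n-step targets and applies value rescaling); the loop length and batch size are placeholders.

```python
import torch

def combined_dqfd_loss(nstep_td, one_step_td, margin_loss, weight,
                       lambda1=1.0, lambda_one_step_td=1.0, lambda2=1.0):
    """Illustrative weighting of the three R2D3/DQfD loss terms.

    All inputs are per-sample tensors of shape (B,). Returns the scalar
    training loss, the combined per-sample TD error used for priorities,
    and a tuple of per-component means for logging. Note that lambda3 (L2)
    is not part of the loss: it is handled as optimizer weight decay.
    """
    per_sample = lambda1 * nstep_td + lambda_one_step_td * one_step_td + lambda2 * margin_loss
    loss = (per_sample * weight).mean()
    stats = (nstep_td.mean(), one_step_td.mean(), margin_loss.mean())
    return loss, per_sample, stats

# Per-timestep accumulation and averaging, mirroring r2d3.py's sum / (len + 1e-8).
loss, loss_nstep, loss_1step, loss_sl = [], [], [], []
for t in range(35):  # placeholder for unroll_len_add_burnin_step - burnin_step - nstep
    l, _, (l_n, l_1, l_e) = combined_dqfd_loss(
        torch.rand(64), torch.rand(64), torch.rand(64), torch.ones(64)  # dummy losses, B=64
    )
    loss.append(l)
    loss_nstep.append(l_n)
    loss_1step.append(l_1)
    loss_sl.append(l_e)

loss = sum(loss) / (len(loss) + 1e-8)
loss_nstep = sum(loss_nstep) / (len(loss_nstep) + 1e-8)
loss_1step = sum(loss_1step) / (len(loss_1step) + 1e-8)
loss_sl = sum(loss_sl) / (len(loss_sl) + 1e-8)
print(loss.item(), loss_nstep.item(), loss_1step.item(), loss_sl.item())
```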
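Finally, the `pho` values set in the configs above (1/256 in the off-policy-PPO-expert Pong config, 1/4 in the R2D2-expert and LunarLander configs) are the demo ratio: the proportion of each training batch drawn from the expert demonstration buffer rather than from the agent's own replay buffer. A minimal sketch of that idea follows; the real mixing is presumably handled inside the `serial_pipeline_r2d3` entry imported by these configs, and the buffer/sampling interface here is invented for illustration.

```python
import random

def sample_mixed_batch(agent_buffer, expert_buffer, batch_size=64, pho=1 / 4):
    """Hypothetical helper: on average pho * batch_size sequences per batch
    come from the expert demonstration buffer, the rest from the agent buffer."""
    n_expert = sum(random.random() < pho for _ in range(batch_size))  # works even for pho=1/256
    n_expert = min(n_expert, len(expert_buffer))
    batch = random.sample(expert_buffer, n_expert) + random.sample(agent_buffer, batch_size - n_expert)
    random.shuffle(batch)
    return batch

# Example: with pho=1/4 and batch_size=64, roughly 16 samples per batch are demonstrations.
agent_data = [('agent', i) for i in range(10000)]
expert_data = [('expert', i) for i in range(5000)]
batch = sample_mixed_batch(agent_data, expert_data, batch_size=64, pho=1 / 4)
print(sum(src == 'expert' for src, _ in batch))
```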