From 5ee17ad1a6add9f910a8a956b24291f51c3fd7c3 Mon Sep 17 00:00:00 2001
From: Davide Liu <41103541+davide97l@users.noreply.github.com>
Date: Fri, 3 Dec 2021 16:11:33 +0800
Subject: [PATCH] benchmark(davide): Bsuite memory benchmark (#138)

* added r2d2 + a2c configs
* changed convergence reward for some envs
* removed configs that don't converge
* removed 'on_policy' param in r2d2 configs
---
 .../memory_len/memory_len_0_a2c_config.py   | 55 +++++++++++++
 .../memory_len/memory_len_0_dqn_config.py   | 20 +++--
 .../memory_len/memory_len_10_r2d2_config.py | 78 +++++++++++++++++++
 .../memory_len/memory_len_17_r2d2_config.py | 78 +++++++++++++++++++
 4 files changed, 223 insertions(+), 8 deletions(-)
 create mode 100644 dizoo/bsuite/config/serial/memory_len/memory_len_0_a2c_config.py
 create mode 100644 dizoo/bsuite/config/serial/memory_len/memory_len_10_r2d2_config.py
 create mode 100644 dizoo/bsuite/config/serial/memory_len/memory_len_17_r2d2_config.py

diff --git a/dizoo/bsuite/config/serial/memory_len/memory_len_0_a2c_config.py b/dizoo/bsuite/config/serial/memory_len/memory_len_0_a2c_config.py
new file mode 100644
index 0000000..7fdead6
--- /dev/null
+++ b/dizoo/bsuite/config/serial/memory_len/memory_len_0_a2c_config.py
@@ -0,0 +1,55 @@
+from easydict import EasyDict
+from ding.entry import serial_pipeline
+
+memory_len_a2c_config = dict(
+    exp_name='memory_len_0_a2c',
+    env=dict(
+        collector_env_num=8,
+        evaluator_env_num=1,
+        n_evaluator_episode=100,
+        env_id='memory_len/0',
+        stop_value=1.,
+    ),
+    policy=dict(
+        cuda=False,
+        # (bool) Whether to use the on-policy training pipeline (behaviour policy and training policy are the same).
+        model=dict(
+            obs_shape=3,
+            action_shape=2,
+            encoder_hidden_size_list=[128, 128, 64],
+        ),
+        learn=dict(
+            batch_size=64,
+            # (bool) Whether to normalize advantage. Default to False.
+            normalize_advantage=False,
+            learning_rate=0.001,
+            # (float) loss weight of the value network, the weight of the policy network is set to 1
+            value_weight=0.5,
+            # (float) loss weight of the entropy regularization, the weight of the policy network is set to 1
+            entropy_weight=0.01,
+        ),
+        collect=dict(
+            # (int) collect n_sample data, train model n_iteration times
+            n_sample=80,
+            # (float) the trade-off factor lambda to balance 1-step TD and MC
+            gae_lambda=0.95,
+        ),
+        eval=dict(evaluator=dict(eval_freq=100, )),
+    ),
+)
+memory_len_a2c_config = EasyDict(memory_len_a2c_config)
+main_config = memory_len_a2c_config
+
+memory_len_a2c_create_config = dict(
+    env=dict(
+        type='bsuite',
+        import_names=['dizoo.bsuite.envs.bsuite_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='a2c'),
+)
+memory_len_a2c_create_config = EasyDict(memory_len_a2c_create_config)
+create_config = memory_len_a2c_create_config
+
+if __name__ == "__main__":
+    serial_pipeline([main_config, create_config], seed=0)
\ No newline at end of file
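For orientation when tuning value_weight and entropy_weight above: in the usual actor-critic formulation they scale the critic loss and the entropy bonus against the policy-gradient term, whose weight is fixed to 1. A minimal sketch of that weighting follows; the a2c_loss helper is hypothetical, not DI-engine's actual implementation.

    import torch
    import torch.nn.functional as F

    def a2c_loss(logits, actions, advantages, values, returns,
                 value_weight=0.5, entropy_weight=0.01):
        # Hypothetical helper: shows how the config weights are commonly combined.
        dist = torch.distributions.Categorical(logits=logits)
        policy_loss = -(dist.log_prob(actions) * advantages).mean()  # weight fixed to 1
        value_loss = F.mse_loss(values, returns)  # scaled by value_weight (0.5 here)
        entropy = dist.entropy().mean()  # bonus scaled by entropy_weight (0.01 here)
        return policy_loss + value_weight * value_loss - entropy_weight * entropy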
diff --git a/dizoo/bsuite/config/serial/memory_len/memory_len_0_dqn_config.py b/dizoo/bsuite/config/serial/memory_len/memory_len_0_dqn_config.py
index a6a6c8b..6136c7c 100644
--- a/dizoo/bsuite/config/serial/memory_len/memory_len_0_dqn_config.py
+++ b/dizoo/bsuite/config/serial/memory_len/memory_len_0_dqn_config.py
@@ -1,11 +1,12 @@
 from easydict import EasyDict
+from ding.entry import serial_pipeline
 
-memory_len_0_dqn_config = dict(
+memory_len_dqn_config = dict(
     exp_name='memory_len_0_dqn',
     env=dict(
         collector_env_num=8,
         evaluator_env_num=1,
-        n_evaluator_episode=10,
+        n_evaluator_episode=100,
         env_id='memory_len/0',
         stop_value=1.,
     ),
@@ -25,7 +26,7 @@ memory_len_0_dqn_config = dict(
             learning_rate=0.001,
         ),
         collect=dict(n_sample=8),
-        eval=dict(evaluator=dict(eval_freq=20, )),
+        eval=dict(evaluator=dict(eval_freq=100, )),
         other=dict(
             eps=dict(
                 type='exp',
@@ -37,9 +38,9 @@ memory_len_0_dqn_config = dict(
         ),
     ),
 )
-memory_len_0_dqn_config = EasyDict(memory_len_0_dqn_config)
-main_config = memory_len_0_dqn_config
-memory_len_0_dqn_create_config = dict(
+memory_len_dqn_config = EasyDict(memory_len_dqn_config)
+main_config = memory_len_dqn_config
+memory_len_dqn_create_config = dict(
     env=dict(
         type='bsuite',
         import_names=['dizoo.bsuite.envs.bsuite_env'],
@@ -47,5 +48,8 @@ memory_len_0_dqn_create_config = dict(
     env_manager=dict(type='base'),
     policy=dict(type='dqn'),
 )
-memory_len_0_dqn_create_config = EasyDict(memory_len_0_dqn_create_config)
-create_config = memory_len_0_dqn_create_config
+memory_len_dqn_create_config = EasyDict(memory_len_dqn_create_config)
+create_config = memory_len_dqn_create_config
+
+if __name__ == "__main__":
+    serial_pipeline([main_config, create_config], seed=0)
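The eps block in these configs (type='exp') describes an exponentially decaying epsilon-greedy schedule from start to end, with decay setting the timescale in environment steps. The sketch below shows how such a schedule is typically computed; this is an assumption about the 'exp' type, not necessarily DI-engine's exact formula, and it borrows the start=0.95, end=0.05, decay=1e5 values from the R2D2 configs that follow.

    import math

    def exp_eps(step, start=0.95, end=0.05, decay=1e5):
        # epsilon decays exponentially from `start` toward `end`;
        # `decay` sets the timescale of the transition in environment steps
        return end + (start - end) * math.exp(-step / decay)

    # exp_eps(0) == 0.95; exp_eps(1e5) ~= 0.38; exp_eps(1e6) ~= 0.05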
diff --git a/dizoo/bsuite/config/serial/memory_len/memory_len_10_r2d2_config.py b/dizoo/bsuite/config/serial/memory_len/memory_len_10_r2d2_config.py
new file mode 100644
index 0000000..7ade794
--- /dev/null
+++ b/dizoo/bsuite/config/serial/memory_len/memory_len_10_r2d2_config.py
@@ -0,0 +1,78 @@
+from easydict import EasyDict
+from ding.entry import serial_pipeline
+
+collector_env_num = 8
+evaluator_env_num = 1
+memory_len_r2d2_config = dict(
+    exp_name='memory_len_10_r2d2',
+    env=dict(
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+        n_evaluator_episode=100,
+        env_id='memory_len/10',  # 12 memory steps, 13 obs per episode
+        stop_value=1.,
+    ),
+    policy=dict(
+        cuda=True,
+        priority=True,
+        priority_IS_weight=True,
+        model=dict(
+            obs_shape=3,
+            action_shape=2,
+            encoder_hidden_size_list=[128, 128, 64],
+        ),
+        discount_factor=0.997,
+        burnin_step=1,
+        nstep=2,
+        # (int) the whole sequence length to unroll the RNN network minus
+        # the timesteps of the burn-in part,
+        # i.e., <the whole sequence length> = <unroll_len> + <burnin_step>
+        unroll_len=15,
+        learn=dict(
+            # According to the R2D2 paper, the actor parameter update interval is 400
+            # environment timesteps, and in each collect phase we collect 32 sequence
+            # samples; the length of each sample sequence is <unroll_len> + <burnin_step>,
+            # which is 100 in our setting, 32*100/400=8, so we set update_per_collect=8
+            # in most environments.
+            update_per_collect=8,
+            batch_size=64,
+            learning_rate=0.0005,
+            target_update_theta=0.001,
+        ),
+        collect=dict(
+            # NOTE: it is important not to include the key n_sample here, to make sure self._traj_len=INF
+            each_iter_n_sample=32,
+            env_num=collector_env_num,
+        ),
+        eval=dict(env_num=evaluator_env_num, evaluator=dict(eval_freq=100, )),
+        other=dict(
+            eps=dict(
+                type='exp',
+                start=0.95,
+                end=0.05,
+                decay=1e5,
+            ),
+            replay_buffer=dict(replay_buffer_size=50000,
+                # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
+                alpha=0.6,
+                # (Float type) How much correction is used: 0 means no correction while 1 means full correction
+                beta=0.4,
+            )
+        ),
+    ),
+)
+memory_len_r2d2_config = EasyDict(memory_len_r2d2_config)
+main_config = memory_len_r2d2_config
+memory_len_r2d2_create_config = dict(
+    env=dict(
+        type='bsuite',
+        import_names=['dizoo.bsuite.envs.bsuite_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='r2d2'),
+)
+memory_len_r2d2_create_config = EasyDict(memory_len_r2d2_create_config)
+create_config = memory_len_r2d2_create_config
+
+if __name__ == "__main__":
+    serial_pipeline([main_config, create_config], seed=0)
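The burn-in arithmetic in the comments above (<the whole sequence length> = <unroll_len> + <burnin_step>) is easy to get wrong, so here is a concrete illustration; split_sequence is a hypothetical helper, not DI-engine code.

    def split_sequence(transitions, burnin_step=1):
        # A stored sample holds burnin_step + unroll_len consecutive transitions.
        # The first burnin_step steps only warm up the RNN hidden state;
        # the loss is computed on the remaining unroll_len steps.
        return transitions[:burnin_step], transitions[burnin_step:]

    # memory_len/10 config: burnin_step=1, unroll_len=15 -> 16-step sequences
    burnin, train = split_sequence(list(range(16)), burnin_step=1)
    assert len(burnin) == 1 and len(train) == 15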
diff --git a/dizoo/bsuite/config/serial/memory_len/memory_len_17_r2d2_config.py b/dizoo/bsuite/config/serial/memory_len/memory_len_17_r2d2_config.py
new file mode 100644
index 0000000..9195bb0
--- /dev/null
+++ b/dizoo/bsuite/config/serial/memory_len/memory_len_17_r2d2_config.py
@@ -0,0 +1,78 @@
+from easydict import EasyDict
+from ding.entry import serial_pipeline
+
+collector_env_num = 8
+evaluator_env_num = 1
+memory_len_r2d2_config = dict(
+    exp_name='memory_len_17_r2d2',
+    env=dict(
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+        n_evaluator_episode=100,
+        env_id='memory_len/17',  # 50 memory steps, 51 obs per episode
+        stop_value=1.,
+    ),
+    policy=dict(
+        cuda=True,
+        priority=True,
+        priority_IS_weight=True,
+        model=dict(
+            obs_shape=3,
+            action_shape=2,
+            encoder_hidden_size_list=[128, 128, 64],
+        ),
+        discount_factor=0.997,
+        burnin_step=1,
+        nstep=2,
+        # (int) the whole sequence length to unroll the RNN network minus
+        # the timesteps of the burn-in part,
+        # i.e., <the whole sequence length> = <unroll_len> + <burnin_step>
+        unroll_len=60,
+        learn=dict(
+            # According to the R2D2 paper, the actor parameter update interval is 400
+            # environment timesteps, and in each collect phase we collect 32 sequence
+            # samples; the length of each sample sequence is <unroll_len> + <burnin_step>,
+            # which is 100 in our setting, 32*100/400=8, so we set update_per_collect=8
+            # in most environments.
+            update_per_collect=8,
+            batch_size=64,
+            learning_rate=0.0005,
+            target_update_theta=0.001,
+        ),
+        collect=dict(
+            # NOTE: it is important not to include the key n_sample here, to make sure self._traj_len=INF
+            each_iter_n_sample=32,
+            env_num=collector_env_num,
+        ),
+        eval=dict(env_num=evaluator_env_num, evaluator=dict(eval_freq=100, )),
+        other=dict(
+            eps=dict(
+                type='exp',
+                start=0.95,
+                end=0.05,
+                decay=1e5,
+            ),
+            replay_buffer=dict(replay_buffer_size=50000,
+                # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
+                alpha=0.6,
+                # (Float type) How much correction is used: 0 means no correction while 1 means full correction
+                beta=0.4,
+            )
+        ),
+    ),
+)
+memory_len_r2d2_config = EasyDict(memory_len_r2d2_config)
+main_config = memory_len_r2d2_config
+memory_len_r2d2_create_config = dict(
+    env=dict(
+        type='bsuite',
+        import_names=['dizoo.bsuite.envs.bsuite_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='r2d2'),
+)
+memory_len_r2d2_create_config = EasyDict(memory_len_r2d2_create_config)
+create_config = memory_len_r2d2_create_config
+
+if __name__ == "__main__":
+    serial_pipeline([main_config, create_config], seed=0)
--
GitLab
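A closing note on the replay_buffer fields in the two R2D2 configs: priority and priority_IS_weight enable prioritized experience replay, where alpha shapes the sampling distribution and beta scales the importance-sampling correction (Schaul et al., 2016). A minimal sketch of the standard formulas; per_sample is illustrative only and is not DI-engine's buffer implementation.

    import numpy as np

    def per_sample(priorities, batch_size, alpha=0.6, beta=0.4, seed=0):
        # P(i) ~ p_i**alpha: alpha=0 is uniform sampling, alpha=1 full prioritization
        probs = np.asarray(priorities, dtype=np.float64) ** alpha
        probs /= probs.sum()
        rng = np.random.default_rng(seed)
        idx = rng.choice(len(probs), size=batch_size, p=probs)
        # w_i = (1 / (N * P(i)))**beta: beta=0 is no correction, beta=1 full correction
        weights = (len(probs) * probs[idx]) ** -beta
        return idx, weights / weights.max()  # normalized for training stability

    idx, w = per_sample([1.0, 2.0, 4.0, 8.0], batch_size=2)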