diff --git a/ding/policy/r2d3.py b/ding/policy/r2d3.py
index 3b07297d0d870263c7416b3bc44b16a4f5c683ec..3db536193885e8ab358aed2dbbbe9cc2c53eb7d9 100644
--- a/ding/policy/r2d3.py
+++ b/ding/policy/r2d3.py
@@ -352,14 +352,15 @@ class R2D3Policy(Policy):
                     value_gamma=value_gamma[t],
                 )
                 loss.append(l)
-                td_error.append(e.abs())
+                # td_error.append(e.abs())  # first sum then abs
+                td_error.append(e)  # first abs then sum
                 # loss statistics for debugging
                 loss_nstep.append(loss_statistics[0])
                 loss_1step.append(loss_statistics[1])
                 loss_sl.append(loss_statistics[2])
 
             else:
-                l, e = dqfd_nstep_td_error(
+                l, e, loss_statistics = dqfd_nstep_td_error(
                     td_data,
                     self._gamma,
                     self.lambda1,
@@ -371,7 +372,12 @@
                     value_gamma=value_gamma[t],
                 )
                 loss.append(l)
-                td_error.append(e.abs())
+                # td_error.append(e.abs())  # first sum then abs
+                td_error.append(e)  # first abs then sum
+                # loss statistics for debugging
+                loss_nstep.append(loss_statistics[0])
+                loss_1step.append(loss_statistics[1])
+                loss_sl.append(loss_statistics[2])
 
         loss = sum(loss) / (len(loss) + 1e-8)
         # loss statistics for debugging
diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py
index 5a48dd4fa336758aaa4af0244f7c6632ea34e1bf..887ab3513711c1292e2dfceeae0173c75d72a935 100644
--- a/ding/rl_utils/td.py
+++ b/ding/rl_utils/td.py
@@ -669,8 +669,9 @@ def dqfd_nstep_td_error(
                 lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample +
                 lambda_supervised_loss * JE
             ) * weight
-        ).mean(), lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample +
-        lambda_supervised_loss * JE
+        ).mean(), lambda_n_step_td * td_error_per_sample.abs() +
+        lambda_one_step_td * td_error_one_step_per_sample.abs() + lambda_supervised_loss * JE.abs(),
+        (td_error_per_sample.mean(), td_error_one_step_per_sample.mean(), JE.mean())
     )
 
 
@@ -775,8 +776,9 @@
                 lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample +
                 lambda_supervised_loss * JE
             ) * weight
-        ).mean(), lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample +
-        lambda_supervised_loss * JE, (td_error_per_sample.mean(), td_error_one_step_per_sample.mean(), JE.mean())
+        ).mean(), lambda_n_step_td * td_error_per_sample.abs() +
+        lambda_one_step_td * td_error_one_step_per_sample.abs() + lambda_supervised_loss * JE.abs(),
+        (td_error_per_sample.mean(), td_error_one_step_per_sample.mean(), JE.mean())
     )
 
 
diff --git a/dizoo/atari/config/serial/pong/pong_r2d2_config.py b/dizoo/atari/config/serial/pong/pong_r2d2_config.py
index 6fbc18671081c4927dc14bfe4e476176df662038..36bab76feeded0e837de7cf5d6debbe27441aa91 100644
--- a/dizoo/atari/config/serial/pong/pong_r2d2_config.py
+++ b/dizoo/atari/config/serial/pong/pong_r2d2_config.py
@@ -4,7 +4,7 @@ from ding.entry import serial_pipeline
 collector_env_num = 8
 evaluator_env_num = 5
 pong_r2d2_config = dict(
-    exp_name='debug_pong_r2d2_n5_bs2_ul40_rbs1e4_seed0',
+    exp_name='pong_r2d2_n5_bs2_ul40_rbs1e4_seed0',
     env=dict(
         collector_env_num=collector_env_num,
         evaluator_env_num=evaluator_env_num,
diff --git a/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py b/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py
index 35c998af5e2a7cf88926ec247098e60874be51b2..5b113268f70f1fdd813719d9a908ea4421fdd854 100644
--- a/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py
+++ b/dizoo/atari/config/serial/pong/pong_r2d3_offppoexpert_config.py
@@ -6,10 +6,11 @@
 module_path = os.path.dirname(__file__)
 collector_env_num = 8
 evaluator_env_num = 5
-expert_replay_buffer_size=1000 #TODO 1000
+expert_replay_buffer_size = int(5e3)  # TODO(pu)
+
 """agent config"""
 pong_r2d3_config = dict(
-    exp_name='debug_pong_r2d3_offppoexpert_k0_pho1-256_rbs2e4',
+    exp_name='pong_r2d3_offppoexpert_k0_pho1-4_rbs2e4_ds5e3',
     env=dict(
         # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
         manager=dict(shared_memory=True, force_reproducibility=True),
@@ -62,7 +63,7 @@
             env_num=collector_env_num,
             # The hyperparameter pho, the demo ratio, control the propotion of data coming\
             # from expert demonstrations versus from the agent's own experience.
-            pho=1/256,  # 1/256, #TODO(pu), 0.25,
+            pho=1/4,  # TODO(pu)
         ),
         eval=dict(env_num=evaluator_env_num, ),
         other=dict(
@@ -98,7 +99,7 @@
 create_config = pong_r2d3_create_config
 """export config"""
 expert_pong_r2d3_config = dict(
-    exp_name='expert_pong_r2d3_ppoexpert_k0_pho1-256_rbs2e4',
+    exp_name='expert_pong_r2d3_ppoexpert_k0_pho1-4_rbs2e4_ds5e3',
     env=dict(
         # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
         manager=dict(shared_memory=True, force_reproducibility=True),
diff --git a/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py b/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py
index d7c3ef11e381ec3adc1c0a3af9355052c63e9b60..7354204ee164385b99acba9e61addcba6fd9a61f 100644
--- a/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py
+++ b/dizoo/atari/config/serial/pong/pong_r2d3_r2d2expert_config.py
@@ -5,11 +5,11 @@
 module_path = os.path.dirname(__file__)
 
 collector_env_num = 8
 evaluator_env_num = 5
-expert_replay_buffer_size=int(5e3)
+expert_replay_buffer_size = int(5e3)  # TODO(pu)
 """agent config"""
 pong_r2d3_config = dict(
-    exp_name='debug_pong_r2d3_r2d2expert_k0_pho1-4_rbs2e4_ds5e3',
+    exp_name='pong_r2d3_r2d2expert_k0_pho1-4_rbs2e4_ds5e3',
     env=dict(
         # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
         manager=dict(shared_memory=True, force_reproducibility=True),
@@ -45,7 +45,7 @@
             # in most environments
             value_rescale=True,
             update_per_collect=8,
-            batch_size=64,  # TODO(pu)
+            batch_size=64,
             learning_rate=0.0005,
             target_update_theta=0.001,
             # DQFD related parameters
@@ -97,7 +97,7 @@
 create_config = pong_r2d3_create_config
 """export config"""
 expert_pong_r2d3_config = dict(
-    exp_name='expert_pong_r2d3_r2d2expert_k0_pho1-4_rbs1e4_ds5e3',
+    exp_name='expert_pong_r2d3_r2d2expert_k0_pho1-4_rbs2e4_ds5e3',
     env=dict(
         # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
         manager=dict(shared_memory=True, force_reproducibility=True),
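
Note on the td_error change: the per-sample priority returned by dqfd_nstep_td_error (and its with_rescale variant) used to be the signed weighted sum of the n-step, one-step, and supervised components, with r2d3.py taking .abs() of that sum afterwards. Components of opposite sign can cancel, so a transition whose individual errors are all large could receive a near-zero replay priority. The patch instead takes .abs() per component before summing ("first abs then sum"). A minimal, self-contained sketch with made-up lambda weights and error values (not taken from the patch) illustrates the difference:

# Sketch of the priority computation change (hypothetical weights and values).
import torch

lambda_n_step_td = 1.0
lambda_one_step_td = 1.0
lambda_supervised_loss = 1.0

# Per-sample error components for a batch of 3 transitions (illustrative only).
td_error_per_sample = torch.tensor([2.0, -1.5, 0.3])           # n-step TD error
td_error_one_step_per_sample = torch.tensor([-2.0, 1.0, 0.2])  # one-step TD error
JE = torch.tensor([0.0, 0.6, 0.4])                             # supervised margin term

# Before this patch: sum the signed components, take abs() in the policy.
priority_before = (
    lambda_n_step_td * td_error_per_sample + lambda_one_step_td * td_error_one_step_per_sample +
    lambda_supervised_loss * JE
).abs()

# After this patch: abs() per component inside dqfd_nstep_td_error, then sum.
priority_after = (
    lambda_n_step_td * td_error_per_sample.abs() +
    lambda_one_step_td * td_error_one_step_per_sample.abs() +
    lambda_supervised_loss * JE.abs()
)

print(priority_before)  # tensor([0.0000, 0.1000, 0.9000]) -- components cancel
print(priority_after)   # tensor([4.0000, 3.1000, 0.9000]) -- no cancellation

The companion change gives dqfd_nstep_td_error the same three-element return as dqfd_nstep_td_error_with_rescale, namely (loss, per-sample priority, loss statistics), so r2d3.py can unpack loss_statistics and log loss_nstep, loss_1step, and loss_sl on both branches.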