Commit 17c9f04d authored by puyuan1996

feature(pu): add ddpg lunarlander_cont config

Parent 3dbce395
......@@ -8,10 +8,10 @@ from ding.model import model_wrap
from ding.utils import POLICY_REGISTRY
from ding.utils.data import default_collate, default_decollate
from .base_policy import Policy
try:
from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel
except ImportError:
FootballKaggle5thPlaceModel = None
# try:
# from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel
# except ImportError:
# FootballKaggle5thPlaceModel = None
@POLICY_REGISTRY.register('IL')
......
......@@ -146,7 +146,8 @@ class TD3VAEPolicy(DDPGPolicy):
),
),
collect=dict(
n_sample=1,
# n_sample=1,
each_iter_n_sample=48,
# (int) Cut trajectories into pieces with length "unroll_len".
unroll_len=1,
# (float) It is a must to add noise during collection. So here omits "noise" and only set "noise_sigma".
......@@ -257,7 +258,7 @@ class TD3VAEPolicy(DDPGPolicy):
{'action': data['action'],
'obs': data['obs']}) # [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
data['latent_action'] = result[5].detach() # TODO(pu): update latent_action mu
# data['latent_action'] = result[5].detach() # TODO(pu): update latent_action mu
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
result.pop(-1) # remove z
result[2] = data['action']
......@@ -323,7 +324,7 @@ class TD3VAEPolicy(DDPGPolicy):
{'action': data['action'],
'obs': data['obs']}) # [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
data['latent_action'] = result[5].detach() # TODO(pu): update latent_action z
# data['latent_action'] = result[5].detach() # TODO(pu): update latent_action z
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
result.pop(-1) # remove z
result[2] = data['action']
......@@ -378,6 +379,7 @@ class TD3VAEPolicy(DDPGPolicy):
result = self._vae_model(
{'action': data['action'],
'obs': data['obs']}) # [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
# if result[1].detach()
data['latent_action'] = result[5].detach() # TODO(pu): update latent_action z
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
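Note: the three hunks above toggle which VAE output is used to relabel `latent_action`. Per the inline comment, the forward result is ordered as [decode(z)[0], decode(z)[1], input, mu, log_var, z], so index 5 is the sampled latent z and index 3 is the posterior mean mu. A minimal sketch of that pattern (the helper name and the `use_z` flag are illustrative, not part of this commit):

# Sketch: relabel the stored transition with either z (index 5) or mu (index 3)
# taken from the VAE forward output [recon, pred, input, mu, log_var, z].
def relabel_latent_action(vae_model, data, use_z=True):
    result = vae_model({'action': data['action'], 'obs': data['obs']})
    data['latent_action'] = (result[5] if use_z else result[3]).detach()
    return data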
......@@ -427,6 +429,7 @@ class TD3VAEPolicy(DDPGPolicy):
# ===============================
# actor updates every ``self._actor_update_freq`` iters
if (self._forward_learn_cnt + 1) % self._actor_update_freq == 0:
actor_data = self._learn_model.forward(data['obs'], mode='compute_actor') # latent action
actor_data['obs'] = data['obs']
if self._twin_critic:
......
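The hunk above sits inside the delayed-actor-update branch of the learn step; a rough sketch of that branch follows, with the critic call and optimizer names assumed for illustration rather than copied from this file:

# Sketch (assumed names): delayed policy update as in TD3.
# The actor only updates once every ``self._actor_update_freq`` critic updates.
if (self._forward_learn_cnt + 1) % self._actor_update_freq == 0:
    actor_data = self._learn_model.forward(data['obs'], mode='compute_actor')  # latent action
    actor_data['obs'] = data['obs']
    q_value = self._learn_model.forward(actor_data, mode='compute_critic')['q_value']
    if self._twin_critic:
        q_value = q_value[0]
    actor_loss = -q_value.mean()  # maximize Q(s, pi(s))
    self._optimizer_actor.zero_grad()
    actor_loss.backward()
    self._optimizer_actor.step()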
from easydict import EasyDict
from ding.entry import serial_pipeline
lunarlander_ddpg_config = dict(
exp_name='lunarlander_cont_ddpg',
env=dict(
env_id='LunarLanderContinuous-v2',
collector_env_num=8,
evaluator_env_num=5,
# (bool) Scale output action into legal range.
act_scale=True,
n_evaluator_episode=5,
stop_value=200,
),
policy=dict(
cuda=False,
priority=False,
random_collect_size=0,
model=dict(
obs_shape=8,
action_shape=2,
twin_critic=True,
actor_head_type='regression',
),
learn=dict(
update_per_collect=2,
batch_size=128,
learning_rate_actor=0.001,
learning_rate_critic=0.001,
ignore_done=False, # TODO(pu)
# (int) When critic network updates once, how many times will actor network update.
# Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
# Default 1 for DDPG, 2 for TD3.
actor_update_freq=1,
# (bool) Whether to add noise on target network's action.
# Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
# Default True for TD3, False for DDPG.
noise=False,
noise_sigma=0.1,
noise_range=dict(
min=-0.5,
max=0.5,
),
),
collect=dict(
n_sample=48,
noise_sigma=0.1,
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=100, ), ),
other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ),
),
)
lunarlander_ddpg_config = EasyDict(lunarlander_ddpg_config)
main_config = lunarlander_ddpg_config
lunarlander_ddpg_create_config = dict(
env=dict(
type='lunarlander',
import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'],
),
env_manager=dict(type='base'),
policy=dict(type='ddpg'),
)
lunarlander_ddpg_create_config = EasyDict(lunarlander_ddpg_create_config)
create_config = lunarlander_ddpg_create_config
if __name__ == '__main__':
serial_pipeline((main_config, create_config), seed=0)
\ No newline at end of file
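The `noise`, `noise_sigma` and `noise_range` fields in the `learn` block above correspond to Target Policy Smoothing from the TD3 paper; with `noise=False` they are inert in this plain DDPG config. A minimal standalone sketch of what they would control (function and argument names are illustrative):

import torch

# Sketch: target policy smoothing. Gaussian noise with std ``noise_sigma``
# is clipped to [noise_min, noise_max] and added to the target action
# before the target Q-value is computed.
def smoothed_target_action(target_action, noise_sigma=0.1, noise_min=-0.5, noise_max=0.5):
    noise = torch.randn_like(target_action) * noise_sigma
    return target_action + noise.clamp(noise_min, noise_max)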
......@@ -2,10 +2,36 @@ from easydict import EasyDict
from ding.entry import serial_pipeline_td3_vae
lunarlander_td3vae_config = dict(
# TODO(pu): run3 ddpg
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins1280_rvuc10_upcr20_upcv100_noisefalse_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins1280_rvuc10_upcr20_upcv100_noisefalse_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc100_upcr2_upcv100_noisefalse_rbs2e4', # TODO(pu): lr 1e-3 loss diverge
exp_name='lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins1280_rvuc1_upcr20_upcv20_rbs2e4', # TODO(pu)
# mu 1e-3, z1e-1
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1_upcr2_upcv2_noisetrue_rbs2e4', # TODO(pu) lr 3e-4 loss explode 45000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc20_upcr2_upcv200_noisetrue_rbs2e4', # TODO(pu) lr 3e-4 loss explode 10000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1000_upcr2_upcv1000_noisetrue_rbs1e5', # TODO(pu) loss explode 10000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu) loss explode 3000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu) 80000iters eval rew_mean -278
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs2e4', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc100_upcr2_upcv0_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) debug 2m collect rew_max 200
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc100_upcr2_upcv0_targetnoise_nocollectnoise_rbs2e4', # TODO(pu) 2m collect rew_mean -120, unchanged
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_vaeupdatez_eins48_rvuc100_upcr2_upcv100_noisetrue_rbs2e4', # TODO(pu) 90000iters eval rew_mean -139
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr2_upcv100_noisetrue_rbs2e4', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr2_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) 2m eval rew_mean -210
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr20_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run9 0.5m eval rew_mean -46 best now
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr50_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run10
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr2_upcv1000_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run5 0.5m eval rew_mean -43
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr20_upcv1000_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run11
exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr20_upcv1000_notargetnoise_nocollectnoise_rbs1e5', # TODO(pu) run2
env=dict(
env_id='LunarLanderContinuous-v2',
......@@ -27,6 +53,8 @@ lunarlander_td3vae_config = dict(
model=dict(
obs_shape=8,
action_shape=6, # latent_action_dim
# action_shape=6, # latent_action_dim
twin_critic=True,
actor_head_type='regression',
),
......@@ -35,14 +63,17 @@ lunarlander_td3vae_config = dict(
warm_up_update=1000,
# vae_train_times_per_update=1, # TODO(pu)
# rl_vae_update_circle=10, # train rl 10 iter, vae 1 iter
rl_vae_update_circle=1, # train rl 1 iter, vae 1 iter
rl_vae_update_circle=1000, # train rl 1000 iter, vae 1 iter
# rl_vae_update_circle=1, # train rl 1 iter, vae 1 iter
# update_per_collect_rl=50,
update_per_collect_rl=20,
# update_per_collect_rl=2,
# update_per_collect_vae=100, # each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
update_per_collect_vae=20, # each mini-batch: replay_buffer_recent sample 128
update_per_collect_vae=1000, # each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
# update_per_collect_vae=20, # each mini-batch: replay_buffer_recent sample 128
# update_per_collect_vae=2, # each mini-batch: replay_buffer_recent sample 128
# update_per_collect_vae=0, # each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
batch_size=128,
learning_rate_actor=3e-4,
......@@ -50,8 +81,8 @@ lunarlander_td3vae_config = dict(
learning_rate_vae=3e-4,
ignore_done=False, # TODO(pu)
actor_update_freq=2,
noise=True,
# noise=False, # TODO(pu)
# noise=True,
noise=False, # TODO(pu)
noise_sigma=0.1,
noise_range=dict(
min=-0.5,
......@@ -59,15 +90,17 @@ lunarlander_td3vae_config = dict(
),
),
collect=dict(
# each_iter_n_sample=48,
each_iter_n_sample=48,
# each_iter_n_sample=256,
each_iter_n_sample=1280,
noise_sigma=0.1,
# each_iter_n_sample=1280,
unroll_len=1, # TODO(pu)
# noise_sigma=0.1,
noise_sigma=0, # TODO(pu)
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=100, ), ),
other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ),
# other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ),
# other=dict(replay_buffer=dict(replay_buffer_size=int(2e4), ), ),
other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ),
),
)
......
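For reference, the tags toggled in the `exp_name` candidates above map onto the config fields changed in this commit; the decoding below is inferred from those fields, not taken from any documentation:

# Inferred meaning of the exp_name abbreviations used in this config.
EXP_NAME_TAGS = {
    'lad': 'latent action dim (model.action_shape)',
    'wu': 'learn.warm_up_update',
    'zrelabel / murelabel / norelabel': 'relabel latent_action with z, with mu, or not at all',
    'eins': 'collect.each_iter_n_sample',
    'rvuc': 'learn.rl_vae_update_circle',
    'upcr': 'learn.update_per_collect_rl',
    'upcv': 'learn.update_per_collect_vae',
    'noisetrue / noisefalse, (no)targetnoise / (no)collectnoise': 'learn.noise and collect.noise_sigma switches',
    'rbs': 'other.replay_buffer.replay_buffer_size',
}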