Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenDILab开源决策智能平台
DI-engine
提交
17c9f04d
D
DI-engine
项目概览
OpenDILab开源决策智能平台
/
DI-engine
上一次同步 2 年多
通知
56
Star
321
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DI-engine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
17c9f04d
编写于
12月 17, 2021
作者:
P
puyuan1996
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feature(pu): add ddpg lunarlander_cont config
上级
3dbce395
变更
4
隐藏空白更改
内联
并排
Showing 4 changed files with 125 additions and 20 deletions
+125
-20
ding/policy/il.py
ding/policy/il.py
+4
-4
ding/policy/td3_vae.py
ding/policy/td3_vae.py
+6
-3
dizoo/box2d/lunarlander/config/lunarlander_cont_ddpg_config.py
.../box2d/lunarlander/config/lunarlander_cont_ddpg_config.py
+69
-0
dizoo/box2d/lunarlander/config/lunarlander_cont_td3_vae_config.py
...x2d/lunarlander/config/lunarlander_cont_td3_vae_config.py
+46
-13
未找到文件。
ding/policy/il.py
浏览文件 @
17c9f04d
...
...
@@ -8,10 +8,10 @@ from ding.model import model_wrap
from
ding.utils
import
POLICY_REGISTRY
from
ding.utils.data
import
default_collate
,
default_decollate
from
.base_policy
import
Policy
try
:
from
dizoo.gfootball.model.bots
import
FootballKaggle5thPlaceModel
except
ImportError
:
FootballKaggle5thPlaceModel
=
None
# try:
#     from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel
# except ImportError:
#     FootballKaggle5thPlaceModel = None
@
POLICY_REGISTRY
.
register
(
'IL'
)
...
...
ding/policy/td3_vae.py
浏览文件 @
17c9f04d
...
...
@@ -146,7 +146,8 @@ class TD3VAEPolicy(DDPGPolicy):
),
),
collect
=
dict
(
n_sample
=
1
,
# n_sample=1,
each_iter_n_sample
=
48
,
# (int) Cut trajectories into pieces with length "unroll_len".
unroll_len
=
1
,
# (float) It is a must to add noise during collection. So here omits "noise" and only set "noise_sigma".
...
...
@@ -257,7 +258,7 @@ class TD3VAEPolicy(DDPGPolicy):
{
'action'
:
data
[
'action'
],
'obs'
:
data
[
'obs'
]})
# [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
data
[
'latent_action'
]
=
result
[
5
].
detach
()
# TODO(pu): update latent_action mu
# data['latent_action'] = result[5].detach() # TODO(pu): update latent_action mu
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
result
.
pop
(
-
1
)
# remove z
result
[
2
]
=
data
[
'action'
]
...
...
@@ -323,7 +324,7 @@ class TD3VAEPolicy(DDPGPolicy):
{
'action'
:
data
[
'action'
],
'obs'
:
data
[
'obs'
]})
# [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
data
[
'latent_action'
]
=
result
[
5
].
detach
()
# TODO(pu): update latent_action z
# data['latent_action'] = result[5].detach() # TODO(pu): update latent_action z
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
result
.
pop
(
-
1
)
# remove z
result
[
2
]
=
data
[
'action'
]
...
...
@@ -378,6 +379,7 @@ class TD3VAEPolicy(DDPGPolicy):
result
=
self
.
_vae_model
(
{
'action'
:
data
[
'action'
],
'obs'
:
data
[
'obs'
]})
# [self.decode(z)[0], self.decode(z)[1], input, mu, log_var, z]
# if result[1].detach()
data
[
'latent_action'
]
=
result
[
5
].
detach
()
# TODO(pu): update latent_action z
# data['latent_action'] = result[3].detach() # TODO(pu): update latent_action mu
...
...
@@ -427,6 +429,7 @@ class TD3VAEPolicy(DDPGPolicy):
# ===============================
# actor updates every ``self._actor_update_freq`` iters
if
(
self
.
_forward_learn_cnt
+
1
)
%
self
.
_actor_update_freq
==
0
:
actor_data
=
self
.
_learn_model
.
forward
(
data
[
'obs'
],
mode
=
'compute_actor'
)
# latent action
actor_data
[
'obs'
]
=
data
[
'obs'
]
if
self
.
_twin_critic
:
...
...
dizoo/box2d/lunarlander/config/lunarlander_cont_ddpg_config.py
0 → 100644
浏览文件 @
17c9f04d
from
easydict
import
EasyDict
from
ding.entry
import
serial_pipeline
lunarlander_ddpg_config
=
dict
(
exp_name
=
'lunarlander_cont_ddpg'
,
env
=
dict
(
env_id
=
'LunarLanderContinuous-v2'
,
collector_env_num
=
8
,
evaluator_env_num
=
5
,
# (bool) Scale output action into legal range.
act_scale
=
True
,
n_evaluator_episode
=
5
,
stop_value
=
200
,
),
policy
=
dict
(
cuda
=
False
,
priority
=
False
,
random_collect_size
=
0
,
model
=
dict
(
obs_shape
=
8
,
action_shape
=
2
,
twin_critic
=
True
,
actor_head_type
=
'regression'
,
),
learn
=
dict
(
update_per_collect
=
2
,
batch_size
=
128
,
learning_rate_actor
=
0.001
,
learning_rate_critic
=
0.001
,
ignore_done
=
False
,
# TODO(pu)
# (int) When critic network updates once, how many times will actor network update.
# Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
# Default 1 for DDPG, 2 for TD3.
actor_update_freq
=
1
,
# (bool) Whether to add noise on target network's action.
# Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf).
# Default True for TD3, False for DDPG.
noise
=
False
,
noise_sigma
=
0.1
,
noise_range
=
dict
(
min
=-
0.5
,
max
=
0.5
,
),
),
collect
=
dict
(
n_sample
=
48
,
noise_sigma
=
0.1
,
collector
=
dict
(
collect_print_freq
=
1000
,
),
),
eval
=
dict
(
evaluator
=
dict
(
eval_freq
=
100
,
),
),
other
=
dict
(
replay_buffer
=
dict
(
replay_buffer_size
=
20000
,
),
),
),
)
lunarlander_ddpg_config
=
EasyDict
(
lunarlander_ddpg_config
)
main_config
=
lunarlander_ddpg_config
lunarlander_ddpg_create_config
=
dict
(
env
=
dict
(
type
=
'lunarlander'
,
import_names
=
[
'dizoo.box2d.lunarlander.envs.lunarlander_env'
],
),
env_manager
=
dict
(
type
=
'base'
),
policy
=
dict
(
type
=
'ddpg'
),
)
lunarlander_ddpg_create_config
=
EasyDict
(
lunarlander_ddpg_create_config
)
create_config
=
lunarlander_ddpg_create_config
if
__name__
==
'__main__'
:
serial_pipeline
((
main_config
,
create_config
),
seed
=
0
)
\ No newline at end of file
dizoo/box2d/lunarlander/config/lunarlander_cont_td3_vae_config.py
浏览文件 @
17c9f04d
...
...
@@ -2,10 +2,36 @@ from easydict import EasyDict
from
ding.entry
import
serial_pipeline_td3_vae
lunarlander_td3vae_config
=
dict
(
#TODO(pu):run3 ddpg
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins1280_rvuc10_upcr20_upcv100_noisefalse_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins1280_rvuc10_upcr20_upcv100_noisefalse_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc100_upcr2_upcv100_noisefalse_rbs2e4', # TODO(pu): lr 1e-3 loss diverge
exp_name
=
'lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins1280_rvuc1_upcr20_upcv20_rbs2e4'
,
# TODO(pu)
# mu 1e-3, z1e-1
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1_upcr2_upcv2_noisetrue_rbs2e4', # TODO(pu) lr 3e-4 loss explode 45000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc20_upcr2_upcv200_noisetrue_rbs2e4', # TODO(pu) lr 3e-4 loss explode 10000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1000_upcr2_upcv1000_noisetrue_rbs1e5', # TODO(pu) loss explode 10000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_murelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu) loss explode 3000iters
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_zrelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu) 80000iters eval rew_mean -278
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs1e5', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc1000_upcr2_upcv0_noisetrue_rbs2e4', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc100_upcr2_upcv0_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) debug 2m collect rew_max 200
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_eins48_rvuc100_upcr2_upcv0_targetnoise_nocollectnoise_rbs2e4', # TODO(pu) 2m collect rew_mean -120 不变
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_norelabel_vaeupdatez_eins48_rvuc100_upcr2_upcv100_noisetrue_rbs2e4', # TODO(pu) 90000iters eval rew_mean -139
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr2_upcv100_noisetrue_rbs2e4', # TODO(pu)
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr2_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) 2m eval rew_mean -210
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr20_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run9 0.5m eval rew_mean -46 best now
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc100_upcr50_upcv100_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run10
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr2_upcv1000_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run5 0.5m eval rew_mean -43
# exp_name='lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr20_upcv1000_notargetnoise_nocollectnoise_rbs2e4', # TODO(pu) run11
exp_name
=
'lunarlander_cont_td3_vae_lad6_wu1000_relabelz_novaeupdatez_eins48_rvuc1000_upcr20_upcv1000_notargetnoise_nocollectnoise_rbs1e5'
,
# TODO(pu) run2
env
=
dict
(
env_id
=
'LunarLanderContinuous-v2'
,
...
...
@@ -27,6 +53,8 @@ lunarlander_td3vae_config = dict(
model
=
dict
(
obs_shape
=
8
,
action_shape
=
6
,
# latent_action_dim
# action_shape=6, # latent_action_dim
twin_critic
=
True
,
actor_head_type
=
'regression'
,
),
...
...
@@ -35,14 +63,17 @@ lunarlander_td3vae_config = dict(
warm_up_update
=
1000
,
# vae_train_times_per_update=1, # TODO(pu)
# rl_vae_update_circle=10, # train rl 10 iter, vae 1 iter
rl_vae_update_circle
=
1
,
# train rl 1 iter, vae 1 iter
rl_vae_update_circle
=
1000
,
# train rl 10 iter, vae 1 iter
# rl_vae_update_circle=1, # train rl 1 iter, vae 1 iter
# update_per_collect_rl=50,
update_per_collect_rl
=
20
,
# update_per_collect_rl=2,
# update_per_collect_vae=100, # each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
update_per_collect_vae
=
20
,
# each mini-batch: replay_buffer_recent sample 128
update_per_collect_vae
=
1000
,
# each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
# update_per_collect_vae=20, # each mini-batch: replay_buffer_recent sample 128
# update_per_collect_vae=2, # each mini-batch: replay_buffer_recent sample 128
# update_per_collect_vae=0, # each mini-batch: replay_buffer_recent sample 128, replay_buffer sample 128
batch_size
=
128
,
learning_rate_actor
=
3e-4
,
...
...
@@ -50,8 +81,8 @@ lunarlander_td3vae_config = dict(
learning_rate_vae
=
3e-4
,
ignore_done
=
False
,
# TODO(pu)
actor_update_freq
=
2
,
noise
=
True
,
# noise=False, # TODO(pu)
# noise=True,
noise
=
False
,
# TODO(pu)
noise_sigma
=
0.1
,
noise_range
=
dict
(
min
=-
0.5
,
...
...
@@ -59,15 +90,17 @@ lunarlander_td3vae_config = dict(
),
),
collect
=
dict
(
# each_iter_n_sample=48,
each_iter_n_sample
=
48
,
# each_iter_n_sample=256,
each_iter_n_sample
=
1280
,
noise_sigma
=
0.1
,
# each_iter_n_sample=1280,
unroll_len
=
1
,
# TODO(pu)
# noise_sigma=0.1,
noise_sigma
=
0
,
# TODO(pu)
collector
=
dict
(
collect_print_freq
=
1000
,
),
),
eval
=
dict
(
evaluator
=
dict
(
eval_freq
=
100
,
),
),
other
=
dict
(
replay_buffer
=
dict
(
replay_buffer_size
=
20000
,
),
),
# other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ),
# other=dict(replay_buffer=dict(replay_buffer_size=int(2e4), ), ),
other
=
dict
(
replay_buffer
=
dict
(
replay_buffer_size
=
int
(
1e5
),
),
),
),
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录