Greenplum / Annotated Deep Learning Paper Implementations
Commit 3f4caf56
Authored on Mar 27, 2021 by Varuna Jayasiri

✨ ppo configs

Parent: 5c429e7a

Showing 1 changed file with 71 additions and 24 deletions (+71 −24)

labml_nn/rl/ppo/experiment.py @ 3f4caf56
```diff
@@ -19,6 +19,7 @@ from torch import optim
 from torch.distributions import Categorical
 
 from labml import monit, tracker, logger, experiment
+from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam
 from labml_helpers.module import Module
 from labml_nn.rl.game import Worker
 from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
```
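The import added above is what this commit is built around: a `FloatDynamicHyperParam` is read by *calling* it (e.g. `self.clip_range()` later in the diff), so its value can be adjusted while the experiment is running. A minimal stand-in sketch of that calling convention, assuming only what the diff shows (the class name `DynamicFloat`, the `set` method, and the clamping behavior are illustrative, not labml's implementation):

```python
# Stand-in sketch for a dynamic float hyperparameter: a value that is
# read by calling the object, so it can be changed mid-training.
class DynamicFloat:
    def __init__(self, default: float, range_: tuple = (0.0, 1.0)):
        self._value = default
        self._range = range_  # allowed (min, max), like the (0, 1e-3) bound below

    def set(self, value: float):
        # Clamp to the declared range before updating
        lo, hi = self._range
        self._value = min(max(value, lo), hi)

    def __call__(self) -> float:
        return self._value


lr = DynamicFloat(2.5e-4, (0.0, 1e-3))
print(lr())   # 0.00025
lr.set(1e-4)  # adjust mid-run, e.g. from a monitoring dashboard
print(lr())   # 0.0001
```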
```diff
@@ -89,24 +90,40 @@ class Trainer:
     ## Trainer
     """
 
-    def __init__(self):
+    def __init__(self, *,
+                 updates: int, epochs: int, n_workers: int, worker_steps: int, batches: int,
+                 value_loss_coef: FloatDynamicHyperParam,
+                 entropy_bonus_coef: FloatDynamicHyperParam,
+                 clip_range: FloatDynamicHyperParam,
+                 learning_rate: FloatDynamicHyperParam,
+                 ):
         # #### Configurations
 
         # number of updates
-        self.updates = 10000
+        self.updates = updates
         # number of epochs to train the model with sampled data
-        self.epochs = 4
+        self.epochs = epochs
         # number of worker processes
-        self.n_workers = 8
+        self.n_workers = n_workers
         # number of steps to run on each process for a single update
-        self.worker_steps = 128
+        self.worker_steps = worker_steps
         # number of mini batches
-        self.n_mini_batch = 4
+        self.batches = batches
         # total number of samples for a single update
         self.batch_size = self.n_workers * self.worker_steps
         # size of a mini batch
-        self.mini_batch_size = self.batch_size // self.n_mini_batch
-        assert (self.batch_size % self.n_mini_batch == 0)
+        self.mini_batch_size = self.batch_size // self.batches
+        assert (self.batch_size % self.batches == 0)
+
+        # Value loss coefficient
+        self.value_loss_coef = value_loss_coef
+        # Entropy bonus coefficient
+        self.entropy_bonus_coef = entropy_bonus_coef
+        # Clipping range
+        self.clip_range = clip_range
+        # Learning rate
+        self.learning_rate = learning_rate
 
         # #### Initialize
```
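With the default values set in `main()` below (`n_workers=8`, `worker_steps=128`, `batches=4`), the batch sizing works out as follows; the `assert` guards the integer division:

```python
# Mini-batch sizing with the defaults from main() below:
n_workers, worker_steps, batches = 8, 128, 4
batch_size = n_workers * worker_steps    # 8 * 128 = 1024 samples per update
mini_batch_size = batch_size // batches  # 1024 // 4 = 256 samples per gradient step
assert batch_size % batches == 0         # the constructor enforces this
print(batch_size, mini_batch_size)       # 1024 256
```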
```diff
@@ -204,7 +221,7 @@ class Trainer:
 
         return samples_flat
 
-    def train(self, samples: Dict[str, torch.Tensor], learning_rate: float, clip_range: float):
+    def train(self, samples: Dict[str, torch.Tensor]):
         """
         ### Train the model based on samples
         """
```
```diff
@@ -228,12 +245,11 @@
                     mini_batch[k] = v[mini_batch_indexes]
 
                 # train
-                loss = self._calc_loss(clip_range=clip_range, samples=mini_batch)
+                loss = self._calc_loss(mini_batch)
 
                 # Set learning rate
                 for pg in self.optimizer.param_groups:
-                    pg['lr'] = learning_rate
+                    pg['lr'] = self.learning_rate()
                 # Zero out the previously calculated gradients
                 self.optimizer.zero_grad()
                 # Calculate gradients
```
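`self.learning_rate()` is now read once per mini-batch and written into every optimizer parameter group, so a value changed mid-run takes effect on the next `optimizer.step()`. The PyTorch mechanism in isolation (a standalone sketch; the `nn.Linear` model and `new_lr` value are placeholders):

```python
import torch
from torch import nn, optim

model = nn.Linear(4, 2)  # placeholder model
optimizer = optim.Adam(model.parameters(), lr=2.5e-4)

new_lr = 1e-4  # would come from self.learning_rate() in the trainer
for pg in optimizer.param_groups:
    pg['lr'] = new_lr  # applies on the next optimizer.step()
```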
```diff
@@ -248,7 +264,7 @@
         """#### Normalize advantage function"""
         return (adv - adv.mean()) / (adv.std() + 1e-8)
 
-    def _calc_loss(self, samples: Dict[str, torch.Tensor], clip_range: float) -> torch.Tensor:
+    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:
         """
         ### Calculate total loss
         """
```
```diff
@@ -270,7 +286,7 @@
         log_pi = pi.log_prob(samples['actions'])
 
         # Calculate policy loss
-        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, clip_range)
+        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())
 
         # Calculate Entropy Bonus
         #
```
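`self.clip_range()` supplies the $\epsilon$ of the clipped surrogate objective $\mathcal{L}^{CLIP} = \mathbb{E}\big[\min\big(r_t A_t,\; \mathrm{clip}(r_t, 1-\epsilon, 1+\epsilon) A_t\big)\big]$ with $r_t = \pi_\theta / \pi_{\theta_{old}}$. A standalone sketch of that standard PPO loss (not labml's `ClippedPPOLoss`; only the call signature is shown in this diff):

```python
import torch

def clipped_surrogate_loss(log_pi: torch.Tensor, log_pi_old: torch.Tensor,
                           advantage: torch.Tensor, clip_range: float) -> torch.Tensor:
    # Probability ratio r_t = exp(log pi - log pi_old)
    ratio = torch.exp(log_pi - log_pi_old)
    # Clip the ratio to [1 - eps, 1 + eps] and take the pessimistic bound
    clipped = ratio.clamp(1.0 - clip_range, 1.0 + clip_range)
    # Negated: the surrogate is maximized, but optimizers minimize
    return -torch.min(ratio * advantage, clipped * advantage).mean()
```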
```diff
@@ -280,12 +296,14 @@
         entropy_bonus = entropy_bonus.mean()
 
         # Calculate value function loss
-        value_loss = self.value_loss(value, samples['values'], sampled_return, clip_range)
+        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())
 
         # $\mathcal{L}^{CLIP+VF+EB} (\theta) =
         #  \mathcal{L}^{CLIP} (\theta) +
         #  c_1 \mathcal{L}^{VF} (\theta) - c_2 \mathcal{L}^{EB} (\theta)$
-        loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus
+        loss = (policy_loss
+                + self.value_loss_coef() * value_loss
+                - self.entropy_bonus_coef() * entropy_bonus)
 
         # for monitoring
         approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
```
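With the default coefficients from `main()` below ($c_1 = 0.5$, $c_2 = 0.01$), the new callable form reproduces the old hard-coded line exactly. A quick check on dummy scalar tensors (the three loss values here are made up for illustration):

```python
import torch

policy_loss = torch.tensor(0.2)    # placeholder values, for arithmetic only
value_loss = torch.tensor(0.8)
entropy_bonus = torch.tensor(1.5)

value_loss_coef, entropy_bonus_coef = 0.5, 0.01  # defaults from main()
loss = policy_loss + value_loss_coef * value_loss - entropy_bonus_coef * entropy_bonus
print(loss.item())  # 0.2 + 0.4 - 0.015 ≈ 0.585
```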
```diff
@@ -309,17 +327,11 @@
         tracker.set_queue('length', 100, True)
 
         for update in monit.loop(self.updates):
-            progress = update / self.updates
-
-            # decreasing `learning_rate` and `clip_range` $\epsilon$
-            learning_rate = 2.5e-4 * (1 - progress)
-            clip_range = 0.1 * (1 - progress)
-
             # sample with current policy
             samples = self.sample()
 
             # train the model
-            self.train(samples, learning_rate, clip_range)
+            self.train(samples)
 
             # Save tracked indicators.
             tracker.save()
```
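The removed lines decayed both values linearly to zero over the run; with dynamic hyperparameters that schedule is instead driven from outside the training loop. For reference, the old schedule recomputed standalone:

```python
# The schedule this commit removes: linear decay to zero over the run.
updates = 10000
for update in (0, 5000, 9999):
    progress = update / updates
    learning_rate = 2.5e-4 * (1 - progress)
    clip_range = 0.1 * (1 - progress)
    print(update, learning_rate, clip_range)
# 0     0.00025   0.1
# 5000  0.000125  0.05
# 9999  ~2.5e-8   ~1e-5
```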
```diff
@@ -339,8 +351,43 @@
 def main():
     # Create the experiment
     experiment.create(name='ppo')
+    # Configurations
+    configs = {
+        # number of updates
+        'updates': 10000,
+        # number of epochs to train the model with sampled data
+        'epochs': 4,
+        # number of worker processes
+        'n_workers': 8,
+        # number of steps to run on each process for a single update
+        'worker_steps': 128,
+        # number of mini batches
+        'batches': 4,
+        # Value loss coefficient
+        'value_loss_coef': FloatDynamicHyperParam(0.5),
+        # Entropy bonus coefficient
+        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),
+        # Clip range
+        'clip_range': FloatDynamicHyperParam(0.1),
+        # Learning rate
+        'learning_rate': FloatDynamicHyperParam(2.5e-4, (0, 1e-3)),
+    }
+
+    experiment.configs(configs)
+
     # Initialize the trainer
-    m = Trainer()
+    m = Trainer(
+        updates=configs['updates'],
+        epochs=configs['epochs'],
+        n_workers=configs['n_workers'],
+        worker_steps=configs['worker_steps'],
+        batches=configs['batches'],
+        value_loss_coef=configs['value_loss_coef'],
+        entropy_bonus_coef=configs['entropy_bonus_coef'],
+        clip_range=configs['clip_range'],
+        learning_rate=configs['learning_rate'],
+    )
 
     # Run and monitor the experiment
     with experiment.start():
         m.run_training_loop()
```