PaddlePaddle / PaddleSeg
Commit 16c7f9b7
Authored Sep 18, 2020 by chenguowei01

update optimizer and collect loss

Parent: 334a4b30
Showing 1 changed file with 14 additions and 4 deletions.

dygraph/paddleseg/core/train.py  +14  -4
dygraph/paddleseg/core/train.py @ 16c7f9b7
@@ -16,6 +16,7 @@ import os
 import paddle
 from paddle.distributed import ParallelEnv
+from paddle.distributed import init_parallel_env
 from paddle.io import DistributedBatchSampler
 from paddle.io import DataLoader
 import paddle.nn.functional as F
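
The new import, init_parallel_env, is the dygraph entry point for setting up multi-card training, while ParallelEnv (already imported) reports how many trainer processes are running. A minimal sketch of how the two are typically combined, assuming the Paddle 2.0 dygraph API of this period (only the imports come from the diff; the rest is illustrative):

import paddle
from paddle.distributed import ParallelEnv
from paddle.distributed import init_parallel_env

# Number of trainer processes, e.g. when started via `python -m paddle.distributed.launch`.
nranks = ParallelEnv().nranks

if nranks > 1:
    # Set up communication between trainers before building the DataParallel model.
    init_parallel_env()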
@@ -77,11 +78,14 @@ def train(model,
         os.makedirs(save_dir)

     if nranks > 1:
+        # Initialize parallel training environment.
+        init_parallel_env()
         strategy = paddle.distributed.prepare_context()
         ddp_model = paddle.DataParallel(model, strategy)

     batch_sampler = DistributedBatchSampler(
         train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
     loader = DataLoader(
         train_dataset,
         batch_sampler=batch_sampler,
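
Putting the hunk above into a self-contained form: initialize the parallel environment, wrap the model, and shard the dataset per rank. This is a sketch under the Paddle 2.0-rc API used at the time; model, train_dataset, and batch_size are placeholders, and later Paddle releases no longer need the prepare_context()/strategy pair for paddle.DataParallel.

import paddle
from paddle.distributed import ParallelEnv, init_parallel_env
from paddle.io import DataLoader, DistributedBatchSampler

def setup_data_parallel(model, train_dataset, batch_size):
    # Wrap the model and build a per-rank data loader for multi-GPU training.
    nranks = ParallelEnv().nranks
    ddp_model = None
    if nranks > 1:
        init_parallel_env()
        strategy = paddle.distributed.prepare_context()
        ddp_model = paddle.DataParallel(model, strategy)

    # Each rank draws a disjoint shard of the dataset every epoch.
    batch_sampler = DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    loader = DataLoader(
        train_dataset, batch_sampler=batch_sampler, return_list=True)
    return ddp_model, loader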
@@ -115,7 +119,6 @@ def train(model,
             if nranks > 1:
                 logits = ddp_model(images)
                 loss = loss_computation(logits, labels, losses)
-                # loss = ddp_model(images, labels)
                 # apply_collective_grads sum grads over multiple gpus.
                 loss = ddp_model.scale_loss(loss)
                 loss.backward()
@@ -125,8 +128,15 @@ def train(model,
                 loss = loss_computation(logits, labels, losses)
                 # loss = model(images, labels)
                 loss.backward()
-            optimizer.minimize(loss)
+            # optimizer.minimize(loss)
+            optimizer.step()
+            if isinstance(optimizer._learning_rate,
+                          paddle.optimizer._LRScheduler):
+                optimizer._learning_rate.step()
             model.clear_gradients()
+            # Sum loss over all ranks
+            if nranks > 1:
+                paddle.distributed.all_reduce(loss)
             avg_loss += loss.numpy()[0]
             lr = optimizer.get_lr()
             train_batch_cost += timer.elapsed_time()
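
The replacement of optimizer.minimize(loss) is the core of this commit: Paddle 2.0 dygraph optimizers are advanced with optimizer.step(), the learning-rate scheduler (if one is attached) is stepped separately, and the per-rank loss is then summed over all ranks with all_reduce so that logging reflects every GPU rather than the local loss times nranks. A standalone sketch of that sequence, assuming the pre-release names used in the diff (_learning_rate and paddle.optimizer._LRScheduler; later releases expose the scheduler base class as paddle.optimizer.lr.LRScheduler):

import paddle

def update_parameters(model, optimizer, loss, nranks):
    # Backward pass, then the Paddle 2.0 dygraph update sequence.
    loss.backward()
    optimizer.step()
    # Advance the LR scheduler only when the optimizer actually holds one.
    if isinstance(optimizer._learning_rate, paddle.optimizer._LRScheduler):
        optimizer._learning_rate.step()
    model.clear_gradients()

    # all_reduce defaults to a sum, so `loss` now aggregates all ranks.
    if nranks > 1:
        paddle.distributed.all_reduce(loss)
    return loss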
@@ -141,10 +151,10 @@ def train(model,
                 logger.info(
                     "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                     .format((iter - 1) // iters_per_epoch + 1, iter, iters,
-                            avg_loss * nranks, lr, avg_train_batch_cost,
+                            avg_loss, lr, avg_train_batch_cost,
                             avg_train_reader_cost, eta))
                 if use_vdl:
-                    log_writer.add_scalar('Train/loss', avg_loss * nranks, iter)
+                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                     log_writer.add_scalar('Train/lr', lr, iter)
                     log_writer.add_scalar('Train/batch_cost',
                                           avg_train_batch_cost, iter)
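
Because the loss is now collected across ranks by all_reduce, the log line and the VisualDL scalar drop the old avg_loss * nranks approximation and report avg_loss directly. For reference, a minimal sketch of writing these scalars with VisualDL's LogWriter (the log directory and values are illustrative stand-ins for the loop variables):

from visualdl import LogWriter

# Stand-ins for the training-loop variables avg_loss, lr and iter.
step, avg_loss, lr = 100, 0.35, 0.01

with LogWriter(logdir='./vdl_log') as log_writer:
    log_writer.add_scalar('Train/loss', avg_loss, step)
    log_writer.add_scalar('Train/lr', lr, step)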