PaddlePaddle / hapi
Commit 368d6302: refine fit, distributedsampler
Authored on Mar 23, 2020 by LielinJiang
Parent: ba723731

Showing 4 changed files with 55 additions and 70 deletions (+55, -70):

  callbacks.py          +1   -1
  distributed.py        +12  -14
  model.py              +40  -53
  tests/test_model.py   +2   -2
callbacks.py

@@ -220,7 +220,7 @@ class ProgBarLogger(Callback):
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
-        if self.verbose:
+        if self.verbose and get_local_rank() == 0:
             self._updates(logs, 'train')
 
     def on_eval_begin(self, logs=None):
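Note: this change gates progress logging on the local rank so that, under multi-process training, only the lead worker prints. A minimal runnable sketch of the same gating, with get_local_rank stubbed from an illustrative environment variable (the variable name below is an assumption, not hapi's implementation):

import os

def get_local_rank():
    # Stand-in: hapi derives this from the distributed launch environment;
    # here we read an illustrative environment variable instead.
    return int(os.environ.get("PADDLE_LOCAL_RANK", "0"))

def log_metrics(logs, verbose=2):
    # Mirror the ProgBarLogger change: only the lead process prints,
    # so N workers do not emit N interleaved progress bars.
    if verbose and get_local_rank() == 0:
        print(" - ".join("%s: %.4f" % (k, v) for k, v in logs.items()))

log_metrics({"loss": 0.3127, "acc": 0.9102})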
distributed.py

@@ -80,20 +80,18 @@ class DistributedBatchSampler(BatchSampler):
         self.total_size = self.num_samples * self.nranks
 
     def __iter__(self):
-        _sample_iter = self.sample_iter
-        if _sample_iter is None:
-            num_samples = len(self.dataset)
-            indices = np.arange(num_samples).tolist()
-            indices += indices[:(self.total_size - len(indices))]
-            assert len(indices) == self.total_size
-            if self.shuffle:
-                np.random.RandomState(self.epoch).shuffle(indices)
-                self.epoch += 1
-            # subsample
-            indices = indices[self.local_rank * self.num_samples:
-                              (self.local_rank + 1) * self.num_samples]
-            assert len(indices) == self.num_samples
-            _sample_iter = iter(indices)
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        if self.shuffle:
+            np.random.RandomState(self.epoch).shuffle(indices)
+            self.epoch += 1
+        # subsample
+        indices = indices[self.local_rank * self.num_samples:
+                          (self.local_rank + 1) * self.num_samples]
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
 
         batch_indices = []
         for idx in _sample_iter:
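Note: two things change in __iter__. The cached self.sample_iter shortcut is dropped, so indices are rebuilt (and, with shuffle on, re-permuted via the epoch-seeded RandomState) on every epoch rather than only the first one; and the pad-to-total_size plus per-rank slice always runs. A standalone sketch of that pad-and-shard scheme (the function and names are illustrative, not the hapi API):

import numpy as np

def shard_indices(num_samples, nranks, local_rank, epoch, shuffle=True):
    per_rank = -(-num_samples // nranks)               # ceil division
    total_size = per_rank * nranks
    indices = np.arange(num_samples).tolist()
    indices += indices[:(total_size - len(indices))]   # pad by wrapping around
    assert len(indices) == total_size
    if shuffle:
        # Seeding with the epoch keeps every rank's permutation identical,
        # so the per-rank slices below are disjoint and cover the padded set.
        np.random.RandomState(epoch).shuffle(indices)
    return indices[local_rank * per_rank:(local_rank + 1) * per_rank]

for rank in range(3):
    print(rank, shard_indices(10, nranks=3, local_rank=rank, epoch=0))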
model.py

@@ -91,7 +91,7 @@ def init_context(backend):
     place = fluid.CUDAPlace(distributed.Env().dev_id) if \
         distributed.Env().nranks > 1 else fluid.CUDAPlace(0)
-    distributed.prepare_distributed_context()
+    distributed.prepare_distributed_context(place)
     backend = backend.lower()
     if backend == 'dynamic':
        fluid.enable_dygraph(place)
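Note: the place chosen here is now also handed to prepare_distributed_context, so the distributed setup and the dygraph context use the same device. A mocked sketch of that flow (paddle is stubbed out; Env and CUDAPlace below are stand-ins, not imports):

class Env:
    dev_id, nranks = 0, 1

def CUDAPlace(dev_id):
    return "CUDAPlace(%d)" % dev_id

def prepare_distributed_context(place=None):
    print("preparing distributed context on", place)

place = CUDAPlace(Env().dev_id) if Env().nranks > 1 else CUDAPlace(0)
prepare_distributed_context(place)   # was: prepare_distributed_context()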
@@ -419,22 +419,10 @@ class StaticGraphAdapter(object):
             labels = [k.forward() for k in to_list(lbls)]
             self._label_vars[mode] = labels
             outputs = to_list(self.model.forward(*inputs))
-            if mode != 'test':
-                if self.model._loss_function:
+            if mode != 'test' and self.model._loss_function:
                 losses = self.model._loss_function(outputs, labels)
-                if mode == 'train' and self.model._optimizer:
-                    self._loss_endpoint = fluid.layers.sum(losses)
-                    if self._nranks > 1:
-                        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-                        fleet.init(role)
-                        dist_strategy = DistributedStrategy()
-                        dist_strategy.mode = "collective"
-                        dist_strategy.collective_mode = "grad_allreduce"
-                        self.model._optimizer = fleet.distributed_optimizer(
-                            self.model._optimizer, strategy=dist_strategy)
-                    self.model._optimizer.minimize(self._loss_endpoint)
+            if self._nranks > 1 and mode != 'train':
+                outputs = [distributed._all_gather(o, self._nranks)
+                           for o in outputs]
             if mode != 'test':

@@ -442,8 +430,21 @@ class StaticGraphAdapter(object):
             if mode != 'test':
                 for metric in self.model._metrics:
-                    metrics.append(to_list(metric.add_metric_op(outputs, labels)))
+                    metrics.append(to_list(metric.add_metric_op(outputs, labels)))
+            if mode == 'train' and self.model._optimizer:
+                self._loss_endpoint = fluid.layers.sum(losses)
+                if self._nranks > 1:
+                    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+                    fleet.init(role)
+                    dist_strategy = DistributedStrategy()
+                    dist_strategy.mode = "collective"
+                    dist_strategy.collective_mode = "grad_allreduce"
+                    self.model._optimizer = fleet.distributed_optimizer(
+                        self.model._optimizer, strategy=dist_strategy)
+                self.model._optimizer.minimize(self._loss_endpoint)
             if mode != 'train':
                 # clone again to put it in test mode
                 prog = prog.clone(for_test=True)
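Note: these two hunks flatten the nested mode/loss checks, all-gather non-train outputs across ranks, and move the optimizer/fleet wiring below the metric ops so it runs as its own top-level branch. A plain-Python sketch of the resulting control flow (every callable is a stand-in, not the paddle/fleet API):

def build(mode, nranks, loss_function=None, optimizer=None):
    outputs = ['pred']                                  # stand-in network outputs
    losses = None
    if mode != 'test' and loss_function:                # flattened condition
        losses = loss_function(outputs)
    if nranks > 1 and mode != 'train':                  # gather eval/test outputs
        outputs = ['all_gather(%s)' % o for o in outputs]
    metrics = ['metric_op(%s)' % o for o in outputs]    # metric ops come first now
    if mode == 'train' and optimizer:                   # optimizer wiring moved here
        optimizer(losses)
    return outputs, metrics

outs, mets = build('eval', nranks=2, loss_function=lambda o: ['loss'])
print(outs, mets)   # outputs are gathered before metric ops are attached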
@@ -870,6 +871,8 @@ class Model(fluid.dygraph.Layer):
             log_freq=10,
             save_freq=1,
             verbose=2,
+            drop_last=False,
+            shuffle=True,
             num_workers=0,
             callbacks=None, ):
         """
@@ -901,43 +904,27 @@ class Model(fluid.dygraph.Layer):
         feed_list = [x.forward() for x in self._inputs + self._labels]
         if train_loader is None:
-            if distributed.get_nranks() > 1:
-                train_sampler = DistributedBatchSampler(
-                    train_dataset, batch_size=batch_size, shuffle=True)
-                train_loader = DataLoader(
-                    train_dataset,
-                    batch_sampler=train_sampler,
-                    places=self._place,
-                    feed_list=feed_list,
-                    num_workers=num_workers,
-                    return_list=True)
-            else:
-                train_loader = DataLoader(
-                    train_dataset,
-                    batch_size=batch_size,
-                    places=self._place,
-                    feed_list=feed_list,
-                    num_workers=4,
-                    return_list=True)
+            train_sampler = DistributedBatchSampler(
+                train_dataset,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                drop_last=drop_last)
+            train_loader = DataLoader(
+                train_dataset,
+                batch_sampler=train_sampler,
+                places=self._place,
+                feed_list=feed_list,
+                num_workers=num_workers,
+                return_list=True)
         if eval_loader is None and eval_dataset is not None:
-            if distributed.get_nranks() > 1:
-                eval_sampler = DistributedBatchSampler(
-                    eval_dataset, batch_size=batch_size)
-                eval_loader = DataLoader(
-                    eval_dataset,
-                    batch_sampler=eval_sampler,
-                    places=self._place,
-                    feed_list=feed_list,
-                    num_workers=num_workers,
-                    return_list=True)
-            else:
-                eval_loader = DataLoader(
-                    eval_dataset,
-                    batch_size=batch_size,
-                    places=self._place,
-                    feed_list=feed_list,
-                    num_workers=num_workers,
-                    return_list=True)
+            eval_sampler = DistributedBatchSampler(
+                eval_dataset, batch_size=batch_size)
+            eval_loader = DataLoader(
+                eval_dataset,
+                batch_sampler=eval_sampler,
+                places=self._place,
+                feed_list=feed_list,
+                num_workers=num_workers,
+                return_list=True)
         do_eval = eval_loader is not None
         self._test_dataloader = eval_loader
         metrics_name = self._metrics_name()
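Note: after this hunk, fit builds a DistributedBatchSampler on every path instead of branching on get_nranks(), forwards the new shuffle/drop_last arguments, and honors num_workers in the single-process case (previously hard-coded to 4). A hedged sketch of the unified construction, with the hapi classes injected as stand-ins:

def make_train_loader(train_dataset, batch_size, shuffle, drop_last,
                      num_workers, make_sampler, make_loader):
    # shuffle/drop_last are now caller-controlled (new fit() arguments) and
    # num_workers is honored on every path; the old single-process branch
    # hard-coded num_workers=4 and always shuffled.
    sampler = make_sampler(train_dataset, batch_size=batch_size,
                           shuffle=shuffle, drop_last=drop_last)
    return make_loader(train_dataset, batch_sampler=sampler,
                       num_workers=num_workers, return_list=True)

# Demo with stand-ins for hapi's DistributedBatchSampler / DataLoader:
loader = make_train_loader(
    list(range(10)), batch_size=4, shuffle=True, drop_last=False,
    num_workers=0,
    make_sampler=lambda ds, **kw: ('sampler', kw),
    make_loader=lambda ds, **kw: ('loader', kw),
)
print(loader)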
tests/test_model.py

@@ -141,8 +141,8 @@ class MyCrossEntropy(Loss):
 class TestModel(unittest.TestCase):
     def fit(self, dynamic, is_mlp=False):
-        init_context('dynamic' if FLAGS.dynamic else 'static')
+        init_context('dynamic' if dynamic else 'static')
         im_shape = (-1, 784)
         batch_size = 128
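Note: the test fix makes TestModel.fit honor its dynamic argument instead of the module-level FLAGS.dynamic it had been shadowing. The bug in miniature:

class FLAGS:
    dynamic = True

def fit_old(dynamic):
    return 'dynamic' if FLAGS.dynamic else 'static'   # ignores the argument

def fit_new(dynamic):
    return 'dynamic' if dynamic else 'static'         # honors the argument

print(fit_old(False), fit_new(False))   # -> dynamic static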