Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
hapi
提交
863897ce
H
hapi
项目概览
PaddlePaddle
/
hapi
通知
11
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
4
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
H
hapi
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
4
Issue
4
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
863897ce
编写于
3月 31, 2020
作者:
G
guosheng
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'master' of
https://github.com/PaddlePaddle/hapi
into fix-data-train
上级
dd446685
4d22fee0
变更
5
展开全部
隐藏空白更改
内联
并排
Showing
5 changed file
with
480 addition
and
306 deletion
+480
-306
callbacks.py
callbacks.py
+14
-8
distributed.py
distributed.py
+58
-105
mnist.py
mnist.py
+12
-3
model.py
model.py
+360
-187
tests/test_model.py
tests/test_model.py
+36
-3
未找到文件。
callbacks.py
浏览文件 @
863897ce
...
...
@@ -16,7 +16,7 @@ import six
import
copy
from
progressbar
import
ProgressBar
from
distributed
import
get_local_rank
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
def
config_callbacks
(
callbacks
=
None
,
...
...
@@ -195,7 +195,7 @@ class ProgBarLogger(Callback):
self
.
steps
=
self
.
params
[
'steps'
]
self
.
epoch
=
epoch
self
.
train_step
=
0
if
self
.
verbose
and
self
.
epochs
and
get_local_rank
()
==
0
:
if
self
.
verbose
and
self
.
epochs
and
ParallelEnv
().
local_rank
==
0
:
print
(
'Epoch %d/%d'
%
(
epoch
+
1
,
self
.
epochs
))
self
.
train_progbar
=
ProgressBar
(
num
=
self
.
steps
,
verbose
=
self
.
verbose
)
...
...
@@ -213,8 +213,8 @@ class ProgBarLogger(Callback):
logs
=
logs
or
{}
self
.
train_step
+=
1
if
self
.
train_step
%
self
.
log_freq
==
0
and
self
.
verbose
and
get_local_rank
(
)
==
0
:
if
self
.
train_step
%
self
.
log_freq
==
0
and
self
.
verbose
and
ParallelEnv
(
)
.
local_rank
==
0
:
# if steps is not None, last step will update in on_epoch_end
if
self
.
steps
and
self
.
train_step
<
self
.
steps
:
self
.
_updates
(
logs
,
'train'
)
...
...
@@ -223,7 +223,7 @@ class ProgBarLogger(Callback):
def
on_epoch_end
(
self
,
epoch
,
logs
=
None
):
logs
=
logs
or
{}
if
self
.
verbose
and
get_local_rank
()
==
0
:
if
self
.
verbose
and
ParallelEnv
().
local_rank
==
0
:
self
.
_updates
(
logs
,
'train'
)
def
on_eval_begin
(
self
,
logs
=
None
):
...
...
@@ -233,7 +233,7 @@ class ProgBarLogger(Callback):
self
.
evaled_samples
=
0
self
.
eval_progbar
=
ProgressBar
(
num
=
self
.
eval_steps
,
verbose
=
self
.
verbose
)
if
get_local_rank
()
==
0
:
if
ParallelEnv
().
local_rank
==
0
:
print
(
'Eval begin...'
)
def
on_eval_batch_end
(
self
,
step
,
logs
=
None
):
...
...
@@ -242,9 +242,15 @@ class ProgBarLogger(Callback):
samples
=
logs
.
get
(
'batch_size'
,
1
)
self
.
evaled_samples
+=
samples
if
self
.
eval_step
%
self
.
log_freq
==
0
and
self
.
verbose
and
ParallelEnv
(
).
local_rank
==
0
:
# if steps is not None, last step will update in on_epoch_end
if
self
.
eval_steps
and
self
.
eval_step
<
self
.
eval_steps
:
self
.
_updates
(
logs
,
'eval'
)
def
on_eval_end
(
self
,
logs
=
None
):
logs
=
logs
or
{}
if
self
.
verbose
and
get_local_rank
()
==
0
:
if
self
.
verbose
and
ParallelEnv
().
local_rank
==
0
:
self
.
_updates
(
logs
,
'eval'
)
print
(
'Eval samples: %d'
%
(
self
.
evaled_samples
))
...
...
@@ -258,7 +264,7 @@ class ModelCheckpoint(Callback):
self
.
epoch
=
epoch
def
_is_save
(
self
):
return
self
.
model
and
self
.
save_dir
and
get_local_rank
()
==
0
return
self
.
model
and
self
.
save_dir
and
ParallelEnv
().
local_rank
==
0
def
on_epoch_end
(
self
,
epoch
,
logs
=
None
):
if
self
.
_is_save
()
and
self
.
epoch
%
self
.
save_freq
==
0
:
...
...
distributed.py
浏览文件 @
863897ce
...
...
@@ -13,30 +13,20 @@
# limitations under the License.
import
os
import
sys
import
six
import
time
import
math
import
socket
import
contextlib
from
contextlib
import
closing
from
six
import
string_types
import
numpy
as
np
from
collections
import
OrderedDict
from
paddle
import
fluid
import
paddle.fluid.unique_name
as
nameGen
from
paddle.fluid
import
core
from
paddle
.fluid
import
framework
from
paddle
import
fluid
from
paddle.fluid.layers
import
collective
from
paddle.fluid.dygraph
import
to_variable
,
no_grad
,
layers
from
paddle.fluid.framework
import
Variable
from
paddle.fluid.executor
import
global_scope
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
,
ParallelStrategy
from
paddle.fluid.io
import
BatchSampler
from
paddle.fluid.dygraph.parallel
import
Env
,
DataParallel
,
ParallelStrategy
from
paddle.fluid.layers.collective
import
_c_allreduce
,
_c_allgather
,
_c_broadcast
,
\
_c_sync_comm_stream
,
_c_sync_calc_stream
from
paddle.fluid.io
import
BatchSampler
,
DataLoader
_parallel_context_initialized
=
False
__parallel_context_init
=
False
class
DistributedBatchSampler
(
BatchSampler
):
"""Sampler that restricts data loading to a subset of the dataset.
...
...
@@ -71,11 +61,13 @@ class DistributedBatchSampler(BatchSampler):
self
.
shuffle
=
shuffle
assert
isinstance
(
drop_last
,
bool
),
\
"drop_last should be a boolean number"
self
.
drop_last
=
drop_last
self
.
nranks
=
get_nranks
()
self
.
local_rank
=
get_local_rank
()
self
.
nranks
=
ParallelEnv
().
nranks
self
.
local_rank
=
ParallelEnv
().
local_rank
self
.
epoch
=
0
self
.
num_samples
=
int
(
math
.
ceil
(
len
(
self
.
dataset
)
*
1.0
/
self
.
nranks
))
self
.
num_samples
=
int
(
math
.
ceil
(
len
(
self
.
dataset
)
*
1.0
/
self
.
nranks
))
self
.
total_size
=
self
.
num_samples
*
self
.
nranks
def
__iter__
(
self
):
...
...
@@ -86,9 +78,28 @@ class DistributedBatchSampler(BatchSampler):
if
self
.
shuffle
:
np
.
random
.
RandomState
(
self
.
epoch
).
shuffle
(
indices
)
self
.
epoch
+=
1
# subsample
indices
=
indices
[
self
.
local_rank
*
self
.
num_samples
:
(
self
.
local_rank
+
1
)
*
self
.
num_samples
]
def
_get_indices_by_batch_size
(
indices
):
subsampled_indices
=
[]
last_batch_size
=
self
.
total_size
%
(
self
.
batch_size
*
self
.
nranks
)
assert
last_batch_size
%
self
.
nranks
==
0
last_local_batch_size
=
last_batch_size
//
self
.
nranks
for
i
in
range
(
self
.
local_rank
*
self
.
batch_size
,
len
(
indices
)
-
last_batch_size
,
self
.
batch_size
*
self
.
nranks
):
subsampled_indices
.
extend
(
indices
[
i
:
i
+
self
.
batch_size
])
indices
=
indices
[
len
(
indices
)
-
last_batch_size
:]
subsampled_indices
.
extend
(
indices
[
self
.
local_rank
*
last_local_batch_size
:(
self
.
local_rank
+
1
)
*
last_local_batch_size
])
return
subsampled_indices
if
self
.
nranks
>
1
:
indices
=
_get_indices_by_batch_size
(
indices
)
assert
len
(
indices
)
==
self
.
num_samples
_sample_iter
=
iter
(
indices
)
...
...
@@ -106,46 +117,37 @@ class DistributedBatchSampler(BatchSampler):
num_samples
+=
int
(
not
self
.
drop_last
)
*
(
self
.
batch_size
-
1
)
return
num_samples
//
self
.
batch_size
def
_all_gather
(
x
,
nranks
,
ring_id
=
0
,
use_calc_stream
=
True
):
return
_c_allgather
(
x
,
nranks
,
ring_id
=
ring_id
,
use_calc_stream
=
use_calc_stream
)
def
get_local_rank
():
return
Env
().
local_rank
def
set_epoch
(
self
,
epoch
):
self
.
epoch
=
epoch
def
get_nranks
():
return
Env
().
nranks
def
_all_gather
(
x
,
nranks
,
ring_id
=
0
,
use_calc_stream
=
True
):
return
collective
.
_c_allgather
(
x
,
nranks
,
ring_id
=
ring_id
,
use_calc_stream
=
use_calc_stream
)
def
wait_server_ready
(
endpoints
):
assert
not
isinstance
(
endpoints
,
string_types
)
assert
not
isinstance
(
endpoints
,
s
ix
.
s
tring_types
)
while
True
:
all_ok
=
True
not_ready_endpoints
=
[]
for
ep
in
endpoints
:
ip_port
=
ep
.
split
(
":"
)
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
))
as
sock
:
with
contextlib
.
closing
(
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
))
as
sock
:
sock
.
settimeout
(
2
)
result
=
sock
.
connect_ex
((
ip_port
[
0
],
int
(
ip_port
[
1
])))
if
result
!=
0
:
all_ok
=
False
not_ready_endpoints
.
append
(
ep
)
if
not
all_ok
:
sys
.
stderr
.
write
(
"server not ready, wait 3 sec to retry...
\n
"
)
sys
.
stderr
.
write
(
"not ready endpoints:"
+
str
(
not_ready_endpoints
)
+
"
\n
"
)
sys
.
stderr
.
flush
()
time
.
sleep
(
3
)
else
:
break
def
init_communicator
(
program
,
rank
,
nranks
,
wait_port
,
current_endpoint
,
endpoints
):
def
init_communicator
(
program
,
rank
,
nranks
,
wait_port
,
current_endpoint
,
endpoints
):
if
nranks
<
2
:
return
other_endpoints
=
endpoints
[:]
...
...
@@ -154,9 +156,9 @@ def init_communicator(program, rank, nranks, wait_port,
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
nameGen
.
generate
(
'nccl_id'
),
name
=
fluid
.
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
type
=
fluid
.
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
...
...
@@ -181,25 +183,28 @@ def init_communicator(program, rank, nranks, wait_port,
def
prepare_distributed_context
(
place
=
None
):
if
place
is
None
:
place
=
fluid
.
CUDAPlace
(
Env
().
dev_id
)
if
Env
().
nranks
>
1
\
place
=
fluid
.
CUDAPlace
(
ParallelEnv
().
dev_id
)
if
Parallel
Env
().
nranks
>
1
\
else
fluid
.
CUDAPlace
(
0
)
strategy
=
ParallelStrategy
()
strategy
.
nranks
=
Env
().
nranks
strategy
.
local_rank
=
Env
().
local_rank
strategy
.
trainer_endpoints
=
Env
().
trainer_endpoints
strategy
.
current_endpoint
=
Env
().
current_endpoint
strategy
.
nranks
=
Parallel
Env
().
nranks
strategy
.
local_rank
=
Parallel
Env
().
local_rank
strategy
.
trainer_endpoints
=
Parallel
Env
().
trainer_endpoints
strategy
.
current_endpoint
=
Parallel
Env
().
current_endpoint
if
strategy
.
nranks
<
2
:
return
global
__parallel_context_init
global
_parallel_context_initialized
if
not
_parallel_context_initialized
and
isinstance
(
place
,
fluid
.
CUDAPlace
):
if
not
__parallel_context_init
and
isinstance
(
place
,
core
.
CUDAPlace
):
def
_init_context
():
communicator_prog
=
framework
.
Program
()
init_communicator
(
communicator_prog
,
strategy
.
local_rank
,
strategy
.
nranks
,
True
,
strategy
.
current_endpoint
,
strategy
.
trainer_endpoints
)
communicator_prog
=
fluid
.
Program
()
init_communicator
(
communicator_prog
,
strategy
.
local_rank
,
strategy
.
nranks
,
True
,
strategy
.
current_endpoint
,
strategy
.
trainer_endpoints
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
communicator_prog
)
...
...
@@ -213,57 +218,5 @@ def prepare_distributed_context(place=None):
else
:
assert
(
"Only support CUDAPlace for now."
)
_
_parallel_context_init
=
True
_
parallel_context_initialized
=
True
return
strategy
class
DistributedDataParallel
(
DataParallel
):
def
__init__
(
self
,
layers
,
strategy
=
None
):
if
strategy
is
None
:
strategy
=
ParallelStrategy
()
strategy
.
nranks
=
Env
().
nranks
strategy
.
local_rank
=
Env
().
local_rank
strategy
.
trainer_endpoints
=
Env
().
trainer_endpoints
strategy
.
current_endpoint
=
Env
().
current_endpoint
super
(
DistributedDataParallel
,
self
).
__init__
(
layers
,
strategy
)
@
no_grad
def
apply_collective_grads
(
self
):
"""
AllReduce the Parameters' gradient.
"""
if
not
self
.
_is_data_parallel_mode
():
return
grad_var_set
=
set
()
grad_vars
=
[]
for
param
in
self
.
_layers
.
parameters
():
# NOTE(zcd): The grad_ivar maybe no generated.
if
param
.
trainable
and
param
.
_grad_ivar
():
g_var
=
param
.
_grad_ivar
()
grad_vars
.
append
(
g_var
)
assert
g_var
not
in
grad_var_set
grad_var_set
.
add
(
g_var
)
mega_bytes
=
128
*
1024
*
1024
group_idx
=
0
memory_counter
=
0
grad_var_groups
=
OrderedDict
()
dtype
=
grad_vars
[
0
].
dtype
for
g_var
in
grad_vars
:
# Note: the dtype of the same group should be the same.
bytes
=
np
.
prod
(
g_var
.
shape
)
*
core
.
size_of_dtype
(
g_var
.
dtype
)
if
memory_counter
<
mega_bytes
and
dtype
==
g_var
.
dtype
:
memory_counter
+=
bytes
else
:
memory_counter
=
bytes
group_idx
+=
1
grad_var_groups
.
setdefault
(
group_idx
,
[]).
append
(
g_var
)
coalesced_grads_and_vars
=
self
.
_coalesce_tensors
(
grad_var_groups
)
for
coalesced_grad
,
_
,
_
in
coalesced_grads_and_vars
:
collective
.
_c_allreduce
(
coalesced_grad
,
coalesced_grad
,
use_calc_stream
=
True
)
self
.
_split_tensors
(
coalesced_grads_and_vars
)
mnist.py
浏览文件 @
863897ce
...
...
@@ -26,7 +26,7 @@ from paddle.fluid.optimizer import Momentum
from
paddle.fluid.dygraph.nn
import
Conv2D
,
Pool2D
,
Linear
from
paddle.fluid.io
import
MNIST
as
MnistDataset
from
model
import
Model
,
CrossEntropy
,
Input
,
init_context
from
model
import
Model
,
CrossEntropy
,
Input
,
set_device
from
metrics
import
Accuracy
...
...
@@ -106,7 +106,8 @@ class MNIST(Model):
def
main
():
init_context
(
'dynamic'
if
FLAGS
.
dynamic
else
'static'
)
device
=
set_device
(
FLAGS
.
device
)
fluid
.
enable_dygraph
(
device
)
if
FLAGS
.
dynamic
else
None
train_dataset
=
MnistDataset
(
mode
=
'train'
)
val_dataset
=
MnistDataset
(
mode
=
'test'
)
...
...
@@ -118,7 +119,13 @@ def main():
optim
=
Momentum
(
learning_rate
=
FLAGS
.
lr
,
momentum
=
.
9
,
parameter_list
=
model
.
parameters
())
model
.
prepare
(
optim
,
CrossEntropy
(),
Accuracy
(
topk
=
(
1
,
2
)),
inputs
,
labels
)
model
.
prepare
(
optim
,
CrossEntropy
(),
Accuracy
(
topk
=
(
1
,
2
)),
inputs
,
labels
,
device
=
FLAGS
.
device
)
if
FLAGS
.
resume
is
not
None
:
model
.
load
(
FLAGS
.
resume
)
...
...
@@ -131,6 +138,8 @@ def main():
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
(
"CNN training on MNIST"
)
parser
.
add_argument
(
"--device"
,
type
=
str
,
default
=
'gpu'
,
help
=
"device to use, gpu or cpu"
)
parser
.
add_argument
(
"-d"
,
"--dynamic"
,
action
=
'store_true'
,
help
=
"enable dygraph mode"
)
parser
.
add_argument
(
...
...
model.py
浏览文件 @
863897ce
此差异已折叠。
点击以展开。
tests/test_model.py
浏览文件 @
863897ce
...
...
@@ -28,7 +28,7 @@ import contextlib
import
paddle
from
paddle
import
fluid
from
paddle.fluid.dygraph.nn
import
Conv2D
,
Pool2D
,
Linear
from
model
import
Model
,
CrossEntropy
,
Input
,
Loss
,
init_context
from
model
import
Model
,
CrossEntropy
,
Input
,
Loss
,
set_device
from
metrics
import
Accuracy
from
callbacks
import
ProgBarLogger
from
paddle.fluid.io
import
BatchSampler
,
DataLoader
...
...
@@ -139,9 +139,30 @@ class MyCrossEntropy(Loss):
return
[
loss1
,
loss2
]
class
TestMnistDataset
(
MnistDataset
):
def
__init__
(
self
):
super
(
TestMnistDataset
,
self
).
__init__
(
mode
=
'test'
)
def
__getitem__
(
self
,
idx
):
return
self
.
images
[
idx
],
def
__len__
(
self
):
return
len
(
self
.
images
)
def
get_predict_accuracy
(
pred
,
gt
):
pred
=
np
.
argmax
(
pred
,
-
1
)
gt
=
np
.
array
(
gt
)
correct
=
pred
[:,
np
.
newaxis
]
==
gt
return
np
.
sum
(
correct
)
/
correct
.
shape
[
0
]
class
TestModel
(
unittest
.
TestCase
):
def
fit
(
self
,
dynamic
,
is_mlp
=
False
):
init_context
(
'dynamic'
if
dynamic
else
'static'
)
device
=
set_device
(
'gpu'
)
fluid
.
enable_dygraph
(
device
)
if
dynamic
else
None
im_shape
=
(
-
1
,
784
)
batch_size
=
128
...
...
@@ -151,19 +172,31 @@ class TestModel(unittest.TestCase):
train_dataset
=
MnistDataset
(
mode
=
'train'
)
val_dataset
=
MnistDataset
(
mode
=
'test'
)
test_dataset
=
TestMnistDataset
()
model
=
MNIST
()
if
not
is_mlp
else
MLP
()
optim
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
0.01
,
momentum
=
.
9
,
parameter_list
=
model
.
parameters
())
loss
=
CrossEntropy
()
if
not
is_mlp
else
MyCrossEntropy
()
model
.
prepare
(
optim
,
loss
,
Accuracy
(),
inputs
,
labels
)
model
.
prepare
(
optim
,
loss
,
Accuracy
(),
inputs
,
labels
,
device
=
device
)
cbk
=
ProgBarLogger
(
50
)
model
.
fit
(
train_dataset
,
val_dataset
,
epochs
=
2
,
batch_size
=
batch_size
,
callbacks
=
cbk
)
eval_result
=
model
.
evaluate
(
val_dataset
,
batch_size
=
batch_size
)
output
=
model
.
predict
(
test_dataset
,
batch_size
=
batch_size
)
np
.
testing
.
assert_equal
(
output
[
0
].
shape
[
0
],
len
(
test_dataset
))
acc
=
get_predict_accuracy
(
output
[
0
],
val_dataset
.
labels
)
np
.
testing
.
assert_allclose
(
acc
,
eval_result
[
'acc'
])
def
test_fit_static
(
self
):
self
.
fit
(
False
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录