Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
ERNIE
提交
67edb3e1
E
ERNIE
项目概览
PaddlePaddle
/
ERNIE
大约 1 年 前同步成功
通知
109
Star
5997
Fork
1270
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
29
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
E
ERNIE
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
29
Issue
29
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
67edb3e1
编写于
7月 01, 2019
作者:
Y
Yibing Liu
提交者:
GitHub
7月 01, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #174 from chengduoZH/benchmark
[NOT MERGE]Support multi process training for BERT
上级
f2683894
d45d17fb
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
95 addition
and
12 deletion
+95
-12
BERT/dist_utils.py
BERT/dist_utils.py
+45
-0
BERT/reader/cls.py
BERT/reader/cls.py
+4
-1
BERT/run_classifier.py
BERT/run_classifier.py
+46
-11
未找到文件。
BERT/dist_utils.py
0 → 100644
浏览文件 @
67edb3e1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
paddle.fluid
as
fluid
def
nccl2_prepare
(
trainer_id
,
startup_prog
,
main_prog
):
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
mode
=
"nccl2"
t
=
fluid
.
DistributeTranspiler
(
config
=
config
)
t
.
transpile
(
trainer_id
,
trainers
=
os
.
environ
.
get
(
'PADDLE_TRAINER_ENDPOINTS'
),
current_endpoint
=
os
.
environ
.
get
(
'PADDLE_CURRENT_ENDPOINT'
),
startup_program
=
startup_prog
,
program
=
main_prog
)
def
prepare_for_multi_process
(
exe
,
build_strategy
,
train_prog
):
# prepare for multi-process
trainer_id
=
int
(
os
.
environ
.
get
(
'PADDLE_TRAINER_ID'
,
0
))
num_trainers
=
int
(
os
.
environ
.
get
(
'PADDLE_TRAINERS_NUM'
,
1
))
if
num_trainers
<
2
:
return
print
(
"PADDLE_TRAINERS_NUM"
,
num_trainers
)
print
(
"PADDLE_TRAINER_ID"
,
trainer_id
)
build_strategy
.
num_trainers
=
num_trainers
build_strategy
.
trainer_id
=
trainer_id
# NOTE(zcd): use multi processes to train the model,
# and each process use one GPU card.
startup_prog
=
fluid
.
Program
()
nccl2_prepare
(
trainer_id
,
startup_prog
,
train_prog
)
# the startup_prog are run two times, but it doesn't matter.
exe
.
run
(
startup_prog
)
BERT/reader/cls.py
浏览文件 @
67edb3e1
...
...
@@ -123,7 +123,8 @@ class DataProcessor(object):
phase
=
'train'
,
epoch
=
1
,
dev_count
=
1
,
shuffle
=
True
):
shuffle
=
True
,
shuffle_seed
=
None
):
"""
Generate data for train, dev or test.
...
...
@@ -149,6 +150,8 @@ class DataProcessor(object):
def
instance_reader
():
for
epoch_index
in
range
(
epoch
):
if
shuffle
:
if
shuffle_seed
is
not
None
:
np
.
random
.
seed
(
shuffle_seed
)
np
.
random
.
shuffle
(
examples
)
if
phase
==
'train'
:
self
.
current_train_epoch
=
epoch_index
...
...
BERT/run_classifier.py
浏览文件 @
67edb3e1
...
...
@@ -32,6 +32,9 @@ from model.classifier import create_model
from
optimization
import
optimization
from
utils.args
import
ArgumentGroup
,
print_arguments
from
utils.init
import
init_pretraining_params
,
init_checkpoint
import
dist_utils
num_trainers
=
int
(
os
.
environ
.
get
(
'PADDLE_TRAINERS_NUM'
,
1
))
# yapf: disable
parser
=
argparse
.
ArgumentParser
(
__doc__
)
...
...
@@ -76,6 +79,7 @@ data_g.add_arg("random_seed", int, 0, "Random seed.")
run_type_g
=
ArgumentGroup
(
parser
,
"run_type"
,
"running type options."
)
run_type_g
.
add_arg
(
"use_cuda"
,
bool
,
True
,
"If set, use GPU for training."
)
run_type_g
.
add_arg
(
"use_fast_executor"
,
bool
,
False
,
"If set, use fast parallel executor (in experiment)."
)
run_type_g
.
add_arg
(
"shuffle"
,
bool
,
True
,
""
)
run_type_g
.
add_arg
(
"num_iteration_per_drop_scope"
,
int
,
1
,
"Ihe iteration intervals to clean up temporary variables."
)
run_type_g
.
add_arg
(
"task_name"
,
str
,
None
,
"The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}."
)
...
...
@@ -106,6 +110,15 @@ def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
(
eval_phase
,
np
.
sum
(
total_cost
)
/
np
.
sum
(
total_num_seqs
),
np
.
sum
(
total_acc
)
/
np
.
sum
(
total_num_seqs
),
time_end
-
time_begin
))
def
get_device_num
():
# NOTE(zcd): for multi-processe training, each process use one GPU card.
if
num_trainers
>
1
:
return
1
visible_device
=
os
.
environ
.
get
(
'CUDA_VISIBLE_DEVICES'
,
None
)
if
visible_device
:
device_num
=
len
(
visible_device
.
split
(
','
))
else
:
device_num
=
subprocess
.
check_output
([
'nvidia-smi'
,
'-L'
]).
decode
().
count
(
'
\n
'
)
return
device_num
def
main
(
args
):
bert_config
=
BertConfig
(
args
.
bert_config_path
)
...
...
@@ -113,7 +126,7 @@ def main(args):
if
args
.
use_cuda
:
place
=
fluid
.
CUDAPlace
(
int
(
os
.
getenv
(
'FLAGS_selected_gpus'
,
'0'
)))
dev_count
=
fluid
.
core
.
get_cuda_device_count
()
dev_count
=
get_device_num
()
else
:
place
=
fluid
.
CPUPlace
()
dev_count
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
...
...
@@ -139,17 +152,24 @@ def main(args):
raise
ValueError
(
"For args `do_train`, `do_val` and `do_test`, at "
"least one of them must be True."
)
train_program
=
fluid
.
Program
()
startup_prog
=
fluid
.
Program
()
if
args
.
random_seed
is
not
None
:
startup_prog
.
random_seed
=
args
.
random_seed
train_program
.
random_seed
=
args
.
random_seed
if
args
.
do_train
:
# NOTE: If num_trainers > 1, the shuffle_seed must be set, because
# the order of batch data generated by reader
# must be the same in the respective processes.
shuffle_seed
=
1
if
num_trainers
>
1
else
None
train_data_generator
=
processor
.
data_generator
(
batch_size
=
args
.
batch_size
,
phase
=
'train'
,
epoch
=
args
.
epoch
,
dev_count
=
dev_count
,
shuffle
=
True
)
shuffle
=
args
.
shuffle
,
shuffle_seed
=
shuffle_seed
)
num_train_examples
=
processor
.
get_num_examples
(
phase
=
'train'
)
...
...
@@ -165,8 +185,6 @@ def main(args):
print
(
"Max train steps: %d"
%
max_train_steps
)
print
(
"Num warmup steps: %d"
%
warmup_steps
)
train_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
train_program
,
startup_prog
):
with
fluid
.
unique_name
.
guard
():
train_pyreader
,
loss
,
probs
,
accuracy
,
num_seqs
=
create_model
(
...
...
@@ -249,13 +267,21 @@ def main(args):
exec_strategy
.
use_experimental_executor
=
args
.
use_fast_executor
exec_strategy
.
num_threads
=
dev_count
exec_strategy
.
num_iteration_per_drop_scope
=
args
.
num_iteration_per_drop_scope
build_strategy
=
fluid
.
BuildStrategy
()
if
args
.
use_cuda
and
num_trainers
>
1
:
assert
shuffle_seed
is
not
None
dist_utils
.
prepare_for_multi_process
(
exe
,
build_strategy
,
train_program
)
train_data_generator
=
fluid
.
contrib
.
reader
.
distributed_batch_reader
(
train_data_generator
)
train_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
args
.
use_cuda
,
loss_name
=
loss
.
name
,
exec_strategy
=
exec_strategy
,
build_strategy
=
build_strategy
,
main_program
=
train_program
)
train_pyreader
.
decorate_tensor_provider
(
train_data_generator
)
else
:
train_exe
=
None
...
...
@@ -271,9 +297,10 @@ def main(args):
steps
=
0
total_cost
,
total_acc
,
total_num_seqs
=
[],
[],
[]
time_begin
=
time
.
time
()
throughput
=
[]
while
True
:
try
:
steps
+=
1
#
steps += 1
if
steps
%
args
.
skip_steps
==
0
:
if
warmup_steps
<=
0
:
fetch_list
=
[
loss
.
name
,
accuracy
.
name
,
num_seqs
.
name
]
...
...
@@ -309,21 +336,29 @@ def main(args):
)
time_end
=
time
.
time
()
used_time
=
time_end
-
time_begin
print
(
"epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
"ave acc: %f, speed: %f steps/s"
%
(
current_epoch
,
current_example
,
num_train_examples
,
log_record
=
"epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}"
.
format
(
current_epoch
,
current_example
,
num_train_examples
,
steps
,
np
.
sum
(
total_cost
)
/
np
.
sum
(
total_num_seqs
),
np
.
sum
(
total_acc
)
/
np
.
sum
(
total_num_seqs
),
args
.
skip_steps
/
used_time
))
np
.
sum
(
total_acc
)
/
np
.
sum
(
total_num_seqs
))
if
steps
>
0
:
throughput
.
append
(
args
.
skip_steps
/
used_time
)
log_record
=
log_record
+
", speed: %f steps/s"
%
(
args
.
skip_steps
/
used_time
)
print
(
log_record
)
else
:
print
(
log_record
)
total_cost
,
total_acc
,
total_num_seqs
=
[],
[],
[]
time_begin
=
time
.
time
()
steps
+=
1
if
steps
%
args
.
save_steps
==
0
:
save_path
=
os
.
path
.
join
(
args
.
checkpoints
,
"step_"
+
str
(
steps
))
fluid
.
io
.
save_persistables
(
exe
,
save_path
,
train_program
)
if
steps
%
args
.
validation_steps
==
0
:
print
(
"Average throughtput: %s"
%
(
np
.
average
(
throughput
)))
throughput
=
[]
# evaluate dev set
if
args
.
do_val
:
test_pyreader
.
decorate_tensor_provider
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录