PaddlePaddle/Paddle, commit 6c16858f

Author: sandyhouse
Date: Sep 16, 2020
Parent: a6344af2

    update, test=develop

Showing 9 changed files with 276 additions and 110 deletions (+276, -110)
paddle/fluid/framework/device_worker.h                                  +2   -2
paddle/fluid/framework/pipeline_trainer.cc                              +27  -25
paddle/fluid/framework/section_worker.cc                                +22  -16
paddle/fluid/framework/trainer_desc.proto                               +1   -1
paddle/fluid/operators/collective/c_recv_op.cc                          +31  -3
paddle/fluid/operators/collective/c_recv_op.cu.cc                       +11  -3
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py   +89  -32
python/paddle/fluid/device_worker.py                                    +38  -19
python/paddle/fluid/optimizer.py                                        +55  -9
paddle/fluid/framework/device_worker.h

@@ -441,14 +441,14 @@ class SectionWorker : public DeviceWorker {
   void SetSkipVars(const std::vector<std::string>& skip_vars) {
     skip_vars_ = skip_vars;
   }
-  // static void ResetBatchId() { batch_id_ = 0; }
-  static std::atomic<int> cpu_id_;
+  void SetStartCpuCoreId(int id) { cpu_id_ = id; }

  protected:
   void AutoSetCPUAffinity(bool reuse);
   int section_id_;
   int thread_id_;
+  int cpu_id_;
   int num_microbatches_;
   std::vector<Scope*> microbatch_scopes_;
   std::vector<std::string> skip_vars_;
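The effect of this hunk is that each SectionWorker is pinned to the core handed to it through the new SetStartCpuCoreId, rather than to the next value of a class-wide std::atomic counter. A rough Python analogue of the new AutoSetCPUAffinity path (a minimal sketch, Linux-only, using os.sched_setaffinity):

import os

def auto_set_cpu_affinity(cpu_id, reuse=True):
    # Rough analogue of the post-commit SectionWorker::AutoSetCPUAffinity:
    # the target core is the worker's own cpu_id_ member (set via
    # SetStartCpuCoreId), not a value drawn from a shared atomic counter.
    cap = os.cpu_count() or 1
    proc = cpu_id
    if proc >= cap:
        if not reuse:
            print("All %d CPUs have been set affinities. "
                  "Fail to set %dth thread." % (cap, cpu_id))
            return
        proc %= cap
    os.sched_setaffinity(0, {proc})  # pin the calling process/thread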
paddle/fluid/framework/pipeline_trainer.cc

@@ -34,8 +34,8 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   ParseDumpConfig(trainer_desc);
   // get filelist from trainer_desc here
   // const std::vector<paddle::framework::DataFeed*> readers =
-  // VLOG(3) << "Number of program sections: " << section_num_;
   // dataset->GetReaders();
+  // VLOG(3) << "Number of program sections: " << section_num_;
   // VLOG(3) << "readers num: " << readers.size();
   // int num_readers = readers.size();
   // PADDLE_ENFORCE_EQ(num_readers, 1,

@@ -108,6 +108,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   this_worker->SetPlace(place_);
   this_worker->Initialize(trainer_desc);
   this_worker->SetMicrobatchNum(num_microbatches_);
+  this_worker->SetStartCpuCoreId(start_cpu_core_id_);

   // set debug here
   SetDebug(trainer_desc.debug());

@@ -207,7 +208,7 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
     } else if (!var->Persistable() && !is_param_grad) {
       auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
       VLOG(3) << "Create variable " << var->Name() << " microbatch "
-              << ", which pointer is " << ptr;
+              << microbatch_id << ", which pointer is " << ptr;
       InitializeVariable(ptr, var->GetType());
     }
   }

@@ -235,39 +236,40 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
   //   }
   // }

-void PipelineTrainer::GetSkipVars(const ProgramDesc& program) {
-  auto& global_block = program.Block(0);
-  for (auto& op : global_block.AllOps()) {
-    if (op->Type() != "c_send") {
-      continue;
-    }
-    auto input_arg_names = op->InputArgumentNames();
-    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "Number of input arguments for c_send op must be 1, "
-                          "but the value given is %d.",
-                          input_arg_names.size()));
-    std::string input_arg_name = input_arg_names[0];
-    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
-      skip_vars_.emplace_back(input_arg_name);
-      VLOG(3) << "add skip var name: " << input_arg_name;
-    }
-  }
-}
+// void PipelineTrainer::GetSkipVars(const ProgramDesc& program) {
+//   auto& global_block = program.Block(0);
+//   for (auto& op : global_block.AllOps()) {
+//     if (op->Type() != "c_send") {
+//       continue;
+//     }
+//     auto input_arg_names = op->InputArgumentNames();
+//     PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
+//                       platform::errors::InvalidArgument(
+//                           "Number of input arguments for c_send op must be 1, "
+//                           "but the value given is %d.",
+//                           input_arg_names.size()));
+//     std::string input_arg_name = input_arg_names[0];
+//     if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
+//       skip_vars_.emplace_back(input_arg_name);
+//       VLOG(3) << "add skip var name: " << input_arg_name;
+//     }
+//   }
+// }

 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
   PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(
                                            "root_scope_ can not be nullptr"));
-  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
-  SectionWorker::cpu_id_.store(start_cpu_id);
+  // auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
+  // SectionWorker::cpu_id_.store(start_cpu_id);
   // minibatch_scopes_.resize(section_num_);
   // microbatch_scopes_.resize(section_num_);
   // minibatch_scopes_.resize(1);
   microbatch_scopes_.resize(num_microbatches_);
   // skip_vars_.resize(section_num_);
-  VLOG(3) << "Init ScopeQueues and create all scopes";
+  VLOG(3) << "Create minibatch and microbatch scopes...";
   // for (int i = 0; i < section_num_; ++i) {
   minibatch_scope_ = &root_scope_->NewScope();
   std::shared_ptr<framework::ProgramDesc> program;

@@ -282,7 +284,7 @@ void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
     CopyParameters(j, *program, place_);
   }
   // GetSkipVars(i, *program);
-  GetSkipVars(*program);
+  // GetSkipVars(*program);
   // }
   // for (int i = 0; i < section_num_; ++i) {
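GetSkipVars, disabled above, collected every c_send input that is not a gradient as a skip variable so it survives garbage collection between micro-batches. The suffix test it relies on can be checked in isolation; a minimal sketch of the same string logic, with illustrative variable names:

def is_grad_name(name):
    # C++ condition: a name is a gradient only when "@GRAD" occupies
    # its final 5 characters (rfind("@GRAD") == size() - 5).
    return name.rfind("@GRAD") == len(name) - 5

assert is_grad_name("fc_0.w_0@GRAD")   # gradient: never a skip var
assert not is_grad_name("fc_0.tmp_1")  # activation: would go into skip_vars_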
paddle/fluid/framework/section_worker.cc

@@ -30,7 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-std::atomic<int> SectionWorker::cpu_id_(0);
+// std::atomic<int> SectionWorker::cpu_id_(0);
 // std::mutex SectionWorker::thread_mutex;
 // std::mutex SectionWorker::cout_mutex;
 // std::condition_variable SectionWorker::thread_condition;

@@ -48,18 +48,20 @@ void SectionWorker::Initialize(const TrainerDesc& desc) {
 }

 void SectionWorker::AutoSetCPUAffinity(bool reuse) {
-  int thread_cpu_id = cpu_id_.fetch_add(1);
+  // int thread_cpu_id = cpu_id_.fetch_add(1);

   unsigned concurrency_cap = std::thread::hardware_concurrency();
-  unsigned proc = thread_cpu_id;
+  // unsigned proc = thread_cpu_id;
+  unsigned proc = cpu_id_;

   if (proc >= concurrency_cap) {
     if (reuse) {
       proc %= concurrency_cap;
     } else {
       LOG(INFO) << "All " << concurrency_cap
-                << " CPUs have been set affinities. Fail to set "
-                << thread_cpu_id << "th thread";
+                << " CPUs have been set affinities. Fail to set "
+                << cpu_id_ << "th thread.";
+      // << thread_cpu_id << "th thread";
       return;
     }
   }

@@ -78,7 +80,8 @@ void SectionWorker::AutoSetCPUAffinity(bool reuse) {
       (0 == CPU_ISSET(proc, &mask))) {
     LOG(WARNING) << "Fail to set thread affinity to CPU " << proc;
   }
-  VLOG(3) << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc;
+  // VLOG(3) << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc;
+  VLOG(3) << "Set " << cpu_id_ << "th thread affinity to CPU " << proc;
 }

 void SectionWorker::TrainFiles() {

@@ -141,7 +144,8 @@ void SectionWorker::TrainFiles() {
         VLOG(3) << "thread completed.";
         // VLOG(3) << "called notify all";
         // thread_condition.notify_all();
-        VLOG(0) << "EOF encountered";
+        VLOG(3) << "EOF encountered";
+        // throw platform::EOFException();
         break;
       }
     }

@@ -191,8 +195,8 @@ void SectionWorker::TrainFilesWithProfiler() {
   platform::Timer batch_timer;
   platform::Timer timeline;

-  std::vector<double> op_total_time;
   std::vector<std::string> op_name;
+  std::vector<double> op_total_time;
   std::vector<double> op_max_time;
   std::vector<double> op_min_time;
   std::vector<uint64_t> op_count;

@@ -204,6 +208,7 @@ void SectionWorker::TrainFilesWithProfiler() {
   op_min_time.resize(ops_.size());
   for (size_t i = 0; i < op_min_time.size(); ++i) {
     op_min_time[i] = DBL_MAX;
+    op_max_time[i] = 0.0;
   }
   op_count.resize(ops_.size());

@@ -235,7 +240,7 @@ void SectionWorker::TrainFilesWithProfiler() {
   struct timeval micro_end;
   // Start a minibatch.
   batch_timer.Start();
-  int real_microbatch_num = 0;
+  // int real_microbatch_num = 0;
   for (int i = 0; i < num_microbatches_; ++i) {
     try {
       int op_idx = 0;

@@ -253,8 +258,9 @@ void SectionWorker::TrainFilesWithProfiler() {
             op_role == (static_cast<int>(OpRole::kForward) |
                         static_cast<int>(OpRole::kLoss));
         if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
-          VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
-                  << " for scope " << i;
+          // VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+          //         << " for scope " << i;
+          VLOG(3) << "running an op " << op->Type() << " for scope " << i;
           timeline.Start();
           op->Run(*microbatch_scopes_[i], place_);
           if (gc) {

@@ -365,11 +371,11 @@ void SectionWorker::TrainFilesWithProfiler() {
       }
     }
   dev_ctx_->Wait();
-  if (real_microbatch_num == 0) {
-    batch_timer.Pause();
-    VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
-    return;
-  }
+  // if (real_microbatch_num == 0) {
+  //   batch_timer.Pause();
+  //   VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
+  //   return;
+  // }
   // update pass
   int op_idx = 0;
   gettimeofday(&micro_start, NULL);
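The profiler hunks seed op_min_time with DBL_MAX and now also reset op_max_time to 0.0 before accumulating per-op statistics. A minimal sketch of that accumulation pattern, with assumed per-op elapsed times:

import sys

num_ops = 3
op_total_time = [0.0] * num_ops
op_max_time = [0.0] * num_ops                  # newly re-initialized here
op_min_time = [sys.float_info.max] * num_ops   # DBL_MAX equivalent
op_count = [0] * num_ops

def record(op_idx, elapsed_us):
    # One timeline sample per op per micro-batch, as in
    # SectionWorker::TrainFilesWithProfiler.
    op_total_time[op_idx] += elapsed_us
    op_max_time[op_idx] = max(op_max_time[op_idx], elapsed_us)
    op_min_time[op_idx] = min(op_min_time[op_idx], elapsed_us)
    op_count[op_idx] += 1

record(0, 12.5)
record(0, 8.0)
assert (op_min_time[0], op_max_time[0]) == (8.0, 12.5)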
paddle/fluid/framework/trainer_desc.proto

@@ -84,7 +84,7 @@ message DownpourWorkerParameter {
 }

 message SectionWorkerParameter {
-  repeated SectionConfig section_config = 1;
+  optional SectionConfig section_config = 1;
   optional int32 queue_size = 2 [ default = 1 ];
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
paddle/fluid/operators/collective/c_recv_op.cc

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_recv_op.h"
+#include <string>

 namespace paddle {
 namespace operators {

@@ -33,14 +34,36 @@ class CRecvOp : public framework::OperatorWithKernel {
         ring_id, 0,
         platform::errors::InvalidArgument(
             "The ring_id (%d) for c_send_op must be non-negative.", ring_id));
+    auto out_shape = ctx->Attrs().Get<std::vector<int>>("out_shape");
+    PADDLE_ENFORCE_GE(out_shape.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The size of the output shape must be greater than 0 "
+                          "but the value given is %d.",
+                          out_shape.size()));
   }

  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto out = ctx.Output<framework::LoDTensor>("Out");
-    auto dtype = out->type();
-    return framework::OpKernelType(dtype, ctx.GetPlace());
+    VLOG(0) << "wow1";
+    std::string dtype = ctx.Attr<std::string>("dtype");
+    framework::proto::VarType::Type type;
+    if (dtype == "fp32") {
+      type = framework::proto::VarType::FP32;
+    } else if (dtype == "fp64") {
+      type = framework::proto::VarType::FP64;
+    } else if (dtype == "fp16") {
+      type = framework::proto::VarType::FP16;
+    } else if (dtype == "int32") {
+      type = framework::proto::VarType::INT32;
+    } else if (dtype == "int64") {
+      type = framework::proto::VarType::INT64;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unknown data type %s for c_recv op.", dtype));
+    }
+    VLOG(0) << "wow2";
+    return framework::OpKernelType(type, ctx.GetPlace());
+    // OperatorWithKernel::IndicateVarDataType(ctx, "Out"), ctx.GetPlace());
   }
 };

@@ -52,6 +75,11 @@ class CRecvOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
         .SetDefault(0);
+    AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
+    AddAttr<std::string>("dtype",
+                         "(std::string default fp32) data type of tensor.")
+        .SetDefault("fp32");
+    AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
+        .SetDefault(std::vector<int>());
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
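Because the receiving side cannot infer the element type from an output tensor that has not been allocated yet, c_recv now takes dtype (and out_shape) as attributes and maps the string to a VarType in GetExpectedKernelType. A minimal sketch of the same dispatch:

_C_RECV_DTYPES = {
    "fp32": "FP32",
    "fp64": "FP64",
    "fp16": "FP16",
    "int32": "INT32",
    "int64": "INT64",
}

def expected_kernel_dtype(dtype_attr):
    # Mirrors the if/else chain in CRecvOp::GetExpectedKernelType.
    if dtype_attr not in _C_RECV_DTYPES:
        raise ValueError("Unknown data type %s for c_recv op." % dtype_attr)
    return _C_RECV_DTYPES[dtype_attr]

assert expected_kernel_dtype("fp32") == "FP32"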
paddle/fluid/operators/collective/c_recv_op.cu.cc

@@ -27,13 +27,20 @@ class CRecvOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_NCCL)
+    VLOG(0) << "here1";
     auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = out->numel();
-    ncclDataType_t dtype = platform::ToNCCLDataType(out->type());
+    VLOG(0) << "here2";
+    auto out_shape = ctx.Attr<std::vector<int>>("out_shape");
+    auto out_dims = paddle::framework::make_ddim(out_shape);

     int rid = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
     auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    out->mutable_data<T>(out_dims, place);
+    VLOG(0) << "out_dims:" << out_dims;
+    ncclDataType_t dtype = platform::ToNCCLDataType(out->type());
+    int numel = out->numel();
+    VLOG(0) << "numel:" << numel;

     cudaStream_t stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {

@@ -49,9 +56,10 @@ class CRecvOpCUDAKernel : public framework::OpKernel<T> {
                       platform::errors::InvalidArgument(
                           "The value of peer (%d) you set must "
                           "be less than comm->nranks (%d).",
                           peer, comm->nranks()));
+    VLOG(0) << "here3";
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv(
         out->data<T>(), numel, dtype, peer, comm->comm(), stream));
-    VLOG(3) << "rank " << comm->rank() << " recv "
+    VLOG(0) << "rank " << comm->rank() << " recv "
             << framework::product(out->dims()) << " from " << peer;
 #else
     PADDLE_THROW(
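The CUDA kernel now sizes the output itself: it builds dims from the out_shape attribute, calls mutable_data before ncclRecv, and only then computes numel and the NCCL dtype. The shape-to-element-count step, sketched under the same non-empty-shape assumption the op enforces:

from functools import reduce
from operator import mul

def numel_from_shape(out_shape):
    # make_ddim(out_shape) followed by out->numel(): the receive buffer
    # holds the product of the attribute-provided dimensions.
    assert len(out_shape) >= 1, "out_shape must not be empty"
    return reduce(mul, out_shape, 1)

assert numel_from_shape([4, 8, 2]) == 64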
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and

 from __future__ import print_function
+from __future__ import division

 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name

@@ -21,9 +22,50 @@ from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, \
     is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op


-class PipelineHelper(CollectiveHelper):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
-        super(PipelineHelper, self).__init__(role_maker, nrings, wait_port)
+def _get_node_num(endpoints):
+    ss = set()
+    for ep in endpoints:
+        ip = ep.split(":")[0].strip()
+        if ip not in ss:
+            ss.add(ip)
+    return len(ss)
+
+
+class PipelineHelper(object):
+    def __init__(self, role_maker, wait_port='6174'):
+        self.wait_port = wait_port
+        self.role_maker = role_maker
+
+    def update_startup_program(self, startup_program=None):
+        self.startup_program = startup_program
+        if startup_program is None:
+            self.startup_program = fluid.default_startup_program()
+
+        endpoints = self.role_maker.get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker.worker_index()]
+        node_num = _get_node_num(endpoints)
+        assert len(endpoints) % node_num == 0
+        gpus_per_node = len(endpoints) // node_num
+
+        # Create a global ring for all gpus
+        print("current_endpoint:", current_endpoint)
+        print("endpoints:", endpoints)
+        print("rank:", self.role_maker.worker_index())
+        self._init_communicator(
+            self.startup_program, current_endpoint, endpoints,
+            self.role_maker.worker_index(), 0, self.wait_port)
+        if node_num == 1: return
+
+        # Create rings for gpus with the same gpu id
+        eps = []
+        local_rank = self.role_maker.worker_index() % gpus_per_node
+        ring_id = local_rank + 1
+        for i in range(node_num):
+            eps.append(endpoints[i * gpus_per_node + local_rank])
+        temp_rank = self.role_maker.worker_index() // node_num
+        self._init_communicator(self.startup_program, current_endpoint, eps,
+                                temp_rank, ring_id, self.wait_port)
+        self._broadcast_params(ring_id)

     def _init_communicator(self, program, current_endpoint, endpoints, rank,
                            ring_id, wait_port):
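The rewritten helper derives the whole topology from the endpoint list: distinct IPs give node_num, ring 0 spans all GPUs, and ring local_rank + 1 connects the GPUs with the same local index across nodes. A worked example under an assumed 2-node, 2-GPUs-per-node layout (addresses are illustrative):

def _get_node_num(endpoints):
    # Same logic as the helper above: count distinct IPs.
    return len({ep.split(":")[0].strip() for ep in endpoints})

endpoints = ["10.0.0.1:6070", "10.0.0.1:6071",
             "10.0.0.2:6070", "10.0.0.2:6071"]   # assumed layout
rank = 1                                          # this worker's index
node_num = _get_node_num(endpoints)               # 2
gpus_per_node = len(endpoints) // node_num        # 2
local_rank = rank % gpus_per_node                 # 1
ring_id = local_rank + 1                          # ring 0 is the global ring
eps = [endpoints[i * gpus_per_node + local_rank] for i in range(node_num)]
assert eps == ["10.0.0.1:6071", "10.0.0.2:6071"]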
@@ -46,9 +88,8 @@ class PipelineHelper(CollectiveHelper):
                 'rank': rank,
                 'endpoint': current_endpoint,
                 'other_endpoints': other_endpoints,
-                OP_ROLE_KEY: OpRole.Forward
+                OP_ROLE_KEY: OpRole.Forward,
             })
         block.append_op(
             type='c_comm_init',
             inputs={'X': nccl_id_var},

@@ -58,12 +99,10 @@ class PipelineHelper(CollectiveHelper):
                 'rank': rank,
                 'ring_id': ring_id,
-                OP_ROLE_KEY: OpRole.Forward,
-                'device_id': OpRole.Forward
+                OP_ROLE_KEY: OpRole.Forward
             })

-    def _broadcast_params(self):
+    def _broadcast_params(self, ring_id):
         block = self.startup_program.global_block()
-        ring_id = 0
         for param in block.iter_parameters():
             if param.is_distributed:
                 continue

@@ -78,13 +117,12 @@ class PipelineHelper(CollectiveHelper):
                     OP_ROLE_KEY: OpRole.Forward
                 })

-        for ring_id in range(self.nrings):
-            block.append_op(
-                type='c_sync_comm_stream',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={'ring_id': ring_id,
-                       OP_ROLE_KEY: OpRole.Forward})
+        block.append_op(
+            type='c_sync_comm_stream',
+            inputs={'X': param},
+            outputs={'Out': param},
+            attrs={'ring_id': ring_id,
+                   OP_ROLE_KEY: OpRole.Forward})


 class PipelineOptimizer(MetaOptimizerBase):

@@ -100,7 +138,12 @@ class PipelineOptimizer(MetaOptimizerBase):
         super(PipelineOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
         num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
-        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
+        endpoints = role_maker.get_trainer_endpoints()
+        current_endpoint = endpoints[role_maker.worker_index()]
+        self.local_rank = self._get_local_rank(current_endpoint, endpoints)
+        self.wrapped_opt = PO(self.inner_opt,
+                              num_microbatches=num_microbatches,
+                              start_cpu_core_id=self.local_rank)

     def _can_apply(self):
         if self.user_defined_strategy.pipeline == True:

@@ -111,23 +154,37 @@ class PipelineOptimizer(MetaOptimizerBase):
         dist_strategy.pipeline = False
         dist_strategy.pipeline_configs = {}

+    def _get_local_rank(self, current_endpoint, endpoints):
+        cur_node_endpoints = []
+        cur_ip = current_endpoint.split(':')[0].strip()
+        for ep in endpoints:
+            if cur_ip == ep.split(':')[0].strip():
+                cur_node_endpoints.append(ep)
+        return cur_node_endpoints.index(current_endpoint)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        optimize_ops, params_grads, prog_list = \
-            self.wrapped_opt.minimize(loss, startup_program, parameter_list,
-                                      no_grad_set)
-        if self.role_maker.worker_num() == 1:
-            return optimize_ops, params_grads
         endpoints = self.role_maker.get_trainer_endpoints()
         current_endpoint = endpoints[self.role_maker.worker_index()]
+        node_num = _get_node_num(endpoints)
+        gpus_per_node = len(endpoints) // node_num
         self.startup_program = startup_program
+        self.local_rank = self._get_local_rank(current_endpoint, endpoints)
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()

+        if self.role_maker.worker_num() == 1:
+            return self.inner_opt.minimize(loss, startup_program,
+                                           parameter_list, no_grad_set)
+
+        loss.block.program._pipeline_opt = dict()
+        loss.block.program._pipeline_opt['local_rank'] = self.local_rank
+        optimize_ops, params_grads, prog_list = \
+            self.wrapped_opt.minimize(loss, startup_program, parameter_list,
+                                      no_grad_set)
+        assert prog_list
+
         self.main_program_list = prog_list
         self.main_program = loss.block.program

@@ -139,24 +196,24 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.endpoints = endpoints
         self.current_endpoint = current_endpoint

-        pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings)
+        pipeline_helper = PipelineHelper(self.role_maker)
         pipeline_helper.update_startup_program(self.startup_program)

-        self._transpile_main_program()
+        self._transpile_main_program(loss, node_num, gpus_per_node)
         return optimize_ops, params_grads

-    def _transpile_main_program(self):
-        self._insert_loss_grad_ops()
-        for ring_id in range(self.nrings):
+    def _transpile_main_program(self, loss, node_num, gpus_per_node):
+        self._insert_loss_grad_ops(loss, gpus_per_node, node_num)
+        for ring_id in range(1, node_num + 1):
             self._insert_allreduce_ops(ring_id)

-    def _insert_loss_grad_ops(self):
+    def _insert_loss_grad_ops(self, loss, gpus_per_node, node_num):
         """
        In order to keep the learning rate consistent in different numbers of
        training workers, we scale the loss grad by the number of workers
        """
-        block = self.main_program_list[self.nrings - 1]['program'].global_block()
+        block = self.main_program_list[gpus_per_node - 1]['program'].global_block()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]

@@ -166,12 +223,12 @@ class PipelineOptimizer(MetaOptimizerBase):
                         inputs={'X': loss_grad_var},
                         outputs={'Out': loss_grad_var},
                         attrs={
-                            'scale': 1.0 / self.nranks,
+                            'scale': 1.0 / node_num,
                             OP_ROLE_KEY: OpRole.Backward
                         })

     def _insert_allreduce_ops(self, ring_id):
-        block = self.main_program_list[ring_id]['program'].global_block()
+        block = self.main_program_list[ring_id - 1]['program'].global_block()
         origin_block = self.main_program.global_block()
         grad = None
         for idx, op in reversed(list(enumerate(block.ops))):
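_get_local_rank, added above, is the index of the current endpoint among the endpoints sharing its IP; the same value doubles as the start_cpu_core_id passed to the wrapped optimizer. A usage sketch with assumed addresses (the diff builds the list with a loop; a list comprehension is equivalent):

def _get_local_rank(current_endpoint, endpoints):
    # As added in this commit: position among same-IP endpoints.
    cur_ip = current_endpoint.split(':')[0].strip()
    cur_node_endpoints = [ep for ep in endpoints
                          if ep.split(':')[0].strip() == cur_ip]
    return cur_node_endpoints.index(current_endpoint)

eps = ["10.0.0.1:6070", "10.0.0.1:6071", "10.0.0.2:6070"]
assert _get_local_rank("10.0.0.1:6071", eps) == 1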
python/paddle/fluid/device_worker.py

@@ -406,25 +406,44 @@ class Section(DeviceWorker):
         section_param = trainer_desc.section_param
         section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for i, program in enumerate(pipeline_opt["section_program_list"]):
-            cfg = section_param.section_config.add()
-            cfg.program_desc.ParseFromString(program["program"]._get_desc()
-                                             .serialize_to_string())
-            # TODO: why does not work
-            # cfg.program_desc.CopyFrom(program.program._get_desc())
-            place = pipeline_opt["place_list"][i]
-            place_id = pipeline_opt["place_id_list"][i]
-            if isinstance(place, core.CPUPlace):
-                cfg.place = cfg.CPUPlace
-            elif isinstance(place, core.CUDAPlace):
-                cfg.place = cfg.CUDAPlace
-            elif isinstance(place, core.CUDAPinnedPlace):
-                cfg.place = cfg.CUDAPinnedPlace
-            else:
-                raise NotImplementedError(
-                    "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
-                )
-            cfg.place_id = place_id
+        cfg = section_param.section_config
+        program = pipeline_opt["section_program"]
+        cfg.program_desc.ParseFromString(program["program"]._get_desc()
+                                         .serialize_to_string())
+        # TODO: why does not work
+        # cfg.program_desc.CopyFrom(program.program._get_desc())
+        place = pipeline_opt["place"]
+        place_id = pipeline_opt["place_id"]
+        if isinstance(place, core.CPUPlace):
+            cfg.place = cfg.CPUPlace
+        elif isinstance(place, core.CUDAPlace):
+            cfg.place = cfg.CUDAPlace
+        elif isinstance(place, core.CUDAPinnedPlace):
+            cfg.place = cfg.CUDAPinnedPlace
+        else:
+            raise NotImplementedError(
+                "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
+            )
+        cfg.place_id = place_id
+        # for i, program in enumerate(pipeline_opt["section_program_list"]):
+        #     cfg = section_param.section_config.add()
+        #     cfg.program_desc.ParseFromString(program["program"]._get_desc()
+        #                                      .serialize_to_string())
+        #     # TODO: why does not work
+        #     # cfg.program_desc.CopyFrom(program.program._get_desc())
+        #     place = pipeline_opt["place_list"][i]
+        #     place_id = pipeline_opt["place_id_list"][i]
+        #     if isinstance(place, core.CPUPlace):
+        #         cfg.place = cfg.CPUPlace
+        #     elif isinstance(place, core.CUDAPlace):
+        #         cfg.place = cfg.CUDAPlace
+        #     elif isinstance(place, core.CUDAPinnedPlace):
+        #         cfg.place = cfg.CUDAPinnedPlace
+        #     else:
+        #         raise NotImplementedError(
+        #             "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
+        #         )
+        #     cfg.place_id = place_id


 class DeviceWorkerFactory(object):
python/paddle/fluid/optimizer.py

@@ -3818,6 +3818,24 @@ class PipelineOptimizer(object):
         return programs

+    def _split_startup_program(self, startup_program, local_rank):
+        block = startup_program.block(0)
+        new_startup_program = Program()
+        for op in block.ops:
+            device = op.attr(self._op_device_key)
+            if device:
+                device_index = int(device.split(":")[1])
+            else:
+                device_index = 0
+            if device_index != local_rank: continue
+            op_role = op.attr(self._op_role_key)
+            op_desc = op.desc
+            ap_op = new_startup_program.block(0).desc.append_op()
+            ap_op.copy_from(op_desc)
+            ap_op._set_attr(self._op_device_key, device)
+        self._create_vars(new_startup_program.block(0), startup_program)
+        return new_startup_program
+
     def _find_post_op(self, ops, cur_op, var_name):
         """
         Find the real post op that has variable named var_name as input.

@@ -3933,6 +3951,7 @@ class PipelineOptimizer(object):
             if op.type == "read":
                 break
         first_dev_spec = devices[0]
+        first_dev_index = int(first_dev_spec.split(':')[1])
         for var_name in data_devices_map.keys():
             for device in data_devices_map[var_name]:
                 if device == first_dev_spec: continue

@@ -3940,13 +3959,15 @@ class PipelineOptimizer(object):
                 assert main_var.is_data
                 if not var_name in first_block.vars:
                     self._create_var(first_block, main_var, var_name)
+                dev_index = int(device.split(':')[1])
                 first_block._insert_op(
                     index=insert_index,
                     type='c_send',
                     inputs={'X': first_block.var(var_name)},
                     attrs={
                         self._op_device_key: first_dev_spec,
-                        self._op_role_key: self._op_role.Forward
+                        self._op_role_key: self._op_role.Forward,
+                        'peer': dev_index
                     })
                 # Get the device that that data on
                 assert device in devices
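Every new 'peer' attribute in these hunks is derived the same way: the op's device annotation is a "<type>:<index>" string, and the peer rank is the integer after the colon. A one-line sketch (the "gpu:2" value is illustrative):

def device_index(device_spec):
    # e.g. an _op_device_key value like "gpu:2" -> peer rank 2
    return int(device_spec.split(':')[1])

assert device_index("gpu:2") == 2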
@@ -3961,8 +3982,10 @@ class PipelineOptimizer(object):
                     type='c_recv',
                     outputs={'Out': [new_var]},
                     attrs={
+                        'out_shape': new_var.shape,
                         self._op_device_key: device,
-                        self._op_role_key: self._op_role.Forward
+                        self._op_role_key: self._op_role.Forward,
+                        'peer': first_dev_index
                     })

     def _strip_grad_suffix(self, name):

@@ -4105,13 +4128,16 @@ class PipelineOptimizer(object):
             op_role = op.all_attrs()[self._op_role_key]
             var = block.vars[var_name]
+            prev_device_index = int(prev_device_spec.split(':')[1])
+            cur_device_index = int(cur_device_spec.split(':')[1])
             block._insert_op(
                 index=index + extra_index,
                 type='c_send',
                 inputs={'X': var},
                 attrs={
                     self._op_device_key: prev_device_spec,
-                    self._op_role_key: op_role
+                    self._op_role_key: op_role,
+                    'peer': prev_device_index
                 })
             extra_index += 1
             block._insert_op(

@@ -4119,8 +4145,10 @@ class PipelineOptimizer(object):
                 type='c_recv',
                 outputs={'Out': [var]},
                 attrs={
+                    'out_shape': var.shape,
                     self._op_device_key: cur_device_spec,
-                    self._op_role_key: op_role
+                    self._op_role_key: op_role,
+                    'peer': cur_device_index
                 })
             extra_index += 1

@@ -4271,9 +4299,13 @@ class PipelineOptimizer(object):
             write_prog = write_info[var_name]
             write_block = write_prog.block(0)
             write_device = self._get_device_info(write_block)
+            write_dev_index = int(write_device.split(':')[1])
             all_progs = var_info[var_name]
             for prog in all_progs:
                 if prog == write_prog: continue
+                read_block = prog.block(0)
+                read_device = self._get_device_info(read_block)
+                read_dev_index = int(read_device.split(':')[1])

                 write_block._insert_op(
                     index=0,

@@ -4283,19 +4315,20 @@ class PipelineOptimizer(object):
                         self._op_device_key: write_device,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
-                        self._op_role_key: self._op_role.LRSched
+                        self._op_role_key: self._op_role.LRSched,
+                        'peer': read_dev_index
                     })
-                read_block = prog.block(0)
-                read_device = self._get_device_info(read_block)
                 read_block._insert_op(
                     index=0,
                     type='c_recv',
                     outputs={'Out': [read_block.var(var_name)]},
                     attrs={
+                        'out_shape': read_block.var(var_name).shape,
                         self._op_device_key: read_device,
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
-                        self._op_role_key: self._op_role.LRSched
+                        self._op_role_key: self._op_role.LRSched,
+                        'peer': write_dev_index
                     })

     def minimize(self,

@@ -4363,12 +4396,25 @@ class PipelineOptimizer(object):
         # Step7: Add sub blocks for section programs
         self._add_sub_blocks(main_block, program_list)

+        assert (main_program._pipeline_opt and
+                isinstance(main_program._pipeline_opt, dict) and
+                'local_rank' in main_program._pipeline_opt), \
+                "You must use pipeline with fleet"
+        local_rank = main_program._pipeline_opt['local_rank']
+
+        # Step8: Split startup program
+        startup_program = self._split_startup_program(
+            startup_program, program_list[local_rank]['program'])
+
+        with open("startup_prog_%d" % local_rank, 'w') as f:
+            f.writelines(str(startup_program))
+        with open("main_prog_%d" % local_rank, 'w') as f:
+            f.writelines(str(program_list[local_rank]['program']))
+
         main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
-            "section_program_list": program_list,
-            "place_list": place_list,
-            "place_id_list": place_id_list,
+            "section_program": program_list[local_rank],
+            "place": place_list[local_rank],
+            "place_id": place_id_list[local_rank],
             "sync_steps": -1,
             "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,