Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
14fe40aa
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
14fe40aa
编写于
3月 14, 2018
作者:
D
dzhwinter
提交者:
GitHub
3月 14, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refine/nccl (#9009)
* "Refine nccl op" * "refine code " * "refine nccl code"
上级
788c600e
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
89 addition
and
142 deletion
+89
-142
paddle/fluid/operators/nccl_op.cc
paddle/fluid/operators/nccl_op.cc
+45
-47
paddle/fluid/operators/nccl_op.cu.cc
paddle/fluid/operators/nccl_op.cu.cc
+44
-95
未找到文件。
paddle/fluid/operators/nccl_op.cc
浏览文件 @
14fe40aa
...
...
@@ -104,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
" Input(Communicator) of AllReduce op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
" Output(Out) of AllReduce op output should not be NULL"
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
std
::
string
reduction
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"reduction"
);
PADDLE_ENFORCE
((
reduction
==
"ncclSum"
||
reduction
==
"ncclProd"
||
reduction
==
"ncclMin"
||
reduction
==
"ncclMax"
),
"invalid reduction."
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
ctx
->
SetOutputsDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
// AllReduceOp
class
NCCLAllReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
NCCLAllReduceOpMaker
(
OpProto
*
proto
,
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"The input of AllReduce op"
);
AddInput
(
"Communicator"
,
"Communicator for communicating between gpus"
);
AddOutput
(
"Out"
,
"The output of AllReduce op"
);
AddAttr
<
std
::
string
>
(
"reduction"
,
"(string, default 'ncclSum') "
"{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}."
)
.
SetDefault
(
"ncclSum"
);
AddComment
(
R"DOC(
NCCLAllReduce Operator.
AllReduce the input tensors.
)DOC"
);
}
};
// ReduceOp
class
NCCLReduceOp
:
public
framework
::
OperatorWithKernel
{
public:
...
...
@@ -143,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
}
};
// BcastOp
class
NCCLBcastOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
" Input(X) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Communicator"
),
" Input(Communicator) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
" Output(Out) of Bcast op output should not be NULL"
);
int
root
=
ctx
->
Attrs
().
Get
<
int
>
(
"root"
);
PADDLE_ENFORCE
(
root
!=
platform
::
kInvalidGPUId
,
"Bcast root must be set."
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
ctx
->
SetOutputsDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
// AllreduceOp
class
NCCLAllReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
NCCLAllReduceOpMaker
(
OpProto
*
proto
,
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"The input of AllReduce op"
);
AddInput
(
"Communicator"
,
"Communicator for communicating between gpus"
);
AddOutput
(
"Out"
,
"The output of AllReduce op"
);
AddAttr
<
std
::
string
>
(
"reduction"
,
"(string, default 'ncclSum') "
"{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}."
)
.
SetDefault
(
"ncclSum"
);
AddComment
(
R"DOC(
NCCLAllReduce Operator.
AllReduce the input tensors.
)DOC"
);
}
};
// ReduceOp
class
NCCLReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
...
...
@@ -213,6 +188,29 @@ Reduce the tensors.
}
};
// BcastOp
class
NCCLBcastOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
" Input(X) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Communicator"
),
" Input(Communicator) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
" Output(Out) of Bcast op output should not be NULL"
);
int
root
=
ctx
->
Attrs
().
Get
<
int
>
(
"root"
);
PADDLE_ENFORCE
(
root
!=
platform
::
kInvalidGPUId
,
"Bcast root must be set."
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
ctx
->
SetOutputsDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
// BcastOp
class
NCCLBcastOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
...
...
paddle/fluid/operators/nccl_op.cu.cc
浏览文件 @
14fe40aa
...
...
@@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"This kernel only runs on GPU device."
);
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
out
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
std
::
string
reduction
=
ctx
.
Attr
<
std
::
string
>
(
"reduction"
);
ncclRedOp_t
reduction_op_
=
ncclSum
;
ncclRedOp_t
reduction_op_
=
ncclSum
;
if
(
reduction
==
"ncclMin"
)
{
reduction_op_
=
ncclMin
;
}
else
if
(
reduction
==
"ncclMax"
)
{
...
...
@@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
}
else
{
PADDLE_THROW
(
"Invalid reduction. default ncclSum."
);
}
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
// device id
int
gpu_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
VLOG
(
1
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
ins
[
i
]
->
data
<
T
>
(),
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
comm
->
comms
().
at
(
idx
),
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
}
VLOG
(
3
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
x
->
data
<
T
>
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"This kernel only runs on GPU device."
);
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
// x0, x1, x2
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out
"
);
auto
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
// x0, x1, x2
auto
out
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator
"
);
int
root
=
ctx
.
Attr
<
int
>
(
"root"
);
std
::
string
reduction
=
ctx
.
Attr
<
std
::
string
>
(
"reduction"
);
ncclRedOp_t
reduction_op_
=
ncclSum
;
ncclRedOp_t
reduction_op_
=
ncclSum
;
if
(
reduction
==
"ncclMin"
)
{
reduction_op_
=
ncclMin
;
}
else
if
(
reduction
==
"ncclMax"
)
{
...
...
@@ -112,40 +100,21 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
}
else
{
PADDLE_THROW
(
"Invalid reduction. default ncclSum."
);
}
int
root
=
ctx
.
Attr
<
int
>
(
"root"
);
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
();
// device id
int
gpu_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
auto
ins_names
=
ctx
.
Inputs
(
"X"
);
std
::
hash
<
std
::
string
>
hasher
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
if
(
root
==
platform
::
kInvalidGPUId
)
{
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms
().
size
();
}
T
*
recvbuffer
=
nullptr
;
if
(
root
==
gpu_id
)
{
recvbuffer
=
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
ins
[
i
]
->
data
<
T
>
(),
recvbuffer
,
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
().
at
(
idx
),
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
T
*
recvbuffer
=
nullptr
;
if
(
root
==
gpu_id
)
{
recvbuffer
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
x
->
data
<
T
>
(),
recvbuffer
,
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -155,47 +124,27 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"This kernel only runs on GPU device."
);
int
root
=
ctx
.
Attr
<
int
>
(
"root"
);
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
();
// device id
int
gpu_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
if
(
idx
==
root
)
{
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
ins
[
i
]
->
numel
();
VLOG
(
1
)
<<
" before ncclBcast"
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
stream
));
VLOG
(
1
)
<<
" after ncclBcast"
;
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
}
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
x
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
x
->
data
<
T
>
(),
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
}
else
{
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
outs
[
i
]
->
dims
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
outs
[
i
]
->
numel
();
}
auto
*
out
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
out
->
dims
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
out
->
numel
();
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录