Unverified commit 027f9953
Authored by WangXi on May 14, 2020; committed via GitHub on May 14, 2020.

Optimize error message, include dgc, nccl, size op (#24456)

Parent: a5361982

Showing 7 changed files with 145 additions and 127 deletions.
paddle/fluid/operators/dgc_clip_by_norm_op.cc     +2   -2
paddle/fluid/operators/dgc_op.cc                  +15  -22
paddle/fluid/operators/dgc_op.h                   +18  -5
paddle/fluid/operators/nccl/nccl_op.cc            +38  -37
paddle/fluid/operators/nccl/nccl_op.cu.cc         +33  -34
paddle/fluid/operators/nccl/nccl_op_test.cu.cc    +36  -23
paddle/fluid/operators/size_op.cc                 +3   -4
paddle/fluid/operators/dgc_clip_by_norm_op.cc

```diff
@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "current_step should be set.");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCClipByNormOp");
 
     return ClipByNormOp::InferShape(ctx);
   }
```
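Throughout this commit, bare `PADDLE_ENFORCE(cond, "msg")` checks on operator inputs and outputs are replaced by `OP_INOUT_CHECK`. As a rough sketch of what that macro does (an assumption inferred from the call sites in this diff, not its verbatim definition in Paddle's enforce headers), it bundles the condition with a typed `NotFound` error naming the slot kind, the slot name, and the operator:

```cpp
// Illustrative sketch only; the real macro lives in Paddle's enforce
// headers and also records file/line context. The argument shape is
// taken from the call sites in this commit:
//   OP_INOUT_CHECK(cond, "Input" or "Output", name, op_type)
#define OP_INOUT_CHECK(condition, kind, name, op_type)          \
  PADDLE_ENFORCE_EQ(condition, true,                            \
                    platform::errors::NotFound(                 \
                        "No %s(%s) found for operator (%s).",   \
                        kind, name, op_type))
```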
paddle/fluid/operators/dgc_op.cc

```diff
@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Param"), true,
-        platform::errors::NotFound("Input(Param) of DGCop is not found."));
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "Input(current_step) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      "Input(nranks) of DGCop should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
-                   "Output(U_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
-                   "Output(V_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("k"),
-                   "Output(k) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
-                   "Output(EncodeGrad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
-                      "Output(EncodeGrad) of DGCop should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCOp");
+
+    OP_INOUT_CHECK(ctx->HasOutput("U_out"), "Output", "U_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("V_out"), "Output", "V_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("k"), "Output", "k", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("EncodeGrad"), "Output", "EncodeGrad",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("GatherBuff"), "Output", "GatherBuff",
+                   "DGCOp");
   }
 
  protected:
```
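All twelve checks now share one shape, which also fixes the old copy-paste bug where the GatherBuff check reported "EncodeGrad". Purely as an illustration of the uniformity (this loop is not in the commit), the input half could be condensed to:

```cpp
// Hypothetical condensation, not code from this commit:
for (const char* name : {"U", "V", "Grad", "Param", "current_step", "nranks"}) {
  OP_INOUT_CHECK(ctx->HasInput(name), "Input", name, "DGCOp");
}
```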
paddle/fluid/operators/dgc_op.h

```diff
@@ -24,14 +24,22 @@ namespace operators {
 inline float get_period_sparcity(const std::vector<float>& sparsity,
                                  float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
+  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0,
+                    platform::errors::InvalidArgument(
+                        "DGC current step=%d, but it must >= 0, "
+                        "please submit issue in github",
+                        static_cast<int>(cur_step)));
 
   size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
   if (idx >= sparsity.size()) {
     idx = sparsity.size() - 1;
   }
 
-  PADDLE_ENFORCE_LT(idx, sparsity.size());
+  PADDLE_ENFORCE_LT(
+      idx, sparsity.size(),
+      platform::errors::OutOfRange(
+          "sparsity index out of bounds. idx=%d >= sparsity.size=%d", idx,
+          sparsity.size()));
   return sparsity[idx];
 }
@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
     // nranks
     auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
     const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
+    PADDLE_ENFORCE_GT(nranks, 1,
+                      platform::errors::PreconditionNotMet(
+                          "DGC is not useful when num_trainers <= 1. Please "
+                          "use multi card or multi machine GPU"));
 
     // regularization
     auto p = ctx.Input<framework::Tensor>("Param");
@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
         1 - get_period_sparcity(
                 sparsity, static_cast<float>(*current_step - rampup_begin_step),
                 rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0);
-    PADDLE_ENFORCE_LT(ratio, 1.0);
+    PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must >= 0"));
+    PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must < 1"));
     int k = static_cast<int>(g->numel() * ratio);
 
     VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
```
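`get_period_sparcity` indexes a warm-up schedule: during the first `rampup_steps` steps it walks through the `sparsity` vector, then clamps to the last entry. A standalone sketch of the same indexing logic (plain asserts stand in for Paddle's enforce machinery; the schedule values here are just an example):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Standalone rendering of the ramp-up indexing in get_period_sparcity.
float period_sparsity(const std::vector<float>& sparsity, float cur_step,
                      float rampup_steps) {
  assert(static_cast<int>(cur_step) >= 0);
  size_t idx = static_cast<size_t>(cur_step * sparsity.size() / rampup_steps);
  if (idx >= sparsity.size()) idx = sparsity.size() - 1;  // clamp after ramp-up
  return sparsity[idx];
}

int main() {
  const std::vector<float> schedule = {0.75f, 0.9375f, 0.984375f, 0.996f,
                                       0.999f};
  for (float step : {0.0f, 1.0f, 2.0f, 4.0f, 10.0f}) {
    std::printf("step %.0f -> sparsity %f\n", step,
                period_sparsity(schedule, step, 5.0f));
  }
  return 0;
}
```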
paddle/fluid/operators/nccl/nccl_op.cc

```diff
@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)),
-                            "Can not find variable '%s' in the scope.",
-                            kParallelScopes);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(Input(kParallelScopes)),
+        platform::errors::NotFound("Can not find variable '%s' in the scope.",
+                                   kParallelScopes));
     const auto &name = Output("Communicator");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                            "Can not find variable '%s' in the scope.", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(name),
+        platform::errors::NotFound(
+            "Output(%s) is needed for ncclInit operator.", name));
     // A parallel do may not use all the gpus. For example, the batch size is 7
     // in the last batch while we have 8 gpu. In this case, parallel_do will
     // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase {
     for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
       gpus[i] = i;
     }
-    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
-
-    if (scope.FindVar(name) == nullptr) {
-      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
-    }
+    PADDLE_ENFORCE_EQ(!gpus.empty(), true,
+                      platform::errors::PreconditionNotMet(
+                          "gpus is empty, NCCL must init with gpus"));
 
     platform::Communicator *comm =
         scope.FindVar(name)->GetMutable<platform::Communicator>();
@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of AllReduce op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
 
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of Reduce op input should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce");
 
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast");
 
     int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+    PADDLE_ENFORCE_EQ(
+        root != platform::kInvalidGPUId, true,
+        platform::errors::InvalidArgument("Bcast root must be set."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
```
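The recurring change in this file is mechanical: every `PADDLE_ENFORCE(cond, "msg")` becomes `PADDLE_ENFORCE_EQ(cond, true, platform::errors::<Type>("msg"))`, so each failure carries a machine-readable error category alongside the text. A minimal self-contained sketch of that idea (a toy model, not Paddle's actual implementation):

```cpp
#include <stdexcept>
#include <string>

// Toy model of a typed enforce: the category travels with the message,
// so callers can both read and programmatically classify the failure.
enum class ErrType { kInvalidArgument, kNotFound, kPreconditionNotMet };

struct EnforceError : std::runtime_error {
  ErrType type;
  EnforceError(ErrType t, const std::string& msg)
      : std::runtime_error(msg), type(t) {}
};

#define TOY_ENFORCE_EQ(a, b, err_type, msg)                \
  do {                                                     \
    if ((a) != (b)) throw EnforceError((err_type), (msg)); \
  } while (0)

// Example use mirroring the Bcast root check above:
void check_root(int root, int kInvalidGPUId) {
  TOY_ENFORCE_EQ(root != kInvalidGPUId, true,
                 ErrType::kInvalidArgument, "Bcast root must be set.");
}
```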
paddle/fluid/operators/nccl/nccl_op.cu.cc

```diff
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <functional>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,29 +38,35 @@ class NCCLTypeWrapper<double> {
   static const ncclDataType_t type = ncclDouble;
 };
 
+static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
+  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
+      {"ncclSum", ncclSum},
+      {"ncclMin", ncclMin},
+      {"ncclMax", ncclMax},
+      {"ncclProd", ncclProd},
+  };
+  auto it = str_to_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != str_to_type.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
+                        "ncclProd | ncclSum"));
+  return it->second;
+}
+
 template <typename T>
 class NCCLAllReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::PreconditionNotMet(
+                          "This kernel only runs on GPU device."));
     auto* x = ctx.Input<LoDTensor>("X");
     auto* out = ctx.Output<LoDTensor>("Out");
     auto* comm = ctx.Input<Communicator>("Communicator");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
 
     // device id
     int gpu_id =
         BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
@@ -67,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     VLOG(3) << "gpu : "
             << " invoke allreduce. send " << x->numel() << " recv "
             << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
         NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
@@ -81,26 +88,17 @@ template <typename T>
 class NCCLReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
     auto out = ctx.Output<LoDTensor>("Out");
     auto* comm = ctx.Input<Communicator>("Communicator");
     int root = ctx.Attr<int>("root");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
 
     // device id
     int gpu_id =
         BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
@@ -113,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
         reduction_op_, root, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
@@ -126,8 +124,9 @@ template <typename T>
 class NCCLBcastKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     int root = ctx.Attr<int>("root");
     auto* comm = ctx.Input<Communicator>("Communicator");
     // device id
@@ -137,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     if (idx == root) {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
           reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
           NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
           ctx.cuda_device_context().stream()));
@@ -146,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* out = ctx.Output<LoDTensor>("Out");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
              << framework::product(out->dims());
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
          ctx.cuda_device_context().stream()));
```
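Replacing the four-way if/else chain with a static lookup table removes the redundant `ncclSum` branch and makes the set of accepted strings visible in one place. The same pattern, stripped of NCCL and Paddle types so it compiles standalone (the enum and names here are illustrative stand-ins):

```cpp
#include <stdexcept>
#include <string>
#include <unordered_map>

// Stand-in for ncclRedOp_t so the sketch builds without NCCL headers.
enum RedOp { kSum, kMin, kMax, kProd };

RedOp str_to_red_type(const std::string& reduction) {
  static const std::unordered_map<std::string, RedOp> str_to_type = {
      {"ncclSum", kSum}, {"ncclMin", kMin},
      {"ncclMax", kMax}, {"ncclProd", kProd},
  };
  auto it = str_to_type.find(reduction);
  if (it == str_to_type.end()) {
    throw std::invalid_argument(
        "Invalid nccl reduction. Must be ncclMin | ncclMax | ncclProd | "
        "ncclSum");
  }
  return it->second;
}
```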
paddle/fluid/operators/nccl/nccl_op_test.cu.cc

```diff
@@ -45,10 +45,9 @@ class NCCLTester : public ::testing::Test {
  public:
   void SetUp() override {
     int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING)
-          << "Cannot test gpu nccl, because the CUDA device count is "
-          << count;
+    if (count <= 0) {
+      LOG(WARNING)
+          << "Cannot test multi-gpu nccl, because the CUDA device count is "
+          << count;
       exit(0);
     }
     for (int i = 0; i < count; ++i) {
@@ -114,8 +113,9 @@ class NCCLTester : public ::testing::Test {
     lk.unlock();
 
-    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
-                   "Tensor numel not match!");
+    PADDLE_ENFORCE_EQ(
+        send_tensor->numel(), f::product(kDims),
+        paddle::platform::errors::InvalidArgument("Tensor numel not match!"));
 
     auto op = f::OpRegistry::CreateOp(*op1);
@@ -126,6 +126,10 @@ class NCCLTester : public ::testing::Test {
     VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
 
+  void testNcclReduceOp();
+  void testNcclAllReduceOp();
+  void testNcclBcastOp();
+
  public:
   std::vector<p::DeviceContext*> dev_ctxs_;
   f::Scope g_scope_;
@@ -133,13 +137,7 @@ class NCCLTester : public ::testing::Test {
   std::vector<int> gpu_list_;
 };
 
-// ncclInitOp with desc
-TEST_F(NCCLTester, ncclInitOp) {}
-
-// ncclAllReduceOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
-/*
-TEST_F(NCCLTester, ncclAllReduceOp) {
+void NCCLTester::testNcclAllReduceOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   op2->SetType("ncclAllReduce");
   op2->SetInput("X", {"st"});
@@ -186,10 +184,8 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     }
   }
 }
-*/
 
-// ncclReduceOp with desc
-TEST_F(NCCLTester, ncclReduceOp) {
+void NCCLTester::testNcclReduceOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
   op2->SetType("ncclReduce");
@@ -236,10 +232,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
   }
 }
 
-// ncclBcastOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
-/*
-TEST_F(NCCLTester, ncclBcastOp) {
+void NCCLTester::testNcclBcastOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
   op2->SetType("ncclBcast");
@@ -263,13 +256,17 @@ TEST_F(NCCLTester, ncclBcastOp) {
     ths[i].join();
   }
 
-  const int idx = 1;
+  const int idx = gpu_list_.size() - 1;
   float result = GetGPUData(kRoot);
 
   p::CPUPlace cpu_place;
   p::CUDAPlace gpu_place(gpu_list_[idx]);
 
-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  std::string rt_str = "rt";
+  if (idx == kRoot) {
+    rt_str = "st";
+  }
+  auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
   auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
   result_tensor->Resize(kDims);
@@ -284,4 +281,20 @@ TEST_F(NCCLTester, ncclBcastOp) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
-*/
+
+// ncclInitOp with desc
+TEST_F(NCCLTester, ncclInitOp) {}
+
+TEST_F(NCCLTester, ncclOp) {
+  // Serial execution is required for the same nccl comm.
+
+  // ncclAllReduceOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+  testNcclReduceOp();
+
+  testNcclAllReduceOp();
+
+  // ncclBcastOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+  testNcclBcastOp();
+}
```
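Folding the previously commented-out cases into a single `TEST_F` pins their execution order, since all three reuse the same NCCL communicator. A minimal sketch of the pattern (a hypothetical fixture, not the Paddle test itself):

```cpp
#include "gtest/gtest.h"

// Hypothetical fixture illustrating the pattern above: sub-cases that share
// expensive state become plain methods, and one TEST_F calls them in a fixed
// order so they never interleave.
class CommTest : public ::testing::Test {
 protected:
  void testReduce() { /* would use shared_comm_ */ }
  void testAllReduce() { /* would use shared_comm_ */ }
  void testBcast() { /* would use shared_comm_ */ }
  int shared_comm_ = 0;  // stand-in for the shared NCCL communicator
};

TEST_F(CommTest, RunsSerially) {
  // Serial execution is explicit; order is under the test's control.
  testReduce();
  testAllReduce();
  testBcast();
}
```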
paddle/fluid/operators/size_op.cc

```diff
@@ -23,10 +23,9 @@ class SizeOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input (Input) of Size op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output (Out) of Size op should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size");
     ctx->SetOutputDim("Out", {1});
   }
 };
```