Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
38d3adfe
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
38d3adfe
编写于
10月 25, 2017
作者:
D
Dong Zhihong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"add multioperator testcase"
上级
94992a99
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
180 addition
and
121 deletion
+180
-121
paddle/operators/nccl_op.cc
paddle/operators/nccl_op.cc
+25
-46
paddle/operators/nccl_op.cu
paddle/operators/nccl_op.cu
+8
-5
paddle/operators/nccl_op_test.cu
paddle/operators/nccl_op_test.cu
+147
-70
未找到文件。
paddle/operators/nccl_op.cc
浏览文件 @
38d3adfe
...
...
@@ -100,8 +100,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
}
};
// Bcast
Send
Op
class
NCCLBcast
Send
Op
:
public
framework
::
OperatorWithKernel
{
// BcastOp
class
NCCLBcastOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -111,20 +111,12 @@ class NCCLBcastSendOp : public framework::OperatorWithKernel {
" Input(X) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Communicator"
),
" Input(Communicator) of Bcast op input should not be NULL"
);
}
};
// BcastRecvOp
class
NCCLBcastRecvOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Communicator"
),
" Input(Communicator) of Bcast op input should not be NULL"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
" Output(Out) of Bcast op output should not be NULL"
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
ctx
->
SetOutputsDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
...
...
@@ -146,52 +138,41 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
// BcastSend should be in the root
// BcastSendOp
class
NCCLBcastSendOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
// ReduceOp
class
NCCLReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
NCCL
BcastSend
OpMaker
(
framework
::
OpProto
*
proto
,
framework
::
OpAttrChecker
*
op_checker
)
NCCL
Reduce
OpMaker
(
framework
::
OpProto
*
proto
,
framework
::
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"The input of
BcastSend
op"
);
AddInput
(
"X"
,
"The input of
Reduce
op"
);
AddInput
(
"Communicator"
,
"Communicator for communicating between gpus"
);
AddAttr
<
int
>
(
"root"
,
"root gpu of Bcast"
);
AddOutput
(
"Out"
,
"The output of Reduce op"
);
AddAttr
<
int
>
(
"root"
,
"root gpu of the parameter. if not set(-1). hashed by name."
)
.
SetDefault
(
-
1
);
AddComment
(
R"DOC(
Bcast the tensors.
)DOC"
);
Reduce the tensors)DOC"
);
}
};
// BcastOp
class
NCCLBcast
Recv
OpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
NCCLBcastOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
NCCLBcast
Recv
OpMaker
(
framework
::
OpProto
*
proto
,
framework
::
OpAttrChecker
*
op_checker
)
NCCLBcastOpMaker
(
framework
::
OpProto
*
proto
,
framework
::
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"The input of BcastSend op"
);
AddInput
(
"Communicator"
,
"Communicator for communicating between gpus"
);
AddAttr
<
int
>
(
"root"
,
"root gpu of BcastRecv"
);
AddOutput
(
"Out"
,
"The output of Bcast"
);
AddAttr
<
int
>
(
"root"
,
"root gpu of the parameter. if not set(-1). hashed by name."
)
.
SetDefault
(
-
1
);
AddComment
(
R"DOC(
Bcast the tensors.
)DOC"
);
}
};
// BcastRecvOp
class
NCCLReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
NCCLReduceOpMaker
(
framework
::
OpProto
*
proto
,
framework
::
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"The input of Reduce op"
);
AddInput
(
"Communicator"
,
"Communicator for communicating between gpus"
);
AddOutput
(
"Out"
,
"The output of Reduce op"
);
AddComment
(
R"DOC(
Reduce the tensors.
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
...
...
@@ -201,9 +182,7 @@ REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp,
REGISTER_OP_WITHOUT_GRADIENT
(
ncclAllReduce
,
ops
::
NCCLAllReduceOp
,
ops
::
NCCLAllReduceOpMaker
);
REGISTER_OP_WITHOUT_GRADIENT
(
ncclBcastSend
,
ops
::
NCCLBcastSendOp
,
ops
::
NCCLBcastSendOpMaker
);
REGISTER_OP_WITHOUT_GRADIENT
(
ncclBcastRecv
,
ops
::
NCCLBcastRecvOp
,
ops
::
NCCLBcastRecvOpMaker
);
REGISTER_OP_WITHOUT_GRADIENT
(
ncclBcast
,
ops
::
NCCLBcastOp
,
ops
::
NCCLBcastOpMaker
);
REGISTER_OP_WITHOUT_GRADIENT
(
ncclReduce
,
ops
::
NCCLReduceOp
,
ops
::
NCCLReduceOpMaker
);
paddle/operators/nccl_op.cu
浏览文件 @
38d3adfe
...
...
@@ -83,6 +83,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
// x0, x1, x2
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
int
root
=
ctx
.
Attr
<
int
>
(
"root"
);
auto
*
comm
=
ctx
.
Input
<
Communicator
>
(
"Communicator"
);
...
...
@@ -97,7 +98,9 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
auto
ins_names
=
ctx
.
Inputs
(
"X"
);
std
::
hash
<
std
::
string
>
hasher
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
int
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms_
.
size
();
if
(
root
==
-
1
)
{
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms_
.
size
();
}
T
*
recvbuffer
=
nullptr
;
if
(
root
==
device_id
)
{
recvbuffer
=
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -135,8 +138,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
int
device_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
device_id
);
if
(
idx
==
root
)
{
auto
ins
=
ctx
.
MultiInput
<
Tensor
>
(
"X"
);
auto
ins
=
ctx
.
MultiInput
<
LoD
Tensor
>
(
"X"
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
...
...
@@ -144,7 +148,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
}
}
else
{
auto
outs
=
ctx
.
MultiOutput
<
Tensor
>
(
"Out"
);
auto
outs
=
ctx
.
MultiOutput
<
LoD
Tensor
>
(
"Out"
);
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
...
...
@@ -160,6 +164,5 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_GPU_KERNEL
(
ncclAllReduce
,
ops
::
NCCLAllReduceKernel
<
float
>
);
REGISTER_OP_GPU_KERNEL
(
ncclBcast
Send
,
ops
::
NCCLBcastKernel
<
float
>
);
REGISTER_OP_GPU_KERNEL
(
ncclBcast
,
ops
::
NCCLBcastKernel
<
float
>
);
REGISTER_OP_GPU_KERNEL
(
ncclReduce
,
ops
::
NCCLReduceKernel
<
float
>
);
REGISTER_OP_GPU_KERNEL
(
ncclBcastRecv
,
ops
::
NCCLBcastKernel
<
float
>
);
paddle/operators/nccl_op_test.cu
浏览文件 @
38d3adfe
...
...
@@ -28,6 +28,7 @@
#include "paddle/framework/op_registry.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/var_desc.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/nccl/nccl_gpu_common.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
...
...
@@ -37,8 +38,7 @@
USE_NO_KERNEL_OP
(
ncclInit
);
USE_GPU_ONLY_OP
(
ncclAllReduce
);
USE_GPU_ONLY_OP
(
ncclReduce
);
USE_GPU_ONLY_OP
(
ncclBcastSend
);
USE_GPU_ONLY_OP
(
ncclBcastRecv
);
USE_GPU_ONLY_OP
(
ncclBcast
);
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
...
...
@@ -144,12 +144,62 @@ class NCCLTester : public ::testing::Test {
// }
// ncclAllReduceOp with desc
TEST_F
(
NCCLTester
,
ncclAllReduceOp
)
{
// TEST_F(NCCLTester, ncclAllReduceOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// op2->SetType("ncclAllReduce");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// std::vector<f::Scope *> dev_scopes;
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// dev_scopes.emplace_back(&g_scope.NewScope());
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(), dev_scopes[i]);
// ths.emplace_back(std::move(th));
// }
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// // check results
// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
// for (size_t i = 0; i < dev_scopes.size(); ++i) {
// p::CPUPlace cpu_place;
// p::GPUPlace gpu_place(gpu_list[i]);
// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
// auto *rt = recv_tensor.data<float>();
// auto *result_tensor =
// dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
// result_tensor->Resize(kDims);
// auto *ct = result_tensor->mutable_data<float>(cpu_place);
// paddle::memory::Copy(
// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
// recv_tensor.numel() * sizeof(float),
// static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
// for (size_t j = 0; j < f::product(kDims); ++j) {
// ASSERT_NEAR(ct[j], result, 1e-5);
// }
// }
// }
// ncclAReduceOp with desc
TEST_F
(
NCCLTester
,
ncclReduceOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclAllReduce"
);
const
int
kRoot
=
0
;
op2
->
SetType
(
"ncclReduce"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
...
...
@@ -166,39 +216,43 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
ths
[
i
].
join
();
}
// check results
float
result
=
0
;
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
result
);
for
(
size_t
i
=
0
;
i
<
dev_scopes
.
size
();
++
i
)
{
auto
&
recv_tensor
=
dev_scopes
[
i
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
// check results on
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
p
::
CPUPlace
cpu_place
;
auto
*
result_tensor
=
dev_scopes
[
i
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
kRoot
]);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
i
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
i
])
->
stream
());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
auto
&
recv_tensor
=
dev_scopes
[
kRoot
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
kRoot
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
kRoot
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
kRoot
])
->
stream
());
for
(
int
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
//
ncclReduce
Op with desc
TEST
(
NCCL
,
ncclReduce
Op
)
{
//
// ncclBcast
Op with desc
TEST
_F
(
NCCLTester
,
ncclBcast
Op
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclReduce"
);
const
int
kRoot
=
0
;
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
...
...
@@ -210,76 +264,99 @@ TEST(NCCL, ncclReduceOp) {
ths
[
i
].
join
();
}
// check results
float
result
=
0
;
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
result
);
for
(
size_t
i
=
0
;
i
<
dev_scopes
.
size
();
++
i
)
{
auto
&
recv_tensor
=
dev_scopes
[
i
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
const
int
idx
=
1
;
// check results on
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
p
::
CPUPlace
cpu_place
;
auto
*
result_tensor
=
dev_scopes
[
i
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
idx
]);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
i
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
i
])
->
stream
());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
auto
&
recv_tensor
=
dev_scopes
[
idx
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
idx
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
idx
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
idx
])
->
stream
());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
// ncclBcastOp with desc
TEST
(
NCCL
,
ncclBcastOp
)
{
// joint ncclBcastOp and ncclReduceOp
TEST_F
(
NCCLTester
,
MultipleOp
)
{
const
int
kRoot
=
0
;
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
op1
->
SetType
(
"nccl
BcastSend
"
);
op1
->
SetInput
(
"X"
,
{
"
s
t"
});
op1
->
SetType
(
"nccl
Reduce
"
);
op1
->
SetInput
(
"X"
,
{
"
r
t"
});
op1
->
SetInput
(
"Communicator"
,
{
"comm"
});
op1
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclBcastRecv"
);
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
1
;
i
<
gpu_list
.
size
();
++
i
)
{
// run Bcast
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op
2
.
get
(),
&
g_scope
.
NewScope
()
);
*
op
1
.
get
(),
dev_scopes
[
i
]
);
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
}
// joint ncclBcastOp and ncclReduceOp
// TEST(NCCL, MultipleOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// op2->SetType("ncclBcastSend");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
ths
.
clear
();
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// op2->SetType("ncclBcastRecv");
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// run Reduce
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op2
.
get
(),
dev_scopes
[
i
]);
ths
.
emplace_back
(
std
::
move
(
th
));
}
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(),
// &g_scope.NewScope());
// ths.emplace_back(std::move(th));
// }
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// }
// check results
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
for
(
size_t
i
=
0
;
i
<
dev_scopes
.
size
();
++
i
)
{
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
i
]);
auto
&
recv_tensor
=
dev_scopes
[
i
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
i
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
i
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
i
])
->
stream
());
for
(
int
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
}
int
main
(
int
argc
,
char
**
argv
)
{
const
int
dev_count
=
p
::
GetCUDADeviceCount
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录