Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4e165f4e
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4e165f4e
编写于
10月 25, 2017
作者:
D
Dong Zhihong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"fix create output variable bug"
上级
61c1b046
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
214 addition
and
197 deletion
+214
-197
paddle/operators/nccl_op.cc
paddle/operators/nccl_op.cc
+3
-0
paddle/operators/nccl_op.cu
paddle/operators/nccl_op.cu
+24
-20
paddle/operators/nccl_op_test.cu
paddle/operators/nccl_op_test.cu
+187
-177
未找到文件。
paddle/operators/nccl_op.cc
浏览文件 @
4e165f4e
...
...
@@ -114,6 +114,9 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
" Output(Out) of Bcast op output should not be NULL"
);
int
root
=
ctx
->
Attrs
().
Get
<
int
>
(
"root"
);
PADDLE_ENFORCE
(
root
!=
-
1
,
"Bcast root must be set."
);
auto
x_dims
=
ctx
->
GetInputsDim
(
"X"
);
ctx
->
SetOutputsDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
...
...
paddle/operators/nccl_op.cu
浏览文件 @
4e165f4e
...
...
@@ -54,12 +54,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
ctx
.
device_context
())
.
stream
();
// device id
int
device_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
device_id
);
int
gpu_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
VLOG
(
1
)
<<
" invoke allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
VLOG
(
1
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
...
...
@@ -68,7 +68,8 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
comm
->
comms_
[
idx
],
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
" finished allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
VLOG
(
1
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
}
}
...
...
@@ -91,9 +92,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
ctx
.
device_context
())
.
stream
();
// device id
int
device_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
device_id
);
int
gpu_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
auto
ins_names
=
ctx
.
Inputs
(
"X"
);
std
::
hash
<
std
::
string
>
hasher
;
...
...
@@ -102,20 +102,20 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
root
=
hasher
(
ins_names
[
i
])
%
comm
->
comms_
.
size
();
}
T
*
recvbuffer
=
nullptr
;
if
(
root
==
device
_id
)
{
if
(
root
==
gpu
_id
)
{
recvbuffer
=
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
VLOG
(
1
)
<<
"
invoke reduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv
"
<<
outs
[
i
]
->
numel
();
VLOG
(
1
)
<<
"
gpu : "
<<
gpu_id
<<
" invoke reduce. send
"
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
ins
[
i
]
->
data
<
T
>
(),
recvbuffer
,
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
ncclSum
,
root
,
comm
->
comms_
[
idx
],
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
"
finished reduce. send "
<<
ins
[
i
]
->
numel
()
<<
" recv
"
<<
outs
[
i
]
->
numel
();
VLOG
(
1
)
<<
"
gpu : "
<<
gpu_id
<<
" finished reduce. send
"
<<
ins
[
i
]
->
numel
()
<<
" recv "
<<
outs
[
i
]
->
numel
();
}
}
};
...
...
@@ -135,33 +135,37 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
ctx
.
device_context
())
.
stream
();
// device id
int
device_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
device_id
);
int
gpu_id
=
boost
::
get
<
platform
::
GPUPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
if
(
idx
==
root
)
{
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
VLOG
(
1
)
<<
" invoke Bcast. send "
<<
ins
[
i
]
->
numel
();
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
ins
[
i
]
->
numel
();
VLOG
(
1
)
<<
" before ncclBcast"
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms_
[
idx
],
stream
));
VLOG
(
1
)
<<
" after ncclBcast"
;
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
" finished Bcast."
;
VLOG
(
1
)
<<
"
gpu : "
<<
gpu_id
<<
"
finished Bcast."
;
}
}
else
{
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
VLOG
(
1
)
<<
" invoke Bcast. recv. "
;
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
outs
[
i
]
->
dims
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms_
[
idx
],
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
" finished Bcast. recv "
<<
outs
[
i
]
->
numel
();
VLOG
(
1
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
outs
[
i
]
->
numel
();
}
}
}
...
...
paddle/operators/nccl_op_test.cu
浏览文件 @
4e165f4e
...
...
@@ -87,30 +87,34 @@ class NCCLTester : public ::testing::Test {
void
PerThreadProgram
(
int
gpu_id
,
const
f
::
OpDescBind
&
op_desc
,
f
::
Scope
*
scope
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
mu
);
f
::
ProgramDescBind
program
;
f
::
BlockDescBind
*
block
=
program
.
Block
(
0
);
f
::
OpDescBind
*
op1
=
block
->
AppendOp
();
*
op1
=
op_desc
;
const
f
::
OpDescBind
*
op1
=
&
op_desc
;
p
::
GPUPlace
place
(
gpu_id
);
auto
&
ctx
=
dev_ctxs
.
at
(
gpu_id
);
auto
*
send_tensor
=
scope
->
Var
(
"st"
)
->
GetMutable
<
f
::
LoDTensor
>
();
auto
*
recv_tensor
=
scope
->
Var
(
"rt"
)
->
GetMutable
<
f
::
LoDTensor
>
();
send_tensor
->
Resize
(
kDims
);
send_tensor
->
mutable_data
<
T
>
(
kDims
,
place
);
std
::
vector
<
T
>
send_vector
(
f
::
product
(
kDims
),
gpu_id
);
send_tensor
->
CopyFromVector
<
T
>
(
send_vector
,
*
ctx
);
if
(
!
send_tensor
->
numel
())
{
send_tensor
->
Resize
(
kDims
);
send_tensor
->
mutable_data
<
T
>
(
kDims
,
place
);
std
::
vector
<
T
>
send_vector
(
f
::
product
(
kDims
),
gpu_id
);
send_tensor
->
CopyFromVector
<
T
>
(
send_vector
,
*
ctx
);
ctx
->
Wait
();
VLOG
(
1
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
}
lk
.
unlock
();
PADDLE_ENFORCE
(
send_tensor
->
numel
()
==
f
::
product
(
kDims
),
"Tensor numel not match!"
);
ctx
->
Wait
();
VLOG
(
1
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" invoke "
<<
op_desc
.
Type
();
VLOG
(
1
)
<<
" send_tensor : "
<<
send_tensor
->
numel
()
<<
" recv_tensor : "
<<
recv_tensor
->
numel
();
op
->
Run
(
*
scope
,
*
ctx
);
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" finished "
<<
op_desc
.
Type
();
}
...
...
@@ -122,168 +126,171 @@ class NCCLTester : public ::testing::Test {
std
::
mutex
mu
;
};
// ncclInitOp with desc
TEST
(
NCCL
,
ncclInitOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op_desc
(
new
f
::
OpDescBind
);
op_desc
->
SetType
(
"ncclInit"
);
op_desc
->
SetOutput
(
"Communicator"
,
{
"x1"
});
op_desc
->
SetAttr
(
"gpus"
,
{
gpu_list
});
f
::
Scope
g_scope
;
std
::
unique_ptr
<
p
::
DeviceContext
>
ctx
(
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
()));
auto
*
var
=
g_scope
.
Var
(
"x1"
);
var
->
GetMutable
<
p
::
Communicator
>
();
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op_desc
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
ctx
.
get
());
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
}
// ncclAllReduceOp with desc
TEST_F
(
NCCLTester
,
ncclAllReduceOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclAllReduce"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op2
.
get
(),
dev_scopes
[
i
]);
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
// check results
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
for
(
size_t
i
=
0
;
i
<
dev_scopes
.
size
();
++
i
)
{
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
i
]);
auto
&
recv_tensor
=
dev_scopes
[
i
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
i
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
i
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
i
])
->
stream
());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
}
// ncclAReduceOp with desc
TEST_F
(
NCCLTester
,
ncclReduceOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
const
int
kRoot
=
0
;
op2
->
SetType
(
"ncclReduce"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op2
.
get
(),
dev_scopes
[
i
]);
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
// check results on
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
kRoot
]);
auto
&
recv_tensor
=
dev_scopes
[
kRoot
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
kRoot
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
kRoot
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
kRoot
])
->
stream
());
for
(
int
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
// // ncclBcastOp with desc
TEST_F
(
NCCLTester
,
ncclBcastOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
const
int
kRoot
=
5
;
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op2
.
get
(),
dev_scopes
[
i
]);
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
const
int
idx
=
1
;
// check results on
float
result
=
kRoot
;
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
idx
]);
auto
&
recv_tensor
=
dev_scopes
[
idx
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
auto
*
result_tensor
=
dev_scopes
[
idx
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
result_tensor
->
Resize
(
kDims
);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
paddle
::
memory
::
Copy
(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
idx
]),
rt
,
recv_tensor
.
numel
()
*
sizeof
(
float
),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
idx
])
->
stream
());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
}
}
// // ncclInitOp with desc
// TEST(NCCL, ncclInitOp) {
// std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
// op_desc->SetType("ncclInit");
// op_desc->SetOutput("Communicator", {"x1"});
// op_desc->SetAttr("gpus", {gpu_list});
// f::Scope g_scope;
// std::unique_ptr<p::DeviceContext> ctx(new
// p::CPUDeviceContext(p::CPUPlace()));
// auto *var = g_scope.Var("x1");
// var->GetMutable<p::Communicator>();
// auto op = f::OpRegistry::CreateOp(*op_desc);
// VLOG(1) << "invoke NCCLInitOp.";
// op->Run(g_scope, *ctx.get());
// VLOG(1) << "NCCLInitOp finished.";
// }
// // ncclAllReduceOp with desc
// TEST_F(NCCLTester, ncclAllReduceOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// op2->SetType("ncclAllReduce");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// std::vector<f::Scope *> dev_scopes;
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// dev_scopes.emplace_back(&g_scope.NewScope());
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(), dev_scopes[i]);
// ths.emplace_back(std::move(th));
// }
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// // check results
// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
// for (size_t i = 0; i < dev_scopes.size(); ++i) {
// p::CPUPlace cpu_place;
// p::GPUPlace gpu_place(gpu_list[i]);
// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
// auto *rt = recv_tensor.data<float>();
// auto *result_tensor =
// dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
// result_tensor->Resize(kDims);
// auto *ct = result_tensor->mutable_data<float>(cpu_place);
// paddle::memory::Copy(
// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
// recv_tensor.numel() * sizeof(float),
// static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
// for (size_t j = 0; j < f::product(kDims); ++j) {
// ASSERT_NEAR(ct[j], result, 1e-5);
// }
// }
// }
// // ncclAReduceOp with desc
// TEST_F(NCCLTester, ncclReduceOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// const int kRoot = 0;
// op2->SetType("ncclReduce");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// op2->SetAttr("root", {kRoot});
// std::vector<f::Scope *> dev_scopes;
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// dev_scopes.emplace_back(&g_scope.NewScope());
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(), dev_scopes[i]);
// ths.emplace_back(std::move(th));
// }
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// // check results on
// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
// p::CPUPlace cpu_place;
// p::GPUPlace gpu_place(gpu_list[kRoot]);
// auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
// auto *rt = recv_tensor.data<float>();
// auto *result_tensor =
// dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
// result_tensor->Resize(kDims);
// auto *ct = result_tensor->mutable_data<float>(cpu_place);
// paddle::memory::Copy(
// cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt,
// recv_tensor.numel() * sizeof(float),
// static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
// for (int j = 0; j < f::product(kDims); ++j) {
// ASSERT_NEAR(ct[j], result, 1e-5);
// }
// }
// // // ncclBcastOp with desc
// TEST_F(NCCLTester, ncclBcastOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// const int kRoot = 5;
// op2->SetType("ncclBcast");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// op2->SetAttr("root", {kRoot});
// std::vector<f::Scope *> dev_scopes;
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// dev_scopes.emplace_back(&g_scope.NewScope());
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(), dev_scopes[i]);
// ths.emplace_back(std::move(th));
// }
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// const int idx = 1;
// // check results on
// float result = kRoot;
// p::CPUPlace cpu_place;
// p::GPUPlace gpu_place(gpu_list[idx]);
// auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
// auto *rt = recv_tensor.data<float>();
// auto *result_tensor =
// dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
// result_tensor->Resize(kDims);
// auto *ct = result_tensor->mutable_data<float>(cpu_place);
// paddle::memory::Copy(
// cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt,
// recv_tensor.numel() * sizeof(float),
// static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
// for (size_t j = 0; j < f::product(kDims); ++j) {
// ASSERT_NEAR(ct[j], result, 1e-5);
// }
// }
// joint ncclBcastOp and ncclReduceOp
TEST_F
(
NCCLTester
,
MultipleOp
)
{
...
...
@@ -299,14 +306,17 @@ TEST_F(NCCLTester, MultipleOp) {
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"rt"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"
r
t"
});
op2
->
SetOutput
(
"Out"
,
{
"
ou
t"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
// for (size_t i = 0; i < dev_scopes.size(); ++i) {
// dev_scopes[i]->Var("out")->GetMutable<f::LoDTensor>();
// }
std
::
vector
<
std
::
thread
>
ths
;
// run
Bcast
// run
Reduce
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
...
...
@@ -320,9 +330,9 @@ TEST_F(NCCLTester, MultipleOp) {
ths
.
clear
();
// run
Reduce
// run
Bcast
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
()
);
dev_scopes
[
i
]
->
Var
(
"out"
)
->
GetMutable
<
f
::
LoDTensor
>
(
);
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
*
op2
.
get
(),
dev_scopes
[
i
]);
ths
.
emplace_back
(
std
::
move
(
th
));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录