Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
61c1b046
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
61c1b046
编写于
10月 25, 2017
作者:
D
Dong Zhihong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"fix multigpu testcase"
上级
38d3adfe
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
72 addition
and
66 deletion
+72
-66
paddle/operators/nccl_op.cu
paddle/operators/nccl_op.cu
+8
-0
paddle/operators/nccl_op_test.cu
paddle/operators/nccl_op_test.cu
+64
-66
未找到文件。
paddle/operators/nccl_op.cu
浏览文件 @
61c1b046
...
@@ -142,18 +142,26 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
...
@@ -142,18 +142,26 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
if
(
idx
==
root
)
{
if
(
idx
==
root
)
{
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
auto
ins
=
ctx
.
MultiInput
<
LoDTensor
>
(
"X"
);
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
VLOG
(
1
)
<<
" invoke Bcast. send "
<<
ins
[
i
]
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
(
void
*
)
ins
[
i
]
->
data
<
T
>
(),
ins
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms_
[
idx
],
stream
));
root
,
comm
->
comms_
[
idx
],
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
" finished Bcast."
;
}
}
}
else
{
}
else
{
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
auto
outs
=
ctx
.
MultiOutput
<
LoDTensor
>
(
"Out"
);
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
VLOG
(
1
)
<<
" invoke Bcast. recv. "
;
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
outs
[
i
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
outs
[
i
]
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms_
[
idx
],
stream
));
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms_
[
idx
],
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
VLOG
(
1
)
<<
" finished Bcast. recv "
<<
outs
[
i
]
->
numel
();
}
}
}
}
}
}
...
...
paddle/operators/nccl_op_test.cu
浏览文件 @
61c1b046
...
@@ -123,73 +123,71 @@ class NCCLTester : public ::testing::Test {
...
@@ -123,73 +123,71 @@ class NCCLTester : public ::testing::Test {
};
};
// ncclInitOp with desc
// ncclInitOp with desc
//
TEST(NCCL, ncclInitOp) {
TEST
(
NCCL
,
ncclInitOp
)
{
//
std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
std
::
unique_ptr
<
f
::
OpDescBind
>
op_desc
(
new
f
::
OpDescBind
);
//
op_desc->SetType("ncclInit");
op_desc
->
SetType
(
"ncclInit"
);
//
op_desc->SetOutput("Communicator", {"x1"});
op_desc
->
SetOutput
(
"Communicator"
,
{
"x1"
});
//
op_desc->SetAttr("gpus", {gpu_list});
op_desc
->
SetAttr
(
"gpus"
,
{
gpu_list
});
// f::Scope g_scope;
f
::
Scope
g_scope
;
// std::unique_ptr<p::DeviceContext> ctx(new
std
::
unique_ptr
<
p
::
DeviceContext
>
ctx
(
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
()));
// p::CPUDeviceContext(p::CPUPlace()));
//
auto *var = g_scope.Var("x1");
auto
*
var
=
g_scope
.
Var
(
"x1"
);
//
var->GetMutable<p::Communicator>();
var
->
GetMutable
<
p
::
Communicator
>
();
//
auto op = f::OpRegistry::CreateOp(*op_desc);
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op_desc
);
//
VLOG(1) << "invoke NCCLInitOp.";
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
//
op->Run(g_scope, *ctx.get());
op
->
Run
(
g_scope
,
*
ctx
.
get
());
//
VLOG(1) << "NCCLInitOp finished.";
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
//
}
}
// ncclAllReduceOp with desc
// ncclAllReduceOp with desc
// TEST_F(NCCLTester, ncclAllReduceOp) {
TEST_F
(
NCCLTester
,
ncclAllReduceOp
)
{
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
// op2->SetType("ncclAllReduce");
op2
->
SetType
(
"ncclAllReduce"
);
// op2->SetInput("X", {"st"});
op2
->
SetInput
(
"X"
,
{
"st"
});
// op2->SetInput("Communicator", {"comm"});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
// op2->SetOutput("Out", {"rt"});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
// std::vector<f::Scope *> dev_scopes;
std
::
vector
<
f
::
Scope
*>
dev_scopes
;
// std::vector<std::thread> ths;
std
::
vector
<
std
::
thread
>
ths
;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
// dev_scopes.emplace_back(&g_scope.NewScope());
dev_scopes
.
emplace_back
(
&
g_scope
.
NewScope
());
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
],
// *op2.get(), dev_scopes[i]);
*
op2
.
get
(),
dev_scopes
[
i
]);
// ths.emplace_back(std::move(th));
ths
.
emplace_back
(
std
::
move
(
th
));
// }
}
// for (size_t i = 0; i < gpu_list.size(); ++i) {
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
// ths[i].join();
ths
[
i
].
join
();
// }
}
// // check results
// check results
// float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
);
// for (size_t i = 0; i < dev_scopes.size(); ++i) {
for
(
size_t
i
=
0
;
i
<
dev_scopes
.
size
();
++
i
)
{
// p::CPUPlace cpu_place;
p
::
CPUPlace
cpu_place
;
// p::GPUPlace gpu_place(gpu_list[i]);
p
::
GPUPlace
gpu_place
(
gpu_list
[
i
]);
// auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto
&
recv_tensor
=
dev_scopes
[
i
]
->
FindVar
(
"rt"
)
->
Get
<
f
::
LoDTensor
>
();
// auto *rt = recv_tensor.data<float>();
auto
*
rt
=
recv_tensor
.
data
<
float
>
();
// auto *result_tensor =
auto
*
result_tensor
=
dev_scopes
[
i
]
->
Var
(
"ct"
)
->
GetMutable
<
f
::
LoDTensor
>
();
// dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
result_tensor
->
Resize
(
kDims
);
// result_tensor->Resize(kDims);
auto
*
ct
=
result_tensor
->
mutable_data
<
float
>
(
cpu_place
);
// auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle
::
memory
::
Copy
(
// paddle::memory::Copy(
cpu_place
,
ct
,
p
::
GPUPlace
(
gpu_list
[
i
]),
rt
,
// cpu_place, ct, p::GPUPlace(gpu_list[i]), rt,
recv_tensor
.
numel
()
*
sizeof
(
float
),
// recv_tensor.numel() * sizeof(float),
static_cast
<
p
::
CUDADeviceContext
*>
(
dev_ctxs
[
i
])
->
stream
());
// static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
for
(
size_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
// for (size_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR
(
ct
[
j
],
result
,
1e-5
);
// ASSERT_NEAR(ct[j], result, 1e-5);
}
// }
}
// }
}
// }
// ncclAReduceOp with desc
// ncclAReduceOp with desc
TEST_F
(
NCCLTester
,
ncclReduceOp
)
{
TEST_F
(
NCCLTester
,
ncclReduceOp
)
{
...
@@ -242,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
...
@@ -242,7 +240,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
// // ncclBcastOp with desc
// // ncclBcastOp with desc
TEST_F
(
NCCLTester
,
ncclBcastOp
)
{
TEST_F
(
NCCLTester
,
ncclBcastOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
const
int
kRoot
=
0
;
const
int
kRoot
=
5
;
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
...
@@ -266,7 +264,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
...
@@ -266,7 +264,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
const
int
idx
=
1
;
const
int
idx
=
1
;
// check results on
// check results on
float
result
=
std
::
accumulate
(
gpu_list
.
begin
(),
gpu_list
.
end
(),
0
)
;
float
result
=
kRoot
;
p
::
CPUPlace
cpu_place
;
p
::
CPUPlace
cpu_place
;
p
::
GPUPlace
gpu_place
(
gpu_list
[
idx
]);
p
::
GPUPlace
gpu_place
(
gpu_list
[
idx
]);
...
@@ -292,14 +290,14 @@ TEST_F(NCCLTester, MultipleOp) {
...
@@ -292,14 +290,14 @@ TEST_F(NCCLTester, MultipleOp) {
const
int
kRoot
=
0
;
const
int
kRoot
=
0
;
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
op1
->
SetType
(
"ncclReduce"
);
op1
->
SetType
(
"ncclReduce"
);
op1
->
SetInput
(
"X"
,
{
"
r
t"
});
op1
->
SetInput
(
"X"
,
{
"
s
t"
});
op1
->
SetInput
(
"Communicator"
,
{
"comm"
});
op1
->
SetInput
(
"Communicator"
,
{
"comm"
});
op1
->
SetOutput
(
"Out"
,
{
"rt"
});
op1
->
SetOutput
(
"Out"
,
{
"rt"
});
op
2
->
SetAttr
(
"root"
,
{
kRoot
});
op
1
->
SetAttr
(
"root"
,
{
kRoot
});
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetType
(
"ncclBcast"
);
op2
->
SetInput
(
"X"
,
{
"
s
t"
});
op2
->
SetInput
(
"X"
,
{
"
r
t"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
op2
->
SetAttr
(
"root"
,
{
kRoot
});
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录