Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
11cf3e3a
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
11cf3e3a
编写于
10月 25, 2017
作者:
D
Dong Zhihong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
"refactorization of nccl test case"
上级
6d1493a4
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
111 addition
and
124 deletion
+111
-124
paddle/operators/nccl_op_test.cu
paddle/operators/nccl_op_test.cu
+111
-124
未找到文件。
paddle/operators/nccl_op_test.cu
浏览文件 @
11cf3e3a
...
...
@@ -43,81 +43,107 @@ namespace f = paddle::framework;
namespace
p
=
paddle
::
platform
;
static
std
::
vector
<
int
>
gpu_list
;
static
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
dev_ctxs
;
std
::
mutex
mu
;
// test data amount
const
f
::
DDim
kDims
=
{
100
,
100
};
// ncclInitOp with desc
TEST
(
NCCL
,
ncclInitOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op_desc
(
new
f
::
OpDescBind
);
// nccl op common tester, init communicator.
class
NCCLTester
:
public
::
testing
::
Test
{
public:
virtual
void
SetUp
()
override
{
cpu_ctx
=
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
());
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
p
::
GPUPlace
place
(
i
);
dev_ctxs
.
emplace_back
(
new
p
::
CUDADeviceContext
(
place
));
}
NCCLInitOp
();
}
op_desc
->
SetType
(
"ncclInit"
);
op_desc
->
SetOutput
(
"Communicator"
,
{
"x1"
});
op_desc
->
SetAttr
(
"gpus"
,
{
gpu_list
})
;
f
::
Scope
g_scope
;
p
::
DeviceContext
*
ctx
=
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
());
virtual
void
TearDown
()
override
{
for
(
auto
&
device_context
:
dev_ctxs
)
{
delete
device_context
;
}
}
auto
*
var
=
g_scope
.
Var
(
"x1"
);
var
->
GetMutable
<
p
::
Communicator
>
(
);
void
NCCLInitOp
()
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op_desc
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
ctx
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
}
op1
->
SetType
(
"ncclInit"
);
op1
->
SetOutput
(
"Communicator"
,
{
"comm"
});
op1
->
SetAttr
(
"gpus"
,
{
gpu_list
});
// Per-device worker, launched on its own std::thread by the tests: prepares a
// send tensor for `gpu_id` inside `scope`, then creates and runs the operator
// described by `op_desc` on that device's context.
//
//   gpu_id  - device id; also the index into dev_ctxs and the value every
//             element of the send tensor is filled with.
//   op_desc - description of the operator to run; copied into a local program.
//   scope   - scope in which the "st" (send) and "rt" (receive) variables are
//             created.
template <class T>
void DeviceProgram(int gpu_id, const f::OpDescBind &op_desc, f::Scope *scope) {
  // Program construction and tensor setup are serialized across the worker
  // threads through the file-level mutex.
  std::unique_lock<std::mutex> lk(mu);
  f::ProgramDescBind program;
  f::BlockDescBind *block = program.Block(0);
  f::OpDescBind *op1 = block->AppendOp();
  *op1 = op_desc;

  p::GPUPlace place(gpu_id);
  // dev_ctxs stores std::unique_ptr elements, which cannot be copied, so bind
  // a reference to the element instead of taking it by value.
  auto &ctx = dev_ctxs.at(gpu_id);

  auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
  // The receive variable only has to exist in the scope; its tensor is not
  // used directly in this function.
  scope->Var("rt")->GetMutable<f::LoDTensor>();
  send_tensor->Resize(kDims);
  send_tensor->mutable_data<T>(kDims, place);

  std::vector<T> send_vector(f::product(kDims), gpu_id);
  send_tensor->CopyFromVector<T>(send_vector, *ctx);
  lk.unlock();

  PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
                 "Tensor numel not match!");
  ctx->Wait();

  VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();

  auto op = f::OpRegistry::CreateOp(*op1);
  VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
  op->Run(*scope, *ctx);
  VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
}
auto
*
var
=
g_scope
.
Var
(
"comm"
);
var
->
GetMutable
<
p
::
Communicator
>
();
// ncclAllReduceOp with desc
TEST
(
NCCL
,
ncclAllReduceOp
)
{
std
::
unique_ptr
<
p
::
DeviceContext
>
ctx
(
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
()));
std
::
unique_ptr
<
f
::
Scope
>
g_scope
(
new
Scope
);
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
cpu_ctx
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
}
// Worker body executed once per GPU, each on its own thread. While holding
// the fixture mutex it copies `op_desc` into a fresh single-block program and
// fills the "st" tensor of `scope` with `gpu_id`; after releasing the lock it
// waits on the device context and runs the operator built from the copy.
template <class T>
void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc,
                      f::Scope *scope) {
  std::unique_lock<std::mutex> guard(mu);

  // Private copy of the operator description, owned by a local program.
  f::ProgramDescBind prog;
  f::BlockDescBind *root_block = prog.Block(0);
  f::OpDescBind *op_copy = root_block->AppendOp();
  *op_copy = op_desc;

  p::GPUPlace gpu_place(gpu_id);
  auto &device_ctx = dev_ctxs.at(gpu_id);

  // Create the send ("st") and receive ("rt") variables in the scope.
  auto *src = scope->Var("st")->GetMutable<f::LoDTensor>();
  auto *dst = scope->Var("rt")->GetMutable<f::LoDTensor>();
  src->Resize(kDims);
  src->mutable_data<T>(kDims, gpu_place);

  // Every element of the send tensor carries this device's id.
  std::vector<T> host_values(f::product(kDims), gpu_id);
  src->CopyFromVector<T>(host_values, *device_ctx);
  guard.unlock();

  PADDLE_ENFORCE(src->numel() == f::product(kDims), "Tensor numel not match!");
  device_ctx->Wait();

  VLOG(1) << "Send Tensor filled with elements " << src->numel();

  auto runnable = f::OpRegistry::CreateOp(*op_copy);
  VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
  runnable->Run(*scope, *device_ctx);
  VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
}
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
op1
->
SetType
(
"ncclInit"
);
op1
->
SetOutput
(
"Communicator"
,
{
"comm"
});
op1
->
SetAttr
(
"gpus"
,
{
gpu_list
});
public:
std
::
vector
<
p
::
DeviceContext
*>
dev_ctxs
;
p
::
DeviceContext
*
cpu_ctx
;
f
::
Scope
g_scope
;
std
::
mutex
mu
;
};
// ncclInitOp with desc
// TEST(NCCL, ncclInitOp) {
// std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
// op_desc->SetType("ncclInit");
// op_desc->SetOutput("Communicator", {"x1"});
// op_desc->SetAttr("gpus", {gpu_list});
// f::Scope g_scope;
// std::unique_ptr<p::DeviceContext> ctx(new
// p::CPUDeviceContext(p::CPUPlace()));
auto
*
var
=
g_scope
.
Var
(
"comm
"
);
var
->
GetMutable
<
p
::
Communicator
>
();
// auto *var = g_scope.Var("x1");
// var->GetMutable<p::Communicator>();
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
ctx
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
delete
ctx
;
// auto op = f::OpRegistry::CreateOp(*op_desc);
// VLOG(1) << "invoke NCCLInitOp.";
// op->Run(g_scope, *ctx.get());
// VLOG(1) << "NCCLInitOp finished.";
// }
// ncclAllReduceOp with desc
TEST_F
(
NCCLTester
,
ncclAllReduceOp
)
{
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclAllReduce"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
...
...
@@ -126,36 +152,18 @@ TEST(NCCL, ncclAllReduceOp) {
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
std
::
thread
th
(
DeviceProgram
<
float
>
,
gpu_list
[
i
],
*
op2
,
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
]
,
*
op2
.
get
(),
&
g_scope
.
NewScope
());
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
g_scope
->
reset
(
nullptr
);
}
// ncclReduceOp with desc
TEST
(
NCCL
,
ncclReduceOp
)
{
std
::
unique_ptr
<
p
::
DeviceContext
>
ctx
(
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
()));
std
::
unique_ptr
<
f
::
Scope
>
g_scope
(
new
Scope
);
std
::
unique_ptr
<
f
::
OpDescBind
>
op1
(
new
f
::
OpDescBind
);
op1
->
SetType
(
"ncclInit"
);
op1
->
SetOutput
(
"Communicator"
,
{
"comm"
});
op1
->
SetAttr
(
"gpus"
,
{
gpu_list
});
auto
*
var
=
g_scope
.
Var
(
"comm"
);
var
->
GetMutable
<
p
::
Communicator
>
();
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
ctx
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
delete
ctx
;
std
::
unique_ptr
<
f
::
OpDescBind
>
op2
(
new
f
::
OpDescBind
);
op2
->
SetType
(
"ncclReduce"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
...
...
@@ -164,53 +172,36 @@ TEST(NCCL, ncclReduceOp) {
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
std
::
thread
th
(
DeviceProgram
<
float
>
,
gpu_list
[
i
],
*
op2
,
&
g_scope
.
NewScope
());
std
::
thread
th
(
&
NCCLTester
::
PerThreadProgram
<
float
>
,
this
,
gpu_list
[
i
]
,
*
op2
.
get
(),
&
g_scope
.
NewScope
());
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
g_scope
->
reset
(
nullptr
);
}
// ncclBcastOp with desc
TEST
(
NCCL
,
ncclBcastOp
)
{
f
::
ProgramDescBind
program
;
f
::
BlockDescBind
*
block
=
program
.
Block
(
0
);
f
::
OpDescBind
*
op1
=
block
->
AppendOp
();
p
::
DeviceContext
*
ctx
=
new
p
::
CPUDeviceContext
(
p
::
CPUPlace
());
op1
->
SetType
(
"ncclInit"
);
op1
->
SetOutput
(
"Communicator"
,
{
"comm"
});
op1
->
SetAttr
(
"gpus"
,
{
gpu_list
});
auto
*
var
=
g_scope
.
Var
(
"comm"
);
var
->
GetMutable
<
p
::
Communicator
>
();
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope
,
*
ctx
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
f
::
OpDescBind
*
op2
=
new
f
::
OpDescBind
;
op2
->
SetType
(
"ncclBcastSend"
);
op2
->
SetInput
(
"X"
,
{
"st"
});
op2
->
SetInput
(
"Communicator"
,
{
"comm"
});
op2
->
SetOutput
(
"Out"
,
{
"rt"
});
std
::
vector
<
std
::
thread
>
ths
;
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
std
::
thread
th
(
DeviceProgram
<
float
>
,
gpu_list
[
i
],
*
op2
);
ths
.
emplace_back
(
std
::
move
(
th
));
}
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
ths
[
i
].
join
();
}
}
// TEST(NCCL, ncclBcastOp) {
// std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
// op2->SetType("ncclBcastSend");
// op2->SetInput("X", {"st"});
// op2->SetInput("Communicator", {"comm"});
// op2->SetOutput("Out", {"rt"});
// std::vector<std::thread> ths;
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
// *op2.get(),
// &g_scope.NewScope());
// ths.emplace_back(std::move(th));
// }
// for (size_t i = 0; i < gpu_list.size(); ++i) {
// ths[i].join();
// }
// }
int
main
(
int
argc
,
char
**
argv
)
{
const
int
dev_count
=
p
::
GetCUDADeviceCount
();
...
...
@@ -228,9 +219,5 @@ int main(int argc, char **argv) {
// device context should be release before scope.
// otherwise driver will down.
for
(
size_t
i
=
0
;
i
<
gpu_list
.
size
();
++
i
)
{
p
::
GPUPlace
place
(
i
);
dev_ctxs
.
emplace_back
(
new
p
::
CUDADeviceContext
(
place
));
}
return
RUN_ALL_TESTS
();
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录