BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit ef257e6d, authored Oct 24, 2017 by Dong Zhihong
Parent: 50f04dca

write nccl c++ test case
Changes: 14 changed files with 298 additions and 114 deletions (+298 −114).
paddle/operators/CMakeLists.txt                               +4   −0
paddle/operators/nccl/CMakeLists.txt                          +0   −1
paddle/operators/nccl/nccl_gpu_common.h                       +0   −2
paddle/operators/nccl/nccl_gpu_common_test.cc                 +0   −33
paddle/operators/nccl_op.cc                                   +17  −10
paddle/operators/nccl_op.cu                                   +0   −1
paddle/operators/nccl_op.h                                    +2   −2
paddle/operators/nccl_op_test.cc                              +71  −0
paddle/operators/nccl_op_test.cu                              +71  −0
paddle/pybind/pybind.cc                                       +12  −1
python/paddle/v2/framework/tests/test_multigpu.py             +8   −0
python/paddle/v2/framework/tests/test_nccl_allreduce_op.py    +58  −64
python/paddle/v2/framework/tests/test_nccl_init_op.py         +36  −0
python/paddle/v2/framework/tests/test_nccl_reduce_op.py       +19  −0
paddle/operators/CMakeLists.txt

```diff
@@ -154,3 +154,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
+if(WITH_GPU)
+  nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+endif()
```
paddle/operators/nccl/CMakeLists.txt

```diff
 if(WITH_GPU)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
-  nv_test(nccl_gpu_common_test SRCS nccl_gpu_common_test.cc DEPS nccl_common)
 endif()
```
paddle/operators/nccl/nccl_gpu_common.h

The free helper `NewCommunicator` is dropped; the kernels now obtain a `Communicator` from a variable and call `InitAll` on it directly (see the nccl_op.h hunk below).

```diff
@@ -53,7 +53,5 @@ struct Communicator {
   // DISABLE_COPY_AND_ASSIGN(Communicator);
 };
 
-Communicator* NewCommunicator(const std::vector<int>& gpus);
-
 }  // namespace platform
 }  // namespace paddle
```
paddle/operators/nccl/nccl_gpu_common_test.cc (deleted, mode 100644 → 0; shown at parent 50f04dca)

```cpp
#include "paddle/operators/nccl/nccl_gpu_common.h"

#include <gtest/gtest.h>

#include <chrono>
#include <thread>
#include <vector>

namespace paddle {
namespace platform {

TEST(WaitGroup, wait) {
  WaitGroup wg;
  auto run_thread = [&wg](int idx) {
    wg.Add(1);
    std::this_thread::sleep_for(std::chrono::seconds(1));
    wg.Done();
  };

  std::vector<std::thread> ths;
  constexpr const int TNUM = 5;
  for (int i = 0; i < TNUM; ++i) {
    ths.emplace_back(std::thread(run_thread, i));
  }
  wg.Wait();

  for (int i = 0; i < TNUM; ++i) {
    ths[i].join();
  }
}

}  // namespace platform
}  // namespace paddle
```
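The deleted test exercised a `WaitGroup` through three calls: `Add`, `Done`, and `Wait`. Its implementation lived in nccl_gpu_common.h and is not part of this diff; a minimal sketch consistent with that usage, assuming a Go-style counter guarded by a condition variable, might look like this:

```cpp
#include <condition_variable>
#include <mutex>

// Hypothetical WaitGroup matching the Add/Done/Wait usage in the deleted
// test; the real one is not shown in this commit.
class WaitGroup {
 public:
  // Register n more workers that Wait() must block on.
  void Add(int n) {
    std::lock_guard<std::mutex> lock(mu_);
    counter_ += n;
  }
  // Mark one worker finished; wake waiters once all are done.
  void Done() {
    std::lock_guard<std::mutex> lock(mu_);
    if (--counter_ <= 0) cv_.notify_all();
  }
  // Block until the counter drains back to zero.
  void Wait() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return counter_ <= 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int counter_ = 0;
};
```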
paddle/operators/nccl_op.cc

```diff
@@ -21,9 +21,14 @@ class NCCLInitOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Communicator"),
-                   " Output(Communicator) of ncclInitOp should not be NULL");
+                   " Output(Communicator) of ncclInit op input should not be NULL");
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+  }
 };
```

```diff
@@ -32,9 +37,11 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
   NCCLInitOpMaker(framework::OpProto *proto,
                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("gpus", "gpu id lists");
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
+    AddAttr<std::vector<int>>("gpus", "gpu id lists");
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
     AddComment(R"DOC(
                create communicator.
         )DOC");
```

```diff
@@ -58,10 +65,10 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputsDim("X");
 
-    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    // std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    // PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+    //                 reduction == "ncclMin" || reduction == "ncclMax"),
+    //                "invalid reduction.");
 
     ctx->SetOutputsDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
```

```diff
@@ -122,8 +129,8 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
-    AddAttr<std::string>("reduction",
-                         "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}.");
+    // AddAttr<std::string>("reduction",
+    //                      "{'ncclmin', 'ncclmax', 'ncclprod', 'ncclsum'}.");
     // AddAttr<std::vector<int>>("gpus", "gpu id lists");
     AddComment(R"DOC(
             AllReduce the input tensors.
```
paddle/operators/nccl_op.cu

With the `reduction` attribute commented out of the op definition above, the kernel's read of it is removed as well:

```diff
@@ -26,7 +26,6 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<Tensor>("X");
     auto outs = ctx.MultiOutput<Tensor>("Out");
-    std::string reduction = ctx.Attr<std::string>("reduction");
     auto* comm = ctx.Input<Communicator>("Communicator");
```
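The kernel body elided in this hunk ultimately issues one collective per device. For orientation, here is a self-contained sketch of the same sum-allreduce against the raw NCCL 2 API; this is plain CUDA/NCCL, not Paddle code, and error checking is omitted:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

#include <vector>

// Approximates what ncclInit (ncclCommInitAll over the "gpus" attribute)
// and the allreduce kernel do inside Paddle, on raw device buffers.
int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  std::vector<int> devs(ndev);
  for (int i = 0; i < ndev; ++i) devs[i] = i;

  // The "ncclInit" step: one communicator per device, created in one call.
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, devs.data());

  const size_t count = 32 * 32;  // matches the (32, 32) tensors in the tests
  std::vector<float *> send(ndev), recv(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&send[i], count * sizeof(float));
    cudaMalloc(&recv[i], count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // One collective call per device, grouped so that a single thread can
  // issue them all without deadlocking.
  ncclGroupStart();
  for (int i = 0; i < ndev; ++i) {
    ncclAllReduce(send[i], recv[i], count, ncclFloat, ncclSum, comms[i],
                  streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
    cudaFree(send[i]);
    cudaFree(recv[i]);
    cudaStreamDestroy(streams[i]);
  }
  for (int i = 0; i < ndev; ++i) ncclCommDestroy(comms[i]);
  return 0;
}
```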
paddle/operators/nccl_op.h

`"gpus"` is registered with `AddAttr` in `NCCLInitOpMaker`, so the kernel must read it as an attribute; the old code tried to fetch it as an input variable named `"gpus"`, which does not exist:

```diff
@@ -40,9 +40,9 @@ template <typename T>
 class NCCLInitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* gpus = ctx.Input<std::vector<int>>("gpus");
+    std::vector<int> gpus = ctx.Attr<std::vector<int>>("gpus");
     auto* comm = ctx.Output<Communicator>("Communicator");
-    comm->InitAll(*gpus);
+    comm->InitAll(gpus);
   }
 };
```
paddle/operators/nccl_op_test.cc (new file, mode 100644, +71)

```cpp
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/nccl_op.h"

#include "glog/logging.h"
#include "gtest/gtest.h"

#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/gpu_info.h"

#include <thrust/device_vector.h>
#include <memory>
#include <vector>

static std::vector<int> gpu_list;

using f = paddle::framework;
using ops = paddle::operators;

void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,
           paddle::framework::BlockDescBind *block) {
  for (auto kv : outputs) {
    for (auto v : kv.second) {
      auto var = block->Var(v);
      var->SetDataType(paddle::framework::DataType::FP32);
    }
  }

  auto op = block->AppendOp();
  op->SetType(type);
  for (auto &kv : inputs) {
    op->SetInput(kv.first, kv.second);
  }
  for (auto &kv : outputs) {
    op->SetOutput(kv.first, kv.second);
  }
  op->SetAttrMap(attrs);
}

TEST(NCCL, ncclInitOp) {
  f::ProgramDescBind program;
  f::BlockDescBind *block = program.Block(0);
}

int main(int argc, char **argv) {
  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < gpu_count; ++i) {
    gpu_list.emplace_back(i);
  }
  if (dev_count <= 1) {
    LOG(WARNING) << "Cannot test multi-gpu nccl, because the CUDA device count is "
                 << dev_count;
    return 0;
  }
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```
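Several details in this new test look like work in progress: `TEST(NCCL, ncclInitOp)` builds a program block but asserts nothing yet; `main` tests `dev_count`, which is never declared (the loop above it uses `gpu_count`); `GetCUDADeviceCount()` is a runtime query and cannot initialize a `constexpr`; and a namespace alias is spelled `namespace f = paddle::framework;`, not `using f = ...`. A hedged, corrected sketch of the driver that keeps the commit's intent:

```cpp
// Hypothetical cleanup of main() above: the undeclared dev_count becomes
// gpu_count, and constexpr is dropped because the device count is only
// known at runtime. (Likewise, `using f = paddle::framework;` would need
// to be `namespace f = paddle::framework;` to compile.)
int main(int argc, char **argv) {
  const int gpu_count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < gpu_count; ++i) {
    gpu_list.emplace_back(i);
  }
  if (gpu_count <= 1) {
    LOG(WARNING) << "Cannot test multi-gpu nccl, because the CUDA "
                    "device count is " << gpu_count;
    return 0;
  }
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```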
paddle/operators/nccl_op_test.cu (new file, mode 100644, +71)

Identical in content to paddle/operators/nccl_op_test.cc above; only the extension differs so that CMake's `nv_test` compiles it with nvcc.
paddle/pybind/pybind.cc

```diff
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
 #include "paddle/operators/dynamic_recurrent_op.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
```

```diff
@@ -203,6 +204,13 @@ All parameter, weight, gradient are variables in Paddle.
              return self.GetMutable<SelectedRows>();
            },
            py::return_value_policy::reference)
+#ifdef PADDLE_WITH_CUDA
+      .def("get_communicator",
+           [](Variable &self) -> platform::Communicator * {
+             return self.GetMutable<platform::Communicator>();
+           },
+           py::return_value_policy::reference)
+#endif
       .def("get_net",
            [](Variable &self) -> operators::NetOp * {
              return self.GetMutable<operators::NetOp>();
```

```diff
@@ -258,8 +266,11 @@ All parameter, weight, gradient are variables in Paddle.
     return new paddle::platform::CUDADeviceContext(place);
 #endif
                   });
   // clang-format on
+#ifdef PADDLE_WITH_CUDA
+  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
+#endif
   py::class_<platform::GPUPlace>(m, "GPUPlace")
       .def(py::init<int>())
       .def("__str__", string::to_string<const platform::GPUPlace &>);
```

Together, the three hunks let Python code hold a `platform::Communicator` inside a `Variable` and retrieve it, which the new Python tests below rely on.
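The new binding follows the standard pybind11 pattern for exposing a C++ object that is owned elsewhere: register the class, then hand Python a raw pointer with `return_value_policy::reference` so Python never takes ownership. A minimal self-contained sketch of the same pattern, with toy `Communicator` and `Variable` types standing in for Paddle's:

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Toy stand-ins: the Variable owns whatever is stored in it, and Python
// only borrows a pointer, mirroring the get_communicator binding above.
struct Communicator {
  int ngpus = 0;
};

struct Variable {
  Communicator comm;
  Communicator *GetMutable() { return &comm; }
};

PYBIND11_MODULE(toy, m) {
  py::class_<Communicator>(m, "Communicator").def(py::init<>());
  py::class_<Variable>(m, "Variable")
      .def(py::init<>())
      .def("get_communicator",
           [](Variable &self) -> Communicator * { return self.GetMutable(); },
           py::return_value_policy::reference);
}
```

Keeping ownership on the C++ side is what lets the tests call `g_scope.var("Communicator").get_communicator()` purely for its side effect of materializing the communicator inside the scope's variable.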
python/paddle/v2/framework/tests/test_multigpu.py (new file, mode 100644, +8)

```python
import unittest, os
import numpy as np
import paddle.v2 as paddle
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from op_test import OpTest, create_op, set_input

gpu_list = "0,1,2,3"
```
python/paddle/v2/framework/tests/test_nccl_allreduce_op.py

This file is largely rewritten (one visible hunk, @@ -13,94 +14,87 @@ if not core.is_compile_gpu() or not gpu_list:). The old version kept a module-level CPU reference `allreduce`, shared `input_data`/`output_data`, and a `thread_allreduce_op` driver that `test_output` launched on one `Thread` per GPU; the new version adds a `TestNCCLInit` case and rebuilds `TestNCCLAllReduce` so that `setUp` prepares one scope, place, and op per GPU and `test_output` runs them in sequence. (As committed, `create_op(scope, self.op_type, inputs, outputs, attrs)` still references `attrs`, whose assignment is commented out one line above.) The file after this commit:

```python
import unittest, os
from threading import Thread
import numpy as np
import paddle.v2 as paddle
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from op_test import OpTest, create_op, set_input

# ... (unchanged lines elided in the diff)

g_scope = core.Scope()
g_ctx = core.DeviceContext.create(core.CPUPlace())


class TestNCCLInit(OpTest):
    def setUp(self):
        self.op_type = "ncclInit"
        self.gpus = [int(g) for g in gpu_list.split(",")]

        self.attrs = {"gpus": self.gpus}
        self.scope = g_scope.var("Communicator")
        self.outputs = {"Communicator": self.scope.var("Communicator")}

    def test_check_output(self):
        self.check_output()


class TestNCCLAllReduce(unittest.TestCase):
    def setUp(self):
        # cpu allreduce for check
        def allreduce(tensors, gpus):
            num_device = len(gpus)
            assert (len(tensors) == num_device), "not match of tensor and device"
            Out = tensors
            for i in range(1, len(tensors)):
                Out[0] += Out[i]
            for i in range(1, len(tensors)):
                Out[i] = Out[0]
            return Out

        self.op_type = "ncclAllReduce"
        self.gpus = [int(g) for g in gpu_list.split(",")]

        self.g_scope = core.Scope()
        self.g_ctx = core.DeviceContext.create(core.CPUPlace())
        self.scopes = []
        self.ops = []
        self.places = []

        self.input_data = []
        for i in range(len(self.gpus)):
            self.input_data.append(np.random.random((32, 32)))
        self.output_data = allreduce(self.input_data, self.gpus)

        nccl_init = Operator("ncclInit", Out="Communicator", gpus=self.gpus)
        nccl_init.run(self.g_scope, self.g_ctx)

        for i in range(len(self.gpus)):
            # insert kid scope
            scope = self.g_scope.new_scope()
            place = core.GPUPlace(self.gpus[i])

            inputs = {
                "X": self.input_data[i],
                "Communicator": scope.find_var("Communicator")
            }
            outputs = {"Out": self.output_data[i]}
            # attrs = {"gpus": self.gpus}

            op = create_op(scope, self.op_type, inputs, outputs, attrs)
            set_input(scope, op, inputs, place)

            self.scopes.append(scope)
            self.ops.append(op)
            self.places.append(place)

    def test_output(self):
        idx = 0
        for scope, place, op in zip(self.scopes, self.places, self.ops):
            ctx = core.DeviceContext.create(place)
            op.run(scope, ctx)

            for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
                actual = np.array(scope.find_var(out_name).get_tensor())
                expect = self.output_data[idx]

                idx += 1
                self.assertTrue(actual, expect), "has diff"


# usage : export NV_LIST=0,1,2,3 python *.py
# os.environ["NV_LIST"] = ["0,1,2,3"]

if __name__ == "__main__":
    unittest.main()
```
python/paddle/v2/framework/tests/test_nccl_init_op.py (new file, mode 100644, +36)

```python
import unittest, os
import numpy as np
import paddle.v2 as paddle
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
from op_test import OpTest, create_op, set_input

gpu_list = "0,1,2,3"

if not core.is_compile_gpu() or not gpu_list:
    exit(0)

g_scope = core.Scope()
g_ctx = core.DeviceContext.create(core.CPUPlace())


class TestNCCLInit(unittest.TestCase):
    def test_init(self):
        self.op_type = "ncclInit"
        self.gpus = [int(g) for g in gpu_list.split(",")]

        self.inputs = {}
        self.attrs = {"gpus": self.gpus}
        g_scope.var("Communicator").get_communicator()
        self.outputs = {"Communicator": g_scope.find_var("Communicator")}
        nccl_init = create_op(
            g_scope,
            op_type=self.op_type,
            inputs=self.inputs,
            outputs=self.outputs,
            attrs=self.attrs)
        nccl_init.run(g_scope, g_ctx)


if __name__ == "__main__":
    unittest.main()
```
python/paddle/v2/framework/tests/test_nccl_reduce_op.py

```diff
@@ -4,3 +4,22 @@ import paddle.v2 as paddle
 from paddle.v2.framework.op import Operator
 import paddle.v2.framework.core as core
 from op_test import OpTest, create_op, set_input
+
+gpu_list = "0,1,2,3"
+g_scope = core.Scope()
+g_ctx = core.DeviceContext.create(core.CPUPlace())
+
+if not core.is_compile_gpu() or not gpu_list:
+    exit(0)
+
+
+class TestNCCLReduce(OpTest):
+    def setUp(self):
+        self.op_type = "ncclReduce"
+        self.gpus = [int(g) for g in gpu_list.split(",")]
+
+        self.scope = g_scope.var("Communicator").get_communicator()
+        self.outputs = {"Communicator": self.scope.var("Communicator")}
+
+    def test_check_output(self):
+        self.check_output()
```