Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
1d756746
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1d756746
编写于
4月 21, 2018
作者:
Q
qiaolongfei
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into add-async-listen-and-serv-op
上级
0763ae9a
6402b59a
变更
45
展开全部
隐藏空白更改
内联
并排
Showing
45 changed file
with
801 addition
and
778 deletion
+801
-778
Dockerfile
Dockerfile
+3
-0
paddle/fluid/framework/block_desc.cc
paddle/fluid/framework/block_desc.cc
+1
-0
paddle/fluid/framework/data_device_transform_test.cu
paddle/fluid/framework/data_device_transform_test.cu
+10
-9
paddle/fluid/framework/data_layout_transform_test.cc
paddle/fluid/framework/data_layout_transform_test.cc
+21
-20
paddle/fluid/framework/data_type_transform_test.cc
paddle/fluid/framework/data_type_transform_test.cc
+76
-50
paddle/fluid/framework/data_type_transform_test.cu
paddle/fluid/framework/data_type_transform_test.cu
+119
-80
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+9
-8
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+13
-8
paddle/fluid/framework/details/broadcast_op_handle_test.cc
paddle/fluid/framework/details/broadcast_op_handle_test.cc
+18
-6
paddle/fluid/framework/details/cow_ptr.h
paddle/fluid/framework/details/cow_ptr.h
+2
-2
paddle/fluid/framework/details/gather_op_handle.cc
paddle/fluid/framework/details/gather_op_handle.cc
+10
-5
paddle/fluid/framework/details/gather_op_handle_test.cc
paddle/fluid/framework/details/gather_op_handle_test.cc
+15
-6
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+5
-5
paddle/fluid/framework/details/op_registry.h
paddle/fluid/framework/details/op_registry.h
+3
-0
paddle/fluid/framework/details/reduce_and_gather.h
paddle/fluid/framework/details/reduce_and_gather.h
+4
-4
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+76
-68
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+6
-3
paddle/fluid/framework/details/reduce_op_handle_test.cc
paddle/fluid/framework/details/reduce_op_handle_test.cc
+34
-23
paddle/fluid/framework/op_registry_test.cc
paddle/fluid/framework/op_registry_test.cc
+3
-2
paddle/fluid/framework/operator_test.cc
paddle/fluid/framework/operator_test.cc
+7
-9
paddle/fluid/framework/program_desc.cc
paddle/fluid/framework/program_desc.cc
+5
-1
paddle/fluid/framework/program_desc.h
paddle/fluid/framework/program_desc.h
+2
-0
paddle/fluid/framework/threadpool_test.cc
paddle/fluid/framework/threadpool_test.cc
+4
-4
paddle/fluid/inference/io.cc
paddle/fluid/inference/io.cc
+12
-11
paddle/fluid/inference/io.h
paddle/fluid/inference/io.h
+5
-5
paddle/fluid/inference/tests/test_helper.h
paddle/fluid/inference/tests/test_helper.h
+2
-2
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+5
-0
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+199
-361
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+0
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+48
-10
paddle/fluid/operators/dropout_op.cu
paddle/fluid/operators/dropout_op.cu
+29
-21
paddle/fluid/operators/dropout_op.h
paddle/fluid/operators/dropout_op.h
+6
-6
paddle/fluid/operators/dropout_op_test.cc
paddle/fluid/operators/dropout_op_test.cc
+12
-17
paddle/fluid/operators/mkldnn_activation_op.h
paddle/fluid/operators/mkldnn_activation_op.h
+1
-1
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+1
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+4
-0
paddle/scripts/docker/build.sh
paddle/scripts/docker/build.sh
+2
-2
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+4
-2
python/paddle/fluid/layer_helper.py
python/paddle/fluid/layer_helper.py
+5
-1
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+8
-3
python/paddle/fluid/tests/unittests/test_activation_op.py
python/paddle/fluid/tests/unittests/test_activation_op.py
+3
-13
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+4
-1
python/paddle/fluid/tests/unittests/test_multi_file_reader.py
...on/paddle/fluid/tests/unittests/test_multi_file_reader.py
+0
-1
python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
...on/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+2
-4
python/paddle/fluid/tests/unittests/test_recordio_reader.py
python/paddle/fluid/tests/unittests/test_recordio_reader.py
+3
-3
未找到文件。
Dockerfile
浏览文件 @
1d756746
# A image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
# When you modify it, please be aware of cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM
nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER
PaddlePaddle Authors <paddle-dev@baidu.com>
...
...
paddle/fluid/framework/block_desc.cc
浏览文件 @
1d756746
...
...
@@ -146,6 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if
(
ops_
.
begin
()
+
s
==
ops_
.
end
()
||
ops_
.
begin
()
+
e
==
ops_
.
end
())
{
return
;
}
need_update_
=
true
;
ops_
.
erase
(
ops_
.
begin
()
+
s
,
ops_
.
begin
()
+
e
);
}
...
...
paddle/fluid/framework/data_device_transform_test.cu
浏览文件 @
1d756746
...
...
@@ -103,9 +103,7 @@ static void BuildVar(const std::string& param_name,
}
TEST
(
Operator
,
CPUtoGPU
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
InitDevices
(
true
);
paddle
::
framework
::
InitDevices
(
true
);
paddle
::
framework
::
Scope
scope
;
paddle
::
platform
::
CPUPlace
cpu_place
;
...
...
@@ -118,8 +116,9 @@ TEST(Operator, CPUtoGPU) {
auto
cpu_op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
cpu_op_desc
);
// prepare input
auto
*
in_t
=
scope
.
Var
(
"IN1"
)
->
GetMutable
<
LoDTensor
>
();
auto
*
src_ptr
=
in_t
->
mutable_data
<
float
>
({
2
,
3
},
CPUPlace
());
auto
*
in_t
=
scope
.
Var
(
"IN1"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
auto
*
src_ptr
=
in_t
->
mutable_data
<
float
>
({
2
,
3
},
paddle
::
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
2
*
3
;
++
i
)
{
src_ptr
[
i
]
=
static_cast
<
float
>
(
i
);
}
...
...
@@ -128,7 +127,7 @@ TEST(Operator, CPUtoGPU) {
auto
*
output
=
scope
.
Var
(
"OUT1"
);
cpu_op
->
Run
(
scope
,
cpu_place
);
auto
*
output_ptr
=
output
->
Get
<
LoDTensor
>
().
data
<
float
>
();
auto
*
output_ptr
=
output
->
Get
<
paddle
::
framework
::
LoDTensor
>
().
data
<
float
>
();
for
(
int
i
=
0
;
i
<
2
*
3
;
++
i
)
{
ASSERT_EQ
(
output_ptr
[
i
],
static_cast
<
float
>
(
i
)
*
2
);
}
...
...
@@ -153,12 +152,14 @@ TEST(Operator, CPUtoGPU) {
VLOG
(
3
)
<<
"after gpu_op run"
;
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
DeviceContextPool
&
pool
=
DeviceContextPool
::
Instance
();
paddle
::
platform
::
DeviceContextPool
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
auto
dev_ctx
=
pool
.
Get
(
cuda_place
);
paddle
::
framework
::
Tensor
output_tensor
;
TensorCopy
(
output2
->
Get
<
LoDTensor
>
(),
paddle
::
platform
::
CPUPlace
(),
*
dev_ctx
,
&
output_tensor
);
paddle
::
framework
::
TensorCopy
(
output2
->
Get
<
paddle
::
framework
::
LoDTensor
>
(),
paddle
::
platform
::
CPUPlace
(),
*
dev_ctx
,
&
output_tensor
);
dev_ctx
->
Wait
();
float
*
output2_ptr
=
output_tensor
.
data
<
float
>
();
...
...
paddle/fluid/framework/data_layout_transform_test.cc
浏览文件 @
1d756746
...
...
@@ -18,27 +18,28 @@
#include "paddle/fluid/platform/device_context.h"
TEST
(
DataTransform
,
DataLayoutFunction
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
auto
place
=
CPUPlace
();
Tensor
in
=
Tensor
();
Tensor
out
=
Tensor
();
in
.
mutable_data
<
double
>
(
make_ddim
({
2
,
3
,
1
,
2
}),
place
);
in
.
set_layout
(
DataLayout
::
kNHWC
);
auto
kernel_nhwc
=
OpKernelType
(
proto
::
VarType
::
FP32
,
place
,
DataLayout
::
kNHWC
,
LibraryType
::
kPlain
);
auto
kernel_ncwh
=
OpKernelType
(
proto
::
VarType
::
FP32
,
place
,
DataLayout
::
kNCHW
,
LibraryType
::
kPlain
);
TransDataLayout
(
kernel_nhwc
,
kernel_ncwh
,
in
,
&
out
);
EXPECT_TRUE
(
out
.
layout
()
==
DataLayout
::
kNCHW
);
EXPECT_TRUE
(
out
.
dims
()
==
make_ddim
({
2
,
2
,
3
,
1
}));
auto
place
=
paddle
::
platform
::
CPUPlace
();
paddle
::
framework
::
Tensor
in
=
paddle
::
framework
::
Tensor
();
paddle
::
framework
::
Tensor
out
=
paddle
::
framework
::
Tensor
();
in
.
mutable_data
<
double
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
,
1
,
2
}),
place
);
in
.
set_layout
(
paddle
::
framework
::
DataLayout
::
kNHWC
);
auto
kernel_nhwc
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP32
,
place
,
paddle
::
framework
::
DataLayout
::
kNHWC
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_ncwh
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP32
,
place
,
paddle
::
framework
::
DataLayout
::
kNCHW
,
paddle
::
framework
::
LibraryType
::
kPlain
);
paddle
::
framework
::
TransDataLayout
(
kernel_nhwc
,
kernel_ncwh
,
in
,
&
out
);
EXPECT_TRUE
(
out
.
layout
()
==
paddle
::
framework
::
DataLayout
::
kNCHW
);
EXPECT_TRUE
(
out
.
dims
()
==
paddle
::
framework
::
make_ddim
({
2
,
2
,
3
,
1
}));
TransDataLayout
(
kernel_ncwh
,
kernel_nhwc
,
in
,
&
out
);
EXPECT_TRUE
(
in
.
layout
()
==
DataLayout
::
kNHWC
);
EXPECT_TRUE
(
in
.
dims
()
==
make_ddim
({
2
,
3
,
1
,
2
}));
EXPECT_TRUE
(
in
.
layout
()
==
paddle
::
framework
::
DataLayout
::
kNHWC
);
EXPECT_TRUE
(
in
.
dims
()
==
paddle
::
framework
::
make_ddim
({
2
,
3
,
1
,
2
}));
}
paddle/fluid/framework/data_type_transform_test.cc
浏览文件 @
1d756746
...
...
@@ -17,43 +17,58 @@ limitations under the License. */
#include "gtest/gtest.h"
TEST
(
DataTypeTransform
,
CPUTransform
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
auto
place
=
CPUPlace
();
auto
kernel_fp16
=
OpKernelType
(
proto
::
VarType
::
FP16
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_fp32
=
OpKernelType
(
proto
::
VarType
::
FP32
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_fp64
=
OpKernelType
(
proto
::
VarType
::
FP64
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_int32
=
OpKernelType
(
proto
::
VarType
::
INT32
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_int64
=
OpKernelType
(
proto
::
VarType
::
INT64
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_bool
=
OpKernelType
(
proto
::
VarType
::
BOOL
,
place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
kernel_fp16
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP16
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_fp32
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP32
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_fp64
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP64
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_int32
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
INT32
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_int64
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
INT64
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_bool
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
BOOL
,
place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
// data type transform from float32
{
Tensor
in
;
Tensor
out
;
paddle
::
framework
::
Tensor
in
;
paddle
::
framework
::
Tensor
out
;
float
*
ptr
=
in
.
mutable_data
<
float
>
(
make_ddim
({
2
,
3
}),
place
);
float
*
ptr
=
in
.
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
int
data_number
=
2
*
3
;
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
ptr
[
i
]
=
i
/
3
;
}
TransDataType
(
kernel_fp32
,
kernel_fp64
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_fp64
,
in
,
&
out
);
double
*
out_data_double
=
out
.
data
<
double
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_double
[
i
],
static_cast
<
double
>
(
i
/
3
));
}
TransDataType
(
kernel_fp32
,
kernel_int32
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_int32
,
in
,
&
out
);
int
*
out_data_int
=
out
.
data
<
int
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_int
[
i
],
static_cast
<
int
>
(
i
/
3
));
...
...
@@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
// data type transform from/to float16
{
Tensor
in
;
Tensor
out
;
paddle
::
framework
::
Tensor
in
;
paddle
::
framework
::
Tensor
out
;
float16
*
ptr
=
in
.
mutable_data
<
float16
>
(
make_ddim
({
2
,
3
}),
place
);
paddle
::
platform
::
float16
*
ptr
=
in
.
mutable_data
<
paddle
::
platform
::
float16
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
int
data_number
=
2
*
3
;
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
...
...
@@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
}
// transform from float16 to other data types
TransDataType
(
kernel_fp16
,
kernel_fp32
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_fp32
,
in
,
&
out
);
float
*
out_data_float
=
out
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_float
[
i
],
static_cast
<
float
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_fp64
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_fp64
,
in
,
&
out
);
double
*
out_data_double
=
out
.
data
<
double
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_double
[
i
],
static_cast
<
double
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_int32
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_int32
,
in
,
&
out
);
int
*
out_data_int
=
out
.
data
<
int
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_int
[
i
],
static_cast
<
int
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_int64
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_int64
,
in
,
&
out
);
int64_t
*
out_data_int64
=
out
.
data
<
int64_t
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_int64
[
i
],
static_cast
<
int64_t
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_bool
,
in
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_bool
,
in
,
&
out
);
bool
*
out_data_bool
=
out
.
data
<
bool
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
out_data_bool
[
i
],
static_cast
<
bool
>
(
ptr
[
i
]));
}
// transform float to float16
float
*
in_data_float
=
in
.
mutable_data
<
float
>
(
make_ddim
({
2
,
3
}),
place
);
float
*
in_data_float
=
in
.
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_float
[
i
]
=
i
;
}
TransDataType
(
kernel_fp32
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
float16
>
();
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_float
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_float
[
i
]).
x
);
}
// transform double to float16
double
*
in_data_double
=
in
.
mutable_data
<
double
>
(
make_ddim
({
2
,
3
}),
place
);
double
*
in_data_double
=
in
.
mutable_data
<
double
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_double
[
i
]
=
i
;
}
TransDataType
(
kernel_fp64
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
float16
>
();
paddle
::
framework
::
TransDataType
(
kernel_fp64
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_double
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_double
[
i
]).
x
);
}
// transform int to float16
int
*
in_data_int
=
in
.
mutable_data
<
int
>
(
make_ddim
({
2
,
3
}),
place
);
int
*
in_data_int
=
in
.
mutable_data
<
int
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_int
[
i
]
=
i
;
}
TransDataType
(
kernel_int32
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
float16
>
();
paddle
::
framework
::
TransDataType
(
kernel_int32
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_int
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_int
[
i
]).
x
);
}
// transform int64 to float16
int64_t
*
in_data_int64
=
in
.
mutable_data
<
int64_t
>
(
make_ddim
({
2
,
3
}),
place
);
int64_t
*
in_data_int64
=
in
.
mutable_data
<
int64_t
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_int64
[
i
]
=
i
;
}
TransDataType
(
kernel_int64
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
float16
>
();
paddle
::
framework
::
TransDataType
(
kernel_int64
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_int64
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_int64
[
i
]).
x
);
}
// transform bool to float16
bool
*
in_data_bool
=
in
.
mutable_data
<
bool
>
(
make_ddim
({
2
,
3
}),
place
);
bool
*
in_data_bool
=
in
.
mutable_data
<
bool
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_bool
[
i
]
=
i
;
}
TransDataType
(
kernel_bool
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
float16
>
();
paddle
::
framework
::
TransDataType
(
kernel_bool
,
kernel_fp16
,
in
,
&
out
);
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_bool
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_bool
[
i
]).
x
);
}
}
}
paddle/fluid/framework/data_type_transform_test.cu
浏览文件 @
1d756746
...
...
@@ -18,42 +18,58 @@ limitations under the License. */
#include "gtest/gtest.h"
TEST
(
DataTypeTransform
,
GPUTransform
)
{
using
namespace
paddle
::
framework
;
using
namespace
paddle
::
platform
;
auto
cpu_place
=
CPUPlace
();
auto
gpu_place
=
CUDAPlace
(
0
);
CUDADeviceContext
context
(
gpu_place
);
auto
kernel_fp16
=
OpKernelType
(
proto
::
VarType
::
FP16
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_fp32
=
OpKernelType
(
proto
::
VarType
::
FP32
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_fp64
=
OpKernelType
(
proto
::
VarType
::
FP64
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_int32
=
OpKernelType
(
proto
::
VarType
::
INT32
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_int64
=
OpKernelType
(
proto
::
VarType
::
INT64
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
kernel_bool
=
OpKernelType
(
proto
::
VarType
::
BOOL
,
gpu_place
,
DataLayout
::
kAnyLayout
,
LibraryType
::
kPlain
);
auto
cpu_place
=
paddle
::
platform
::
CPUPlace
();
auto
gpu_place
=
paddle
::
platform
::
CUDAPlace
(
0
);
paddle
::
platform
::
CUDADeviceContext
context
(
gpu_place
);
auto
kernel_fp16
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP16
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_fp32
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP32
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_fp64
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
FP64
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_int32
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
INT32
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_int64
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
INT64
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
auto
kernel_bool
=
paddle
::
framework
::
OpKernelType
(
paddle
::
framework
::
proto
::
VarType
::
BOOL
,
gpu_place
,
paddle
::
framework
::
DataLayout
::
kAnyLayout
,
paddle
::
framework
::
LibraryType
::
kPlain
);
// data type transform from float32
{
Tensor
in
;
Tensor
in_gpu
;
Tensor
out_gpu
;
Tensor
out
;
paddle
::
framework
::
Tensor
in
;
paddle
::
framework
::
Tensor
in_gpu
;
paddle
::
framework
::
Tensor
out_gpu
;
paddle
::
framework
::
Tensor
out
;
float
*
in_ptr
=
in
.
mutable_data
<
float
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
float
*
in_ptr
=
in
.
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
int
data_number
=
sizeof
(
arr
)
/
sizeof
(
arr
[
0
]);
memcpy
(
in_ptr
,
arr
,
sizeof
(
arr
));
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_fp32
,
kernel_fp64
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_fp64
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
double
*
out_data_double
=
out
.
data
<
double
>
();
...
...
@@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ
(
out_data_double
[
i
],
static_cast
<
double
>
(
arr
[
i
]));
}
TransDataType
(
kernel_fp32
,
kernel_int32
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_int32
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
int
*
out_data_int
=
out
.
data
<
int
>
();
...
...
@@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) {
// data type transform from/to float16
{
Tensor
in
;
Tensor
in_gpu
;
Tensor
out_gpu
;
Tensor
out
;
float16
*
ptr
=
in
.
mutable_data
<
float16
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
float16
arr
[
6
]
=
{
float16
(
0
),
float16
(
1
),
float16
(
2
),
float16
(
3
),
float16
(
4
),
float16
(
5
)};
paddle
::
framework
::
Tensor
in
;
paddle
::
framework
::
Tensor
in_gpu
;
paddle
::
framework
::
Tensor
out_gpu
;
paddle
::
framework
::
Tensor
out
;
paddle
::
platform
::
float16
*
ptr
=
in
.
mutable_data
<
paddle
::
platform
::
float16
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
paddle
::
platform
::
float16
arr
[
6
]
=
{
paddle
::
platform
::
float16
(
0
),
paddle
::
platform
::
float16
(
1
),
paddle
::
platform
::
float16
(
2
),
paddle
::
platform
::
float16
(
3
),
paddle
::
platform
::
float16
(
4
),
paddle
::
platform
::
float16
(
5
)};
int
data_number
=
sizeof
(
arr
)
/
sizeof
(
arr
[
0
]);
memcpy
(
ptr
,
arr
,
sizeof
(
arr
));
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
// transform from float16 to other data types
TransDataType
(
kernel_fp16
,
kernel_fp32
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_fp32
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
float
*
out_data_float
=
out
.
data
<
float
>
();
...
...
@@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ
(
out_data_float
[
i
],
static_cast
<
float
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_fp64
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_fp64
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
double
*
out_data_double
=
out
.
data
<
double
>
();
...
...
@@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ
(
out_data_double
[
i
],
static_cast
<
double
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_int32
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_int32
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
int
*
out_data_int
=
out
.
data
<
int
>
();
...
...
@@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ
(
out_data_int
[
i
],
static_cast
<
int
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_int64
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_int64
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
int64_t
*
out_data_int64
=
out
.
data
<
int64_t
>
();
...
...
@@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) {
EXPECT_EQ
(
out_data_int64
[
i
],
static_cast
<
int64_t
>
(
ptr
[
i
]));
}
TransDataType
(
kernel_fp16
,
kernel_bool
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp16
,
kernel_bool
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
bool
*
out_data_bool
=
out
.
data
<
bool
>
();
...
...
@@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) {
}
// transform float to float16
float
*
in_data_float
=
in
.
mutable_data
<
float
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
float
*
in_data_float
=
in
.
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_float
[
i
]
=
i
;
}
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_fp32
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp32
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
ptr
=
out
.
data
<
float16
>
();
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_float
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_float
[
i
]).
x
);
}
// transform double to float16
double
*
in_data_double
=
in
.
mutable_data
<
double
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
double
*
in_data_double
=
in
.
mutable_data
<
double
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_double
[
i
]
=
i
;
}
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_fp64
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_fp64
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
ptr
=
out
.
data
<
float16
>
();
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_double
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_double
[
i
]).
x
);
}
// transform int to float16
int
*
in_data_int
=
in
.
mutable_data
<
int
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
int
*
in_data_int
=
in
.
mutable_data
<
int
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_int
[
i
]
=
i
;
}
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_int32
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_int32
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
ptr
=
out
.
data
<
float16
>
();
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_int
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_int
[
i
]).
x
);
}
// transform int64 to float16
int64_t
*
in_data_int64
=
in
.
mutable_data
<
int64_t
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
int64_t
*
in_data_int64
=
in
.
mutable_data
<
int64_t
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_int64
[
i
]
=
i
;
}
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_int64
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_int64
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
ptr
=
out
.
data
<
float16
>
();
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_int64
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_int64
[
i
]).
x
);
}
// transform bool to float16
bool
*
in_data_bool
=
in
.
mutable_data
<
bool
>
(
make_ddim
({
2
,
3
}),
cpu_place
);
bool
*
in_data_bool
=
in
.
mutable_data
<
bool
>
(
paddle
::
framework
::
make_ddim
({
2
,
3
}),
cpu_place
);
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
in_data_bool
[
i
]
=
i
;
}
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
paddle
::
framework
::
TensorCopy
(
in
,
gpu_place
,
context
,
&
in_gpu
);
context
.
Wait
();
TransDataType
(
kernel_bool
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
paddle
::
framework
::
TransDataType
(
kernel_bool
,
kernel_fp16
,
in_gpu
,
&
out_gpu
);
paddle
::
framework
::
TensorCopy
(
out_gpu
,
cpu_place
,
context
,
&
out
);
context
.
Wait
();
ptr
=
out
.
data
<
float16
>
();
ptr
=
out
.
data
<
paddle
::
platform
::
float16
>
();
for
(
int
i
=
0
;
i
<
data_number
;
++
i
)
{
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
float16
>
(
in_data_bool
[
i
]).
x
);
EXPECT_EQ
(
ptr
[
i
].
x
,
static_cast
<
paddle
::
platform
::
float16
>
(
in_data_bool
[
i
]).
x
);
}
}
}
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
1d756746
...
...
@@ -8,27 +8,28 @@ cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope plac
cc_library
(
ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base
)
cc_library
(
ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph
)
cc_library
(
variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows
)
if
(
WITH_GPU
)
nv_library
(
nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda
)
set
(
multi_devices_graph_builder_deps nccl_all_reduce_op_handle
)
nv_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim dynload_cuda
)
nv_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base
variable_visitor
scope ddim dynload_cuda
)
else
()
set
(
multi_devices_graph_builder_deps
)
cc_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base scope ddim
)
cc_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base
variable_visitor
scope ddim
)
endif
()
cc_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle send_op_handle
${
multi_devices_graph_builder_deps
}
)
scale_loss_grad_op_handle send_op_handle
${
multi_devices_graph_builder_deps
}
reduce_op_handle broadcast_op_handle
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto
)
cc_library
(
threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context
)
cc_library
(
variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows
)
cc_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base variable_visitor scope ddim memory
)
cc_library
(
gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope variable_visitor ddim memory
)
cc_test
(
broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle
)
cc_test
(
gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
...
...
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
1d756746
...
...
@@ -44,9 +44,15 @@ void BroadcastOpHandle::RunImpl() {
// &in_place;
WaitInputVarGenerated
(
*
in_var_handle
);
auto
*
in_var
=
local_scopes_
.
at
(
in_var_handle
->
scope_idx_
)
->
FindVar
(
in_var_handle
->
name_
);
std
::
vector
<
const
Scope
*>
var_scopes
;
for
(
auto
*
s
:
local_scopes_
)
{
var_scopes
.
emplace_back
(
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
());
}
auto
*
in_var
=
var_scopes
.
at
(
in_var_handle
->
scope_idx_
)
->
FindVar
(
in_var_handle
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
Tensor
&
in_tensor
=
VariableVisitor
::
GetMutableTensor
(
in_var
);
for
(
auto
*
out
:
out_var_handles
)
{
...
...
@@ -55,17 +61,16 @@ void BroadcastOpHandle::RunImpl() {
}
auto
&
out_p
=
out
->
place_
;
auto
*
out_var
=
local_scopes_
.
at
(
out
->
scope_idx_
)
->
FindVar
(
out
->
name_
);
auto
*
out_var
=
var_scopes
.
at
(
out
->
scope_idx_
)
->
FindVar
(
out
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
PADDLE_ENFORCE_EQ
(
out_p
.
which
(),
in_var_handle
->
place_
.
which
(),
"Places must be all on CPU or all on CUDA."
);
VariableVisitor
::
ShareDimsAndLoD
(
*
in_var
,
out_var
);
VariableVisitor
::
GetMutableTensor
(
out_var
)
.
Resize
(
in_tensor
.
dims
())
.
mutable_data
(
out_p
,
in_tensor
.
type
());
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
out_p
,
in_tensor
.
type
());
auto
dev_ctx
=
dev_ctxes_
[
out_p
]
;
auto
dev_ctx
=
dev_ctxes_
.
at
(
out_p
)
;
RunAndRecordEvent
(
out_p
,
[
in_tensor
,
out_var
,
dev_ctx
,
out_p
]
{
paddle
::
framework
::
TensorCopy
(
in_tensor
,
out_p
,
*
(
dev_ctx
),
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.cc
浏览文件 @
1d756746
...
...
@@ -30,6 +30,7 @@ const f::DDim kDims = {20, 20};
struct
TestBroadcastOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
...
...
@@ -72,11 +73,17 @@ struct TestBroadcastOpHandle {
void
InitBroadcastOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
local_scopes_
[
j
]
->
Var
(
"out"
);
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"out"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
local
_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
param
_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
op_handle_
.
reset
(
new
BroadcastOpHandle
(
local_scopes_
,
gpu_list_
));
auto
*
in_var_handle
=
new
VarHandle
(
1
,
input_scope_idx
,
"input"
,
gpu_list_
[
input_scope_idx
]);
vars_
.
emplace_back
(
in_var_handle
);
...
...
@@ -105,7 +112,8 @@ struct TestBroadcastOpHandle {
}
void
TestBroadcastLodTensor
(
size_t
input_scope_idx
)
{
auto
in_var
=
local_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_lod_tensor
=
in_var
->
GetMutable
<
f
::
LoDTensor
>
();
in_lod_tensor
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
...
...
@@ -117,6 +125,7 @@ struct TestBroadcastOpHandle {
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
in_lod_tensor
);
in_lod_tensor
->
set_lod
(
lod
);
in_lod_tensor
->
Resize
(
kDims
);
op_handle_
->
Run
(
false
);
...
...
@@ -124,7 +133,8 @@ struct TestBroadcastOpHandle {
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
local_scopes_
[
j
]
->
Var
(
"out"
);
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_tensor
=
out_var
->
Get
<
f
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
out_tensor
.
lod
(),
lod
,
"lod is not equal."
);
...
...
@@ -139,7 +149,8 @@ struct TestBroadcastOpHandle {
}
void
TestBroadcastSelectedRows
(
size_t
input_scope_idx
)
{
auto
in_var
=
local_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
in_selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
...
...
@@ -162,7 +173,8 @@ struct TestBroadcastOpHandle {
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
local_scopes_
[
j
]
->
Var
(
"out"
);
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
&
out_select_rows
=
out_var
->
Get
<
f
::
SelectedRows
>
();
auto
rt
=
out_select_rows
.
value
();
...
...
paddle/fluid/framework/details/cow_ptr.h
浏览文件 @
1d756746
...
...
@@ -14,7 +14,7 @@
#pragma once
#include <memory>
#include <thread>
#include <thread>
// NOLINT
namespace
paddle
{
namespace
framework
{
...
...
@@ -23,7 +23,7 @@ namespace details {
// Change it to thread safe flags if needed.
class
ThreadUnsafeOwnershipFlags
{
public:
ThreadUnsafeOwnershipFlags
(
bool
flag
)
:
flag_
(
flag
)
{}
explicit
ThreadUnsafeOwnershipFlags
(
bool
flag
)
:
flag_
(
flag
)
{}
ThreadUnsafeOwnershipFlags
(
const
ThreadUnsafeOwnershipFlags
&
other
)
=
delete
;
ThreadUnsafeOwnershipFlags
&
operator
=
(
...
...
paddle/fluid/framework/details/gather_op_handle.cc
浏览文件 @
1d756746
...
...
@@ -41,14 +41,19 @@ void GatherOpHandle::RunImpl() {
out_var_handle
=
out_var_handles
.
front
();
}
std
::
vector
<
const
Scope
*>
var_scopes
;
for
(
auto
*
s
:
local_scopes_
)
{
var_scopes
.
emplace_back
(
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
());
}
auto
in_0_handle
=
in_var_handles
[
0
];
auto
pre_in_var
=
local_scopes_
[
in_0_handle
->
scope_idx_
]
->
FindVar
(
in_0_handle
->
name_
);
auto
pre_place
=
in_0_handle
->
place_
;
var_scopes
.
at
(
in_0_handle
->
scope_idx_
)
->
FindVar
(
in_0_handle
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
pre_in_var
);
PADDLE_ENFORCE
(
pre_in_var
->
IsType
<
framework
::
SelectedRows
>
(),
"Currently, gather_op only can gather SelectedRows."
);
auto
pre_place
=
in_0_handle
->
place_
;
PADDLE_ENFORCE_EQ
(
out_var_handle
->
place_
.
which
(),
pre_place
.
which
(),
"The place of input and output should be the same."
);
...
...
@@ -67,7 +72,7 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ
(
in_p
.
which
(),
pre_place
.
which
(),
"Places must be all on CPU or all on CUDA."
);
auto
*
in_var
=
local_scopes_
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
);
var_scopes
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
);
auto
&
in_sr
=
in_var
->
Get
<
framework
::
SelectedRows
>
();
PADDLE_ENFORCE_EQ
(
in_sr
.
value
().
type
(),
pre_in
.
value
().
type
(),
...
...
@@ -86,7 +91,7 @@ void GatherOpHandle::RunImpl() {
// write the output
auto
&
out_place
=
out_var_handle
->
place_
;
auto
out_scope_idx
=
out_var_handle
->
scope_idx_
;
auto
out_var
=
local_scopes_
[
out_scope_idx
]
->
FindVar
(
out_var_handle
->
name_
);
auto
out_var
=
var_scopes
.
at
(
out_scope_idx
)
->
FindVar
(
out_var_handle
->
name_
);
auto
out
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
out
->
set_height
(
pre_in
.
height
());
...
...
paddle/fluid/framework/details/gather_op_handle_test.cc
浏览文件 @
1d756746
...
...
@@ -29,6 +29,7 @@ const f::DDim kDims = {20, 20};
struct
TestGatherOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
...
...
@@ -71,9 +72,14 @@ struct TestGatherOpHandle {
void
InitGatherOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
local_scopes_
[
j
]
->
Var
(
"out"
);
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"input"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
local_scopes_
[
input_scope_idx
]
->
Var
(
"inp
ut"
);
param_scopes_
[
input_scope_idx
]
->
Var
(
"o
ut"
);
op_handle_
.
reset
(
new
GatherOpHandle
(
local_scopes_
,
gpu_list_
));
// add input
...
...
@@ -115,7 +121,8 @@ struct TestGatherOpHandle {
for
(
size_t
input_scope_idx
=
0
;
input_scope_idx
<
gpu_list_
.
size
();
++
input_scope_idx
)
{
auto
in_var
=
local_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
.
at
(
input_scope_idx
)
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
in_selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
...
...
@@ -128,10 +135,11 @@ struct TestGatherOpHandle {
value
->
Resize
(
kDims
);
}
auto
out_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"out"
);
auto
out_var
=
param_scopes_
.
at
(
output_scope_idx
)
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_selected_rows
=
out_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
in_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
.
at
(
output_scope_idx
)
->
Find
Var
(
"input"
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
out_selected_rows
->
mutable_value
()
->
ShareDataWith
(
...
...
@@ -155,7 +163,8 @@ struct TestGatherOpHandle {
f
::
TensorCopy
(
rt
,
cpu_place
,
*
(
ctxs_
[
output_scope_idx
]),
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
j
=
0
;
j
<
f
::
product
(
kDims
);
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
f
::
product
(
kDims
)
*
static_cast
<
int64_t
>
(
gpu_list_
.
size
());
++
j
)
{
ASSERT_NEAR
(
ct
[
j
],
send_vector
[
j
%
send_vector
.
size
()],
1e-5
);
}
}
...
...
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
浏览文件 @
1d756746
...
...
@@ -43,21 +43,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
int
dtype
=
-
1
;
size_t
numel
=
0
;
std
::
vector
<
LoDTensor
>
lod_tensors
;
std
::
vector
<
const
LoDTensor
*
>
lod_tensors
;
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
*
s
=
local_scopes_
[
i
];
auto
&
local_scope
=
*
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
auto
&
lod_tensor
=
local_scope
.
FindVar
(
var_name
)
->
Get
<
LoDTensor
>
();
lod_tensors
.
emplace_back
(
lod_tensor
);
lod_tensors
.
emplace_back
(
&
lod_tensor
);
}
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
.
place
()))
{
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
std
::
vector
<
std
::
function
<
void
()
>>
all_reduce_calls
;
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
auto
&
lod_tensor
=
lod_tensors
[
i
];
auto
&
lod_tensor
=
*
lod_tensors
[
i
];
void
*
buffer
=
const_cast
<
void
*>
(
lod_tensor
.
data
<
void
>
());
if
(
dtype
==
-
1
)
{
...
...
@@ -93,7 +93,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
// Reduce All Tensor to trg in CPU
ReduceLoDTensor
func
(
lod_tensors
,
&
trg
);
VisitDataType
(
ToDataType
(
lod_tensors
[
0
]
.
type
()),
func
);
VisitDataType
(
ToDataType
(
lod_tensors
[
0
]
->
type
()),
func
);
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
&
scope
=
...
...
paddle/fluid/framework/details/op_registry.h
浏览文件 @
1d756746
...
...
@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
...
...
paddle/fluid/framework/details/reduce_and_gather.h
浏览文件 @
1d756746
...
...
@@ -24,23 +24,23 @@ namespace framework {
namespace
details
{
struct
ReduceLoDTensor
{
const
std
::
vector
<
LoDTensor
>
&
src_tensors_
;
const
std
::
vector
<
const
LoDTensor
*
>
&
src_tensors_
;
LoDTensor
&
dst_tensor_
;
ReduceLoDTensor
(
const
std
::
vector
<
LoDTensor
>
&
src
,
LoDTensor
*
dst
)
ReduceLoDTensor
(
const
std
::
vector
<
const
LoDTensor
*
>
&
src
,
LoDTensor
*
dst
)
:
src_tensors_
(
src
),
dst_tensor_
(
*
dst
)
{}
template
<
typename
T
>
void
operator
()()
const
{
PADDLE_ENFORCE
(
!
src_tensors_
.
empty
());
auto
&
t0
=
src_tensors_
[
0
];
auto
&
t0
=
*
src_tensors_
[
0
];
PADDLE_ENFORCE_NE
(
t0
.
numel
(),
0
);
dst_tensor_
.
Resize
(
t0
.
dims
());
T
*
dst
=
dst_tensor_
.
mutable_data
<
T
>
(
platform
::
CPUPlace
());
std
::
copy
(
t0
.
data
<
T
>
(),
t0
.
data
<
T
>
()
+
t0
.
numel
(),
dst
);
for
(
size_t
i
=
1
;
i
<
src_tensors_
.
size
();
++
i
)
{
auto
&
t
=
src_tensors_
[
i
];
auto
&
t
=
*
src_tensors_
[
i
];
PADDLE_ENFORCE_EQ
(
t
.
dims
(),
t0
.
dims
());
PADDLE_ENFORCE_EQ
(
t
.
type
(),
t0
.
type
());
std
::
transform
(
t
.
data
<
T
>
(),
t
.
data
<
T
>
()
+
t
.
numel
(),
dst
,
dst
,
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
1d756746
...
...
@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -21,85 +23,84 @@ namespace details {
void
ReduceOpHandle
::
RunImpl
()
{
// the input and output may have dummy var.
std
::
vector
<
VarHandle
*>
in_var_handles
=
GetValidVarHandles
(
inputs_
);
std
::
vector
<
VarHandle
*>
out_var_handles
=
GetValidVarHandles
(
outputs_
);
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
inputs_
);
PADDLE_ENFORCE_EQ
(
in_var_handles
.
size
(),
places_
.
size
(),
"The number of output should equal to the number of places."
);
PADDLE_ENFORCE_EQ
(
out_var_handles
.
size
(),
1
,
"The number of output should be one."
);
// Wait input done, this Wait is asynchronous operation
WaitEvents
(
in_var_handles
);
VarHandle
*
out_var_handle
;
{
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
PADDLE_ENFORCE_EQ
(
out_var_handles
.
size
(),
1
,
"The number of output should be one."
);
out_var_handle
=
out_var_handles
.
front
();
}
// check in the same place
auto
in_0_handle
=
in_var_handles
[
0
];
auto
pre_place
=
in_0_handle
->
place_
;
std
::
vector
<
const
Scope
*>
var_scopes
;
for
(
auto
*
s
:
local_scopes_
)
{
var_scopes
.
emplace_back
(
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
());
}
auto
pre_in_var
=
var_scopes
.
at
(
in_0_handle
->
scope_idx_
)
->
FindVar
(
in_0_handle
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
pre_in_var
);
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated
(
in_var_handles
);
auto
pre_place
=
in_0_handle
->
place_
;
std
::
vector
<
platform
::
Place
>
in_places
;
auto
pre_in_tensor
=
VariableVisitor
::
GetMutableTensor
(
pre_in_var
);
for
(
auto
*
in_handle
:
in_var_handles
)
{
auto
in_p
=
in_handle
->
place_
;
PADDLE_ENFORCE_EQ
(
in_p
.
which
(),
pre_place
.
which
(),
"Places must be all on CPU or all on CUDA."
);
in_places
.
emplace_back
(
in_p
);
}
auto
out_var
=
local_scopes_
[
out_var_handles
[
0
]
->
scope_idx_
]
->
FindVar
(
out_var_handles
[
0
]
->
name_
);
auto
in_var
=
var_scopes
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
pre_in_var
=
local_scopes_
[
in_0_handle
->
scope_idx_
]
->
FindVar
(
in_0_handle
->
name_
);
if
(
pre_in_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
pre_in
=
pre_in_var
->
Get
<
framework
::
SelectedRows
>
();
std
::
vector
<
const
SelectedRows
*>
in_selected_rows
;
auto
in_tensor
=
VariableVisitor
::
GetMutableTensor
(
in_var
);
PADDLE_ENFORCE_EQ
(
in_tensor
.
type
(),
pre_in_tensor
.
type
(),
"The type of input is not consistent."
);
}
for
(
auto
*
in_handle
:
in_var_handles
)
{
auto
in_var
=
local_scopes_
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
);
auto
&
in_sr
=
in_var
->
Get
<
framework
::
SelectedRows
>
();
auto
out_var
=
var_scopes
.
at
(
out_var_handle
->
scope_idx_
)
->
FindVar
(
out_var_handle
->
name_
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
PADDLE_ENFORCE_EQ
(
in_sr
.
value
().
type
(),
pre_in
.
value
().
type
(),
"The type of input is not consistent."
);
if
(
pre_in_var
->
IsType
<
framework
::
SelectedRows
>
())
{
std
::
vector
<
const
SelectedRows
*>
in_selected_rows
=
GetInputValues
<
SelectedRows
>
(
in_var_handles
,
var_scopes
);
in_selected_rows
.
emplace_back
(
&
in_sr
);
}
auto
trg
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
GatherSelectedRows
(
in_selected_rows
,
in_places
,
dev_ctxes_
,
out_var_handles
[
0
]
->
place_
,
trg
);
out_var_handle
->
place_
,
out_var
->
GetMutable
<
framework
::
SelectedRows
>
());
}
else
{
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
std
::
vector
<
LoDTensor
>
lod_tensors
;
// can be refined
for
(
auto
*
in_handle
:
in_var_handles
)
{
auto
in_var
=
local_scopes_
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
);
auto
&
in_sr
=
in_var
->
Get
<
framework
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
in_sr
.
type
(),
pre_in
.
type
(),
"The type of input is not consistent."
);
lod_tensors
.
emplace_back
(
in_sr
);
}
auto
trg
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
();
trg
->
Resize
(
pre_in
.
dims
());
trg
->
mutable_data
(
out_var_handles
[
0
]
->
place_
,
pre_in
.
type
());
std
::
vector
<
const
LoDTensor
*>
lod_tensors
=
GetInputValues
<
LoDTensor
>
(
in_var_handles
,
var_scopes
);
if
(
paddle
::
platform
::
is_cpu_place
(
pre_place
))
{
ReduceLoDTensor
func
(
lod_tensors
,
trg
);
VisitDataType
(
ToDataType
(
lod_tensors
[
0
].
type
()),
func
);
ReduceLoDTensor
func
(
lod_tensors
,
out_var
->
GetMutable
<
framework
::
LoDTensor
>
());
VisitDataType
(
ToDataType
(
lod_tensors
[
0
]
->
type
()),
func
);
}
else
if
(
paddle
::
platform
::
is_gpu_place
(
pre_place
))
{
#ifdef PADDLE_WITH_CUDA
auto
out_p
=
out_var_handles
[
0
]
->
place_
;
int
root
=
boost
::
get
<
platform
::
CUDAPlace
>
(
out_p
).
device
;
auto
pre_in
=
pre_in_var
->
Get
<
framework
::
LoDTensor
>
();
VariableVisitor
::
ShareDimsAndLoD
(
*
pre_in_var
,
out_var
);
VariableVisitor
::
GetMutableTensor
(
out_var
).
mutable_data
(
out_var_handle
->
place_
,
pre_in
.
type
());
auto
out_p
=
out_var_handle
->
place_
;
int
root
=
boost
::
get
<
platform
::
CUDAPlace
>
(
out_p
).
device
;
std
::
vector
<
std
::
function
<
void
()
>>
all_reduce_calls
;
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
var_scopes
.
size
();
++
i
)
{
auto
&
p
=
in_places
[
i
];
auto
&
lod_tensor
=
lod_tensors
[
i
];
auto
&
lod_tensor
=
*
lod_tensors
[
i
];
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
...
...
@@ -109,14 +110,16 @@ void ReduceOpHandle::RunImpl() {
void
*
buffer
=
const_cast
<
void
*>
(
lod_tensor
.
data
<
void
>
());
void
*
recvbuffer
=
nullptr
;
if
(
root
==
dev_id
)
{
recvbuffer
=
trg
->
mutable_data
(
out_var_handles
[
0
]
->
place_
);
recvbuffer
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
()
->
mutable_data
(
out_var_handle
->
place_
);
}
int
type
=
platform
::
ToNCCLDataType
(
lod_tensor
.
type
());
all_reduce_calls
.
emplace_back
([
=
]
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
buffer
,
recvbuffer
,
static_cast
<
size_t
>
(
lod_tensor
.
numel
()),
platform
::
ToNCCLDataType
(
lod_tensor
.
type
()),
ncclSum
,
root
,
comm
,
stream
));
static_cast
<
ncclDataType_t
>
(
type
),
ncclSum
,
root
,
comm
,
stream
));
});
}
...
...
@@ -135,26 +138,31 @@ void ReduceOpHandle::RunImpl() {
}
}
void
ReduceOpHandle
::
WaitEvents
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handles
)
{
if
(
in_var_handles
[
0
]
->
generated_op_
)
{
for
(
auto
*
in
:
in_var_handles
)
{
in_var_handles
[
0
]
->
generated_op_
->
Wait
(
dev_ctxes_
[
in
->
place_
]);
}
template
<
typename
T
>
std
::
vector
<
const
T
*>
ReduceOpHandle
::
GetInputValues
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handles
,
const
std
::
vector
<
const
Scope
*>
&
var_scopes
)
const
{
std
::
vector
<
const
T
*>
in_selected_rows
;
for
(
auto
*
in_handle
:
in_var_handles
)
{
auto
&
in_sr
=
var_scopes
.
at
(
in_handle
->
scope_idx_
)
->
FindVar
(
in_handle
->
name_
)
->
Get
<
T
>
();
in_selected_rows
.
emplace_back
(
&
in_sr
);
}
return
in_selected_rows
;
}
std
::
vector
<
VarHandle
*>
ReduceOpHandle
::
GetValidVarHandles
(
const
std
::
vector
<
VarHandle
Base
*>
&
input
s
)
{
std
::
vector
<
VarHandle
*>
in_var_handles
;
for
(
auto
*
in
:
inputs
)
{
auto
*
in_handle
=
dynamic_cast
<
VarHandle
*>
(
in
);
if
(
in_handle
)
{
in_var_handles
.
push_back
(
in_handle
);
void
ReduceOpHandle
::
WaitInputVarGenerated
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handle
s
)
{
for
(
auto
*
in
:
in_var_handles
)
{
if
(
in
->
generated_op_
)
{
for
(
auto
pair
:
dev_ctxes_
)
{
in
->
generated_op_
->
Wait
(
pair
.
second
);
}
}
}
return
in_var_handles
;
}
std
::
string
ReduceOpHandle
::
Name
()
const
{
return
"reduce"
;
}
}
// namespace details
}
// namespace framework
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
1d756746
...
...
@@ -59,10 +59,13 @@ struct ReduceOpHandle : public OpHandleBase {
protected:
void
RunImpl
()
override
;
std
::
vector
<
VarHandle
*>
GetValidVarHandles
(
const
std
::
vector
<
VarHandleBase
*>
&
inputs
);
void
WaitEvents
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handles
);
void
WaitInputVarGenerated
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handles
);
template
<
typename
T
>
std
::
vector
<
const
T
*>
GetInputValues
(
const
std
::
vector
<
VarHandle
*>
&
in_var_handles
,
const
std
::
vector
<
const
Scope
*>
&
var_scopes
)
const
;
};
}
// namespace details
...
...
paddle/fluid/framework/details/reduce_op_handle_test.cc
浏览文件 @
1d756746
...
...
@@ -14,7 +14,6 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
...
...
@@ -30,6 +29,7 @@ struct TestReduceOpHandle {
bool
use_gpu_
;
Scope
g_scope_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
std
::
vector
<
p
::
Place
>
gpu_list_
;
...
...
@@ -83,12 +83,18 @@ struct TestReduceOpHandle {
}
}
void
InitReduceOp
(
size_t
input_scope_idx
)
{
void
InitReduceOp
(
size_t
out_scope_idx
)
{
// init scope
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
local_scopes_
[
j
]
->
Var
(
"out"
);
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"input"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
local_scopes_
[
input_scope_idx
]
->
Var
(
"inp
ut"
);
param_scopes_
[
out_scope_idx
]
->
Var
(
"o
ut"
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
...
...
@@ -106,6 +112,7 @@ struct TestReduceOpHandle {
#endif
}
// init op handle
// add input
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
if
(
!
use_gpu_
)
{
...
...
@@ -126,7 +133,7 @@ struct TestReduceOpHandle {
// add output
auto
*
out_var_handle
=
new
VarHandle
(
2
,
input_scope_idx
,
"out"
,
gpu_list_
[
inp
ut_scope_idx
]);
new
VarHandle
(
2
,
out_scope_idx
,
"out"
,
gpu_list_
[
o
ut_scope_idx
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
...
...
@@ -148,7 +155,8 @@ struct TestReduceOpHandle {
for
(
size_t
input_scope_idx
=
0
;
input_scope_idx
<
gpu_list_
.
size
();
++
input_scope_idx
)
{
auto
in_var
=
local_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
in_selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
...
...
@@ -161,10 +169,11 @@ struct TestReduceOpHandle {
value
->
Resize
(
kDims
);
}
auto
out_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"out"
);
auto
out_var
=
param_scopes_
[
output_scope_idx
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_selected_rows
=
out_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
in_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
output_scope_idx
]
->
Find
Var
(
"input"
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
out_selected_rows
->
mutable_value
()
->
ShareDataWith
(
...
...
@@ -202,7 +211,8 @@ struct TestReduceOpHandle {
for
(
size_t
input_scope_idx
=
0
;
input_scope_idx
<
gpu_list_
.
size
();
++
input_scope_idx
)
{
auto
in_var
=
local_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_lod_tensor
=
in_var
->
GetMutable
<
f
::
LoDTensor
>
();
in_lod_tensor
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
in_lod_tensor
->
set_lod
(
lod
);
...
...
@@ -211,10 +221,11 @@ struct TestReduceOpHandle {
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
in_lod_tensor
);
}
auto
out_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"out"
);
auto
out_var
=
param_scopes_
[
output_scope_idx
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_lodtensor
=
out_var
->
GetMutable
<
f
::
LoDTensor
>
();
auto
in_var
=
local_scopes_
[
output_scope_idx
]
->
Var
(
"input"
);
auto
in_var
=
param_scopes_
[
output_scope_idx
]
->
Find
Var
(
"input"
);
auto
in_lodtensor
=
in_var
->
Get
<
f
::
LoDTensor
>
();
out_lodtensor
->
ShareDataWith
(
in_lodtensor
);
...
...
@@ -239,34 +250,34 @@ struct TestReduceOpHandle {
TEST
(
ReduceTester
,
TestCPUReduceTestSelectedRows
)
{
TestReduceOpHandle
test_op
;
size_t
inp
ut_scope_idx
=
0
;
size_t
o
ut_scope_idx
=
0
;
test_op
.
InitCtxOnGpu
(
false
);
test_op
.
InitReduceOp
(
inp
ut_scope_idx
);
test_op
.
TestReduceSelectedRows
(
inp
ut_scope_idx
);
test_op
.
InitReduceOp
(
o
ut_scope_idx
);
test_op
.
TestReduceSelectedRows
(
o
ut_scope_idx
);
}
TEST
(
ReduceTester
,
TestCPUReduceTestLodTensor
)
{
TestReduceOpHandle
test_op
;
size_t
inp
ut_scope_idx
=
0
;
size_t
o
ut_scope_idx
=
0
;
test_op
.
InitCtxOnGpu
(
false
);
test_op
.
InitReduceOp
(
inp
ut_scope_idx
);
test_op
.
TestReduceLodTensors
(
inp
ut_scope_idx
);
test_op
.
InitReduceOp
(
o
ut_scope_idx
);
test_op
.
TestReduceLodTensors
(
o
ut_scope_idx
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
ReduceTester
,
TestGPUReduceTestSelectedRows
)
{
TestReduceOpHandle
test_op
;
size_t
inp
ut_scope_idx
=
0
;
size_t
o
ut_scope_idx
=
0
;
test_op
.
InitCtxOnGpu
(
true
);
test_op
.
InitReduceOp
(
inp
ut_scope_idx
);
test_op
.
TestReduceSelectedRows
(
inp
ut_scope_idx
);
test_op
.
InitReduceOp
(
o
ut_scope_idx
);
test_op
.
TestReduceSelectedRows
(
o
ut_scope_idx
);
}
TEST
(
ReduceTester
,
TestGPUReduceTestLodTensor
)
{
TestReduceOpHandle
test_op
;
size_t
inp
ut_scope_idx
=
0
;
size_t
o
ut_scope_idx
=
0
;
test_op
.
InitCtxOnGpu
(
true
);
test_op
.
InitReduceOp
(
inp
ut_scope_idx
);
test_op
.
TestReduceLodTensors
(
inp
ut_scope_idx
);
test_op
.
InitReduceOp
(
o
ut_scope_idx
);
test_op
.
TestReduceLodTensors
(
o
ut_scope_idx
);
}
#endif
...
...
paddle/fluid/framework/op_registry_test.cc
浏览文件 @
1d756746
...
...
@@ -202,8 +202,9 @@ class CosineOpComplete : public paddle::framework::CosineOp {
};
TEST
(
OperatorRegistrar
,
Test
)
{
using
namespace
paddle
::
framework
;
OperatorRegistrar
<
CosineOpComplete
,
CosineOpProtoAndCheckerMaker
>
reg
(
"cos"
);
paddle
::
framework
::
OperatorRegistrar
<
CosineOpComplete
,
paddle
::
framework
::
CosineOpProtoAndCheckerMaker
>
reg
(
"cos"
);
}
namespace
paddle
{
...
...
paddle/fluid/framework/operator_test.cc
浏览文件 @
1d756746
...
...
@@ -226,10 +226,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
// test with multi inputs
TEST
(
OpKernel
,
multi_inputs
)
{
using
namespace
paddle
::
framework
;
paddle
::
framework
::
InitDevices
(
true
);
proto
::
OpDesc
op_desc
;
p
addle
::
framework
::
p
roto
::
OpDesc
op_desc
;
op_desc
.
set_type
(
"op_multi_inputs_with_kernel"
);
BuildVar
(
"xs"
,
{
"x0"
,
"x1"
,
"x2"
},
op_desc
.
add_inputs
());
...
...
@@ -243,12 +241,12 @@ TEST(OpKernel, multi_inputs) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
framework
::
Scope
scope
;
scope
.
Var
(
"x0"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"x1"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"x2"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"k0"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"y0"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"y1"
)
->
GetMutable
<
LoDTensor
>
();
scope
.
Var
(
"x0"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
scope
.
Var
(
"x1"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
scope
.
Var
(
"x2"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
scope
.
Var
(
"k0"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
scope
.
Var
(
"y0"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
scope
.
Var
(
"y1"
)
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
op_desc
);
op
->
Run
(
scope
,
cpu_place
);
...
...
paddle/fluid/framework/program_desc.cc
浏览文件 @
1d756746
...
...
@@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
return
blocks_
.
back
().
get
();
}
proto
::
ProgramDesc
*
ProgramDesc
::
Proto
()
{
void
ProgramDesc
::
Flush
()
{
for
(
auto
&
block
:
blocks_
)
{
block
->
Flush
();
}
}
proto
::
ProgramDesc
*
ProgramDesc
::
Proto
()
{
Flush
();
return
&
desc_
;
}
...
...
paddle/fluid/framework/program_desc.h
浏览文件 @
1d756746
...
...
@@ -51,6 +51,8 @@ class ProgramDesc {
size_t
Size
()
const
{
return
blocks_
.
size
();
}
void
Flush
();
proto
::
ProgramDesc
*
Proto
();
// The output variable of feed_op is referenced as feed_target.
...
...
paddle/fluid/framework/threadpool_test.cc
浏览文件 @
1d756746
...
...
@@ -15,14 +15,14 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <atomic>
#include "threadpool.h"
#include "
paddle/fluid/framework/
threadpool.h"
namespace
framework
=
paddle
::
framework
;
void
do_sum
(
framework
::
ThreadPool
*
pool
,
std
::
atomic
<
int
>
&
sum
,
int
cnt
)
{
void
do_sum
(
framework
::
ThreadPool
*
pool
,
std
::
atomic
<
int
>
*
sum
,
int
cnt
)
{
std
::
vector
<
std
::
future
<
void
>>
fs
;
for
(
int
i
=
0
;
i
<
cnt
;
++
i
)
{
fs
.
push_back
(
framework
::
Async
([
&
sum
]()
{
sum
.
fetch_add
(
1
);
}));
fs
.
push_back
(
framework
::
Async
([
sum
]()
{
sum
->
fetch_add
(
1
);
}));
}
}
...
...
@@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) {
int
n
=
50
;
// sum = (n * (n + 1)) / 2
for
(
int
i
=
1
;
i
<=
n
;
++
i
)
{
std
::
thread
t
(
do_sum
,
pool
,
std
::
ref
(
sum
)
,
i
);
std
::
thread
t
(
do_sum
,
pool
,
&
sum
,
i
);
threads
.
push_back
(
std
::
move
(
t
));
}
for
(
auto
&
t
:
threads
)
{
...
...
paddle/fluid/inference/io.cc
浏览文件 @
1d756746
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/io.h"
#include <algorithm>
#include <fstream>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
...
...
@@ -27,14 +28,14 @@ namespace inference {
// linking the inference shared library.
void
Init
(
bool
init_p2p
)
{
framework
::
InitDevices
(
init_p2p
);
}
void
ReadBinaryFile
(
const
std
::
string
&
filename
,
std
::
string
&
contents
)
{
void
ReadBinaryFile
(
const
std
::
string
&
filename
,
std
::
string
*
contents
)
{
std
::
ifstream
fin
(
filename
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
filename
);
fin
.
seekg
(
0
,
std
::
ios
::
end
);
contents
.
clear
();
contents
.
resize
(
fin
.
tellg
());
contents
->
clear
();
contents
->
resize
(
fin
.
tellg
());
fin
.
seekg
(
0
,
std
::
ios
::
beg
);
fin
.
read
(
&
contents
[
0
],
contents
.
size
());
fin
.
read
(
&
(
contents
->
at
(
0
)),
contents
->
size
());
fin
.
close
();
}
...
...
@@ -47,7 +48,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return
false
;
}
void
LoadPersistables
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
void
LoadPersistables
(
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
framework
::
ProgramDesc
&
main_program
,
const
std
::
string
&
dirname
,
const
std
::
string
&
param_filename
)
{
...
...
@@ -92,18 +93,18 @@ void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
op
->
CheckAttrs
();
}
executor
.
Run
(
*
load_program
,
&
scope
,
0
,
true
,
true
);
executor
->
Run
(
*
load_program
,
scope
,
0
,
true
,
true
);
delete
load_program
;
}
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
std
::
string
&
dirname
)
{
std
::
string
model_filename
=
dirname
+
"/__model__"
;
std
::
string
program_desc_str
;
VLOG
(
3
)
<<
"loading model from "
<<
model_filename
;
ReadBinaryFile
(
model_filename
,
program_desc_str
);
ReadBinaryFile
(
model_filename
,
&
program_desc_str
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
main_program
(
new
framework
::
ProgramDesc
(
program_desc_str
));
...
...
@@ -113,11 +114,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
}
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
std
::
string
&
prog_filename
,
const
std
::
string
&
param_filename
)
{
std
::
string
model_filename
=
prog_filename
;
std
::
string
program_desc_str
;
ReadBinaryFile
(
model_filename
,
program_desc_str
);
ReadBinaryFile
(
model_filename
,
&
program_desc_str
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
main_program
(
new
framework
::
ProgramDesc
(
program_desc_str
));
...
...
paddle/fluid/inference/io.h
浏览文件 @
1d756746
...
...
@@ -27,17 +27,17 @@ namespace inference {
void
Init
(
bool
init_p2p
);
void
LoadPersistables
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
void
LoadPersistables
(
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
framework
::
ProgramDesc
&
main_program
,
const
std
::
string
&
dirname
,
const
std
::
string
&
param_filename
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
std
::
string
&
dirname
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
&
executor
,
framework
::
Scope
&
scope
,
std
::
unique_ptr
<
framework
::
ProgramDesc
>
Load
(
framework
::
Executor
*
executor
,
framework
::
Scope
*
scope
,
const
std
::
string
&
prog_filename
,
const
std
::
string
&
param_filename
);
...
...
paddle/fluid/inference/tests/test_helper.h
浏览文件 @
1d756746
...
...
@@ -133,12 +133,12 @@ void TestInference(const std::string& dirname,
std
::
string
prog_filename
=
"__model_combined__"
;
std
::
string
param_filename
=
"__params_combined__"
;
inference_program
=
paddle
::
inference
::
Load
(
executor
,
*
scope
,
dirname
+
"/"
+
prog_filename
,
&
executor
,
scope
,
dirname
+
"/"
+
prog_filename
,
dirname
+
"/"
+
param_filename
);
}
else
{
// Parameters are saved in separate files sited in the specified
// `dirname`.
inference_program
=
paddle
::
inference
::
Load
(
executor
,
*
scope
,
dirname
);
inference_program
=
paddle
::
inference
::
Load
(
&
executor
,
scope
,
dirname
);
}
}
// Disable the profiler and print the timing information
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
1d756746
...
...
@@ -163,7 +163,12 @@ function(op_library TARGET)
# pybind USE_OP
if
(
${
pybind_flag
}
EQUAL 0
)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
if
(
${
TARGET
}
STREQUAL
"activation"
)
file
(
APPEND
${
pybind_file
}
"USE_OP(relu);
\n
"
)
else
()
file
(
APPEND
${
pybind_file
}
"USE_OP(
${
TARGET
}
);
\n
"
)
endif
()
endif
()
endfunction
()
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
1d756746
此差异已折叠。
点击以展开。
paddle/fluid/operators/activation_op.cu
浏览文件 @
1d756746
...
...
@@ -9,7 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/float16.h"
...
...
paddle/fluid/operators/activation_op.h
浏览文件 @
1d756746
...
...
@@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
...
...
@@ -25,6 +28,16 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
/* Use ugly global variable, for the using in python layer side
Please refer to the layer_helper.py and get the details.
*/
static
std
::
unordered_set
<
std
::
string
>
InplaceOpSet
=
{
"sigmoid"
,
"exp"
,
"relu"
,
"tanh"
,
"sqrt"
,
"ceil"
,
"floor"
,
"reciprocal"
,
"relu6"
,
"soft_relu"
,
"hard_sigmoid"
,
};
static
bool
IsInplace
(
std
::
string
op
)
{
return
InplaceOpSet
.
count
(
op
);
}
template
<
typename
DeviceContext
,
typename
Functor
>
class
ActivationKernel
:
public
framework
::
OpKernel
<
typename
Functor
::
ELEMENT_TYPE
>
{
...
...
@@ -60,7 +73,6 @@ class ActivationGradKernel
public:
using
T
=
typename
Functor
::
ELEMENT_TYPE
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
X
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
Out
=
context
.
Input
<
framework
::
Tensor
>
(
"Out"
);
auto
*
dOut
=
context
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
...
...
@@ -68,7 +80,6 @@ class ActivationGradKernel
dX
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
dout
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dOut
);
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
auto
out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
Out
);
auto
dx
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
auto
*
place
=
...
...
@@ -78,7 +89,16 @@ class ActivationGradKernel
for
(
auto
&
attr
:
attrs
)
{
*
attr
.
second
=
context
.
Attr
<
float
>
(
attr
.
first
);
}
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
bool
inplace
=
functor
.
Inplace
();
if
(
!
inplace
)
{
auto
*
X
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
else
{
VLOG
(
10
)
<<
" Inplace activation "
;
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
}
};
...
...
@@ -89,6 +109,14 @@ struct BaseActivationFunctor {
using
AttrPair
=
std
::
vector
<
std
::
pair
<
const
char
*
,
float
*>>
;
AttrPair
GetAttrs
()
{
return
AttrPair
();
}
/* NOTE(*): Output reuse X memory if X is not dependented by its Gradient.
For example, sigmoid op's gradient didn't involve x, so its output can
reuse
input memory. But abs op's gradient use x, it can not be inplaced.
gradient did use x.
*/
bool
Inplace
()
const
{
return
false
;
}
};
// sigmoid(x) = 1 / (1 + exp(-x))
...
...
@@ -102,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
SigmoidGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"sigmoid"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
@@ -156,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
ExpGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"exp"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
@@ -174,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
ReluGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"relu"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
(
x
>
static_cast
<
T
>
(
0
)).
template
cast
<
T
>();
dx
.
device
(
d
)
=
dout
*
(
out
>
static_cast
<
T
>
(
0
)).
template
cast
<
T
>();
}
};
...
...
@@ -192,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
TanhGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"tanh"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
@@ -297,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
SqrtGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"sqrt"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
@@ -316,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
ZeroGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"ceil"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0
)
/
x
;
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0
)
/
out
;
}
};
...
...
@@ -432,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
template
<
typename
T
>
struct
ReciprocalGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
bool
Inplace
()
const
{
return
IsInplace
(
"reciprocal"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
@@ -531,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
typename
BaseActivationFunctor
<
T
>::
AttrPair
GetAttrs
()
{
return
{{
"threshold"
,
&
threshold
}};
}
bool
Inplace
()
const
{
return
IsInplace
(
"relu6"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
((
x
>
static_cast
<
T
>
(
0
))
*
(
x
<
static_cast
<
T
>
(
threshold
)))
.
template
cast
<
T
>();
dx
.
device
(
d
)
=
dout
*
((
out
>
static_cast
<
T
>
(
0
))
*
(
out
<
static_cast
<
T
>
(
threshold
)))
.
template
cast
<
T
>();
}
};
...
...
@@ -611,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
typename
BaseActivationFunctor
<
T
>::
AttrPair
GetAttrs
()
{
return
{{
"threshold"
,
&
threshold
}};
}
bool
Inplace
()
const
{
return
IsInplace
(
"soft_relu"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
auto
tmp
=
static_cast
<
T
>
(
threshold
);
auto
temp
=
((
x
>
-
tmp
)
*
(
x
<
tmp
)).
template
cast
<
T
>().
eval
();
auto
temp
=
((
out
>
-
tmp
)
*
(
out
<
tmp
)).
template
cast
<
T
>().
eval
();
dx
.
device
(
d
)
=
dout
*
(
static_cast
<
T
>
(
1
)
-
(
-
out
).
exp
())
*
temp
;
}
};
...
...
@@ -791,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
typename
BaseActivationFunctor
<
T
>::
AttrPair
GetAttrs
()
{
return
{{
"slope"
,
&
slope
},
{
"offset"
,
&
offset
}};
}
bool
Inplace
()
{
return
IsInplace
(
"hard_sigmoid"
);
}
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
...
...
paddle/fluid/operators/dropout_op.cu
浏览文件 @
1d756746
...
...
@@ -24,12 +24,34 @@ namespace paddle {
namespace
operators
{
template
<
typename
T
>
__global__
void
RandomGenerator
(
const
size_t
n
,
const
T
*
src
,
const
T
*
cpu_mask_data
,
T
*
mask_data
,
T
*
dst
)
{
__global__
void
RandomGenerator
(
const
size_t
n
,
const
int
seed
,
const
float
dropout_prob
,
const
T
*
src
,
T
*
mask_data
,
T
*
dst
)
{
thrust
::
minstd_rand
rng
;
rng
.
seed
(
seed
);
thrust
::
uniform_real_distribution
<
float
>
dist
(
0
,
1
);
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
0
;
T
mask
;
T
dest
;
for
(;
idx
<
n
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
mask_data
[
idx
]
=
cpu_mask_data
[
idx
];
dst
[
idx
]
=
mask_data
[
idx
]
*
src
[
idx
];
T
s
=
src
[
idx
];
if
(
step_size
==
0
)
{
rng
.
discard
(
idx
);
step_size
=
blockDim
.
x
*
gridDim
.
x
;
}
else
{
rng
.
discard
(
step_size
);
}
if
(
dist
(
rng
)
<
dropout_prob
)
{
mask
=
static_cast
<
T
>
(
0
);
}
else
{
mask
=
static_cast
<
T
>
(
1
);
}
dest
=
s
*
mask
;
mask_data
[
idx
]
=
mask
;
dst
[
idx
]
=
dest
;
}
}
...
...
@@ -56,27 +78,15 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
std
::
random_device
rnd
;
int
seed
=
context
.
Attr
<
bool
>
(
"fix_seed"
)
?
context
.
Attr
<
int
>
(
"seed"
)
:
rnd
();
std
::
minstd_rand
engine
;
engine
.
seed
(
seed
);
std
::
uniform_real_distribution
<
float
>
dist
(
0
,
1
);
framework
::
Vector
<
T
>
cpu_mask
(
size
);
for
(
size_t
i
=
0
;
i
<
size
;
++
i
)
{
if
(
dist
(
engine
)
<
dropout_prob
)
{
cpu_mask
[
i
]
=
static_cast
<
T
>
(
0
);
}
else
{
cpu_mask
[
i
]
=
static_cast
<
T
>
(
1
);
}
}
int
threads
=
512
;
int
grid
=
(
x
->
numel
()
+
threads
-
1
)
/
threads
;
RandomGenerator
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
x_data
,
cpu_mask
.
CUDAData
(
context
.
GetPlace
()),
mask_data
,
y_data
);
size
,
seed
,
dropout_prob
,
x_data
,
mask_data
,
y_data
);
}
else
{
auto
X
=
Eigen
Vector
<
T
>::
Flatten
(
*
x
);
auto
Y
=
Eigen
Vector
<
T
>::
Flatten
(
*
y
);
auto
X
=
Eigen
Matrix
<
T
>::
Reshape
(
*
x
,
1
);
auto
Y
=
Eigen
Matrix
<
T
>::
Reshape
(
*
y
,
1
);
Y
.
device
(
place
)
=
X
*
static_cast
<
T
>
(
1.0
f
-
dropout_prob
);
}
}
...
...
@@ -89,8 +99,6 @@ namespace ops = paddle::operators;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
dropout
,
ops
::
GPUDropoutKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
GPUDropoutKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
GPUDropoutKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
dropout_grad
,
ops
::
DropoutGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
DropoutGradKernel
<
plat
::
CUDADeviceContext
,
float
>
);
paddle/fluid/operators/dropout_op.h
浏览文件 @
1d756746
...
...
@@ -24,7 +24,7 @@ namespace operators {
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
Eigen
Vector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
using
Eigen
Matrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
DeviceContext
,
typename
T
>
class
CPUDropoutKernel
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -60,8 +60,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
}
}
}
else
{
auto
X
=
Eigen
Vector
<
T
>::
Flatten
(
*
x
);
auto
Y
=
Eigen
Vector
<
T
>::
Flatten
(
*
y
);
auto
X
=
Eigen
Matrix
<
T
>::
Reshape
(
*
x
,
1
);
auto
Y
=
Eigen
Matrix
<
T
>::
Reshape
(
*
y
,
1
);
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
Y
.
device
(
place
)
=
X
*
(
1.0
f
-
dropout_prob
);
...
...
@@ -81,9 +81,9 @@ class DropoutGradKernel : public framework::OpKernel<T> {
auto
*
mask
=
context
.
Input
<
Tensor
>
(
"Mask"
);
grad_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
M
=
Eigen
Vector
<
T
>::
Flatten
(
*
mask
);
auto
dX
=
Eigen
Vector
<
T
>::
Flatten
(
*
grad_x
);
auto
dY
=
Eigen
Vector
<
T
>::
Flatten
(
*
grad_y
);
auto
M
=
Eigen
Matrix
<
T
>::
Reshape
(
*
mask
,
1
);
auto
dX
=
Eigen
Matrix
<
T
>::
Reshape
(
*
grad_x
,
1
);
auto
dY
=
Eigen
Matrix
<
T
>::
Reshape
(
*
grad_y
,
1
);
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
...
...
paddle/fluid/operators/dropout_op_test.cc
浏览文件 @
1d756746
...
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <iostream>
#include <string>
#include <thread> // NOLINT
...
...
@@ -33,16 +32,14 @@ namespace m = paddle::operators::math;
USE_OP
(
dropout
);
static
paddle
::
framework
::
DDim
dims
=
{
10
,
10
};
void
Compare
(
f
::
Scope
*
scope
,
const
p
::
DeviceContext
&
ctx
)
{
// init
auto
var
=
scope
->
Var
(
"X"
);
auto
tensor
=
var
->
GetMutable
<
f
::
LoDTensor
>
();
tensor
->
Resize
(
dims
);
tensor
->
Resize
(
{
10
,
10
}
);
std
::
vector
<
float
>
init
;
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
dims
)
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
10
*
10
;
++
i
)
{
init
.
push_back
(
1.0
);
}
...
...
@@ -51,19 +48,18 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
auto
place
=
ctx
.
GetPlace
();
auto
out_var
=
scope
->
Var
(
"Out"
);
auto
out_tensor
=
out_var
->
GetMutable
<
f
::
LoDTensor
>
();
out_tensor
->
Resize
(
dims
);
out_tensor
->
Resize
(
{
10
,
10
}
);
out_tensor
->
mutable_data
<
float
>
(
place
);
// allocate
auto
mask_var
=
scope
->
Var
(
"Mask"
);
auto
mask_tensor
=
mask_var
->
GetMutable
<
f
::
LoDTensor
>
();
mask_tensor
->
Resize
(
dims
);
mask_tensor
->
Resize
(
{
10
,
10
}
);
mask_tensor
->
mutable_data
<
float
>
(
place
);
// allocate
// run
f
::
AttributeMap
attrs
;
float
dropout_prob
=
0.5
;
attrs
.
insert
({
"is_test"
,
false
});
attrs
.
insert
({
"fix_seed"
,
true
});
attrs
.
insert
({
"fix_seed"
,
1
});
attrs
.
insert
({
"seed"
,
3
});
attrs
.
insert
({
"dropout_prob"
,
dropout_prob
});
auto
dropout_op
=
f
::
OpRegistry
::
CreateOp
(
...
...
@@ -73,7 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
std
::
vector
<
float
>
out_vec
;
TensorToVector
(
*
out_tensor
,
ctx
,
&
out_vec
);
ctx
.
Wait
();
std
::
vector
<
float
>
std_out
=
{
0
,
0
,
1
,
1
,
1
,
1
,
1
,
0
,
1
,
0
,
0
,
1
,
1
,
0
,
1
,
1
,
1
,
1
,
0
,
1
,
...
...
@@ -88,22 +83,22 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
}
}
// TODO(wyi): Due to
// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
// disable this test to remove the prevention of the merge of
// unrelated PRs.
/*
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare
(
&
scope
,
ctx
);
Compare(scope, ctx);
}
// TODO(wyi, dzhwinter): Due to
// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
// disable this test to remove the prevention of the merge of
// unrelated PRs.
/*
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(
&
scope, ctx);
Compare(scope, ctx);
}
*/
paddle/fluid/operators/mkldnn_activation_op.h
浏览文件 @
1d756746
...
...
@@ -60,7 +60,7 @@ class MKLDNNActivationGradKernel
}
};
namespace
{
namespace
{
// NOLINT
framework
::
OpKernelType
GetKernelType
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
OperatorWithKernel
&
oper
)
{
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
1d756746
...
...
@@ -127,6 +127,7 @@ void BindProgramDesc(pybind11::module *m) {
.
def
(
"block"
,
&
pd
::
ProgramDesc
::
MutableBlock
,
pybind11
::
return_value_policy
::
reference
)
.
def
(
"num_blocks"
,
&
pd
::
ProgramDesc
::
Size
)
.
def
(
"flush"
,
&
pd
::
ProgramDesc
::
Flush
)
.
def
(
"get_feed_target_names"
,
&
pd
::
ProgramDesc
::
GetFeedTargetNames
)
.
def
(
"get_fetch_target_names"
,
&
pd
::
ProgramDesc
::
GetFetchTargetNames
)
.
def
(
"serialize_to_string"
,
SerializeMessage
<
pd
::
ProgramDesc
>
)
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
1d756746
...
...
@@ -33,6 +33,7 @@ limitations under the License. */
#include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -461,6 +462,9 @@ All parameter, weight, gradient are variables in Paddle.
self
.
back
().
set_lod
(
t
.
lod
());
});
m
.
def
(
"IsInplace"
,
[](
std
::
string
op
)
->
bool
{
return
operators
::
IsInplace
(
op
);
});
m
.
def
(
"op_support_gpu"
,
OpSupportGPU
);
#ifdef PADDLE_WITH_CUDA
m
.
def
(
"get_cuda_device_count"
,
platform
::
GetCUDADeviceCount
);
...
...
paddle/scripts/docker/build.sh
浏览文件 @
1d756746
...
...
@@ -155,7 +155,7 @@ EOF
function
gen_dockerfile
()
{
# Set BASE_IMAGE according to env variables
if
[[
${
WITH_GPU
}
==
"ON"
]]
;
then
BASE_IMAGE
=
"nvidia/cuda:8.0-cudnn
5
-runtime-ubuntu16.04"
BASE_IMAGE
=
"nvidia/cuda:8.0-cudnn
7
-runtime-ubuntu16.04"
else
BASE_IMAGE
=
"ubuntu:16.04"
fi
...
...
@@ -164,7 +164,7 @@ function gen_dockerfile() {
DOCKERFILE_CUDNN_DSO
=
""
if
[[
${
WITH_GPU
:-
OFF
}
==
'ON'
]]
;
then
DOCKERFILE_GPU_ENV
=
"ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:
\$
{LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO
=
"RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.
5
/usr/lib/x86_64-linux-gnu/libcudnn.so"
DOCKERFILE_CUDNN_DSO
=
"RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.
7
/usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat
<<
EOF
...
...
python/paddle/fluid/io.py
浏览文件 @
1d756746
...
...
@@ -336,18 +336,20 @@ def save_inference_model(dirname,
if
main_program
is
None
:
main_program
=
default_main_program
()
copy_program
=
main_program
if
not
os
.
path
.
isdir
(
dirname
):
os
.
makedirs
(
dirname
)
# Clear the is_target information and remove the existed feed and fetch op
global_block
=
main
_program
.
global_block
()
global_block
=
copy
_program
.
global_block
()
for
i
,
op
in
enumerate
(
global_block
.
ops
):
op
.
desc
.
set_is_target
(
False
)
if
op
.
type
==
"feed"
or
op
.
type
==
"fetch"
:
global_block
.
remove_op
(
i
)
copy_program
.
desc
.
flush
()
pruned_program
=
main
_program
.
prune
(
targets
=
target_vars
)
pruned_program
=
copy
_program
.
prune
(
targets
=
target_vars
)
inference_program
=
pruned_program
.
inference_optimize
()
fetch_var_names
=
[
v
.
name
for
v
in
target_vars
]
...
...
python/paddle/fluid/layer_helper.py
浏览文件 @
1d756746
...
...
@@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup
import
unique_name
from
paddle.fluid.initializer
import
Constant
,
Xavier
from
param_attr
import
ParamAttr
,
WeightNormParamAttr
import
core
class
LayerHelper
(
object
):
...
...
@@ -398,13 +399,16 @@ class LayerHelper(object):
return
input_var
if
isinstance
(
act
,
basestring
):
act
=
{
'type'
:
act
}
tmp
=
self
.
create_tmp_variable
(
dtype
=
input_var
.
dtype
)
if
'use_mkldnn'
in
self
.
kwargs
:
act
[
'use_mkldnn'
]
=
self
.
kwargs
.
get
(
'use_mkldnn'
)
act_type
=
act
.
pop
(
'type'
)
if
'use_mkldnn'
in
self
.
kwargs
:
act
[
'use_mkldnn'
]
=
self
.
kwargs
.
get
(
'use_mkldnn'
)
tmp
=
input_var
# NOTE(dzhwinter): some activation support inplace compution.
if
not
core
.
IsInplace
(
act_type
):
tmp
=
self
.
create_tmp_variable
(
dtype
=
input_var
.
dtype
)
self
.
append_op
(
type
=
act_type
,
inputs
=
{
"X"
:
[
input_var
]},
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
1d756746
...
...
@@ -21,7 +21,7 @@ from ..executor import global_scope
__all__
=
[
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'open_recordio_file'
,
'open_files'
,
'read_file'
,
'shuffle'
,
'double_buffer'
'open_files'
,
'read_file'
,
'shuffle'
,
'
batch'
,
'
double_buffer'
]
...
...
@@ -290,7 +290,7 @@ def open_recordio_file(filename,
lod_levels
,
dtypes
,
pass_num
=
1
,
for_parallel
=
Fals
e
):
for_parallel
=
Tru
e
):
"""
Open a RecordIO file
...
...
@@ -364,7 +364,7 @@ def open_files(filenames,
thread_num
,
buffer_size
=
None
,
pass_num
=
1
,
for_parallel
=
Fals
e
):
for_parallel
=
Tru
e
):
"""
Open files
...
...
@@ -476,6 +476,11 @@ def shuffle(reader, buffer_size):
'create_shuffle_reader'
,
reader
,
{
'buffer_size'
:
int
(
buffer_size
)})
def
batch
(
reader
,
batch_size
):
return
__create_unshared_decorated_reader__
(
'create_batch_reader'
,
reader
,
{
'batch_size'
:
int
(
batch_size
)})
def
double_buffer
(
reader
,
place
=
None
):
attrs
=
dict
()
if
place
is
not
None
:
...
...
python/paddle/fluid/tests/unittests/test_activation_op.py
浏览文件 @
1d756746
...
...
@@ -361,10 +361,7 @@ class TestCeil(OpTest):
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
# The same reason with TestFloor
def
init_dtype
(
self
):
pass
...
...
@@ -396,10 +393,8 @@ class TestFloor(OpTest):
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
# the gradient on floor, ceil, round is undefined.
# we return zero as gradient, but the numpy return nan
def
init_dtype
(
self
):
pass
...
...
@@ -501,11 +496,6 @@ class TestRound(OpTest):
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
def
init_dtype
(
self
):
pass
...
...
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
浏览文件 @
1d756746
...
...
@@ -100,6 +100,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
if
data_format
!=
"NCHW"
and
data_format
!=
"NHWC"
:
raise
ValueError
(
"Unknown data order."
)
if
data_format
==
"NCHW"
:
x
=
np
.
transpose
(
x
,
(
0
,
2
,
3
,
1
))
y_grad
=
np
.
transpose
(
y_grad
,
(
0
,
2
,
3
,
1
))
...
...
@@ -304,7 +307,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
# run backward
y_grad
=
np
.
random
.
random_sample
(
shape
).
astype
(
np
.
float32
)
x_grad
,
scale_grad
,
bias_grad
=
_reference_grad
(
x
,
y_grad
,
scale
,
saved_mean
,
var_ref
,
epsilon
,
data_
forma
t
)
x
,
y_grad
,
scale
,
saved_mean
,
var_ref
,
epsilon
,
data_
layou
t
)
var_dict
=
locals
()
var_dict
[
'y@GRAD'
]
=
y_grad
...
...
python/paddle/fluid/tests/unittests/test_multi_file_reader.py
浏览文件 @
1d756746
...
...
@@ -69,7 +69,6 @@ class TestMultipleReader(unittest.TestCase):
break
batch_count
+=
1
self
.
assertLessEqual
(
img_val
.
shape
[
0
],
self
.
batch_size
)
data_files
.
reset
()
self
.
assertEqual
(
batch_count
,
self
.
num_batch
*
3
)
def
test_main
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
浏览文件 @
1d756746
...
...
@@ -43,9 +43,8 @@ class TestMultipleReader(unittest.TestCase):
filename
=
'./mnist.recordio'
,
shapes
=
[(
-
1
,
784
),
(
-
1
,
1
)],
lod_levels
=
[
0
,
0
],
dtypes
=
[
'float32'
,
'int64'
])
data_file
=
fluid
.
layers
.
io
.
multi_pass
(
reader
=
data_file
,
pass_num
=
self
.
pass_num
)
dtypes
=
[
'float32'
,
'int64'
],
pass_num
=
self
.
pass_num
)
img
,
label
=
fluid
.
layers
.
read_file
(
data_file
)
if
fluid
.
core
.
is_compiled_with_cuda
():
...
...
@@ -65,5 +64,4 @@ class TestMultipleReader(unittest.TestCase):
break
batch_count
+=
1
self
.
assertLessEqual
(
img_val
.
shape
[
0
],
self
.
batch_size
)
data_file
.
reset
()
self
.
assertEqual
(
batch_count
,
self
.
num_batch
*
self
.
pass_num
)
python/paddle/fluid/tests/unittests/test_recordio_reader.py
浏览文件 @
1d756746
...
...
@@ -74,13 +74,13 @@ class TestRecordIO(unittest.TestCase):
avg_loss_np
.
append
(
tmp
)
batch_id
+=
1
data_file
.
reset
()
self
.
assertEqual
(
batch_id
,
self
.
num_batches
)
self
.
assertLess
(
avg_loss_np
[
-
1
],
avg_loss_np
[
0
])
def
test_shuffle_reader
(
self
):
self
.
test_main
(
decorator_callback
=
lambda
reader
:
fluid
.
layers
.
io
.
shuffle
(
reader
,
buffer_size
=
200
))
self
.
test_main
(
decorator_callback
=
lambda
reader
:
fluid
.
layers
.
io
.
shuffle
(
reader
,
buffer_size
=
200
))
def
test_double_buffer_reader
(
self
):
self
.
test_main
(
decorator_callback
=
lambda
reader
:
fluid
.
layers
.
io
.
double_buffer
(
reader
,
place
=
'cuda:0'
if
fluid
.
core
.
is_compiled_with_cuda
()
else
'cpu'
))
place
=
'cuda:0'
if
fluid
.
core
.
is_compiled_with_cuda
()
else
'cpu'
))
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录