Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
d50f776b
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d50f776b
编写于
8月 01, 2018
作者:
N
nhzlx
浏览文件
操作
浏览文件
下载
差异文件
merge develop
上级
64a08f84
900d61dd
变更
55
显示空白变更内容
内联
并排
Showing
55 changed file
with
1314 addition
and
338 deletion
+1314
-338
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+4
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-2
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+1
-0
paddle/fluid/framework/data_type.cc
paddle/fluid/framework/data_type.cc
+3
-1
paddle/fluid/framework/data_type_test.cc
paddle/fluid/framework/data_type_test.cc
+40
-0
paddle/fluid/framework/ir/graph_helper_test.cc
paddle/fluid/framework/ir/graph_helper_test.cc
+2
-2
paddle/fluid/framework/ir/graph_test.cc
paddle/fluid/framework/ir/graph_test.cc
+6
-6
paddle/fluid/framework/op_kernel_type_test.cc
paddle/fluid/framework/op_kernel_type_test.cc
+7
-0
paddle/fluid/framework/op_proto_maker.cc
paddle/fluid/framework/op_proto_maker.cc
+34
-0
paddle/fluid/framework/op_proto_maker.h
paddle/fluid/framework/op_proto_maker.h
+2
-0
paddle/fluid/framework/op_proto_maker_test.cc
paddle/fluid/framework/op_proto_maker_test.cc
+102
-5
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+17
-0
paddle/fluid/framework/tensor_test.cc
paddle/fluid/framework/tensor_test.cc
+15
-0
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+1
-0
paddle/fluid/inference/analysis/data_flow_graph.h
paddle/fluid/inference/analysis/data_flow_graph.h
+1
-1
paddle/fluid/inference/analysis/model_store_pass.cc
paddle/fluid/inference/analysis/model_store_pass.cc
+3
-1
paddle/fluid/inference/analysis/model_store_pass.h
paddle/fluid/inference/analysis/model_store_pass.h
+2
-0
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+4
-1
paddle/fluid/inference/api/api_anakin_engine.cc
paddle/fluid/inference/api/api_anakin_engine.cc
+71
-20
paddle/fluid/inference/api/api_anakin_engine.h
paddle/fluid/inference/api/api_anakin_engine.h
+12
-8
paddle/fluid/inference/api/api_anakin_engine_tester.cc
paddle/fluid/inference/api/api_anakin_engine_tester.cc
+8
-9
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+1
-1
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+3
-1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+2
-1
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+50
-1
paddle/fluid/inference/tensorrt/convert/fc_op.cc
paddle/fluid/inference/tensorrt/convert/fc_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+57
-0
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+33
-3
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+4
-4
paddle/fluid/operators/conv_mkldnn_op.cc
paddle/fluid/operators/conv_mkldnn_op.cc
+12
-12
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+25
-22
paddle/fluid/operators/elementwise_add_op.cc
paddle/fluid/operators/elementwise_add_op.cc
+3
-1
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+9
-7
paddle/fluid/operators/elementwise_div_op.cc
paddle/fluid/operators/elementwise_div_op.cc
+2
-0
paddle/fluid/operators/elementwise_op.h
paddle/fluid/operators/elementwise_op.h
+71
-3
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+122
-64
paddle/fluid/operators/elementwise_sub_op.cc
paddle/fluid/operators/elementwise_sub_op.cc
+4
-1
paddle/fluid/operators/elementwise_sub_op.h
paddle/fluid/operators/elementwise_sub_op.h
+6
-5
paddle/fluid/operators/softmax_op.cc
paddle/fluid/operators/softmax_op.cc
+23
-4
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+1
-0
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+0
-2
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+1
-1
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+280
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+61
-11
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+137
-0
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+5
-118
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+27
-0
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
...n/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+2
-2
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+2
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+9
-0
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+2
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
...uid/tests/unittests/test_parallel_executor_transformer.py
+2
-3
python/paddle/fluid/tests/unittests/testsuite.py
python/paddle/fluid/tests/unittests/testsuite.py
+17
-9
python/paddle/fluid/tests/unittests/transformer_model.py
python/paddle/fluid/tests/unittests/transformer_model.py
+1
-1
未找到文件。
cmake/external/anakin.cmake
浏览文件 @
d50f776b
...
...
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
set
(
ANAKIN_LIBRARY
"
${
ANAKIN_INSTALL_DIR
}
"
CACHE STRING
"path of Anakin library"
)
set
(
ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-Wno-error=unused-variable -Wno-unused-variable
-Wno-error=format-extra-args -Wno-format-extra-args
-Wno-error=comment -Wno-comment
...
...
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp
)
set
(
ANAKIN_LIBRARY_URL
"https://github.com/pangge/Anakin/releases/download/
3.0/anakin_release_simple
.tar.gz"
)
set
(
ANAKIN_LIBRARY_URL
"https://github.com/pangge/Anakin/releases/download/
Version0.1.0/anakin
.tar.gz"
)
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
...
...
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
message
(
STATUS
"Download Anakin library from
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"rm -rf
${
ANAKIN_INSTALL_DIR
}
/*"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; wget -q
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; wget -
-no-check-certificate -
q
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; tar xzf anakin
_release_simple
.tar.gz"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; tar xzf anakin.tar.gz"
)
endif
()
if
(
WITH_ANAKIN
)
...
...
paddle/fluid/API.spec
浏览文件 @
d50f776b
...
...
@@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
...
...
@@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
d50f776b
...
...
@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
nv_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
if
(
WITH_GPU
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
else
()
...
...
paddle/fluid/framework/data_type.cc
浏览文件 @
d50f776b
...
...
@@ -17,6 +17,8 @@
#include <string>
#include <unordered_map>
using
float16
=
paddle
::
platform
::
float16
;
namespace
paddle
{
namespace
framework
{
...
...
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
RegisterType<cc_type>(retv, proto_type, #cc_type)
// NOTE: Add your customize type here.
RegType
(
platform
::
float16
,
proto
::
VarType
::
FP16
);
RegType
(
float16
,
proto
::
VarType
::
FP16
);
RegType
(
float
,
proto
::
VarType
::
FP32
);
RegType
(
double
,
proto
::
VarType
::
FP64
);
RegType
(
int
,
proto
::
VarType
::
INT32
);
...
...
paddle/fluid/framework/data_type_test.cc
0 → 100644
浏览文件 @
d50f776b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
// Verifies the FP16 entry of the data-type registry from three angles:
// the std::type_index a tensor reports after allocation, the element size in
// bytes, and the human-readable type name.
TEST(DataType, float16) {
  namespace fw = paddle::framework;
  using paddle::platform::float16;

  fw::proto::VarType::Type fp16 = fw::proto::VarType::FP16;

  // Allocate the tensor on the CPU using the type index derived from the
  // proto enum value.
  fw::Tensor tensor;
  paddle::platform::CPUPlace cpu;
  tensor.mutable_data(cpu, fw::ToTypeIndex(fp16));

  // The tensor must report platform::float16 as its element type.
  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));

  // A float16 element must occupy two bytes.
  EXPECT_EQ(fw::SizeOfType(fw::ToTypeIndex(fp16)), 2u);

  // The debug string of the type must be "float16".
  std::string expected_name = "float16";
  EXPECT_STREQ(fw::DataTypeToString(fp16).c_str(), expected_name.c_str());
}
paddle/fluid/framework/ir/graph_helper_test.cc
浏览文件 @
d50f776b
...
...
@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
for
(
size_t
i
=
0
;
i
<
sorted
.
size
();
++
i
)
{
node_map
[
sorted
[
i
]
->
Name
()]
=
i
;
}
ASSERT_EQ
(
node_map
.
at
(
"op1"
),
0
);
ASSERT_EQ
(
node_map
.
at
(
"op2"
),
1
);
ASSERT_EQ
(
node_map
.
at
(
"op1"
),
0
UL
);
ASSERT_EQ
(
node_map
.
at
(
"op2"
),
1
UL
);
ASSERT_TRUE
(
node_map
.
at
(
"op3"
)
<
node_map
.
at
(
"op5"
));
}
}
// namespace ir
...
...
paddle/fluid/framework/ir/graph_test.cc
浏览文件 @
d50f776b
...
...
@@ -97,15 +97,15 @@ TEST(GraphTest, Basic) {
std
::
vector
<
ir
::
Node
*>
nodes
(
g
->
Nodes
().
begin
(),
g
->
Nodes
().
end
());
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
n
->
Name
()
==
"sum"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
3
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
3
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
UL
);
}
else
if
(
n
->
Name
()
==
"test_a"
||
n
->
Name
()
==
"test_b"
||
n
->
Name
()
==
"test_c"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
0
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
0
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
UL
);
}
else
if
(
n
->
Name
()
==
"test_out"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
0
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
1
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
0
UL
);
}
}
ASSERT_EQ
(
nodes
.
size
(),
5
);
...
...
paddle/fluid/framework/op_kernel_type_test.cc
浏览文件 @
d50f776b
...
...
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
ASSERT_EQ
(
paddle
::
framework
::
KernelTypeToString
(
op_kernel_type
),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"CUDNN]"
);
using
CUDAPlace
=
paddle
::
platform
::
CUDAPlace
;
OpKernelType
op_kernel_type2
(
DataType
::
FP16
,
CUDAPlace
(
0
),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
ASSERT_EQ
(
paddle
::
framework
::
KernelTypeToString
(
op_kernel_type2
),
"data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
"type[CUDNN]"
);
}
TEST
(
OpKernelType
,
Hash
)
{
...
...
paddle/fluid/framework/op_proto_maker.cc
浏览文件 @
d50f776b
...
...
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
return
OpProtoAndCheckerMaker
::
VariableBuilder
{
output
};
}
void
OpProtoAndCheckerMaker
::
Reuse
(
const
std
::
string
&
name
,
const
std
::
string
&
reused_name
)
{
bool
found
=
false
;
proto
::
OpProto
::
Var
*
var
;
for
(
auto
&
var
:
proto_
->
inputs
())
{
if
(
var
.
name
()
==
reused_name
)
{
found
=
true
;
break
;
}
}
PADDLE_ENFORCE
(
found
==
true
,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched."
,
name
,
reused_name
);
found
=
false
;
for
(
int
i
=
0
;
i
<
proto_
->
outputs
().
size
();
++
i
)
{
var
=
proto_
->
mutable_outputs
()
->
Mutable
(
i
);
if
(
var
->
name
()
==
name
)
{
PADDLE_ENFORCE
(
!
var
->
has_reuse
(),
"Output(%s) has been set reused var of %s"
,
name
,
var
->
reuse
());
found
=
true
;
var
->
set_reuse
(
reused_name
);
break
;
}
}
PADDLE_ENFORCE
(
found
==
true
,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched."
,
name
,
reused_name
);
}
void
OpProtoAndCheckerMaker
::
CheckNoDuplicatedInOutAttrs
()
{
std
::
unordered_set
<
std
::
string
>
names
;
auto
checker
=
[
&
](
const
std
::
string
&
name
)
{
...
...
paddle/fluid/framework/op_proto_maker.h
浏览文件 @
d50f776b
...
...
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
VariableBuilder
AddOutput
(
const
std
::
string
&
name
,
const
std
::
string
&
comment
);
void
Reuse
(
const
std
::
string
&
name
,
const
std
::
string
&
reused_name
);
template
<
typename
T
>
TypedAttrChecker
<
T
>
&
AddAttr
(
const
std
::
string
&
name
,
const
std
::
string
&
comment
,
...
...
paddle/fluid/framework/op_proto_maker_test.cc
浏览文件 @
d50f776b
...
...
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
}
// Proto maker with one input "X" and one output "XOut" that is declared to
// reuse "X".
class TestInplaceProtoMaker
    : public paddle::framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "input of test op");
    // Register the output first, then mark it as reusing input "X".
    auto xout_builder = AddOutput("XOut", "output of test op");
    xout_builder.Reuse("X");
  }
};
class
TestInplaceProtoMaker2
:
public
paddle
::
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"input of test op"
);
...
...
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
};
TEST
(
ProtoMaker
,
InplaceOutput
)
{
paddle
::
framework
::
proto
::
OpProto
op_proto
;
paddle
::
framework
::
proto
::
OpProto
op_proto
,
op_proto2
;
paddle
::
framework
::
OpAttrChecker
op_checker
;
TestInplaceProtoMaker
proto_maker
;
ASSERT_THROW
(
proto_maker
(
&
op_proto
,
&
op_checker
),
TestInplaceProtoMaker2
proto_maker2
;
proto_maker
(
&
op_proto
,
&
op_checker
);
ASSERT_THROW
(
proto_maker2
(
&
op_proto2
,
&
op_checker
),
paddle
::
platform
::
EnforceNotMet
);
// proto_maker(&op_proto, &op_checker);
// proto_maker.Make();
// ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
// normal reuse
class
TestReuseProtoMaker
:
public
paddle
::
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"input of test op"
);
AddInput
(
"Y"
,
"input of test op"
);
AddOutput
(
"Out"
,
"output of test op"
);
AddOutput
(
"XOut"
,
"output of test op"
);
// avoid destructor exception.
// Validate();
TestReuse
();
}
virtual
void
TestReuse
()
{}
};
// test duplicate reuse error
class
TestReuseProtoMaker2
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"Out"
,
"X"
);
Reuse
(
"Out"
,
"Y"
);
}
};
// NotExists Input
class
TestReuseProtoMaker3
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"Out"
,
"NotExists"
);
Reuse
(
"XOut"
,
"X"
);
}
};
// NotExists Output
class
TestReuseProtoMaker4
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"NotExists"
,
"X"
);
}
};
// Runs the baseline TestReuseProtoMaker, whose TestReuse() hook is empty,
// against a fresh OpProto and attribute checker.
TEST(ProtoMaker, Reuse) {
  namespace fw = paddle::framework;

  fw::proto::OpProto op_proto;
  fw::OpAttrChecker op_checker;
  TestReuseProtoMaker proto_maker;

  proto_maker(&op_proto, &op_checker);
}
// NOTE(dzhwinter):
// There is a fatal CHECK in the base class destructor, which calls abort()
// instead of throwing an exception. If we throw an exception in Make(), we
// trigger that CHECK and terminate the tests.
//
// I tried to replace the default CHECK with an exception; however, that is
// still not supported by glog.
// For details, see:
// https://github.com/google/glog/issues/249
// https://github.com/facebookresearch/TensorComprehensions/issues/351
/*
TEST(ProtoMaker, ReuseWithException) {
paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker2 proto_maker2;
TestReuseProtoMaker3 proto_maker3;
TestReuseProtoMaker4 proto_maker4;
EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
paddle::platform::EnforceNotMet);
}
void FailureFunction() {
throw std::runtime_error("Check failed in destructor.");
// return 0;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
google::InstallFailureFunction(&FailureFunction);
return RUN_ALL_TESTS();
}
*/
paddle/fluid/framework/operator.cc
浏览文件 @
d50f776b
...
...
@@ -69,6 +69,21 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
}
static
std
::
string
GetDtype
(
const
Scope
&
scope
,
const
std
::
string
&
name
)
{
Variable
*
var
=
scope
.
FindVar
(
name
);
if
(
var
==
nullptr
)
{
return
""
;
}
if
(
var
->
IsType
<
LoDTensor
>
())
{
return
DataTypeToString
(
ToDataType
(
var
->
Get
<
LoDTensor
>
().
type
()));
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
return
DataTypeToString
(
ToDataType
(
var
->
Get
<
SelectedRows
>
().
value
().
type
()));
}
else
{
return
""
;
}
}
static
int
GetRowSize
(
const
Scope
&
scope
,
const
std
::
string
&
name
)
{
Variable
*
var
=
scope
.
FindVar
(
name
);
if
(
var
==
nullptr
)
{
...
...
@@ -172,6 +187,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if
(
row_size
>=
0
)
{
ss
<<
"[row_size="
<<
row_size
<<
"]"
;
}
std
::
string
dtype
=
GetDtype
(
*
scope
,
input
.
second
[
i
]);
ss
<<
":"
<<
dtype
;
ss
<<
"["
<<
GetDims
(
*
scope
,
input
.
second
[
i
],
true
)
<<
"]"
;
ss
<<
"("
<<
GetLoD
(
*
scope
,
input
.
second
[
i
])
<<
")"
;
}
...
...
paddle/fluid/framework/tensor_test.cc
浏览文件 @
d50f776b
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/platform/float16.h"
namespace
framework
=
paddle
::
framework
;
namespace
platform
=
paddle
::
platform
;
...
...
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
src
.
set_layout
(
framework
::
DataLayout
::
kAnyLayout
);
ASSERT_EQ
(
src
.
layout
(),
framework
::
DataLayout
::
kAnyLayout
);
}
// Allocates a 2x3 float16 tensor on the CPU, fills it with 0..5 and checks
// that the reported memory size equals the element count times
// sizeof(float16).
TEST(Tensor, FP16) {
  using platform::float16;

  framework::Tensor src;
  float16* buffer = src.mutable_data<float16>({2, 3}, platform::CPUPlace());

  int idx = 0;
  while (idx < 2 * 3) {
    buffer[idx] = static_cast<float16>(idx);
    ++idx;
  }

  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
  // EXPECT a human readable error message
  // src.data<uint8_t>();
  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
}
paddle/fluid/inference/analysis/argument.h
浏览文件 @
d50f776b
...
...
@@ -23,6 +23,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
...
...
paddle/fluid/inference/analysis/data_flow_graph.h
浏览文件 @
d50f776b
...
...
@@ -176,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std
::
pair
<
std
::
vector
<
Node
*>
,
std
::
vector
<
Node
*>>
ExtractInputAndOutputOfSubGraph
(
std
::
vector
<
Node
*>
&
graph
);
ExtractInputAndOutputOfSubGraph
(
std
::
vector
<
Node
*>
&
graph
);
// NOLINT
}
// namespace analysis
}
// namespace inference
...
...
paddle/fluid/inference/analysis/model_store_pass.cc
浏览文件 @
d50f776b
...
...
@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
namespace
paddle
{
namespace
inference
{
...
...
paddle/fluid/inference/analysis/model_store_pass.h
浏览文件 @
d50f776b
...
...
@@ -17,6 +17,8 @@
* model in the disk, and that model can be reloaded for prediction.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
namespace
paddle
{
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
d50f776b
...
...
@@ -19,6 +19,7 @@ endif(APPLE)
set
(
inference_deps paddle_inference_api paddle_fluid_api
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
)
endif
()
...
...
@@ -63,6 +64,8 @@ endif()
if
(
WITH_ANAKIN
)
# only needed in CI
# Because Anakin has no official library releases, and its protobuf and CUDA versions do not match Paddle's,
# the Anakin library will not be merged into our official inference library. To use the Anakin prediction API, one needs to
# compile libinference_anakin_api.a and compile it together with anakin.so.
fetch_include_recursively
(
${
ANAKIN_INCLUDE
}
)
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc
)
nv_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc
)
...
...
@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI
if
(
WITH_TESTING
)
cc_test
(
inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=
${
ANAKIN_INSTALL_DIR
}
/mobilenet_v2.anakin.bin
DEPS inference_anakin_api
)
DEPS inference_anakin_api
_shared
)
target_compile_options
(
inference_anakin_test BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endif
(
WITH_TESTING
)
endif
()
paddle/fluid/inference/api/api_anakin_engine.cc
浏览文件 @
d50f776b
...
...
@@ -18,26 +18,36 @@
namespace
paddle
{
PaddleInferenceAnakinPredictor
::
PaddleInferenceAnakinPredictor
(
template
<
typename
Target
>
PaddleInferenceAnakinPredictor
<
Target
>::
PaddleInferenceAnakinPredictor
(
const
AnakinConfig
&
config
)
{
CHECK
(
Init
(
config
));
}
bool
PaddleInferenceAnakinPredictor
::
Init
(
const
AnakinConfig
&
config
)
{
template
<
typename
Target
>
bool
PaddleInferenceAnakinPredictor
<
Target
>::
Init
(
const
AnakinConfig
&
config
)
{
if
(
!
(
graph_
.
load
(
config
.
model_file
)))
{
LOG
(
FATAL
)
<<
"fail to load graph from "
<<
config
.
model_file
;
return
false
;
}
graph_
.
ResetBatchSize
(
"input_0"
,
config
.
max_batch_size
);
auto
inputs
=
graph_
.
get_ins
();
for
(
auto
&
input_str
:
inputs
)
{
graph_
.
ResetBatchSize
(
input_str
,
config
.
max_batch_size
);
}
// optimization for graph
if
(
!
(
graph_
.
Optimize
()))
{
return
false
;
}
// construct executer
executor_
.
init
(
graph_
);
if
(
executor_p_
==
nullptr
)
{
executor_p_
=
new
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
(
graph_
,
true
);
}
return
true
;
}
bool
PaddleInferenceAnakinPredictor
::
Run
(
template
<
typename
Target
>
bool
PaddleInferenceAnakinPredictor
<
Target
>::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
for
(
const
auto
&
input
:
inputs
)
{
...
...
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
<<
"'s type is not float"
;
return
false
;
}
auto
d_tensor_in_p
=
executor_
.
get_in
(
input
.
name
);
auto
d_tensor_in_p
=
executor_p_
->
get_in
(
input
.
name
);
auto
net_shape
=
d_tensor_in_p
->
valid_shape
();
if
(
net_shape
.
size
()
!=
input
.
shape
.
size
())
{
LOG
(
ERROR
)
<<
" input "
<<
input
.
name
<<
"'s shape size should be equal to that of net"
;
return
false
;
}
int
sum
=
1
;
for_each
(
input
.
shape
.
begin
(),
input
.
shape
.
end
(),
[
&
](
int
n
)
{
sum
*=
n
;
});
if
(
sum
>
net_shape
.
count
())
{
graph_
.
Reshape
(
input
.
name
,
input
.
shape
);
delete
executor_p_
;
executor_p_
=
new
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
(
graph_
,
true
);
d_tensor_in_p
=
executor_p_
->
get_in
(
input
.
name
);
}
anakin
::
saber
::
Shape
tmp_shape
;
for
(
auto
s
:
input
.
shape
)
{
tmp_shape
.
push_back
(
s
);
}
d_tensor_in_p
->
reshape
(
tmp_shape
);
float
*
d_data_p
=
d_tensor_in_p
->
mutable_data
();
if
(
cudaMemcpy
(
d_data_p
,
static_cast
<
float
*>
(
input
.
data
.
data
()),
d_tensor_in_p
->
valid_size
()
*
sizeof
(
float
),
...
...
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
}
cudaStreamSynchronize
(
NULL
);
}
executor_
.
prediction
();
cudaDeviceSynchronize
();
executor_p_
->
prediction
();
cudaDeviceSynchronize
();
if
(
output_data
->
empty
())
{
LOG
(
ERROR
)
<<
"At least one output should be set with tensors' names."
;
return
false
;
}
for
(
auto
&
output
:
*
output_data
)
{
auto
*
tensor
=
executor_
.
get_out
(
output
.
name
);
output
.
shape
=
tensor
->
shape
();
auto
*
tensor
=
executor_
p_
->
get_out
(
output
.
name
);
output
.
shape
=
tensor
->
valid_
shape
();
if
(
output
.
data
.
length
()
<
tensor
->
valid_size
()
*
sizeof
(
float
))
{
output
.
data
.
Resize
(
tensor
->
valid_size
()
*
sizeof
(
float
));
}
...
...
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
return
true
;
}
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
&
PaddleInferenceAnakinPredictor
::
get_executer
()
{
return
executor_
;
template
<
typename
Target
>
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
&
PaddleInferenceAnakinPredictor
<
Target
>::
get_executer
()
{
return
*
executor_p_
;
}
// the cloned new Predictor of anakin share the same net weights from original
// Predictor
std
::
unique_ptr
<
PaddlePredictor
>
PaddleInferenceAnakinPredictor
::
Clone
()
{
template
<
typename
Target
>
std
::
unique_ptr
<
PaddlePredictor
>
PaddleInferenceAnakinPredictor
<
Target
>::
Clone
()
{
VLOG
(
3
)
<<
"Anakin Predictor::clone"
;
std
::
unique_ptr
<
PaddlePredictor
>
cls
(
new
PaddleInferenceAnakinPredictor
());
std
::
unique_ptr
<
PaddlePredictor
>
cls
(
new
PaddleInferenceAnakinPredictor
<
Target
>
());
// construct executer from other graph
auto
anakin_predictor_p
=
dynamic_cast
<
PaddleInferenceAnakinPredictor
*>
(
cls
.
get
());
dynamic_cast
<
PaddleInferenceAnakinPredictor
<
Target
>
*>
(
cls
.
get
());
if
(
!
anakin_predictor_p
)
{
LOG
(
ERROR
)
<<
"fail to call Init"
;
return
nullptr
;
...
...
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return
std
::
move
(
cls
);
}
template
class
PaddleInferenceAnakinPredictor
<
anakin
::
NV
>;
template
class
PaddleInferenceAnakinPredictor
<
anakin
::
X86
>;
// A factory to help create difference predictor.
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
AnakinConfig
,
PaddleEngineKind
::
kAnakin
>
(
const
AnakinConfig
&
config
)
{
VLOG
(
3
)
<<
"Anakin Predictor create."
;
if
(
config
.
target_type
==
AnakinConfig
::
NVGPU
)
{
VLOG
(
3
)
<<
"Anakin Predictor create on [ NVIDIA GPU ]."
;
std
::
unique_ptr
<
PaddlePredictor
>
x
(
new
PaddleInferenceAnakinPredictor
(
config
));
new
PaddleInferenceAnakinPredictor
<
anakin
::
NV
>
(
config
));
return
x
;
}
}
else
if
(
config
.
target_type
==
AnakinConfig
::
X86
)
{
VLOG
(
3
)
<<
"Anakin Predictor create on [ Intel X86 ]."
;
std
::
unique_ptr
<
PaddlePredictor
>
x
(
new
PaddleInferenceAnakinPredictor
<
anakin
::
X86
>
(
config
));
return
x
;
}
else
{
VLOG
(
3
)
<<
"Anakin Predictor create on unknown platform."
;
return
nullptr
;
}
};
}
// namespace paddle
paddle/fluid/inference/api/api_anakin_engine.h
浏览文件 @
d50f776b
...
...
@@ -20,14 +20,16 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "saber/core/shape.h"
#include "saber/saber_types.h"
namespace
paddle
{
template
<
typename
Target
>
class
PaddleInferenceAnakinPredictor
:
public
PaddlePredictor
{
public:
PaddleInferenceAnakinPredictor
()
{}
...
...
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
override
;
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>&
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>&
get_executer
();
~
PaddleInferenceAnakinPredictor
()
override
{};
~
PaddleInferenceAnakinPredictor
()
override
{
delete
executor_p_
;
executor_p_
=
nullptr
;
};
private:
bool
Init
(
const
AnakinConfig
&
config
);
anakin
::
graph
::
Graph
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
anakin
::
graph
::
Graph
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
graph_
;
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
executor_
;
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>*
executor_
p_
{
nullptr
}
;
AnakinConfig
config_
;
};
...
...
paddle/fluid/inference/api/api_anakin_engine_tester.cc
浏览文件 @
d50f776b
...
...
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string
(
model
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
model
,
""
,
"Directory of the inference model
(mobile_v2)
."
);
namespace
paddle
{
AnakinConfig
GetConfig
()
{
AnakinConfig
config
;
// using AnakinConfig::X86 if you need to use cpu to do inference
config
.
target_type
=
AnakinConfig
::
NVGPU
;
config
.
model_file
=
FLAGS_model
;
config
.
device
=
0
;
config
.
max_batch_size
=
1
;
...
...
@@ -36,7 +38,6 @@ TEST(inference, anakin) {
CreatePaddlePredictor
<
AnakinConfig
,
PaddleEngineKind
::
kAnakin
>
(
config
);
float
data
[
1
*
3
*
224
*
224
]
=
{
1.0
f
};
PaddleTensor
tensor
;
tensor
.
name
=
"input_0"
;
tensor
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
224
,
224
});
...
...
@@ -44,22 +45,20 @@ TEST(inference, anakin) {
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
;
paddle_tensor_feeds
.
emplace_back
(
std
::
move
(
tensor
));
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
,
tensor
);
PaddleTensor
tensor_out
;
tensor_out
.
name
=
"prob_out"
;
tensor_out
.
shape
=
std
::
vector
<
int
>
({
1000
,
1
});
tensor_out
.
shape
=
std
::
vector
<
int
>
({});
tensor_out
.
data
=
PaddleBuf
();
tensor_out
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
vector
<
PaddleTensor
>
outputs
;
outputs
.
emplace_back
(
std
::
move
(
tensor_out
));
std
::
vector
<
PaddleTensor
>
outputs
(
1
,
tensor_out
);
ASSERT_TRUE
(
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
));
float
*
data_o
=
static_cast
<
float
*>
(
outputs
[
0
].
data
.
data
());
for
(
size_t
j
=
0
;
j
<
1000
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
outputs
[
0
].
data
.
length
()
;
++
j
)
{
LOG
(
INFO
)
<<
"output["
<<
j
<<
"]: "
<<
data_o
[
j
];
}
}
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
d50f776b
...
...
@@ -20,8 +20,8 @@ limitations under the License. */
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "utils.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
d50f776b
...
...
@@ -44,7 +44,7 @@ class PaddleBuf {
PaddleBuf
(
void
*
data
,
size_t
length
)
:
data_
(
data
),
length_
(
length
),
memory_owned_
{
false
}
{}
// Own memory.
PaddleBuf
(
size_t
length
)
explicit
PaddleBuf
(
size_t
length
)
:
data_
(
new
char
[
length
]),
length_
(
length
),
memory_owned_
(
true
)
{}
// Resize to `length` bytes.
void
Resize
(
size_t
length
);
...
...
@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
// Configurations for Anakin engine.
struct
AnakinConfig
:
public
PaddlePredictor
::
Config
{
enum
TargetType
{
NVGPU
=
0
,
X86
};
int
device
;
std
::
string
model_file
;
int
max_batch_size
{
-
1
};
TargetType
target_type
;
};
struct
TensorRTConfig
:
public
NativeConfig
{
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
d50f776b
...
...
@@ -13,7 +13,8 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine mul_op SERIAL
)
nv_test
(
test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine activation_op SERIAL
)
nv_test
(
test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine conv_op SERIAL
)
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pool_op SERIAL
)
...
...
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
浏览文件 @
d50f776b
...
...
@@ -20,11 +20,60 @@ namespace tensorrt {
// Converts a fluid `conv2d` op (without bias) into a TensorRT convolution
// layer.
//
// The filter is read from `scope` as a 4-D LoDTensor; dims()[0] is used as the
// number of output maps and dims()[2], dims()[3] as the kernel height/width.
// The `groups`, `dilations`, `strides` and `paddings` attributes of the op are
// forwarded to the created layer.
class Conv2dOpConverter : public OpConverter {
 public:
  Conv2dOpConverter() {}

  // op:        protobuf description of the conv2d op to convert.
  // scope:     scope that must already hold the "Filter" variable.
  // test_mode: when true, the layer output is also declared as an engine
  //            output.
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    LOG(INFO)
        << "convert a fluid conv2d op to tensorrt conv layer without bias";

    framework::OpDesc op_desc(op, nullptr);
    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);

    auto* X = engine_->GetITensor(op_desc.Input("Input").front());

    // Fetch the filter weights from the scope.
    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
    PADDLE_ENFORCE_NOT_NULL(Y_v);
    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());

    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
    const int n_output = Y_t->dims()[0];
    const int filter_h = Y_t->dims()[2];
    const int filter_w = Y_t->dims()[3];

    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
    const std::vector<int> dilations =
        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
    const std::vector<int> strides =
        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
    const std::vector<int> paddings =
        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));

    // Elements [0] and [1] of each attribute are read below; reject malformed
    // op descriptions instead of reading past the end of the vectors.
    PADDLE_ENFORCE_EQ(dilations.size(), 2UL);
    PADDLE_ENFORCE_EQ(strides.size(), 2UL);
    PADDLE_ENFORCE_EQ(paddings.size(), 2UL);

    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);

    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
                                  Y_t->memory_size() / sizeof(float)};
    // No bias: an empty weight is handed to the layer.
    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};

    auto* layer = TRT_ENGINE_ADD_LAYER(
        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
        nv_ksize, weight.get(), bias.get());
    PADDLE_ENFORCE(layer != nullptr);
    layer->setStride(nv_strides);
    layer->setPadding(nv_paddings);
    layer->setDilation(nv_dilations);
    layer->setNbGroups(groups);

    auto output_name = op_desc.Output("Output").front();
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {
      engine_->DeclareOutput(output_name);
    }
  }
};
...
...
paddle/fluid/inference/tensorrt/convert/fc_op.cc
浏览文件 @
d50f776b
...
...
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
}
// indata c * k
// Reorder the data layout from CK to KC.
void
ReorderCKtoKC
(
TensorRTEngine
::
Weight
&
iweights
,
void
ReorderCKtoKC
(
TensorRTEngine
::
Weight
&
iweights
,
// NOLINT
TensorRTEngine
::
Weight
*
oweights
)
{
int
c
=
iweights
.
dims
[
0
];
int
k
=
iweights
.
dims
[
1
];
...
...
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
0 → 100644
浏览文件 @
d50f776b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
// Builds a single conv2d op description (3x3 kernel, stride 1, padding 1,
// dilation 1, one group) and runs it through TRTConvertValidation.
TEST(conv2d_op, test) {
  framework::Scope scope;
  std::unordered_set<std::string> param_names({"conv2d-Y"});
  TRTConvertValidation validator(5, param_names, scope, 1 << 15);

  // Declare the input, the filter parameter and the output.
  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));

  // Attribute values of the op under test.
  const int conv_groups = 1;
  const std::vector<int> conv_strides({1, 1});
  const std::vector<int> conv_paddings({1, 1});
  const std::vector<int> conv_dilations({1, 1});

  // Describe the op: type, inputs/outputs, then attributes.
  framework::OpDesc conv_desc;
  conv_desc.SetType("conv2d");
  conv_desc.SetInput("Input", {"conv2d-X"});
  conv_desc.SetInput("Filter", {"conv2d-Y"});
  conv_desc.SetOutput("Output", {"conv2d-Out"});
  conv_desc.SetAttr("strides", conv_strides);
  conv_desc.SetAttr("paddings", conv_paddings);
  conv_desc.SetAttr("dilations", conv_dilations);
  conv_desc.SetAttr("groups", conv_groups);

  validator.SetOp(*conv_desc.Proto());
  validator.Execute(3);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
conv2d
);
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
浏览文件 @
d50f776b
...
...
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
framework
::
ProgramDesc
prog
;
auto
*
block
=
prog
.
MutableBlock
(
0
);
auto
*
conv2d_op
=
block
->
AppendOp
();
// init trt engine
cudaStream_t
stream_
;
std
::
unique_ptr
<
TensorRTEngine
>
engine_
;
engine_
.
reset
(
new
TensorRTEngine
(
5
,
1
<<
15
,
&
stream_
));
engine_
->
InitNetwork
();
PADDLE_ENFORCE_EQ
(
cudaStreamCreate
(
&
stream_
),
0
);
engine_
->
DeclareInput
(
"conv2d-X"
,
nvinfer1
::
DataType
::
kFLOAT
,
nvinfer1
::
Dims3
(
2
,
5
,
5
));
conv2d_op
->
SetType
(
"conv2d"
);
conv2d_op
->
SetInput
(
"Input"
,
{
"conv2d-X"
});
conv2d_op
->
SetInput
(
"Filter"
,
{
"conv2d-Y"
});
conv2d_op
->
SetOutput
(
"Output"
,
{
"conv2d-Out"
});
OpConverter
converter
;
const
std
::
vector
<
int
>
strides
({
1
,
1
});
const
std
::
vector
<
int
>
paddings
({
1
,
1
});
const
std
::
vector
<
int
>
dilations
({
1
,
1
});
const
int
groups
=
1
;
conv2d_op
->
SetAttr
(
"strides"
,
strides
);
conv2d_op
->
SetAttr
(
"paddings"
,
paddings
);
conv2d_op
->
SetAttr
(
"dilations"
,
dilations
);
conv2d_op
->
SetAttr
(
"groups"
,
groups
);
// init scope
framework
::
Scope
scope
;
converter
.
ConvertBlock
(
*
block
->
Proto
(),
{},
scope
,
nullptr
/*TensorRTEngine*/
);
std
::
vector
<
int
>
dim_vec
=
{
3
,
2
,
3
,
3
};
auto
*
x
=
scope
.
Var
(
"conv2d-Y"
);
auto
*
x_tensor
=
x
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
(
framework
::
make_ddim
(
dim_vec
));
OpConverter
converter
;
converter
.
ConvertBlock
(
*
block
->
Proto
(),
{
"conv2d-Y"
},
scope
,
engine_
.
get
()
/*TensorRTEngine*/
);
}
}
// namespace tensorrt
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
d50f776b
...
...
@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DEFINE_bool
(
cudnn_deterministic
,
tru
e
,
DEFINE_bool
(
cudnn_deterministic
,
fals
e
,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"
fals
e, the algorithm is deterministic."
);
"
tru
e, the algorithm is deterministic."
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
if
(
input_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
...
...
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if
(
filter_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
...
...
paddle/fluid/operators/conv_mkldnn_op.cc
浏览文件 @
d50f776b
...
...
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_bwd_weights_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_weights_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_data_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
weights_pd
=
conv_bwd_data_pd_
->
weights_primitive_desc
();
auto
user_pd
=
user_weights_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_pd
,
user_weights_memory_p
,
...
...
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@src_mem_p"
,
...
...
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
user_weights_pd
=
user_weights_memory_p
->
get_primitive_desc
();
auto
weights_pd
=
conv_pd_
->
weights_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
...
...
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
// Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make the hashing function more efficient
static
std
::
string
GetHash
(
memory
::
dims
&
input_dims
,
memory
::
dims
&
weights_dims
,
std
::
vector
<
int
>&
strides
,
std
::
vector
<
int
>&
paddings
,
std
::
vector
<
int
>&
dilations
,
int
groups
,
const
std
::
string
&
suffix
)
{
static
std
::
string
GetHash
(
memory
::
dims
&
input_dims
,
// NOLINT
memory
::
dims
&
weights_dims
,
// NOLINT
std
::
vector
<
int
>&
strides
,
// NOLINT
std
::
vector
<
int
>&
paddings
,
// NOLINT
std
::
vector
<
int
>&
dilations
,
// NOLINT
int
groups
,
const
std
::
string
&
suffix
)
{
return
dims2str
(
input_dims
)
+
dims2str
(
weights_dims
)
+
dims2str
(
strides
)
+
dims2str
(
paddings
)
+
dims2str
(
dilations
)
+
std
::
to_string
(
groups
)
+
suffix
;
...
...
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
浏览文件 @
d50f776b
...
...
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
auto
y_dims
_untrimed
=
y
->
dims
();
auto
z_dims
=
z
->
dims
();
// Execute the default elementwise_add operator when
// broadcast operations need to be performed.
if
(
x_dims
!=
y_dims
)
{
if
(
x_dims
!=
y_dims
_untrimed
)
{
auto
sum_func
=
[](
T
a
,
T
b
)
->
T
{
return
a
+
b
;
};
TransformFunctor
<
decltype
(
sum_func
),
T
,
...
...
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
ctx
.
template
device_context
<
paddle
::
platform
::
CPUDeviceContext
>(),
sum_func
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
_untrimed
.
size
()
:
axis
);
PADDLE_ENFORCE
(
axis
>=
0
&&
axis
<
x_dims
.
size
(),
"Axis should be in range [0, x_dims)"
);
trim_trailing_singular_dims
(
&
y_dims
);
auto
y_dims
=
trim_trailing_singular_dims
(
y_dims_untrimed
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
"Wrong layout/format set for Y tensor"
);
std
::
vector
<
int
>
src_x_tz
=
framework
::
vectorize2int
(
x_dims
);
std
::
vector
<
int
>
src_y_tz
=
framework
::
vectorize2int
(
y_dims
);
std
::
vector
<
int
>
src_y_tz
=
framework
::
vectorize2int
(
y_dims
_untrimed
);
std
::
vector
<
int
>
dst_tz
=
framework
::
vectorize2int
(
z_dims
);
std
::
vector
<
memory
::
primitive_desc
>
srcs_pd
;
...
...
@@ -142,20 +142,22 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
// Skip out, x and y:
// dout's length is greater than or equal to that of dx and dy.
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
auto
set_mkldnn_format
=
[](
Tensor
*
in
,
const
Tensor
*
out
)
{
in
->
set_layout
(
DataLayout
::
kMKLDNN
);
in
->
set_format
(
out
->
format
());
};
if
(
x
->
dims
()
==
y
->
dims
())
{
if
(
dx
!=
nullptr
&&
dy
!=
nullptr
&&
dx
->
dims
()
==
dy
->
dims
())
{
if
(
dx
->
dims
()
==
dy
->
dims
())
{
auto
blas
=
math
::
GetBlas
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
if
(
dx
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
...
...
@@ -168,9 +170,10 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dy
,
dout
);
}
}
}
else
{
// Execute default kernel when broadcast is needed
ElemwiseGradCompute
<
paddle
::
platform
::
CPUDeviceContext
,
T
,
Elemwise
Explicit
GradCompute
<
paddle
::
platform
::
CPUDeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
...
...
paddle/fluid/operators/elementwise_add_op.cc
浏览文件 @
d50f776b
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_add
,
"Add"
,
"Out = X + Y"
);
REGISTER_ELEMWISE_GRAD_MAKER
(
elementwise_add
,
Add
);
REGISTER_ELEMWISE_EXPLICIT_OP
(
elementwise_add
,
"Add"
,
"Out = X + Y"
,
"Out"
,
"X"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_add
,
ops
::
ElementwiseAddKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
d50f776b
...
...
@@ -95,8 +95,9 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
framework
::
Tensor
*
dy
)
{
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
ElemwiseGradCompute
<
DeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
ElemwiseExplicitGradCompute
<
DeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
}
...
...
@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
(
x
->
dims
()
==
y
->
dims
()))
{
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
dx
!=
nullptr
&&
dy
!=
nullptr
&&
(
dx
->
dims
()
==
dy
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
else
{
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
...
...
paddle/fluid/operators/elementwise_div_op.cc
浏览文件 @
d50f776b
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_div
,
"Div"
,
"Out = X / Y"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_op.h
浏览文件 @
d50f776b
...
...
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
void
Make
()
final
{
AddInput
(
"X"
,
"(Tensor), The first input tensor of elementwise op."
);
AddInput
(
"Y"
,
"(Tensor), The second input tensor of elementwise op."
);
AddOutput
(
"Out"
,
"The output of elementwise op."
).
Reuse
(
"X"
);
// AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
// memory.").AsIntermediate();
AddOutput
(
"Out"
,
"The output of elementwise op."
);
AddAttr
<
int
>
(
"axis"
,
"(int, default -1). The start dimension index "
"for broadcasting Y onto X."
)
...
...
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
)DOC"
,
GetName
(),
GetEquation
()));
SetReuse
();
}
protected:
virtual
std
::
string
GetName
()
const
=
0
;
virtual
std
::
string
GetEquation
()
const
=
0
;
virtual
void
SetReuse
()
{}
};
class
ElementwiseOpGrad
:
public
framework
::
OperatorWithKernel
{
...
...
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
());
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
)
)
->
type
());
#ifdef PADDLE_WITH_MKLDNN
if
(
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
...
...
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
// Gradient operator for Add/Sub-like elementwise ops, which need neither X nor
// Out as inputs: output shapes are inferred from Out@GRAD and Y only.
class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
 public:
  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
  using operators::ElementwiseOpGrad::GetExpectedKernelType;
  using Tensor = framework::Tensor;

  // X@GRAD (if present) takes the shape of Out@GRAD; Y@GRAD (if present)
  // takes the shape of Y, which is then required as an input.
  void InferShape(framework::InferShapeContext *ctx) const override {
    const auto dout_name = framework::GradVarName("Out");
    PADDLE_ENFORCE(ctx->HasInput(dout_name),
                   "Input(Out@GRAD) should not be null");

    const auto dx_name = framework::GradVarName("X");
    if (ctx->HasOutput(dx_name)) {
      ctx->SetOutputDim(dx_name, ctx->GetInputDim(dout_name));
    }

    const auto dy_name = framework::GradVarName("Y");
    if (ctx->HasOutput(dy_name)) {
      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
      ctx->SetOutputDim(dy_name, ctx->GetInputDim("Y"));
    }
  }
};
}
// namespace operators
}
// namespace paddle
/*
*/
// Declares the class `<kernel_type>GradMaker`, a SingleGradOpDescMaker whose
// Apply() builds the description of the `<kernel_type>_grad` op:
//   inputs : Y, Out@GRAD
//   outputs: X@GRAD, Y@GRAD
//   attrs  : copied from the forward op
// X and Out are not wired into the gradient op. `op_name` is accepted but not
// used by the expansion.
//
// The OpDesc is owned by a std::unique_ptr from the moment it is created, so
// it is released if any of the setters throws.
#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
  class kernel_type##GradMaker                                               \
      : public paddle::framework::SingleGradOpDescMaker {                    \
   public:                                                                   \
    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
                                                                             \
   protected:                                                                \
    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
      std::unique_ptr<paddle::framework::OpDesc> op(                         \
          new paddle::framework::OpDesc());                                  \
      op->SetType(#kernel_type "_grad");                                     \
      op->SetInput("Y", Input("Y"));                                         \
      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
                   OutputGrad("Out"));                                       \
      op->SetAttrMap(Attrs());                                               \
      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
      return op;                                                             \
    }                                                                        \
  }
#define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
...
...
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
::paddle::operators::ElementwiseOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
// Registers the elementwise op `op_type` together with its explicit gradient.
//
// The generated maker returns `op_name` / `equation` from GetName() /
// GetEquation() and forwards the variadic arguments to Reuse() from
// SetReuse(). The forward op is registered with `<op_type>GradMaker`, which
// must already be declared (e.g. via REGISTER_ELEMWISE_GRAD_MAKER), and
// `<op_type>_grad` is registered as ElementwiseOpExplicitGrad.
//
// The overrides are marked `override` so that a signature change in
// ElementwiseOpMaker becomes a compile error instead of a silent non-override.
#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...)   \
  class __ElemwiseOp##op_type##Maker__                                   \
      : public ::paddle::operators::ElementwiseOpMaker {                 \
   protected:                                                            \
    std::string GetName() const override { return op_name; }             \
    std::string GetEquation() const override { return equation; }        \
    void SetReuse() override { Reuse(__VA_ARGS__); }                     \
  };                                                                     \
  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,         \
                    __ElemwiseOp##op_type##Maker__,                      \
                    ::paddle::operators::ElementwiseOpInferVarType,      \
                    op_type##GradMaker);                                 \
  REGISTER_OPERATOR(op_type##_grad,                                      \
                    ::paddle::operators::ElementwiseOpExplicitGrad)
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
d50f776b
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
...
...
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
}
}
inline
void
trim_trailing_singular_dims
(
framework
::
DDim
*
dims
)
{
inline
framework
::
DDim
trim_trailing_singular_dims
(
const
framework
::
DDim
&
dims
)
{
// Remove trailing dimensions of size 1 for y
auto
actual_dims_size
=
dims
->
size
();
auto
actual_dims_size
=
dims
.
size
();
for
(;
actual_dims_size
!=
0
;
--
actual_dims_size
)
{
if
(
(
*
dims
)
[
actual_dims_size
-
1
]
!=
1
)
break
;
if
(
dims
[
actual_dims_size
-
1
]
!=
1
)
break
;
}
if
(
actual_dims_size
!=
dims
->
size
())
{
auto
actual_dims
=
framework
::
vectorize
(
*
dims
);
actual_dims
.
resize
(
actual_dims_size
);
*
dims
=
framework
::
make_ddim
(
actual_dims
);
std
::
vector
<
int
>
trim_dims
;
trim_dims
.
resize
(
actual_dims_size
);
for
(
int
i
=
0
;
i
<
actual_dims_size
;
++
i
)
{
trim_dims
[
i
]
=
dims
[
i
];
}
framework
::
DDim
actual_dims
=
framework
::
make_ddim
(
trim_dims
);
return
actual_dims
;
}
template
<
typename
T
,
typename
DeviceContext
>
...
...
@@ -457,26 +463,30 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
#endif
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseGradCompute
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
if
(
x
.
dims
()
==
y
.
dims
())
{
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x
.
dims
()));
void
ElemwiseGradComputeNoBroadcast
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
DDim
&
x_dim
,
const
framework
::
DDim
&
y_dim
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x_dim
));
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
.
template
device_context
<
DeviceContext
>(),
N
);
for_range
(
ElemwiseGradNoBroadcast
<
T
,
DX_OP
,
DY_OP
>
{
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())});
}
else
{
// Y is a scalar
auto
x_dim
=
x
.
dims
();
auto
y_dim
=
y
.
dims
();
}
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
trim_trailing_singular_dims
(
&
y_dim
);
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseGradComputeWithBroadcast
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
DDim
&
x_dim
,
const
framework
::
DDim
&
y_dim_untrimed
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim_untrimed
.
size
()
:
axis
);
auto
y_dim
=
trim_trailing_singular_dims
(
y_dim_untrimed
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
@@ -494,9 +504,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
#endif
}
else
{
ElemwiseGradBroadcast1CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
}
else
{
...
...
@@ -505,21 +514,70 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
ElemwiseGradBroadcast2CUDA
(
ctx
.
template
device_context
<
DeviceContext
>().
stream
(),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
#endif
}
else
{
ElemwiseGradBroadcast2CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
}
}
// Dispatches the elementwise gradient computation for X and Y.
//
// When both inputs have identical dims the no-broadcast kernel is used;
// otherwise the call is forwarded to the broadcasting kernel (the case the
// original code labels "Y is a scalar"). All tensors, the axis and both
// functors are passed through unchanged; dx / dy may be nullptr and are
// forwarded as-is.
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                         const framework::Tensor& x,
                         const framework::Tensor& y,
                         const framework::Tensor& out,
                         const framework::Tensor& dout, int axis,
                         framework::Tensor* dx, framework::Tensor* dy,
                         DX_OP dx_op, DY_OP dy_op) {
  const framework::DDim x_dim = x.dims();
  const framework::DDim y_dim = y.dims();

  // Same shape on both sides: nothing to broadcast.
  if (x_dim == y_dim) {
    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
    return;
  }

  // Dims differ: hand over to the broadcasting implementation.
  ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
      ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
}
// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
// The explicit gradient cuts X, Y and Out off from the gradient op; callers
// pass dout as a stand-in for X, Y and Out so the shared elementwise code can
// be reused.
//
// The shapes are taken from dout (for dx) and from dy (for dy). When dy is
// nullptr its dims are treated as equal to dout's, which selects the
// no-broadcast path. Otherwise the no-broadcast path is used when dout and dy
// have identical dims and the broadcasting path when they differ.
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
                                 const framework::Tensor& x,
                                 const framework::Tensor& y,
                                 const framework::Tensor& out,
                                 const framework::Tensor& dout, int axis,
                                 framework::Tensor* dx, framework::Tensor* dy,
                                 DX_OP dx_op, DY_OP dy_op) {
  const framework::DDim dx_dims = dout.dims();
  // Without a dy output there is nothing to compare against, so reuse dx_dims.
  const framework::DDim dy_dims = (dy == nullptr) ? dx_dims : dy->dims();

  if (dx_dims == dy_dims) {
    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
  } else {
    // dy has different dims than dout (labelled "Y is a scalar" upstream).
    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
  }
}
// Deprecated
template
<
typename
DeviceContext
,
typename
T
,
typename
functor
,
typename
broadcastfunctor
,
typename
broadcast2functor
>
void
ElementwiseGradCompute
(
const
framework
::
ExecutionContext
&
ctx
,
...
...
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
}
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
trim_trailing_singular_dims
(
&
y_dims
);
trim_trailing_singular_dims
(
y_dims
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
x
,
y
,
z
,
ctx
.
template
device_context
<
DeviceContext
>(),
func
);
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims
.
size
(),
auto
y_dims
_untrimed
=
y
->
dims
();
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims
_untrimed
.
size
(),
"Rank of first input must >= rank of second input."
);
if
(
x_dims
==
y_dims
)
{
if
(
x_dims
==
y_dims
_untrimed
)
{
functor
.
Run
();
return
;
}
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
_untrimed
.
size
()
:
axis
);
PADDLE_ENFORCE
(
axis
>=
0
&&
axis
<
x_dims
.
size
(),
"Axis should be in range [0, x_dims)"
);
trim_trailing_singular_dims
(
&
y_dims
);
auto
y_dims
=
trim_trailing_singular_dims
(
y_dims_untrimed
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
paddle/fluid/operators/elementwise_sub_op.cc
浏览文件 @
d50f776b
...
...
@@ -15,7 +15,10 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_sub
,
"Sub"
,
"Out = X - Y"
);
REGISTER_ELEMWISE_GRAD_MAKER
(
elementwise_sub
,
Sub
);
REGISTER_ELEMWISE_EXPLICIT_OP
(
elementwise_sub
,
"Sub"
,
"Out = X - Y"
,
"Out"
,
"X"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_sub_op.h
浏览文件 @
d50f776b
...
...
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
ElemwiseGradCompute
<
DeviceContext
,
T
,
SubGradDX
<
T
>
,
SubGradDY
<
T
>>
(
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
ElemwiseExplicitGradCompute
<
DeviceContext
,
T
,
SubGradDX
<
T
>
,
SubGradDY
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
SubGradDX
<
T
>
(),
SubGradDY
<
T
>
());
}
};
...
...
paddle/fluid/operators/softmax_op.cc
浏览文件 @
d50f776b
...
...
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out) and its gradients should have a same shape."
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
)));
}
protected:
...
...
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
layout_
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
());
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
)
)
->
type
());
if
(
input_data_type
==
framework
::
proto
::
VarType
::
FP16
)
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"float16 can only be used on GPU place"
);
...
...
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
}
};
// Builds the description of the "softmax_grad" op from a forward softmax op.
//
// The generated grad op takes the forward output "Out" and its gradient as
// inputs, copies the forward op's attribute map, and produces the gradient of
// "X". The forward input "X" itself is not wired into the grad op.
class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the OpDesc from the moment it is allocated so that it is released
    // if any of the setters below throws.
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("softmax_grad");

    op->SetInput("Out", Output("Out"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));

    op->SetAttrMap(Attrs());

    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    return op;
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
softmax
,
ops
::
SoftmaxOp
,
ops
::
SoftmaxOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
ops
::
SoftmaxOpGradMaker
);
REGISTER_OPERATOR
(
softmax_grad
,
ops
::
SoftmaxOpGrad
);
REGISTER_OP_CPU_KERNEL
(
softmax
,
ops
::
SoftmaxKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
d50f776b
...
...
@@ -37,6 +37,7 @@ __all__ = [
__auto__
=
[
'iou_similarity'
,
'box_coder'
,
'polygon_box_transform'
,
]
__all__
+=
__auto__
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
d50f776b
...
...
@@ -66,9 +66,7 @@ __all__ = [
'scatter'
,
'sum'
,
'slice'
,
'polygon_box_transform'
,
'shape'
,
'iou_similarity'
,
'maxout'
,
]
+
__activations__
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
d50f776b
...
...
@@ -121,7 +121,7 @@ class ParallelExecutor(object):
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
exec_strategy
.
num_threads
=
cpu_num
*
2
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
d50f776b
...
...
@@ -49,6 +49,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_train)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_crf
)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed
)
list
(
REMOVE_ITEM TEST_OPS test_dist_se_resnext
)
list
(
REMOVE_ITEM TEST_OPS test_dist_transformer
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
endforeach
(
TEST_OP
)
...
...
@@ -61,4 +62,5 @@ if(WITH_DISTRIBUTE)
endif
()
py_test_modules
(
test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
py_test_modules
(
test_dist_transformer MODULES test_dist_transformer SERIAL
)
py_test_modules
(
test_dist_se_resnext MODULES test_dist_se_resnext SERIAL
)
python/paddle/fluid/tests/unittests/dist_transformer.py
0 → 100644
浏览文件 @
d50f776b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
os
import
sys
import
transformer_model
import
paddle.dataset.wmt16
as
wmt16
# Fix seed for test: pin the random seed of both default programs to the same
# constant so that runs of this script are reproducible.
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1

# Path of the RecordIO file that main() writes the prepared WMT16 batches to.
WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
class ModelHyperParams(object):
    # Hyper-parameters of the transformer model built by this script.
    #
    # Dictionary size for source and target language. This model directly uses
    # paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
    # already been added, but the <pad> token has not. The transformer requires
    # the sequences in a mini-batch to be padded to the same length, so a <pad>
    # token is added on top of the original paddle.dataset.wmt16 dictionary.

    # size of the source word dictionary.
    src_vocab_size = 10000

    # index of the <pad> token in the source language (one past the last
    # dictionary index).
    src_pad_idx = src_vocab_size

    # size of the target word dictionary.
    trg_vocab_size = 10000

    # index of the <pad> token in the target language (one past the last
    # dictionary index).
    trg_pad_idx = trg_vocab_size

    # position value corresponding to the <pad> token.
    pos_pad_idx = 0

    # max length of sequences. Callers add 1 to this value to make room for the
    # position padding token used by the position encoding.
    max_length = 50

    # the dimension of word embeddings, which is also the last dimension of
    # the input and output of multi-head attention, position-wise feed-forward
    # networks, encoder and decoder.
    d_model = 512
    # size of the hidden layer in position-wise feed-forward networks.
    d_inner_hid = 1024
    # the dimension that keys are projected to for dot-product attention.
    d_key = 64
    # the dimension that values are projected to for dot-product attention.
    d_value = 64
    # number of heads used in multi-head attention.
    n_head = 8
    # number of sub-layers to be stacked in the encoder and decoder.
    n_layer = 6
    # dropout rate used by all dropout layers.
    dropout = 0.1
def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
    """
    Turn a batch of (source, target, label) token-id lists into numpy arrays.

    Every sequence is right-padded to the longest sequence of its kind in the
    batch. The function returns a list of nine arrays, in this order:
    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
    trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight.

    Word, position and label arrays are int64 with shape [-1, 1]; the
    attention biases and lbl_weight are float32.
    """

    def _pad(seqs,
             pad_idx,
             is_target=False,
             return_pos=True,
             return_attn_bias=True,
             return_max_len=True):
        """
        Pad `seqs` to a common length and build the requested companions
        (positions, self-attention bias, max length).
        """
        longest = max(len(seq) for seq in seqs)
        padded = np.array(
            [seq + [pad_idx] * (longest - len(seq)) for seq in seqs])
        results = [padded.astype("int64").reshape([-1, 1])]

        if return_pos:
            # 1-based positions; every slot holding the pad id gets position 0.
            positions = np.array([[
                0 if word == pad_idx else idx + 1
                for idx, word in enumerate(row)
            ] for row in padded])
            results.append(positions.astype("int64").reshape([-1, 1]))

        if return_attn_bias:
            if is_target:
                # Strictly upper-triangular mask: blocks attention to
                # subsequent words (and with it the trailing paddings).
                causal = np.triu(
                    np.ones((padded.shape[0], longest, longest)), 1)
                causal = causal.reshape([-1, 1, longest, longest])
                bias = np.tile(causal, [1, n_head, 1, 1]) * [-1e9]
            else:
                # Blocks attention to the padded tail of each sequence.
                pad_mask = np.array([
                    [0] * len(seq) + [-1e9] * (longest - len(seq))
                    for seq in seqs
                ])
                bias = np.tile(
                    pad_mask.reshape([-1, 1, 1, longest]),
                    [1, n_head, longest, 1])
            results.append(bias.astype("float32"))

        if return_max_len:
            results.append(longest)

        if len(results) > 1:
            return results
        return results[0]

    src_seqs = [inst[0] for inst in insts]
    trg_seqs = [inst[1] for inst in insts]
    lbl_seqs = [inst[2] for inst in insts]

    src_word, src_pos, src_slf_attn_bias, src_max_len = _pad(
        src_seqs, src_pad_idx, is_target=False)
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = _pad(
        trg_seqs, trg_pad_idx, is_target=True)

    # Take one row of the source padding bias per (batch, head) and repeat it
    # once for every target position.
    one_row = src_slf_attn_bias[:, :, ::src_max_len, :]
    trg_src_attn_bias = np.tile(one_row,
                                [1, 1, trg_max_len, 1]).astype("float32")

    lbl_word = _pad(lbl_seqs, trg_pad_idx, False, False, False, False)
    # Weight 1.0 for real label tokens, 0.0 for padding.
    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])

    return [
        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
    ]
def transformer(use_feed):
    """
    Build the transformer network from ModelHyperParams and return whatever
    transformer_model.transformer returns.

    `use_feed` must be falsy; feeding is not supported and triggers an
    AssertionError. Vocabulary sizes and max_length are each passed with +1.
    """
    assert not use_feed, "transfomer doesn't support feed yet"
    hp = ModelHyperParams
    return transformer_model.transformer(
        hp.src_vocab_size + 1,
        hp.trg_vocab_size + 1,
        hp.max_length + 1,
        hp.n_layer,
        hp.n_head,
        hp.d_key,
        hp.d_value,
        hp.d_model,
        hp.d_inner_hid,
        hp.dropout,
        hp.src_pad_idx,
        hp.trg_pad_idx,
        hp.pos_pad_idx)
def get_model():
    """
    Build the transformer network, attach an Adam optimizer that minimizes
    its cost, and return that cost variable.
    """
    loss = transformer(use_feed=False)
    fluid.optimizer.Adam().minimize(loss)
    return loss
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    """
    Create a fluid.DistributeTranspiler, run transpile() on `main_program`
    with the given trainer id, pserver endpoints and trainer count, and
    return the transpiler.
    """
    transpile_args = {
        "trainer_id": trainer_id,
        "program": main_program,
        "pservers": pserver_endpoints,
        "trainers": trainers,
    }
    transpiler = fluid.DistributeTranspiler()
    transpiler.transpile(**transpile_args)
    return transpiler
class DistTransformer2x2(object):
    """Runs the transformer model as either a parameter server or a trainer."""

    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
                    trainer_id):
        """
        Build the model, transpile it, and run the pserver program for
        `current_endpoint` on CPU (startup program first, then the pserver
        program itself).
        """
        get_model()
        t = get_transpiler(trainer_id,
                           fluid.default_main_program(), pserver_endpoints,
                           trainers)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

    def _wait_ps_ready(self, pid):
        """
        Poll every 3 seconds until /tmp/paddle.<pid>.port exists.

        Raises AssertionError ("wait ps ready failed") once the retry budget
        is used up.
        """
        retry_times = 20
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            print("waiting ps ready: ", pid)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                retry_times -= 1

    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
        """
        Build the model and train it for seven steps on `place`, printing the
        loss of the first and of the last step (one per line).

        When `is_dist` is true the main program is transpiled for distributed
        training first.
        """
        avg_cost = get_model()
        # NOTE(review): trainer_prog is never handed to ParallelExecutor
        # below; presumably the executor picks up the default main program,
        # which transpile() is expected to have modified in place. Confirm.
        if is_dist:
            t = get_transpiler(trainer_id,
                               fluid.default_main_program(), endpoints,
                               trainers)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        exe = fluid.ParallelExecutor(
            True, loss_name=avg_cost.name, exec_strategy=strategy)

        first_loss, = exe.run(fetch_list=[avg_cost.name])
        print(first_loss)
        # `range` instead of the Python-2-only `xrange`: identical for five
        # iterations and does not raise NameError on Python 3.
        for _ in range(5):
            exe.run(fetch_list=[avg_cost.name])
        last_loss, = exe.run(fetch_list=[avg_cost.name])
        print(last_loss)
def main(role="pserver",
         endpoints="127.0.0.1:9123",
         trainer_id=0,
         current_endpoint="127.0.0.1:9123",
         trainers=1,
         is_dist=True):
    """
    Write the WMT16 training batches to WMT16_RECORDIO_FILE, then run this
    process as a parameter server (role == "pserver") or as a trainer (any
    other role).

    A trainer uses CUDAPlace(0) when Paddle was compiled with CUDA and
    CPUPlace otherwise.
    """
    hp = ModelHyperParams
    sample_reader = wmt16.train(hp.src_vocab_size, hp.trg_vocab_size)
    reader = paddle.batch(
        sample_reader, batch_size=transformer_model.batch_size)

    with fluid.recordio_writer.create_recordio_writer(
            WMT16_RECORDIO_FILE) as writer:
        for batch in reader():
            arrays = prepare_batch_input(batch, hp.src_pad_idx,
                                         hp.trg_pad_idx, hp.n_head)
            # Append every array of the batch, then close the record.
            for array in arrays:
                lod_tensor = fluid.LoDTensor()
                lod_tensor.set(array, fluid.CPUPlace())
                writer.append_tensor(lod_tensor)
            writer.complete_append_tensor()

    model = DistTransformer2x2()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
        return

    if core.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    model.run_trainer(place, endpoints, trainer_id, trainers, is_dist)
# Script entry point. Expects exactly six positional arguments:
#   role endpoints trainer_id current_endpoint trainers is_dist
# where is_dist is true only for the literal string "TRUE".
if __name__ == "__main__":
    if len(sys.argv) != 7:
        print(
            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
        )
        # Stop here: with a wrong argument count the indexing below would
        # either raise IndexError or silently ignore extra arguments.
        sys.exit(1)

    role = sys.argv[1]
    endpoints = sys.argv[2]
    trainer_id = int(sys.argv[3])
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = sys.argv[6] == "TRUE"
    main(
        role=role,
        endpoints=endpoints,
        trainer_id=trainer_id,
        current_endpoint=current_endpoint,
        trainers=trainers,
        is_dist=is_dist)
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
d50f776b
...
...
@@ -66,6 +66,10 @@ def get_numeric_gradient(place,
tensor_to_check_dtype
=
np
.
float32
elif
tensor_to_check_dtype
==
core
.
VarDesc
.
VarType
.
FP64
:
tensor_to_check_dtype
=
np
.
float64
elif
tensor_to_check_dtype
==
core
.
VarDesc
.
VarType
.
FP16
:
tensor_to_check_dtype
=
np
.
float16
# set delta as np.float16, will automatic convert to float32, float64
delta
=
np
.
array
(
delta
).
astype
(
np
.
float16
)
else
:
raise
ValueError
(
"Not supported data type "
+
str
(
tensor_to_check_dtype
))
...
...
@@ -73,13 +77,24 @@ def get_numeric_gradient(place,
gradient_flat
=
np
.
zeros
(
shape
=
(
tensor_size
,
),
dtype
=
tensor_to_check_dtype
)
def
__get_elem__
(
tensor
,
i
):
if
tensor_to_check_dtype
==
np
.
float32
:
if
tensor_to_check_dtype
==
np
.
float16
:
numpy_tensor
=
np
.
array
(
tensor
).
astype
(
np
.
float16
)
numpy_tensor
=
numpy_tensor
.
flatten
()
return
numpy_tensor
[
i
]
elif
tensor_to_check_dtype
==
np
.
float32
:
return
tensor
.
_get_float_element
(
i
)
else
:
return
tensor
.
_get_double_element
(
i
)
def
__set_elem__
(
tensor
,
i
,
e
):
if
tensor_to_check_dtype
==
np
.
float32
:
if
tensor_to_check_dtype
==
np
.
float16
:
numpy_tensor
=
np
.
array
(
tensor
).
astype
(
np
.
float16
)
shape
=
numpy_tensor
.
shape
numpy_tensor
=
numpy_tensor
.
flatten
()
numpy_tensor
[
i
]
=
e
numpy_tensor
=
numpy_tensor
.
reshape
(
shape
).
view
(
np
.
uint16
)
tensor
.
set
(
numpy_tensor
,
place
)
elif
tensor_to_check_dtype
==
np
.
float32
:
tensor
.
_set_float_element
(
i
,
e
)
else
:
tensor
.
_set_double_element
(
i
,
e
)
...
...
@@ -133,6 +148,11 @@ class OpTest(unittest.TestCase):
if
not
self
.
call_once
:
self
.
call_once
=
True
self
.
dtype
=
data_type
# See the comment of np_dtype_to_fluid_dtype
# If the input type is uint16, we assume use float16
# for lodtensor dtype.
if
self
.
dtype
==
np
.
uint16
:
self
.
dtype
==
np
.
float16
def
infer_dtype_from_inputs_outputs
(
self
,
inputs
,
outputs
):
def
infer_dtype
(
numpy_dict
):
...
...
@@ -161,19 +181,25 @@ class OpTest(unittest.TestCase):
for
name
,
np_value
in
self
.
inputs
[
var_name
]:
tensor
=
core
.
LoDTensor
()
if
isinstance
(
np_value
,
tuple
):
tensor
.
set
(
np_value
[
0
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
np_value
[
0
]),
place
)
tensor
.
set_recursive_sequence_lengths
(
np_value
[
1
])
else
:
tensor
.
set
(
np_value
,
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
np_value
),
place
)
feed_map
[
name
]
=
tensor
else
:
tensor
=
core
.
LoDTensor
()
if
isinstance
(
self
.
inputs
[
var_name
],
tuple
):
tensor
.
set
(
self
.
inputs
[
var_name
][
0
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
self
.
inputs
[
var_name
][
0
]),
place
)
tensor
.
set_recursive_sequence_lengths
(
self
.
inputs
[
var_name
][
1
])
else
:
tensor
.
set
(
self
.
inputs
[
var_name
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
self
.
inputs
[
var_name
]),
place
)
feed_map
[
var_name
]
=
tensor
return
feed_map
...
...
@@ -307,13 +333,22 @@ class OpTest(unittest.TestCase):
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
str
(
actual_t
)
+
"
\n
"
+
str
(
expect_t
))
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
))
if
isinstance
(
expect
,
tuple
):
self
.
assertListEqual
(
actual
.
recursive_sequence_lengths
(),
expect
[
1
],
"Output ("
+
out_name
+
") has different lod at "
+
str
(
place
))
def
_get_places
(
self
):
if
self
.
dtype
==
np
.
float16
:
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
return
[
place
]
else
:
return
[]
places
=
[
fluid
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
places
.
append
(
core
.
CUDAPlace
(
0
))
...
...
@@ -344,8 +379,8 @@ class OpTest(unittest.TestCase):
def
err_msg
():
offset
=
np
.
argmax
(
diff_mat
>
max_relative_error
)
return
(
"%s Variable %s max gradient diff %f over limit %f, "
"the first error element is %d,
%f, %f"
)
%
(
msg_prefix
,
name
,
max_diff
,
max_relative_error
,
"the first error element is %d,
expected %f, but got %f"
)
%
(
msg_prefix
,
name
,
max_diff
,
max_relative_error
,
offset
,
a
.
flatten
()[
offset
],
b
.
flatten
()[
offset
])
self
.
assertLessEqual
(
max_diff
,
max_relative_error
,
err_msg
())
...
...
@@ -435,6 +470,21 @@ class OpTest(unittest.TestCase):
input
.
dtype
=
np
.
uint16
return
input
@staticmethod
def fluid_dtype_to_np_dtype(self, dtype=None):
    """
    See above, convert the dtype to normal type.

    Maps np.uint16 to np.float16 and returns every other dtype unchanged.

    This is a static method that was declared with a leading `self`
    parameter, so the natural call `fluid_dtype_to_np_dtype(dtype)` used to
    raise TypeError. Both call forms are accepted now:
    `fluid_dtype_to_np_dtype(dtype)` and the legacy
    `fluid_dtype_to_np_dtype(anything, dtype)`.
    """
    if dtype is None:
        # Single-argument call: the dtype arrived in the first slot.
        dtype = self
    if dtype == np.uint16:
        dtype = np.float16
    return dtype
@staticmethod
def np_value_to_fluid_value(input):
    """
    Return `input` viewed as np.uint16 when its dtype is np.float16;
    any other array is returned as-is.
    """
    return input.view(np.uint16) if input.dtype == np.float16 else input
def
_get_gradient
(
self
,
input_to_check
,
place
,
...
...
@@ -457,7 +507,7 @@ class OpTest(unittest.TestCase):
if
isinstance
(
place
,
fluid
.
CUDAPlace
(
0
)):
use_cuda
=
True
executor
=
fluid
.
ParallelExecutor
(
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
prog
ram
)
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
prog
)
else
:
executor
=
Executor
(
place
)
return
map
(
np
.
array
,
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
0 → 100644
浏览文件 @
d50f776b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
import
unittest
import
os
import
sys
import
signal
import
subprocess
class TestDistBase(unittest.TestCase):
    """
    Base class for distributed-training tests with 2 pservers and 2 trainers.

    Subclasses call check_with_place() with the path of a model script. The
    script is run once as a single local trainer and once as a 2x2
    distributed job, and the first and last losses of both runs are compared.
    """

    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"

    def start_pserver(self, model_file):
        """Start both parameter servers and return their Popen handles."""
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return ps0_proc, ps1_proc

    def _wait_ps_ready(self, pid):
        """
        Poll every 3 seconds until /tmp/paddle.<pid>.port exists.

        Raises AssertionError ("wait ps ready failed") once the retry budget
        is used up.
        """
        retry_times = 50
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1

    def check_with_place(self, model_file, delta=1e-3):
        """
        Run `model_file` locally and as a 2x2 distributed job and assert that
        the first and the last loss of both runs agree within `delta`.

        The parameter servers are always killed before this method returns,
        including when an assertion fails.
        """
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
        required_envs = {
            "PATH": os.getenv("PATH"),
            "PYTHONPATH": os.getenv("PYTHONPATH"),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
        }
        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        # NOTE: "FLASE" is a typo for FALSE; it is kept byte-for-byte here
        # because the model scripts are expected to treat anything other than
        # the literal "TRUE" as false.
        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
            (self._python_interp, model_file,
             "127.0.0.1:1234", "127.0.0.1:1234", 1)
        local_proc = subprocess.Popen(
            local_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env_local)
        # communicate() drains both pipes while waiting for the child.
        # Calling wait() first can deadlock once the child fills a pipe.
        local_ret, err = local_proc.communicate()
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1 = self.start_pserver(model_file)
        try:
            self._wait_ps_ready(ps0.pid)
            self._wait_ps_ready(ps1.pid)

            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
            tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
                (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
                 self._trainers)
            tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
                (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
                 self._trainers)

            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
            env0.update(required_envs)
            env1.update(required_envs)

            # Trainer 1's output is never inspected, so it goes to the null
            # device; an unread PIPE could fill up and stall that trainer.
            with open(os.devnull, 'w') as fnull:
                tr0_proc = subprocess.Popen(
                    tr0_cmd.split(" "),
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    env=env0)
                tr1_proc = subprocess.Popen(
                    tr1_cmd.split(" "), stdout=fnull, stderr=fnull, env=env1)

                loss_data0, err = tr0_proc.communicate()
                tr1_proc.wait()

            sys.stderr.write('dist_stderr: %s\n' % err)
            sys.stderr.write('dist_loss: %s\n' % loss_data0)

            # NOTE(review): eval() runs on the trainers' stdout. That output
            # comes from this test's own model script; do not reuse this
            # parsing on output from an untrusted process.
            lines = loss_data0.split("\n")
            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
            dist_last_loss = eval(lines[1].replace(" ", ","))[0]

            local_lines = local_ret.split("\n")
            local_first_loss = eval(local_lines[0])[0]
            local_last_loss = eval(local_lines[1])[0]

            self.assertAlmostEqual(
                local_first_loss, dist_first_loss, delta=delta)
            self.assertAlmostEqual(
                local_last_loss, dist_last_loss, delta=delta)
        finally:
            # Always tear the parameter servers down, even when a check above
            # failed, so they do not keep the ports and GPUs busy.
            for ps in (ps0, ps1):
                try:
                    os.kill(ps.pid, signal.SIGKILL)
                except OSError:
                    # The process is already gone; nothing left to kill.
                    pass
                ps.wait()
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
d50f776b
...
...
@@ -11,127 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
unittest
import
os
import
sys
import
signal
import
subprocess
class
TestDistSeResneXt2x2
(
unittest
.
TestCase
):
def setUp(self):
    """Set the fixed cluster configuration shared by the test methods."""
    # Comma-separated parameter-server endpoints; passed verbatim on the
    # command line of every spawned process and split per server elsewhere.
    self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
    # Interpreter used as the first token of every spawned command line.
    self._python_interp = "python"
    # Two parameter servers and two trainers.
    self._pservers = self._trainers = 2
def start_pserver(self):
    """Launch both parameter-server processes.

    Each server runs ``dist_se_resnext.py`` in ``pserver`` mode and is given
    the full endpoint list, its own endpoint and the trainer count.

    Returns:
        tuple: ``(ps0_proc, ps1_proc)`` -- the two ``subprocess.Popen``
        handles, each created with stdout and stderr piped.
    """
    cmd_template = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE"
    # Exactly two endpoints are expected; unpacking fails otherwise.
    first_ep, second_ep = self._ps_endpoints.split(",")

    # Format every command line before spawning anything, so a formatting
    # error cannot leave a half-started cluster behind.
    commands = [
        cmd_template % (self._python_interp, self._ps_endpoints, endpoint,
                        self._trainers)
        for endpoint in (first_ep, second_ep)
    ]

    servers = []
    for command in commands:
        servers.append(
            subprocess.Popen(
                command.split(" "),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE))
    return servers[0], servers[1]
def _wait_ps_ready(self, pid):
    """Block until the parameter server with process id ``pid`` is ready.

    Readiness is detected by the existence of ``/tmp/paddle.<pid>.port``.
    The file is checked once every 3 seconds.

    Args:
        pid (int): process id of the parameter server to wait for.

    Raises:
        AssertionError: if the file has still not appeared once the retry
            budget is used up.
    """
    retry_times = 20
    while True:
        # Raise explicitly rather than with an ``assert`` statement:
        # asserts are stripped when Python runs with -O, which would turn
        # this into an endless loop if the server never came up.
        if retry_times < 0:
            raise AssertionError("wait ps ready failed")
        time.sleep(3)
        try:
            # The listen_and_serv_op touches a file containing the listen
            # port in the /tmp directory once it is ready to process all
            # RPC calls.
            os.stat("/tmp/paddle.%d.port" % pid)
            return
        except os.error:
            retry_times -= 1
def test_with_place(self):
    """Check that 2x2 distributed training matches a local run.

    Runs ``dist_se_resnext.py`` once as a single local trainer, then again
    with two parameter servers and two trainers, and asserts that the first
    and last losses printed by trainer 0 are almost equal to the local ones.
    The parameter servers are always killed, even when a check fails.
    """
    # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
    required_envs = {"FLAGS_fraction_of_gpu_memory_to_use": "0.15"}
    # Forward only the variables that are actually set: a ``None`` value in
    # the ``env`` mapping makes ``subprocess.Popen`` fail.
    for var_name in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH"):
        var_value = os.getenv(var_name)
        if var_value is not None:
            required_envs[var_name] = var_value

    # Run local to get a base line
    env_local = {"CUDA_VISIBLE_DEVICES": "0"}
    env_local.update(required_envs)
    # NOTE(review): "FLASE" looks like a typo for "FALSE"; it is kept
    # byte-for-byte because the spawned script receives it as an argument
    # -- confirm how the script interprets it before changing.
    local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
        (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
    local_proc = subprocess.Popen(
        local_cmd.split(" "),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env_local)
    # communicate() drains both pipes while waiting; calling wait() first
    # can deadlock once the child fills a pipe buffer.
    local_ret, err = local_proc.communicate()
    sys.stderr.write('local_loss: %s\n' % local_ret)
    sys.stderr.write('local_stderr: %s\n' % err)

    # Run dist train to compare with local results
    ps0, ps1 = self.start_pserver()
    FNULL = open(os.devnull, 'w')
    try:
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)

        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        env0.update(required_envs)
        env1.update(required_envs)

        tr0_proc = subprocess.Popen(
            tr0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env0)
        # Trainer 1's output is never inspected, so it goes to the null
        # device; an undrained pipe could block it once the buffer fills.
        tr1_proc = subprocess.Popen(
            tr1_cmd.split(" "), stdout=FNULL, stderr=FNULL, env=env1)

        loss_data0, err = tr0_proc.communicate()
        tr1_proc.wait()
        sys.stderr.write('dist_stderr: %s\n' % err)
        sys.stderr.write('dist_loss: %s\n' % loss_data0)

        # NOTE: eval() is applied to text printed by the training script
        # this test launched itself; never reuse this on untrusted output.
        lines = loss_data0.split("\n")
        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
        dist_last_loss = eval(lines[1].replace(" ", ","))[0]

        local_lines = local_ret.split("\n")
        local_first_loss = eval(local_lines[0])[0]
        local_last_loss = eval(local_lines[1])[0]

        # NOTE(review): this import sits mid-method in the source and is
        # not used here; it looks like residue of the diff view -- confirm
        # before moving or removing it.
        from test_dist_base import TestDistBase

        self.assertAlmostEqual(local_first_loss, dist_first_loss)
        self.assertAlmostEqual(local_last_loss, dist_last_loss)
    finally:
        # Always tear the servers down, so that a failed check above does
        # not leave parameter-server processes running.
        try:
            os.kill(ps0.pid, signal.SIGKILL)
        finally:
            try:
                os.kill(ps1.pid, signal.SIGKILL)
            finally:
                FNULL.close()
class TestDistSeResneXt2x2(TestDistBase):
    """Runs the shared ``check_with_place`` routine on the SE-ResNeXt script."""

    def test_se_resnext(self):
        """Check ``dist_se_resnext.py`` with a loss tolerance of 0.2."""
        # TODO(paddle-dev): Is the delta too large?
        model_script = "dist_se_resnext.py"
        self.check_with_place(model_script, delta=0.2)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_transformer.py
0 → 100644
浏览文件 @
d50f776b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
from
test_dist_base
import
TestDistBase
class TestDistTransformer2x2(TestDistBase):
    """Runs the shared ``check_with_place`` routine on the transformer script."""

    def test_transformer(self):
        """Check ``dist_transformer.py`` with a loss tolerance of 400."""
        # TODO(paddle-dev): check if the delta is OK.
        # Usually start around ~8000 and converge to ~5000
        model_script = "dist_transformer.py"
        self.check_with_place(model_script, delta=400)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
浏览文件 @
d50f776b
...
...
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
def
setUp
(
self
):
self
.
op_type
=
"elementwise_sub"
self
.
inputs
=
{
'X'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float32"
),
'Y'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float32"
)
'X'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
3
]).
astype
(
"float32"
),
'Y'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
3
]).
astype
(
"float32"
)
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
]
-
self
.
inputs
[
'Y'
]}
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
d50f776b
...
...
@@ -17,6 +17,8 @@ import numpy as np
import
math
from
op_test
import
OpTest
np
.
random
.
seed
(
100
)
def find_latest_set(num):
    """Return ``1 + floor(log2(num))`` as an ``int``.

    For a positive integer this is the 1-based index of its highest set
    bit, up to the floating-point rounding of the logarithm.
    """
    log2_of_num = math.log(num, 2)
    whole_part = int(math.floor(log2_of_num))
    return whole_part + 1
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
d50f776b
...
...
@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def test_shape(self):
    """Build a program with a ``shape`` layer and check it returns an output."""
    main_program = Program()
    with program_guard(main_program):
        # Local renamed from ``input`` to avoid shadowing the builtin; the
        # variable name given to the layer is still the string "input".
        data_var = layers.data(
            name="input", shape=[3, 100, 100], dtype="float32")
        shape_out = layers.shape(data_var, name="shape")
        self.assertIsNotNone(shape_out)
    print(str(main_program))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
d50f776b
...
...
@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase):
self
.
check_batchnorm_fc_convergence
(
False
)
def
test_batchnorm_fc_with_new_strategy
(
self
):
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
True
)
# FIXME(zcd): disable this test temporarily.
# self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
False
)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
浏览文件 @
d50f776b
...
...
@@ -21,7 +21,7 @@ import paddle
import
paddle.dataset.wmt16
as
wmt16
import
os
WMT16_RECORDIO_FILE
=
"
./wmt16_test_pe
.recordio"
WMT16_RECORDIO_FILE
=
"
/tmp/wmt16
.recordio"
class
ModelHyperParams
(
object
):
...
...
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
writer
.
append_tensor
(
t
)
writer
.
complete_append_tensor
()
@
unittest
.
skip
(
"transformer is buggy in multi gpu"
)
def
test_main
(
self
):
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
,
iter
=
5
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/testsuite.py
浏览文件 @
d50f776b
...
...
@@ -18,14 +18,6 @@ import paddle.fluid.core as core
from
paddle.fluid.op
import
Operator
def as_lodtensor(np_array, lod, place):
    """Create a ``core.LoDTensor`` holding ``np_array`` on ``place``.

    Args:
        np_array: the data handed to ``LoDTensor.set``.
        lod: recursive sequence lengths to attach, or ``None`` to attach
            none.
        place: the place handed to ``LoDTensor.set``.

    Returns:
        The newly created ``core.LoDTensor``.
    """
    tensor = core.LoDTensor()
    # The body previously read an undefined name (``np_value``), which
    # raised NameError on every call; the data to copy is ``np_array``.
    tensor.set(np_array, place)
    if lod is not None:
        tensor.set_recursive_sequence_lengths(lod)
    return tensor
def
create_op
(
scope
,
op_type
,
inputs
,
outputs
,
attrs
):
kwargs
=
dict
()
...
...
@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
def
set_input
(
scope
,
op
,
inputs
,
place
):
def
np_value_to_fluid_value
(
input
):
if
input
.
dtype
==
np
.
float16
:
input
=
input
.
view
(
np
.
uint16
)
return
input
def
__set_input__
(
var_name
,
var
):
if
isinstance
(
var
,
tuple
)
or
isinstance
(
var
,
np
.
ndarray
):
tensor
=
scope
.
find_var
(
var_name
).
get_tensor
()
...
...
@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place):
tensor
.
set_recursive_sequence_lengths
(
var
[
1
])
var
=
var
[
0
]
tensor
.
_set_dims
(
var
.
shape
)
tensor
.
set
(
var
,
place
)
tensor
.
set
(
np_value_to_fluid_value
(
var
)
,
place
)
elif
isinstance
(
var
,
float
):
scope
.
find_var
(
var_name
).
set_float
(
var
)
elif
isinstance
(
var
,
int
):
...
...
@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
if
name
not
in
np_list
:
assert
var_proto
.
intermediate
,
"{} not found"
.
format
(
name
)
else
:
# infer the dtype from the numpy value.
np_value
=
np_list
[
name
]
if
isinstance
(
np_value
,
tuple
):
dtype
=
np_value
[
0
].
dtype
...
...
@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
if
is_input
:
shape
=
list
(
np_value
.
shape
)
lod_level
=
0
# NOTE(dzhwinter): type hacking
# numpy float16 is bound to paddle::platform::float16
# in tensor_py.h with the help of the uint16 data type, because
# the internal memory representation of float16 is
# actually uint16_t in paddle. So we use np.uint16 in numpy for the
# raw memory, which can pass through pybind. In the test case
# we therefore feed data as data.view(uint16), although the dtype is in fact float16.
# data.view(uint16) does not cast the data; it only reinterprets the same bytes as uint16.
if
dtype
==
np
.
uint16
:
dtype
=
np
.
float16
return
block
.
create_var
(
dtype
=
dtype
,
shape
=
shape
,
lod_level
=
lod_level
,
name
=
name
)
...
...
python/paddle/fluid/tests/unittests/transformer_model.py
浏览文件 @
d50f776b
...
...
@@ -403,7 +403,7 @@ def transformer(
trg_pad_idx
,
pos_pad_idx
,
):
file_obj
=
fluid
.
layers
.
open_recordio_file
(
filename
=
'
.
/wmt16.recordio'
,
filename
=
'
/tmp
/wmt16.recordio'
,
shapes
=
[
[
batch_size
*
max_length
,
1
],
[
batch_size
*
max_length
,
1
],
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录