Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
a58dd3e5
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a58dd3e5
编写于
8月 04, 2018
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into port_python3_syntax
上级
850c3945
0964de11
变更
104
展开全部
隐藏空白更改
内联
并排
Showing
104 changed file
with
2720 addition
and
710 deletion
+2720
-710
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+4
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-2
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+1
-0
paddle/fluid/framework/data_type.cc
paddle/fluid/framework/data_type.cc
+3
-1
paddle/fluid/framework/data_type_test.cc
paddle/fluid/framework/data_type_test.cc
+40
-0
paddle/fluid/framework/details/exception_holder.h
paddle/fluid/framework/details/exception_holder.h
+83
-0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
...uid/framework/details/scope_buffered_ssa_graph_executor.h
+3
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+5
-22
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+3
-3
paddle/fluid/framework/ir/graph_helper_test.cc
paddle/fluid/framework/ir/graph_helper_test.cc
+2
-2
paddle/fluid/framework/ir/graph_test.cc
paddle/fluid/framework/ir/graph_test.cc
+6
-6
paddle/fluid/framework/op_kernel_type_test.cc
paddle/fluid/framework/op_kernel_type_test.cc
+7
-0
paddle/fluid/framework/op_proto_maker.cc
paddle/fluid/framework/op_proto_maker.cc
+34
-0
paddle/fluid/framework/op_proto_maker.h
paddle/fluid/framework/op_proto_maker.h
+2
-0
paddle/fluid/framework/op_proto_maker_test.cc
paddle/fluid/framework/op_proto_maker_test.cc
+102
-5
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+33
-2
paddle/fluid/framework/tensor.h
paddle/fluid/framework/tensor.h
+1
-1
paddle/fluid/framework/tensor_test.cc
paddle/fluid/framework/tensor_test.cc
+15
-0
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+3
-0
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+7
-0
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+13
-17
paddle/fluid/inference/analysis/analyzer_main.cc
paddle/fluid/inference/analysis/analyzer_main.cc
+33
-0
paddle/fluid/inference/analysis/analyzer_tester.cc
paddle/fluid/inference/analysis/analyzer_tester.cc
+6
-2
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+14
-0
paddle/fluid/inference/analysis/data_flow_graph.h
paddle/fluid/inference/analysis/data_flow_graph.h
+3
-1
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+2
-2
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
...nference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+5
-5
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
...fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+9
-3
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
...fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+30
-6
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
.../fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+1
-1
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
...nference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+2
-1
paddle/fluid/inference/analysis/helper.h
paddle/fluid/inference/analysis/helper.h
+15
-0
paddle/fluid/inference/analysis/model_store_pass.cc
paddle/fluid/inference/analysis/model_store_pass.cc
+63
-0
paddle/fluid/inference/analysis/model_store_pass.h
paddle/fluid/inference/analysis/model_store_pass.h
+53
-0
paddle/fluid/inference/analysis/model_store_pass_tester.cc
paddle/fluid/inference/analysis/model_store_pass_tester.cc
+43
-0
paddle/fluid/inference/analysis/pass.h
paddle/fluid/inference/analysis/pass.h
+1
-0
paddle/fluid/inference/analysis/pass_manager_tester.cc
paddle/fluid/inference/analysis/pass_manager_tester.cc
+5
-2
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+4
-4
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
...rence/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+3
-3
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
...fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+3
-4
paddle/fluid/inference/analysis/ut_helper.h
paddle/fluid/inference/analysis/ut_helper.h
+3
-18
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+4
-1
paddle/fluid/inference/api/api_anakin_engine.cc
paddle/fluid/inference/api/api_anakin_engine.cc
+71
-20
paddle/fluid/inference/api/api_anakin_engine.h
paddle/fluid/inference/api/api_anakin_engine.h
+12
-8
paddle/fluid/inference/api/api_anakin_engine_tester.cc
paddle/fluid/inference/api/api_anakin_engine_tester.cc
+17
-16
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+11
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+12
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+4
-5
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+9
-8
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+6
-6
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+4
-2
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+6
-2
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+50
-1
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+210
-0
paddle/fluid/inference/tensorrt/convert/fc_op.cc
paddle/fluid/inference/tensorrt/convert/fc_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+25
-0
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+57
-0
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
...e/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+73
-0
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+33
-3
paddle/fluid/inference/tensorrt/convert/ut_helper.h
paddle/fluid/inference/tensorrt/convert/ut_helper.h
+1
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+4
-4
paddle/fluid/operators/conv_mkldnn_op.cc
paddle/fluid/operators/conv_mkldnn_op.cc
+12
-12
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+25
-22
paddle/fluid/operators/elementwise_add_op.cc
paddle/fluid/operators/elementwise_add_op.cc
+3
-1
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+9
-7
paddle/fluid/operators/elementwise_div_op.cc
paddle/fluid/operators/elementwise_div_op.cc
+2
-0
paddle/fluid/operators/elementwise_op.h
paddle/fluid/operators/elementwise_op.h
+71
-3
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+122
-64
paddle/fluid/operators/elementwise_sub_op.cc
paddle/fluid/operators/elementwise_sub_op.cc
+4
-1
paddle/fluid/operators/elementwise_sub_op.h
paddle/fluid/operators/elementwise_sub_op.h
+6
-5
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+25
-0
paddle/fluid/operators/math/im2col.cc
paddle/fluid/operators/math/im2col.cc
+10
-52
paddle/fluid/operators/math/im2col_cfo_cpu.h
paddle/fluid/operators/math/im2col_cfo_cpu.h
+252
-0
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+103
-72
paddle/fluid/operators/reshape_op.cc
paddle/fluid/operators/reshape_op.cc
+5
-26
paddle/fluid/operators/softmax_op.cc
paddle/fluid/operators/softmax_op.cc
+23
-4
paddle/fluid/operators/split_ids_op.h
paddle/fluid/operators/split_ids_op.h
+11
-4
paddle/fluid/platform/cuda_helper_test.cu
paddle/fluid/platform/cuda_helper_test.cu
+109
-74
paddle/fluid/platform/cuda_primitives.h
paddle/fluid/platform/cuda_primitives.h
+10
-10
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+1
-1
python/paddle/batch.py
python/paddle/batch.py
+6
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-0
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+1
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+4
-5
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+0
-2
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+1
-1
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+280
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+61
-11
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+137
-0
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+5
-118
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+27
-0
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
...n/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+2
-2
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+2
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+9
-0
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
...id/tests/unittests/test_memory_optimization_transpiler.py
+24
-0
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+2
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
...uid/tests/unittests/test_parallel_executor_transformer.py
+2
-3
python/paddle/fluid/tests/unittests/test_reshape_op.py
python/paddle/fluid/tests/unittests/test_reshape_op.py
+3
-3
python/paddle/fluid/tests/unittests/test_split_ids_op.py
python/paddle/fluid/tests/unittests/test_split_ids_op.py
+52
-0
python/paddle/fluid/tests/unittests/testsuite.py
python/paddle/fluid/tests/unittests/testsuite.py
+17
-9
python/paddle/fluid/tests/unittests/transformer_model.py
python/paddle/fluid/tests/unittests/transformer_model.py
+1
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+1
-0
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+1
-1
未找到文件。
cmake/external/anakin.cmake
浏览文件 @
a58dd3e5
...
...
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
set
(
ANAKIN_LIBRARY
"
${
ANAKIN_INSTALL_DIR
}
"
CACHE STRING
"path of Anakin library"
)
set
(
ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-Wno-error=unused-variable -Wno-unused-variable
-Wno-error=format-extra-args -Wno-format-extra-args
-Wno-error=comment -Wno-comment
...
...
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp
)
set
(
ANAKIN_LIBRARY_URL
"https://github.com/pangge/Anakin/releases/download/
3.0/anakin_release_simple
.tar.gz"
)
set
(
ANAKIN_LIBRARY_URL
"https://github.com/pangge/Anakin/releases/download/
Version0.1.0/anakin
.tar.gz"
)
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
...
...
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
message
(
STATUS
"Download Anakin library from
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"rm -rf
${
ANAKIN_INSTALL_DIR
}
/*"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; wget -q
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; wget -
-no-check-certificate -
q
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; tar xzf anakin
_release_simple
.tar.gz"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; tar xzf anakin.tar.gz"
)
endif
()
if
(
WITH_ANAKIN
)
...
...
paddle/fluid/API.spec
浏览文件 @
a58dd3e5
...
...
@@ -263,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
...
...
@@ -306,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
a58dd3e5
...
...
@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
nv_test
(
dim_test SRCS dim_test.cu DEPS ddim
)
cc_library
(
data_type SRCS data_type.cc DEPS framework_proto ddim device_context
)
cc_test
(
data_type_test SRCS data_type_test.cc DEPS data_type place tensor
)
if
(
WITH_GPU
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
else
()
...
...
paddle/fluid/framework/data_type.cc
浏览文件 @
a58dd3e5
...
...
@@ -17,6 +17,8 @@
#include <string>
#include <unordered_map>
using
float16
=
paddle
::
platform
::
float16
;
namespace
paddle
{
namespace
framework
{
...
...
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
RegisterType<cc_type>(retv, proto_type, #cc_type)
// NOTE: Add your customize type here.
RegType
(
platform
::
float16
,
proto
::
VarType
::
FP16
);
RegType
(
float16
,
proto
::
VarType
::
FP16
);
RegType
(
float
,
proto
::
VarType
::
FP32
);
RegType
(
double
,
proto
::
VarType
::
FP64
);
RegType
(
int
,
proto
::
VarType
::
INT32
);
...
...
paddle/fluid/framework/data_type_test.cc
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
TEST
(
DataType
,
float16
)
{
using
paddle
::
framework
::
Tensor
;
using
paddle
::
platform
::
CPUPlace
;
using
paddle
::
platform
::
float16
;
namespace
f
=
paddle
::
framework
;
f
::
proto
::
VarType
::
Type
dtype
=
f
::
proto
::
VarType
::
FP16
;
Tensor
tensor
;
CPUPlace
cpu
;
tensor
.
mutable_data
(
cpu
,
f
::
ToTypeIndex
(
dtype
));
// test fp16 tensor
EXPECT_EQ
(
tensor
.
type
(),
std
::
type_index
(
typeid
(
float16
)));
// test fp16 size
EXPECT_EQ
(
f
::
SizeOfType
(
f
::
ToTypeIndex
(
dtype
)),
2u
);
// test debug info
std
::
string
type
=
"float16"
;
EXPECT_STREQ
(
f
::
DataTypeToString
(
dtype
).
c_str
(),
type
.
c_str
());
}
paddle/fluid/framework/details/exception_holder.h
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
ExceptionHolder
{
public:
void
Catch
(
const
platform
::
EnforceNotMet
&
exp
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
exception_
.
reset
(
new
platform
::
EnforceNotMet
(
exp
));
type_
=
kEnforceNotMet
;
}
void
Catch
(
const
platform
::
EOFException
&
exp
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
// EOFException will not cover up existing EnforceNotMet.
if
(
exception_
.
get
()
==
nullptr
)
{
exception_
.
reset
(
new
platform
::
EOFException
(
exp
));
type_
=
kEOF
;
}
}
bool
ExceptionCatched
()
const
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
return
exception_
.
get
()
!=
nullptr
;
}
void
Throw
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
switch
(
type_
)
{
case
kNone
:
break
;
case
kEnforceNotMet
:
{
auto
e
=
*
static_cast
<
platform
::
EnforceNotMet
*>
(
exception_
.
get
());
throw
e
;
break
;
}
case
kEOF
:
{
auto
e
=
*
static_cast
<
platform
::
EOFException
*>
(
exception_
.
get
());
throw
e
;
break
;
}
default:
LOG
(
FATAL
)
<<
"Unknown exception."
;
}
exception_
.
reset
();
type_
=
kNone
;
}
void
Clear
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu_
);
exception_
.
reset
();
type_
=
kNone
;
}
private:
enum
ExceptionType
{
kNone
,
kEnforceNotMet
,
kEOF
};
ExceptionType
type_
{
kNone
};
std
::
unique_ptr
<
std
::
exception
>
exception_
;
mutable
std
::
mutex
mu_
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
浏览文件 @
a58dd3e5
...
...
@@ -41,7 +41,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
std
::
vector
<
VariableInfo
>
var_infos
,
std
::
vector
<
platform
::
Place
>
places
,
std
::
unique_ptr
<
SSAGraphExecutor
>&&
underlying_executor
);
const
ir
::
Graph
&
Graph
()
const
{
return
underlying_executor_
->
Graph
();
}
const
ir
::
Graph
&
Graph
()
const
override
{
return
underlying_executor_
->
Graph
();
}
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>&
fetch_tensors
)
override
;
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
a58dd3e5
...
...
@@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_
.
clear
();
exception_
.
reset
();
exception_
holder_
.
Clear
();
// Step 3. Execution
while
(
!
pending_vars
.
empty
())
{
...
...
@@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto
cur_ready_vars
=
ready_vars
.
PopAll
(
1
,
&
timeout
);
if
(
timeout
)
{
std
::
unique_lock
<
std
::
mutex
>
l
(
exception_mu_
);
if
(
exception_
)
{
l
.
unlock
();
if
(
exception_holder_
.
ExceptionCatched
())
{
for
(
auto
&
run_op_future
:
run_op_futures_
)
{
run_op_future
.
wait
();
}
l
.
lock
();
std
::
exception
*
exp
=
exception_
.
get
();
if
(
dynamic_cast
<
platform
::
EOFException
*>
(
exp
))
{
auto
e
=
*
static_cast
<
platform
::
EOFException
*>
(
exp
);
throw
e
;
}
else
if
(
dynamic_cast
<
platform
::
EnforceNotMet
*>
(
exp
))
{
auto
e
=
*
static_cast
<
platform
::
EnforceNotMet
*>
(
exp
);
throw
e
;
}
else
{
LOG
(
FATAL
)
<<
"Unknown exception."
;
}
exception_holder_
.
Throw
();
}
else
{
continue
;
}
...
...
@@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q
->
Extend
(
op
->
Outputs
());
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
}
catch
(
platform
::
EOFException
ex
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
exception_mu_
);
// EOFException will not cover up existing EnforceNotMet.
if
(
exception_
.
get
()
==
nullptr
)
{
exception_
.
reset
(
new
platform
::
EOFException
(
ex
));
}
exception_holder_
.
Catch
(
ex
);
}
catch
(
platform
::
EnforceNotMet
ex
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
exception_mu_
);
exception_
.
reset
(
new
platform
::
EnforceNotMet
(
ex
));
exception_holder_
.
Catch
(
ex
);
}
catch
(...)
{
LOG
(
FATAL
)
<<
"Unknown exception catched"
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
a58dd3e5
...
...
@@ -24,6 +24,7 @@
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
...
...
@@ -42,7 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
const
ir
::
Graph
&
Graph
()
const
{
return
*
graph_
;
}
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graph_
;
}
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
...
...
@@ -59,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
platform
::
DeviceContextPool
fetch_ctxs_
;
std
::
mutex
exception_mu_
;
std
::
unique_ptr
<
std
::
exception
>
exception_
;
ExceptionHolder
exception_holder_
;
std
::
atomic
<
int
>
running_ops_
;
void
InsertPendingOp
(
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
*
pending_ops
,
...
...
paddle/fluid/framework/ir/graph_helper_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
for
(
size_t
i
=
0
;
i
<
sorted
.
size
();
++
i
)
{
node_map
[
sorted
[
i
]
->
Name
()]
=
i
;
}
ASSERT_EQ
(
node_map
.
at
(
"op1"
),
0
);
ASSERT_EQ
(
node_map
.
at
(
"op2"
),
1
);
ASSERT_EQ
(
node_map
.
at
(
"op1"
),
0
UL
);
ASSERT_EQ
(
node_map
.
at
(
"op2"
),
1
UL
);
ASSERT_TRUE
(
node_map
.
at
(
"op3"
)
<
node_map
.
at
(
"op5"
));
}
}
// namespace ir
...
...
paddle/fluid/framework/ir/graph_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -97,15 +97,15 @@ TEST(GraphTest, Basic) {
std
::
vector
<
ir
::
Node
*>
nodes
(
g
->
Nodes
().
begin
(),
g
->
Nodes
().
end
());
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
n
->
Name
()
==
"sum"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
3
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
3
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
UL
);
}
else
if
(
n
->
Name
()
==
"test_a"
||
n
->
Name
()
==
"test_b"
||
n
->
Name
()
==
"test_c"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
0
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
0
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
UL
);
}
else
if
(
n
->
Name
()
==
"test_out"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
0
);
ASSERT_EQ
(
n
->
inputs
.
size
(),
1
UL
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
0
UL
);
}
}
ASSERT_EQ
(
nodes
.
size
(),
5
);
...
...
paddle/fluid/framework/op_kernel_type_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
ASSERT_EQ
(
paddle
::
framework
::
KernelTypeToString
(
op_kernel_type
),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"CUDNN]"
);
using
CUDAPlace
=
paddle
::
platform
::
CUDAPlace
;
OpKernelType
op_kernel_type2
(
DataType
::
FP16
,
CUDAPlace
(
0
),
DataLayout
::
kNCHW
,
LibraryType
::
kCUDNN
);
ASSERT_EQ
(
paddle
::
framework
::
KernelTypeToString
(
op_kernel_type2
),
"data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
"type[CUDNN]"
);
}
TEST
(
OpKernelType
,
Hash
)
{
...
...
paddle/fluid/framework/op_proto_maker.cc
浏览文件 @
a58dd3e5
...
...
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
return
OpProtoAndCheckerMaker
::
VariableBuilder
{
output
};
}
void
OpProtoAndCheckerMaker
::
Reuse
(
const
std
::
string
&
name
,
const
std
::
string
&
reused_name
)
{
bool
found
=
false
;
proto
::
OpProto
::
Var
*
var
;
for
(
auto
&
var
:
proto_
->
inputs
())
{
if
(
var
.
name
()
==
reused_name
)
{
found
=
true
;
break
;
}
}
PADDLE_ENFORCE
(
found
==
true
,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched."
,
name
,
reused_name
);
found
=
false
;
for
(
int
i
=
0
;
i
<
proto_
->
outputs
().
size
();
++
i
)
{
var
=
proto_
->
mutable_outputs
()
->
Mutable
(
i
);
if
(
var
->
name
()
==
name
)
{
PADDLE_ENFORCE
(
!
var
->
has_reuse
(),
"Output(%s) has been set reused var of %s"
,
name
,
var
->
reuse
());
found
=
true
;
var
->
set_reuse
(
reused_name
);
break
;
}
}
PADDLE_ENFORCE
(
found
==
true
,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched."
,
name
,
reused_name
);
}
void
OpProtoAndCheckerMaker
::
CheckNoDuplicatedInOutAttrs
()
{
std
::
unordered_set
<
std
::
string
>
names
;
auto
checker
=
[
&
](
const
std
::
string
&
name
)
{
...
...
paddle/fluid/framework/op_proto_maker.h
浏览文件 @
a58dd3e5
...
...
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
VariableBuilder
AddOutput
(
const
std
::
string
&
name
,
const
std
::
string
&
comment
);
void
Reuse
(
const
std
::
string
&
name
,
const
std
::
string
&
reused_name
);
template
<
typename
T
>
TypedAttrChecker
<
T
>
&
AddAttr
(
const
std
::
string
&
name
,
const
std
::
string
&
comment
,
...
...
paddle/fluid/framework/op_proto_maker_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
}
class
TestInplaceProtoMaker
:
public
paddle
::
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"input of test op"
);
AddOutput
(
"XOut"
,
"output of test op"
).
Reuse
(
"X"
);
}
};
class
TestInplaceProtoMaker2
:
public
paddle
::
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"input of test op"
);
...
...
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
};
TEST
(
ProtoMaker
,
InplaceOutput
)
{
paddle
::
framework
::
proto
::
OpProto
op_proto
;
paddle
::
framework
::
proto
::
OpProto
op_proto
,
op_proto2
;
paddle
::
framework
::
OpAttrChecker
op_checker
;
TestInplaceProtoMaker
proto_maker
;
ASSERT_THROW
(
proto_maker
(
&
op_proto
,
&
op_checker
),
TestInplaceProtoMaker2
proto_maker2
;
proto_maker
(
&
op_proto
,
&
op_checker
);
ASSERT_THROW
(
proto_maker2
(
&
op_proto2
,
&
op_checker
),
paddle
::
platform
::
EnforceNotMet
);
// proto_maker(&op_proto, &op_checker);
// proto_maker.Make();
// ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
// normal reuse
class
TestReuseProtoMaker
:
public
paddle
::
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"input of test op"
);
AddInput
(
"Y"
,
"input of test op"
);
AddOutput
(
"Out"
,
"output of test op"
);
AddOutput
(
"XOut"
,
"output of test op"
);
// avoid destructor exception.
// Validate();
TestReuse
();
}
virtual
void
TestReuse
()
{}
};
// test duplicate reuse error
class
TestReuseProtoMaker2
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"Out"
,
"X"
);
Reuse
(
"Out"
,
"Y"
);
}
};
// NotExists Input
class
TestReuseProtoMaker3
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"Out"
,
"NotExists"
);
Reuse
(
"XOut"
,
"X"
);
}
};
// NotExists Output
class
TestReuseProtoMaker4
:
public
TestReuseProtoMaker
{
public:
void
TestReuse
()
{
Reuse
(
"NotExists"
,
"X"
);
}
};
TEST
(
ProtoMaker
,
Reuse
)
{
paddle
::
framework
::
proto
::
OpProto
op_proto
;
paddle
::
framework
::
OpAttrChecker
op_checker
;
TestReuseProtoMaker
proto_maker
;
proto_maker
(
&
op_proto
,
&
op_checker
);
}
// NOTE(dzhwinter):
// There is a Fatal CHECK on base class destructor, which will call abort inside
// instead of
// throw an exception. If we throw an exception in Make(), we will trigger the
// CHECK and terminate the tests.
//
// I had tried to replace the default CHECK with a exception, however, it's
// still not supported by glog.
// the details:
// https://github.com/google/glog/issues/249
// https://github.com/facebookresearch/TensorComprehensions/issues/351
/*
TEST(ProtoMaker, ReuseWithException) {
paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker2 proto_maker2;
TestReuseProtoMaker3 proto_maker3;
TestReuseProtoMaker4 proto_maker4;
EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
paddle::platform::EnforceNotMet);
}
void FailureFunction() {
throw std::runtime_error("Check failed in destructor.");
// return 0;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
google::InstallFailureFunction(&FailureFunction);
return RUN_ALL_TESTS();
}
*/
paddle/fluid/framework/operator.cc
浏览文件 @
a58dd3e5
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
...
...
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
if
(
var
->
IsType
<
LoDTensor
>
())
{
return
var
->
Get
<
LoDTensor
>
().
dims
();
const
LoDTensor
&
tensor
=
var
->
Get
<
LoDTensor
>
();
if
(
UNLIKELY
(
!
tensor
.
IsInitialized
()))
{
return
DDim
({
-
1
});
}
return
tensor
.
dims
();
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
if
(
get_actual_dim
)
{
return
var
->
Get
<
SelectedRows
>
().
value
().
dims
();
...
...
@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
}
static
std
::
string
GetDtype
(
const
Scope
&
scope
,
const
std
::
string
&
name
)
{
Variable
*
var
=
scope
.
FindVar
(
name
);
if
(
var
==
nullptr
)
{
return
""
;
}
if
(
var
->
IsType
<
LoDTensor
>
())
{
const
LoDTensor
&
tensor
=
var
->
Get
<
LoDTensor
>
();
if
(
UNLIKELY
(
!
tensor
.
IsInitialized
()))
{
return
""
;
}
return
DataTypeToString
(
ToDataType
(
tensor
.
type
()));
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
return
DataTypeToString
(
ToDataType
(
var
->
Get
<
SelectedRows
>
().
value
().
type
()));
}
else
{
return
""
;
}
}
static
int
GetRowSize
(
const
Scope
&
scope
,
const
std
::
string
&
name
)
{
Variable
*
var
=
scope
.
FindVar
(
name
);
if
(
var
==
nullptr
)
{
...
...
@@ -91,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
if
(
var
->
IsType
<
LoDTensor
>
())
{
return
var
->
Get
<
LoDTensor
>
().
lod
();
const
LoDTensor
&
tensor
=
var
->
Get
<
LoDTensor
>
();
if
(
UNLIKELY
(
!
tensor
.
IsInitialized
()))
{
return
default_lod
;
}
return
tensor
.
lod
();
}
else
{
return
default_lod
;
}
...
...
@@ -172,6 +201,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if
(
row_size
>=
0
)
{
ss
<<
"[row_size="
<<
row_size
<<
"]"
;
}
std
::
string
dtype
=
GetDtype
(
*
scope
,
input
.
second
[
i
]);
ss
<<
":"
<<
dtype
;
ss
<<
"["
<<
GetDims
(
*
scope
,
input
.
second
[
i
],
true
)
<<
"]"
;
ss
<<
"("
<<
GetLoD
(
*
scope
,
input
.
second
[
i
])
<<
")"
;
}
...
...
paddle/fluid/framework/tensor.h
浏览文件 @
a58dd3e5
...
...
@@ -82,7 +82,7 @@ class Tensor {
template
<
typename
T
>
const
T
*
data
()
const
;
bool
IsInitialized
()
const
;
inline
bool
IsInitialized
()
const
;
/**
* @brief Return a pointer to mutable memory block.
...
...
paddle/fluid/framework/tensor_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/platform/float16.h"
namespace
framework
=
paddle
::
framework
;
namespace
platform
=
paddle
::
platform
;
...
...
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
src
.
set_layout
(
framework
::
DataLayout
::
kAnyLayout
);
ASSERT_EQ
(
src
.
layout
(),
framework
::
DataLayout
::
kAnyLayout
);
}
TEST
(
Tensor
,
FP16
)
{
using
platform
::
float16
;
framework
::
Tensor
src
;
float16
*
src_ptr
=
src
.
mutable_data
<
float16
>
({
2
,
3
},
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
2
*
3
;
++
i
)
{
src_ptr
[
i
]
=
static_cast
<
float16
>
(
i
);
}
EXPECT_EQ
(
src
.
memory_size
(),
2
*
3
*
sizeof
(
float16
));
// EXPECT a human readable error message
// src.data<uint8_t>();
// Tensor holds the wrong type, it holds N6paddle8platform7float16E at
// [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
}
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
a58dd3e5
...
...
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
model_store_pass.cc
DEPS framework_proto proto_desc
)
cc_test
(
test_node SRCS node_tester.cc DEPS analysis
)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
...
...
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test
(
test_pass_manager SRCS pass_manager_tester.cc
)
inference_analysis_test
(
test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc
)
inference_analysis_test
(
test_model_store_pass SRCS model_store_pass_tester.cc
)
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
a58dd3e5
...
...
@@ -17,6 +17,7 @@
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
...
...
@@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_string
(
inference_analysis_graphviz_log_root
,
"./"
,
"Graphviz debuger for data flow graphs."
);
DEFINE_string
(
inference_analysis_output_storage_path
,
""
,
"optimized model output path"
);
namespace
inference
{
namespace
analysis
{
...
...
@@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
AddPass
(
"tensorrt-subgraph"
,
new
TensorRTSubGraphPass
(
trt_teller
));
}
AddPass
(
"data-flow-graph-to-fluid"
,
new
DataFlowGraphToFluidPass
);
if
(
!
FLAGS_inference_analysis_output_storage_path
.
empty
())
{
AddPass
(
"model-store-pass"
,
new
ModelStorePass
);
}
}
std
::
string
repr
()
const
override
{
return
"dfg-pass-manager"
;
}
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
a58dd3e5
...
...
@@ -16,28 +16,23 @@ limitations under the License. */
/*
* This file contains Analyzer, an class that exposed as a library that analyze
* and optimize
* Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
* control whether
* an process is applied on the program.
* and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
* multiple flags to
* control whether an process is applied on the program.
*
* The processes are called Passes in analysis, the Passes are placed in a
* pipeline, the first
* Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
* a data flow
* graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
* graph to a
* Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
* which take a
* node or data flow graph as input.
* pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
* Fluid ProgramDesc to
* a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
* a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
* pipeline can be any Passes
* which take a node or data flow graph as input.
*
* The Analyzer can be used in two methods, the first is a executable file which
* can be used to
* pre-process the inference model and can be controlled by passing difference
* command flags;
* can be used to pre-process the inference model and can be controlled by
* passing difference command flags;
* the other way is to compose inside the inference API as a runtime pre-process
* phase in the
* inference service.
* phase in the inference service.
*/
#include <gflags/gflags.h>
...
...
@@ -50,6 +45,7 @@ namespace paddle {
// flag if not available.
DECLARE_bool
(
inference_analysis_enable_tensorrt_subgraph_engine
);
DECLARE_string
(
inference_analysis_graphviz_log_root
);
DECLARE_string
(
inference_analysis_output_storage_path
);
namespace
inference
{
namespace
analysis
{
...
...
paddle/fluid/inference/analysis/analyzer_main.cc
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file implements analysizer -- an executation help to analyze and
* optimize trained model.
*/
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
int
main
(
int
argc
,
char
**
argv
)
{
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
using
paddle
::
inference
::
analysis
::
Analyzer
;
using
paddle
::
inference
::
analysis
::
Argument
;
Argument
argument
;
Analyzer
analyzer
;
analyzer
.
Run
(
&
argument
);
return
0
;
}
paddle/fluid/inference/analysis/analyzer_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -20,14 +20,18 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Test
er
,
analysis_without_tensorrt
)
{
TEST
(
Analyz
er
,
analysis_without_tensorrt
)
{
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine
=
false
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
}
TEST
_F
(
DFG_Test
er
,
analysis_with_tensorrt
)
{
TEST
(
Analyz
er
,
analysis_with_tensorrt
)
{
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine
=
true
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
Analyzer
analyser
;
analyser
.
Run
(
&
argument
);
}
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
a58dd3e5
...
...
@@ -23,6 +23,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
...
...
@@ -36,6 +37,16 @@ namespace analysis {
* All the fields should be registered here for clearness.
*/
struct
Argument
{
Argument
()
=
default
;
explicit
Argument
(
const
std
::
string
&
fluid_model_dir
)
:
fluid_model_dir
(
new
std
::
string
(
fluid_model_dir
))
{}
// The directory of the trained model.
std
::
unique_ptr
<
std
::
string
>
fluid_model_dir
;
// The path of `__model__` and `param`, this is used when the file name of
// model and param is changed.
std
::
unique_ptr
<
std
::
string
>
fluid_model_program_path
;
std
::
unique_ptr
<
std
::
string
>
fluid_model_param_path
;
// The graph that process by the Passes or PassManagers.
std
::
unique_ptr
<
DataFlowGraph
>
main_dfg
;
...
...
@@ -44,6 +55,9 @@ struct Argument {
// The processed program desc.
std
::
unique_ptr
<
framework
::
proto
::
ProgramDesc
>
transformed_program_desc
;
// The output storage path of ModelStorePass.
std
::
unique_ptr
<
std
::
string
>
model_output_store_path
;
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
...
...
paddle/fluid/inference/analysis/data_flow_graph.h
浏览文件 @
a58dd3e5
...
...
@@ -36,6 +36,8 @@ namespace analysis {
/*
* DataFlowGraph - A container of Value and Function Nodes.
*
* This is the base graph for any other type of graphs, such as SSA or CFG.
*/
struct
DataFlowGraph
{
NodeMap
nodes
;
...
...
@@ -174,7 +176,7 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std
::
pair
<
std
::
vector
<
Node
*>
,
std
::
vector
<
Node
*>>
ExtractInputAndOutputOfSubGraph
(
std
::
vector
<
Node
*>
&
graph
);
ExtractInputAndOutputOfSubGraph
(
std
::
vector
<
Node
*>
&
graph
);
// NOLINT
}
// namespace analysis
}
// namespace inference
...
...
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -20,7 +20,7 @@ namespace inference {
namespace
analysis
{
TEST
(
DataFlowGraph
,
BFS
)
{
auto
desc
=
LoadProgramDesc
();
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
dfg
.
Build
();
...
...
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
}
TEST
(
DataFlowGraph
,
DFS
)
{
auto
desc
=
LoadProgramDesc
();
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
dfg
.
Build
();
GraphTraits
<
DataFlowGraph
>
trait
(
&
dfg
);
...
...
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -26,21 +26,21 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
Test
)
{
DataFlowGraph
graph
;
TEST
(
DataFlowGraph
,
Test
)
{
Argument
argument
(
FLAGS_inference_model_dir
)
;
FluidToDataFlowGraphPass
pass0
;
DataFlowGraphToFluidPass
pass1
;
ASSERT_TRUE
(
pass0
.
Initialize
(
&
argument
));
ASSERT_TRUE
(
pass1
.
Initialize
(
&
argument
));
pass0
.
Run
(
&
graph
);
pass1
.
Run
(
&
graph
);
pass0
.
Run
(
argument
.
main_dfg
.
get
()
);
pass1
.
Run
(
argument
.
main_dfg
.
get
()
);
pass0
.
Finalize
();
pass1
.
Finalize
();
LOG
(
INFO
)
<<
graph
.
nodes
.
size
();
LOG
(
INFO
)
<<
argument
.
main_dfg
->
nodes
.
size
();
}
};
// namespace analysis
...
...
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -23,12 +23,18 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST_F
(
DFG_Tester
,
dfg_graphviz_draw_pass_tester
)
{
auto
dfg
=
ProgramDescToDFG
(
*
argument
.
origin_program_desc
);
TEST
(
DFG_GraphvizDrawPass
,
dfg_graphviz_draw_pass_tester
)
{
Argument
argument
(
FLAGS_inference_model_dir
);
FluidToDataFlowGraphPass
pass0
;
ASSERT_TRUE
(
pass0
.
Initialize
(
&
argument
));
pass0
.
Run
(
argument
.
main_dfg
.
get
());
// auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
DFG_GraphvizDrawPass
::
Config
config
(
"./"
,
"test"
);
DFG_GraphvizDrawPass
pass
(
config
);
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
&
dfg
);
pass
.
Run
(
argument
.
main_dfg
.
get
()
);
// test content
std
::
ifstream
file
(
"./0-graph_test.dot"
);
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
浏览文件 @
a58dd3e5
...
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <string>
#include <vector>
...
...
@@ -25,8 +26,20 @@ namespace analysis {
bool
FluidToDataFlowGraphPass
::
Initialize
(
Argument
*
argument
)
{
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
);
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
origin_program_desc
);
PADDLE_ENFORCE
(
argument
);
if
(
argument
->
origin_program_desc
)
{
LOG
(
WARNING
)
<<
"argument's origin_program_desc is already set, might "
"duplicate called"
;
}
if
(
!
argument
->
fluid_model_program_path
)
{
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
fluid_model_dir
);
argument
->
fluid_model_program_path
.
reset
(
new
std
::
string
(
*
argument
->
fluid_model_dir
+
"/__model__"
));
}
ANALYSIS_ARGUMENT_CHECK_FIELD
(
argument
->
fluid_model_program_path
);
auto
program
=
LoadProgramDesc
(
*
argument
->
fluid_model_program_path
);
argument
->
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
program
));
if
(
!
argument
->
main_dfg
)
{
argument
->
main_dfg
.
reset
(
new
DataFlowGraph
);
}
...
...
@@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE
(
graph
);
PADDLE_ENFORCE
(
desc_
);
// insert vars
// The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
// will keep updating to its latest alias during the graph-building.
std
::
unordered_map
<
std
::
string
,
size_t
>
var2id
;
auto
&
main_block
=
desc_
->
blocks
(
framework
::
kRootBlockIndex
);
for
(
int
i
=
0
;
i
<
main_block
.
vars_size
();
i
++
)
{
...
...
@@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
var2id
[
var
.
name
()]
=
v
->
id
();
}
// The variables in a SSA can only write once, so if a variable is written
// multiple times(quite common in our ProgramDesc design), multiple alias
// Nodes of this variable will be created, and each will just write once.
// An set that keep all the names of the variables(the original, not alias)
// that have been written(as outputs). Once an Op's output variable hit the
// set, it should create a new alias and update the global alias for this
// variable. And that make a Data Flow Graph a SSA.
std
::
unordered_set
<
Node
*>
unique_written_vars
;
for
(
int
i
=
0
;
i
<
main_block
.
ops_size
();
i
++
)
{
const
auto
&
op
=
main_block
.
ops
(
i
);
auto
*
o
=
graph
->
nodes
.
Create
(
Node
::
Type
::
kFunction
);
...
...
@@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
o
->
SetPbMsg
(
op
.
SerializeAsString
());
// set inputs and outputs
std
::
unordered_set
<
Node
*>
inlinks
;
for
(
int
j
=
0
;
j
<
op
.
inputs_size
();
j
++
)
{
auto
&
in_var
=
op
.
inputs
(
j
);
for
(
int
k
=
0
;
k
<
in_var
.
arguments_size
();
k
++
)
{
auto
*
in
=
graph
->
nodes
.
GetMutable
(
var2id
.
at
(
in_var
.
arguments
(
k
)));
in
->
outlinks
.
push_back
(
o
);
o
->
inlinks
.
push_back
(
in
);
inlinks
.
insert
(
in
);
}
}
for
(
int
j
=
0
;
j
<
op
.
outputs_size
();
j
++
)
{
auto
&
out_var
=
op
.
outputs
(
j
);
for
(
int
k
=
0
;
k
<
out_var
.
arguments_size
();
k
++
)
{
auto
*
out
=
graph
->
nodes
.
GetMutable
(
var2id
[
out_var
.
arguments
(
k
)]);
if
(
inlink
s
.
count
(
out
))
{
if
(
unique_written_var
s
.
count
(
out
))
{
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto
*
out_alias
=
graph
->
nodes
.
Create
(
Node
::
Type
::
kValue
);
out_alias
->
SetName
(
out
->
name
());
out_alias
->
SetPbDesc
(
out
->
pb_desc
());
out_alias
->
SetPbMsg
(
out
->
pb_msg
());
var2id
[
out_alias
->
name
()]
=
out_alias
->
id
();
// update a -> a0
var2id
[
out_alias
->
name
()]
=
out_alias
->
id
();
// update variable's alias Node
LOG
(
INFO
)
<<
"loop found in graph, create SSA alias node ["
<<
out_alias
->
repr
()
<<
"] for ["
<<
out
->
repr
()
<<
"]"
;
out
=
out_alias
;
}
out
->
inlinks
.
push_back
(
o
);
o
->
outlinks
.
push_back
(
out
);
unique_written_vars
.
insert
(
out
);
}
}
}
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
浏览文件 @
a58dd3e5
...
...
@@ -30,7 +30,7 @@ namespace inference {
namespace
analysis
{
/*
* Transform a FluidDesc to a
data flow graph
.
* Transform a FluidDesc to a
SSA
.
*/
class
FluidToDataFlowGraphPass
final
:
public
DataFlowGraphPass
{
public:
...
...
paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -21,8 +21,9 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
Ini
t
)
{
TEST
(
FluidToDataFlowGraphPass
,
Tes
t
)
{
FluidToDataFlowGraphPass
pass
;
Argument
argument
(
FLAGS_inference_model_dir
);
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
// Analysis is sensitive to ProgramDesc, careful to change the original model.
...
...
paddle/fluid/inference/analysis/helper.h
浏览文件 @
a58dd3e5
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdio>
#include <fstream>
#include <string>
#include <typeindex>
#include <unordered_map>
...
...
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
}
}
static
framework
::
proto
::
ProgramDesc
LoadProgramDesc
(
const
std
::
string
&
model_path
)
{
std
::
ifstream
fin
(
model_path
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
fin
.
is_open
(),
"Cannot open file %s"
,
model_path
);
fin
.
seekg
(
0
,
std
::
ios
::
end
);
std
::
string
buffer
(
fin
.
tellg
(),
' '
);
fin
.
seekg
(
0
,
std
::
ios
::
beg
);
fin
.
read
(
&
buffer
[
0
],
buffer
.
size
());
fin
.
close
();
framework
::
proto
::
ProgramDesc
program_desc
;
program_desc
.
ParseFromString
(
buffer
);
return
program_desc
;
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
...
...
paddle/fluid/inference/analysis/model_store_pass.cc
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
ModelStorePass
::
Run
(
DataFlowGraph
*
x
)
{
if
(
!
argument_
->
fluid_model_param_path
)
{
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
fluid_model_dir
);
argument_
->
fluid_model_param_path
.
reset
(
new
std
::
string
(
*
argument_
->
fluid_model_dir
+
"param"
));
}
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
model_output_store_path
);
// Directly copy param file to destination.
std
::
stringstream
ss
;
// NOTE these commands only works on linux.
ss
<<
"mkdir -p "
<<
*
argument_
->
model_output_store_path
;
LOG
(
INFO
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
ss
.
str
(
""
);
ss
<<
"cp "
<<
*
argument_
->
fluid_model_dir
<<
"/*"
<<
" "
<<
*
argument_
->
model_output_store_path
;
LOG
(
INFO
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
// Store program
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
transformed_program_desc
,
"program desc is not transformed, should call "
"DataFlowGraphToFluidPass first."
);
const
std
::
string
program_output_path
=
*
argument_
->
model_output_store_path
+
"/__model__"
;
std
::
ofstream
file
(
program_output_path
,
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
file
.
is_open
(),
"failed to open %s to write."
,
program_output_path
);
const
std
::
string
serialized_message
=
argument_
->
transformed_program_desc
->
SerializeAsString
();
file
.
write
(
serialized_message
.
c_str
(),
serialized_message
.
size
());
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/model_store_pass.h
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines ModelStorePass, which store the runtime DFG to a Paddle
* model in the disk, and that model can be reloaded for prediction.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
class
ModelStorePass
:
public
DataFlowGraphPass
{
public:
bool
Initialize
(
Argument
*
argument
)
override
{
if
(
!
argument
)
{
LOG
(
ERROR
)
<<
"invalid argument"
;
return
false
;
}
argument_
=
argument
;
return
true
;
}
void
Run
(
DataFlowGraph
*
x
)
override
;
std
::
string
repr
()
const
override
{
return
"DFG-store-pass"
;
}
std
::
string
description
()
const
override
{
return
R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
model in the disk, and that model can be reloaded for prediction again.)DD"
;
}
private:
Argument
*
argument_
{
nullptr
};
};
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/model_store_pass_tester.cc
0 → 100644
浏览文件 @
a58dd3e5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
DEFINE_string
(
inference_model_dir
,
""
,
"Model path"
);
TEST
(
DFG_StorePass
,
test
)
{
Analyzer
analyzer
;
Argument
argument
(
FLAGS_inference_model_dir
);
argument
.
model_output_store_path
.
reset
(
new
std
::
string
(
"./_dfg_store_pass_tmp"
));
// disable storage in alalyzer
FLAGS_inference_analysis_output_storage_path
=
""
;
analyzer
.
Run
(
&
argument
);
ModelStorePass
pass
;
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/pass.h
浏览文件 @
a58dd3e5
...
...
@@ -50,6 +50,7 @@ class Pass {
// Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual
Pass
*
CreateGraphvizDebugerPass
()
const
{
return
nullptr
;
}
virtual
void
Run
()
{
LOG
(
FATAL
)
<<
"not valid"
;
}
// Run on a single Node.
virtual
void
Run
(
Node
*
x
)
{
LOG
(
FATAL
)
<<
"not valid"
;
}
// Run on a single Function.
...
...
paddle/fluid/inference/analysis/pass_manager_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
std
::
string
description
()
const
override
{
return
"some doc"
;
}
};
TEST
_F
(
DFG_Test
er
,
DFG_pass_manager
)
{
TEST
(
PassManag
er
,
DFG_pass_manager
)
{
TestDfgPassManager
manager
;
DFG_GraphvizDrawPass
::
Config
config
(
"./"
,
"dfg.dot"
);
...
...
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager
.
Register
(
"graphviz"
,
new
DFG_GraphvizDrawPass
(
config
));
manager
.
Register
(
"dfg-to-fluid"
,
new
DataFlowGraphToFluidPass
);
Argument
argument
(
FLAGS_inference_model_dir
);
ASSERT_TRUE
(
&
argument
);
ASSERT_TRUE
(
manager
.
Initialize
(
&
argument
));
manager
.
RunAll
();
}
TEST_F
(
DFG_Tester
,
Node_pass_manager
)
{
TEST
(
PassManager
,
Node_pass_manager
)
{
Argument
argument
(
FLAGS_inference_model_dir
);
// Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass
pass0
;
pass0
.
Initialize
(
&
argument
);
...
...
paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
return
false
;
};
TEST
_F
(
DFG_Tes
ter
,
Split
)
{
auto
desc
=
LoadProgramDesc
();
TEST
(
SubGraphSplit
ter
,
Split
)
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
LOG
(
INFO
)
<<
"spliter
\n
"
<<
dfg
.
DotString
();
...
...
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
ASSERT_EQ
(
subgraphs
.
back
().
size
(),
6UL
);
}
TEST
_F
(
DFG_Tes
ter
,
Fuse
)
{
auto
desc
=
LoadProgramDesc
();
TEST
(
SubGraphSplit
ter
,
Fuse
)
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
auto
dfg
=
ProgramDescToDFG
(
desc
);
size_t
count0
=
dfg
.
nodes
.
size
();
...
...
paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -22,11 +22,11 @@ namespace paddle {
namespace
inference
{
namespace
analysis
{
TEST
_F
(
DFG_Tester
,
tensorrt_subgraph_node_mark_pass
)
{
TEST
(
TensorRTSubgraphNodeMarkPass
,
test
)
{
// init
FluidToDataFlowGraphPass
pass
;
Argument
argument
(
FLAGS_inference_model_dir
);
ASSERT_TRUE
(
pass
.
Initialize
(
&
argument
));
argument
.
main_dfg
.
reset
(
new
DataFlowGraph
);
pass
.
Run
(
argument
.
main_dfg
.
get
());
TensorRTSubgraphNodeMarkPass
::
teller_t
teller
=
[](
const
Node
*
node
)
{
...
...
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
for
(
auto
&
node
:
argument
.
main_dfg
->
nodes
.
nodes
())
{
counter
+=
node
->
attr
(
ATTR_supported_by_tensorrt
).
Bool
();
}
ASSERT_EQ
(
counter
,
2
);
LOG
(
INFO
)
<<
counter
<<
" nodes marked"
;
}
...
...
paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -25,7 +25,7 @@ namespace analysis {
DEFINE_string
(
dot_dir
,
"./"
,
""
);
TEST
_F
(
DFG_Tester
,
tensorrt_single_pass
)
{
TEST
(
TensorRTSubGraphPass
,
main
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"elementwise_add"
,
"mul"
,
"sigmoid"
});
SubGraphSplitter
::
NodeInsideSubgraphTeller
teller
=
[
&
](
const
Node
*
node
)
{
...
...
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
return
false
;
};
LOG
(
INFO
)
<<
"init"
;
Argument
argument
(
FLAGS_inference_model_dir
);
DFG_GraphvizDrawPass
::
Config
config
{
FLAGS_dot_dir
,
"origin"
};
DFG_GraphvizDrawPass
::
Config
config1
{
FLAGS_dot_dir
,
"fusion"
};
...
...
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
FluidToDataFlowGraphPass
pass0
;
TensorRTSubGraphPass
trt_pass
(
std
::
move
(
teller
));
LOG
(
INFO
)
<<
"Initialize"
;
dfg_pass
.
Initialize
(
&
argument
);
dfg_pass1
.
Initialize
(
&
argument
);
pass0
.
Initialize
(
&
argument
);
trt_pass
.
Initialize
(
&
argument
);
LOG
(
INFO
)
<<
"Run"
;
argument
.
main_dfg
.
reset
(
new
DataFlowGraph
);
pass0
.
Run
(
argument
.
main_dfg
.
get
());
dfg_pass
.
Run
(
argument
.
main_dfg
.
get
());
...
...
paddle/fluid/inference/analysis/ut_helper.h
浏览文件 @
a58dd3e5
...
...
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/
ut_
helper.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -32,27 +32,12 @@ namespace analysis {
DEFINE_string
(
inference_model_dir
,
""
,
"inference test model dir"
);
static
framework
::
proto
::
ProgramDesc
LoadProgramDesc
(
const
std
::
string
&
model_dir
=
FLAGS_inference_model_dir
)
{
std
::
string
msg
;
std
::
string
net_file
=
FLAGS_inference_model_dir
+
"/__model__"
;
std
::
ifstream
fin
(
net_file
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
net_file
);
fin
.
seekg
(
0
,
std
::
ios
::
end
);
msg
.
resize
(
fin
.
tellg
());
fin
.
seekg
(
0
,
std
::
ios
::
beg
);
fin
.
read
(
&
(
msg
.
at
(
0
)),
msg
.
size
());
fin
.
close
();
framework
::
proto
::
ProgramDesc
program_desc
;
program_desc
.
ParseFromString
(
msg
);
return
program_desc
;
}
static
DataFlowGraph
ProgramDescToDFG
(
const
framework
::
proto
::
ProgramDesc
&
desc
)
{
DataFlowGraph
graph
;
FluidToDataFlowGraphPass
pass
;
Argument
argument
;
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
FLAGS_inference_model_dir
));
argument
.
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
desc
));
pass
.
Initialize
(
&
argument
);
pass
.
Run
(
&
graph
);
...
...
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
class
DFG_Tester
:
public
::
testing
::
Test
{
protected:
void
SetUp
()
override
{
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
);
auto
desc
=
LoadProgramDesc
(
FLAGS_inference_model_dir
+
"/__model__"
);
argument
.
origin_program_desc
.
reset
(
new
framework
::
proto
::
ProgramDesc
(
desc
));
}
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
a58dd3e5
...
...
@@ -19,6 +19,7 @@ endif(APPLE)
set
(
inference_deps paddle_inference_api paddle_fluid_api
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine
)
endif
()
...
...
@@ -63,6 +64,8 @@ endif()
if
(
WITH_ANAKIN
)
# only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively
(
${
ANAKIN_INCLUDE
}
)
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc
)
nv_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc
)
...
...
@@ -73,7 +76,7 @@ if (WITH_ANAKIN) # only needed in CI
if
(
WITH_TESTING
)
cc_test
(
inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=
${
ANAKIN_INSTALL_DIR
}
/mobilenet_v2.anakin.bin
DEPS inference_anakin_api
)
DEPS inference_anakin_api
_shared
)
target_compile_options
(
inference_anakin_test BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endif
(
WITH_TESTING
)
endif
()
paddle/fluid/inference/api/api_anakin_engine.cc
浏览文件 @
a58dd3e5
...
...
@@ -18,26 +18,36 @@
namespace
paddle
{
PaddleInferenceAnakinPredictor
::
PaddleInferenceAnakinPredictor
(
template
<
typename
Target
>
PaddleInferenceAnakinPredictor
<
Target
>::
PaddleInferenceAnakinPredictor
(
const
AnakinConfig
&
config
)
{
CHECK
(
Init
(
config
));
}
bool
PaddleInferenceAnakinPredictor
::
Init
(
const
AnakinConfig
&
config
)
{
template
<
typename
Target
>
bool
PaddleInferenceAnakinPredictor
<
Target
>::
Init
(
const
AnakinConfig
&
config
)
{
if
(
!
(
graph_
.
load
(
config
.
model_file
)))
{
LOG
(
FATAL
)
<<
"fail to load graph from "
<<
config
.
model_file
;
return
false
;
}
graph_
.
ResetBatchSize
(
"input_0"
,
config
.
max_batch_size
);
auto
inputs
=
graph_
.
get_ins
();
for
(
auto
&
input_str
:
inputs
)
{
graph_
.
ResetBatchSize
(
input_str
,
config
.
max_batch_size
);
}
// optimization for graph
if
(
!
(
graph_
.
Optimize
()))
{
return
false
;
}
// construct executer
executor_
.
init
(
graph_
);
if
(
executor_p_
==
nullptr
)
{
executor_p_
=
new
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
(
graph_
,
true
);
}
return
true
;
}
bool
PaddleInferenceAnakinPredictor
::
Run
(
template
<
typename
Target
>
bool
PaddleInferenceAnakinPredictor
<
Target
>::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
for
(
const
auto
&
input
:
inputs
)
{
...
...
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
<<
"'s type is not float"
;
return
false
;
}
auto
d_tensor_in_p
=
executor_
.
get_in
(
input
.
name
);
auto
d_tensor_in_p
=
executor_p_
->
get_in
(
input
.
name
);
auto
net_shape
=
d_tensor_in_p
->
valid_shape
();
if
(
net_shape
.
size
()
!=
input
.
shape
.
size
())
{
LOG
(
ERROR
)
<<
" input "
<<
input
.
name
<<
"'s shape size should be equal to that of net"
;
return
false
;
}
int
sum
=
1
;
for_each
(
input
.
shape
.
begin
(),
input
.
shape
.
end
(),
[
&
](
int
n
)
{
sum
*=
n
;
});
if
(
sum
>
net_shape
.
count
())
{
graph_
.
Reshape
(
input
.
name
,
input
.
shape
);
delete
executor_p_
;
executor_p_
=
new
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
(
graph_
,
true
);
d_tensor_in_p
=
executor_p_
->
get_in
(
input
.
name
);
}
anakin
::
saber
::
Shape
tmp_shape
;
for
(
auto
s
:
input
.
shape
)
{
tmp_shape
.
push_back
(
s
);
}
d_tensor_in_p
->
reshape
(
tmp_shape
);
float
*
d_data_p
=
d_tensor_in_p
->
mutable_data
();
if
(
cudaMemcpy
(
d_data_p
,
static_cast
<
float
*>
(
input
.
data
.
data
()),
d_tensor_in_p
->
valid_size
()
*
sizeof
(
float
),
...
...
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
}
cudaStreamSynchronize
(
NULL
);
}
executor_
.
prediction
();
cudaDeviceSynchronize
();
executor_p_
->
prediction
();
cudaDeviceSynchronize
();
if
(
output_data
->
empty
())
{
LOG
(
ERROR
)
<<
"At least one output should be set with tensors' names."
;
return
false
;
}
for
(
auto
&
output
:
*
output_data
)
{
auto
*
tensor
=
executor_
.
get_out
(
output
.
name
);
output
.
shape
=
tensor
->
shape
();
auto
*
tensor
=
executor_
p_
->
get_out
(
output
.
name
);
output
.
shape
=
tensor
->
valid_
shape
();
if
(
output
.
data
.
length
()
<
tensor
->
valid_size
()
*
sizeof
(
float
))
{
output
.
data
.
Resize
(
tensor
->
valid_size
()
*
sizeof
(
float
));
}
...
...
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
return
true
;
}
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
&
PaddleInferenceAnakinPredictor
::
get_executer
()
{
return
executor_
;
template
<
typename
Target
>
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
&
PaddleInferenceAnakinPredictor
<
Target
>::
get_executer
()
{
return
*
executor_p_
;
}
// the cloned new Predictor of anakin share the same net weights from original
// Predictor
std
::
unique_ptr
<
PaddlePredictor
>
PaddleInferenceAnakinPredictor
::
Clone
()
{
template
<
typename
Target
>
std
::
unique_ptr
<
PaddlePredictor
>
PaddleInferenceAnakinPredictor
<
Target
>::
Clone
()
{
VLOG
(
3
)
<<
"Anakin Predictor::clone"
;
std
::
unique_ptr
<
PaddlePredictor
>
cls
(
new
PaddleInferenceAnakinPredictor
());
std
::
unique_ptr
<
PaddlePredictor
>
cls
(
new
PaddleInferenceAnakinPredictor
<
Target
>
());
// construct executer from other graph
auto
anakin_predictor_p
=
dynamic_cast
<
PaddleInferenceAnakinPredictor
*>
(
cls
.
get
());
dynamic_cast
<
PaddleInferenceAnakinPredictor
<
Target
>
*>
(
cls
.
get
());
if
(
!
anakin_predictor_p
)
{
LOG
(
ERROR
)
<<
"fail to call Init"
;
return
nullptr
;
...
...
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return
std
::
move
(
cls
);
}
template
class
PaddleInferenceAnakinPredictor
<
anakin
::
NV
>;
template
class
PaddleInferenceAnakinPredictor
<
anakin
::
X86
>;
// A factory to help create difference predictor.
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
AnakinConfig
,
PaddleEngineKind
::
kAnakin
>
(
const
AnakinConfig
&
config
)
{
VLOG
(
3
)
<<
"Anakin Predictor create."
;
std
::
unique_ptr
<
PaddlePredictor
>
x
(
new
PaddleInferenceAnakinPredictor
(
config
));
return
x
;
}
if
(
config
.
target_type
==
AnakinConfig
::
NVGPU
)
{
VLOG
(
3
)
<<
"Anakin Predictor create on [ NVIDIA GPU ]."
;
std
::
unique_ptr
<
PaddlePredictor
>
x
(
new
PaddleInferenceAnakinPredictor
<
anakin
::
NV
>
(
config
));
return
x
;
}
else
if
(
config
.
target_type
==
AnakinConfig
::
X86
)
{
VLOG
(
3
)
<<
"Anakin Predictor create on [ Intel X86 ]."
;
std
::
unique_ptr
<
PaddlePredictor
>
x
(
new
PaddleInferenceAnakinPredictor
<
anakin
::
X86
>
(
config
));
return
x
;
}
else
{
VLOG
(
3
)
<<
"Anakin Predictor create on unknown platform."
;
return
nullptr
;
}
};
}
// namespace paddle
paddle/fluid/inference/api/api_anakin_engine.h
浏览文件 @
a58dd3e5
...
...
@@ -20,14 +20,16 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "saber/core/shape.h"
#include "saber/saber_types.h"
namespace
paddle
{
template
<
typename
Target
>
class
PaddleInferenceAnakinPredictor
:
public
PaddlePredictor
{
public:
PaddleInferenceAnakinPredictor
()
{}
...
...
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
override
;
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>&
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>&
get_executer
();
~
PaddleInferenceAnakinPredictor
()
override
{};
~
PaddleInferenceAnakinPredictor
()
override
{
delete
executor_p_
;
executor_p_
=
nullptr
;
};
private:
bool
Init
(
const
AnakinConfig
&
config
);
anakin
::
graph
::
Graph
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
anakin
::
graph
::
Graph
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
graph_
;
anakin
::
Net
<
anakin
::
NV
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>
executor_
;
anakin
::
Net
<
Target
,
anakin
::
saber
::
AK_FLOAT
,
anakin
::
Precision
::
FP32
>*
executor_
p_
{
nullptr
}
;
AnakinConfig
config_
;
};
...
...
paddle/fluid/inference/api/api_anakin_engine_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string
(
model
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
model
,
""
,
"Directory of the inference model
(mobile_v2)
."
);
namespace
paddle
{
AnakinConfig
GetConfig
()
{
AnakinConfig
config
;
// using AnakinConfig::X86 if you need to use cpu to do inference
config
.
target_type
=
AnakinConfig
::
NVGPU
;
config
.
model_file
=
FLAGS_model
;
config
.
device
=
0
;
config
.
max_batch_size
=
1
;
...
...
@@ -36,28 +38,27 @@ TEST(inference, anakin) {
CreatePaddlePredictor
<
AnakinConfig
,
PaddleEngineKind
::
kAnakin
>
(
config
);
float
data
[
1
*
3
*
224
*
224
]
=
{
1.0
f
};
PaddleTensor
tensor
{.
name
=
"input_0"
,
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
224
,
224
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
FLOAT32
}
;
PaddleTensor
tensor
;
tensor
.
name
=
"input_0"
;
tensor
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
224
,
224
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
FLOAT32
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
;
paddle_tensor_feeds
.
emplace_back
(
std
::
move
(
tensor
));
std
::
vector
<
PaddleTensor
>
paddle_tensor_feeds
(
1
,
tensor
);
PaddleTensor
tensor_out
{.
name
=
"prob_out"
,
.
shape
=
std
::
vector
<
int
>
({
1000
,
1
}),
.
data
=
PaddleBuf
(),
.
dtype
=
PaddleDType
::
FLOAT32
};
PaddleTensor
tensor_out
;
tensor_out
.
name
=
"prob_out"
;
tensor_out
.
shape
=
std
::
vector
<
int
>
({});
tensor_out
.
data
=
PaddleBuf
();
tensor_out
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
vector
<
PaddleTensor
>
outputs
;
outputs
.
emplace_back
(
std
::
move
(
tensor_out
));
std
::
vector
<
PaddleTensor
>
outputs
(
1
,
tensor_out
);
ASSERT_TRUE
(
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
));
float
*
data_o
=
static_cast
<
float
*>
(
outputs
[
0
].
data
.
data
());
for
(
size_t
j
=
0
;
j
<
1000
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
outputs
[
0
].
data
.
length
()
;
++
j
)
{
LOG
(
INFO
)
<<
"output["
<<
j
<<
"]: "
<<
data_o
[
j
];
}
}
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
a58dd3e5
...
...
@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
lod
.
emplace_back
(
level
);
}
input
.
set_lod
(
lod
);
feeds
->
push_back
(
input
);
}
return
true
;
...
...
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
buffer
.
Resize
(
sizeof
(
float
)
*
data
.
size
());
}
std
::
memcpy
(
buffer
.
data
(),
data
.
data
(),
buffer
.
length
());
// copy LoD
for
(
const
auto
&
level
:
fetchs
[
i
].
lod
())
{
outputs
->
at
(
i
).
lod
.
emplace_back
(
level
);
}
outputs
->
at
(
i
).
dtype
=
PaddleDType
::
FLOAT32
;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
a58dd3e5
...
...
@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
void
OptimizeInferenceProgram
()
{
// Analyze inference_program
Argument
argument
;
if
(
!
config_
.
model_dir
.
empty
())
{
argument
.
fluid_model_dir
.
reset
(
new
std
::
string
(
config_
.
model_dir
));
}
else
{
PADDLE_ENFORCE
(
!
config_
.
param_file
.
empty
(),
"Either model_dir or (param_file, prog_file) should be set."
);
PADDLE_ENFORCE
(
!
config_
.
prog_file
.
empty
());
argument
.
fluid_model_program_path
.
reset
(
new
std
::
string
(
config_
.
prog_file
));
argument
.
fluid_model_param_path
.
reset
(
new
std
::
string
(
config_
.
param_file
));
}
argument
.
origin_program_desc
.
reset
(
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
Singleton
<
Analyzer
>::
Global
().
Run
(
&
argument
);
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
a58dd3e5
...
...
@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
std
::
vector
<
int64_t
>
data
(
20
);
for
(
int
i
=
0
;
i
<
20
;
i
++
)
data
[
i
]
=
i
;
PaddleTensor
tensor
{
.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
10
,
1
}),
.
data
=
PaddleBuf
(
data
.
data
(),
data
.
size
()
*
sizeof
(
int64_t
)),
.
dtype
=
PaddleDType
::
INT64
};
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
10
,
1
});
tensor
.
data
=
PaddleBuf
(
data
.
data
(),
data
.
size
()
*
sizeof
(
int64_t
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
slots
(
4
,
tensor
);
...
...
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
浏览文件 @
a58dd3e5
...
...
@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
//# 2. Prepare input.
int64_t
data
[
4
]
=
{
1
,
2
,
3
,
4
};
PaddleTensor
tensor
{.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
4
,
1
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
INT64
}
;
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
4
,
1
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
// For simplicity, we set all the slots with the same data.
std
::
vector
<
PaddleTensor
>
slots
(
4
,
tensor
);
...
...
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
for
(
int
batch_id
=
0
;
batch_id
<
num_batches
;
++
batch_id
)
{
// 2. Dummy Input Data
int64_t
data
[
4
]
=
{
1
,
2
,
3
,
4
};
PaddleTensor
tensor
{.
name
=
""
,
.
shape
=
std
::
vector
<
int
>
({
4
,
1
}),
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
)),
.
dtype
=
PaddleDType
::
INT64
};
PaddleTensor
tensor
;
tensor
.
shape
=
std
::
vector
<
int
>
({
4
,
1
});
tensor
.
data
=
PaddleBuf
(
data
,
sizeof
(
data
));
tensor
.
dtype
=
PaddleDType
::
INT64
;
std
::
vector
<
PaddleTensor
>
inputs
(
4
,
tensor
);
std
::
vector
<
PaddleTensor
>
outputs
;
// 3. Run
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
a58dd3e5
...
...
@@ -20,8 +20,8 @@ limitations under the License. */
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "utils.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
...
...
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
file
.
close
();
// Inference.
PaddleTensor
input
{
.
name
=
"xx"
,
.
shape
=
record
.
shape
,
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
)),
.
dtype
=
PaddleDType
::
FLOAT32
}
;
PaddleTensor
input
;
input
.
shape
=
record
.
shape
;
input
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
a58dd3e5
...
...
@@ -44,7 +44,7 @@ class PaddleBuf {
PaddleBuf
(
void
*
data
,
size_t
length
)
:
data_
(
data
),
length_
(
length
),
memory_owned_
{
false
}
{}
// Own memory.
PaddleBuf
(
size_t
length
)
explicit
PaddleBuf
(
size_t
length
)
:
data_
(
new
char
[
length
]),
length_
(
length
),
memory_owned_
(
true
)
{}
// Resize to `length` bytes.
void
Resize
(
size_t
length
);
...
...
@@ -67,9 +67,9 @@ struct PaddleTensor {
PaddleTensor
()
=
default
;
std
::
string
name
;
// variable name.
std
::
vector
<
int
>
shape
;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf
data
;
// blob of data.
PaddleDType
dtype
;
std
::
vector
<
std
::
vector
<
uint64_t
>>
lod
;
// lod data
};
enum
class
PaddleEngineKind
{
...
...
@@ -126,9 +126,11 @@ struct NativeConfig : public PaddlePredictor::Config {
// Configurations for Anakin engine.
struct
AnakinConfig
:
public
PaddlePredictor
::
Config
{
enum
TargetType
{
NVGPU
=
0
,
X86
};
int
device
;
std
::
string
model_file
;
int
max_batch_size
{
-
1
};
TargetType
target_type
;
};
struct
TensorRTConfig
:
public
NativeConfig
{
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
a58dd3e5
# Add TRT tests
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
elementwise_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
...
@@ -13,6 +13,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine mul_op SERIAL
)
nv_test
(
test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine activation_op SERIAL
)
nv_test
(
test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine conv_op SERIAL
)
nv_test
(
test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pool_op SERIAL
)
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine elementwise_add_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -20,11 +20,60 @@ namespace tensorrt {
class
Conv2dOpConverter
:
public
OpConverter
{
public:
Conv2dOpConverter
()
{}
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
LOG
(
INFO
)
<<
"convert a fluid conv2d op to tensorrt conv layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Input"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Filter"
).
size
(),
1
);
// Y is a weight
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Output"
).
size
(),
1
);
auto
*
X
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"Input"
).
front
());
// Declare weights
auto
*
Y_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Filter"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
Y_v
);
auto
*
Y_t
=
Y_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
weight_data
=
Y_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
PADDLE_ENFORCE_EQ
(
Y_t
->
dims
().
size
(),
4UL
);
const
int
n_output
=
Y_t
->
dims
()[
0
];
const
int
filter_h
=
Y_t
->
dims
()[
2
];
const
int
filter_w
=
Y_t
->
dims
()[
3
];
const
int
groups
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"groups"
));
const
std
::
vector
<
int
>
dilations
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"dilations"
));
const
std
::
vector
<
int
>
strides
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
const
std
::
vector
<
int
>
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
nvinfer1
::
DimsHW
nv_ksize
(
filter_h
,
filter_w
);
nvinfer1
::
DimsHW
nv_dilations
(
dilations
[
0
],
dilations
[
1
]);
nvinfer1
::
DimsHW
nv_strides
(
strides
[
0
],
strides
[
1
]);
nvinfer1
::
DimsHW
nv_paddings
(
paddings
[
0
],
paddings
[
1
]);
TensorRTEngine
::
Weight
weight
{
nvinfer1
::
DataType
::
kFLOAT
,
static_cast
<
void
*>
(
weight_data
),
Y_t
->
memory_size
()
/
sizeof
(
float
)};
TensorRTEngine
::
Weight
bias
{
nvinfer1
::
DataType
::
kFLOAT
,
nullptr
,
0
};
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Convolution
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
n_output
,
nv_ksize
,
weight
.
get
(),
bias
.
get
());
PADDLE_ENFORCE
(
layer
!=
nullptr
);
layer
->
setStride
(
nv_strides
);
layer
->
setPadding
(
nv_paddings
);
layer
->
setDilation
(
nv_dilations
);
layer
->
setNbGroups
(
groups
);
auto
output_name
=
op_desc
.
Output
(
"Output"
).
front
();
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
engine_
->
DeclareOutput
(
output_name
);
}
}
};
...
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
0 → 100644
浏览文件 @
a58dd3e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
class
ElementwiseWeightOpConverter
:
public
OpConverter
{
public:
ElementwiseWeightOpConverter
()
{}
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
LOG
(
INFO
)
<<
"convert a fluid elementwise op to tensorrt IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
*
X
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
).
front
());
nvinfer1
::
Dims
dims_x
=
X
->
getDimensions
();
PADDLE_ENFORCE
(
dims_x
.
nbDims
>=
3
);
auto
*
Y_v
=
scope
.
FindVar
(
op_desc
.
Input
(
"Y"
).
front
());
PADDLE_ENFORCE_NOT_NULL
(
Y_v
);
auto
*
Y_t
=
Y_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
weight_data
=
Y_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
auto
scale_mode
=
nvinfer1
::
ScaleMode
::
kELEMENTWISE
;
std
::
vector
<
int
>
dims_y
=
framework
::
vectorize2int
(
Y_t
->
dims
());
if
(
static_cast
<
int
>
(
dims_y
.
size
())
==
dims_x
.
nbDims
+
1
)
{
if
(
dims_y
[
0
]
==
1
)
dims_y
.
erase
(
dims_y
.
begin
());
}
if
(
static_cast
<
int
>
(
dims_y
.
size
())
==
1
&&
dims_y
[
0
]
==
dims_x
.
d
[
0
])
{
scale_mode
=
nvinfer1
::
ScaleMode
::
kCHANNEL
;
}
else
if
(
static_cast
<
int
>
(
dims_y
.
size
())
==
dims_x
.
nbDims
&&
dims_y
[
0
]
==
dims_x
.
d
[
0
])
{
scale_mode
=
nvinfer1
::
ScaleMode
::
kELEMENTWISE
;
for
(
int
i
=
1
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_y
[
i
]
!=
dims_x
.
d
[
i
])
{
scale_mode
=
nvinfer1
::
ScaleMode
::
kCHANNEL
;
break
;
}
}
if
(
scale_mode
==
nvinfer1
::
ScaleMode
::
kCHANNEL
)
{
for
(
int
i
=
1
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_y
[
i
]
!=
1
)
PADDLE_THROW
(
"TensorRT unsupported weight shape for Elementwise op!"
);
}
}
}
else
{
PADDLE_THROW
(
"TensorRT unsupported weight Shape for Elementwise op!"
);
}
TensorRTEngine
::
Weight
shift_weights
{
nvinfer1
::
DataType
::
kFLOAT
,
static_cast
<
void
*>
(
weight_data
),
Y_t
->
memory_size
()
/
sizeof
(
float
)};
TensorRTEngine
::
Weight
scale_weights
{
nvinfer1
::
DataType
::
kFLOAT
,
nullptr
,
0
};
TensorRTEngine
::
Weight
power_weights
{
nvinfer1
::
DataType
::
kFLOAT
,
nullptr
,
0
};
nvinfer1
::
IScaleLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Scale
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
scale_mode
,
shift_weights
.
get
(),
scale_weights
.
get
(),
power_weights
.
get
());
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
}
}
};
class
ElementwiseTensorOpConverter
:
public
OpConverter
{
public:
ElementwiseTensorOpConverter
()
{}
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
LOG
(
INFO
)
<<
"convert a fluid elementwise op to tensorrt IScaleLayer"
;
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1
);
// Y is a weight
PADDLE_ENFORCE_EQ
(
op_desc
.
Output
(
"Out"
).
size
(),
1
);
auto
*
X
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
).
front
());
auto
*
Y
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"Y"
).
front
());
nvinfer1
::
Dims
dims_x
=
X
->
getDimensions
();
nvinfer1
::
Dims
dims_y
=
Y
->
getDimensions
();
// The two input tensor should have the same dims
PADDLE_ENFORCE
(
dims_x
.
nbDims
>=
3
);
if
(
dims_x
.
nbDims
==
dims_y
.
nbDims
)
{
for
(
int
i
=
0
;
i
<
dims_x
.
nbDims
;
i
++
)
{
if
(
dims_x
.
d
[
i
]
!=
dims_y
.
d
[
i
])
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
}
else
{
PADDLE_THROW
(
"TensorRT unsupported tensor shape for Elementwise op!"
);
}
auto
op_pair
=
ops
.
find
(
op_type_
);
if
(
op_pair
==
ops
.
end
())
{
PADDLE_THROW
(
"Wrong elementwise op type!"
);
}
nvinfer1
::
IElementWiseLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
ElementWise
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
*
const_cast
<
nvinfer1
::
ITensor
*>
(
Y
),
op_pair
->
second
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
}
}
protected:
static
const
std
::
unordered_map
<
std
::
string
,
nvinfer1
::
ElementWiseOperation
>
ops
;
std
::
string
op_type_
;
};
const
std
::
unordered_map
<
std
::
string
,
nvinfer1
::
ElementWiseOperation
>
ElementwiseTensorOpConverter
::
ops
=
{
{
"add"
,
nvinfer1
::
ElementWiseOperation
::
kSUM
},
{
"mul"
,
nvinfer1
::
ElementWiseOperation
::
kPROD
},
{
"sub"
,
nvinfer1
::
ElementWiseOperation
::
kSUB
},
{
"div"
,
nvinfer1
::
ElementWiseOperation
::
kDIV
},
{
"min"
,
nvinfer1
::
ElementWiseOperation
::
kMIN
},
{
"pow"
,
nvinfer1
::
ElementWiseOperation
::
kPOW
},
{
"max"
,
nvinfer1
::
ElementWiseOperation
::
kMAX
},
};
class
ElementwiseTensorAddOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorAddOpConverter
()
{
op_type_
=
"add"
;
}
};
class
ElementwiseTensorMulOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorMulOpConverter
()
{
op_type_
=
"mul"
;
}
};
class
ElementwiseTensorSubOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorSubOpConverter
()
{
op_type_
=
"sub"
;
}
};
class
ElementwiseTensorDivOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorDivOpConverter
()
{
op_type_
=
"div"
;
}
};
class
ElementwiseTensorMinOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorMinOpConverter
()
{
op_type_
=
"min"
;
}
};
class
ElementwiseTensorMaxOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorMaxOpConverter
()
{
op_type_
=
"max"
;
}
};
class
ElementwiseTensorPowOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorPowOpConverter
()
{
op_type_
=
"pow"
;
}
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
elementwise_add_weight
,
ElementwiseWeightOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_add_tensor
,
ElementwiseTensorAddOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_sub_tensor
,
ElementwiseTensorSubOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_div_tensor
,
ElementwiseTensorDivOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_mul_tensor
,
ElementwiseTensorMulOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_max_tensor
,
ElementwiseTensorMaxOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_min_tensor
,
ElementwiseTensorMinOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_pow_tensor
,
ElementwiseTensorPowOpConverter
);
paddle/fluid/inference/tensorrt/convert/fc_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
}
// indata c * k
// Reorder the data layout from CK to KC.
void
ReorderCKtoKC
(
TensorRTEngine
::
Weight
&
iweights
,
void
ReorderCKtoKC
(
TensorRTEngine
::
Weight
&
iweights
,
// NOLINT
TensorRTEngine
::
Weight
*
oweights
)
{
int
c
=
iweights
.
dims
[
0
];
int
k
=
iweights
.
dims
[
1
];
...
...
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
a58dd3e5
...
...
@@ -55,6 +55,31 @@ class OpConverter {
it
=
Registry
<
OpConverter
>::
Lookup
(
"fc"
);
}
}
if
(
op_desc
.
Type
().
find
(
"elementwise"
)
!=
std
::
string
::
npos
)
{
static
std
::
unordered_set
<
std
::
string
>
add_tensor_op_set
{
"add"
,
"mul"
,
"sub"
,
"div"
,
"max"
,
"min"
,
"pow"
};
// TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static
std
::
unordered_set
<
std
::
string
>
add_weight_op_set
{
"add"
};
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"Y"
).
size
(),
1UL
);
int
op_type_len
=
op_desc
.
Type
().
size
();
std
::
string
op_type
=
op_desc
.
Type
().
substr
(
op_type_len
-
3
,
op_type_len
);
std
::
string
Y
=
op_desc
.
Input
(
"Y"
)[
0
];
if
(
parameters
.
count
(
Y
))
{
PADDLE_ENFORCE
(
add_weight_op_set
.
count
(
op_type
)
>
0
,
"Unsupported elementwise type"
+
op_type
);
it
=
Registry
<
OpConverter
>::
Lookup
(
"elementwise_"
+
op_type
+
"_weight"
);
}
else
{
PADDLE_ENFORCE
(
add_tensor_op_set
.
count
(
op_type
)
>
0
,
"Unsupported elementwise type"
+
op_type
);
it
=
Registry
<
OpConverter
>::
Lookup
(
"elementwise_"
+
op_type
+
"_tensor"
);
}
}
if
(
!
it
)
{
it
=
Registry
<
OpConverter
>::
Lookup
(
op_desc
.
Type
());
}
...
...
paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
0 → 100644
浏览文件 @
a58dd3e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
TEST
(
conv2d_op
,
test
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"conv2d-Y"
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
5
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"conv2d-X"
,
nvinfer1
::
Dims3
(
2
,
5
,
5
));
validator
.
DeclParamVar
(
"conv2d-Y"
,
nvinfer1
::
Dims4
(
3
,
2
,
3
,
3
));
validator
.
DeclOutputVar
(
"conv2d-Out"
,
nvinfer1
::
Dims3
(
3
,
5
,
5
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"conv2d"
);
desc
.
SetInput
(
"Input"
,
{
"conv2d-X"
});
desc
.
SetInput
(
"Filter"
,
{
"conv2d-Y"
});
desc
.
SetOutput
(
"Output"
,
{
"conv2d-Out"
});
const
std
::
vector
<
int
>
strides
({
1
,
1
});
const
std
::
vector
<
int
>
paddings
({
1
,
1
});
const
std
::
vector
<
int
>
dilations
({
1
,
1
});
const
int
groups
=
1
;
desc
.
SetAttr
(
"strides"
,
strides
);
desc
.
SetAttr
(
"paddings"
,
paddings
);
desc
.
SetAttr
(
"dilations"
,
dilations
);
desc
.
SetAttr
(
"groups"
,
groups
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
3
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
conv2d
);
paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
0 → 100644
浏览文件 @
a58dd3e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
TEST
(
elementwise_op
,
add_weight_test
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"elementwise_add-Y"
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclParamVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
1
,
1
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_add"
);
desc
.
SetInput
(
"X"
,
{
"elementwise_add-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_add-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_add-Out"
});
int
axis
=
1
;
desc
.
SetAttr
(
"axis"
,
axis
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
8
);
}
TEST
(
elementwise_op
,
add_tensor_test
)
{
std
::
unordered_set
<
std
::
string
>
parameters
;
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
8
,
parameters
,
scope
,
1
<<
15
);
validator
.
DeclInputVar
(
"elementwise_add-X"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
validator
.
DeclInputVar
(
"elementwise_add-Y"
,
nvinfer1
::
Dims3
(
10
,
3
,
3
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator
.
DeclOutputVar
(
"elementwise_add-Out"
,
nvinfer1
::
DimsCHW
(
10
,
3
,
3
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"elementwise_add"
);
desc
.
SetInput
(
"X"
,
{
"elementwise_add-X"
});
desc
.
SetInput
(
"Y"
,
{
"elementwise_add-Y"
});
desc
.
SetOutput
(
"Out"
,
{
"elementwise_add-Out"
});
// the defalut axis of elementwise op is -1
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
8
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
elementwise_add
);
paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
浏览文件 @
a58dd3e5
...
...
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
framework
::
ProgramDesc
prog
;
auto
*
block
=
prog
.
MutableBlock
(
0
);
auto
*
conv2d_op
=
block
->
AppendOp
();
// init trt engine
cudaStream_t
stream_
;
std
::
unique_ptr
<
TensorRTEngine
>
engine_
;
engine_
.
reset
(
new
TensorRTEngine
(
5
,
1
<<
15
,
&
stream_
));
engine_
->
InitNetwork
();
PADDLE_ENFORCE_EQ
(
cudaStreamCreate
(
&
stream_
),
0
);
engine_
->
DeclareInput
(
"conv2d-X"
,
nvinfer1
::
DataType
::
kFLOAT
,
nvinfer1
::
Dims3
(
2
,
5
,
5
));
conv2d_op
->
SetType
(
"conv2d"
);
conv2d_op
->
SetInput
(
"Input"
,
{
"conv2d-X"
});
conv2d_op
->
SetInput
(
"Filter"
,
{
"conv2d-Y"
});
conv2d_op
->
SetOutput
(
"Output"
,
{
"conv2d-Out"
});
OpConverter
converter
;
const
std
::
vector
<
int
>
strides
({
1
,
1
});
const
std
::
vector
<
int
>
paddings
({
1
,
1
});
const
std
::
vector
<
int
>
dilations
({
1
,
1
});
const
int
groups
=
1
;
conv2d_op
->
SetAttr
(
"strides"
,
strides
);
conv2d_op
->
SetAttr
(
"paddings"
,
paddings
);
conv2d_op
->
SetAttr
(
"dilations"
,
dilations
);
conv2d_op
->
SetAttr
(
"groups"
,
groups
);
// init scope
framework
::
Scope
scope
;
converter
.
ConvertBlock
(
*
block
->
Proto
(),
{},
scope
,
nullptr
/*TensorRTEngine*/
);
std
::
vector
<
int
>
dim_vec
=
{
3
,
2
,
3
,
3
};
auto
*
x
=
scope
.
Var
(
"conv2d-Y"
);
auto
*
x_tensor
=
x
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
(
framework
::
make_ddim
(
dim_vec
));
OpConverter
converter
;
converter
.
ConvertBlock
(
*
block
->
Proto
(),
{
"conv2d-Y"
},
scope
,
engine_
.
get
()
/*TensorRTEngine*/
);
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
浏览文件 @
a58dd3e5
...
...
@@ -149,7 +149,7 @@ class TRTConvertValidation {
cudaStreamSynchronize
(
*
engine_
->
stream
());
ASSERT_FALSE
(
op_desc_
->
OutputArgumentNames
().
empty
());
const
size_t
output_space_size
=
2
000
;
const
size_t
output_space_size
=
3
000
;
for
(
const
auto
&
output
:
op_desc_
->
OutputArgumentNames
())
{
std
::
vector
<
float
>
fluid_out
;
std
::
vector
<
float
>
trt_out
(
output_space_size
);
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
a58dd3e5
...
...
@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DEFINE_bool
(
cudnn_deterministic
,
tru
e
,
DEFINE_bool
(
cudnn_deterministic
,
fals
e
,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"
fals
e, the algorithm is deterministic."
);
"
tru
e, the algorithm is deterministic."
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
if
(
input_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
...
...
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if
(
filter_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
...
...
paddle/fluid/operators/conv_mkldnn_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_bwd_weights_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_weights_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_data_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
...
...
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
weights_pd
=
conv_bwd_data_pd_
->
weights_primitive_desc
();
auto
user_pd
=
user_weights_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_pd
,
user_weights_memory_p
,
...
...
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@src_mem_p"
,
...
...
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
user_weights_pd
=
user_weights_memory_p
->
get_primitive_desc
();
auto
weights_pd
=
conv_pd_
->
weights_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
...
...
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static
std
::
string
GetHash
(
memory
::
dims
&
input_dims
,
memory
::
dims
&
weights_dims
,
std
::
vector
<
int
>&
strides
,
std
::
vector
<
int
>&
paddings
,
std
::
vector
<
int
>&
dilations
,
int
groups
,
const
std
::
string
&
suffix
)
{
static
std
::
string
GetHash
(
memory
::
dims
&
input_dims
,
// NOLINT
memory
::
dims
&
weights_dims
,
// NOLINT
std
::
vector
<
int
>&
strides
,
// NOLINT
std
::
vector
<
int
>&
paddings
,
// NOLINT
std
::
vector
<
int
>&
dilations
,
// NOLINT
int
groups
,
const
std
::
string
&
suffix
)
{
return
dims2str
(
input_dims
)
+
dims2str
(
weights_dims
)
+
dims2str
(
strides
)
+
dims2str
(
paddings
)
+
dims2str
(
dilations
)
+
std
::
to_string
(
groups
)
+
suffix
;
...
...
paddle/fluid/operators/elementwise_add_mkldnn_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
auto
y_dims
_untrimed
=
y
->
dims
();
auto
z_dims
=
z
->
dims
();
// Execute default elementwise_add operator when
// broadcast operations need to performed.
if
(
x_dims
!=
y_dims
)
{
if
(
x_dims
!=
y_dims
_untrimed
)
{
auto
sum_func
=
[](
T
a
,
T
b
)
->
T
{
return
a
+
b
;
};
TransformFunctor
<
decltype
(
sum_func
),
T
,
...
...
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
ctx
.
template
device_context
<
paddle
::
platform
::
CPUDeviceContext
>(),
sum_func
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
_untrimed
.
size
()
:
axis
);
PADDLE_ENFORCE
(
axis
>=
0
&&
axis
<
x_dims
.
size
(),
"Axis should be in range [0, x_dims)"
);
trim_trailing_singular_dims
(
&
y_dims
);
auto
y_dims
=
trim_trailing_singular_dims
(
y_dims_untrimed
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
"Wrong layout/format set for Y tensor"
);
std
::
vector
<
int
>
src_x_tz
=
framework
::
vectorize2int
(
x_dims
);
std
::
vector
<
int
>
src_y_tz
=
framework
::
vectorize2int
(
y_dims
);
std
::
vector
<
int
>
src_y_tz
=
framework
::
vectorize2int
(
y_dims
_untrimed
);
std
::
vector
<
int
>
dst_tz
=
framework
::
vectorize2int
(
z_dims
);
std
::
vector
<
memory
::
primitive_desc
>
srcs_pd
;
...
...
@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
// skip out, x, y,
// dout length is larger or equal than dx, dy.
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
auto
set_mkldnn_format
=
[](
Tensor
*
in
,
const
Tensor
*
out
)
{
in
->
set_layout
(
DataLayout
::
kMKLDNN
);
in
->
set_format
(
out
->
format
());
};
if
(
x
->
dims
()
==
y
->
dims
())
{
auto
blas
=
math
::
GetBlas
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
if
(
dx
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dx
,
dout
);
}
if
(
dy
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dy
,
dout
);
if
(
dx
!=
nullptr
&&
dy
!=
nullptr
&&
dx
->
dims
()
==
dy
->
dims
())
{
if
(
dx
->
dims
()
==
dy
->
dims
())
{
auto
blas
=
math
::
GetBlas
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
ctx
);
if
(
dx
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dx
,
dout
);
}
if
(
dy
)
{
blas
.
VCOPY
(
dout
->
numel
(),
dout
->
data
<
T
>
(),
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
set_mkldnn_format
(
dy
,
dout
);
}
}
}
else
{
// Execute default kernel when broadcast is needed
ElemwiseGradCompute
<
paddle
::
platform
::
CPUDeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
Elemwise
Explicit
GradCompute
<
paddle
::
platform
::
CPUDeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
}
...
...
paddle/fluid/operators/elementwise_add_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_add
,
"Add"
,
"Out = X + Y"
);
REGISTER_ELEMWISE_GRAD_MAKER
(
elementwise_add
,
Add
);
REGISTER_ELEMWISE_EXPLICIT_OP
(
elementwise_add
,
"Add"
,
"Out = X + Y"
,
"Out"
,
"X"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_add
,
ops
::
ElementwiseAddKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
a58dd3e5
...
...
@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
framework
::
Tensor
*
dy
)
{
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
ElemwiseGradCompute
<
DeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
ElemwiseExplicitGradCompute
<
DeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
}
template
<
typename
DeviceContext
,
typename
T
>
...
...
@@ -140,14 +141,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
(
x
->
dims
()
==
y
->
dims
()))
{
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
dx
!=
nullptr
&&
dy
!=
nullptr
&&
(
dx
->
dims
()
==
dy
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
else
{
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
...
...
paddle/fluid/operators/elementwise_div_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_div
,
"Div"
,
"Out = X / Y"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_op.h
浏览文件 @
a58dd3e5
...
...
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
void
Make
()
final
{
AddInput
(
"X"
,
"(Tensor), The first input tensor of elementwise op."
);
AddInput
(
"Y"
,
"(Tensor), The second input tensor of elementwise op."
);
AddOutput
(
"Out"
,
"The output of elementwise op."
).
Reuse
(
"X"
);
// AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
// memory.").AsIntermediate();
AddOutput
(
"Out"
,
"The output of elementwise op."
);
AddAttr
<
int
>
(
"axis"
,
"(int, default -1). The start dimension index "
"for broadcasting Y onto X."
)
...
...
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
)DOC"
,
GetName
(),
GetEquation
()));
SetReuse
();
}
protected:
virtual
std
::
string
GetName
()
const
=
0
;
virtual
std
::
string
GetEquation
()
const
=
0
;
virtual
void
SetReuse
()
{}
};
class
ElementwiseOpGrad
:
public
framework
::
OperatorWithKernel
{
...
...
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
());
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
)
)
->
type
());
#ifdef PADDLE_WITH_MKLDNN
if
(
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
...
...
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
// For Add, Sub op, the X, Out is not needed.
class
ElementwiseOpExplicitGrad
:
public
ElementwiseOpGrad
{
public:
using
operators
::
ElementwiseOpGrad
::
ElementwiseOpGrad
;
using
operators
::
ElementwiseOpGrad
::
GetExpectedKernelType
;
using
Tensor
=
framework
::
Tensor
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@GRAD) should not be null"
);
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
auto
out_dims
=
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
));
ctx
->
SetOutputDim
(
x_grad_name
,
out_dims
);
}
auto
y_grad_name
=
framework
::
GradVarName
(
"Y"
);
if
(
ctx
->
HasOutput
(
y_grad_name
))
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Y"
),
"Input(Y) should not be null"
);
auto
y_dims
=
ctx
->
GetInputDim
(
"Y"
);
ctx
->
SetOutputDim
(
y_grad_name
,
y_dims
);
}
}
};
}
// namespace operators
}
// namespace paddle
/*
*/
#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \
class kernel_type##GradMaker \
: public paddle::framework::SingleGradOpDescMaker { \
public: \
using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
\
protected: \
std::unique_ptr<paddle::framework::OpDesc> Apply() const override { \
auto* op = new paddle::framework::OpDesc(); \
op->SetType(#kernel_type "_grad"); \
op->SetInput("Y", Input("Y")); \
op->SetInput(::paddle::framework::GradVarName("Out"), \
OutputGrad("Out")); \
op->SetAttrMap(Attrs()); \
op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X")); \
op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y")); \
return std::unique_ptr<::paddle::framework::OpDesc>(op); \
} \
}
#define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
...
...
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
::paddle::operators::ElementwiseOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
class __ElemwiseOp##op_type##Maker__ \
: public ::paddle::operators::ElementwiseOpMaker { \
protected: \
virtual std::string GetName() const { return op_name; } \
virtual std::string GetEquation() const { return equation; } \
virtual void SetReuse() { Reuse(__VA_ARGS__); } \
}; \
REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \
__ElemwiseOp##op_type##Maker__, \
::paddle::operators::ElementwiseOpInferVarType, \
op_type##GradMaker); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad)
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
a58dd3e5
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
...
...
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
}
}
inline
void
trim_trailing_singular_dims
(
framework
::
DDim
*
dims
)
{
inline
framework
::
DDim
trim_trailing_singular_dims
(
const
framework
::
DDim
&
dims
)
{
// Remove trailing dimensions of size 1 for y
auto
actual_dims_size
=
dims
->
size
();
auto
actual_dims_size
=
dims
.
size
();
for
(;
actual_dims_size
!=
0
;
--
actual_dims_size
)
{
if
(
(
*
dims
)
[
actual_dims_size
-
1
]
!=
1
)
break
;
if
(
dims
[
actual_dims_size
-
1
]
!=
1
)
break
;
}
if
(
actual_dims_size
!=
dims
->
size
())
{
auto
actual_dims
=
framework
::
vectorize
(
*
dims
);
actual_dims
.
resize
(
actual_dims_size
);
*
dims
=
framework
::
make_ddim
(
actual_dims
);
std
::
vector
<
int
>
trim_dims
;
trim_dims
.
resize
(
actual_dims_size
);
for
(
int
i
=
0
;
i
<
actual_dims_size
;
++
i
)
{
trim_dims
[
i
]
=
dims
[
i
];
}
framework
::
DDim
actual_dims
=
framework
::
make_ddim
(
trim_dims
);
return
actual_dims
;
}
template
<
typename
T
,
typename
DeviceContext
>
...
...
@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
#endif
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseGradComputeNoBroadcast
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
DDim
&
x_dim
,
const
framework
::
DDim
&
y_dim
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x_dim
));
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
.
template
device_context
<
DeviceContext
>(),
N
);
for_range
(
ElemwiseGradNoBroadcast
<
T
,
DX_OP
,
DY_OP
>
{
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())});
}
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseGradComputeWithBroadcast
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
DDim
&
x_dim
,
const
framework
::
DDim
&
y_dim_untrimed
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim_untrimed
.
size
()
:
axis
);
auto
y_dim
=
trim_trailing_singular_dims
(
y_dim_untrimed
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
if
(
post
==
1
)
{
int
h
=
pre
;
int
w
=
n
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef __NVCC__
ElemwiseGradBroadcast1CUDA
(
ctx
.
template
device_context
<
DeviceContext
>().
stream
(),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
#endif
}
else
{
ElemwiseGradBroadcast1CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
}
else
{
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef __NVCC__
ElemwiseGradBroadcast2CUDA
(
ctx
.
template
device_context
<
DeviceContext
>().
stream
(),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
#endif
}
else
{
ElemwiseGradBroadcast2CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
}
}
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseGradCompute
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
...
...
@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
const
framework
::
DDim
x_dim
=
x
.
dims
();
const
framework
::
DDim
y_dim
=
y
.
dims
();
if
(
x
.
dims
()
==
y
.
dims
())
{
size_t
N
=
static_cast
<
size_t
>
(
framework
::
product
(
x
.
dims
()));
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
.
template
device_context
<
DeviceContext
>(),
N
);
for_range
(
ElemwiseGradNoBroadcast
<
T
,
DX_OP
,
DY_OP
>
{
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
())});
ElemwiseGradComputeNoBroadcast
<
DeviceContext
,
T
,
DX_OP
,
DY_OP
>
(
ctx
,
x_dim
,
y_dim
,
x
,
y
,
out
,
dout
,
axis
,
dx
,
dy
,
dx_op
,
dy_op
);
}
else
{
// Y is a scalar
auto
x_dim
=
x
.
dims
();
auto
y_dim
=
y
.
dims
();
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
trim_trailing_singular_dims
(
&
y_dim
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
if
(
post
==
1
)
{
int
h
=
pre
;
int
w
=
n
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef __NVCC__
ElemwiseGradBroadcast1CUDA
(
ctx
.
template
device_context
<
DeviceContext
>().
stream
(),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
#endif
}
else
{
ElemwiseGradBroadcast1CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
h
,
w
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
}
else
{
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef __NVCC__
ElemwiseGradBroadcast2CUDA
(
ctx
.
template
device_context
<
DeviceContext
>().
stream
(),
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
#endif
}
else
{
ElemwiseGradBroadcast2CPU
(
x
.
data
<
T
>
(),
y
.
data
<
T
>
(),
out
.
data
<
T
>
(),
dout
.
data
<
T
>
(),
pre
,
n
,
post
,
dx_op
,
dy_op
,
dx
==
nullptr
?
nullptr
:
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
dy
==
nullptr
?
nullptr
:
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
}
ElemwiseGradComputeWithBroadcast
<
DeviceContext
,
T
,
DX_OP
,
DY_OP
>
(
ctx
,
x_dim
,
y_dim
,
x
,
y
,
out
,
dout
,
axis
,
dx
,
dy
,
dx_op
,
dy_op
);
}
}
// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
// explicit gradient can cut off X, Y, Out from gradient op
// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
// elementwise code.
template
<
typename
DeviceContext
,
typename
T
,
typename
DX_OP
,
typename
DY_OP
>
void
ElemwiseExplicitGradCompute
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Tensor
&
x
,
const
framework
::
Tensor
&
y
,
const
framework
::
Tensor
&
out
,
const
framework
::
Tensor
&
dout
,
int
axis
,
framework
::
Tensor
*
dx
,
framework
::
Tensor
*
dy
,
DX_OP
dx_op
,
DY_OP
dy_op
)
{
if
(
dy
==
nullptr
)
{
const
framework
::
DDim
dx_dims
=
dout
.
dims
();
auto
dy_dims
=
dx_dims
;
ElemwiseGradComputeNoBroadcast
<
DeviceContext
,
T
,
DX_OP
,
DY_OP
>
(
ctx
,
dx_dims
,
dy_dims
,
x
,
y
,
out
,
dout
,
axis
,
dx
,
dy
,
dx_op
,
dy_op
);
}
else
{
if
(
dout
.
dims
()
==
dy
->
dims
())
{
const
framework
::
DDim
dx_dims
=
dout
.
dims
();
const
framework
::
DDim
dy_dims
=
dy
->
dims
();
ElemwiseGradComputeNoBroadcast
<
DeviceContext
,
T
,
DX_OP
,
DY_OP
>
(
ctx
,
dx_dims
,
dy_dims
,
x
,
y
,
out
,
dout
,
axis
,
dx
,
dy
,
dx_op
,
dy_op
);
}
else
{
// Y is a scalar
auto
dx_dims
=
dout
.
dims
();
const
framework
::
DDim
dy_dims
=
dy
->
dims
();
ElemwiseGradComputeWithBroadcast
<
DeviceContext
,
T
,
DX_OP
,
DY_OP
>
(
ctx
,
dx_dims
,
dy_dims
,
x
,
y
,
out
,
dout
,
axis
,
dx
,
dy
,
dx_op
,
dy_op
);
}
}
}
// Deprecated
template
<
typename
DeviceContext
,
typename
T
,
typename
functor
,
typename
broadcastfunctor
,
typename
broadcast2functor
>
void
ElementwiseGradCompute
(
const
framework
::
ExecutionContext
&
ctx
,
...
...
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
}
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
trim_trailing_singular_dims
(
&
y_dims
);
trim_trailing_singular_dims
(
y_dims
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
x
,
y
,
z
,
ctx
.
template
device_context
<
DeviceContext
>(),
func
);
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims
.
size
(),
auto
y_dims
_untrimed
=
y
->
dims
();
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
y_dims
_untrimed
.
size
(),
"Rank of first input must >= rank of second input."
);
if
(
x_dims
==
y_dims
)
{
if
(
x_dims
==
y_dims
_untrimed
)
{
functor
.
Run
();
return
;
}
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
.
size
()
:
axis
);
axis
=
(
axis
==
-
1
?
x_dims
.
size
()
-
y_dims
_untrimed
.
size
()
:
axis
);
PADDLE_ENFORCE
(
axis
>=
0
&&
axis
<
x_dims
.
size
(),
"Axis should be in range [0, x_dims)"
);
trim_trailing_singular_dims
(
&
y_dims
);
auto
y_dims
=
trim_trailing_singular_dims
(
y_dims_untrimed
);
axis
=
(
y_dims
.
size
()
==
0
)
?
x_dims
.
size
()
:
axis
;
int
pre
,
n
,
post
;
...
...
paddle/fluid/operators/elementwise_sub_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -15,7 +15,10 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/operators/elementwise_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_ELEMWISE_OP
(
elementwise_sub
,
"Sub"
,
"Out = X - Y"
);
REGISTER_ELEMWISE_GRAD_MAKER
(
elementwise_sub
,
Sub
);
REGISTER_ELEMWISE_EXPLICIT_OP
(
elementwise_sub
,
"Sub"
,
"Out = X - Y"
,
"Out"
,
"X"
);
REGISTER_OP_CPU_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/elementwise_sub_op.h
浏览文件 @
a58dd3e5
...
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
...
...
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
ElemwiseGradCompute
<
DeviceContext
,
T
,
SubGradDX
<
T
>
,
SubGradDY
<
T
>>
(
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
ElemwiseExplicitGradCompute
<
DeviceContext
,
T
,
SubGradDX
<
T
>
,
SubGradDY
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
SubGradDX
<
T
>
(),
SubGradDY
<
T
>
());
}
};
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -19,12 +19,17 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32
(
listen_and_serv_profile_period
,
0
,
"the period of listen_and_serv to do profile"
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop(
std
::
shared_ptr
<
framework
::
ExecutorPrepareContext
>
(
nullptr
));
rpc_service_
->
ResetBarrierCounter
();
int32_t
profile_step
=
0
;
while
(
true
)
{
PADDLE_ENFORCE_LE
(
profile_step
,
FLAGS_listen_and_serv_profile_period
,
"profile_step should not be larger then "
"FLAGS_listen_and_serv_profile_period"
);
if
(
FLAGS_listen_and_serv_profile_period
>
0
)
{
if
(
profile_step
==
0
)
{
auto
pf_state
=
paddle
::
platform
::
ProfilerState
::
kCPU
;
paddle
::
platform
::
EnableProfiler
(
pf_state
);
}
}
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_
->
SetCond
(
distributed
::
kRequestSend
);
...
...
@@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop(
// reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast
<
distributed
::
RequestSendHandler
*>
(
request_send_handler_
.
get
())
->
ResetSparseVarRecorder
();
if
(
FLAGS_listen_and_serv_profile_period
>
0
)
{
if
(
profile_step
==
FLAGS_listen_and_serv_profile_period
)
{
paddle
::
platform
::
DisableProfiler
(
paddle
::
platform
::
EventSortingKey
::
kTotal
,
"/dev/null"
);
profile_step
=
0
;
}
else
{
profile_step
++
;
}
}
}
// while(true)
}
...
...
paddle/fluid/operators/math/im2col.cc
浏览文件 @
a58dd3e5
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -35,61 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
PADDLE_ENFORCE
(
im
.
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
->
dims
().
size
()
==
5
);
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
// TODO(TJ): change me to template
// further optimaze:
// 1. padding != 1
// 2. could also support stride_h != 1
if
(
stride
[
0
]
==
1
&&
stride
[
1
]
==
1
&&
dilation
[
0
]
==
1
&&
dilation
[
1
]
==
1
&&
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
int
col_matrix_width
=
output_width
*
output_height
;
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
oh
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_height
*
im_width
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
kw
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
return
;
}
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
static_cast
<
T
>
(
0
)
:
im_data
[
im_idx
];
}
dilation
[
1
]
==
1
)
{
if
(
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
im2col_sh1sw1dh1dw1ph0pw0
<
T
>
(
im
,
col
);
return
;
}
else
if
(
padding
[
0
]
==
1
&&
padding
[
1
]
==
1
)
{
im2col_sh1sw1dh1dw1ph1pw1
<
T
>
(
im
,
col
);
return
;
}
// TODO(TJ): complete padding >=2
}
im2col_common
<
T
>
(
im
,
dilation
,
stride
,
padding
,
col
);
}
};
...
...
paddle/fluid/operators/math/im2col_cfo_cpu.h
0 → 100644
浏览文件 @
a58dd3e5
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
/**
* The most common im2col algorithm.
* Support dilation, stride and padding.
*/
template
<
typename
T
>
inline
void
im2col_common
(
const
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
static_cast
<
T
>
(
0
)
:
im_data
[
im_idx
];
}
}
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1ph0pw0
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
int
col_matrix_width
=
output_width
*
output_height
;
int
im_size
=
im_height
*
im_width
;
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
const
T
*
im_data_oh
=
im_data
;
T
*
dst_data_oh
=
col_data
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
src_data_ic
=
im_data_oh
;
T
*
dst_data
=
dst_data_oh
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
src_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
kw
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
src_data_ic
=
src_data_ic
+
im_size
;
}
im_data_oh
=
im_data_oh
+
im_width
;
dst_data_oh
=
dst_data_oh
+
output_width
;
}
}
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 1
* and filter_width == 1 have a special implementation
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1ph1pw1
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
constexpr
int
plh
=
1
;
constexpr
int
prh
=
1
;
constexpr
int
plw
=
1
;
constexpr
int
prw
=
1
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
int
im_size
=
im_height
*
im_width
;
int
col_matrix_width
=
output_width
*
output_height
;
int
col_block_fh
=
filter_width
*
col_matrix_width
;
// fw*oh*ow
int
col_block_ic
=
filter_height
*
col_block_fh
;
// fh*fw*oh*ow
// fill height padding
{
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
T
*
col_start_l
=
col_data
;
T
*
col_start_r
=
col_data
+
(
filter_height
-
1
)
*
col_block_fh
+
col_matrix_width
-
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_l
=
col_start_l
;
T
*
dst_data_r
=
col_start_r
;
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memset
(
dst_data_l
,
0
,
copy_size
);
std
::
memset
(
dst_data_r
,
0
,
copy_size
);
dst_data_l
=
dst_data_l
+
col_matrix_width
;
dst_data_r
=
dst_data_r
+
col_matrix_width
;
}
col_start_l
=
col_start_l
+
col_block_ic
;
col_start_r
=
col_start_r
+
col_block_ic
;
}
}
auto
pad
=
static_cast
<
T
>
(
0
);
if
(
filter_width
==
1
)
{
// fill width padding
T
*
dst_data_ic
=
col_data
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_kh
=
dst_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
T
*
dst_data
=
dst_data_kh
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
-
1
;
*
dst_data
=
pad
;
++
dst_data
;
}
dst_data_kh
=
dst_data_kh
+
col_block_fh
;
}
dst_data_ic
=
dst_data_ic
+
col_block_ic
;
}
// fill core
size_t
copy_size
=
sizeof
(
T
)
*
(
output_width
-
plw
-
prw
);
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_size
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
col_matrix_width
;
continue
;
}
std
::
memcpy
(
dst_data
+
plw
,
src_data
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
src_data
=
src_data
+
im_width
;
}
}
}
return
;
}
// filter_width != 1
// fill width padding
T
*
dst_data_ic
=
col_data
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
T
*
dst_data_kh
=
dst_data_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
T
*
dst_data
:
{
dst_data_kh
,
dst_data_kh
+
(
filter_width
-
prw
)
*
col_matrix_width
+
output_width
-
1
})
{
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
;
}
}
dst_data_kh
=
dst_data_kh
+
col_block_fh
;
}
dst_data_ic
=
dst_data_ic
+
col_block_ic
;
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// length of copy_size is equal kw.
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_size
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
filter_width
*
col_matrix_width
;
continue
;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for
(
int
kw
=
0
;
kw
<
plw
;
++
kw
)
{
std
::
memcpy
(
dst_data
+
(
plw
-
kw
),
src_data
,
sizeof
(
T
)
*
(
output_width
-
(
plw
-
kw
)));
dst_data
=
dst_data
+
col_matrix_width
;
}
for
(
int
kw
=
plw
;
kw
<
filter_width
-
prw
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
output_width
);
dst_data
=
dst_data
+
col_matrix_width
;
}
int
i
=
1
;
for
(
int
kw
=
filter_width
-
prw
;
kw
<
filter_width
;
++
kw
,
++
i
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
(
output_width
-
i
));
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
a58dd3e5
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col.h"
#include <gtest/gtest.h>
#include <sys/time.h>
#include <vector>
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
template
<
typename
DeviceContext
,
typename
Place
>
void
testIm2col
()
{
...
...
@@ -160,82 +162,111 @@ void testIm2col() {
delete
context
;
}
void
testIm2colCPU
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
paddle
::
framework
::
Tensor
input
;
paddle
::
framework
::
Tensor
output
;
paddle
::
framework
::
Tensor
ref_output
;
std
::
vector
<
int
>
padding
({
ph
,
pw
});
std
::
vector
<
int
>
stride
({
1
,
1
});
// stride_y, stride_x
std
::
vector
<
int
>
dilation
({
1
,
1
});
// dilation_y, dilation_x
int
output_height
=
(
ih
-
fh
+
padding
[
0
]
*
2
)
/
stride
[
0
]
+
1
;
int
output_width
=
(
iw
-
fw
+
padding
[
1
]
*
2
)
/
stride
[
1
]
+
1
;
float
*
input_ptr
=
input
.
mutable_data
<
float
>
({
ic
,
ih
,
iw
},
paddle
::
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
input
.
numel
();
++
i
)
{
input_ptr
[
i
]
=
static_cast
<
float
>
(
i
+
1
);
}
paddle
::
platform
::
CPUPlace
place
;
paddle
::
platform
::
CPUDeviceContext
context
(
place
);
output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
ref_output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
paddle
::
operators
::
math
::
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
paddle
::
platform
::
CPUDeviceContext
,
float
>
im2col
;
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
output
);
auto
ref_im2col
=
[
&
](
const
paddle
::
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
paddle
::
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
float
*
im_data
=
im
.
data
<
float
>
();
float
*
col_data
=
col
->
data
<
float
>
();
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
0.
f
:
im_data
[
im_idx
];
}
}
}
};
ref_im2col
(
input
,
dilation
,
stride
,
padding
,
&
ref_output
);
float
*
out_cfo_ptr
=
output
.
data
<
float
>
();
float
*
out_ref_ptr
=
ref_output
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
.
numel
();
++
i
)
{
EXPECT_EQ
(
out_cfo_ptr
[
i
],
out_ref_ptr
[
i
]);
}
}
TEST
(
math
,
im2col
)
{
testIm2col
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
5
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
2
,
/*ph*/
0
,
/*pw*/
0
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
3
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
1
);
#ifdef PADDLE_WITH_CUDA
testIm2col
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
();
#endif
}
#define PREPARE_IM2COL_CPU \
paddle::platform::CPUPlace place; \
paddle::platform::CPUDeviceContext context(place); \
paddle::framework::Tensor input; \
paddle::framework::Tensor out; \
paddle::framework::Tensor ref; \
std::vector<int> padding({ph, pw}); \
std::vector<int> stride({1, 1}); \
std::vector<int> dilation({1, 1}); \
float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place); \
for (int i = 0; i < input.numel(); ++i) { \
input_ptr[i] = static_cast<float>(i + 1); \
} \
int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \
int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \
out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
paddle::operators::math::Im2ColFunctor< \
paddle::operators::math::ColFormat::kCFO, \
paddle::platform::CPUDeviceContext, float> \
im2col
void
testIm2colCPU
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
PREPARE_IM2COL_CPU
;
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
out
);
paddle
::
operators
::
math
::
im2col_common
<
float
>
(
input
,
dilation
,
stride
,
padding
,
&
ref
);
float
*
ref_data
=
ref
.
data
<
float
>
();
float
*
out_data
=
out
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
out
.
numel
();
++
i
)
{
EXPECT_EQ
(
out_data
[
i
],
ref_data
[
i
]);
}
}
void
benchIm2col
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
PREPARE_IM2COL_CPU
;
constexpr
int
repeat
=
100
;
auto
GetCurrentMs
=
[]()
->
double
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+3
*
time
.
tv_sec
+
1e-3
*
time
.
tv_usec
;
};
auto
t1
=
GetCurrentMs
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
out
);
}
auto
t2
=
GetCurrentMs
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
paddle
::
operators
::
math
::
im2col_common
<
float
>
(
input
,
dilation
,
stride
,
padding
,
&
ref
);
}
auto
t3
=
GetCurrentMs
();
LOG
(
INFO
)
<<
"before: "
<<
(
t3
-
t2
)
/
repeat
<<
",after: "
<<
(
t2
-
t1
)
/
repeat
<<
",boost: "
<<
((
t3
-
t2
)
/
(
t2
-
t1
)
-
1
)
*
100
<<
"%"
;
}
TEST
(
math
,
im2col_cputest
)
{
// padding_h == padding_w
for
(
int
p
=
0
;
p
<
4
;
++
p
)
{
// width == height
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
5
,
/*fh*/
4
,
/*fw*/
4
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
3
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
2
,
/*ph*/
p
,
/*pw*/
p
);
// height != width
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
// filter == 1
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
3
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
}
// padding_h != padding_w
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
2
);
// benchmark
for
(
int
p
:
{
0
,
1
})
{
for
(
int
k
:
{
1
,
3
,
5
})
{
LOG
(
INFO
)
<<
"padding == "
<<
p
<<
", filter == "
<<
k
;
benchIm2col
(
/*ic*/
3
,
/*ih*/
224
,
/*iw*/
224
,
/*fh*/
k
,
/*fw*/
k
,
/*ph*/
p
,
/*pw*/
p
);
}
}
}
paddle/fluid/operators/reshape_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"(Tensor). The output tensor of reshape operator."
);
AddAttr
<
std
::
vector
<
int
>>
(
"shape"
,
"(std::vector<int>) Target shape of reshape operator."
);
AddAttr
<
bool
>
(
"inplace"
,
"(default: false) Change the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x)."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Reshape Operator.
...
...
@@ -233,16 +227,9 @@ class ReshapeKernel {
"sequence_reshape op."
);
}
bool
inplace
=
ctx
.
Attr
<
bool
>
(
"inplace"
);
out
->
mutable_data
(
ctx
.
GetPlace
(),
in
->
type
());
framework
::
TensorCopySync
(
*
in
,
ctx
.
GetPlace
(),
out
);
out
->
Resize
(
out_dims
);
if
(
!
inplace
)
{
out
->
mutable_data
(
ctx
.
GetPlace
(),
in
->
type
());
framework
::
TensorCopySync
(
*
in
,
ctx
.
GetPlace
(),
out
);
out
->
Resize
(
out_dims
);
}
else
{
out
->
ShareDataWith
(
*
in
);
out
->
Resize
(
out_dims
);
}
}
};
...
...
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
void
operator
()(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
d_out
=
ctx
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_x
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
in_dims
=
d_x
->
dims
();
d_x
->
mutable_data
(
ctx
.
GetPlace
(),
d_out
->
type
());
bool
inplace
=
ctx
.
Attr
<
bool
>
(
"inplace"
);
auto
in_dims
=
d_x
->
dims
();
if
(
!
inplace
)
{
framework
::
TensorCopy
(
*
d_out
,
ctx
.
GetPlace
(),
ctx
.
device_context
(),
d_x
);
ctx
.
device_context
().
Wait
();
d_x
->
Resize
(
in_dims
);
}
else
{
d_x
->
ShareDataWith
(
*
d_out
);
d_x
->
Resize
(
in_dims
);
}
framework
::
TensorCopySync
(
*
d_out
,
ctx
.
GetPlace
(),
d_x
);
d_x
->
Resize
(
in_dims
);
}
};
...
...
paddle/fluid/operators/softmax_op.cc
浏览文件 @
a58dd3e5
...
...
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out) and its gradients should have a same shape."
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
)));
}
protected:
...
...
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
layout_
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
());
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
)
)
->
type
());
if
(
input_data_type
==
framework
::
proto
::
VarType
::
FP16
)
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"float16 can only be used on GPU place"
);
...
...
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
}
};
class
SoftmaxOpGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
protected:
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
auto
*
op
=
new
framework
::
OpDesc
();
op
->
SetType
(
"softmax_grad"
);
op
->
SetInput
(
"Out"
,
Output
(
"Out"
));
op
->
SetInput
(
framework
::
GradVarName
(
"Out"
),
OutputGrad
(
"Out"
));
op
->
SetAttrMap
(
Attrs
());
op
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
InputGrad
(
"X"
));
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
op
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
softmax
,
ops
::
SoftmaxOp
,
ops
::
SoftmaxOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
ops
::
SoftmaxOpGradMaker
);
REGISTER_OPERATOR
(
softmax_grad
,
ops
::
SoftmaxOpGrad
);
REGISTER_OP_CPU_KERNEL
(
softmax
,
ops
::
SoftmaxKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/split_ids_op.h
浏览文件 @
a58dd3e5
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
...
...
@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
const
auto
&
ids_rows
=
ids_selected_rows
->
rows
();
auto
outs
=
ctx
.
MultiOutput
<
framework
::
SelectedRows
>
(
"Out"
);
const
size_t
shard_num
=
outs
.
size
();
for
(
auto
&
out
:
outs
)
{
out
->
mutable_rows
()
->
clear
();
}
// get rows for outputs
for
(
auto
&
id
:
ids_rows
)
{
size_t
shard_id
=
static_cast
<
size_t
>
(
id
)
%
shard_num
;
outs
[
shard_id
]
->
mutable_rows
()
->
push_back
(
id
);
std
::
unordered_map
<
int64_t
,
size_t
>
id_to_index
;
for
(
size_t
i
=
0
;
i
<
ids_rows
.
size
();
++
i
)
{
id_to_index
[
ids_rows
[
i
]]
=
i
;
size_t
shard_id
=
static_cast
<
size_t
>
(
ids_rows
[
i
])
%
shard_num
;
outs
[
shard_id
]
->
mutable_rows
()
->
push_back
(
ids_rows
[
i
]);
}
int64_t
row_width
=
ids_dims
[
1
];
...
...
@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
{
static_cast
<
int64_t
>
(
out
->
rows
().
size
()),
row_width
});
T
*
output
=
out
->
mutable_value
()
->
mutable_data
<
T
>
(
ddim
,
place
);
for
(
int64_t
i
=
0
;
i
<
ddim
[
0
];
++
i
)
{
memcpy
(
output
+
i
*
row_width
,
ids
+
out
->
rows
()[
i
]
*
row_width
,
memcpy
(
output
+
i
*
row_width
,
ids
+
id_to_index
[
out
->
rows
()[
i
]]
*
row_width
,
row_width
*
sizeof
(
T
));
}
}
...
...
paddle/fluid/platform/cuda_helper_test.cu
浏览文件 @
a58dd3e5
...
...
@@ -13,7 +13,6 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include <random>
...
...
@@ -25,13 +24,13 @@
using
paddle
::
platform
::
PADDLE_CUDA_NUM_THREADS
;
using
paddle
::
platform
::
float16
;
#define CUDA_ATOMIC_KERNEL(op, T) \
__global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \
i += blockDim.x * gridDim.x) { \
paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \
} \
template
<
typename
T
>
__global__
void
AddKernel
(
const
T
*
data_a
,
T
*
data_b
,
size_t
num
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
paddle
::
platform
::
CudaAtomicAdd
(
&
data_b
[
i
],
data_a
[
i
]);
}
}
template
<
typename
T
>
struct
AddFunctor
{
...
...
@@ -39,80 +38,116 @@ struct AddFunctor {
};
template
<
typename
T
>
struct
SubFunctor
{
T
operator
()(
const
T
&
a
,
const
T
&
b
)
{
return
a
-
b
;
}
};
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
#define ARITHMETIC_KERNEL_LAUNCH(op, T) \
void Test##T##op(size_t num) { \
T *in1, *in2, *out; \
T *d_in1, *d_in2; \
size_t size = sizeof(T) * num; \
cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = reinterpret_cast<T*>(malloc(size)); \
in2 = reinterpret_cast<T*>(malloc(size)); \
out = reinterpret_cast<T*>(malloc(size)); \
std::minstd_rand engine; \
std::uniform_real_distribution<double> dist(0.0, 1.0); \
for (size_t i = 0; i < num; ++i) { \
in1[i] = static_cast<T>(dist(engine)); \
in2[i] = static_cast<T>(dist(engine)); \
} \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \
cudaDeviceSynchronize(); \
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \
cudaDeviceSynchronize(); \
for (size_t i = 0; i < num; ++i) { \
EXPECT_NEAR(static_cast<float>(out[i]), \
static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
0.001); \
} \
free(in1); \
free(in2); \
free(out); \
cudaFree(d_in1); \
cudaFree(d_in2); \
void
TestCase
(
size_t
num
)
{
T
*
in1
,
*
in2
,
*
out
;
T
*
d_in1
,
*
d_in2
;
size_t
size
=
sizeof
(
T
)
*
num
;
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in1
),
size
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in2
),
size
);
in1
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
in2
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
out
=
reinterpret_cast
<
T
*>
(
malloc
(
size
));
std
::
minstd_rand
engine
;
std
::
uniform_real_distribution
<
double
>
dist
(
0.0
,
1.0
);
for
(
size_t
i
=
0
;
i
<
num
;
++
i
)
{
in1
[
i
]
=
static_cast
<
T
>
(
dist
(
engine
));
in2
[
i
]
=
static_cast
<
T
>
(
dist
(
engine
));
}
CUDA_ATOMIC_KERNEL
(
Add
,
float
);
CUDA_ATOMIC_KERNEL
(
Add
,
double
);
CUDA_ATOMIC_KERNEL
(
Add
,
float16
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
float
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
double
);
ARITHMETIC_KERNEL_LAUNCH
(
Add
,
float16
);
namespace
paddle
{
namespace
platform
{
USE_CUDA_ATOMIC
(
Sub
,
int
);
};
};
CUDA_ATOMIC_KERNEL
(
Sub
,
int
);
ARITHMETIC_KERNEL_LAUNCH
(
Sub
,
int
);
cudaMemcpy
(
d_in1
,
in1
,
size
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_in2
,
in2
,
size
,
cudaMemcpyHostToDevice
);
AddKernel
<
T
><<<
1
,
PADDLE_CUDA_NUM_THREADS
>>>
(
d_in1
,
d_in2
,
num
);
cudaDeviceSynchronize
();
cudaMemcpy
(
out
,
d_in2
,
size
,
cudaMemcpyDeviceToHost
);
cudaDeviceSynchronize
();
for
(
size_t
i
=
0
;
i
<
num
;
++
i
)
{
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR
(
static_cast
<
float
>
(
out
[
i
]),
static_cast
<
float
>
(
AddFunctor
<
T
>
()(
in1
[
i
],
in2
[
i
])),
0.001
);
}
free
(
in1
);
free
(
in2
);
free
(
out
);
cudaFree
(
d_in1
);
cudaFree
(
d_in2
);
}
// cuda primitives
TEST
(
CudaAtomic
,
Add
)
{
TestfloatAdd
(
static_cast
<
size_t
>
(
10
));
TestfloatAdd
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestdoubleAdd
(
static_cast
<
size_t
>
(
10
));
TestdoubleAdd
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
TestCase
<
float
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
float
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
TEST
(
CudaAtomic
,
Sub
)
{
TestintSub
(
static_cast
<
size_t
>
(
10
));
TestintSub
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestCase
<
double
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
double
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
TEST
(
CudaAtomic
,
float16
)
{
using
paddle
::
platform
::
float16
;
Testfloat16Add
(
static_cast
<
size_t
>
(
1
));
Testfloat16Add
(
static_cast
<
size_t
>
(
2
));
Testfloat16Add
(
static_cast
<
size_t
>
(
3
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
1
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
2
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
3
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
10
));
TestCase
<
float16
>
(
static_cast
<
size_t
>
(
1024
*
1024
));
}
// unalignment of uint8
void
TestUnalign
(
size_t
num
,
const
int
shift_bit
)
{
PADDLE_ENFORCE
(
num
%
2
==
0
,
"must be a multiple of 2"
);
float16
*
in1
,
*
in2
,
*
out
;
float16
*
d_in1
,
*
d_in2
;
size_t
size
=
sizeof
(
uint8_t
)
*
(
num
+
shift_bit
);
size_t
array_size
=
sizeof
(
float16
)
*
(
num
/
2
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in1
),
size
);
cudaMalloc
(
reinterpret_cast
<
void
**>
(
&
d_in2
),
size
);
in1
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
in2
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
out
=
reinterpret_cast
<
float16
*>
(
malloc
(
size
));
// right shift 1, mimic the unalignment of address
float16
*
r_in1
=
reinterpret_cast
<
float16
*>
(
reinterpret_cast
<
uint8_t
*>
(
in1
)
+
shift_bit
);
float16
*
r_in2
=
reinterpret_cast
<
float16
*>
(
reinterpret_cast
<
uint8_t
*>
(
in2
)
+
shift_bit
);
std
::
minstd_rand
engine
;
std
::
uniform_real_distribution
<
double
>
dist
(
0.0
,
1.0
);
for
(
size_t
i
=
0
;
i
<
num
/
2
;
++
i
)
{
r_in1
[
i
]
=
static_cast
<
float16
>
(
dist
(
engine
));
r_in2
[
i
]
=
static_cast
<
float16
>
(
dist
(
engine
));
}
cudaMemcpy
(
d_in1
,
r_in1
,
array_size
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_in2
,
r_in2
,
array_size
,
cudaMemcpyHostToDevice
);
AddKernel
<
float16
><<<
1
,
PADDLE_CUDA_NUM_THREADS
>>>
(
d_in1
,
d_in2
,
num
/
2
);
cudaDeviceSynchronize
();
cudaMemcpy
(
out
,
d_in2
,
array_size
,
cudaMemcpyDeviceToHost
);
cudaDeviceSynchronize
();
for
(
size_t
i
=
0
;
i
<
num
/
2
;
++
i
)
{
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR
(
static_cast
<
float
>
(
out
[
i
]),
static_cast
<
float
>
(
AddFunctor
<
float16
>
()(
r_in1
[
i
],
r_in2
[
i
])),
0.001
);
}
free
(
in1
);
free
(
in2
);
free
(
out
);
cudaFree
(
d_in1
);
cudaFree
(
d_in2
);
}
TEST
(
CudaAtomic
,
float16Unalign
)
{
// same with float16 testcase
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
2
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
2
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
2
);
// shift the address.
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
1
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
1
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
1
);
Testfloat16Add
(
static_cast
<
size_t
>
(
10
));
Testfloat16Add
(
static_cast
<
size_t
>
(
1024
*
1024
));
TestUnalign
(
static_cast
<
size_t
>
(
2
),
/*shift_bit*/
3
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
),
/*shift_bit*/
3
);
TestUnalign
(
static_cast
<
size_t
>
(
1024
*
1024
),
/*shift_bit*/
3
);
}
paddle/fluid/platform/cuda_primitives.h
浏览文件 @
a58dd3e5
...
...
@@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
// convert the value into float and do the add arithmetic.
// then store the result into a uint32.
inline
__device__
uint32_t
add_to_low_half
(
uint32_t
val
,
float
x
)
{
inline
static
__device__
uint32_t
add_to_low_half
(
uint32_t
val
,
float
x
)
{
float16
low_half
;
// the float16 in lower 16bits
low_half
.
x
=
static_cast
<
uint16_t
>
(
val
&
0x
ffff
u
);
low_half
.
x
=
static_cast
<
uint16_t
>
(
val
&
0x
FFFF
u
);
low_half
=
static_cast
<
float16
>
(
static_cast
<
float
>
(
low_half
)
+
x
);
return
(
val
&
0x
ffff
0000u
)
|
low_half
.
x
;
return
(
val
&
0x
FFFF
0000u
)
|
low_half
.
x
;
}
inline
__device__
uint32_t
add_to_high_half
(
uint32_t
val
,
float
x
)
{
inline
static
__device__
uint32_t
add_to_high_half
(
uint32_t
val
,
float
x
)
{
float16
high_half
;
// the float16 in higher 16bits
high_half
.
x
=
static_cast
<
uint16_t
>
(
val
>>
16
);
high_half
=
static_cast
<
float16
>
(
static_cast
<
float
>
(
high_half
)
+
x
);
return
(
val
&
0x
ffff
u
)
|
(
static_cast
<
uint32_t
>
(
high_half
.
x
)
<<
16
);
return
(
val
&
0x
FFFF
u
)
|
(
static_cast
<
uint32_t
>
(
high_half
.
x
)
<<
16
);
}
CUDA_ATOMIC_WRAPPER
(
Add
,
float16
)
{
// concrete packed float16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
size_t
>
(
address
)
&
2
));
uint32_t
*
address_as_ui
=
reinterpret_cast
<
uint32_t
*>
(
reinterpret_cast
<
char
*>
(
address
)
-
(
reinterpret_cast
<
uintptr_t
>
(
address
)
&
0x0
2
));
float
val_f
=
static_cast
<
float
>
(
val
);
uint32_t
old
=
*
address_as_ui
;
uint32_t
sum
;
uint32_t
newval
;
uint32_t
assumed
;
if
(((
size_t
)
address
&
2
)
==
0
)
{
if
(((
uintptr_t
)
address
&
0x0
2
)
==
0
)
{
// the float16 value stay at lower 16 bits of the address.
do
{
assumed
=
old
;
old
=
atomicCAS
(
address_as_ui
,
assumed
,
add_to_low_half
(
assumed
,
val_f
));
}
while
(
old
!=
assumed
);
float16
ret
;
ret
.
x
=
old
&
0x
ffff
u
;
ret
.
x
=
old
&
0x
FFFF
u
;
return
ret
;
}
else
{
// the float16 value stay at higher 16 bits of the address.
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
a58dd3e5
...
...
@@ -534,7 +534,7 @@ EOF
make
-j
`
nproc
`
inference_lib_dist
cd
${
PADDLE_ROOT
}
/build
cp
-r
fluid_install_dir fluid
tar
-cf
fluid.tgz fluid
tar
-c
z
f
fluid.tgz fluid
fi
}
...
...
python/paddle/batch.py
浏览文件 @
a58dd3e5
...
...
@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
if
drop_last
==
False
and
len
(
b
)
!=
0
:
yield
b
# Batch size check
batch_size
=
int
(
batch_size
)
if
batch_size
<=
0
:
raise
ValueError
(
"batch_size should be a positive integeral value, "
"but got batch_size={}"
.
format
(
batch_size
))
return
batch_reader
python/paddle/fluid/__init__.py
浏览文件 @
a58dd3e5
...
...
@@ -127,6 +127,7 @@ def __bootstrap__():
]
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'listen_and_serv_profile_period'
)
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
a58dd3e5
...
...
@@ -38,6 +38,7 @@ __all__ = [
__auto__
=
[
'iou_similarity'
,
'box_coder'
,
'polygon_box_transform'
,
]
__all__
+=
__auto__
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
a58dd3e5
...
...
@@ -4474,15 +4474,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"except one unknown dimension."
)
helper
=
LayerHelper
(
"reshape"
,
**
locals
())
reshaped
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
out
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
helper
.
append_op
(
type
=
"reshape"
,
inputs
=
inputs
,
attrs
=
{
"shape"
:
shape
,
"inplace"
:
inplace
},
outputs
=
{
"Out"
:
reshaped
})
attrs
=
{
"shape"
:
shape
},
outputs
=
{
"Out"
:
out
})
return
helper
.
append_activation
(
reshaped
)
return
helper
.
append_activation
(
out
)
def
lod_reset
(
x
,
y
=
None
,
target_lod
=
None
):
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
a58dd3e5
...
...
@@ -66,9 +66,7 @@ __all__ = [
'scatter'
,
'sum'
,
'slice'
,
'polygon_box_transform'
,
'shape'
,
'iou_similarity'
,
'maxout'
,
]
+
__activations__
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
a58dd3e5
...
...
@@ -122,7 +122,7 @@ class ParallelExecutor(object):
else
:
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
exec_strategy
.
num_threads
=
cpu_num
*
2
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
a58dd3e5
...
...
@@ -62,5 +62,6 @@ if(WITH_DISTRIBUTE)
endif
()
py_test_modules
(
test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
py_test_modules
(
test_dist_transformer MODULES test_dist_transformer SERIAL
)
py_test_modules
(
test_dist_se_resnext MODULES test_dist_se_resnext SERIAL
)
py_test_modules
(
test_dist_transformer MODULES test_dist_transformer SERIAL
)
python/paddle/fluid/tests/unittests/dist_transformer.py
0 → 100644
浏览文件 @
a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
os
import
sys
import
transformer_model
import
paddle.dataset.wmt16
as
wmt16
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
WMT16_RECORDIO_FILE
=
"/tmp/wmt16.recordio"
class
ModelHyperParams
(
object
):
# Dictionary size for source and target language. This model directly uses
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
# alreay been added, but the <pad> token is not added. Transformer requires
# sequences in a mini-batch are padded to have the same length. A <pad> token is
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary.
src_vocab_size
=
10000
# index for <pad> token in source language.
src_pad_idx
=
src_vocab_size
# size of target word dictionay
trg_vocab_size
=
10000
# index for <pad> token in target language.
trg_pad_idx
=
trg_vocab_size
# position value corresponding to the <pad> token.
pos_pad_idx
=
0
# max length of sequences. It should plus 1 to include position
# padding token for position encoding.
max_length
=
50
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model
=
512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid
=
1024
# the dimension that keys are projected to for dot-product attention.
d_key
=
64
# the dimension that values are projected to for dot-product attention.
d_value
=
64
# number of head used in multi-head attention.
n_head
=
8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer
=
6
# dropout rate used by all dropout layers.
dropout
=
0.1
def
prepare_batch_input
(
insts
,
src_pad_idx
,
trg_pad_idx
,
n_head
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
def
__pad_batch_data
(
insts
,
pad_idx
,
is_target
=
False
,
return_pos
=
True
,
return_attn_bias
=
True
,
return_max_len
=
True
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list
=
[]
max_len
=
max
(
len
(
inst
)
for
inst
in
insts
)
inst_data
=
np
.
array
(
[
inst
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_pos
:
inst_pos
=
np
.
array
([[
pos_i
+
1
if
w_i
!=
pad_idx
else
0
for
pos_i
,
w_i
in
enumerate
(
inst
)
]
for
inst
in
inst_data
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
if
return_attn_bias
:
if
is_target
:
# This is used to avoid attention on paddings and subsequent
# words.
slf_attn_bias_data
=
np
.
ones
((
inst_data
.
shape
[
0
],
max_len
,
max_len
))
slf_attn_bias_data
=
np
.
triu
(
slf_attn_bias_data
,
1
).
reshape
(
[
-
1
,
1
,
max_len
,
max_len
])
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
,
[
1
,
n_head
,
1
,
1
])
*
[
-
1e9
]
else
:
# This is used to avoid attention on paddings.
slf_attn_bias_data
=
np
.
array
([[
0
]
*
len
(
inst
)
+
[
-
1e9
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
slf_attn_bias_data
=
np
.
tile
(
slf_attn_bias_data
.
reshape
([
-
1
,
1
,
1
,
max_len
]),
[
1
,
n_head
,
max_len
,
1
])
return_list
+=
[
slf_attn_bias_data
.
astype
(
"float32"
)]
if
return_max_len
:
return_list
+=
[
max_len
]
return
return_list
if
len
(
return_list
)
>
1
else
return_list
[
0
]
src_word
,
src_pos
,
src_slf_attn_bias
,
src_max_len
=
__pad_batch_data
(
[
inst
[
0
]
for
inst
in
insts
],
src_pad_idx
,
is_target
=
False
)
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_max_len
=
__pad_batch_data
(
[
inst
[
1
]
for
inst
in
insts
],
trg_pad_idx
,
is_target
=
True
)
trg_src_attn_bias
=
np
.
tile
(
src_slf_attn_bias
[:,
:,
::
src_max_len
,
:],
[
1
,
1
,
trg_max_len
,
1
]).
astype
(
"float32"
)
lbl_word
=
__pad_batch_data
([
inst
[
2
]
for
inst
in
insts
],
trg_pad_idx
,
False
,
False
,
False
,
False
)
lbl_weight
=
(
lbl_word
!=
trg_pad_idx
).
astype
(
"float32"
).
reshape
([
-
1
,
1
])
return
[
src_word
,
src_pos
,
trg_word
,
trg_pos
,
src_slf_attn_bias
,
trg_slf_attn_bias
,
trg_src_attn_bias
,
lbl_word
,
lbl_weight
]
def
transformer
(
use_feed
):
assert
not
use_feed
,
"transfomer doesn't support feed yet"
return
transformer_model
.
transformer
(
ModelHyperParams
.
src_vocab_size
+
1
,
ModelHyperParams
.
trg_vocab_size
+
1
,
ModelHyperParams
.
max_length
+
1
,
ModelHyperParams
.
n_layer
,
ModelHyperParams
.
n_head
,
ModelHyperParams
.
d_key
,
ModelHyperParams
.
d_value
,
ModelHyperParams
.
d_model
,
ModelHyperParams
.
d_inner_hid
,
ModelHyperParams
.
dropout
,
ModelHyperParams
.
src_pad_idx
,
ModelHyperParams
.
trg_pad_idx
,
ModelHyperParams
.
pos_pad_idx
)
def
get_model
():
avg_cost
=
transformer
(
use_feed
=
False
)
optimizer
=
fluid
.
optimizer
.
Adam
()
optimizer
.
minimize
(
avg_cost
)
return
avg_cost
def
get_transpiler
(
trainer_id
,
main_program
,
pserver_endpoints
,
trainers
):
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
=
trainer_id
,
program
=
main_program
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
return
t
class
DistTransformer2x2
(
object
):
def
run_pserver
(
self
,
pserver_endpoints
,
trainers
,
current_endpoint
,
trainer_id
):
get_model
()
t
=
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
pserver_endpoints
,
trainers
)
pserver_prog
=
t
.
get_pserver_program
(
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
current_endpoint
,
pserver_prog
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup_prog
)
exe
.
run
(
pserver_prog
)
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
20
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
print
(
"waiting ps ready: "
,
pid
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
retry_times
-=
1
def
run_trainer
(
self
,
place
,
endpoints
,
trainer_id
,
trainers
,
is_dist
=
True
):
avg_cost
=
get_model
()
if
is_dist
:
t
=
get_transpiler
(
trainer_id
,
fluid
.
default_main_program
(),
endpoints
,
trainers
)
trainer_prog
=
t
.
get_trainer_program
()
else
:
trainer_prog
=
fluid
.
default_main_program
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
fluid
.
default_startup_program
())
strategy
=
fluid
.
ExecutionStrategy
()
strategy
.
num_threads
=
1
strategy
.
allow_op_delay
=
False
exe
=
fluid
.
ParallelExecutor
(
True
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
)
first_loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
print
(
first_loss
)
for
i
in
xrange
(
5
):
_
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
last_loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
])
print
(
last_loss
)
def
main
(
role
=
"pserver"
,
endpoints
=
"127.0.0.1:9123"
,
trainer_id
=
0
,
current_endpoint
=
"127.0.0.1:9123"
,
trainers
=
1
,
is_dist
=
True
):
reader
=
paddle
.
batch
(
wmt16
.
train
(
ModelHyperParams
.
src_vocab_size
,
ModelHyperParams
.
trg_vocab_size
),
batch_size
=
transformer_model
.
batch_size
)
with
fluid
.
recordio_writer
.
create_recordio_writer
(
WMT16_RECORDIO_FILE
)
as
writer
:
for
batch
in
reader
():
for
tensor
in
prepare_batch_input
(
batch
,
ModelHyperParams
.
src_pad_idx
,
ModelHyperParams
.
trg_pad_idx
,
ModelHyperParams
.
n_head
):
t
=
fluid
.
LoDTensor
()
t
.
set
(
tensor
,
fluid
.
CPUPlace
())
writer
.
append_tensor
(
t
)
writer
.
complete_append_tensor
()
model
=
DistTransformer2x2
()
if
role
==
"pserver"
:
model
.
run_pserver
(
endpoints
,
trainers
,
current_endpoint
,
trainer_id
)
else
:
p
=
fluid
.
CUDAPlace
(
0
)
if
core
.
is_compiled_with_cuda
(
)
else
fluid
.
CPUPlace
()
model
.
run_trainer
(
p
,
endpoints
,
trainer_id
,
trainers
,
is_dist
)
if
__name__
==
"__main__"
:
if
len
(
sys
.
argv
)
!=
7
:
print
(
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
)
role
=
sys
.
argv
[
1
]
endpoints
=
sys
.
argv
[
2
]
trainer_id
=
int
(
sys
.
argv
[
3
])
current_endpoint
=
sys
.
argv
[
4
]
trainers
=
int
(
sys
.
argv
[
5
])
is_dist
=
True
if
sys
.
argv
[
6
]
==
"TRUE"
else
False
main
(
role
=
role
,
endpoints
=
endpoints
,
trainer_id
=
trainer_id
,
current_endpoint
=
current_endpoint
,
trainers
=
trainers
,
is_dist
=
is_dist
)
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
a58dd3e5
...
...
@@ -68,6 +68,10 @@ def get_numeric_gradient(place,
tensor_to_check_dtype
=
np
.
float32
elif
tensor_to_check_dtype
==
core
.
VarDesc
.
VarType
.
FP64
:
tensor_to_check_dtype
=
np
.
float64
elif
tensor_to_check_dtype
==
core
.
VarDesc
.
VarType
.
FP16
:
tensor_to_check_dtype
=
np
.
float16
# set delta as np.float16, will automatic convert to float32, float64
delta
=
np
.
array
(
delta
).
astype
(
np
.
float16
)
else
:
raise
ValueError
(
"Not supported data type "
+
str
(
tensor_to_check_dtype
))
...
...
@@ -75,13 +79,24 @@ def get_numeric_gradient(place,
gradient_flat
=
np
.
zeros
(
shape
=
(
tensor_size
,
),
dtype
=
tensor_to_check_dtype
)
def
__get_elem__
(
tensor
,
i
):
if
tensor_to_check_dtype
==
np
.
float32
:
if
tensor_to_check_dtype
==
np
.
float16
:
numpy_tensor
=
np
.
array
(
tensor
).
astype
(
np
.
float16
)
numpy_tensor
=
numpy_tensor
.
flatten
()
return
numpy_tensor
[
i
]
elif
tensor_to_check_dtype
==
np
.
float32
:
return
tensor
.
_get_float_element
(
i
)
else
:
return
tensor
.
_get_double_element
(
i
)
def
__set_elem__
(
tensor
,
i
,
e
):
if
tensor_to_check_dtype
==
np
.
float32
:
if
tensor_to_check_dtype
==
np
.
float16
:
numpy_tensor
=
np
.
array
(
tensor
).
astype
(
np
.
float16
)
shape
=
numpy_tensor
.
shape
numpy_tensor
=
numpy_tensor
.
flatten
()
numpy_tensor
[
i
]
=
e
numpy_tensor
=
numpy_tensor
.
reshape
(
shape
).
view
(
np
.
uint16
)
tensor
.
set
(
numpy_tensor
,
place
)
elif
tensor_to_check_dtype
==
np
.
float32
:
tensor
.
_set_float_element
(
i
,
e
)
else
:
tensor
.
_set_double_element
(
i
,
e
)
...
...
@@ -135,6 +150,11 @@ class OpTest(unittest.TestCase):
if
not
self
.
call_once
:
self
.
call_once
=
True
self
.
dtype
=
data_type
# See the comment of np_dtype_to_fluid_dtype
# If the input type is uint16, we assume use float16
# for lodtensor dtype.
if
self
.
dtype
==
np
.
uint16
:
self
.
dtype
==
np
.
float16
def
infer_dtype_from_inputs_outputs
(
self
,
inputs
,
outputs
):
def
infer_dtype
(
numpy_dict
):
...
...
@@ -163,19 +183,25 @@ class OpTest(unittest.TestCase):
for
name
,
np_value
in
self
.
inputs
[
var_name
]:
tensor
=
core
.
LoDTensor
()
if
isinstance
(
np_value
,
tuple
):
tensor
.
set
(
np_value
[
0
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
np_value
[
0
]),
place
)
tensor
.
set_recursive_sequence_lengths
(
np_value
[
1
])
else
:
tensor
.
set
(
np_value
,
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
np_value
),
place
)
feed_map
[
name
]
=
tensor
else
:
tensor
=
core
.
LoDTensor
()
if
isinstance
(
self
.
inputs
[
var_name
],
tuple
):
tensor
.
set
(
self
.
inputs
[
var_name
][
0
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
self
.
inputs
[
var_name
][
0
]),
place
)
tensor
.
set_recursive_sequence_lengths
(
self
.
inputs
[
var_name
][
1
])
else
:
tensor
.
set
(
self
.
inputs
[
var_name
],
place
)
tensor
.
set
(
OpTest
.
np_value_to_fluid_value
(
self
.
inputs
[
var_name
]),
place
)
feed_map
[
var_name
]
=
tensor
return
feed_map
...
...
@@ -309,13 +335,22 @@ class OpTest(unittest.TestCase):
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
str
(
actual_t
)
+
"
\n
"
+
str
(
expect_t
))
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
))
if
isinstance
(
expect
,
tuple
):
self
.
assertListEqual
(
actual
.
recursive_sequence_lengths
(),
expect
[
1
],
"Output ("
+
out_name
+
") has different lod at "
+
str
(
place
))
def
_get_places
(
self
):
if
self
.
dtype
==
np
.
float16
:
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
return
[
place
]
else
:
return
[]
places
=
[
fluid
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
places
.
append
(
core
.
CUDAPlace
(
0
))
...
...
@@ -346,9 +381,9 @@ class OpTest(unittest.TestCase):
def
err_msg
():
offset
=
np
.
argmax
(
diff_mat
>
max_relative_error
)
return
(
"%s Variable %s max gradient diff %f over limit %f, "
"the first error element is %d,
%f, %f"
)
%
(
msg_prefix
,
name
,
max_diff
,
max_relative_error
,
offset
,
a
.
flatten
()[
offset
],
b
.
flatten
()[
offset
])
"the first error element is %d,
expected %f, but got %f"
)
%
(
msg_prefix
,
name
,
max_diff
,
max_relative_error
,
offset
,
a
.
flatten
()[
offset
],
b
.
flatten
()[
offset
])
self
.
assertLessEqual
(
max_diff
,
max_relative_error
,
err_msg
())
...
...
@@ -437,6 +472,21 @@ class OpTest(unittest.TestCase):
input
.
dtype
=
np
.
uint16
return
input
@
staticmethod
def
fluid_dtype_to_np_dtype
(
self
,
dtype
):
"""
See above, convert the dtype to normal type.
"""
if
dtype
==
np
.
uint16
:
dtype
=
np
.
float16
return
dtype
@
staticmethod
def
np_value_to_fluid_value
(
input
):
if
input
.
dtype
==
np
.
float16
:
input
=
input
.
view
(
np
.
uint16
)
return
input
def
_get_gradient
(
self
,
input_to_check
,
place
,
...
...
@@ -459,7 +509,7 @@ class OpTest(unittest.TestCase):
if
isinstance
(
place
,
fluid
.
CUDAPlace
(
0
)):
use_cuda
=
True
executor
=
fluid
.
ParallelExecutor
(
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
prog
ram
)
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
prog
)
else
:
executor
=
Executor
(
place
)
return
list
(
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
0 → 100644
浏览文件 @
a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
import
unittest
import
os
import
sys
import
signal
import
subprocess
class
TestDistBase
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
_trainers
=
2
self
.
_pservers
=
2
self
.
_ps_endpoints
=
"127.0.0.1:9123,127.0.0.1:9124"
self
.
_python_interp
=
"python"
def
start_pserver
(
self
,
model_file
):
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
ps0_cmd
=
"%s %s pserver %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
ps1_cmd
=
"%s %s pserver %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
ps0_proc
=
subprocess
.
Popen
(
ps0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
ps1_proc
=
subprocess
.
Popen
(
ps1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
return
ps0_proc
,
ps1_proc
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
50
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
as
e
:
sys
.
stderr
.
write
(
'waiting for pserver: %s, left retry %d
\n
'
%
(
e
,
retry_times
))
retry_times
-=
1
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
),
"FLAGS_fraction_of_gpu_memory_to_use"
:
"0.15"
}
# Run local to get a base line
env_local
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env_local
.
update
(
required_envs
)
local_cmd
=
"%s %s trainer %s 0 %s %d FLASE"
%
\
(
self
.
_python_interp
,
model_file
,
"127.0.0.1:1234"
,
"127.0.0.1:1234"
,
1
)
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env_local
)
local_proc
.
wait
()
out
,
err
=
local_proc
.
communicate
()
local_ret
=
out
sys
.
stderr
.
write
(
'local_loss: %s
\n
'
%
local_ret
)
sys
.
stderr
.
write
(
'local_stderr: %s
\n
'
%
err
)
# Run dist train to compare with local results
ps0
,
ps1
=
self
.
start_pserver
(
model_file
)
self
.
_wait_ps_ready
(
ps0
.
pid
)
self
.
_wait_ps_ready
(
ps1
.
pid
)
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
tr0_cmd
=
"%s %s trainer %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
tr1_cmd
=
"%s %s trainer %s 1 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
env0
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env1
=
{
"CUDA_VISIBLE_DEVICES"
:
"1"
}
env0
.
update
(
required_envs
)
env1
.
update
(
required_envs
)
FNULL
=
open
(
os
.
devnull
,
'w'
)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env0
)
tr1_proc
=
subprocess
.
Popen
(
tr1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env1
)
tr0_proc
.
wait
()
tr1_proc
.
wait
()
out
,
err
=
tr0_proc
.
communicate
()
sys
.
stderr
.
write
(
'dist_stderr: %s
\n
'
%
err
)
loss_data0
=
out
sys
.
stderr
.
write
(
'dist_loss: %s
\n
'
%
loss_data0
)
lines
=
loss_data0
.
split
(
"
\n
"
)
dist_first_loss
=
eval
(
lines
[
0
].
replace
(
" "
,
","
))[
0
]
dist_last_loss
=
eval
(
lines
[
1
].
replace
(
" "
,
","
))[
0
]
local_lines
=
local_ret
.
split
(
"
\n
"
)
local_first_loss
=
eval
(
local_lines
[
0
])[
0
]
local_last_loss
=
eval
(
local_lines
[
1
])[
0
]
self
.
assertAlmostEqual
(
local_first_loss
,
dist_first_loss
,
delta
=
delta
)
self
.
assertAlmostEqual
(
local_last_loss
,
dist_last_loss
,
delta
=
delta
)
# check tr0_out
# FIXME: ensure the server process is killed
# replace with ps0.terminate()
os
.
kill
(
ps0
.
pid
,
signal
.
SIGKILL
)
os
.
kill
(
ps1
.
pid
,
signal
.
SIGKILL
)
FNULL
.
close
()
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
a58dd3e5
...
...
@@ -11,127 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
unittest
import
os
import
sys
import
signal
import
subprocess
class
TestDistSeResneXt2x2
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
_trainers
=
2
self
.
_pservers
=
2
self
.
_ps_endpoints
=
"127.0.0.1:9123,127.0.0.1:9124"
self
.
_python_interp
=
"python"
def
start_pserver
(
self
):
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
ps0_cmd
=
"%s dist_se_resnext.py pserver %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
ps1_cmd
=
"%s dist_se_resnext.py pserver %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
ps0_proc
=
subprocess
.
Popen
(
ps0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
ps1_proc
=
subprocess
.
Popen
(
ps1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
return
ps0_proc
,
ps1_proc
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
20
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
retry_times
-=
1
def
test_with_place
(
self
):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
),
"FLAGS_fraction_of_gpu_memory_to_use"
:
"0.15"
}
# Run local to get a base line
env_local
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env_local
.
update
(
required_envs
)
local_cmd
=
"%s dist_se_resnext.py trainer %s 0 %s %d FLASE"
%
\
(
self
.
_python_interp
,
"127.0.0.1:1234"
,
"127.0.0.1:1234"
,
1
)
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env_local
)
local_proc
.
wait
()
out
,
err
=
local_proc
.
communicate
()
local_ret
=
out
sys
.
stderr
.
write
(
'local_loss: %s
\n
'
%
local_ret
)
sys
.
stderr
.
write
(
'local_stderr: %s
\n
'
%
err
)
# Run dist train to compare with local results
ps0
,
ps1
=
self
.
start_pserver
()
self
.
_wait_ps_ready
(
ps0
.
pid
)
self
.
_wait_ps_ready
(
ps1
.
pid
)
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
tr0_cmd
=
"%s dist_se_resnext.py trainer %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
self
.
_ps_endpoints
,
ps0_ep
,
self
.
_trainers
)
tr1_cmd
=
"%s dist_se_resnext.py trainer %s 1 %s %d TRUE"
%
\
(
self
.
_python_interp
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
env0
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env1
=
{
"CUDA_VISIBLE_DEVICES"
:
"1"
}
env0
.
update
(
required_envs
)
env1
.
update
(
required_envs
)
FNULL
=
open
(
os
.
devnull
,
'w'
)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env0
)
tr1_proc
=
subprocess
.
Popen
(
tr1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env1
)
tr0_proc
.
wait
()
tr1_proc
.
wait
()
out
,
err
=
tr0_proc
.
communicate
()
sys
.
stderr
.
write
(
'dist_stderr: %s
\n
'
%
err
)
loss_data0
=
out
sys
.
stderr
.
write
(
'dist_loss: %s
\n
'
%
loss_data0
)
lines
=
loss_data0
.
split
(
"
\n
"
)
dist_first_loss
=
eval
(
lines
[
0
].
replace
(
" "
,
","
))[
0
]
dist_last_loss
=
eval
(
lines
[
1
].
replace
(
" "
,
","
))[
0
]
local_lines
=
local_ret
.
split
(
"
\n
"
)
local_first_loss
=
eval
(
local_lines
[
0
])[
0
]
local_last_loss
=
eval
(
local_lines
[
1
])[
0
]
from
test_dist_base
import
TestDistBase
self
.
assertAlmostEqual
(
local_first_loss
,
dist_first_loss
)
self
.
assertAlmostEqual
(
local_last_loss
,
dist_last_loss
)
# check tr0_out
# FIXME: ensure the server process is killed
# replace with ps0.terminate()
os
.
kill
(
ps0
.
pid
,
signal
.
SIGKILL
)
os
.
kill
(
ps1
.
pid
,
signal
.
SIGKILL
)
FNULL
.
close
()
class
TestDistSeResneXt2x2
(
TestDistBase
):
def
test_se_resnext
(
self
):
# TODO(paddle-dev): Is the delta too large?
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
0.2
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_transformer.py
0 → 100644
浏览文件 @
a58dd3e5
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
from
test_dist_base
import
TestDistBase
class
TestDistTransformer2x2
(
TestDistBase
):
def
test_transformer
(
self
):
# TODO(paddle-dev): check if the delta is OK.
# Usually start around ~8000 and converge to ~5000
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
400
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
浏览文件 @
a58dd3e5
...
...
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
def
setUp
(
self
):
self
.
op_type
=
"elementwise_sub"
self
.
inputs
=
{
'X'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float32"
),
'Y'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
13
,
17
]).
astype
(
"float32"
)
'X'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
3
]).
astype
(
"float32"
),
'Y'
:
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
3
]).
astype
(
"float32"
)
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
]
-
self
.
inputs
[
'Y'
]}
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
a58dd3e5
...
...
@@ -17,6 +17,8 @@ import numpy as np
import
math
from
op_test
import
OpTest
np
.
random
.
seed
(
100
)
def
find_latest_set
(
num
):
return
1
+
int
(
math
.
floor
(
math
.
log
(
num
,
2
)))
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
a58dd3e5
...
...
@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def
test_shape
(
self
):
program
=
Program
()
with
program_guard
(
program
):
input
=
layers
.
data
(
name
=
"input"
,
shape
=
[
3
,
100
,
100
],
dtype
=
"float32"
)
out
=
layers
.
shape
(
input
,
name
=
"shape"
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
浏览文件 @
a58dd3e5
...
...
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
print
(
str
(
result_program
))
class
TestMemoryTranspiler2
(
unittest
.
TestCase
):
def
setUp
(
self
):
program
=
Program
()
with
program_guard
(
program
,
startup_program
=
Program
()):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
13
],
dtype
=
'float32'
)
fc
=
layers
.
fc
(
input
=
x
,
size
=
10
,
act
=
None
)
reshape
=
layers
.
reshape
(
x
=
fc
,
shape
=
[
-
1
,
2
,
5
])
fc
=
layers
.
reshape
(
x
=
reshape
,
shape
=
[
-
1
,
5
,
2
])
y_predict
=
layers
.
fc
(
input
=
fc
,
size
=
1
,
act
=
None
)
y
=
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
cost
=
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
layers
.
mean
(
cost
)
opt
=
optimizer
.
SGD
(
learning_rate
=
0.001
)
opt
.
minimize
(
avg_cost
)
self
.
program
=
program
def
test_inplace_ops
(
self
):
print
(
"before optimization"
)
print
(
str
(
self
.
program
))
result_program
=
memory_optimize
(
self
.
program
)
print
(
"after optimization"
)
print
(
str
(
result_program
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
a58dd3e5
...
...
@@ -211,7 +211,8 @@ class TestMNIST(TestParallelExecutorBase):
self
.
check_batchnorm_fc_convergence
(
False
)
def
test_batchnorm_fc_with_new_strategy
(
self
):
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
True
)
# FIXME(zcd): close this test temporally.
# self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
False
)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
浏览文件 @
a58dd3e5
...
...
@@ -21,7 +21,7 @@ import paddle
import
paddle.dataset.wmt16
as
wmt16
import
os
WMT16_RECORDIO_FILE
=
"
./wmt16_test_pe
.recordio"
WMT16_RECORDIO_FILE
=
"
/tmp/wmt16
.recordio"
class
ModelHyperParams
(
object
):
...
...
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
writer
.
append_tensor
(
t
)
writer
.
complete_append_tensor
()
@
unittest
.
skip
(
"transformer is buggy in multi gpu"
)
def
test_main
(
self
):
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
,
iter
=
5
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_reshape_op.py
浏览文件 @
a58dd3e5
...
...
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
new_shape
)}
def
test_check_output
(
self
):
...
...
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
self
.
attrs
[
"shape"
])}
def
test_check_output
(
self
):
...
...
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
self
.
op_type
=
"reshape"
self
.
inputs
=
{
"X"
:
np
.
random
.
random
(
ori_shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
"shape"
:
new_shape
,
"inplace"
:
False
}
self
.
attrs
=
{
"shape"
:
new_shape
}
self
.
outputs
=
{
"Out"
:
self
.
inputs
[
"X"
].
reshape
(
infered_shape
)}
def
test_check_output
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_split_ids_op.py
浏览文件 @
a58dd3e5
...
...
@@ -15,6 +15,8 @@
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
class
TestSplitIdsOp
(
OpTest
):
...
...
@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest):
self
.
check_output
()
class
TestSpliteIds
(
unittest
.
TestCase
):
def
get_places
(
self
):
places
=
[
core
.
CPUPlace
()]
return
places
def
test_check_output
(
self
):
for
place
in
self
.
get_places
():
self
.
check_with_place
(
place
)
def
check_with_place
(
self
,
place
):
scope
=
core
.
Scope
()
rows
=
[
0
,
5
,
7
,
4
,
9
]
height
=
20
row_numel
=
2
# initialize input variable X
x
=
scope
.
var
(
'X'
).
get_selected_rows
()
x
.
set_rows
(
rows
)
x
.
set_height
(
height
)
np_array
=
np
.
ones
((
len
(
rows
),
row_numel
)).
astype
(
"float32"
)
for
i
in
range
(
len
(
rows
)):
for
j
in
range
(
row_numel
):
np_array
[
i
,
j
]
=
rows
[
i
]
+
j
x_tensor
=
x
.
get_tensor
()
x_tensor
.
set
(
np_array
,
place
)
outs_name
=
[
"out%d"
%
i
for
i
in
xrange
(
3
)]
outs
=
[
scope
.
var
(
var_name
).
get_selected_rows
()
for
var_name
in
outs_name
]
# expected output selected rows
expected_out_rows
=
[[
0
,
9
],
[
7
,
4
],
[
5
]]
op
=
Operator
(
"split_ids"
,
Ids
=
"X"
,
Out
=
outs_name
)
for
_
in
range
(
3
):
op
.
run
(
scope
,
place
)
for
i
in
range
(
len
(
outs
)):
expected_rows
=
expected_out_rows
[
i
]
self
.
assertEqual
(
outs
[
i
].
rows
(),
expected_rows
)
for
j
in
range
(
len
(
expected_rows
)):
row
=
expected_rows
[
j
]
self
.
assertAlmostEqual
(
float
(
row
),
np
.
array
(
outs
[
i
].
get_tensor
())[
j
,
0
])
self
.
assertAlmostEqual
(
float
(
row
+
1
),
np
.
array
(
outs
[
i
].
get_tensor
())[
j
,
1
])
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/testsuite.py
浏览文件 @
a58dd3e5
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/transformer_model.py
浏览文件 @
a58dd3e5
此差异已折叠。
点击以展开。
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
a58dd3e5
此差异已折叠。
点击以展开。
tools/manylinux1/Dockerfile.x64
浏览文件 @
a58dd3e5
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录