Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
ddfb9f11
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ddfb9f11
编写于
1月 11, 2019
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into imperative_shared_ptr
test=develop
上级
d0b640dc
d1220f23
变更
44
隐藏空白更改
内联
并排
Showing
44 changed file
with
1837 addition
and
340 deletion
+1837
-340
cmake/external/gflags.cmake
cmake/external/gflags.cmake
+9
-0
cmake/generic.cmake
cmake/generic.cmake
+10
-9
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-0
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+11
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+63
-0
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+15
-0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+1
-1
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
.../fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+148
-0
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
...e/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+38
-0
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+1
-1
paddle/fluid/imperative/layer.cc
paddle/fluid/imperative/layer.cc
+98
-28
paddle/fluid/imperative/layer.h
paddle/fluid/imperative/layer.h
+40
-5
paddle/fluid/imperative/tracer.cc
paddle/fluid/imperative/tracer.cc
+61
-8
paddle/fluid/imperative/tracer.h
paddle/fluid/imperative/tracer.h
+3
-0
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+1
-0
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+2
-2
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+4
-0
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+31
-6
paddle/fluid/inference/utils/CMakeLists.txt
paddle/fluid/inference/utils/CMakeLists.txt
+0
-3
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+64
-85
paddle/fluid/operators/data_norm_op.cc
paddle/fluid/operators/data_norm_op.cc
+409
-0
paddle/fluid/operators/data_norm_op.h
paddle/fluid/operators/data_norm_op.h
+35
-0
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/jit/benchmark.cc
+2
-2
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+162
-0
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
+118
-0
paddle/fluid/platform/cuda_helper_test.cu
paddle/fluid/platform/cuda_helper_test.cu
+3
-0
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+1
-1
paddle/fluid/platform/float16.h
paddle/fluid/platform/float16.h
+1
-1
paddle/fluid/platform/float16_test.cu
paddle/fluid/platform/float16_test.cu
+4
-2
paddle/fluid/platform/temporary_allocator.cc
paddle/fluid/platform/temporary_allocator.cc
+14
-49
paddle/fluid/platform/temporary_allocator.h
paddle/fluid/platform/temporary_allocator.h
+5
-5
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+6
-52
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+2
-3
paddle/fluid/pybind/imperative.cc
paddle/fluid/pybind/imperative.cc
+3
-1
paddle/fluid/pybind/imperative.h
paddle/fluid/pybind/imperative.h
+1
-5
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+35
-7
python/paddle/dataset/mnist.py
python/paddle/dataset/mnist.py
+43
-48
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-2
python/paddle/fluid/imperative/layers.py
python/paddle/fluid/imperative/layers.py
+53
-3
python/paddle/fluid/imperative/nn.py
python/paddle/fluid/imperative/nn.py
+3
-3
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+173
-3
python/paddle/fluid/tests/unittests/test_imperative.py
python/paddle/fluid/tests/unittests/test_imperative.py
+100
-3
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
...paddle/fluid/tests/unittests/test_imperative_optimizer.py
+2
-2
python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
...d/tests/unittests/test_teacher_student_sigmoid_loss_op.py
+59
-0
未找到文件。
cmake/external/gflags.cmake
浏览文件 @
ddfb9f11
...
...
@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST
(
APPEND external_project_dependencies gflags
)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
# When the header is found, shlwapi.lib is recorded in the GLOBAL property
# OS_DEPENDENCY_MODULES so that target helpers can read it back with
# get_property() and pass it to target_link_libraries().
if (WIN32)
  include(CheckIncludeFileCXX)
  # Sets HAVE_SHLWAPI when "shlwapi.h" can be included by the C++ compiler.
  check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
  if (HAVE_SHLWAPI)
    set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
  endif(HAVE_SHLWAPI)
endif (WIN32)
IF
(
WITH_C_API
)
INSTALL
(
DIRECTORY
${
GFLAGS_INCLUDE_DIR
}
DESTINATION third_party/gflags
)
IF
(
ANDROID
)
...
...
cmake/generic.cmake
浏览文件 @
ddfb9f11
...
...
@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
add_dependencies
(
${
TARGET_NAME
}
${
cc_binary_DEPS
}
)
common_link
(
${
TARGET_NAME
}
)
endif
()
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
target_link_libraries
(
${
TARGET_NAME
}
${
os_dependency_modules
}
)
endfunction
(
cc_binary
)
function
(
cc_test TARGET_NAME
)
...
...
@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS ARGS
)
cmake_parse_arguments
(
cc_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
add_executable
(
${
TARGET_NAME
}
${
cc_test_SRCS
}
)
if
(
WIN32
)
list
(
APPEND win32_deps shlwapi
)
if
(
"
${
cc_test_DEPS
}
;"
MATCHES
"python;"
)
list
(
REMOVE_ITEM cc_test_DEPS python
)
list
(
APPEND win32_deps
${
PYTHON_LIBRARIES
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
PYTHON_LIBRARIES
}
)
endif
()
endif
(
WIN32
)
add_executable
(
${
TARGET_NAME
}
${
cc_test_SRCS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
if
(
WIN32
)
target_link_libraries
(
${
TARGET_NAME
}
${
win32_deps
}
)
endif
(
WIN32
)
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
target_link_libraries
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
${
os_dependency_modules
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
add_dependencies
(
${
TARGET_NAME
}
${
cc_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
common_link
(
${
TARGET_NAME
}
)
add_test
(
NAME
${
TARGET_NAME
}
...
...
@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
set
(
multiValueArgs SRCS DEPS
)
cmake_parse_arguments
(
nv_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cuda_add_executable
(
${
TARGET_NAME
}
${
nv_test_SRCS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
nv_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
target_link_libraries
(
${
TARGET_NAME
}
${
nv_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
${
os_dependency_modules
}
)
add_dependencies
(
${
TARGET_NAME
}
${
nv_test_DEPS
}
paddle_gtest_main lod_tensor memory gtest gflags glog
)
common_link
(
${
TARGET_NAME
}
)
add_test
(
${
TARGET_NAME
}
${
TARGET_NAME
}
)
...
...
@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
endif
()
add_executable
(
${
TARGET_NAME
}
${
_cmake_options
}
${
_generated_files
}
${
_sources
}
)
set_target_properties
(
${
TARGET_NAME
}
PROPERTIES LINKER_LANGUAGE HIP
)
target_link_libraries
(
${
TARGET_NAME
}
${
hip_test_DEPS
}
paddle_gtest_main memory gtest gflags
)
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
target_link_libraries
(
${
TARGET_NAME
}
${
hip_test_DEPS
}
paddle_gtest_main memory gtest gflags
${
os_dependency_modules
}
)
add_dependencies
(
${
TARGET_NAME
}
${
hip_test_DEPS
}
paddle_gtest_main memory gtest gflags
)
common_link
(
${
TARGET_NAME
}
)
add_test
(
${
TARGET_NAME
}
${
TARGET_NAME
}
)
...
...
paddle/fluid/API.spec
浏览文件 @
ddfb9f11
...
...
@@ -88,6 +88,7 @@ paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'poo
paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.data_norm ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, False, None, None, None, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
...
@@ -210,6 +211,7 @@ paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], va
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
ddfb9f11
...
...
@@ -48,6 +48,17 @@ pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library
(
conv_elementwise_add2_act_fuse_pass inference
)
pass_library
(
conv_elementwise_add_fuse_pass inference
)
pass_library
(
conv_affine_channel_fuse_pass inference
)
pass_library
(
transpose_flatten_concat_fuse_pass inference
)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
# RANGE 3 6 is inclusive, so this appends four lines to ${pass_file}:
# USE_PASS(transpose_flatten3_concat_fuse_pass); ... through index 6.
foreach (index RANGE 3 6)
  file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
if
(
WITH_MKLDNN
)
pass_library
(
mkldnn_placement_pass base
)
pass_library
(
depthwise_conv_mkldnn_pass base
)
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
ddfb9f11
...
...
@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
return
ac_out_var
;
}
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z ->
// concat_op
// Builds the pattern for `times` parallel branches
//   conv_in[i] -> transpose2 -> transpose_out_i -> flatten2 -> flatten_out_i
// where flatten_out_i is the i-th "X" input of a single concat op that has
// exactly `times` inputs. Returns the pattern node of the concat output.
//
// conv_in is indexed for every i in [0, times) without a range check, so it
// must hold at least `times` nodes.
PDNode *patterns::TransposeFlattenConcat::operator()(
    std::vector<PDNode *> conv_in, int times) {
  // The four pattern nodes that make up one transpose/flatten branch.
  struct Branch {
    PDNode *trans;
    PDNode *trans_out;
    PDNode *flatten;
    PDNode *flatten_out;
  };

  // Step 1: create the nodes of every branch (branch by branch, in the order
  // trans, trans_out, flatten, flatten_out).
  std::vector<Branch> branches;
  for (int i = 0; i < times; i++) {
    const std::string idx = std::to_string(i);
    Branch branch;
    branch.trans = pattern->NewNode(GetNodeName("transpose" + idx))
                       ->assert_is_op("transpose2");
    branch.trans_out = pattern->NewNode(GetNodeName("transpose_out" + idx))
                           ->assert_is_op_output("transpose2")
                           ->assert_is_op_input("flatten2", "X")
                           ->AsIntermediate();
    branch.flatten = pattern->NewNode(GetNodeName("flatten" + idx))
                         ->assert_is_op("flatten2");
    branch.flatten_out = pattern->NewNode(GetNodeName("flatten_out" + idx))
                             ->assert_is_op_output("flatten2")
                             ->assert_is_op_nth_input("concat", "X", i)
                             ->AsIntermediate();
    branches.push_back(branch);
  }

  // Step 2: create the concat op and its output.
  auto concat_op = pattern->NewNode(GetNodeName("concat"))
                       ->assert_is_op("concat")
                       ->assert_op_has_n_inputs("concat", times);
  auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
                        ->assert_is_op_output("concat")
                        ->AsOutput();

  // Step 3: wire each branch from its input through to flatten_out and
  // collect the flatten outputs that feed the concat.
  std::vector<PDNode *> flatten_outs;
  for (int i = 0; i < times; i++) {
    const Branch &branch = branches[i];
    PDNode *branch_in = conv_in[i];
    branch_in->AsInput();
    branch.trans->LinksFrom({branch_in});
    branch.trans_out->LinksFrom({branch.trans});
    branch.flatten->LinksFrom({branch.trans_out});
    branch.flatten_out->LinksFrom({branch.flatten});
    flatten_outs.push_back(branch.flatten_out);
  }

  concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
  return concat_out;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
ddfb9f11
...
...
@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE
(
ac_out
);
// Out
};
// Pattern for several transpose2 -> flatten2 branches whose outputs are all
// inputs of one concat op. Nodes are registered under names produced by
// GetNodeName(), e.g. GetNodeName("transpose0"), GetNodeName("concat").
struct TransposeFlattenConcat : public PatternBase {
  TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}

  // Builds the pattern for `times` branches starting at `conv_inputs` and
  // returns the concat output node (defined in graph_pattern_detector.cc).
  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);

  // Name under which the node for `op_type` is registered in this pattern.
  std::string GetNodeName(const std::string& op_type) {
    return PDNodeName(name_scope_, repr_, id_, op_type);
  }

  // Retrieves the pattern node registered under GetNodeName(op_type).
  PDNode* GetPDNode(const std::string& op_type) {
    const std::string node_name = GetNodeName(op_type);
    return pattern->RetrieveNode(node_name);
  }
};
}
// namespace patterns
// Link two ir::Nodes from each other.
...
...
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
浏览文件 @
ddfb9f11
...
...
@@ -50,7 +50,7 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
// the other one should be unused empty var.
if
(
is_nth_input_var_of_concat
(
x
->
outputs
[
0
],
idx
))
{
satisfied_all
=
satisfied_all
&&
x
->
outputs
[
1
]
->
IsVar
()
&&
x
->
outputs
[
1
]
->
outputs
.
size
()
==
0
;
x
->
outputs
[
1
]
->
outputs
.
empty
()
;
}
else
{
satisfied_all
=
satisfied_all
&&
is_nth_input_var_of_concat
(
x
->
outputs
[
1
],
idx
)
&&
...
...
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
0 → 100644
浏览文件 @
ddfb9f11
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Replaces every matched group of `times` parallel
//   x_i -> transpose2 -> flatten2
// branches that feed one concat op with a single
// "fusion_transpose_flatten_concat" op reading x_0..x_{times-1} and writing
// the original concat output.
//
// The fused op carries one transpose axis and one flatten axis, so a match is
// only fused when every branch uses the same "axis" attributes as branch 0;
// otherwise the subgraph is left untouched.
template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  // The handler reads branch 0 unconditionally.
  static_assert(times > 0,
                "TransposeFlattenConcatFusePass needs at least one branch");

  const std::string pattern_name =
      "transpose_flatten" + std::to_string(times) + "_concat_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  std::vector<PDNode *> input_nodes;
  for (int i = 0; i < times; i++) {
    input_nodes.push_back(gpd.mutable_pattern()
                              ->NewNode("x" + std::to_string(i))
                              ->assert_is_op_input("transpose2", "X")
                              ->AsInput());
  }

  patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(),
                                           pattern_name);
  pattern(input_nodes, times);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    // `nodes` stores kNumFields consecutive entries per branch:
    // {input, transpose, transpose_out, flatten, flatten_out}.
    const int kNumFields = 5;
    const int kTransOffset = 1;
    const int kTransOutOffset = 2;
    const int kFlattenOffset = 3;
    const int kFlattenOutOffset = 4;

    std::vector<Node *> nodes;
    for (int i = 0; i < times; i++) {
      const std::string idx = std::to_string(i);
      // Look every matched node up once and validate it before use.
      Node *trans = subgraph.at(pattern.GetPDNode("transpose" + idx));
      Node *trans_out = subgraph.at(pattern.GetPDNode("transpose_out" + idx));
      Node *flatten = subgraph.at(pattern.GetPDNode("flatten" + idx));
      Node *flatten_out = subgraph.at(pattern.GetPDNode("flatten_out" + idx));
      Node *input = subgraph.at(input_nodes[i]);
      PADDLE_ENFORCE(trans);
      PADDLE_ENFORCE(trans_out);
      PADDLE_ENFORCE(flatten);
      PADDLE_ENFORCE(flatten_out);
      PADDLE_ENFORCE(input);

      nodes.push_back(input);
      nodes.push_back(trans);
      nodes.push_back(trans_out);
      nodes.push_back(flatten);
      nodes.push_back(flatten_out);
    }

    Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
    Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));

    std::vector<int> trans_axis = boost::get<std::vector<int>>(
        nodes[kTransOffset]->Op()->GetAttr("axis"));
    int flatten_axis =
        boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
    int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));

    // The fused op applies branch 0's axes to every input, so fusing branches
    // with different axes would change the result. Skip such matches.
    for (int i = 1; i < times; i++) {
      const int base = i * kNumFields;
      const bool same_trans =
          boost::get<std::vector<int>>(
              nodes[base + kTransOffset]->Op()->GetAttr("axis")) == trans_axis;
      const bool same_flatten =
          boost::get<int>(
              nodes[base + kFlattenOffset]->Op()->GetAttr("axis")) ==
          flatten_axis;
      if (!same_trans || !same_flatten) {
        return;
      }
    }

    std::vector<std::string> input_names;
    for (int i = 0; i < times; i++) {
      input_names.push_back(nodes[i * kNumFields]->Name());
    }
    std::string output_name = concat_out->Name();

    framework::OpDesc new_op_desc;
    new_op_desc.SetType("fusion_transpose_flatten_concat");
    new_op_desc.SetInput("X", input_names);
    new_op_desc.SetAttr("trans_axis", trans_axis);
    new_op_desc.SetAttr("flatten_axis", flatten_axis);
    new_op_desc.SetAttr("concat_axis", concat_axis);
    new_op_desc.SetOutput("Out", {output_name});
    new_op_desc.Flush();

    // Create a new node for the fused op.
    auto *fused_op = graph->CreateOpNode(&new_op_desc);

    // Connect each branch input to the fused op and schedule the replaced
    // transpose/flatten nodes for removal.
    std::unordered_set<const Node *> delete_nodes;
    for (int i = 0; i < times; i++) {
      const int base = i * kNumFields;
      nodes[base]->outputs.push_back(fused_op);
      fused_op->inputs.push_back(nodes[base]);
      delete_nodes.insert(nodes[base + kTransOffset]);
      delete_nodes.insert(nodes[base + kTransOutOffset]);
      delete_nodes.insert(nodes[base + kFlattenOffset]);
      delete_nodes.insert(nodes[base + kFlattenOutOffset]);
    }
    delete_nodes.insert(concat_op);

    fused_op->outputs.push_back(concat_out);
    concat_out->inputs.push_back(fused_op);

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), delete_nodes);
  };

  gpd(graph.get(), handler);
  return graph;
}
// ApplyImpl is defined in this .cc file rather than in the header, so each
// branch count used by a REGISTER_PASS call is explicitly instantiated here.
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
}
// namespace ir
}
// namespace framework
}
// namespace paddle
// One registered pass per instantiated branch count. The un-numbered name is
// the single-branch (times == 1) variant; the numbered names carry the branch
// count between "transpose_flatten" and "_concat_fuse_pass".
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
0 → 100644
浏览文件 @
ddfb9f11
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// There may be many transpose-flatten structures in a model, and the output of
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
// Fuse pass parameterized on `times`, the number of transpose-flatten
// branches that feed the concat op. ApplyImpl is only declared here; its
// definition lives in the matching .cc file.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
 public:
  virtual ~TransposeFlattenConcatFusePass() = default;

 protected:
  // Takes ownership of `graph`, rewrites matched subgraphs and returns it.
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/operator.h
浏览文件 @
ddfb9f11
...
...
@@ -391,7 +391,7 @@ class ExecutionContext {
PADDLE_ENFORCE
(
dynamic_cast
<
platform
::
TemporaryAllocation
*>
(
allocation_ptr
)
!=
nullptr
,
"The AllocationPtr must be TemporaryAllocation."
);
PADDLE_ENFORCE_
GE
(
allocation_ptr
->
size
(),
PADDLE_ENFORCE_
EQ
(
allocation_ptr
->
size
(),
framework
::
product
(
dim
)
*
sizeof
(
T
));
paddle
::
framework
::
Tensor
temp_tensor
(
...
...
paddle/fluid/imperative/layer.cc
浏览文件 @
ddfb9f11
...
...
@@ -27,6 +27,8 @@
namespace
paddle
{
namespace
imperative
{
std
::
map
<
int
,
py
::
object
>
py_funcs_
;
using
framework
::
Variable
;
void
AddTo
(
Variable
*
src
,
Variable
*
dst
)
{
...
...
@@ -55,6 +57,7 @@ class Autograd {
if
(
var
->
stop_gradient_
)
{
return
;
}
VLOG
(
3
)
<<
"start autograd"
;
std
::
deque
<
OpBase
*>
ready
;
ready
.
push_back
(
var
->
pre_op_
);
...
...
@@ -120,51 +123,57 @@ framework::LoDTensor& VarBase::GradValue() {
}
std
::
map
<
std
::
string
,
std
::
vector
<
VarBase
*>>
OpBase
::
ApplyGrad
()
{
if
(
!
grad_op_desc_
)
{
if
(
!
grad_op_desc_
&&
backward_id_
<=
0
)
{
LOG
(
WARNING
)
<<
"op with no grad: "
<<
op_desc_
->
Type
();
return
{};
}
VLOG
(
3
)
<<
"op grad "
<<
grad_op_desc_
->
Type
();
std
::
vector
<
std
::
unique_ptr
<
framework
::
Variable
>>
tmp_vars
;
std
::
map
<
std
::
string
,
std
::
vector
<
framework
::
Variable
*>>
grad_outputs
;
for
(
auto
it
:
grad_output_vars_
)
{
auto
&
outputs
=
grad_outputs
[
it
.
first
];
for
(
size_t
i
=
0
;
i
<
it
.
second
.
size
();
++
i
)
{
// Allocate a new variable
Variable
*
tmp_var
=
new
framework
::
Variable
();
tmp_var
->
GetMutable
<
framework
::
LoDTensor
>
();
tmp_vars
.
emplace_back
(
tmp_var
);
outputs
.
push_back
(
tmp_var
);
if
(
backward_id_
>
0
)
{
VLOG
(
3
)
<<
"py_layer_grad"
;
grad_outputs
[
"Out@GRAD"
]
=
PyLayer
::
ApplyGrad
(
backward_id_
,
grad_input_vars_
[
"X@GRAD"
]);
}
else
{
VLOG
(
3
)
<<
"op grad "
<<
grad_op_desc_
->
Type
();
for
(
auto
it
:
grad_output_vars_
)
{
auto
&
outputs
=
grad_outputs
[
it
.
first
];
for
(
size_t
i
=
0
;
i
<
it
.
second
.
size
();
++
i
)
{
// Allocate a new variable
Variable
*
tmp_var
=
new
framework
::
Variable
();
tmp_var
->
GetMutable
<
framework
::
LoDTensor
>
();
outputs
.
push_back
(
tmp_var
);
}
}
}
framework
::
RuntimeContext
ctx
(
grad_input_vars_
,
grad_outputs
);
framework
::
RuntimeContext
ctx
(
grad_input_vars_
,
grad_outputs
);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_
->
InferVarType
(
block_
);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_
->
InferVarType
(
block_
);
std
::
unique_ptr
<
framework
::
OperatorBase
>
opbase
=
framework
::
OpRegistry
::
CreateOp
(
*
grad_op_desc_
);
framework
::
OperatorWithKernel
*
op_kernel
=
dynamic_cast
<
framework
::
OperatorWithKernel
*>
(
opbase
.
get
());
PADDLE_ENFORCE_NOT_NULL
(
op_kernel
,
"only support op with kernel"
);
std
::
unique_ptr
<
framework
::
OperatorBase
>
opbase
=
framework
::
OpRegistry
::
CreateOp
(
*
grad_op_desc_
);
framework
::
OperatorWithKernel
*
op_kernel
=
dynamic_cast
<
framework
::
OperatorWithKernel
*>
(
opbase
.
get
());
PADDLE_ENFORCE_NOT_NULL
(
op_kernel
,
"only support op with kernel"
);
framework
::
Scope
scope
;
platform
::
CPUPlace
place
;
PreparedOp
p
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
place
);
p
.
op
.
RuntimeInferShape
(
scope
,
place
,
ctx
);
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
));
framework
::
Scope
scope
;
platform
::
CPUPlace
place
;
PreparedOp
p
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
place
);
p
.
op
.
RuntimeInferShape
(
scope
,
place
,
ctx
);
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
));
}
for
(
auto
it
:
grad_output_vars_
)
{
auto
&
outputs
=
grad_outputs
[
it
.
first
];
auto
&
origin_outputs
=
it
.
second
;
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
origin_outputs
.
size
());
for
(
size_t
i
=
0
;
i
<
outputs
.
size
();
++
i
)
{
framework
::
Variable
*
grad
=
outputs
[
i
];
framework
::
Variable
*
orig_grad
=
origin_outputs
[
i
];
AddTo
(
outputs
[
i
],
orig_grad
);
AddTo
(
grad
,
orig_grad
);
delete
grad
;
}
}
return
input_vars_
;
...
...
@@ -173,6 +182,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
void
VarBase
::
RunBackward
()
{
if
(
!
pre_op_
)
return
;
VLOG
(
3
)
<<
"start backward"
;
auto
grads_t
=
grads_
->
var_
->
GetMutable
<
framework
::
LoDTensor
>
();
float
*
data
=
grads_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
std
::
fill
(
data
,
data
+
grads_t
->
numel
(),
1.0
);
...
...
@@ -183,5 +193,65 @@ void VarBase::RunBackward() {
Autograd
().
RunBackward
(
this
);
}
// Stores `py_func` in the file-level py_funcs_ map under `func_id`.
// operator[] is used, so an existing entry with the same id is overwritten.
void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
  py_funcs_[func_id] = py_func;
}
// Returns how many Python callables are currently registered in py_funcs_.
int PyLayer::NumFuncs() {
  const auto registered = py_funcs_.size();
  return static_cast<int>(registered);
}
std
::
vector
<
VarBase
*>
PyLayer
::
Apply
(
int
func_id
,
const
std
::
vector
<
VarBase
*>&
inputs
)
{
std
::
vector
<
framework
::
Variable
*>
invars
;
for
(
const
VarBase
*
in
:
inputs
)
{
invars
.
push_back
(
in
->
var_
);
}
PADDLE_ENFORCE
(
py_funcs_
.
find
(
func_id
)
!=
py_funcs_
.
end
());
std
::
vector
<
Variable
*>
outvars
=
CallPythonFunc
(
py_funcs_
[
func_id
],
invars
);
std
::
vector
<
VarBase
*>
ret
;
for
(
Variable
*
v
:
outvars
)
{
ret
.
push_back
(
new
VarBase
(
v
,
new
VarBase
(
true
)));
}
return
ret
;
}
std
::
vector
<
Variable
*>
PyLayer
::
ApplyGrad
(
int
func_id
,
const
std
::
vector
<
framework
::
Variable
*>&
inputs
)
{
PADDLE_ENFORCE
(
py_funcs_
.
find
(
func_id
)
!=
py_funcs_
.
end
());
return
CallPythonFunc
(
py_funcs_
[
func_id
],
inputs
);
}
// Calls `callable` with one positional tuple built from `ins` and converts the
// returned tuple into newly allocated Variables.
//
// Inputs: each Variable is read as a LoDTensor; an uninitialized tensor is
// passed to Python as py::cast(nullptr).
// Outputs: every element of the returned tuple must cast to a non-null
// LoDTensor*. Each result Variable shares data with (ShareDataWith) and copies
// the LoD of the corresponding Python tensor.
// Errors: a failed cast raises via PADDLE_THROW, a null tensor raises via
// PADDLE_ENFORCE_NOT_NULL. Variables created for earlier outputs are released
// automatically in that case instead of being leaked.
// The caller owns the returned raw pointers.
std::vector<framework::Variable*> PyLayer::CallPythonFunc(
    const py::object& callable,
    const std::vector<framework::Variable*>& ins) {
  // Hold the GIL for the whole Python interaction.
  py::gil_scoped_acquire guard;

  py::tuple in_args(ins.size());
  for (size_t i = 0; i < ins.size(); ++i) {
    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
    in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
  }
  VLOG(3) << "pyfunc in " << py::len(in_args);

  // TODO(panyx0718): Who owns the returned LoDTensor.
  auto ret = callable(in_args);
  auto ret_tuple = py::cast<py::tuple>(ret);
  size_t ret_num = py::len(ret_tuple);
  VLOG(3) << "pyfunc out " << ret_num;

  // Keep results in owning pointers until every output has been converted, so
  // that an exception on output k does not leak outputs 0..k-1.
  std::vector<std::unique_ptr<framework::Variable>> owned;
  owned.reserve(ret_num);
  for (size_t i = 0; i < ret_num; ++i) {
    try {
      auto* py_out_tensor = py::cast<framework::LoDTensor*>(ret_tuple[i]);
      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
                              "Output tensor %d should not be nullptr", i);
      owned.emplace_back(new framework::Variable());
      auto* tensor = owned.back()->GetMutable<framework::LoDTensor>();
      tensor->ShareDataWith(*py_out_tensor);
      tensor->set_lod(py_out_tensor->lod());
    } catch (py::cast_error&) {
      PADDLE_THROW("The %d-th output must be LoDTensor", i);
    }
  }

  // Success: hand ownership of every Variable to the caller.
  std::vector<framework::Variable*> outs;
  outs.reserve(owned.size());
  for (auto& var : owned) {
    outs.push_back(var.release());
  }
  return outs;
}
}
// namespace imperative
}
// namespace paddle
paddle/fluid/imperative/layer.h
浏览文件 @
ddfb9f11
...
...
@@ -22,12 +22,15 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace
paddle
{
namespace
imperative
{
namespace
py
=
::
pybind11
;
class
PreparedOp
{
public:
PreparedOp
(
const
framework
::
OperatorBase
&
op
,
...
...
@@ -90,16 +93,21 @@ class OpBase;
*/
class
VarBase
{
public:
VarBase
()
VarBase
()
:
VarBase
(
new
framework
::
Variable
(),
new
VarBase
(
true
))
{}
// Owns `var` and `grad`
VarBase
(
framework
::
Variable
*
var
,
VarBase
*
grad
)
:
pre_op_
(
nullptr
),
pre_op_out_name_
(),
pre_op_out_idx_
(
-
1
),
var_desc_
(
nullptr
),
var_
(
new
framework
::
Variable
()
),
grads_
(
new
VarBase
(
true
)
),
var_
(
var
),
grads_
(
grad
),
stop_gradient_
(
false
)
{}
explicit
VarBase
(
bool
stop_gradient
)
:
pre_op_
(
nullptr
),
pre_op_out_name_
(),
pre_op_out_idx_
(
-
1
),
var_desc_
(
nullptr
),
var_
(
new
framework
::
Variable
()),
...
...
@@ -144,7 +152,11 @@ class VarBase {
*/
class
OpBase
{
public:
OpBase
()
:
op_desc_
(
nullptr
),
grad_op_desc_
(
nullptr
)
{}
OpBase
()
:
op_desc_
(
nullptr
),
forward_id_
(
-
1
),
grad_op_desc_
(
nullptr
),
backward_id_
(
-
1
)
{}
virtual
~
OpBase
()
{
if
(
grad_op_desc_
)
delete
grad_op_desc_
;
...
...
@@ -152,8 +164,14 @@ class OpBase {
std
::
map
<
std
::
string
,
std
::
vector
<
VarBase
*>>
ApplyGrad
();
// One of `op_desc_` or `forward_id_` is set, not both.
// For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
framework
::
OpDesc
*
op_desc_
;
int
forward_id_
;
// When has backward, one of `grad_op_desc_` or `backward_id_` is set,
// not both.
framework
::
OpDesc
*
grad_op_desc_
;
int
backward_id_
;
VarBasePtrMap
input_vars_
;
VarBasePtrMap
output_vars_
;
...
...
@@ -173,8 +191,25 @@ class Layer {
std
::
vector
<
VarBase
>
vars
;
return
vars
;
}
};
// Entry points for layers whose computation is supplied as Python callables
// addressed by an integer function id. Only declarations live here; the
// definitions are in the corresponding .cc file.
class PyLayer {
 public:
  virtual ~PyLayer() {}

  // Customisation hook. The default implementation only logs an error.
  virtual void Backward() { LOG(ERROR) << "To support customize"; }

  // Registers `py_func` under `func_id`.
  // NOTE(review): presumably later looked up by Apply/ApplyGrad through the
  // same id -- confirm against the definition.
  static void RegisterFunc(int func_id, const py::object& py_func);

  // Number of registered functions (per the name; definition not shown here).
  static int NumFuncs();

  // Invokes the function identified by `func_id` on `inputs` and returns the
  // produced variables.
  static std::vector<VarBase*> Apply(int func_id,
                                     const std::vector<VarBase*>& inputs);

  // Same as Apply, but operating directly on framework::Variable pointers.
  static std::vector<framework::Variable*> ApplyGrad(
      int func_id, const std::vector<framework::Variable*>& inputs);

 private:
  // Calls `callable` with `ins` and returns the resulting variables.
  static std::vector<framework::Variable*> CallPythonFunc(
      const py::object& callable,
      const std::vector<framework::Variable*>& ins);
};
}
// namespace imperative
...
...
paddle/fluid/imperative/tracer.cc
浏览文件 @
ddfb9f11
...
...
@@ -115,8 +115,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
if
(
!
stop_gradient
)
{
framework
::
OpDesc
*
grad_op_desc
;
auto
grad_to_var
=
new
std
::
unordered_map
<
std
::
string
,
std
::
string
>
();
CreateGradOp
(
*
op_desc
,
{},
{
block
},
&
grad_op_desc
,
grad_to_var
);
// TODO(panyx): Is this leaked?
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
grad_to_var
(
new
std
::
unordered_map
<
std
::
string
,
std
::
string
>
());
CreateGradOp
(
*
op_desc
,
{},
{
block
},
&
grad_op_desc
,
grad_to_var
.
get
());
op
->
grad_op_desc_
=
grad_op_desc
;
for
(
auto
it
:
grad_op_desc
->
Inputs
())
{
...
...
@@ -127,13 +129,15 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
if
(
var_it
==
grad_to_var
->
end
())
{
auto
fwd_var_it
=
vars
.
find
(
grad_invar
);
PADDLE_ENFORCE
(
fwd_var_it
!=
vars
.
end
());
// Forward inputs or outputs.
grad_in_vars
.
push_back
(
fwd_var_it
->
second
->
var_
);
}
else
{
VarBase
*
var
=
vars
[
var_it
->
second
];
if
(
!
var
->
grads_
->
var_
->
IsInitialized
())
{
InitVar
(
var
->
var_
,
var
->
grads_
->
var_
);
if
(
!
var
->
grads_
->
IsInitialized
())
{
InitVar
(
var
->
var_
,
var
->
grads_
);
}
grad_in_vars
.
push_back
(
var
->
grads_
->
var_
);
// Douts.
grad_in_vars
.
push_back
(
var
->
grads_
);
}
}
}
...
...
@@ -145,10 +149,10 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
auto
var_it
=
grad_to_var
->
find
(
grad_outvar
);
PADDLE_ENFORCE
(
var_it
!=
grad_to_var
->
end
());
VarBase
*
var
=
vars
[
var_it
->
second
];
if
(
!
var
->
grads_
->
var_
->
IsInitialized
())
{
InitVar
(
var
->
var_
,
var
->
grads_
->
var_
);
if
(
!
var
->
grads_
->
IsInitialized
())
{
InitVar
(
var
->
var_
,
var
->
grads_
);
}
grad_out_vars
.
push_back
(
var
->
grads_
->
var_
);
grad_out_vars
.
push_back
(
var
->
grads_
);
}
}
}
...
...
@@ -156,5 +160,54 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op
->
block_
=
block
;
}
// Traces an op whose forward pass is a Python function.
//
// Runs PyLayer::Apply with op->forward_id_ on `inputs`, stores the inputs and
// results on `op` under the slots "X" and "Out", links every output back to
// `op`, and (unless `stop_gradient` is set) fills the op's gradient
// input/output variable lists. Returns a copy of the output list.
std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
                                      const std::vector<VarBase*>& inputs,
                                      bool stop_gradient) {
  VLOG(3) << "py_trace";
  op->input_vars_["X"] = inputs;
  op->output_vars_["Out"] = PyLayer::Apply(op->forward_id_, inputs);

  // Record which op produced each input.
  // NOTE(review): no out-index is stored for inputs without a producer, so
  // pre_ops_out_idx_["X"] can be shorter than pre_ops_["X"].
  for (VarBase* in_var : inputs) {
    if (!in_var->pre_op_) {
      op->pre_ops_["X"].push_back(nullptr);
      continue;
    }
    op->pre_ops_["X"].push_back(in_var->pre_op_);
    op->pre_ops_out_idx_["X"].push_back(in_var->pre_op_out_idx_);
  }

  // Link every output back to this op and its position in the "Out" slot.
  auto& outputs = op->output_vars_["Out"];
  size_t out_pos = 0;
  for (VarBase* out_var : outputs) {
    out_var->stop_gradient_ = stop_gradient;
    out_var->pre_op_ = op;
    out_var->pre_op_out_name_ = "Out";
    out_var->pre_op_out_idx_ = out_pos;
    ++out_pos;
  }

  if (stop_gradient) {
    return outputs;
  }

  auto& grad_ins = op->grad_input_vars_["X@GRAD"];
  auto& grad_outs = op->grad_output_vars_["Out@GRAD"];

  // Gradient inputs are laid out as: forward inputs, forward outputs, then
  // the gradients of the forward outputs.
  for (const VarBase* in_var : inputs) {
    grad_ins.push_back(in_var->var_);
  }
  for (VarBase* out_var : outputs) {
    grad_ins.push_back(out_var->var_);
  }
  for (VarBase* out_var : outputs) {
    grad_ins.push_back(out_var->grads_);
    auto& d_out = grad_ins.back();
    if (!d_out->IsInitialized()) {
      InitVar(out_var->var_, d_out);
    }
  }

  // Gradient outputs are the gradients of the forward inputs.
  for (const VarBase* in_var : inputs) {
    grad_outs.push_back(in_var->grads_);
    auto& d_in = grad_outs.back();
    if (!d_in->IsInitialized()) {
      InitVar(in_var->var_, d_in);
    }
  }
  return outputs;
}
}
// namespace imperative
}
// namespace paddle
paddle/fluid/imperative/tracer.h
浏览文件 @
ddfb9f11
...
...
@@ -45,6 +45,9 @@ class Tracer {
const
std
::
map
<
std
::
string
,
std
::
vector
<
VarBase
*>>&
outputs
,
framework
::
BlockDesc
*
block
,
const
bool
stop_gradient
=
false
);
std
::
vector
<
VarBase
*>
PyTrace
(
OpBase
*
op
,
const
std
::
vector
<
VarBase
*>&
inputs
,
bool
stop_gradient
=
false
);
private:
framework
::
BlockDesc
*
root_block_
;
};
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
ddfb9f11
...
...
@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_
=
true
;
tensorrt_workspace_size_
=
workspace_size
;
tensorrt_max_batchsize_
=
max_batch_size
;
Update
();
}
void
contrib
::
AnalysisConfig
::
Update
()
{
...
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
ddfb9f11
...
...
@@ -128,8 +128,8 @@ else()
${
CMAKE_STATIC_LIBRARY_PREFIX
}
glog
${
CMAKE_STATIC_LIBRARY_PREFIX
}
gflags
${
CMAKE_STATIC_LIBRARY_PREFIX
}
protobuf
${
CMAKE_STATIC_LIBRARY_PREFIX
}
snappy
${
CMAKE_STATIC_LIBRARY_PREFIX
}
z
${
CMAKE_STATIC_LIBRARY_PREFIX
}
xxhash
snappystream
${
EXTERNAL_LIB
}
)
# NOTE(dzhwinter) shlwapi is deprecated.
set
(
DEPS
${
DEPS
}
libcmt
shlwapi
)
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
set
(
DEPS
${
DEPS
}
libcmt
${
os_dependency_modules
}
)
endif
(
NOT WIN32
)
if
(
WITH_GPU
)
...
...
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
ddfb9f11
...
...
@@ -141,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add_fuse_pass"
,
//
});
for
(
int
i
=
6
;
i
>=
3
;
i
--
)
{
passes_
.
push_back
(
"transpose_flatten"
+
std
::
to_string
(
i
)
+
"_concat_fuse_pass"
);
}
use_gpu_
=
true
;
}
...
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
浏览文件 @
ddfb9f11
...
...
@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
nvinfer1
::
ILayer
*
layer
=
nullptr
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
VLOG
(
3
)
<<
"Convert a fluid elementwise op to TensorRT IScaleLayer"
;
...
...
@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
0
};
TensorRTEngine
::
Weight
power_weights
{
nvinfer1
::
DataType
::
kFLOAT
,
nullptr
,
0
};
if
(
op_type_
==
"add"
)
{
nvinfer1
::
IScaleLayer
*
scale_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Scale
,
*
X
,
scale_mode
,
shift_weights
.
get
(),
scale_weights
.
get
(),
power_weights
.
get
());
layer
=
scale_layer
;
}
else
if
(
op_type_
==
"mul"
)
{
nvinfer1
::
IScaleLayer
*
scale_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Scale
,
*
X
,
scale_mode
,
scale_weights
.
get
(),
shift_weights
.
get
(),
power_weights
.
get
());
layer
=
scale_layer
;
}
nvinfer1
::
IScaleLayer
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Scale
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
scale_mode
,
shift_weights
.
get
(),
scale_weights
.
get
(),
power_weights
.
get
());
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
layer
->
setName
((
"elementwise_add
(Output: "
+
output_name
+
")"
).
c_str
());
layer
->
setName
(
(
"elementwise_"
+
op_type_
+
"
(Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
engine_
->
weight_map
[
op_desc
.
Input
(
"Y"
).
front
()]
=
std
::
move
(
weight_tensor
);
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
...
...
@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
engine_
->
DeclareOutput
(
output_name
);
}
}
protected:
std
::
string
op_type_
;
};
class
ElementwiseTensorOpConverter
:
public
OpConverter
{
...
...
@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
{
"max"
,
nvinfer1
::
ElementWiseOperation
::
kMAX
},
};
// Weight-operand elementwise converter specialised for addition: it only
// selects the "add" branch of the base converter via op_type_.
class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
 public:
  ElementwiseWeightAddOpConverter() {
    // op_type_ is a member of the base class, so it is set in the body rather
    // than in a member-initializer list.
    op_type_ = "add";
  }
};
// Weight-operand elementwise converter specialised for multiplication: it only
// selects the "mul" branch of the base converter via op_type_.
class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
 public:
  ElementwiseWeightMulOpConverter() {
    // op_type_ is a member of the base class, so it is set in the body rather
    // than in a member-initializer list.
    op_type_ = "mul";
  }
};
class
ElementwiseTensorAddOpConverter
:
public
ElementwiseTensorOpConverter
{
public:
ElementwiseTensorAddOpConverter
()
{
op_type_
=
"add"
;
}
...
...
@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
elementwise_add_weight
,
ElementwiseWeightOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_add_weight
,
ElementwiseWeightAddOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_mul_weight
,
ElementwiseWeightMulOpConverter
);
REGISTER_TRT_OP_CONVERTER
(
elementwise_add_tensor
,
ElementwiseTensorAddOpConverter
);
...
...
paddle/fluid/inference/utils/CMakeLists.txt
浏览文件 @
ddfb9f11
...
...
@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test
(
test_benchmark SRCS benchmark_tester.cc DEPS benchmark
)
cc_binary
(
visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes
)
if
(
WIN32
)
target_link_libraries
(
visualizer shlwapi
)
endif
(
WIN32
)
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
ddfb9f11
...
...
@@ -137,6 +137,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t
algo
;
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
bool
half_float
=
false
;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
...
...
@@ -157,8 +158,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
VLOG
(
5
)
<<
"NOT use cudnn_tensor_op_math"
;
}
#endif
Tensor
cudnn_workspace
;
void
*
cudnn_workspace_ptr
=
nullptr
;
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
...
...
@@ -181,26 +180,21 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_limit
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
...
...
@@ -225,23 +219,17 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
// Allocate on GPU memory
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_in_bytes
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
};
...
...
@@ -365,20 +353,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_limit
=
max_user_size
*
1024
*
1024
;
}
Tensor
cudnn_workspace
;
void
*
cudnn_workspace_ptr
=
nullptr
;
if
((
input_data
||
filter_data
)
&&
exhaustive_search
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_limit
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
...
...
@@ -396,22 +374,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
data_algo
=
data_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
kNUM_CUDNN_BWD_DATA_ALGS
>
data_perf_stat
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
handle
,
cudnn_filter_desc
,
filter_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_grad_data
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
data_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
auto
cudnn_find_bd_data_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
handle
,
cudnn_filter_desc
,
filter_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_grad_data
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
data_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_data_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
...
...
@@ -462,23 +443,25 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
kNUM_CUDNN_BWD_FILTER_ALGS
>
filter_perf_stat
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_grad_data
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
filter_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
auto
cudnn_find_bd_f_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_grad_data
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
filter_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_f_func
,
workspace_size_limit
);
return
filter_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"cuDNN backward filter algo "
<<
filter_algo
;
...
...
@@ -499,16 +482,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
// ------------------- cudnn conv workspace ---------------------
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_in_bytes
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
if
(
input_grad
)
{
...
...
@@ -516,12 +489,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
// ------------------- cudnn conv backward filter ---------------------
...
...
@@ -529,12 +505,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset filter_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
}
}
}
...
...
paddle/fluid/operators/data_norm_op.cc
0 → 100644
浏览文件 @
ddfb9f11
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/data_norm_op.h"
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
DataLayout
=
framework
::
DataLayout
;
template
<
typename
T
>
using
EigenArrayMap
=
Eigen
::
Map
<
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
;
template
<
typename
T
>
using
ConstEigenArrayMap
=
Eigen
::
Map
<
const
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
;
template
<
typename
T
>
using
EigenVectorArrayMap
=
Eigen
::
Map
<
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
1
>>
;
template
<
typename
T
>
using
ConstEigenVectorArrayMap
=
Eigen
::
Map
<
const
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
1
>>
;
// Forward operator for data normalization.
//
// Inputs : X, BatchSize, BatchSum, BatchSquareSum
// Outputs: Y, Means, Scales
class DataNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that all inputs/outputs are present, that X has 2 to 5 dimensions
  // and that each statistic input is a 1-D tensor of length C (the channel
  // dimension chosen by the "data_layout" attribute). Y gets the dims and LoD
  // of X; Means and Scales get shape {C}.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
    PADDLE_ENFORCE(ctx->HasOutput("Means"), "");
    PADDLE_ENFORCE(ctx->HasOutput("Scales"), "");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");

    const auto x_dims = ctx->GetInputDim("X");
    const DataLayout data_layout = framework::StringToDataLayout(
        ctx->Attrs().Get<std::string>("data_layout"));

    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "Input X must have 2 to 5 dimensions.");

    // Channel count: dim 1 for NCHW, the last dim for every other layout.
    const bool channel_first = (data_layout == DataLayout::kNCHW);
    const int64_t C = channel_first ? x_dims[1] : x_dims[x_dims.size() - 1];

    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);

    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);

    ctx->SetOutputDim("Y", x_dims);
    ctx->SetOutputDim("Means", {C});
    ctx->SetOutputDim("Scales", {C});
    ctx->ShareLoD("X", "Y");
  }

 protected:
  // Selects the kernel from the data type of X. The three statistic inputs
  // must be FP64 when X is FP64 and FP32 otherwise. When built with MKLDNN
  // and MKLDNN can be used for this context, the MKLDNN library/layout is
  // requested instead of the plain one.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type = ctx.Input<Tensor>("X")->type();

    // Expected type of BatchSize / BatchSum / BatchSquareSum.
    const auto dn_param_type =
        (input_data_type == framework::proto::VarType::FP64)
            ? framework::proto::VarType::FP64
            : framework::proto::VarType::FP32;

    PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSize")->type(),
                      "BatchSize input should be of float type");
    PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input<Tensor>("BatchSum")->type(),
                      "BatchSum input should be of float type");
    PADDLE_ENFORCE_EQ(dn_param_type,
                      ctx.Input<Tensor>("BatchSquareSum")->type(),
                      "BatchSquareSum input should be of float type");

    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
#endif

    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                   library);
  }
};
// Declares the attributes, inputs, outputs and documentation of the data
// normalization operator. Registration order is kept exactly as before.
class DataNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // AddAttr<bool>("is_test", "").SetDefault(false);

    // Rejects epsilon values outside [0.0, 0.001].
    auto check_epsilon = [](const float &epsilon) {
      PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                     "'epsilon' should be between 0.0 and 0.001.");
    };
    AddAttr<float>("epsilon", "")
        .SetDefault(1e-4)
        .AddCustomChecker(check_epsilon);
    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");

    AddInput("X", "The input tensor");
    AddInput("BatchSize",
             "BatchSize is a 1-dimensional tensor of size C "
             "that is applied to the output");
    AddInput("BatchSum",
             "BatchSum is a 1-dimensional tensor of size C "
             "that is applied to the output");
    AddInput("BatchSquareSum",
             "The global BatchSquareSum (for training) or "
             "estimated BatchSquareSum (for testing)");

    AddOutput("Y", "result after normalization");
    AddOutput("Means",
              "Mean of the history data batch, "
              "will apply to output when training")
        .AsIntermediate();
    AddOutput("Scales",
              "Scales of the history data batch, "
              "will apply to output when training")
        .AsIntermediate();

    AddAttr<bool>("use_mkldnn",
                  "(bool, default false) Only used in mkldnn kernel")
        .SetDefault(false);

    AddComment(R"DOC(
Data Normalization.
Can be used as a normalizer function for data
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
  }
};
// CPU kernel of the data normalization forward pass.
//
// For a 2-D input X with C channels it computes, per channel:
//   Means  = BatchSum / BatchSize
//   Scales = sqrt(BatchSize / BatchSquareSum)
//   Y      = (X - Means) * Scales
// NOTE(review): no epsilon is applied here, so a zero entry in BatchSize or
// BatchSquareSum produces inf/nan -- confirm the callers guarantee non-zero
// statistics.
template <typename T>
class DataNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout data_layout =
        framework::StringToDataLayout(data_layout_str);

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
    const int N = x_dims[0];
    // Channel count: dim 1 for NCHW, the last dim otherwise. With the 2-D
    // input enforced above both expressions select the same dimension.
    const int C =
        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);

    auto *y = ctx.Output<Tensor>("Y");
    auto *mean_out = ctx.Output<Tensor>("Means");
    auto *scales = ctx.Output<Tensor>("Scales");

    // alloc memory
    y->mutable_data<T>(ctx.GetPlace());

    // The previously declared scratch array `inv_std(C)` was never read; it
    // has been removed to avoid a useless allocation on every call.

    ConstEigenVectorArrayMap<T> b_size_arr(
        ctx.Input<Tensor>("BatchSize")->data<T>(), C);
    ConstEigenVectorArrayMap<T> b_sum_arr(
        ctx.Input<Tensor>("BatchSum")->data<T>(), C);
    ConstEigenVectorArrayMap<T> b_square_sum_arr(
        ctx.Input<Tensor>("BatchSquareSum")->data<T>(), C);
    EigenVectorArrayMap<T> means_arr(
        mean_out->mutable_data<T>(ctx.GetPlace()), C);
    EigenVectorArrayMap<T> scales_arr(scales->mutable_data<T>(ctx.GetPlace()),
                                      C);

    means_arr = b_sum_arr / b_size_arr;
    scales_arr = (b_size_arr / b_square_sum_arr).sqrt();

    switch (data_layout) {
      case DataLayout::kNCHW:  // because it's two dimensions, so make no
                               // difference
      case DataLayout::kNHWC: {
        // X and Y are viewed as C x N arrays; colwise() applies the length-C
        // mean/scale vectors to every column.
        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C, N) =
            (ConstEigenArrayMap<T>(x->data<T>(), C, N).colwise() - means_arr)
                .colwise() *
            scales_arr;
        break;
      }
      default:
        PADDLE_THROW("Unknown storage order: %d", data_layout);
    }
  }
};
// Backward operator for data normalization.
//
// Inputs : X, Y@GRAD, BatchSize, BatchSum, BatchSquareSum, Means, Scales
// Outputs: X@GRAD, BatchSize@GRAD, BatchSum@GRAD, BatchSquareSum@GRAD
class DataNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that every input/output is present, then gives X@GRAD the dims of
  // X and each statistic gradient the shape {C}, where C is the channel
  // dimension selected by the "data_layout" attribute.
  void InferShape(framework::InferShapeContext *ctx) const override {
    // check input
    PADDLE_ENFORCE(ctx->HasInput("X"));
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSize"), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSum"), "");
    PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), "");
    PADDLE_ENFORCE(ctx->HasInput("Means"), "");
    PADDLE_ENFORCE(ctx->HasInput("Scales"), "");

    // check output
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), "");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), "");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")),
                   "");

    const auto x_dims = ctx->GetInputDim("X");
    const DataLayout data_layout = framework::StringToDataLayout(
        ctx->Attrs().Get<std::string>("data_layout"));
    // Kept as int64_t (matching DataNormOp::InferShape) so the dimension is
    // not narrowed to int before being written back as an output dim.
    const int64_t C =
        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);

    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
    ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C});
    ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C});
    ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C});
  }

 protected:
  // Verifies that Y@GRAD exists and holds a Tensor or LoDTensor, then selects
  // the kernel from the data type of X. When built with MKLDNN and MKLDNN can
  // be used for this context, the MKLDNN library/layout is requested.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }
    // `t` is only used to confirm the variable holds a supported tensor type.
    const Tensor *t = nullptr;
    if (var->IsType<Tensor>()) {
      t = &var->Get<Tensor>();
    } else if (var->IsType<LoDTensor>()) {
      t = &var->Get<LoDTensor>();
    }
    if (t == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }

    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;

#ifdef PADDLE_WITH_MKLDNN
    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
#endif

    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.GetPlace(), layout, library);
  }
};
// CPU specialization of the data_norm backward kernel.
//
// Reads X, Y@GRAD, Scales and Means and writes X@GRAD plus the three
// statistic outputs BatchSize@GRAD, BatchSum@GRAD and BatchSquareSum@GRAD.
// Input(X) must be 2-D; an exception is raised otherwise, and also for any
// data layout other than NCHW / NHWC.
template <typename T>
class DataNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using ColumnArray = Eigen::Array<T, Eigen::Dynamic, 1>;

    // Forward inputs and the upstream gradient.
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *batch_size = ctx.Input<Tensor>("BatchSize");
    const auto *batch_sum = ctx.Input<Tensor>("BatchSum");
    const auto *batch_square_sum = ctx.Input<Tensor>("BatchSquareSum");
    const auto *scales = ctx.Input<Tensor>("Scales");
    const auto *means = ctx.Input<Tensor>("Means");

    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout data_layout =
        framework::StringToDataLayout(data_layout_str);

    // Only rank-2 input is supported: x_dims[0] is used as N, and C is read
    // from axis 1 (NCHW) or from the last axis (any other layout).
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2");
    const int N = x_dims[0];
    const int last_axis = x_dims.size() - 1;
    const int C = x_dims[data_layout == DataLayout::kNCHW ? 1 : last_axis];

    // Outputs.
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_batch_size = ctx.Output<Tensor>(framework::GradVarName("BatchSize"));
    auto *d_batch_sum = ctx.Output<Tensor>(framework::GradVarName("BatchSum"));
    auto *d_batch_square_sum =
        ctx.Output<Tensor>(framework::GradVarName("BatchSquareSum"));

    // Allocate the length-C statistic outputs and clear them.
    EigenVectorArrayMap<T> d_size_vec(
        d_batch_size->mutable_data<T>(ctx.GetPlace()), C);
    EigenVectorArrayMap<T> d_sum_vec(
        d_batch_sum->mutable_data<T>(ctx.GetPlace()), C);
    EigenVectorArrayMap<T> d_square_sum_vec(
        d_batch_square_sum->mutable_data<T>(ctx.GetPlace()), C);
    d_size_vec.setZero();
    d_sum_vec.setZero();
    d_square_sum_vec.setZero();

    const float epsilon = ctx.Attr<float>("epsilon");

    // The input is two-dimensional, so NCHW and NHWC are handled the same way;
    // anything else is rejected.
    if (data_layout != DataLayout::kNCHW && data_layout != DataLayout::kNHWC) {
      PADDLE_THROW("Unknown storage order: %s", data_layout_str);
    }

    ConstEigenVectorArrayMap<T> scale_vec(scales->data<T>(), C);
    ConstEigenVectorArrayMap<T> mean_vec(means->data<T>(), C);
    // Data viewed as C x N arrays: one column per sample.
    ConstEigenArrayMap<T> x_mat(x->data<T>(), C, N);
    ConstEigenArrayMap<T> dy_mat(d_y->data<T>(), C, N);
    EigenArrayMap<T> dx_mat(d_x->mutable_data<T>(ctx.GetPlace()), C, N);
    dx_mat.setZero();

    // X@GRAD: each sample's upstream gradient scaled element-wise by Scales.
    for (int col = 0; col < N; ++col) {
      dx_mat.col(col) = dy_mat.col(col) * scale_vec;
    }

    // calculate data sum and square sum
    // NOTE(review): these three views are not read below; they are retained
    // because data<T>() presumably validates the tensors — confirm before
    // removing.
    ConstEigenVectorArrayMap<T> size_in_vec(batch_size->data<T>(), C);
    ConstEigenVectorArrayMap<T> sum_in_vec(batch_sum->data<T>(), C);
    ConstEigenVectorArrayMap<T> square_sum_in_vec(batch_square_sum->data<T>(),
                                                  C);

    // Per-channel sum of the samples and sum of squared deviations from Means.
    ColumnArray sample_sum = ColumnArray::Zero(C);
    ColumnArray sample_square_sum = ColumnArray::Zero(C);
    for (int col = 0; col < N; ++col) {
      sample_sum += x_mat.col(col);
      sample_square_sum += (x_mat.col(col) - mean_vec).square();
    }

    // Statistic outputs: sample count, sample sum, and squared-deviation sum
    // plus N * epsilon.
    d_size_vec.setConstant(N);
    d_sum_vec = sample_sum;
    d_square_sum_vec = sample_square_sum + d_size_vec * epsilon;
  }
};
// Builds the OpDesc of data_norm_grad from the forward data_norm op.
//
// The grad op receives X, Y@GRAD, the three running statistics (BatchSize,
// BatchSum, BatchSquareSum) and the forward outputs Scales and Means, copies
// the forward attributes, and produces gradients for X and for the three
// statistics.
class DataNormGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  // Returns the newly created grad op description; ownership passes to the
  // caller.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the descriptor from the start so it is released if any of the
    // setters below throws.
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("data_norm_grad");

    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));

    op->SetInput("BatchSize", Input("BatchSize"));
    op->SetInput("BatchSum", Input("BatchSum"));
    op->SetInput("BatchSquareSum", Input("BatchSquareSum"));
    // Scales and Means are outputs of the forward op that the grad op reads.
    op->SetInput("Scales", Output("Scales"));
    op->SetInput("Means", Output("Means"));

    op->SetAttrMap(Attrs());

    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize"));
    op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum"));
    op->SetOutput(framework::GradVarName("BatchSquareSum"),
                  InputGrad("BatchSquareSum"));

    return op;
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;
// Register the forward operator with its proto maker and grad-op maker.
REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker,
                  ops::DataNormGradMaker);
// Register the backward operator.
REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);

// CPU kernels are registered for float and double, forward and backward.
REGISTER_OP_CPU_KERNEL(
    data_norm, ops::DataNormKernel<paddle::platform::CPUDeviceContext, float>,
    ops::DataNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    data_norm_grad,
    ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/data_norm_op.h
0 → 100644
浏览文件 @
ddfb9f11
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
// Forward kernel of the data_norm operator, parameterized by device context
// and element type. Only the interface is declared here; Compute has no
// definition in this header.
template <typename DeviceContext, typename T>
class DataNormKernel : public framework::OpKernel<T> {
 public:
  // Runs the forward computation for the given execution context.
  void Compute(const framework::ExecutionContext& ctx) const override;
};
// Backward kernel of the data_norm operator, parameterized by device context
// and element type. Only the interface is declared here; Compute has no
// definition in this header.
template <typename DeviceContext, typename T>
class DataNormGradKernel : public framework::OpKernel<T> {
 public:
  // Runs the backward computation for the given execution context.
  void Compute(const framework::ExecutionContext& ctx) const override;
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/jit/benchmark.cc
浏览文件 @
ddfb9f11
...
...
@@ -52,11 +52,11 @@ struct BenchFunc {
for
(
int
i
=
0
;
i
<
FLAGS_burning
;
++
i
)
{
tgt
(
args
...);
}
auto
start
=
paddle
::
platform
::
PosixInNsec
()
/
1e-3
;
auto
start
=
paddle
::
platform
::
PosixInNsec
()
*
1e-3
;
for
(
int
i
=
0
;
i
<
FLAGS_repeat
;
++
i
)
{
tgt
(
args
...);
}
auto
end
=
paddle
::
platform
::
PosixInNsec
()
/
1e-3
;
auto
end
=
paddle
::
platform
::
PosixInNsec
()
*
1e-3
;
return
static_cast
<
double
>
(
end
-
start
)
/
FLAGS_repeat
;
}
};
...
...
paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
0 → 100644
浏览文件 @
ddfb9f11
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// Forward operator of teacher_student_sigmoid_loss.
//
// Validates that X and Label are both 2-D with matching first dimension and
// that Label has a second dimension of 1, then declares Y as [x_dims[0], 1]
// sharing X's LoD.
class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");

    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
    // Ranks and dimensions are signed, so they are compared against signed
    // literals, matching the checks in the gradient operator.
    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                      "Input(Label)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
                      "The 1st dimension of Input(X) and Input(Label) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(label_dims[1], 1,
                      "The 2nd dimension of "
                      "Input(Label) should be 1.");
    ctx->SetOutputDim("Y", {x_dims[0], 1});
    ctx->ShareLoD("X", /*->*/ "Y");
  }

 protected:
  // Explicitly set that the data type of computation kernel of
  // teacher_student_sigmoid_loss
  // is determined by its input "X".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.device_context());
  }
};
// Backward operator of teacher_student_sigmoid_loss.
//
// Validates that X, Label and Y@GRAD are 2-D with matching first dimension,
// that Label and Y@GRAD have a second dimension of 1, then declares X@GRAD
// with the same shape and LoD as X.
class TeacherStudentSigmoidLossGradientOp
    : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should be not null.");

    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
                      "The 1st dimension of Input(X) and Input(Label) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
                      "The 2nd dimension of Input(Y@Grad) should be 1.");
    // This operator declares no soft_label attribute, so the message states
    // the requirement unconditionally (same wording as the forward op).
    PADDLE_ENFORCE_EQ(label_dims[1], 1,
                      "The 2nd dimension of "
                      "Input(Label) should be 1.");
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
    ctx->ShareLoD("X", framework::GradVarName("X"));
  }

 protected:
  // Explicitly set that the data type of computation kernel of
  // teacher_student_sigmoid_loss
  // is determined by its input "X".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.device_context());
  }
};
// Declares the inputs (X, Label), the output (Y), the two clipping attributes
// and the user-facing description of the teacher_student_sigmoid_loss op.
class TeacherStudentSigmoidLossOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // NOTE(review): the description of X mentions "D" although the stated
    // shape is [N x 1], and calls X a softmax probability while the loss
    // formula below treats x as a raw score — confirm the intended wording.
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x 1],"
             " where N is the batch size and D is the output. "
             "This input is a probability computed by the previous operator, "
             "which is almost always the result of a softmax operator.");
    AddInput("Label",
             "(Tensor), the ground truth which is a 2-D tensor. "
             "Label is a Tensor<float> with shape [N x 1]. ");
    AddOutput("Y",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
              "[N x 1]. The teacher student sigmoid loss.");
    // Upper and lower clipping bounds, defaulting to 15.0 and -15.0.
    AddAttr<float>(
        "soft_max_up_bound",
        "fp32, if input > soft_max_up_bound, will be bound, default 15.0")
        .SetDefault(15.0);
    AddAttr<float>(
        "soft_max_lower_bound",
        "fp32, if input < soft_max_lower_bound, will be bound, default -15.0")
        .SetDefault(-15.0);
    AddComment(R"DOC(
TeacherStudentSigmoidLoss Operator.
It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is that
we add another label(z') to original.
loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))
z is click or not
z' is teacher value
label = {-2, -1, [0, 2]}
when z' is not exist, clk = 0 : label = -2;
when z' is not exist, clk = 1 : label = -1;
when z' is exist , clk = 0 : label = 0 + z';
when z' is exist , clk = 1 : label = 1 + z';
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Register the forward operator with its proto maker and the default grad-op
// maker.
REGISTER_OPERATOR(teacher_student_sigmoid_loss,
                  ops::TeacherStudentSigmoidLossOp,
                  ops::TeacherStudentSigmoidLossOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

// Register the backward operator.
REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
                  ops::TeacherStudentSigmoidLossGradientOp);

// CPU kernels are registered for float and double, forward and backward.
REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss,
                       ops::TeacherStudentSigmoidLossOpKernel<float>,
                       ops::TeacherStudentSigmoidLossOpKernel<double>);

REGISTER_OP_CPU_KERNEL(teacher_student_sigmoid_loss_grad,
                       ops::TeacherStudentSigmoidLossGradOpKernel<float>,
                       ops::TeacherStudentSigmoidLossGradOpKernel<double>);
paddle/fluid/operators/teacher_student_sigmoid_loss_op.h
0 → 100644
浏览文件 @
ddfb9f11
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// Forward CPU kernel of teacher_student_sigmoid_loss.
//
// For each of the x->dims()[0] rows, reads one score from X and one encoded
// label from Label and writes one loss value to Y.
template <typename T>
class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    Tensor* y = context.Output<Tensor>("Y");
    const Tensor* x = context.Input<Tensor>("X");
    const Tensor* labels = context.Input<Tensor>("Label");
    T* y_data = y->mutable_data<T>(context.GetPlace());
    const T* x_data = x->data<T>();
    const T* label_data = labels->data<T>();
    const int64_t batch_size = x->dims()[0];
    // loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' +
    // log(1 + exp(-abs(x)))
    // z is click or not
    // z' is value q of feed_fine
    // label = {-2, -1, [0, 2]}
    // when z' is not exist, clk = 0 : label = -2;
    // when z' is not exist, clk = 1 : label = -1;
    // when z' is exist    , clk = 0 : label = 0 + z';
    // when z' is exist    , clk = 1 : label = 1 + z';
    //
    // The index is int64_t to match batch_size; an int index could overflow
    // for very large batches.
    for (int64_t i = 0; i < batch_size; ++i) {
      // Shared sub-expressions, kept in the type the inline expressions had:
      // max(x, 0) and log(1 + exp(-|x|)).
      const auto relu = (x_data[i] > 0 ? x_data[i] : 0.0);
      const auto softplus = log(1.0 + exp(-fabs(x_data[i])));
      if (label_data[i] < -1.0) {
        // No teacher value, z = 0.
        y_data[i] = relu + softplus;
      } else if (label_data[i] < 0.0) {
        // No teacher value, z = 1.
        y_data[i] = relu - x_data[i] + softplus;
      } else if (label_data[i] < 1.0) {
        // Teacher value present, z = 0, z' = label.
        y_data[i] =
            relu + softplus + relu - x_data[i] * label_data[i] + softplus;
      } else {
        // Teacher value present, z = 1, z' = label - 1.
        y_data[i] = relu - x_data[i] + softplus + relu -
                    x_data[i] * (label_data[i] - 1.0) + softplus;
      }
    }
  }
};
// Backward CPU kernel of teacher_student_sigmoid_loss.
//
// For each of the x->dims()[0] rows, clamps the score to
// [soft_max_lower_bound, soft_max_up_bound], evaluates the sigmoid of the
// clamped score and writes X@GRAD from the encoded label and Y@GRAD. Rows
// whose clamped score sits on either bound get a zero gradient.
template <typename T>
class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* x = context.Input<Tensor>("X");
    const T* x_data = x->data<T>();

    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
    T* dx_data = dx->mutable_data<T>(context.GetPlace());

    const Tensor* labels = context.Input<Tensor>("Label");
    const T* label_data = labels->data<T>();

    // The attributes are stored as float; convert once to the kernel type.
    T soft_max_up_bound =
        static_cast<T>(context.Attr<float>("soft_max_up_bound"));
    T soft_max_lower_bound =
        static_cast<T>(context.Attr<float>("soft_max_lower_bound"));

    const int64_t batch_size = x->dims()[0];

    const framework::Tensor* dOut =
        context.Input<framework::Tensor>(framework::GradVarName("Y"));
    const T* dout_data = dOut->data<T>();

    // The index is int64_t to match batch_size; an int index could overflow
    // for very large batches.
    for (int64_t i = 0; i < batch_size; ++i) {
      // Clamp the score into [soft_max_lower_bound, soft_max_up_bound].
      T sum_val = x_data[i];
      if (sum_val > soft_max_up_bound) {
        sum_val = soft_max_up_bound;
      } else if (sum_val < soft_max_lower_bound) {
        sum_val = soft_max_lower_bound;
      }

      T pred = 1.0 / (1.0 + exp(-sum_val));
      if (label_data[i] < -1.0) {
        dx_data[i] = 0.0 - pred;
      } else if (label_data[i] < 0.0) {
        dx_data[i] = 1.0 - pred;
      } else {
        dx_data[i] = label_data[i] - 2.0 * pred;
      }
      // A score on either bound contributes no gradient.
      if (sum_val >= soft_max_up_bound || sum_val <= soft_max_lower_bound) {
        dx_data[i] = 0;
      }
      // Apply the upstream gradient and flip the sign of the value computed
      // above.
      dx_data[i] *= dout_data[i] * -1;
    }
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/platform/cuda_helper_test.cu
浏览文件 @
ddfb9f11
...
...
@@ -15,6 +15,9 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#ifdef _WIN32
#include <numeric>
#endif
#include <random>
#define PADDLE_CUDA_FP16
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
ddfb9f11
...
...
@@ -61,7 +61,7 @@ namespace platform {
* the allocations of temp_allocation_queue:
* - when the Stream calls cudaStreamSynchronize;
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_t
mp
_allocation).
* (defined by FLAGS_limit_of_t
emporary
_allocation).
*
* */
class
DeviceTemporaryAllocator
{
...
...
paddle/fluid/platform/float16.h
浏览文件 @
ddfb9f11
...
...
@@ -59,7 +59,7 @@ limitations under the License. */
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
/*do nothing*/
#define PADDLE_ALIGN(x)
__declspec(align(x))
#endif
namespace
paddle
{
...
...
paddle/fluid/platform/float16_test.cu
浏览文件 @
ddfb9f11
...
...
@@ -271,11 +271,13 @@ TEST(float16, isinf) {
float16
b
=
float16
(
INFINITY
);
// underflow to 0
float16
native_a
(
5e-40
f
);
// overflow to inf
float16
native_b
(
5e40
f
);
EXPECT_EQ
(
std
::
isinf
(
a
),
true
);
EXPECT_EQ
(
std
::
isinf
(
b
),
true
);
#ifndef _WIN32
// overflow to inf
float16
native_b
(
5e40
f
);
EXPECT_EQ
(
std
::
isinf
(
native_b
),
true
);
#endif
EXPECT_EQ
(
native_a
,
float16
(
0
));
}
...
...
paddle/fluid/platform/temporary_allocator.cc
浏览文件 @
ddfb9f11
...
...
@@ -15,15 +15,8 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64
(
limit_of_tmp_allocation
,
-
1
,
"The up limit of temporary_allocation size."
);
DEFINE_double
(
times_excess_than_required_tmp_allocation
,
2
,
"times_excess_than_required_tmp_allocation indicates the "
"max size the TemporaryAllocator can return. For example, "
"if the required memory size is N, and "
"times_excess_than_required_tmp_allocation is 2.0, "
"the TemporaryAllocator will return the available allocation "
"that the range of size is N ~ 2*N."
);
DEFINE_double
(
limit_of_temporary_allocation
,
-
1
,
"The up limit of temporary_allocation size."
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -36,25 +29,24 @@ TemporaryAllocation::TemporaryAllocation(
underlying_allocation_
(
std
::
move
(
underlying_allocation
))
{}
TemporaryAllocator
::
TemporaryAllocator
(
platform
::
Place
place
)
:
place_
(
place
)
{
temp_mem_
map_
.
reset
(
new
std
::
multimap
<
size_t
,
TemporaryAllocation
*>
());
temp_mem_
queue_
.
reset
(
new
std
::
deque
<
TemporaryAllocation
*>
());
}
bool
TemporaryAllocator
::
IsAllocThreadSafe
()
const
{
return
true
;
}
void
TemporaryAllocator
::
Release
(
const
std
::
function
<
void
()
>
&
callback
)
{
std
::
unique_ptr
<
std
::
multimap
<
size_t
,
TemporaryAllocation
*>>
t_allocations
;
std
::
shared_ptr
<
std
::
deque
<
TemporaryAllocation
*>>
t_allocations
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
callback
();
t_allocations
.
swap
(
temp_mem_map_
)
;
temp_mem_
map_
.
reset
(
new
std
::
multimap
<
size_t
,
TemporaryAllocation
*>
());
t_allocations
=
temp_mem_queue_
;
temp_mem_
queue_
.
reset
(
new
std
::
deque
<
TemporaryAllocation
*>
());
wait_delete_mem_
=
0
;
}
for
(
auto
tmp
:
*
t_allocations
)
{
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
tmp
.
second
->
ptr
()
<<
" size: "
<<
tmp
.
second
->
size
();
delete
tmp
.
second
;
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
tmp
->
ptr
()
<<
" size: "
<<
tmp
->
size
();
delete
tmp
;
}
}
...
...
@@ -62,34 +54,28 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto
*
temp_allocation
=
dynamic_cast
<
TemporaryAllocation
*>
(
allocation
);
PADDLE_ENFORCE_NOT_NULL
(
temp_allocation
);
if
(
platform
::
is_gpu_place
(
temp_allocation
->
place
()))
{
PADDLE_ENFORCE
(
platform
::
is_same_place
(
temp_allocation
->
place
(),
place_
),
"The place should be the same."
);
size_t
wait_delete_mem
=
0
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
temp_mem_
map_
->
emplace
(
temp_allocation
->
size
(),
temp_allocation
);
temp_mem_
queue_
->
emplace_back
(
temp_allocation
);
wait_delete_mem_
+=
temp_allocation
->
size
();
wait_delete_mem
=
wait_delete_mem_
;
VLOG
(
10
)
<<
"Move temporary allocation: "
<<
temp_allocation
->
ptr
()
<<
" to delete queue: "
<<
temp_allocation
->
size
()
<<
"; "
<<
"wait_delete_mem: "
<<
wait_delete_mem
;
<<
"wait_delete_mem: "
<<
wait_delete_mem
_
;
}
if
(
FLAGS_limit_of_tmp_allocation
>
0
&&
wait_delete_mem
>
static_cast
<
size_t
>
(
FLAGS_limit_of_tmp_allocation
))
{
PADDLE_ENFORCE
(
callback_
!=
nullptr
,
"The callback is non-initialized."
);
if
(
FLAGS_limit_of_temporary_allocation
>
0
&&
wait_delete_mem
>
FLAGS_limit_of_temporary_allocation
)
{
Release
(
callback_
);
}
return
;
}
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
temp_allocation
->
ptr
()
<<
" size: "
<<
temp_allocation
->
size
();
delete
temp_allocation
;
}
size_t
TemporaryAllocator
::
TemporaryAllocationQueueSize
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
return
temp_mem_
map_
?
temp_mem_map
_
->
size
()
:
0
;
return
temp_mem_
queue_
?
temp_mem_queue
_
->
size
()
:
0
;
}
void
TemporaryAllocator
::
SetCallback
(
const
std
::
function
<
void
()
>
&
callback
)
{
...
...
@@ -98,27 +84,6 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
alloc
::
Allocation
*
TemporaryAllocator
::
AllocateImpl
(
size_t
size
,
alloc
::
Allocator
::
Attr
attr
)
{
{
// Find available allocation in temp_mem_map.
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
if
(
temp_mem_map_
->
size
())
{
auto
it
=
temp_mem_map_
->
lower_bound
(
size
);
// FIXME(zcd): Not sure the best value of excess fraction.
if
(
it
!=
temp_mem_map_
->
end
()
&&
it
->
first
<
static_cast
<
size_t
>
(
size
*
FLAGS_times_excess_than_required_tmp_allocation
))
{
auto
tmp_ptr
=
it
->
second
;
temp_mem_map_
->
erase
(
it
);
wait_delete_mem_
-=
tmp_ptr
->
size
();
VLOG
(
10
)
<<
"Reuse temporary allocation: "
<<
tmp_ptr
->
ptr
()
<<
": "
<<
tmp_ptr
->
size
();
return
tmp_ptr
;
}
}
}
// If not find the the available allocation, get allocation from
// AllocatorFacadeInstance.
auto
raw_allocation
=
alloc
::
AllocatorFacade
::
Instance
().
Alloc
(
place_
,
size
,
attr
);
auto
temp_mem
=
new
TemporaryAllocation
(
std
::
move
(
raw_allocation
));
...
...
paddle/fluid/platform/temporary_allocator.h
浏览文件 @
ddfb9f11
...
...
@@ -15,7 +15,6 @@
#pragma once
#include <condition_variable> // NOLINT
#include <deque>
#include <map>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
...
...
@@ -40,7 +39,7 @@ class TemporaryAllocation : public memory::allocation::Allocation {
*
* There is one opportunity to free the allocations of temp_allocation_queue:
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_t
mp
_allocation).
* (defined by FLAGS_limit_of_t
emporary
_allocation).
*
* */
class
TemporaryAllocator
:
public
memory
::
allocation
::
Allocator
{
...
...
@@ -63,10 +62,11 @@ class TemporaryAllocator : public memory::allocation::Allocator {
private:
platform
::
Place
place_
;
// When the allocation is not held by any variable, it should be placed
// to temp_mem_
map
immediately.
std
::
unique_ptr
<
std
::
multimap
<
size_t
,
TemporaryAllocation
*>>
temp_mem_map_
{
nullptr
};
// to temp_mem_
queue
immediately.
std
::
shared_ptr
<
std
::
deque
<
TemporaryAllocation
*>>
temp_mem_queue_
{
nullptr
};
std
::
mutex
mtx_
;
size_t
wait_delete_mem_
{
0
};
std
::
function
<
void
()
>
callback_
;
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
ddfb9f11
...
...
@@ -18,8 +18,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
DECLARE_int64
(
limit_of_tmp_allocation
);
DECLARE_double
(
times_excess_than_required_tmp_allocation
);
DECLARE_double
(
limit_of_temporary_allocation
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -36,7 +35,7 @@ class DummyOp : public framework::OperatorBase {
const
platform
::
Place
&
place
)
const
override
{}
};
TEST
(
temporary_allocator
,
te
st_base_function
)
{
TEST
(
temporary_allocator
,
te
mporary_allocator
)
{
platform
::
CPUPlace
cpu_place
;
TemporaryAllocator
alloc
(
cpu_place
);
alloc
.
Allocate
(
100
);
...
...
@@ -60,10 +59,10 @@ TEST(temporary_allocator, test_base_function) {
#endif
}
TEST
(
temporary_allocator
,
test_flags_function
)
{
TEST
(
temporary_allocator
,
add_callback
)
{
#ifdef PADDLE_WITH_CUDA
const
int64_t
limit
=
FLAGS_limit_of_tmp
_allocation
;
FLAGS_limit_of_t
mp
_allocation
=
10
;
const
double
limit
=
FLAGS_limit_of_temporary
_allocation
;
FLAGS_limit_of_t
emporary
_allocation
=
10
;
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
...
...
@@ -79,52 +78,7 @@ TEST(temporary_allocator, test_flags_function) {
});
{
gpu_alloc
.
Allocate
(
100
);
}
PADDLE_ENFORCE
(
deleted
);
FLAGS_limit_of_tmp_allocation
=
limit
;
#endif
}
TEST
(
temporary_allocator
,
test_reuse_tmp_allocation
)
{
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
gpu_alloc
.
SetCallback
([]()
{});
void
*
tmp_allocation_ptr1
=
nullptr
;
{
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
auto
tmp_allocation1
=
gpu_alloc
.
Allocate
(
100
);
tmp_allocation_ptr1
=
tmp_allocation1
->
ptr
();
}
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
auto
tmp_allocation2
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr2
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr2
);
auto
tmp_allocation3
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr3
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr3
);
#endif
}
TEST
(
temporary_allocator
,
test_times_excess_than_required_tmp_allocation
)
{
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
gpu_alloc
.
SetCallback
([]()
{});
double
excess_fraction
=
FLAGS_times_excess_than_required_tmp_allocation
;
void
*
tmp_allocation_ptr1
=
nullptr
;
{
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
auto
tmp_allocation1
=
gpu_alloc
.
Allocate
(
static_cast
<
size_t
>
(
100
*
excess_fraction
-
1
));
tmp_allocation_ptr1
=
tmp_allocation1
->
ptr
();
}
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
auto
tmp_allocation2
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr2
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr2
);
FLAGS_limit_of_temporary_allocation
=
limit
;
#endif
}
...
...
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
ddfb9f11
...
...
@@ -22,9 +22,8 @@ if(WITH_PYTHON)
endif
(
NOT APPLE AND NOT ANDROID AND NOT WIN32
)
endif
(
WITH_AMD_GPU
)
if
(
WIN32
)
target_link_libraries
(
paddle_pybind shlwapi
)
endif
(
WIN32
)
get_property
(
os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES
)
target_link_libraries
(
paddle_pybind
${
os_dependency_modules
}
)
cc_test
(
tensor_py_test SRCS tensor_py_test.cc DEPS python
)
endif
(
WITH_PYTHON
)
paddle/fluid/pybind/imperative.cc
浏览文件 @
ddfb9f11
...
...
@@ -26,7 +26,9 @@ void BindTracer(pybind11::module *m) {
[](
imperative
::
Tracer
&
self
,
framework
::
BlockDesc
*
root_block
)
{
new
(
&
self
)
imperative
::
Tracer
(
root_block
);
})
.
def
(
"trace"
,
&
imperative
::
Tracer
::
Trace
);
.
def
(
"trace"
,
&
imperative
::
Tracer
::
Trace
)
.
def
(
"py_trace"
,
&
imperative
::
Tracer
::
PyTrace
,
pybind11
::
return_value_policy
::
take_ownership
);
}
}
// namespace pybind
...
...
paddle/fluid/pybind/imperative.h
浏览文件 @
ddfb9f11
...
...
@@ -22,7 +22,7 @@ limitations under the License. */
namespace
paddle
{
namespace
pybind
{
class
Py
Layer
:
public
imperative
::
Layer
{
class
Layer
:
public
imperative
::
Layer
{
public:
using
imperative
::
Layer
::
Layer
;
// Inherit constructors
...
...
@@ -31,10 +31,6 @@ class PyLayer : public imperative::Layer {
PYBIND11_OVERLOAD
(
std
::
vector
<
imperative
::
VarBase
>
,
Layer
,
Forward
,
inputs
);
// NOLINT
}
void
Backward
()
override
{
PYBIND11_OVERLOAD
(
void
,
Layer
,
Backward
,
);
// NOLINT
}
};
class
PyOpBase
:
public
imperative
::
OpBase
{
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
ddfb9f11
...
...
@@ -161,16 +161,44 @@ PYBIND11_MODULE(core, m) {
self
.
op_desc_
=
op_desc
;
}
},
py
::
return_value_policy
::
reference
)
.
def_property
(
"forward_id"
,
[](
const
imperative
::
OpBase
&
self
)
{
return
self
.
forward_id_
;
},
[](
imperative
::
OpBase
&
self
,
int
forward_id
)
{
self
.
forward_id_
=
forward_id
;
},
py
::
return_value_policy
::
reference
)
.
def_property
(
"backward_id"
,
[](
const
imperative
::
OpBase
&
self
)
{
return
self
.
backward_id_
;
},
[](
imperative
::
OpBase
&
self
,
int
backward_id
)
{
self
.
backward_id_
=
backward_id
;
},
py
::
return_value_policy
::
reference
);
py
::
class_
<
imperative
::
Layer
,
Py
Layer
/* <--- trampoline*/
>
layer
(
m
,
"Layer"
);
py
::
class_
<
imperative
::
Layer
,
Layer
/* <--- trampoline*/
>
layer
(
m
,
"Layer"
);
layer
.
def
(
py
::
init
<>
())
.
def
(
"forward"
,
[](
imperative
::
Layer
&
self
,
const
std
::
vector
<
imperative
::
VarBase
>
&
inputs
)
{
return
self
.
Forward
(
inputs
);
})
.
def
(
"backward"
,
&
imperative
::
Layer
::
Backward
);
.
def
(
"forward"
,
[](
imperative
::
Layer
&
self
,
const
std
::
vector
<
imperative
::
VarBase
>
&
inputs
)
{
return
self
.
Forward
(
inputs
);
});
py
::
class_
<
imperative
::
PyLayer
>
(
m
,
"PyLayer"
)
.
def
(
py
::
init
<>
())
.
def_static
(
"apply"
,
[](
int
func_id
,
const
std
::
vector
<
imperative
::
VarBase
*>
&
inputs
)
->
std
::
vector
<
imperative
::
VarBase
*>
{
return
imperative
::
PyLayer
::
Apply
(
func_id
,
inputs
);
},
py
::
return_value_policy
::
take_ownership
)
.
def_static
(
"register_func"
,
[](
int
func_id
,
const
py
::
object
&
callable
)
{
imperative
::
PyLayer
::
RegisterFunc
(
func_id
,
callable
);
})
.
def_static
(
"num_funcs"
,
&
imperative
::
PyLayer
::
NumFuncs
);
BindTracer
(
&
m
);
py
::
class_
<
Tensor
>
(
m
,
"Tensor"
,
py
::
buffer_protocol
())
...
...
python/paddle/dataset/mnist.py
浏览文件 @
ddfb9f11
...
...
@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators.
from
__future__
import
print_function
import
paddle.dataset.common
import
subprocess
import
gzip
import
numpy
import
platform
import
tempfile
import
struct
from
six.moves
import
range
__all__
=
[
'train'
,
'test'
,
'convert'
]
...
...
@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size):
    """Build a reader over a gzip-compressed MNIST image/label file pair.

    Both files are decompressed fully into memory when the reader is
    iterated. The image file starts with a 16-byte big-endian header
    (magic, image count, rows, cols); the label file starts with an 8-byte
    big-endian header (magic, label count). One unsigned byte per pixel /
    per label follows the header.

    Args:
        image_filename(str): path of the gzip-compressed image file.
        label_filename(str): path of the gzip-compressed label file.
        buffer_size(int): number of samples decoded per batch; must be
            positive.

    Returns:
        callable: a no-argument generator function. Each item it yields is
        ``(image, label)`` where ``image`` is a float32 vector of length
        ``rows * cols`` rescaled from [0, 255] to [-1, 1] and ``label`` is
        a Python int.

    Raises:
        ValueError: if ``buffer_size`` is not positive (a zero batch size
            would otherwise never advance through the file).
    """
    if buffer_size <= 0:
        raise ValueError("buffer_size must be positive, got %r" %
                         (buffer_size, ))

    def reader():
        with gzip.GzipFile(image_filename, 'rb') as image_file:
            img_buf = image_file.read()
        with gzip.GzipFile(label_filename, 'rb') as label_file:
            lab_buf = label_file.read()

        # Headers are big-endian unsigned 32-bit integers.
        # image file : 16B
        img_header = '>IIII'
        _, _, rows, cols = struct.unpack_from(img_header, img_buf, 0)
        offset_img = struct.calcsize(img_header)

        # label file : 8B
        lab_header = '>II'
        _, label_num = struct.unpack_from(lab_header, lab_buf, 0)
        offset_lab = struct.calcsize(lab_header)

        pixels = rows * cols
        step_label = 0
        while step_label < label_num:
            # The last batch may be shorter than buffer_size; decoding a
            # full batch there would read past the end of the buffers.
            batch = min(buffer_size, label_num - step_label)

            labels = numpy.frombuffer(
                lab_buf, dtype='ubyte', count=batch, offset=offset_lab)
            offset_lab += batch

            images = numpy.frombuffer(
                img_buf,
                dtype='ubyte',
                count=batch * pixels,
                offset=offset_img).reshape((batch, pixels)).astype('float32')
            offset_img += batch * pixels

            # Rescale pixel values from [0, 255] to [-1, 1].
            images = images / 255.0 * 2.0 - 1.0

            for i in range(batch):
                yield images[i, :], int(labels[i])

            step_label += batch

    return reader
...
...
python/paddle/fluid/__init__.py
浏览文件 @
ddfb9f11
...
...
@@ -155,8 +155,7 @@ def __bootstrap__():
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
,
'sync_nccl_allreduce'
,
'limit_of_tmp_allocation'
,
'times_excess_than_required_tmp_allocation'
'sync_nccl_allreduce'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
python/paddle/fluid/imperative/layers.py
浏览文件 @
ddfb9f11
...
...
@@ -20,10 +20,12 @@ from paddle.fluid import core
from
paddle.fluid
import
framework
from
paddle.fluid.imperative
import
base
__all__
=
[
'PyLayer'
]
__all__
=
[
'
Layer'
,
'
PyLayer'
]
class
PyLayer
(
core
.
Layer
):
class
Layer
(
core
.
Layer
):
"""Layers composed of operators."""
def
__init__
(
self
,
dtype
=
core
.
VarDesc
.
VarType
.
FP32
,
name
=
None
):
self
.
_once_built
=
False
self
.
_dtype
=
dtype
...
...
@@ -37,8 +39,56 @@ class PyLayer(core.Layer):
self
.
_once_built
=
True
outputs
=
self
.
forward
(
*
inputs
)
return
outputs
def forward(self, *inputs):
    """Placeholder forward pass: always raises NotImplementedError.

    Args:
        *inputs: positional inputs; ignored by this implementation.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError()
def backward(self, *inputs):
    """Reject the call: this implementation always raises ValueError.

    Args:
        *inputs: positional inputs; ignored by this implementation.

    Raises:
        ValueError: unconditionally.
    """
    message = "Layer shouldn't implement backward"
    raise ValueError(message)
class PyLayer(core.PyLayer):
    """Layers composed of user-defined python codes."""

    def __init__(self):
        super(PyLayer, self).__init__()

    @staticmethod
    def forward(inputs):
        """Placeholder forward function: always raises NotImplementedError."""
        raise NotImplementedError()

    @staticmethod
    def backward(douts):
        """Placeholder backward function: always raises NotImplementedError."""
        raise NotImplementedError()

    @classmethod
    def __call__(cls, inputs):
        """Trace one invocation of this layer and wrap its outputs.

        Args:
            inputs: iterable of objects exposing an ``_ivar`` attribute.

        Returns:
            list: one ``framework.Variable`` per value returned by
            ``tracer.py_trace``.
        """
        tracer = framework._imperative_tracer()
        block = framework.default_main_program().current_block()
        ivar_inputs = [var._ivar for var in inputs]

        # Register forward/backward the first time the class is called.
        # Each id is taken as num_funcs() + 1 immediately before its own
        # registration, so the two calls must stay interleaved like this.
        # NOTE(review): hasattr() also sees ids inherited from a base class
        # that was already called -- confirm that sharing is intended.
        if not hasattr(cls, 'forward_id'):
            cls.forward_id = core.PyLayer.num_funcs() + 1
            PyLayer.register_func(cls.forward_id, cls.forward)
            cls.backward_id = core.PyLayer.num_funcs() + 1
            PyLayer.register_func(cls.backward_id, cls.backward)

        op = core.OpBase()
        op.forward_id = cls.forward_id
        op.backward_id = cls.backward_id
        block.ops.append(op)

        def _as_variable(ivar):
            # Shape and dtype are read from the tensor held by the ivar.
            tensor = ivar.value.get_tensor()
            return framework.Variable(
                block,
                type=core.VarDesc.VarType.LOD_TENSOR,
                name=None,
                shape=tensor.shape(),
                dtype=tensor._dtype(),
                ivar=ivar)

        traced_ivars = tracer.py_trace(op, ivar_inputs, False)
        return [_as_variable(ivar) for ivar in traced_ivars]
python/paddle/fluid/imperative/nn.py
浏览文件 @
ddfb9f11
...
...
@@ -30,7 +30,7 @@ __all__ = [
]
class
Conv2D
(
layers
.
Py
Layer
):
class
Conv2D
(
layers
.
Layer
):
def
__init__
(
self
,
num_channels
,
num_filters
,
...
...
@@ -143,7 +143,7 @@ class Conv2D(layers.PyLayer):
return
self
.
_helper
.
append_activation
(
pre_act
)
class
Pool2D
(
layers
.
Py
Layer
):
class
Pool2D
(
layers
.
Layer
):
def
__init__
(
self
,
pool_size
=-
1
,
pool_type
=
"max"
,
...
...
@@ -205,7 +205,7 @@ class Pool2D(layers.PyLayer):
return
pool_out
class
FC
(
layers
.
Py
Layer
):
class
FC
(
layers
.
Layer
):
def
__init__
(
self
,
size
,
param_attr
=
None
,
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
ddfb9f11
...
...
@@ -58,6 +58,7 @@ __all__ = [
'adaptive_pool2d'
,
'adaptive_pool3d'
,
'batch_norm'
,
'data_norm'
,
'beam_search_decode'
,
'conv2d_transpose'
,
'conv3d_transpose'
,
...
...
@@ -180,6 +181,7 @@ __all__ = [
'lstm'
,
'py_func'
,
'psroi_pool'
,
'teacher_student_sigmoid_loss'
,
'huber_loss'
,
]
...
...
@@ -2896,6 +2898,133 @@ def batch_norm(input,
return
helper
.
append_activation
(
batch_norm_out
)
def data_norm(input,
              act=None,
              epsilon=1e-05,
              param_attr=None,
              data_layout='NCHW',
              in_place=False,
              use_mkldnn=False,
              name=None,
              moving_mean_name=None,
              moving_variance_name=None,
              do_model_average_for_mean_and_var=False):
    """
    **Data Normalization Layer**

    Can be used as a normalizer function for conv2d and fully_connected operations.
    The required data format for this layer is one of the following:

    1. NHWC `[batch, in_height, in_width, in_channels]`

    2. NCHW `[batch, in_channels, in_height, in_width]`

    :math:`input` is the input features over a mini-batch.

    ..  math::

        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ mini-batch\\ mean \\\\
        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\mu_{\\beta})^2 \\qquad &//\\ mini-batch\\ variance \\\\
        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\\ normalize \\\\
        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\\ scale\\ and\\ shift

    Args:
        input(variable): The input variable which is a LoDTensor.
        act(string, Default None): Activation type, linear|relu|prelu|...
        epsilon(float, Default 1e-05): Passed to the `data_norm` op as its
            `epsilon` attribute.
        param_attr(dict, Default None): Only consulted when it is a dict.
            The keys `batch_size`, `batch_sum` and `batch_square` then
            override the initial values of the three statistics parameters
            (defaults 1e4, 0.0 and 1e4 respectively).
        data_layout(string, default NCHW): NCHW|NHWC. Selects which axis of
            `input.shape` gives the channel count (axis 1 for NCHW, the
            last axis for NHWC); any other value raises ValueError.
        in_place(bool, Default False): If True, `input` itself is used as
            the output variable instead of creating a new one.
        use_mkldnn(bool, Default false): Passed to the `data_norm` op as
            its `use_mkldnn` attribute.
        name(string, Default None): Prefix of the created parameter names
            (`<name>.batch_size`, `<name>.batch_sum`,
            `<name>.batch_square_sum`). With the default None the string
            concatenation below raises TypeError, so a name must be given.
        moving_mean_name(string, Default None): Not referenced in this
            function body beyond being forwarded to LayerHelper.
        moving_variance_name(string, Default None): Not referenced in this
            function body beyond being forwarded to LayerHelper.
        do_model_average_for_mean_and_var(bool, Default False): Not
            referenced in this function body beyond being forwarded to
            LayerHelper.

    Returns:
        Variable: A tensor variable which is the result after applying data normalization on the input.

    Raises:
        ValueError: if `data_layout` is neither 'NCHW' nor 'NHWC'.

    Examples:

        .. code-block:: python

            hidden1 = fluid.layers.data(
                name='x', shape=[2, 2], append_batch_size=False)
            hidden2 = fluid.layers.data_norm(name='data_norm', input=hidden1)
    """
    # NOTE(review): the formulas above mention gamma/beta, but this function
    # creates no such parameters; it feeds BatchSize / BatchSum /
    # BatchSquareSum to the op. The math may have been inherited from
    # batch_norm -- confirm against the data_norm op implementation.

    # locals() must be taken before any other local is bound, so that only
    # the call arguments are forwarded to LayerHelper.
    helper = LayerHelper('data_norm', **locals())
    dtype = helper.input_dtype()

    input_shape = input.shape
    if data_layout == 'NCHW':
        channel_num = input_shape[1]
    else:
        if data_layout == 'NHWC':
            channel_num = input_shape[-1]
        else:
            raise ValueError("unsupported data layout:" + data_layout)

    # One statistic value per channel.
    param_shape = [channel_num]

    batch_size_default = 1e4
    batch_sum_default = 0.0
    batch_square_sum_default = 1e4

    if param_attr and isinstance(param_attr, dict):
        batch_size_default = param_attr.get("batch_size", 1e4)
        batch_sum_default = param_attr.get("batch_sum", 0.0)
        # NOTE(review): the key is "batch_square", not "batch_square_sum";
        # verify against callers before renaming.
        batch_square_sum_default = param_attr.get("batch_square", 1e4)

    # create parameter
    # NOTE(review): `name` defaults to None, in which case `name + '...'`
    # raises TypeError -- confirm whether an auto-generated name is intended.
    batch_size = helper.create_parameter(
        attr=ParamAttr(
            name=name + '.batch_size',
            initializer=Constant(value=float(batch_size_default)),
            trainable=True),
        shape=param_shape,
        dtype=input.dtype)

    batch_sum = helper.create_parameter(
        attr=ParamAttr(
            name=name + '.batch_sum',
            initializer=Constant(value=float(batch_sum_default)),
            trainable=True),
        shape=param_shape,
        dtype=input.dtype)

    batch_square_sum = helper.create_parameter(
        attr=ParamAttr(
            name=name + '.batch_square_sum',
            initializer=Constant(value=float(batch_square_sum_default)),
            trainable=True),
        shape=param_shape,
        dtype=input.dtype)

    # Auxiliary op outputs; created with stop_gradient=True.
    means = helper.create_variable(dtype=dtype, stop_gradient=True)
    scales = helper.create_variable(dtype=dtype, stop_gradient=True)

    # Reuse the input variable as the output when in_place is set.
    data_norm_out = input if in_place else helper.create_variable(dtype=dtype)

    helper.append_op(
        type="data_norm",
        inputs={
            "X": input,
            "BatchSize": batch_size,
            "BatchSum": batch_sum,
            "BatchSquareSum": batch_square_sum
        },
        outputs={"Y": data_norm_out,
                 "Means": means,
                 "Scales": scales},
        attrs={"epsilon": epsilon,
               "use_mkldnn": use_mkldnn})

    return helper.append_activation(data_norm_out)
@
templatedoc
()
def
layer_norm
(
input
,
scale
=
True
,
...
...
@@ -3064,9 +3193,9 @@ def group_norm(input,
inputs
[
'Bias'
]
=
bias
# create output
mean_out
=
helper
.
create_
tmp_
variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
variance_out
=
helper
.
create_
tmp_
variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
group_norm_out
=
helper
.
create_
tmp_
variable
(
dtype
)
mean_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
variance_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
group_norm_out
=
helper
.
create_variable
(
dtype
)
helper
.
append_op
(
type
=
"group_norm"
,
...
...
@@ -9264,6 +9393,47 @@ def log_loss(input, label, epsilon=1e-4, name=None):
return
loss
def teacher_student_sigmoid_loss(input,
                                 label,
                                 soft_max_up_bound=15.0,
                                 soft_max_lower_bound=-15.0):
    """
    **Teacher Student Log Loss Layer**

    This layer accepts input predictions and target label and returns the
    teacher_student loss.

    .. math::
        loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x)))

    Args:
        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
                                batch size. This input is a probability computed
                                by the previous operator.
        label (Variable|list):  the ground truth which is a 2-D tensor with
                                shape [N x 1], where N is the batch size.
        soft_max_up_bound  (float):  if input > soft_max_up_bound, will be bound
        soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound

    Returns:
        Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss.

    Examples:
        .. code-block:: python

          cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
    """
    # locals() is captured first so that only the call arguments reach
    # LayerHelper.
    helper = LayerHelper('teacher_student_sigmoid_loss', **locals())
    out = helper.create_variable(dtype=input.dtype)

    op_inputs = {'X': [input], 'Label': [label]}
    op_outputs = {'Y': [out]}
    # Both bounds are coerced to float before being handed to the op.
    op_attrs = {
        "soft_max_lower_bound": float(soft_max_lower_bound),
        "soft_max_up_bound": float(soft_max_up_bound)
    }

    helper.append_op(
        type='teacher_student_sigmoid_loss',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
    return out
def
add_position_encoding
(
input
,
alpha
,
beta
,
name
=
None
):
"""
**Add Position Encoding Layer**
...
...
python/paddle/fluid/tests/unittests/test_imperative.py
浏览文件 @
ddfb9f11
...
...
@@ -15,6 +15,7 @@
import
contextlib
import
unittest
import
numpy
as
np
import
sys
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
...
...
@@ -22,7 +23,7 @@ from paddle.fluid.imperative.nn import FC
from
test_imperative_base
import
new_program_scope
class
MyLayer
(
fluid
.
imperative
.
Py
Layer
):
class
MyLayer
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
MyLayer
,
self
).
__init__
()
...
...
@@ -34,7 +35,35 @@ class MyLayer(fluid.imperative.PyLayer):
return
[
x
]
class
MLP
(
fluid
.
imperative
.
PyLayer
):
class MyPyLayer(fluid.imperative.PyLayer):
    """PyLayer whose forward is tanh and whose backward is tanh's gradient."""

    def __init__(self):
        super(MyPyLayer, self).__init__()

    @staticmethod
    def forward(inputs):
        """Apply tanh to ``inputs[0]`` and return it as a 1-tuple of LoDTensor."""
        log = sys.stderr.write
        log('before forward\n')
        ret = np.tanh(inputs[0])
        log('after forward: %s\n' % ret)

        tensor = core.LoDTensor()
        tensor.set(ret, core.CPUPlace())
        return (tensor, )

    @staticmethod
    def backward(inputs):
        """Return ``dout * (1 - out ** 2)`` as a 1-tuple of LoDTensor.

        ``inputs`` is unpacked as ``(inp, out, dout)``.
        """
        log = sys.stderr.write
        log('calling into backward: %s\n' % str(inputs))

        inp, out, dout = (np.array(item) for item in inputs)
        log('calling into backward: %s, %s, %s\n' % (inp, out, dout))

        # Derivative of tanh expressed through its output.
        ret = dout * (1 - np.square(out))

        tensor = core.LoDTensor()
        tensor.set(ret, core.CPUPlace())
        return (tensor, )
class
MLP
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
):
super
(
MLP
,
self
).
__init__
()
self
.
_fc1
=
FC
(
3
,
...
...
@@ -56,9 +85,77 @@ class TestImperative(unittest.TestCase):
with
fluid
.
imperative
.
guard
():
cl
=
core
.
Layer
()
cl
.
forward
([])
l
=
fluid
.
imperative
.
Py
Layer
()
l
=
fluid
.
imperative
.
Layer
()
self
.
assertRaises
(
NotImplementedError
,
l
.
forward
,
[])
def test_pylayer_func_id(self):
    """Check the function ids assigned to two distinct PyLayer subclasses.

    The first call of each subclass is expected to claim two consecutive
    ids (forward, then backward); calling a subclass again must not change
    its ids.
    """

    def ones_input():
        # A fresh single-element input list for every call.
        return [fluid.imperative.base.to_variable(np.ones([2, 2]))]

    with fluid.imperative.guard():

        class PyLayer1(fluid.imperative.PyLayer):
            def __init__(self):
                super(PyLayer1, self).__init__()

            @staticmethod
            def forward(inputs):
                return inputs

            @staticmethod
            def backward(inputs):
                return inputs

        class PyLayer2(fluid.imperative.PyLayer):
            def __init__(self):
                super(PyLayer2, self).__init__()

            @staticmethod
            def forward(inputs):
                return inputs

            @staticmethod
            def backward(inputs):
                return inputs

        py_layer_1 = PyLayer1()
        py_layer_2 = PyLayer2()
        py_layer_1(ones_input())
        py_layer_2(ones_input())

        base_id = py_layer_1.forward_id
        self.assertGreater(base_id, 0)
        self.assertEqual(py_layer_1.backward_id, base_id + 1)
        self.assertEqual(py_layer_2.forward_id, base_id + 2)
        self.assertEqual(py_layer_2.backward_id, base_id + 3)

        # A second call must reuse the id registered on the first call.
        py_layer_1(ones_input())
        self.assertEqual(py_layer_1.forward_id, base_id)
def test_pylayer(self):
    """Compare MyPyLayer in imperative mode with tanh in a static program.

    Both the summed output and the gradient w.r.t. the input must agree.
    """
    np_inp = np.ones([2, 2], np.float32)

    # Imperative run.
    with fluid.imperative.guard():
        layer = MyPyLayer()
        var_inp = fluid.imperative.base.to_variable(np_inp)
        first_out = layer([var_inp])[0]
        dy_out = np.sum(first_out._numpy())
        first_out._backward()
        dy_grad = var_inp._gradient()

    # Static-graph reference.
    with new_program_scope():
        inp = fluid.layers.data(
            name="inp", shape=[2, 2], append_batch_size=False)
        # TODO(panyx0718): Paddle doesn't diff against data `inp`.
        x1 = inp * 1
        # TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
        x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
        grads = fluid.backward.append_backward(x, parameter_list=[x1.name])
        param_grads = grads[0]
        exe = fluid.Executor(fluid.CPUPlace())

        fetches = [x.name, param_grads[1].name]
        static_out, static_grad = exe.run(feed={inp.name: np_inp},
                                          fetch_list=fetches)

    self.assertTrue(np.allclose(dy_out, static_out))
    self.assertTrue(np.allclose(dy_grad, static_grad))
def
test_layer_in_out
(
self
):
np_inp
=
np
.
array
([
1.0
,
2.0
,
-
1.0
],
dtype
=
np
.
float32
)
with
fluid
.
imperative
.
guard
():
...
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
浏览文件 @
ddfb9f11
...
...
@@ -26,7 +26,7 @@ from paddle.fluid.imperative.base import to_variable
from
test_imperative_base
import
new_program_scope
class
SimpleImgConvPool
(
fluid
.
imperative
.
Py
Layer
):
class
SimpleImgConvPool
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
num_channels
,
num_filters
,
...
...
@@ -72,7 +72,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer):
return
x
class
MNIST
(
fluid
.
imperative
.
Py
Layer
):
class
MNIST
(
fluid
.
imperative
.
Layer
):
def
__init__
(
self
,
param_attr
=
None
,
bias_attr
=
None
):
super
(
MNIST
,
self
).
__init__
()
...
...
python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py
0 → 100644
浏览文件 @
ddfb9f11
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
from
math
import
log
from
math
import
exp
from
op_test
import
OpTest
from
scipy.special
import
logit
from
scipy.special
import
expit
import
unittest
class TestTeacherStudentSigmoidLossOp(OpTest):
    """
    Test teacher_student_sigmoid_loss against a pure-Python reference.

    Inputs are logits of uniform(0, 1) samples; labels are float32 samples
    drawn uniformly from [0, 2), both of shape (batch_size, 1).
    """

    def setUp(self):
        self.op_type = "teacher_student_sigmoid_loss"
        batch_size = 16
        num_classes = 1
        shape = (batch_size, num_classes)

        # X is drawn before Label, keeping the random stream order stable.
        logits = logit(np.random.uniform(0, 1, shape).astype("float32"))
        labels = np.random.uniform(0, 2, shape).astype("float32")
        self.inputs = {'X': logits, 'Label': labels}

        outs = []
        for x, label in zip(self.inputs["X"], self.inputs["Label"]):
            hinge = max(x, 0.0)
            softplus = log(1.0 + exp(-abs(x)))
            # The reference formula is selected by the label's range.
            if label < -1.0:
                loss = hinge + softplus
            elif label < 0.0:
                loss = hinge - x + softplus
            elif label < 1.0:
                loss = hinge + softplus + hinge - x * label + softplus
            else:
                loss = (hinge - x + softplus + hinge - x * (label - 1.0) +
                        softplus)
            outs.append(loss)
        self.outputs = {'Y': np.array(outs)}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Y", numeric_grad_delta=0.005)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录