Commit 770e2a18 authored by tangwei12

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into Pdv

...@@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
...@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
### Install Latest Stable Release:
```
# Linux CPU
...@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.0.1.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.0.1.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
......
...@@ -261,6 +261,13 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} mklml)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
# remove link to python, see notes at:
# https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
if("${cc_library_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_library_DEPS python)
add_dependencies(${TARGET_NAME} python)
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
...@@ -311,6 +318,8 @@ function(cc_test TARGET_NAME)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
endif()
endfunction(cc_test)
...@@ -629,6 +638,8 @@ function(py_test TARGET_NAME)
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
endif()
endfunction()
......
...@@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
...@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
...@@ -116,6 +116,7 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var
paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
......
...@@ -49,6 +49,8 @@ struct VarHandleBase {
void AddOutput(OpHandleBase* out, ir::Node* node) {
if (pending_ops_.find(out) == pending_ops_.end()) {
PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
this->Node()->Name());
pending_ops_.insert(out);
node_->outputs.push_back(node);
}
......
...@@ -10,7 +10,7 @@ function(pass_library TARGET DEST)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
# add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
message(STATUS "add pass ${TARGET} ${DEST}")
...@@ -25,13 +25,11 @@ cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference)
if (WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference)
endif ()
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference)
...@@ -39,6 +37,13 @@ pass_library(embedding_fc_lstm_fuse_pass inference)
pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(conv_bias_mkldnn_fuse_pass inference)
pass_library(conv_relu_mkldnn_fuse_pass inference)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
endif()
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
...@@ -54,4 +59,5 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if (WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()
...@@ -262,7 +262,7 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unordered_set<std::string> specified_vars({"data_lod_attention",
"cell_init", "hidden_init",
"data", "week", "minute"});
size_t count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsVar() && specified_vars.count(node->Name())) {
++count;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
template <typename BinaryOperation>
LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
BinaryOperation f) {
PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
LoDTensor vec_y;
vec_y.Resize(vec_a.dims());
const float* a = vec_a.data<float>();
const float* b = vec_b.data<float>();
float* y = vec_y.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < vec_a.numel(); i++) {
y[i] = f(a[i], b[i]);
}
return vec_y;
}
std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* conv_input =
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
conv_bias_pattern(conv_input);
int found_conv_bias_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvBias fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_bias_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op
// bias
GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern);
// output
GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern);
// elementwise_add op
GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
PADDLE_ENFORCE(subgraph.count(conv_input));
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
VLOG(3) << "do not perform conv+bias fuse";
return;
}
auto* eltwise_bias_tensor =
scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
auto input_names = conv->Op()->InputNames();
bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
input_names.end();
if (has_bias && conv->Op()->Input("Bias").size() > 0) {
auto conv_bias_names = conv->Op()->Input("Bias");
// add eltwise bias to existing conv bias
PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
*conv_bias_tensor = tensor_apply_eltwise(
*conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
conv->Op()->SetOutput("Output",
std::vector<std::string>({eltwise_out->Name()}));
GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out});
IR_NODE_LINK_TO(conv, eltwise_out);
} else {
// take eltwise bias as conv bias
OpDesc desc;
desc.SetInput(
"Input", std::vector<std::string>({subgraph.at(conv_input)->Name()}));
desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
}
auto conv_bias_node = g->CreateOpNode(&desc);
IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node);
IR_NODE_LINK_TO(conv_weight, conv_bias_node);
IR_NODE_LINK_TO(eltwise_bias, conv_bias_node);
IR_NODE_LINK_TO(conv_bias_node, eltwise_out);
GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out});
}
found_conv_bias_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_bias_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
paddle::framework::ir::ConvBiasFusePass);
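
The pass registered above is looked up by its name and applied to an in-memory `ir::Graph`, in the same way the pass testers later in this commit drive their passes. Below is a minimal usage sketch, not part of the commit: the helper name `RunConvBiasFuse` is hypothetical, and it assumes the graph was built from a `ProgramDesc` and already carries the parameter scope that `param_scope()` expects.

```
// Minimal usage sketch (hypothetical helper, not part of this commit).
#include <memory>
#include <utility>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

// Assumes `graph` already has the parameter scope attribute set, since
// ConvBiasFusePass::ApplyImpl above calls param_scope().
std::unique_ptr<Graph> RunConvBiasFuse(std::unique_ptr<Graph> graph) {
  // Fetch the pass by the name it was registered under.
  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
  // Apply consumes the graph and returns the fused graph; the number of
  // fused patterns is recorded on it via AddStatis.
  return pass->Apply(std::move(graph));
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

// Pulls in the pass registration at link time, as the testers in this commit do.
USE_PASS(conv_bias_mkldnn_fuse_pass);
```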
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the Conv and Elementwise_add to a ConvBiasOp.
*/
class ConvBiasFusePass : public FusePassBase {
public:
virtual ~ConvBiasFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"conv_bias_mkldnn_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -126,12 +126,21 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
// conv, batch_norm,
// conv_weight, conv_out,
// bn_scale, bn_bias, bn_mean, bn_variance,
// bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,
// bn_saved_variance
GET_CONV_BN_NODES(conv_bn_pattern);
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+bn fuse";
return;
}
// Create eltwise_y (conv bias) variable
VarDesc eltwise_y_in_desc(
patterns::PDNodeName(name_scope_, "eltwise_y_in"));
eltwise_y_in_desc.SetPersistable(true);
auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc);
auto* eltwise_y_in_tensor =
scope->Var(eltwise_y_in_node->Name())->GetMutable<LoDTensor>();
...@@ -151,27 +160,59 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
*bn_mean, *bn_variance, eltwise_y_in_tensor,
epsilon);
// with MKL-DNN fuse conv+bn into conv with bias
// without MKL-DNN fuse conv+bn into conv+elementwise_add
if (fuse_option == FUSE_MKLDNN) {
auto input_names = conv->Op()->InputNames();
bool has_bias = std::find(input_names.begin(), input_names.end(),
"Bias") != input_names.end();
if (has_bias && conv->Op()->Input("Bias").size() > 0) {
// reuse existing conv bias node
auto conv_bias_names = conv->Op()->Input("Bias");
PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
eltwise_y_in_tensor->dims());
auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
} else {
// add new conv_bias node
conv->Op()->SetInput(
"Bias", std::vector<std::string>({eltwise_y_in_node->Name()}));
IR_NODE_LINK_TO(eltwise_y_in_node, conv);
}
conv->Op()->SetOutput("Output",
std::vector<std::string>({bn_out->Name()}));
GraphSafeRemoveNodes(
graph.get(),
{conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
IR_NODE_LINK_TO(conv, bn_out);
found_conv_bn_count++;
} else {  // fuse_option == FUSE_NATIVE
// create an elementwise add node.
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({conv_out->Name()}));
desc.SetInput("Y", std::vector<std::string>({eltwise_y_in_node->Name()}));
desc.SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
desc.SetType("elementwise_add");
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc);  // OpDesc will be copied.
GraphSafeRemoveNodes(
graph.get(),
{bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
bn_variance_out, bn_saved_mean, bn_saved_variance});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
IR_NODE_LINK_TO(eltwise_op, bn_out);
found_conv_bn_count++;
}
};
gpd(graph.get(), handler);
...@@ -237,7 +278,6 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
{bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
PADDLE_ENFORCE(subgraph.count(conv_input));
IR_NODE_LINK_TO(eltwise, bn_out);
found_conv_bn_count++;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include <functional>
#include <utility>
#include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle {
namespace framework {
namespace ir {
namespace {
// The function keeps the graph consistent by replacing
// a node 'from' in the set of input nodes
// of the visited node with a node 'to'.
void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
for (auto& node : GraphTraits::DFS(*graph)) {
auto from_in_inputs =
std::find(std::begin(node.inputs), std::end(node.inputs), from);
if (from_in_inputs != std::end(node.inputs)) {
IR_NODE_LINK_TO(to, (&node));
auto inputs = node.Op()->Inputs();
using input_type = VariableNameMap::value_type;
std::for_each(std::begin(inputs), std::end(inputs),
[from, to, &node](const input_type& i) -> void {
auto param_names = i.second;
auto pi = std::find(std::begin(param_names),
std::end(param_names), from->Name());
if (pi != std::end(param_names)) {
node.Op()->SetInput(i.first, {to->Name()});
}
});
}
}
}
} // namespace
using graph_ptr = std::unique_ptr<ir::Graph>;
graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
FusePassBase::Init(name_scope_, graph.get());
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_pattern{pattern, name_scope_};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
elementwise_add_pattern(conv_output);
conv_output->AsIntermediate();
auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> {
auto bias_input_names = conv_op.Op()->Inputs();
auto bias_it = bias_input_names.find("Bias");
if (bias_it != std::end(bias_input_names)) {
bool has_bias = !bias_it->second.empty();
if (has_bias) {
auto conv_bias_names = bias_it->second;
auto conv_bias_names_it =
std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
[&conv_bias_names](Node* n) -> bool {
return n->Name() == conv_bias_names[0];
});
return std::make_pair(has_bias, *conv_bias_names_it);
}
}
return std::make_pair(false, nullptr);
};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
OpDesc op_desc;
op_desc.SetType("conv2d");
op_desc.SetInput("Input", {conv_input->Name()});
op_desc.SetInput("Filter", {conv_filter->Name()});
op_desc.SetInput("ResidualData", {elementwise_add_x->Name()});
op_desc.SetOutput("Output", {conv_output->Name()});
bool has_bias;
Node* conv_bias;
std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op);
if (has_bias) {
op_desc.SetInput("Bias", {conv_bias->Name()});
}
for (const auto& attr : conv_op->Op()->GetAttrMap()) {
op_desc.SetAttr(attr.first, attr.second);
}
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = g->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(conv_input, fused_conv_op);
IR_NODE_LINK_TO(conv_filter, fused_conv_op);
IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, conv_output);
if (has_bias) {
IR_NODE_LINK_TO(conv_bias, fused_conv_op);
}
CorrectGraphEdges(g, elementwise_add_out, conv_output);
GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
public:
virtual ~ConvElementwiseAddMKLDNNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"residual_connections_fuse_pass"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle {
namespace framework {
namespace ir {
namespace {
constexpr int nodes_removed = 3;
constexpr int nodes_added = 1;
void SetOp(ProgramDesc* prog, const std::string& type,
const std::vector<std::pair<std::string, std::string>>& inputs,
const std::pair<std::string, std::string>& output) {
auto op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", true);
for (const auto& input : inputs) {
op->SetInput(input.first, {input.second});
}
op->SetOutput(output.first, {output.second});
}
struct IsReachable {
using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
const std::string& name) -> Node* {
for (auto& node : GraphTraits::DFS(*graph)) {
if (name == node.Name()) {
return &node;
}
}
return nullptr;
};
return [&](std::string from, const std::string to) -> bool {
if (from == to) return true;
std::map<std::string, bool> visited;
for (auto& node : GraphTraits::DFS(*graph)) {
visited[node.Name()] = false;
}
visited[from] = true;
std::list<std::string> queue;
queue.push_back(from);
while (!queue.empty()) {
auto cur = find_node(graph, queue.front());
queue.pop_front();
if (cur == nullptr) return false;
for (auto n : cur->outputs) {
if (n->Name() == to) return true;
if (!visited[n->Name()]) {
visited[n->Name()] = true;
queue.push_back(n->Name());
}
}
}
return false;
};
}
};
void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
int conv_count = 0;
int elementwise_add_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") {
++conv_count;
}
if (node->IsOp() && node->Op()->Type() == "elementwise_add") {
++elementwise_add_count;
}
}
EXPECT_EQ(conv_count, 1);
EXPECT_EQ(elementwise_add_count, 0);
}
ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
const std::vector<std::string>& persistent_vars) {
ProgramDesc prog;
auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* {
auto var = prog.MutableBlock(0)->Var(var_name);
var->SetType(proto::VarType::LOD_TENSOR);
return var;
};
for (const auto& v : transient_vars) {
add_var_to_prog(v);
}
for (const auto& v : persistent_vars) {
auto var = add_var_to_prog(v);
var->SetPersistable(true);
}
return prog;
}
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
}
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu"));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
}
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "d"));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_FALSE(is_reachable(graph)("a", "d"));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
}
TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "f"));
auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "f"));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num);
AssertOpsCount(graph);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_elementwise_add_mkldnn_fuse_pass);
...@@ -46,6 +46,12 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
FuseOptions fuse_option = FindFuseOption(*conv, *relu);
if (fuse_option == DO_NOT_FUSE) {
VLOG(3) << "do not perform conv+relu fuse";
return;
}
// Transform Conv node into ConvReLU node.
OpDesc* desc = conv->Op();
desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
......
...@@ -20,17 +20,19 @@ namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "conv2d") {
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
} else if (type == "relu") {
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetInput("X", inputs);
}
op->SetOutput("Out", outputs);
...@@ -43,7 +45,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
"h", "weights2", "bias2", "k", "l"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias") {
...@@ -51,14 +54,24 @@ ProgramDesc BuildProgramDesc() {
}
}
SetOp(&prog, "OP0", std::vector<std::string>({"a"}), SetOp(&prog, "OP0", "op0", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"})); std::vector<std::string>({"b"}));
SetOp(&prog, "OP1", std::vector<std::string>({"b"}), SetOp(&prog, "OP1", "op1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"})); std::vector<std::string>({"c"}));
SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}), // conv+relu, both with MKL-DNN
std::vector<std::string>({"f"})); SetOp(&prog, "conv2d", "conv1",
SetOp(&prog, "relu", std::vector<std::string>({"f"}), std::vector<std::string>({"c", "weights", "bias"}),
std::vector<std::string>({"g"})); std::vector<std::string>({"f"}), true);
SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
std::vector<std::string>({"g"}), true);
SetOp(&prog, "OP3", "op3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}));
// conv+relu, only one with MKL-DNN
SetOp(&prog, "conv2d", "conv2",
std::vector<std::string>({"h", "weights2", "bias2"}),
std::vector<std::string>({"k"}), true);
SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
std::vector<std::string>({"l"}));
return prog; return prog;
} }
...@@ -88,10 +101,16 @@ TEST(ConvReLUFusePass, basic) {
auto* op = node->Op();
ASSERT_TRUE(op->HasAttr("use_mkldnn"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
ASSERT_TRUE(op->HasAttr("fuse_relu")); // check if only "conv1" convolution is fused
bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu")); auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (fuse_relu) { if (op_name == "conv1") {
++conv_relu_count; ASSERT_TRUE(op->HasAttr("fuse_relu"));
bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
if (fuse_relu) {
++conv_relu_count;
}
} else if (op_name == "conv2") {
ASSERT_FALSE(op->HasAttr("fuse_relu"));
} }
} }
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
void FusePassBase::Init(const std::string& repr, Graph* graph) const {
repr_ = repr;
graph_ = graph;
}
Scope* FusePassBase::param_scope() const {
PADDLE_ENFORCE(graph_->Has(kParamScopeAttr));
return graph_->Get<framework::Scope*>(kParamScopeAttr);
}
void FusePassBase::AddStatis(int count_of_fused) const {
PADDLE_ENFORCE(graph_);
PADDLE_ENFORCE(!repr_.empty());
if (!graph_->Has(kFuseStatisAttr)) {
graph_->Set(kFuseStatisAttr, new std::unordered_map<std::string, int>);
}
auto& info =
graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
info[repr_] = count_of_fused;
}
FuseOptions FusePassBase::FindFuseOption(const Node& node1,
const Node& node2) const {
#ifdef PADDLE_WITH_MKLDNN
bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") &&
boost::get<bool>(node1.Op()->GetAttr("use_mkldnn"));
bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") &&
boost::get<bool>(node2.Op()->GetAttr("use_mkldnn"));
if (node1_mkldnn && node2_mkldnn)
return FUSE_MKLDNN;
else if (!node1_mkldnn && !node2_mkldnn)
return FUSE_NATIVE;
else
return DO_NOT_FUSE;
#else
return FUSE_NATIVE;
#endif
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -25,32 +25,24 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done
FUSE_NATIVE, // fusing will be done without MKL-DNN
FUSE_MKLDNN // fusing will be done with MKL-DNN
};
class FusePassBase : public Pass {
public:
void Init(const std::string& repr, Graph* graph) const;
Scope* param_scope() const;
void AddStatis(int count_of_fused) const;
virtual ~FusePassBase() {}
protected:
virtual FuseOptions FindFuseOption(const Node& node1,
const Node& node2) const;
mutable Graph* graph_;
mutable std::string repr_;
};
......
...@@ -200,15 +200,15 @@ TEST(GraphHelperTest, GraphNum) {
Graph g(prog);
BuildZeroGraph(&g);
ASSERT_EQ(GraphNum(g), 0UL);
Graph g2(prog);
BuildOneGraph(&g2);
ASSERT_EQ(GraphNum(g2), 1UL);
Graph g3(prog);
BuildTwoGraphs(&g3);
ASSERT_EQ(GraphNum(g3), 2UL);
}
} // namespace ir
......
...@@ -259,6 +259,8 @@ GraphPatternDetector::DetectPatterns() {
return result;
}
// TODO(Superjomn) enhance the function as it marks unique subgraphs as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
...@@ -759,6 +761,51 @@ PDNode *patterns::ConvReLU::operator()(
return relu_out_var;
}
PDNode *patterns::SeqConvEltAddRelu::operator()(
paddle::framework::ir::PDNode *seqconv_input) {
// Create Operators
seqconv_input->assert_is_op_input("sequence_conv", "X");
auto *seqconv_op = pattern->NewNode(seqconv_repr())
->assert_is_op("sequence_conv")
->assert_op_attr<bool>("paddingTrainable", false)
->assert_op_attr<int>("contextStride", 1);
auto *eltadd_op =
pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add");
auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
// Create variables
// Filter
auto *seqconv_weight_var =
pattern->NewNode(seqconv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("sequence_conv", "Filter");
// Bias
auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr())
->AsInput()
->assert_is_op_input("elementwise_add");
// intermediate variable, will be removed in the IR after fuse.
auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("sequence_conv")
->assert_is_op_input("elementwise_add");
auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("elementwise_add")
->assert_is_only_input_of_op("relu");
// output
auto *relu_out_var = pattern->NewNode(relu_out_repr())
->AsOutput()
->assert_is_op_output("relu");
seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var})
.LinksTo({seqconv_out_var});
eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var})
.LinksTo({eltadd_out_var});
relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var});
return relu_out_var;
}
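
The pattern above only names and links the nodes; the fuse pass that consumes it (seqconv_eltadd_relu_fuse_pass, added to the CMake list earlier in this commit) is not shown in this excerpt. The sketch below, modeled on ConvBiasFusePass::ApplyImpl above, illustrates how such a pattern is typically driven through a GraphPatternDetector; the function name and node-name strings are illustrative only.

```
// Illustrative sketch only; not the real seqconv_eltadd_relu_fuse_pass.
#include <memory>
#include <utility>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

std::unique_ptr<Graph> SeqConvEltAddReluSketch(std::unique_ptr<Graph> graph) {
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  // Entry point of the pattern: the input of sequence_conv.
  auto* seqconv_input = pattern->NewNode("seqconv_eltadd_relu/x")
                            ->AsInput()
                            ->assert_is_op_input("sequence_conv", "X");
  patterns::SeqConvEltAddRelu seqconv_pattern(pattern, "seqconv_eltadd_relu");
  seqconv_pattern(seqconv_input);

  int found_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    // Retrieve the nodes declared by PATTERN_DECL_NODE in the pattern.
    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, seqconv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, seqconv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, seqconv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, seqconv_pattern);
    // A real pass would create the fused operator here, relink its inputs
    // and outputs, and remove the replaced nodes with GraphSafeRemoveNodes.
    ++found_count;
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```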
PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
bool with_bias) {
// Create shared nodes.
...@@ -964,6 +1011,79 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
return ele_add_grad;
}
PDNode *patterns::ConvBias::operator()(
paddle::framework::ir::PDNode *conv_input) {
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
auto *eltiwse_op =
pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
// Create variables
// Filter
auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
// intermediate variable, will be removed in the IR after fuse.
auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d")
->assert_is_op_input("elementwise_add");
// Bias stored in elementwise_add
auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("elementwise_add", "Y");
// output
auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add");
conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var})
.LinksTo({eltwise_out_var});
return eltwise_out_var;
}
PDNode *patterns::Conv::operator()() {
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
auto input_var = pattern->NewNode(conv_input_repr())
->AsInput()
->assert_is_op_input("conv2d", "Input");
auto filter_var = pattern->NewNode(conv_filter_repr())
->AsInput()
->assert_is_op_input("conv2d", "Filter");
auto output_var = pattern->NewNode(conv_output_repr())
->AsOutput()
->assert_is_op_output("conv2d", "Output");
conv_op->LinksFrom({input_var, filter_var});
conv_op->LinksTo({output_var});
return output_var;
}
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add");
x_var->assert_is_op_input("elementwise_add", "X");
auto y_var = pattern->NewNode(elementwise_add_x_repr())
->AsInput()
->assert_is_op_input("elementwise_add", "Y");
auto out_var = pattern->NewNode(elementwise_add_out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add", "Out");
elementwise_add_op->LinksFrom({x_var, y_var});
elementwise_add_op->LinksTo({out_var});
return out_var;
}
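// A sketch of how the Conv and ElementwiseAdd patterns are intended to be
// combined by a residual-connection fuse pass (for example the
// conv_elementwise_add_mkldnn_fuse_pass listed in the analyzer): the
// convolution output is chained into elementwise_add as its X input, the
// other summand carries the residual data, and after fusion the add
// disappears because conv2d consumes the residual through its ResidualData
// input with fuse_residual_connection set to true. Hypothetical usage inside
// such a pass:
//
//   patterns::Conv conv_pattern(pattern, name_scope);
//   auto *conv_output = conv_pattern();
//   patterns::ElementwiseAdd add_pattern(pattern, name_scope);
//   add_pattern(conv_output);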
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -128,6 +128,15 @@ struct PDNode { ...@@ -128,6 +128,15 @@ struct PDNode {
const std::unordered_set<std::string>& op_types, const std::unordered_set<std::string>& op_types,
const std::string& argument, int nth); const std::string& argument, int nth);
template <typename T>
PDNode* assert_op_attr(const std::string& attr_name, const T& attr) {
asserts_.emplace_back([=](Node* x) {
return x && x->IsOp() && x->Op()->HasAttr(attr_name) &&
boost::get<T>(x->Op()->GetAttr(attr_name)) == attr;
});
return this;
}
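// Usage example (a minimal sketch, reusing the call sites in this patch):
// a pattern node can be restricted to ops whose attribute equals a given
// value, as the sequence_conv node in SeqConvEltAddRelu does:
//
//   pattern->NewNode(seqconv_repr())
//       ->assert_is_op("sequence_conv")
//       ->assert_op_attr<bool>("paddingTrainable", false)
//       ->assert_op_attr<int>("contextStride", 1);
//
// Ops that lack the attribute, or carry a different value, will not match.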
private: private:
PDNode(PDPattern* pattern, const std::string& name = "", PDNode(PDPattern* pattern, const std::string& name = "",
Type type = Type::kVar) Type type = Type::kVar)
...@@ -434,6 +443,31 @@ struct ConvReLU : public PatternBase { ...@@ -434,6 +443,31 @@ struct ConvReLU : public PatternBase {
PATTERN_DECL_NODE(relu_out); PATTERN_DECL_NODE(relu_out);
}; };
// SEQCONV with Elementwise_Add ReLU
// op: seqconv + elementwise_add + relu
// named nodes:
// seqconv_input, seqconv_weight,
// seqconv_out, seqconv,
// elementwise_add_bias, elementwise_add_out, elementwise_add
// relu_out, relu
struct SeqConvEltAddRelu : public PatternBase {
SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {}
PDNode* operator()(PDNode* seqconv_input);
// declare operator node's name
PATTERN_DECL_NODE(seqconv);
PATTERN_DECL_NODE(eltadd);
PATTERN_DECL_NODE(relu);
// declare variable node's name
PATTERN_DECL_NODE(seqconv_weight);
PATTERN_DECL_NODE(seqconv_out);
PATTERN_DECL_NODE(eltadd_bias);
PATTERN_DECL_NODE(eltadd_out);
PATTERN_DECL_NODE(relu_out);
};
// FC with bias // FC with bias
// op: mul + elementwise_add // op: mul + elementwise_add
// named nodes: // named nodes:
...@@ -578,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { ...@@ -578,6 +612,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(d_ele_y);
PATTERN_DECL_NODE(ele_y); PATTERN_DECL_NODE(ele_y);
}; };
// Conv with Elementwise_add as bias
// op: conv + elementwise_add
// named nodes:
// conv_input, conv_weight,
// conv_out, conv,
// eltwise_bias, eltwise_out,
// elementwise_add
struct ConvBias : public PatternBase {
ConvBias(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_bias") {}
PDNode* operator()(PDNode* conv_input);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(eltwise);
// declare variable node's name
PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(eltwise_bias);
PATTERN_DECL_NODE(eltwise_out);
};
// Convolution op
// Forward pass for convolution.
// conv_input and conv_filter are inputs of the pattern (the optional conv
// bias belongs to the conv2d op itself and is not a pattern node).
// conv_output is the result of the operator.
// conv_residual_data is the data used by the skip connection.
// If residual connection fusion is on, the formula is:
// conv_output = conv_op(conv_filter, conv_input, conv_bias)
// + conv_residual_data
// If the fusion is off, conv_residual_data is not added.
struct Conv : public PatternBase {
Conv(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "convolution") {}
PDNode* operator()();
PATTERN_DECL_NODE(conv_op);
PATTERN_DECL_NODE(conv_input);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(conv_residual_data);
PATTERN_DECL_NODE(conv_output);
};
// ElementwiseAdd used in residual connections.
// x_var (the "X" input) is expected to be the convolution output;
// y_var is the other summand.
// The operator is removed when residual connection fusion is on.
struct ElementwiseAdd : public PatternBase {
ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add") {}
PDNode* operator()(PDNode* x_var);
PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_x);
PATTERN_DECL_NODE(elementwise_add_y);
PATTERN_DECL_NODE(elementwise_add_out);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
...@@ -124,7 +124,7 @@ TEST(GraphTest, Basic) { ...@@ -124,7 +124,7 @@ TEST(GraphTest, Basic) {
ASSERT_EQ(n->outputs.size(), 0UL); ASSERT_EQ(n->outputs.size(), 0UL);
} }
} }
ASSERT_EQ(nodes.size(), 5); ASSERT_EQ(nodes.size(), 5UL);
} }
TEST(GraphTest, WriteAfterRead) { TEST(GraphTest, WriteAfterRead) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Applies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mkldnn_placement_pass,
paddle::framework::ir::MKLDNNPlacementPass);
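// A minimal sketch of how this pass is picked up: the analyzer pushes
// "mkldnn_placement_pass" in front of the IR pass list when use_mkldnn_ is
// set (see Analyzer::Run in this patch), and a registered pass is normally
// retrieved from the pass registry by name, e.g. (assuming the usual
// ir::PassRegistry interface):
//
//   auto pass = ir::PassRegistry::Instance().Get("mkldnn_placement_pass");
//   graph = pass->Apply(std::move(graph));
//
// The pass then flips use_mkldnn to true on every op that declares the
// attribute.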
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class MKLDNNPlacementPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X"))
->assert_is_op_input("sequence_conv")
->assert_var_not_persistable();
patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope);
fuse_pattern(x);
// Create New OpDesc
auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight,
Node* eltadd_bias, Node* relu_out) {
OpDesc op_desc;
op_desc.SetType("fusion_seqconv_eltadd_relu");
op_desc.SetInput("X", {input->Name()});
op_desc.SetInput("Filter", {seqconv_weight->Name()});
op_desc.SetInput("Bias", {eltadd_bias->Name()});
op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength"));
op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart"));
op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride"));
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
const std::string ColMat = patterns::UniqueKey("SeqConvColMat");
op_desc.SetOutput("ColMat", {ColMat});
op_desc.SetOutput("Out", {relu_out->Name()});
scope->Var(ColMat)->GetMutable<LoDTensor>();
auto* op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(input, op);
IR_NODE_LINK_TO(seqconv_weight, op);
IR_NODE_LINK_TO(eltadd_bias, op);
IR_NODE_LINK_TO(op, relu_out);
return op;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle SeqConv EltAdd Relu fuse";
GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern);
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern);
fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias,
relu_out);
std::unordered_set<const Node*> marked_nodes(
{seqconv, seqconv_out, eltadd, eltadd_out, relu});
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> SeqConvEltAddReluFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope());
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
paddle::framework::ir::SeqConvEltAddReluFusePass);
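// Sketch of the rewrite performed by this pass (node names as in
// patterns::SeqConvEltAddRelu):
//
//   before:  X -> sequence_conv(Filter) -> elementwise_add(Bias) -> relu -> Out
//   after:   X -> fusion_seqconv_eltadd_relu(Filter, Bias) -> Out
//            (plus a ColMat workspace output created in the scope)
//
// contextLength / contextStart / contextStride are copied verbatim from the
// original sequence_conv op, and the intermediate seqconv_out / eltadd_out
// variables together with the three original ops are removed from the graph.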
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class SeqConvEltAddReluFusePass : public FusePassBase {
public:
virtual ~SeqConvEltAddReluFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -85,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -85,10 +85,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
VLOG(3) << "input " << in << " is not LodTensor"; VLOG(3) << "input " << in << " is not LodTensor";
return; return;
} }
PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j,
out);
out_var->SetLoDLevel(in_var->GetLoDLevel()); out_var->SetLoDLevel(in_var->GetLoDLevel());
} }
...@@ -519,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const { ...@@ -519,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
} }
void OpDesc::InferVarType(BlockDesc *block) const { void OpDesc::InferVarType(BlockDesc *block) const {
// There are a few places that var type can be set.
// When VarDesc is created, default set to LOD_TENSOR.
// When output variable is created, default is defaut set to LOD_TENSOR.
// We limit here to be the only place that operator defines its customized
// var type inference. Hence, we don't do any "default" setting here.
auto &info = OpInfoMap::Instance().Get(this->Type()); auto &info = OpInfoMap::Instance().Get(this->Type());
if (info.infer_var_type_) { if (info.infer_var_type_) {
info.infer_var_type_(*this, block); info.infer_var_type_(*this, block);
} else {
// all output type is LoDTensor by default
VLOG(10) << this->Type()
<< " has not registered InferVarType. Set output variables to "
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(proto::VarType::LOD_TENSOR);
}
}
} }
} }
......
...@@ -156,12 +156,10 @@ ParallelExecutor::ParallelExecutor( ...@@ -156,12 +156,10 @@ ParallelExecutor::ParallelExecutor(
params, member_->local_scopes_, member_->use_cuda_); params, member_->local_scopes_, member_->use_cuda_);
#endif #endif
if (VLOG_IS_ON(5)) { // If the loss_var_name is given, the number of graph should be only one.
// If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) {
if (loss_var_name.size()) { PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, "The number of graph should be only one");
"The number of graph should be only one");
}
} }
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
...@@ -299,6 +297,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -299,6 +297,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
} }
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
const auto dev_ctxs =
platform::DeviceContextPool::Instance().GetAllDeviceContexts();
for (auto &dev_ctx : dev_ctxs) {
dev_ctx->Wait();
}
if (member_->own_local_scope_) { if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
Scope *local_scope = member_->local_scopes_[i]; Scope *local_scope = member_->local_scopes_[i];
......
...@@ -126,7 +126,7 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { ...@@ -126,7 +126,7 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
std::vector<std::string> feed_target_names; std::vector<std::string> feed_target_names;
for (auto *op : global_block.AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == kFeedOpType) {
int col = boost::get<int>(op->GetAttr("col")); size_t col = boost::get<int>(op->GetAttr("col"));
if (col >= feed_target_names.size()) { if (col >= feed_target_names.size()) {
feed_target_names.resize(col + 1); feed_target_names.resize(col + 1);
} }
...@@ -143,7 +143,7 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { ...@@ -143,7 +143,7 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
std::vector<std::string> fetch_target_names; std::vector<std::string> fetch_target_names;
for (auto *op : global_block.AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) { if (op->Type() == kFetchOpType) {
int col = boost::get<int>(op->GetAttr("col")); size_t col = boost::get<int>(op->GetAttr("col"));
if (col >= fetch_target_names.size()) { if (col >= fetch_target_names.size()) {
fetch_target_names.resize(col + 1); fetch_target_names.resize(col + 1);
} }
......
...@@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) { ...@@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) {
ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
found_sub_block = true; found_sub_block = true;
ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size()); ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size());
found_sub_blocks = true; found_sub_blocks = true;
} }
} }
......
...@@ -39,8 +39,8 @@ TEST(READER, decorate_chain) { ...@@ -39,8 +39,8 @@ TEST(READER, decorate_chain) {
{ {
auto endpoints = root->GetEndPoints(); auto endpoints = root->GetEndPoints();
ASSERT_EQ(endpoints.size(), 2U); ASSERT_EQ(endpoints.size(), 2U);
ASSERT_NE(endpoints.count(end_point1.get()), 0); ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
ASSERT_NE(endpoints.count(end_point2.get()), 0); ASSERT_NE(endpoints.count(end_point2.get()), 0UL);
} }
{ {
......
...@@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) { ...@@ -91,7 +91,7 @@ TEST(SelectedRows, SparseTable) {
ASSERT_TRUE(table.HasKey(10)); ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8)); ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6)); ASSERT_TRUE(table.HasKey(6));
ASSERT_EQ(table.rows().size(), 3); ASSERT_EQ(table.rows().size(), 3UL);
framework::Tensor ids; framework::Tensor ids;
ids.Resize(framework::make_ddim({4})); ids.Resize(framework::make_ddim({4}));
......
...@@ -101,7 +101,13 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } ...@@ -101,7 +101,13 @@ Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
std::vector<std::string> passes; std::vector<std::string> passes;
for (auto& pass : all_ir_passes_) { #ifdef PADDLE_WITH_MKLDNN
if (use_mkldnn_) {
VLOG(3) << "Adding MKL-DNN placement pass";
passes.push_back("mkldnn_placement_pass");
}
#endif
for (auto& pass : ir_passes_) {
if (!disabled_ir_passes_.count(pass)) { if (!disabled_ir_passes_.count(pass)) {
passes.push_back(pass); passes.push_back(pass);
passes.push_back("graph_viz_pass"); // add graphviz for debug. passes.push_back("graph_viz_pass"); // add graphviz for debug.
...@@ -117,11 +123,26 @@ void Analyzer::Run(Argument* argument) { ...@@ -117,11 +123,26 @@ void Analyzer::Run(Argument* argument) {
} }
} }
Analyzer& Analyzer::IncludeAllIrPasses() {
ir_passes_ = all_ir_passes_;
return *this;
}
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) { Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
disabled_ir_passes_.insert(passes.begin(), passes.end()); disabled_ir_passes_.insert(passes.begin(), passes.end());
return *this; return *this;
} }
Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
ir_passes_ = passes;
return *this;
}
Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
use_mkldnn_ = use_mkldnn;
return *this;
}
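// A minimal usage sketch of the extended analyzer interface (mirroring how
// AnalysisPredictor::OptimizeInferenceProgram drives it in this patch):
//
//   // kExclude: run everything except the listed passes.
//   Analyzer().IncludeAllIrPasses()
//             .SetUseMkldnn(use_mkldnn)
//             .DisableIrPasses(disabled_passes)
//             .Run(&argument);
//
//   // kInclude: run only the listed passes.
//   Analyzer().SetUseMkldnn(use_mkldnn)
//             .IncludeIrPasses({"infer_clean_graph_pass",
//                               "seqconv_eltadd_relu_fuse_pass"})
//             .Run(&argument);
//
// When PADDLE_WITH_MKLDNN is enabled, SetUseMkldnn(true) additionally
// prepends mkldnn_placement_pass to whichever list is run.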
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -54,6 +54,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes); Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
Analyzer& IncludeAllIrPasses();
Analyzer& SetUseMkldnn(bool use_mkldnn);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
...@@ -64,23 +67,29 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -64,23 +67,29 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion. // larger fusion.
const std::vector<std::string> all_ir_passes_{{ const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here. // Manual update the passes here.
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"embedding_fc_lstm_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
"fc_lstm_fuse_pass", // "embedding_fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", // "fc_lstm_fuse_pass", //
"fc_gru_fuse_pass", // "mul_lstm_fuse_pass", //
"mul_gru_fuse_pass", // "fc_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", // "mul_gru_fuse_pass", //
"fc_fuse_pass", // "seq_concat_fc_fuse_pass", //
"conv_bn_fuse_pass", // "fc_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", //
#endif #endif
}}; }};
std::unordered_set<std::string> disabled_ir_passes_; std::unordered_set<std::string> disabled_ir_passes_;
// Ir passes to run
std::vector<std::string> ir_passes_;
bool use_mkldnn_{false};
}; };
} // namespace analysis } // namespace analysis
......
...@@ -77,10 +77,6 @@ bool AnalysisPredictor::Init( ...@@ -77,10 +77,6 @@ bool AnalysisPredictor::Init(
inference_program_ = program; inference_program_ = program;
} }
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->Prepare(scope_.get(), *inference_program_, 0, executor_->Prepare(scope_.get(), *inference_program_, 0,
config_.use_feed_fetch_ops); config_.use_feed_fetch_ops);
...@@ -225,10 +221,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -225,10 +221,24 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE(
config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude, switch (config_.ir_mode) {
"Only kExclude is supported yet."); case contrib::AnalysisConfig::IrPassMode::kExclude:
Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); Analyzer()
.IncludeAllIrPasses()
.SetUseMkldnn(config_._use_mkldnn)
.DisableIrPasses(config_.ir_passes)
.Run(&argument_);
break;
case contrib::AnalysisConfig::IrPassMode::kInclude:
Analyzer()
.SetUseMkldnn(config_._use_mkldnn)
.IncludeIrPasses(config_.ir_passes)
.Run(&argument_);
break;
default:
LOG(ERROR) << "Only kExclude and kInclude modes are supported for now.";
}
CHECK(argument_.transformed_program_desc); CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor"; VLOG(5) << "to prepare executor";
......
...@@ -205,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -205,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float))); EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); EXPECT_NEAR(ref_data[i], data[i], 2e-3);
} }
}); });
} }
......
...@@ -259,10 +259,17 @@ struct AnalysisConfig : public NativeConfig { ...@@ -259,10 +259,17 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`. kExclude // Specify the disabled passes in `ir_passes`.
}; };
void SetIncludeMode() {
ir_mode = IrPassMode::kInclude;
// this pass has to be run at the beginning of all fuse passes
ir_passes = {"infer_clean_graph_pass"};
}
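// A hypothetical configuration sketch: a user who wants to opt in to a small
// set of fusions switches to kInclude mode and appends the passes to run
// after the mandatory clean pass, e.g.
//
//   contrib::AnalysisConfig cfg;
//   cfg.SetIncludeMode();
//   cfg.ir_passes.push_back("seqconv_eltadd_relu_fuse_pass");
//   cfg._use_mkldnn = true;  // optional, requires PADDLE_WITH_MKLDNN
//
// In kExclude mode (the default) ir_passes instead lists passes to skip.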
// Determine whether to perform graph optimization. // Determine whether to perform graph optimization.
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run. // Manually determine the IR passes to run.
IrPassMode ir_mode{IrPassMode::kExclude}; IrPassMode ir_mode{IrPassMode::kExclude};
// passes to be excluded/included
std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"}; std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
// NOT stable yet. // NOT stable yet.
......
...@@ -52,9 +52,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -52,9 +52,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_resnet50, profile) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg._use_mkldnn = use_mkldnn;
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -69,6 +70,11 @@ TEST(Analyzer_resnet50, profile) { ...@@ -69,6 +70,11 @@ TEST(Analyzer_resnet50, profile) {
} }
} }
TEST(Analyzer_resnet50, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); }
#endif
// Check the fuse status // Check the fuse status
TEST(Analyzer_resnet50, fuse_statis) { TEST(Analyzer_resnet50, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
...@@ -82,15 +88,21 @@ TEST(Analyzer_resnet50, fuse_statis) { ...@@ -82,15 +88,21 @@ TEST(Analyzer_resnet50, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_resnet50, compare) { void compare(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg._use_mkldnn = use_mkldnn;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(cfg, input_slots_all);
} }
TEST(Analyzer_resnet50, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -18,12 +18,12 @@ namespace paddle { ...@@ -18,12 +18,12 @@ namespace paddle {
namespace inference { namespace inference {
using namespace framework; // NOLINT using namespace framework; // NOLINT
static std::vector<float> result_data;
struct DataRecord { struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all; std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<size_t> lod; std::vector<size_t> lod;
std::vector<std::vector<float>> rnn_link_data; std::vector<std::vector<float>> rnn_link_data;
std::vector<float> result_data;
size_t num_samples; // total number of samples size_t num_samples; // total number of samples
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
...@@ -57,6 +57,7 @@ struct DataRecord { ...@@ -57,6 +57,7 @@ struct DataRecord {
std::ifstream file(path); std::ifstream file(path);
std::string line; std::string line;
int num_lines = 0; int num_lines = 0;
result_data.clear();
while (std::getline(file, line)) { while (std::getline(file, line)) {
num_lines++; num_lines++;
std::vector<std::string> data; std::vector<std::string> data;
...@@ -135,13 +136,12 @@ TEST(Analyzer_rnn2, profile) { ...@@ -135,13 +136,12 @@ TEST(Analyzer_rnn2, profile) {
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PADDLE_ENFORCE_GT(outputs.size(), 0); PADDLE_ENFORCE_GT(outputs.size(), 0);
size_t size = GetSize(outputs[0]); size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0); PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data()); float *result = static_cast<float *>(outputs[0].data.data());
for (size_t i = 0; i < size; i++) { for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(result[i], data.result_data[i], 1e-3); EXPECT_NEAR(result[i], result_data[i], 1e-3);
} }
} }
} }
......
...@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) { ...@@ -183,7 +183,13 @@ TEST(Analyzer_seq_conv1, fuse_statis) {
SetConfig(&cfg); SetConfig(&cfg);
int num_ops; int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
GetFuseStatis(predictor.get(), &num_ops);
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 2);
EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6);
EXPECT_EQ(num_ops, 32);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->specify_input_name = true; cfg->specify_input_name = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->ir_passes.push_back("fc_gru_fuse_pass"); cfg->ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
cfg->_use_mkldnn = true;
#endif
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...@@ -84,9 +81,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -84,9 +81,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
// ocr, mobilenet and se_resnext50 // ocr, mobilenet and se_resnext50
TEST(Analyzer_vis, profile) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg._use_mkldnn = use_mkldnn;
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -108,6 +106,12 @@ TEST(Analyzer_vis, profile) { ...@@ -108,6 +106,12 @@ TEST(Analyzer_vis, profile) {
} }
} }
TEST(Analyzer_vis, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
#endif
// Check the fuse status // Check the fuse status
TEST(Analyzer_vis, fuse_statis) { TEST(Analyzer_vis, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
...@@ -118,15 +122,21 @@ TEST(Analyzer_vis, fuse_statis) { ...@@ -118,15 +122,21 @@ TEST(Analyzer_vis, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_vis, compare) { void compare(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg._use_mkldnn = use_mkldnn;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(cfg, input_slots_all);
} }
TEST(Analyzer_vis, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -50,7 +50,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -50,7 +50,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
auto &ref_out = ref_outputs[i]; auto &ref_out = ref_outputs[i];
size_t size = VecReduceToInt(out.shape); size_t size = VecReduceToInt(out.shape);
size_t ref_size = VecReduceToInt(ref_out.shape); size_t ref_size = VecReduceToInt(ref_out.shape);
EXPECT_GT(size, 0); EXPECT_GT(size, 0UL);
EXPECT_EQ(size, ref_size); EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype); EXPECT_EQ(out.dtype, ref_out.dtype);
switch (out.dtype) { switch (out.dtype) {
...@@ -163,7 +163,8 @@ void TestPrediction(const AnalysisConfig &config, ...@@ -163,7 +163,8 @@ void TestPrediction(const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = FLAGS_use_analysis) { bool use_analysis = FLAGS_use_analysis) {
LOG(INFO) << "use_analysis: " << use_analysis; LOG(INFO) << "use_analysis: " << use_analysis
<< ", use_mkldnn: " << config._use_mkldnn;
if (num_threads == 1) { if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs, use_analysis); TestOneThreadPrediction(config, inputs, outputs, use_analysis);
} else { } else {
...@@ -175,6 +176,7 @@ void TestPrediction(const AnalysisConfig &config, ...@@ -175,6 +176,7 @@ void TestPrediction(const AnalysisConfig &config,
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
const AnalysisConfig &config, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
LOG(INFO) << "use_mkldnn: " << config._use_mkldnn;
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
......
...@@ -86,7 +86,7 @@ function(op_library TARGET) ...@@ -86,7 +86,7 @@ function(op_library TARGET)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops. # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}") if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return() return()
endif() endif()
...@@ -284,10 +284,10 @@ op_library(max_sequence_len_op DEPS lod_rank_table) ...@@ -284,10 +284,10 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
if (NOT WIN32) if (NOT WIN32)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32) endif(NOT WIN32)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
...@@ -316,7 +316,7 @@ op_library(save_op DEPS lod_tensor) ...@@ -316,7 +316,7 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat) op_library(concat_op DEPS concat_and_split)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
...@@ -348,6 +348,6 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) ...@@ -348,6 +348,6 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if(NOT WIN32) if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <paddle/fluid/operators/math/concat.h> #include <paddle/fluid/operators/math/concat_and_split.h>
#include <numeric> #include <numeric>
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle { namespace paddle {
...@@ -89,29 +89,17 @@ class ConcatGradKernel : public framework::OpKernel<T> { ...@@ -89,29 +89,17 @@ class ConcatGradKernel : public framework::OpKernel<T> {
outputs.push_back(nullptr); outputs.push_back(nullptr);
} }
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// Sometimes direct copies will be faster, this maybe need deeply analysis. // Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && outs.size() < 10) { if (axis == 0 && outs.size() < 10) {
size_t input_offset = 0; std::vector<const framework::Tensor*> ref_shape;
const auto in_stride = framework::stride_numel(out_grad->dims()); ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end());
StridedMemcpyWithAxis0<T>(dev_ctx, *out_grad, ref_shape, &outputs);
for (size_t i = 0; i < outs.size(); ++i) {
auto out_stride = framework::stride_numel(ins[i]->dims());
auto* out = outputs[i];
if (out != nullptr) {
StridedNumelCopyWithAxis<T>(
ctx.device_context(), axis, out->data<T>(), out_stride,
out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
}
input_offset += out_stride[axis];
}
} else { } else {
auto& dev_ctx = ctx.template device_context<DeviceContext>(); math::SplitFunctor<DeviceContext, T> split_functor;
paddle::operators::math::ConcatGradFunctor<DeviceContext, T> split_functor(dev_ctx, *out_grad, ctx.MultiInput<framework::Tensor>("X"),
concat_grad_functor; static_cast<int>(axis), &outputs);
concat_grad_functor(dev_ctx, *out_grad,
ctx.MultiInput<framework::Tensor>("X"),
static_cast<int>(axis), &outputs);
} }
} }
}; };
......
...@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
bool fuse_relu = ctx.Attr<bool>("fuse_relu"); bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise"); bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
// TODO: add support for dilation // TODO(tpatejko): add support for dilation
PADDLE_ENFORCE( PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet"); "dilation in convolution is not implemented yet");
...@@ -369,11 +369,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -369,11 +369,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine, strides, paddings, mkldnn_engine,
fuse_relu, fuse_eltwise); fuse_relu, fuse_residual_conn);
} else { } else {
conv_pd = conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_eltwise); mkldnn_engine, fuse_relu, fuse_residual_conn);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); dev_ctx.SetBlob(key_conv_pd, conv_pd);
...@@ -386,8 +386,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -386,8 +386,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_weights_memory_p = handler.AcquireWeightsMemory( auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data)); user_weights_md, to_void_cast<T>(filter_data));
T* output_data = T* output_data = nullptr;
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
auto residual_param_data = residual_param->data<T>();
PADDLE_ENFORCE(
residual_param_data != nullptr,
"Provide data if you want MKLDNN conv+elementwise_add fusion");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
output->ShareDataWith(*residual_param);
output_data = output->mutable_data<T>(ctx.GetPlace());
} else {
output_data =
output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
}
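// At this point, with fuse_residual_conn == true, the output tensor aliases
// the residual input, so once the convolution primitive runs with the sum
// post-op (see CreatePostOps below) the result is effectively
//   Output = ResidualData + conv(Input, Filter[, Bias]),
// i.e. the skip connection is folded into the conv kernel instead of a
// separate elementwise_add op.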
// create reorder primitive if the input format is not the preferred one // create reorder primitive if the input format is not the preferred one
auto src_memory_p = auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
...@@ -424,14 +442,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -424,14 +442,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
private: private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu, mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
bool fuse_eltwise) const { bool fuse_residual_conn) const {
mkldnn::primitive_attr conv_attr; mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations; mkldnn::post_ops post_operations;
// Fusion with Elementwise layer relies on adding a sum post-operation with // Fusion with Elementwise layer relies on adding a sum post-operation with
// the scale parameter. It is assumed that when fuse_eltwise is true, the // the scale parameter. It is assumed that when fuse_residual_connection is
// Output tensor contains the data coming from residual connection. The // true, the output tensor contains the data coming from residual
// result of this post_op is: Output = scale * Output + Conv_Out. // connection. The result of this post_op is:
if (fuse_eltwise) { // Output = scale * Output + Conv_Out.
if (fuse_residual_conn) {
post_operations.append_sum(1.0f); post_operations.append_sum(1.0f);
} }
// Fusion with ReLU layer is executed through the PostOps feature. Create a // Fusion with ReLU layer is executed through the PostOps feature. Create a
...@@ -452,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -452,7 +471,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const { const bool fuse_residual_conn) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -461,7 +480,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -461,7 +480,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst, stride_dims, padding_dims, padding_dims, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero); mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
...@@ -476,7 +496,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -476,7 +496,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const { const bool fuse_residual_conn) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -485,7 +505,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -485,7 +505,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias, dst, stride_dims, padding_dims, padding_dims, bias, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero); mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
......
...@@ -132,6 +132,11 @@ void Conv2DOpMaker::Make() { ...@@ -132,6 +132,11 @@ void Conv2DOpMaker::Make() {
"(Tensor) The output tensor of convolution operator. " "(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.") "The format of output tensor is also NCHW.")
.Reuse("Input"); .Reuse("Input");
AddInput("ResidualData",
"(Tensor) Tensor with residual data "
"to which convolution output will be added."
"Used with fuse_residual_connection fusion.")
.AsDispensable();
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the " "(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of " "strides(h_stride, w_stride) of "
...@@ -164,10 +169,10 @@ void Conv2DOpMaker::Make() { ...@@ -164,10 +169,10 @@ void Conv2DOpMaker::Make() {
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel") AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_eltwise", AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used " "(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is connected via skip connection " "whenever convolution output is as an input to residual "
"to a previous layer.") "connection.")
.SetDefault(false); .SetDefault(false);
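// A hypothetical OpDesc produced by a residual-connection fuse pass would
// use both of the additions above (ResidualData and
// fuse_residual_connection), for example:
//
//   OpDesc desc;
//   desc.SetType("conv2d");
//   desc.SetInput("Input", {input->Name()});
//   desc.SetInput("Filter", {filter->Name()});
//   desc.SetInput("ResidualData", {residual->Name()});
//   desc.SetOutput("Output", {out->Name()});
//   desc.SetAttr("fuse_residual_connection", true);
//
// The MKLDNN conv kernel then requires ResidualData to be provided and
// shares its buffer with Output (see conv_mkldnn_op.cc in this patch).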
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
......
...@@ -20,7 +20,7 @@ detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) ...@@ -20,7 +20,7 @@ detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu) iou_similarity_op.cu)
detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
detection_library(anchor_generator_op SRCS anchor_generator_op.cc detection_library(anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu) anchor_generator_op.cu)
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
......
...@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cmath>
#include <cstring>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -25,21 +27,17 @@ namespace operators { ...@@ -25,21 +27,17 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
struct AppendProposalsFunctor { static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
LoDTensor *out_;
int64_t offset_;
Tensor *to_add_;
AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add) static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
: out_(out), offset_(offset), to_add_(to_add) {} auto *out_data = dst->data<void>();
auto *to_add_data = src.data<void>();
template <typename T> size_t size_of_t = framework::SizeOfType(src.type());
void apply() const { offset *= size_of_t;
auto *out_data = out_->data<T>(); std::memcpy(
auto *to_add_data = to_add_->data<T>(); reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T)); to_add_data, src.numel() * size_of_t);
} }
};
class GenerateProposalsOp : public framework::OperatorWithKernel { class GenerateProposalsOp : public framework::OperatorWithKernel {
public: public:
...@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ...@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
}; };
template <class T> template <class T>
void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, static inline void BoxCoder(const platform::DeviceContext &ctx,
Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) { Tensor *all_anchors, Tensor *bbox_deltas,
Tensor *variances, Tensor *proposals) {
T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace()); T *proposals_data = proposals->mutable_data<T>(ctx.GetPlace());
int64_t row = all_anchors->dims()[0]; int64_t row = all_anchors->dims()[0];
...@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, ...@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
anchor_center_y; anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] * bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2], bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) * kBBoxClipDefault)) *
anchor_width; anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] * bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3], bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) * kBBoxClipDefault)) *
anchor_height; anchor_height;
} else { } else {
bbox_center_x = bbox_center_x =
...@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, ...@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y = bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2], bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) * kBBoxClipDefault)) *
anchor_width; anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3], bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) * kBBoxClipDefault)) *
anchor_height; anchor_height;
} }
...@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, ...@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
} }
template <class T> template <class T>
void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor &im_info, static inline void ClipTiledBoxes(const platform::DeviceContext &ctx,
Tensor *boxes) { const Tensor &im_info, Tensor *boxes) {
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T zero(0);
for (int64_t i = 0; i < boxes->numel(); ++i) { for (int64_t i = 0; i < boxes->numel(); ++i) {
if (i % 4 == 0) { if (i % 4 == 0) {
boxes_data[i] = boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else if (i % 4 == 1) { } else if (i % 4 == 1) {
boxes_data[i] = boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
} else if (i % 4 == 2) { } else if (i % 4 == 2) {
boxes_data[i] = boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else { } else {
boxes_data[i] = boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
} }
} }
} }
template <class T> template <class T>
void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, static inline void FilterBoxes(const platform::DeviceContext &ctx,
float min_size, const Tensor &im_info, Tensor *keep) { Tensor *boxes, float min_size,
const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
T im_scale = im_info_data[2]; T im_scale = im_info_data[2];
...@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, ...@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
keep->Resize({keep_len}); keep->Resize({keep_len});
} }
bool SortScorePairDescend(const std::pair<float, int> &pair1,
const std::pair<float, int> &pair2) {
return pair1.first > pair2.first;
}
template <class T> template <class T>
void GetMaxScoreIndex(const std::vector<T> &scores, static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
std::vector<std::pair<T, int>> *sorted_indices) { const std::vector<T> &scores) {
std::vector<std::pair<T, int>> sorted_indices;
sorted_indices.reserve(scores.size());
for (size_t i = 0; i < scores.size(); ++i) { for (size_t i = 0; i < scores.size(); ++i) {
sorted_indices->push_back(std::make_pair(scores[i], i)); sorted_indices.emplace_back(scores[i], i);
} }
// Sort the score pair according to the scores in descending order // Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(), std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
SortScorePairDescend); [](const std::pair<T, int> &a, const std::pair<T, int> &b) {
return a.first < b.first;
});
return sorted_indices;
} }
template <class T> template <class T>
T BBoxArea(const T *box, const bool normalized) { static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) { if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
...@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) { ...@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
} }
template <class T> template <class T>
T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) { box2[3] < box1[1]) {
return static_cast<T>(0.); return static_cast<T>(0.);
...@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { ...@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1); const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1); const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h; const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized); const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized); const T bbox2_area = BBoxArea<T>(box2, normalized);
...@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { ...@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
} }
} }
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
int selected_num) {
Tensor keep_nms;
keep_nms.Resize({selected_num});
auto *keep_data = keep_nms.mutable_data<T>(platform::CPUPlace());
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
}
template <class T> template <class T>
Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
const T nms_threshold, const float eta) { Tensor *scores, T nms_threshold, float eta) {
PADDLE_ENFORCE_NOT_NULL(bbox); PADDLE_ENFORCE_NOT_NULL(bbox);
int64_t num_boxes = bbox->dims()[0]; int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax] // 4: [xmin ymin xmax ymax]
...@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, ...@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
std::vector<T> scores_data(num_boxes); std::vector<T> scores_data(num_boxes);
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin()); std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices; std::vector<std::pair<T, int>> sorted_indices =
GetMaxScoreIndex<T>(scores_data, &sorted_indices); GetSortedScoreIndex<T>(scores_data);
std::vector<int> selected_indices; std::vector<int> selected_indices;
int selected_num = 0; int selected_num = 0;
T adaptive_threshold = nms_threshold; T adaptive_threshold = nms_threshold;
const T *bbox_data = bbox->data<T>(); const T *bbox_data = bbox->data<T>();
bool flag;
while (sorted_indices.size() != 0) { while (sorted_indices.size() != 0) {
int idx = sorted_indices.front().second; int idx = sorted_indices.back().second;
flag = true; bool flag = true;
for (size_t k = 0; k < selected_indices.size(); ++k) { for (int kept_idx : selected_indices) {
if (flag) { if (flag) {
const int kept_idx = selected_indices[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size, T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false); bbox_data + kept_idx * box_size, false);
flag = (overlap <= adaptive_threshold); flag = (overlap <= adaptive_threshold);
...@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, ...@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
} }
if (flag) { if (flag) {
selected_indices.push_back(idx); selected_indices.push_back(idx);
selected_num++; ++selected_num;
} }
    sorted_indices.erase(sorted_indices.begin()); sorted_indices.erase(sorted_indices.end() - 1);
if (flag && eta < 1 && adaptive_threshold > 0.5) { if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta; adaptive_threshold *= eta;
} }
} }
Tensor keep_nms; return VectorToTensor(selected_indices, selected_num);
keep_nms.Resize({selected_num});
int *keep_data = keep_nms.mutable_data<int>(ctx.GetPlace());
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
} }
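// Worked example of the adaptive threshold used above (illustration only,
// numbers are hypothetical): with nms_threshold = 0.7 and eta = 0.9, every
// time a box is kept while the current threshold is still above 0.5 the
// threshold decays:
//   0.7 -> 0.63 -> 0.567 -> 0.5103 -> 0.45927, after which it stays fixed.
// With eta >= 1 the threshold never changes and this reduces to plain greedy
// NMS with a constant IoU threshold.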
template <typename DeviceContext, typename T> template <typename T>
class GenerateProposalsKernel : public framework::OpKernel<T> { class GenerateProposalsKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto *scores = context.Input<Tensor>("Scores"); auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas"); auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo"); auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors"); auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
auto *variances = context.Input<Tensor>("Variances"); "Cannot find input Anchors(%s) in scope",
context.Inputs("Anchors")[0]);
auto variances = detail::Ref(context.Input<Tensor>("Variances"),
"Cannot find input Variances(%s) in scope",
context.Inputs("Variances")[0]);
auto *rpn_rois = context.Output<LoDTensor>("RpnRois"); auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs"); auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
float min_size = context.Attr<float>("min_size"); float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta"); float eta = context.Attr<float>("eta");
auto &dev_ctx = context.template device_context<DeviceContext>(); auto &dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto scores_dim = scores->dims(); auto &scores_dim = scores->dims();
int64_t num = scores_dim[0]; int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1]; int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2]; int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3]; int64_t w_score = scores_dim[3];
auto bbox_dim = bbox_deltas->dims(); auto &bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1]; int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2]; int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3]; int64_t w_bbox = bbox_dim[3];
...@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_swap.mutable_data<T>({num, h_score, w_score, c_score}, scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace()); dev_ctx.GetPlace());
math::Transpose<DeviceContext, T, 4> trans; math::Transpose<platform::CPUDeviceContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1}; std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis);
framework::LoD lod; framework::LoD lod;
std::vector<size_t> lod0(1, 0); lod.resize(1);
Tensor *anchor = const_cast<framework::Tensor *>(anchors); auto &lod0 = lod[0];
anchor->Resize({anchors->numel() / 4, 4}); lod0.push_back(0);
Tensor *var = const_cast<framework::Tensor *>(variances); anchors.Resize({anchors.numel() / 4, 4});
var->Resize({var->numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4});
int64_t num_proposals = 0; int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) { for (int64_t i = 0; i < num; ++i) {
...@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice.Resize({h_score * w_score * c_score, 1}); scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair = std::pair<Tensor, Tensor> tensor_pair =
ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances,
bbox_deltas_slice, scores_slice, pre_nms_top_n, bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta); post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = tensor_pair.first; Tensor &proposals = tensor_pair.first;
Tensor scores = tensor_pair.second; Tensor &scores = tensor_pair.second;
framework::VisitDataType(
framework::ToDataType(rpn_rois->type()),
AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals));
framework::VisitDataType(
framework::ToDataType(rpn_roi_probs->type()),
AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores));
AppendProposals(rpn_rois, 4 * num_proposals, proposals);
AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0]; num_proposals += proposals.dims()[0];
lod0.emplace_back(num_proposals); lod0.push_back(num_proposals);
} }
lod.emplace_back(lod0);
rpn_rois->set_lod(lod); rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod); rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4}); rpn_rois->Resize({num_proposals, 4});
...@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
} }
std::pair<Tensor, Tensor> ProposalForOneImage( std::pair<Tensor, Tensor> ProposalForOneImage(
const DeviceContext &ctx, const Tensor &im_info_slice, const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice,
const Tensor &anchors, const Tensor &variances, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4] const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1] const Tensor &scores_slice, // [N, 1]
...@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -392,10 +394,9 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
for (int i = 0; i < scores_slice.numel(); ++i) { for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i; index[i] = i;
} }
std::function<bool(const int64_t &, const int64_t &)> compare = auto compare = [scores_data](const int64_t &i, const int64_t &j) {
[scores_data](const int64_t &i, const int64_t &j) { return scores_data[i] > scores_data[j];
return scores_data[i] > scores_data[j]; };
};
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
std::sort(index, index + scores_slice.numel(), compare); std::sort(index, index + scores_slice.numel(), compare);
...@@ -452,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -452,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Scores", "The scores of anchors should be foreground."); AddInput("Scores",
AddInput("BboxDeltas", "bbox_deltas."); "(Tensor) The scores from conv is in shape (N, A, H, W), "
AddInput("ImInfo", "Information for image reshape."); "N is batch size, A is number of anchors, "
AddInput("Anchors", "All anchors."); "H and W are height and width of the feature map");
AddInput("Variances", " variances"); AddInput("BboxDeltas",
"(Tensor) Bounding box deltas from conv is in "
AddOutput("RpnRois", "Anchors."); "shape (N, 4*A, H, W).");
AddOutput("RpnRoiProbs", "Anchors."); AddInput("ImInfo",
AddAttr<int>("pre_nms_topN", "pre_nms_topN"); "(Tensor) Information for image reshape is in shape (N, 3), "
AddAttr<int>("post_nms_topN", "post_nms_topN"); "in format (height, width, scale)");
AddAttr<float>("nms_thresh", "nms_thres"); AddInput("Anchors",
AddAttr<float>("min_size", "min size"); "(Tensor) Bounding box anchors from anchor_generator_op "
"is in shape (A, H, W, 4).");
AddInput("Variances",
"(Tensor) Bounding box variances with same shape as `Anchors`.");
AddOutput("RpnRois",
"(LoDTensor), Output proposals with shape (rois_num, 4).");
AddOutput("RpnRoiProbs",
"(LoDTensor) Scores of proposals with shape (rois_num, 1).");
AddAttr<int>("pre_nms_topN",
"Number of top scoring RPN proposals to keep before "
"applying NMS.");
AddAttr<int>("post_nms_topN",
"Number of top scoring RPN proposals to keep after "
"applying NMS");
AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
AddAttr<float>("min_size",
"Proposal height and width both need to be greater "
"than this min_size.");
AddAttr<float>("eta", "The parameter for adaptive NMS."); AddAttr<float>("eta", "The parameter for adaptive NMS.");
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals OP This operator Generate bounding box proposals for Faster RCNN.
The propoasls are generated for a list of images based on image
This operator proposes rois according to each box with their probability to be a foreground object and score 'Scores', bounding box regression result 'BboxDeltas' as
the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals well as predefined bounding box shapes 'anchors'. Greedy
could be used to train detection net. non-maximum suppression is applied to generate the final bounding
boxes.
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
Finally, apply nms to get final proposals as output.
)DOC"); )DOC");
} }
}; };
...@@ -490,6 +503,5 @@ namespace ops = paddle::operators; ...@@ -490,6 +503,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp, REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp,
ops::GenerateProposalsOpMaker, ops::GenerateProposalsOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
generate_proposals, ops::GenerateProposalsKernel<double>);
ops::GenerateProposalsKernel<paddle::platform::CPUDeviceContext, float>);
...@@ -16,10 +16,13 @@ limitations under the License. */ ...@@ -16,10 +16,13 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "cub/cub.cuh" #include "cub/cub.cuh"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,36 +39,38 @@ namespace { ...@@ -36,36 +39,38 @@ namespace {
int const kThreadsPerBlock = sizeof(uint64_t) * 8; int const kThreadsPerBlock = sizeof(uint64_t) * 8;
template <typename T> static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
__global__ void RangeInitKernel(const T start, const T delta, const int size,
T *out) { struct RangeInitFunctor {
CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } int start_;
} int delta_;
int *out_;
__device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};
template <typename T> template <typename T>
void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, static void SortDescending(const platform::CUDADeviceContext &ctx,
Tensor *value_out, Tensor *index_out) { const Tensor &value, Tensor *value_out,
int num = value.numel(); Tensor *index_out) {
int num = static_cast<int>(value.numel());
Tensor index_in_t; Tensor index_in_t;
int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace()); int *idx_in = index_in_t.mutable_data<int>({num}, ctx.GetPlace());
int block = 512; platform::ForRange<platform::CUDADeviceContext> for_range(ctx, num);
auto stream = ctx.stream(); for_range(RangeInitFunctor{0, 1, idx_in});
RangeInitKernel<<<DIVUP(num, block), block, 0, stream>>>(0, 1, num, idx_in);
int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace()); int *idx_out = index_out->mutable_data<int>({num}, ctx.GetPlace());
const T *keys_in = value.data<T>(); const T *keys_in = value.data<T>();
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace()); T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements // Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>( cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
num);
// Allocate temporary storage // Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes); void *d_temp_storage = memory::Alloc(place, temp_storage_bytes);
// Run sorting operation // Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>( cub::DeviceRadixSort::SortPairsDescending<T, int>(
...@@ -76,22 +81,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, ...@@ -76,22 +81,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
} }
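// Note on the CUB call pattern above (illustration only): DeviceRadixSort is
// deliberately called twice, i.e.
//   SortPairsDescending(nullptr, bytes, ...);         // 1st pass: query temp size
//   SortPairsDescending(d_temp_storage, bytes, ...);  // 2nd pass: real sort
// The first call only writes the required byte count into temp_storage_bytes;
// the buffer is then allocated with memory::Alloc and the second call performs
// the actual descending key/value sort.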
template <typename T> template <typename T>
__device__ __forceinline__ T Min(T x, T y) { struct BoxDecodeAndClipFunctor {
return x < y ? x : y; const T *anchor;
} const T *deltas;
const T *var;
template <typename T> const int *index;
__device__ __forceinline__ T Max(T x, T y) { const T *im_info;
return x > y ? x : y;
} T *proposals;
template <typename T> BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var,
__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, const int *index, const T *im_info, T *proposals)
const T *var, const int *index, : anchor(anchor),
const T *im_info, const int num, deltas(deltas),
T *proposals) { var(var),
T kBBoxClipDefault = log(1000.0 / 16.0); index(index),
CUDA_1D_KERNEL_LOOP(i, num) { im_info(im_info),
proposals(proposals) {}
T bbox_clip_default{static_cast<T>(kBBoxClipDefault)};
__device__ void operator()(size_t i) {
int k = index[i] * 4; int k = index[i] * 4;
T axmin = anchor[k]; T axmin = anchor[k];
T aymin = anchor[k + 1]; T aymin = anchor[k + 1];
...@@ -108,17 +118,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, ...@@ -108,17 +118,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T dxmax = deltas[k + 2]; T dxmax = deltas[k + 2];
T dymax = deltas[k + 3]; T dymax = deltas[k + 3];
T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.; T d_cx, d_cy, d_w, d_h;
if (var) { if (var) {
d_cx = cx + dxmin * w * var[k]; d_cx = cx + dxmin * w * var[k];
d_cy = cy + dymin * h * var[k + 1]; d_cy = cy + dymin * h * var[k + 1];
d_w = exp(Min<T>(dxmax * var[k + 2], kBBoxClipDefault)) * w; d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w;
d_h = exp(Min<T>(dymax * var[k + 3], kBBoxClipDefault)) * h; d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h;
} else { } else {
d_cx = cx + dxmin * w; d_cx = cx + dxmin * w;
d_cy = cy + dymin * h; d_cy = cy + dymin * h;
d_w = exp(Min<T>(dxmax, kBBoxClipDefault)) * w; d_w = exp(Min(dxmax, bbox_clip_default)) * w;
d_h = exp(Min<T>(dymax, kBBoxClipDefault)) * h; d_h = exp(Min(dymax, bbox_clip_default)) * h;
} }
T oxmin = d_cx - d_w * 0.5; T oxmin = d_cx - d_w * 0.5;
...@@ -126,17 +136,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, ...@@ -126,17 +136,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T oxmax = d_cx + d_w * 0.5 - 1.; T oxmax = d_cx + d_w * 0.5 - 1.;
T oymax = d_cy + d_h * 0.5 - 1.; T oymax = d_cy + d_h * 0.5 - 1.;
proposals[i * 4] = Max<T>(Min<T>(oxmin, im_info[1] - 1.), 0.); proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.);
proposals[i * 4 + 1] = Max<T>(Min<T>(oymin, im_info[0] - 1.), 0.); proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.);
proposals[i * 4 + 2] = Max<T>(Min<T>(oxmax, im_info[1] - 1.), 0.); proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.);
proposals[i * 4 + 3] = Max<T>(Min<T>(oymax, im_info[0] - 1.), 0.); proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.);
} }
}
__device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; }
__device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; }
};
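// The functor above replaces the former BoxDecodeAndClipKernel. With
// platform::ForRange the launch configuration is chosen by the framework and
// operator() runs once per index, e.g. (sketch, mirroring the call made later
// in ProposalForOneImage):
//   platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
//   for_range(BoxDecodeAndClipFunctor<T>{anchor_ptr, delta_ptr, var_ptr,
//                                        index_ptr, im_info_ptr, prop_ptr});
// The *_ptr names here are placeholders for the device pointers built from
// the corresponding tensors.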
template <typename T, int BlockSize> template <typename T, int BlockSize>
__global__ void FilterBBoxes(const T *bboxes, const T *im_info, static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
const T min_size, const int num, int *keep_num, const T min_size, const int num,
int *keep) { int *keep_num, int *keep) {
T im_h = im_info[0]; T im_h = im_info[0];
T im_w = im_info[1]; T im_w = im_info[1];
T im_scale = im_info[2]; T im_scale = im_info[2];
...@@ -181,7 +195,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info, ...@@ -181,7 +195,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
} }
} }
__device__ inline float IoU(const float *a, const float *b) { static __device__ inline float IoU(const float *a, const float *b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]); float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
...@@ -191,8 +205,9 @@ __device__ inline float IoU(const float *a, const float *b) { ...@@ -191,8 +205,9 @@ __device__ inline float IoU(const float *a, const float *b) {
return inter_s / (s_a + s_b - inter_s); return inter_s / (s_a + s_b - inter_s);
} }
__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, static __global__ void NMSKernel(const int n_boxes,
const float *dev_boxes, uint64_t *dev_mask) { const float nms_overlap_thresh,
const float *dev_boxes, uint64_t *dev_mask) {
const int row_start = blockIdx.y; const int row_start = blockIdx.y;
const int col_start = blockIdx.x; const int col_start = blockIdx.x;
...@@ -234,9 +249,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, ...@@ -234,9 +249,9 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
} }
template <typename T> template <typename T>
void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const Tensor &sorted_indices, const T nms_threshold, const Tensor &sorted_indices, const T nms_threshold,
Tensor *keep_out) { Tensor *keep_out) {
int boxes_num = proposals.dims()[0]; int boxes_num = proposals.dims()[0];
PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]); PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
...@@ -247,13 +262,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, ...@@ -247,13 +262,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const T *boxes = proposals.data<T>(); const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t); framework::Vector<uint64_t> mask(boxes_num * col_blocks);
uint64_t *d_mask = NMSKernel<<<blocks, threads>>>(
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes)); boxes_num, nms_threshold, boxes,
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask); mask.CUDAMutableData(boost::get<platform::CUDAPlace>(ctx.GetPlace())));
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks); std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
...@@ -267,7 +279,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, ...@@ -267,7 +279,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
if (!(remv[nblock] & (1ULL << inblock))) { if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep; ++num_to_keep;
keep_vec.push_back(i); keep_vec.push_back(i);
uint64_t *p = &h_mask[0] + i * col_blocks; uint64_t *p = &mask[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) { for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j]; remv[j] |= p[j];
} }
...@@ -276,12 +288,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, ...@@ -276,12 +288,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace()); int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
sizeof(int) * num_to_keep, 0); sizeof(int) * num_to_keep, 0);
memory::Free(place, d_mask);
memory::Free(platform::CPUPlace(), h_mask);
} }
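// How the mask written by NMSKernel is consumed above (explanatory note):
// each box i owns col_blocks 64-bit words, and bit j of word b is set when
// box i suppresses box (b * 64 + j). The host loop keeps box i only if no
// previously kept box has marked it in `remv`, i.e.
//   if (!(remv[i / 64] & (1ULL << (i % 64)))) { /* keep i */ }
// and then ORs box i's whole row into `remv` so everything i suppresses is
// skipped later. kThreadsPerBlock == sizeof(uint64_t) * 8 == 64 is what ties
// the CUDA block size to the word width.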
template <typename T> template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage( static std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CUDADeviceContext &ctx, const Tensor &im_info, const platform::CUDADeviceContext &ctx, const Tensor &im_info,
const Tensor &anchors, const Tensor &variances, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas, // [M, 4] const Tensor &bbox_deltas, // [M, 4]
...@@ -300,18 +310,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -300,18 +310,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
// 2. box decode and clipping // 2. box decode and clipping
Tensor proposals; Tensor proposals;
proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace()); proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
int block = 512;
auto stream = ctx.stream(); {
BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>( platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(), for_range(BoxDecodeAndClipFunctor<T>{
index_sort.data<int>(), im_info.data<T>(), pre_nms_num, anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
proposals.data<T>()); index_sort.data<int>(), im_info.data<T>(), proposals.data<T>()});
}
// 3. filter // 3. filter
Tensor keep_index, keep_num_t; Tensor keep_index, keep_num_t;
keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace()); keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
keep_num_t.mutable_data<int>({1}, ctx.GetPlace()); keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
min_size = std::max(min_size, 1.0f); min_size = std::max(min_size, 1.0f);
auto stream = ctx.stream();
FilterBBoxes<T, 512><<<1, 512, 0, stream>>>( FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num, proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
keep_num_t.data<int>(), keep_index.data<int>()); keep_num_t.data<int>(), keep_index.data<int>());
...@@ -355,8 +367,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -355,8 +367,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
auto *scores = context.Input<Tensor>("Scores"); auto *scores = context.Input<Tensor>("Scores");
auto *bbox_deltas = context.Input<Tensor>("BboxDeltas"); auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
auto *im_info = context.Input<Tensor>("ImInfo"); auto *im_info = context.Input<Tensor>("ImInfo");
auto *anchors = context.Input<Tensor>("Anchors"); auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
auto *variances = context.Input<Tensor>("Variances"); "Cannot find input Anchors(%s) in scope",
context.Inputs("Anchors")[0]);
auto variances = detail::Ref(context.Input<Tensor>("Variances"),
"Cannot find input Variances(%s) in scope",
context.Inputs("Variances")[0]);
auto *rpn_rois = context.Output<LoDTensor>("RpnRois"); auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs"); auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");
...@@ -392,10 +408,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -392,10 +408,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis);
Tensor *anchor = const_cast<framework::Tensor *>(anchors); anchors.Resize({anchors.numel() / 4, 4});
anchor->Resize({anchors->numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4});
Tensor *var = const_cast<framework::Tensor *>(variances);
var->Resize({var->numel() / 4, 4});
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4}, rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace()); context.GetPlace());
...@@ -417,12 +431,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -417,12 +431,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice.Resize({h_score * w_score * c_score, 1}); scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> box_score_pair = std::pair<Tensor, Tensor> box_score_pair =
ProposalForOneImage<T>(dev_ctx, im_info_slice, *anchor, *var, ProposalForOneImage<T>(dev_ctx, im_info_slice, anchors, variances,
bbox_deltas_slice, scores_slice, pre_nms_top_n, bbox_deltas_slice, scores_slice, pre_nms_top_n,
post_nms_top_n, nms_thresh, min_size, eta); post_nms_top_n, nms_thresh, min_size, eta);
Tensor proposals = box_score_pair.first; Tensor &proposals = box_score_pair.first;
Tensor scores = box_score_pair.second; Tensor &scores = box_score_pair.second;
memory::Copy(place, rpn_rois_data + num_proposals * 4, place, memory::Copy(place, rpn_rois_data + num_proposals * 4, place,
proposals.data<T>(), sizeof(T) * proposals.numel(), 0); proposals.data<T>(), sizeof(T) * proposals.numel(), 0);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @file src/gpc.cpp
* @author huhan02(com@baidu.com)
* @date 2015/12/18 14:17:30
* @brief
*
* @modified by sunyipeng
* @email sunyipeng@baidu.com
* @date 2018/6/12
**/
#include "paddle/fluid/operators/detection/gpc.h"
namespace gpc {
typedef struct lmt_shape { /* Local minima table */
double y; /* Y coordinate at local minimum */
edge_node *first_bound; /* Pointer to bound list */
struct lmt_shape *next; /* Pointer to next local minimum */
} lmt_node;
typedef struct sbt_t_shape { /* Scanbeam tree */
double y; /* Scanbeam node y value */
struct sbt_t_shape *less; /* Pointer to nodes with lower y */
struct sbt_t_shape *more; /* Pointer to nodes with higher y */
} sb_tree;
typedef struct it_shape { /* Intersection table */
edge_node *ie[2]; /* Intersecting edge (bundle) pair */
gpc_vertex point; /* Point of intersection */
struct it_shape *next; /* The next intersection table node */
} it_node;
typedef struct st_shape { /* Sorted edge table */
edge_node *edge; /* Pointer to AET edge */
double xb; /* Scanbeam bottom x coordinate */
double xt; /* Scanbeam top x coordinate */
double dx; /* Change in x for a unit y increase */
struct st_shape *prev; /* Previous edge in sorted list */
} st_node;
typedef struct bbox_shape { /* Contour axis-aligned bounding box */
double xmin; /* Minimum x coordinate */
double ymin; /* Minimum y coordinate */
double xmax; /* Maximum x coordinate */
double ymax; /* Maximum y coordinate */
} bbox;
/*
===========================================================================
Global Data
===========================================================================
*/
/* Horizontal edge state transitions within scanbeam boundary */
const h_state next_h_state[3][6] = {
/* ABOVE BELOW CROSS */
/* L R L R L R */
/* NH */
{BH, TH, TH, BH, NH, NH},
/* BH */
{NH, NH, NH, NH, TH, TH},
/* TH */
{NH, NH, NH, NH, BH, BH}};
/*
===========================================================================
Private Functions
===========================================================================
*/
static void reset_it(it_node **it) {
it_node *itn;
while (*it) {
itn = (*it)->next;
gpc_free<it_node>(*it);
*it = itn;
}
}
static void reset_lmt(lmt_node **lmt) {
lmt_node *lmtn;
while (*lmt) {
lmtn = (*lmt)->next;
gpc_free<lmt_node>(*lmt);
*lmt = lmtn;
}
}
static void insert_bound(edge_node **b, edge_node *e) {
edge_node *existing_bound = NULL;
if (!*b) {
/* Link node e to the tail of the list */
*b = e;
} else {
/* Do primary sort on the x field */
if (e[0].bot.x < (*b)[0].bot.x) {
/* Insert a new node mid-list */
existing_bound = *b;
*b = e;
(*b)->next_bound = existing_bound;
} else {
if (e[0].bot.x == (*b)[0].bot.x) {
/* Do secondary sort on the dx field */
if (e[0].dx < (*b)[0].dx) {
/* Insert a new node mid-list */
existing_bound = *b;
*b = e;
(*b)->next_bound = existing_bound;
} else {
/* Head further down the list */
insert_bound(&((*b)->next_bound), e);
}
} else {
/* Head further down the list */
insert_bound(&((*b)->next_bound), e);
}
}
}
}
static edge_node **bound_list(lmt_node **lmt, double y) {
lmt_node *existing_node;
if (!*lmt) {
/* Add node onto the tail end of the LMT */
gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
const_cast<char *>("LMT insertion"));
(*lmt)->y = y;
(*lmt)->first_bound = NULL;
(*lmt)->next = NULL;
return &((*lmt)->first_bound);
} else if (y < (*lmt)->y) {
/* Insert a new LMT node before the current node */
existing_node = *lmt;
gpc_malloc<lmt_node>(*lmt, sizeof(lmt_node),
const_cast<char *>("LMT insertion"));
(*lmt)->y = y;
(*lmt)->first_bound = NULL;
(*lmt)->next = existing_node;
return &((*lmt)->first_bound);
} else {
if (y > (*lmt)->y) {
/* Head further up the LMT */
return bound_list(&((*lmt)->next), y);
} else {
/* Use this existing LMT node */
return &((*lmt)->first_bound);
}
}
}
static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) {
if (!*sbtree) {
/* Add a new tree node here */
gpc_malloc<sb_tree>(*sbtree, sizeof(sb_tree),
const_cast<char *>("scanbeam tree insertion"));
(*sbtree)->y = y;
(*sbtree)->less = NULL;
(*sbtree)->more = NULL;
(*entries)++;
} else {
if ((*sbtree)->y > y) {
/* Head into the 'less' sub-tree */
add_to_sbtree(entries, &((*sbtree)->less), y);
} else {
if ((*sbtree)->y < y) {
/* Head into the 'more' sub-tree */
add_to_sbtree(entries, &((*sbtree)->more), y);
}
}
}
}
static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) {
if (sbtree->less) {
build_sbt(entries, sbt, sbtree->less);
}
sbt[*entries] = sbtree->y;
(*entries)++;
if (sbtree->more) {
build_sbt(entries, sbt, sbtree->more);
}
}
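/* Note (illustration only): build_sbt is an in-order traversal, so the
   scanbeam table comes out in ascending y order. For insertions of
   5.0, 2.0 and 8.0 via add_to_sbtree, the result is
     sbt[0] = 2.0, sbt[1] = 5.0, sbt[2] = 8.0 and *entries == 3.
   add_to_sbtree() silently drops duplicate y values, so *entries counts
   distinct scanbeams only. */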
static void free_sbtree(sb_tree **sbtree) {
if (*sbtree) {
free_sbtree(&((*sbtree)->less));
free_sbtree(&((*sbtree)->more));
gpc_free<sb_tree>(*sbtree);
}
}
static int count_optimal_vertices(gpc_vertex_list c) {
int result = 0;
int i = 0;
/* Ignore non-contributing contours */
if (c.num_vertices > 0) {
for (i = 0; i < c.num_vertices; i++) {
/* Ignore superfluous vertices embedded in horizontal edges */
if (gpc_optimal(c.vertex, i, c.num_vertices)) {
result++;
}
}
}
return result;
}
static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries,
gpc_polygon *p, int type, gpc_op op) {
int c = 0;
int i = 0;
int min = 0;
int max = 0;
int num_edges = 0;
int v = 0;
int num_vertices = 0;
int total_vertices = 0;
int e_index = 0;
edge_node *e = NULL;
edge_node *edge_table = NULL;
for (c = 0; c < p->num_contours; c++) {
total_vertices += count_optimal_vertices(p->contour[c]);
}
/* Create the entire input polygon edge table in one go */
gpc_malloc<edge_node>(edge_table, total_vertices * sizeof(edge_node),
const_cast<char *>("edge table creation"));
for (c = 0; c < p->num_contours; c++) {
if (p->contour[c].num_vertices < 0) {
/* Ignore the non-contributing contour and repair the vertex count */
p->contour[c].num_vertices = -p->contour[c].num_vertices;
} else {
/* Perform contour optimisation */
num_vertices = 0;
for (i = 0; i < p->contour[c].num_vertices; i++) {
if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) {
edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x;
edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y;
/* Record vertex in the scanbeam table */
add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y);
num_vertices++;
}
}
/* Do the contour forward pass */
for (min = 0; min < num_vertices; min++) {
/* If a forward local minimum... */
if (gpc_fwd_min(edge_table, min, num_vertices)) {
/* Search for the next local maximum... */
num_edges = 1;
max = gpc_next_index(min, num_vertices);
while (gpc_not_fmax(edge_table, max, num_vertices)) {
num_edges++;
max = gpc_next_index(max, num_vertices);
}
/* Build the next edge list */
e = &edge_table[e_index];
e_index += num_edges;
v = min;
e[0].bstate[BELOW] = UNBUNDLED;
e[0].bundle[BELOW][CLIP] = 0;
e[0].bundle[BELOW][SUBJ] = 0;
for (i = 0; i < num_edges; i++) {
e[i].xb = edge_table[v].vertex.x;
e[i].bot.x = edge_table[v].vertex.x;
e[i].bot.y = edge_table[v].vertex.y;
v = gpc_next_index(v, num_vertices);
e[i].top.x = edge_table[v].vertex.x;
e[i].top.y = edge_table[v].vertex.y;
e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
(e[i].top.y - e[i].bot.y);
e[i].type = type;
e[i].outp[ABOVE] = NULL;
e[i].outp[BELOW] = NULL;
e[i].next = NULL;
e[i].prev = NULL;
e[i].succ =
((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
e[i].next_bound = NULL;
e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
e[i].bside[SUBJ] = LEFT;
}
insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
}
}
/* Do the contour reverse pass */
for (min = 0; min < num_vertices; min++) {
/* If a reverse local minimum... */
if (gpc_rev_min(edge_table, min, num_vertices)) {
/* Search for the previous local maximum... */
num_edges = 1;
max = gpc_prev_index(min, num_vertices);
while (gpc_not_rmax(edge_table, max, num_vertices)) {
num_edges++;
max = gpc_prev_index(max, num_vertices);
}
/* Build the previous edge list */
e = &edge_table[e_index];
e_index += num_edges;
v = min;
e[0].bstate[BELOW] = UNBUNDLED;
e[0].bundle[BELOW][CLIP] = 0;
e[0].bundle[BELOW][SUBJ] = 0;
for (i = 0; i < num_edges; i++) {
e[i].xb = edge_table[v].vertex.x;
e[i].bot.x = edge_table[v].vertex.x;
e[i].bot.y = edge_table[v].vertex.y;
v = gpc_prev_index(v, num_vertices);
e[i].top.x = edge_table[v].vertex.x;
e[i].top.y = edge_table[v].vertex.y;
e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) /
(e[i].top.y - e[i].bot.y);
e[i].type = type;
e[i].outp[ABOVE] = NULL;
e[i].outp[BELOW] = NULL;
e[i].next = NULL;
e[i].prev = NULL;
e[i].succ =
((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL;
e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL;
e[i].next_bound = NULL;
e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT;
e[i].bside[SUBJ] = LEFT;
}
insert_bound(bound_list(lmt, edge_table[min].vertex.y), e);
}
}
}
}
return edge_table;
} // NOLINT
static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) {
if (!*aet) {
/* Append edge onto the tail end of the AET */
*aet = edge;
edge->prev = prev;
edge->next = NULL;
} else {
/* Do primary sort on the xb field */
if (edge->xb < (*aet)->xb) {
/* Insert edge here (before the AET edge) */
edge->prev = prev;
edge->next = *aet;
(*aet)->prev = edge;
*aet = edge;
} else {
if (edge->xb == (*aet)->xb) {
/* Do secondary sort on the dx field */
if (edge->dx < (*aet)->dx) {
/* Insert edge here (before the AET edge) */
edge->prev = prev;
edge->next = *aet;
(*aet)->prev = edge;
*aet = edge;
} else {
/* Head further into the AET */
add_edge_to_aet(&((*aet)->next), edge, *aet);
}
} else {
/* Head further into the AET */
add_edge_to_aet(&((*aet)->next), edge, *aet);
}
}
}
}
static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1,
double x, double y) {
it_node *existing_node;
if (!*it) {
/* Append a new node to the tail of the list */
gpc_malloc<it_node>(*it, sizeof(it_node),
const_cast<char *>("IT insertion"));
(*it)->ie[0] = edge0;
(*it)->ie[1] = edge1;
(*it)->point.x = x;
(*it)->point.y = y;
(*it)->next = NULL;
} else {
if ((*it)->point.y > y) {
/* Insert a new node mid-list */
existing_node = *it;
gpc_malloc<it_node>(*it, sizeof(it_node),
const_cast<char *>("IT insertion"));
(*it)->ie[0] = edge0;
(*it)->ie[1] = edge1;
(*it)->point.x = x;
(*it)->point.y = y;
(*it)->next = existing_node;
} else {
/* Head further down the list */
add_intersection(&((*it)->next), edge0, edge1, x, y);
}
}
}
static void add_st_edge(st_node **st, it_node **it, edge_node *edge,
double dy) {
st_node *existing_node;
double den = 0.0;
double r = 0.0;
double x = 0.0;
double y = 0.0;
if (!*st) {
/* Append edge onto the tail end of the ST */
gpc_malloc<st_node>(*st, sizeof(st_node),
const_cast<char *>("ST insertion"));
(*st)->edge = edge;
(*st)->xb = edge->xb;
(*st)->xt = edge->xt;
(*st)->dx = edge->dx;
(*st)->prev = NULL;
} else {
den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb);
/* If new edge and ST edge don't cross */
if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) ||
(fabs(den) <= DBL_EPSILON)) {
/* No intersection - insert edge here (before the ST edge) */
existing_node = *st;
gpc_malloc<st_node>(*st, sizeof(st_node),
const_cast<char *>("ST insertion"));
(*st)->edge = edge;
(*st)->xb = edge->xb;
(*st)->xt = edge->xt;
(*st)->dx = edge->dx;
(*st)->prev = existing_node;
} else {
/* Compute intersection between new edge and ST edge */
r = (edge->xb - (*st)->xb) / den;
x = (*st)->xb + r * ((*st)->xt - (*st)->xb);
y = r * dy;
/* Insert the edge pointers and the intersection point in the IT */
add_intersection(it, (*st)->edge, edge, x, y);
/* Head further into the ST */
add_st_edge(&((*st)->prev), it, edge, dy);
}
}
}
static void build_intersection_table(it_node **it, edge_node *aet, double dy) {
st_node *st;
st_node *stp;
edge_node *edge = NULL;
/* Build intersection table for the current scanbeam */
reset_it(it);
st = NULL;
/* Process each AET edge */
for (edge = aet; edge; edge = edge->next) {
if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] ||
edge->bundle[ABOVE][SUBJ]) {
add_st_edge(&st, it, edge, dy);
}
}
/* Free the sorted edge table */
while (st) {
stp = st->prev;
gpc_free<st_node>(st);
st = stp;
}
}
static int count_contours(polygon_node *polygon) {
int nc = 0;
int nv = 0;
vertex_node *v = NULL;
vertex_node *nextv = NULL;
for (nc = 0; polygon; polygon = polygon->next) {
if (polygon->active) {
/* Count the vertices in the current contour */
nv = 0;
for (v = polygon->proxy->v[LEFT]; v; v = v->next) {
nv++;
}
/* Record valid vertex counts in the active field */
if (nv > 2) {
polygon->active = nv;
nc++;
} else {
/* Invalid contour: just free the heap */
for (v = polygon->proxy->v[LEFT]; v; v = nextv) {
nextv = v->next;
gpc_free<vertex_node>(v);
}
polygon->active = 0;
}
}
}
return nc;
}
static void add_left(polygon_node *p, double x, double y) {
vertex_node *nv = NULL;
/* Create a new vertex node and set its fields */
gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
const_cast<char *>("vertex node creation"));
nv->x = x;
nv->y = y;
/* Add vertex nv to the left end of the polygon's vertex list */
nv->next = p->proxy->v[LEFT];
  /* Update proxy->v[LEFT] to point to nv */
p->proxy->v[LEFT] = nv;
}
static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) {
polygon_node *target = NULL;
/* Label contour as a hole */
q->proxy->hole = 1;
if (p->proxy != q->proxy) {
/* Assign p's vertex list to the left end of q's list */
p->proxy->v[RIGHT]->next = q->proxy->v[LEFT];
q->proxy->v[LEFT] = p->proxy->v[LEFT];
/* Redirect any p->proxy references to q->proxy */
for (target = p->proxy; list; list = list->next) {
if (list->proxy == target) {
list->active = 0;
list->proxy = q->proxy;
}
}
}
}
static void add_right(polygon_node *p, double x, double y) {
vertex_node *nv = NULL;
/* Create a new vertex node and set its fields */
gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
const_cast<char *>("vertex node creation"));
nv->x = x;
nv->y = y;
nv->next = NULL;
/* Add vertex nv to the right end of the polygon's vertex list */
p->proxy->v[RIGHT]->next = nv;
/* Update proxy->v[RIGHT] to point to nv */
p->proxy->v[RIGHT] = nv;
}
static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
polygon_node *target = NULL;
/* Label contour as external */
q->proxy->hole = 0;
if (p->proxy != q->proxy) {
/* Assign p's vertex list to the right end of q's list */
q->proxy->v[RIGHT]->next = p->proxy->v[LEFT];
q->proxy->v[RIGHT] = p->proxy->v[RIGHT];
/* Redirect any p->proxy references to q->proxy */
for (target = p->proxy; list; list = list->next) {
if (list->proxy == target) {
list->active = 0;
list->proxy = q->proxy;
}
}
}
}
static void add_local_min(polygon_node **p, edge_node *edge, double x,
double y) {
polygon_node *existing_min = NULL;
vertex_node *nv = NULL;
existing_min = *p;
gpc_malloc<polygon_node>(*p, sizeof(polygon_node),
const_cast<char *>("polygon node creation"));
/* Create a new vertex node and set its fields */
gpc_malloc<vertex_node>(nv, sizeof(vertex_node),
const_cast<char *>("vertex node creation"));
nv->x = x;
nv->y = y;
nv->next = NULL;
/* Initialise proxy to point to p itself */
(*p)->proxy = (*p);
(*p)->active = 1;
(*p)->next = existing_min;
/* Make v[LEFT] and v[RIGHT] point to new vertex nv */
(*p)->v[LEFT] = nv;
(*p)->v[RIGHT] = nv;
/* Assign polygon p to the edge */
edge->outp[ABOVE] = *p;
}
static int count_tristrips(polygon_node *tn) {
int total = 0;
for (total = 0; tn; tn = tn->next) {
if (tn->active > 2) {
total++;
}
}
return total;
}
void add_vertex(vertex_node **t, double x, double y) {
if (!(*t)) {
gpc_malloc<vertex_node>(*t, sizeof(vertex_node),
const_cast<char *>("tristrip vertex creation"));
(*t)->x = x;
(*t)->y = y;
(*t)->next = NULL;
} else {
/* Head further down the list */
add_vertex(&((*t)->next), x, y);
}
}
void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
add_vertex(&(e->outp[p]->v[s]), x, y);
e->outp[p]->active++;
}
static void new_tristrip(polygon_node **tn, edge_node *edge, double x,
double y) {
if (!(*tn)) {
gpc_malloc<polygon_node>(*tn, sizeof(polygon_node),
const_cast<char *>("tristrip node creation"));
(*tn)->next = NULL;
(*tn)->v[LEFT] = NULL;
(*tn)->v[RIGHT] = NULL;
(*tn)->active = 1;
add_vertex(&((*tn)->v[LEFT]), x, y);
edge->outp[ABOVE] = *tn;
} else {
/* Head further down the list */
new_tristrip(&((*tn)->next), edge, x, y);
}
}
static bbox *create_contour_bboxes(gpc_polygon *p) {
bbox *box;
int c = 0;
int v = 0;
gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
const_cast<char *>("Bounding box creation"));
/* Construct contour bounding boxes */
for (c = 0; c < p->num_contours; c++) {
/* Initialise bounding box extent */
box[c].xmin = DBL_MAX;
box[c].ymin = DBL_MAX;
box[c].xmax = -DBL_MAX;
box[c].ymax = -DBL_MAX;
for (v = 0; v < p->contour[c].num_vertices; v++) {
/* Adjust bounding box */
if (p->contour[c].vertex[v].x < box[c].xmin) {
box[c].xmin = p->contour[c].vertex[v].x;
}
if (p->contour[c].vertex[v].y < box[c].ymin) {
box[c].ymin = p->contour[c].vertex[v].y;
}
if (p->contour[c].vertex[v].x > box[c].xmax) {
box[c].xmax = p->contour[c].vertex[v].x;
}
if (p->contour[c].vertex[v].y > box[c].ymax) {
box[c].ymax = p->contour[c].vertex[v].y;
}
}
}
return box;
}
static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
bbox *s_bbox;
bbox *c_bbox;
int s = 0;
int c = 0;
int *o_table = NULL;
int overlap = 0;
s_bbox = create_contour_bboxes(subj);
c_bbox = create_contour_bboxes(clip);
gpc_malloc<int>(o_table,
subj->num_contours * clip->num_contours * sizeof(int),
const_cast<char *>("overlap table creation"));
/* Check all subject contour bounding boxes against clip boxes */
for (s = 0; s < subj->num_contours; s++) {
for (c = 0; c < clip->num_contours; c++) {
o_table[c * subj->num_contours + s] =
(!((s_bbox[s].xmax < c_bbox[c].xmin) ||
(s_bbox[s].xmin > c_bbox[c].xmax))) &&
(!((s_bbox[s].ymax < c_bbox[c].ymin) ||
(s_bbox[s].ymin > c_bbox[c].ymax)));
}
}
/* For each clip contour, search for any subject contour overlaps */
for (c = 0; c < clip->num_contours; c++) {
overlap = 0;
for (s = 0; (!overlap) && (s < subj->num_contours); s++) {
overlap = o_table[c * subj->num_contours + s];
}
if (!overlap) {
/* Flag non contributing status by negating vertex count */
clip->contour[c].num_vertices = -clip->contour[c].num_vertices;
}
}
if (op == GPC_INT) {
/* For each subject contour, search for any clip contour overlaps */
for (s = 0; s < subj->num_contours; s++) {
overlap = 0;
for (c = 0; (!overlap) && (c < clip->num_contours); c++) {
overlap = o_table[c * subj->num_contours + s];
}
if (!overlap) {
/* Flag non contributing status by negating vertex count */
subj->contour[s].num_vertices = -subj->contour[s].num_vertices;
}
}
}
gpc_free<bbox>(s_bbox);
gpc_free<bbox>(c_bbox);
gpc_free<int>(o_table);
}
/*
===========================================================================
Public Functions
===========================================================================
*/
void gpc_free_polygon(gpc_polygon *p) {
int c = 0;
for (c = 0; c < p->num_contours; c++) {
gpc_free<gpc_vertex>(p->contour[c].vertex);
}
gpc_free<int>(p->hole);
gpc_free<gpc_vertex_list>(p->contour);
p->num_contours = 0;
}
/*
void gpc_read_polygon(FILE *fp, int read_hole_flags, gpc_polygon *p) {
int c = 0;
int v = 0;
fscanf(fp, "%d", &(p->num_contours));
gpc_malloc<int>(p->hole, p->num_contours * sizeof(int),
(char *)"hole flag array creation");
gpc_malloc<gpc_vertex_list>(p->contour,
p->num_contours * sizeof(gpc_vertex_list),
(char *)"contour creation");
for (c = 0; c < p->num_contours; c++) {
fscanf(fp, "%d", &(p->contour[c].num_vertices));
if (read_hole_flags) {
fscanf(fp, "%d", &(p->hole[c]));
} else {
p->hole[c] = 0; // Assume all contours to be external
}
gpc_malloc<gpc_vertex>(p->contour[c].vertex,
p->contour[c].num_vertices * sizeof(gpc_vertex),
(char *)"vertex creation");
for (v = 0; v < p->contour[c].num_vertices; v++) {
fscanf(fp, "%lf %lf", &(p->contour[c].vertex[v].x),
&(p->contour[c].vertex[v].y));
}
}
}
void gpc_write_polygon(FILE *fp, int write_hole_flags, gpc_polygon *p) {
int c = 0;
int v = 0;
fprintf(fp, "%d\n", p->num_contours);
for (c = 0; c < p->num_contours; c++) {
fprintf(fp, "%d\n", p->contour[c].num_vertices);
if (write_hole_flags) {
fprintf(fp, "%d\n", p->hole[c]);
}
for (v = 0; v < p->contour[c].num_vertices; v++) {
fprintf(fp, "% .*lf % .*lf\n", DBL_DIG, p->contour[c].vertex[v].x,
DBL_DIG, p->contour[c].vertex[v].y);
}
}
}
*/
void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
int *extended_hole = NULL;
int c = 0;
int v = 0;
gpc_vertex_list *extended_contour = NULL;
/* Create an extended hole array */
gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
const_cast<char *>("contour hole addition"));
/* Create an extended contour array */
gpc_malloc<gpc_vertex_list>(extended_contour,
(p->num_contours + 1) * sizeof(gpc_vertex_list),
const_cast<char *>("contour addition"));
/* Copy the old contour and hole data into the extended arrays */
for (c = 0; c < p->num_contours; c++) {
extended_hole[c] = p->hole[c];
extended_contour[c] = p->contour[c];
}
/* Copy the new contour and hole onto the end of the extended arrays */
c = p->num_contours;
extended_hole[c] = hole;
extended_contour[c].num_vertices = new_contour->num_vertices;
gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
new_contour->num_vertices * sizeof(gpc_vertex),
const_cast<char *>("contour addition"));
for (v = 0; v < new_contour->num_vertices; v++) {
extended_contour[c].vertex[v] = new_contour->vertex[v];
}
/* Dispose of the old contour */
gpc_free<gpc_vertex_list>(p->contour);
gpc_free<int>(p->hole);
/* Update the polygon information */
p->num_contours++;
p->hole = extended_hole;
p->contour = extended_contour;
}
// gpc_polygon_clip
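// Implements Vatti's scanbeam-based clipping:
//   1. minimax_test prunes contours whose bounding boxes cannot interact;
//   2. build_lmt builds the local minima table (LMT) and the scanbeam tree;
//   3. each scanbeam is swept bottom-to-top, bundling coincident edges in the
//      active edge table (AET), classifying vertices from the tr/tl/br/bl
//      quadrant occupancies, and growing output contours with
//      add_left/add_right/merge_left/merge_right;
//   4. the surviving contours in out_poly are copied into the result.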
void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
gpc_polygon *result) {
sb_tree *sbtree = NULL;
it_node *it = NULL;
it_node *intersect = NULL;
edge_node *edge = NULL;
edge_node *prev_edge = NULL;
edge_node *next_edge = NULL;
edge_node *succ_edge = NULL;
edge_node *e0 = NULL;
edge_node *e1 = NULL;
edge_node *aet = NULL;
edge_node *c_heap = NULL;
edge_node *s_heap = NULL;
lmt_node *lmt = NULL;
lmt_node *local_min = NULL;
polygon_node *out_poly = NULL;
polygon_node *p = NULL;
polygon_node *q = NULL;
polygon_node *poly = NULL;
polygon_node *npoly = NULL;
polygon_node *cf = NULL;
vertex_node *vtx = NULL;
vertex_node *nv = NULL;
h_state horiz[2];
int in[2];
int exists[2];
int parity[2] = {LEFT, LEFT};
int c = 0;
int v = 0;
int contributing = 0;
int search = 0;
int scanbeam = 0;
int sbt_entries = 0;
int vclass = 0;
int bl = 0;
int br = 0;
int tl = 0;
int tr = 0;
double *sbt = NULL;
double xb = 0.0;
double px = 0.0;
double yb = 0.0;
double yt = 0.0;
double dy = 0.0;
double ix = 0.0;
double iy = 0.0;
/* Test for trivial NULL result cases */
if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
((clip->num_contours == 0) && (op == GPC_INT))) {
result->num_contours = 0;
result->hole = NULL;
result->contour = NULL;
return;
}
  /* Identify potentially contributing contours */
if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
(clip->num_contours > 0)) {
minimax_test(subj, clip, op);
}
/* Build LMT */
if (subj->num_contours > 0) {
s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
}
if (clip->num_contours > 0) {
c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
}
/* Return a NULL result if no contours contribute */
if (lmt == NULL) {
result->num_contours = 0;
result->hole = NULL;
result->contour = NULL;
reset_lmt(&lmt);
gpc_free<edge_node>(s_heap);
gpc_free<edge_node>(c_heap);
return;
}
/* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation"));
build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0;
free_sbtree(&sbtree);
/* Allow pointer re-use without causing memory leak */
if (subj == result) {
gpc_free_polygon(subj);
}
if (clip == result) {
gpc_free_polygon(clip);
}
/* Invert clip polygon for difference operation */
if (op == GPC_DIFF) {
parity[CLIP] = RIGHT;
}
local_min = lmt;
// Process each scanbeam
while (scanbeam < sbt_entries) {
/* Set yb and yt to the bottom and top of the scanbeam */
yb = sbt[scanbeam++];
if (scanbeam < sbt_entries) {
yt = sbt[scanbeam];
dy = yt - yb;
}
/* === SCANBEAM BOUNDARY PROCESSING ================================ */
/* If LMT node corresponding to yb exists */
if (local_min) {
if (local_min->y == yb) {
/* Add edges starting at this local minimum to the AET */
for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
add_edge_to_aet(&aet, edge, NULL);
}
local_min = local_min->next;
}
}
/* Set dummy previous x value */
px = -DBL_MAX;
/* Create bundles within AET */
e0 = aet;
e1 = aet;
/* Set up bundle fields of first edge */
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED;
for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
/* Set up bundle fields of next edge */
next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
next_edge->bundle[ABOVE][!next_edge->type] = 0;
next_edge->bstate[ABOVE] = UNBUNDLED;
/* Bundle edges above the scanbeam boundary if they coincide */
if (next_edge->bundle[ABOVE][next_edge->type]) {
if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
(e0->top.y != yb)) {
next_edge->bundle[ABOVE][next_edge->type] ^=
e0->bundle[ABOVE][next_edge->type];
next_edge->bundle[ABOVE][!next_edge->type] =
e0->bundle[ABOVE][!next_edge->type];
next_edge->bstate[ABOVE] = BUNDLE_HEAD;
e0->bundle[ABOVE][CLIP] = 0;
e0->bundle[ABOVE][SUBJ] = 0;
e0->bstate[ABOVE] = BUNDLE_TAIL;
}
e0 = next_edge;
}
}
horiz[CLIP] = NH;
horiz[SUBJ] = NH;
// Process each edge at this scanbeam boundary
for (edge = aet; edge; edge = edge->next) {
exists[CLIP] =
edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
exists[SUBJ] =
edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
if (exists[CLIP] || exists[SUBJ]) {
/* Set bundle side */
edge->bside[CLIP] = parity[CLIP];
edge->bside[SUBJ] = parity[SUBJ];
/* Determine contributing status and quadrant occupancies */
switch (op) {
case GPC_DIFF:
case GPC_INT:
contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
(exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
(exists[CLIP] && exists[SUBJ] &&
(parity[CLIP] == parity[SUBJ]));
br = (parity[CLIP]) && (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) &&
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
case GPC_XOR:
contributing = exists[CLIP] || exists[SUBJ];
br = (parity[CLIP]) ^ (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) ^
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
case GPC_UNION:
contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
(exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
(exists[CLIP] && exists[SUBJ] &&
(parity[CLIP] == parity[SUBJ]));
br = (parity[CLIP]) || (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) ||
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
}
// Update parity
parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
/* Update horizontal state */
if (exists[CLIP]) {
horiz[CLIP] = next_h_state[horiz[CLIP]]
[((exists[CLIP] - 1) << 1) + parity[CLIP]];
}
if (exists[SUBJ]) {
horiz[SUBJ] = next_h_state[horiz[SUBJ]]
[((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
}
vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
if (contributing) {
xb = edge->xb;
switch (vclass) {
case EMN:
case IMN:
add_local_min(&out_poly, edge, xb, yb);
px = xb;
cf = edge->outp[ABOVE];
break;
case ERI:
if (xb != px) {
add_right(cf, xb, yb);
px = xb;
}
edge->outp[ABOVE] = cf;
cf = NULL;
break;
case ELI:
add_left(edge->outp[BELOW], xb, yb);
px = xb;
cf = edge->outp[BELOW];
break;
case EMX:
if (xb != px) {
add_left(cf, xb, yb);
px = xb;
}
merge_right(cf, edge->outp[BELOW], out_poly);
cf = NULL;
break;
case ILI:
if (xb != px) {
add_left(cf, xb, yb);
px = xb;
}
edge->outp[ABOVE] = cf;
cf = NULL;
break;
case IRI:
add_right(edge->outp[BELOW], xb, yb);
px = xb;
cf = edge->outp[BELOW];
edge->outp[BELOW] = NULL;
break;
case IMX:
if (xb != px) {
add_right(cf, xb, yb);
px = xb;
}
merge_left(cf, edge->outp[BELOW], out_poly);
cf = NULL;
edge->outp[BELOW] = NULL;
break;
case IMM:
if (xb != px) {
add_right(cf, xb, yb);
px = xb;
}
merge_left(cf, edge->outp[BELOW], out_poly);
edge->outp[BELOW] = NULL;
add_local_min(&out_poly, edge, xb, yb);
cf = edge->outp[ABOVE];
break;
case EMM:
if (xb != px) {
add_left(cf, xb, yb);
px = xb;
}
merge_right(cf, edge->outp[BELOW], out_poly);
edge->outp[BELOW] = NULL;
add_local_min(&out_poly, edge, xb, yb);
cf = edge->outp[ABOVE];
break;
case LED:
if (edge->bot.y == yb) {
add_left(edge->outp[BELOW], xb, yb);
}
edge->outp[ABOVE] = edge->outp[BELOW];
px = xb;
break;
case RED:
if (edge->bot.y == yb) {
add_right(edge->outp[BELOW], xb, yb);
}
edge->outp[ABOVE] = edge->outp[BELOW];
px = xb;
break;
default:
break;
} /* End of switch */
} /* End of contributing conditional */
} /* End of edge exists conditional */
} // End of AET loop
/* Delete terminating edges from the AET, otherwise compute xt */
for (edge = aet; edge; edge = edge->next) {
if (edge->top.y == yb) {
prev_edge = edge->prev;
next_edge = edge->next;
if (prev_edge) {
prev_edge->next = next_edge;
} else {
aet = next_edge;
}
if (next_edge) {
next_edge->prev = prev_edge;
}
/* Copy bundle head state to the adjacent tail edge if required */
if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
prev_edge->outp[BELOW] = edge->outp[BELOW];
prev_edge->bstate[BELOW] = UNBUNDLED;
if (prev_edge->prev) {
if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
prev_edge->bstate[BELOW] = BUNDLE_HEAD;
}
}
}
}
} else {
if (edge->top.y == yt) {
edge->xt = edge->top.x;
} else {
edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
}
}
}
if (scanbeam < sbt_entries) {
/* === SCANBEAM INTERIOR PROCESSING ============================== */
build_intersection_table(&it, aet, dy);
/* Process each node in the intersection table */
for (intersect = it; intersect; intersect = intersect->next) {
e0 = intersect->ie[0];
e1 = intersect->ie[1];
/* Only generate output for contributing intersections */
if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
(e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
p = e0->outp[ABOVE];
q = e1->outp[ABOVE];
ix = intersect->point.x;
iy = intersect->point.y + yb;
in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
(e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
(!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
e0->bside[CLIP] && e1->bside[CLIP]);
in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
(e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
(!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
e0->bside[SUBJ] && e1->bside[SUBJ]);
// Determine quadrant occupancies
switch (op) {
case GPC_DIFF:
case GPC_INT:
tr = (in[CLIP]) && (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
case GPC_XOR:
tr = (in[CLIP]) ^ (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
case GPC_UNION:
tr = (in[CLIP]) || (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
}
vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
switch (vclass) {
case EMN:
add_local_min(&out_poly, e0, ix, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
break;
case ERI:
if (p) {
add_right(p, ix, iy);
e1->outp[ABOVE] = p;
e0->outp[ABOVE] = NULL;
}
break;
case ELI:
if (q) {
add_left(q, ix, iy);
e0->outp[ABOVE] = q;
e1->outp[ABOVE] = NULL;
}
break;
case EMX:
if (p && q) {
add_left(p, ix, iy);
merge_right(p, q, out_poly);
e0->outp[ABOVE] = NULL;
e1->outp[ABOVE] = NULL;
}
break;
case IMN:
add_local_min(&out_poly, e0, ix, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
break;
case ILI:
if (p) {
add_left(p, ix, iy);
e1->outp[ABOVE] = p;
e0->outp[ABOVE] = NULL;
}
break;
case IRI:
if (q) {
add_right(q, ix, iy);
e0->outp[ABOVE] = q;
e1->outp[ABOVE] = NULL;
}
break;
case IMX:
if (p && q) {
add_right(p, ix, iy);
merge_left(p, q, out_poly);
e0->outp[ABOVE] = NULL;
e1->outp[ABOVE] = NULL;
}
break;
case IMM:
if (p && q) {
add_right(p, ix, iy);
merge_left(p, q, out_poly);
add_local_min(&out_poly, e0, ix, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
}
break;
case EMM:
if (p && q) {
add_left(p, ix, iy);
merge_right(p, q, out_poly);
add_local_min(&out_poly, e0, ix, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
}
break;
default:
break;
} // End of switch
} /* End of contributing intersection conditional */
/* Swap bundle sides in response to edge crossing */
if (e0->bundle[ABOVE][CLIP]) {
e1->bside[CLIP] = !e1->bside[CLIP];
}
if (e1->bundle[ABOVE][CLIP]) {
e0->bside[CLIP] = !e0->bside[CLIP];
}
if (e0->bundle[ABOVE][SUBJ]) {
e1->bside[SUBJ] = !e1->bside[SUBJ];
}
if (e1->bundle[ABOVE][SUBJ]) {
e0->bside[SUBJ] = !e0->bside[SUBJ];
}
/* Swap e0 and e1 bundles in the AET */
prev_edge = e0->prev;
next_edge = e1->next;
if (next_edge) {
next_edge->prev = e0;
}
if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
search = 1;
while (search) {
prev_edge = prev_edge->prev;
if (prev_edge) {
if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) {
search = 0;
}
} else {
search = 0;
}
}
}
if (!prev_edge) {
aet->prev = e1;
e1->next = aet;
aet = e0->next;
} else {
prev_edge->next->prev = e1;
e1->next = prev_edge->next;
prev_edge->next = e0->next;
}
e0->next->prev = prev_edge;
e1->next->prev = e1;
e0->next = next_edge;
} /* End of IT loop*/
// Prepare for next scanbeam
for (edge = aet; edge; edge = next_edge) {
next_edge = edge->next;
succ_edge = edge->succ;
if ((edge->top.y == yt) && succ_edge) {
/* Replace AET edge by its successor */
succ_edge->outp[BELOW] = edge->outp[ABOVE];
succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
prev_edge = edge->prev;
if (prev_edge) {
prev_edge->next = succ_edge;
} else {
aet = succ_edge;
}
if (next_edge) {
next_edge->prev = succ_edge;
}
succ_edge->prev = prev_edge;
succ_edge->next = next_edge;
} else {
/* Update this edge */
edge->outp[BELOW] = edge->outp[ABOVE];
edge->bstate[BELOW] = edge->bstate[ABOVE];
edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
edge->xb = edge->xt;
}
edge->outp[ABOVE] = NULL;
}
}
} /* === END OF SCANBEAM PROCESSING ================================== */
// Generate result polygon from out_poly
result->contour = NULL;
result->hole = NULL;
result->num_contours = count_contours(out_poly);
if (result->num_contours > 0) {
gpc_malloc<int>(result->hole, result->num_contours * sizeof(int),
const_cast<char *>("hole flag table creation"));
gpc_malloc<gpc_vertex_list>(result->contour,
result->num_contours * sizeof(gpc_vertex_list),
const_cast<char *>("contour creation"));
c = 0;
for (poly = out_poly; poly; poly = npoly) {
npoly = poly->next;
if (poly->active) {
result->hole[c] = poly->proxy->hole;
result->contour[c].num_vertices = poly->active;
gpc_malloc<gpc_vertex>(
result->contour[c].vertex,
result->contour[c].num_vertices * sizeof(gpc_vertex),
const_cast<char *>("vertex creation"));
v = result->contour[c].num_vertices - 1;
for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) {
nv = vtx->next;
result->contour[c].vertex[v].x = vtx->x;
result->contour[c].vertex[v].y = vtx->y;
gpc_free<vertex_node>(vtx);
v--;
}
c++;
}
gpc_free<polygon_node>(poly);
}
} else {
for (poly = out_poly; poly; poly = npoly) {
npoly = poly->next;
gpc_free<polygon_node>(poly);
}
}
// Tidy up
reset_it(&it);
reset_lmt(&lmt);
gpc_free<edge_node>(c_heap);
gpc_free<edge_node>(s_heap);
gpc_free<double>(sbt);
} // NOLINT
void gpc_free_tristrip(gpc_tristrip *t) {
int s = 0;
for (s = 0; s < t->num_strips; s++) {
gpc_free<gpc_vertex>(t->strip[s].vertex);
}
gpc_free<gpc_vertex_list>(t->strip);
t->num_strips = 0;
}
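// Triangulate a polygon by clipping it against an empty polygon: GPC_DIFF
// with an empty clip leaves the subject geometry unchanged, so the tristrip
// clipper simply decomposes s into triangle strips.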
void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) {
gpc_polygon c;
c.num_contours = 0;
c.hole = NULL;
c.contour = NULL;
gpc_tristrip_clip(GPC_DIFF, s, &c, t);
}
// gpc_tristrip_clip
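// Same scanbeam sweep as gpc_polygon_clip, but vertices are emitted into
// left/right triangle-strip lists (new_tristrip / gpc_vertex_create) and the
// strips collected in tlist become the resulting gpc_tristrip.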
void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
gpc_tristrip *result) {
sb_tree *sbtree = NULL;
it_node *it = NULL;
it_node *intersect = NULL;
edge_node *edge = NULL;
edge_node *prev_edge = NULL;
edge_node *next_edge = NULL;
edge_node *succ_edge = NULL;
edge_node *e0 = NULL;
edge_node *e1 = NULL;
edge_node *aet = NULL;
edge_node *c_heap = NULL;
edge_node *s_heap = NULL;
edge_node *cf = NULL;
lmt_node *lmt = NULL;
lmt_node *local_min = NULL;
polygon_node *tlist = NULL;
polygon_node *tn = NULL;
polygon_node *tnn = NULL;
polygon_node *p = NULL;
polygon_node *q = NULL;
vertex_node *lt = NULL;
vertex_node *ltn = NULL;
vertex_node *rt = NULL;
vertex_node *rtn = NULL;
h_state horiz[2];
vertex_type cft = NUL;
int in[2];
int exists[2];
int parity[2] = {LEFT, LEFT};
int s = 0;
int v = 0;
int contributing = 0;
int search = 0;
int scanbeam = 0;
int sbt_entries = 0;
int vclass = 0;
int bl = 0;
int br = 0;
int tl = 0;
int tr = 0;
double *sbt = NULL;
double xb = 0.0;
double px = 0.0;
double nx = 0.0;
double yb = 0.0;
double yt = 0.0;
double dy = 0.0;
double ix = 0.0;
double iy = 0.0;
/* Test for trivial NULL result cases */
if (((subj->num_contours == 0) && (clip->num_contours == 0)) ||
((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) ||
((clip->num_contours == 0) && (op == GPC_INT))) {
result->num_strips = 0;
result->strip = NULL;
return;
}
  /* Identify potentially contributing contours */
if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) &&
(clip->num_contours > 0)) {
minimax_test(subj, clip, op);
}
/* Build LMT */
if (subj->num_contours > 0) {
s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op);
}
if (clip->num_contours > 0) {
c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op);
}
/* Return a NULL result if no contours contribute */
if (lmt == NULL) {
result->num_strips = 0;
result->strip = NULL;
reset_lmt(&lmt);
gpc_free<edge_node>(s_heap);
gpc_free<edge_node>(c_heap);
return;
}
/* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation"));
build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0;
free_sbtree(&sbtree);
/* Invert clip polygon for difference operation */
if (op == GPC_DIFF) {
parity[CLIP] = RIGHT;
}
local_min = lmt;
// Process each scanbeam
while (scanbeam < sbt_entries) {
/* Set yb and yt to the bottom and top of the scanbeam */
yb = sbt[scanbeam++];
if (scanbeam < sbt_entries) {
yt = sbt[scanbeam];
dy = yt - yb;
}
/* === SCANBEAM BOUNDARY PROCESSING ================================ */
/* If LMT node corresponding to yb exists */
if (local_min) {
if (local_min->y == yb) {
/* Add edges starting at this local minimum to the AET */
for (edge = local_min->first_bound; edge; edge = edge->next_bound) {
add_edge_to_aet(&aet, edge, NULL);
}
local_min = local_min->next;
}
}
/* Set dummy previous x value */
/* Create bundles within AET */
px = -DBL_MAX;
e0 = aet;
e1 = aet;
/* Set up bundle fields of first edge */
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED;
for (next_edge = aet->next; next_edge; next_edge = next_edge->next) {
/* Set up bundle fields of next edge */
next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb);
next_edge->bundle[ABOVE][!next_edge->type] = 0;
next_edge->bstate[ABOVE] = UNBUNDLED;
/* Bundle edges above the scanbeam boundary if they coincide */
if (next_edge->bundle[ABOVE][next_edge->type]) {
if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) &&
(e0->top.y != yb)) {
next_edge->bundle[ABOVE][next_edge->type] ^=
e0->bundle[ABOVE][next_edge->type];
next_edge->bundle[ABOVE][!next_edge->type] =
e0->bundle[ABOVE][!next_edge->type];
next_edge->bstate[ABOVE] = BUNDLE_HEAD;
e0->bundle[ABOVE][CLIP] = 0;
e0->bundle[ABOVE][SUBJ] = 0;
e0->bstate[ABOVE] = BUNDLE_TAIL;
}
e0 = next_edge;
}
}
horiz[CLIP] = NH;
horiz[SUBJ] = NH;
/* Process each edge at this scanbeam boundary */
for (edge = aet; edge; edge = edge->next) {
exists[CLIP] =
edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1);
exists[SUBJ] =
edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1);
if (exists[CLIP] || exists[SUBJ]) {
/* Set bundle side */
edge->bside[CLIP] = parity[CLIP];
edge->bside[SUBJ] = parity[SUBJ];
/* Determine contributing status and quadrant occupancies */
switch (op) {
case GPC_DIFF:
case GPC_INT:
contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) ||
(exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) ||
(exists[CLIP] && exists[SUBJ] &&
(parity[CLIP] == parity[SUBJ]));
br = (parity[CLIP]) && (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) &&
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) &&
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) &&
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
case GPC_XOR:
contributing = exists[CLIP] || exists[SUBJ];
br = (parity[CLIP]) ^ (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) ^
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
case GPC_UNION:
contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) ||
(exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) ||
(exists[CLIP] && exists[SUBJ] &&
(parity[CLIP] == parity[SUBJ]));
br = (parity[CLIP]) || (parity[SUBJ]);
bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ||
(parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]);
tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ||
(parity[SUBJ] ^ (horiz[SUBJ] != NH));
tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^
edge->bundle[BELOW][CLIP]) ||
(parity[SUBJ] ^ (horiz[SUBJ] != NH) ^
edge->bundle[BELOW][SUBJ]);
break;
}
// Update parity
parity[CLIP] ^= edge->bundle[ABOVE][CLIP];
parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ];
/* Update horizontal state */
if (exists[CLIP]) {
horiz[CLIP] = next_h_state[horiz[CLIP]]
[((exists[CLIP] - 1) << 1) + parity[CLIP]];
}
if (exists[SUBJ]) {
horiz[SUBJ] = next_h_state[horiz[SUBJ]]
[((exists[SUBJ] - 1) << 1) + parity[SUBJ]];
}
vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
if (contributing) {
xb = edge->xb;
switch (vclass) {
case EMN:
new_tristrip(&tlist, edge, xb, yb);
cf = edge;
break;
case ERI:
edge->outp[ABOVE] = cf->outp[ABOVE];
if (xb != cf->xb) {
gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
}
cf = NULL;
break;
case ELI:
gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
edge->outp[ABOVE] = NULL;
cf = edge;
break;
case EMX:
if (xb != cf->xb) {
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
}
edge->outp[ABOVE] = NULL;
cf = NULL;
break;
case IMN:
if (cft == LED) {
if (cf->bot.y != yb) {
gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
}
new_tristrip(&tlist, cf, cf->xb, yb);
}
edge->outp[ABOVE] = cf->outp[ABOVE];
gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
break;
case ILI:
new_tristrip(&tlist, edge, xb, yb);
cf = edge;
cft = ILI;
break;
case IRI:
if (cft == LED) {
if (cf->bot.y != yb) {
gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
}
new_tristrip(&tlist, cf, cf->xb, yb);
}
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
edge->outp[ABOVE] = NULL;
break;
case IMX:
gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
edge->outp[ABOVE] = NULL;
cft = IMX;
break;
case IMM:
gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
edge->outp[ABOVE] = cf->outp[ABOVE];
if (xb != cf->xb) {
gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb);
}
cf = edge;
break;
case EMM:
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
edge->outp[ABOVE] = NULL;
new_tristrip(&tlist, edge, xb, yb);
cf = edge;
break;
case LED:
if (edge->bot.y == yb) {
gpc_vertex_create(edge, BELOW, LEFT, xb, yb);
}
edge->outp[ABOVE] = edge->outp[BELOW];
cf = edge;
cft = LED;
break;
case RED:
edge->outp[ABOVE] = cf->outp[ABOVE];
if (cft == LED) {
if (cf->bot.y == yb) {
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
} else {
if (edge->bot.y == yb) {
gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb);
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
}
}
} else {
gpc_vertex_create(edge, BELOW, RIGHT, xb, yb);
gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb);
}
cf = NULL;
break;
default:
break;
} /* End of switch */
} /* End of contributing conditional */
} /* End of edge exists conditional */
} // End of AET loop
/* Delete terminating edges from the AET, otherwise compute xt */
for (edge = aet; edge; edge = edge->next) {
if (edge->top.y == yb) {
prev_edge = edge->prev;
next_edge = edge->next;
if (prev_edge) {
prev_edge->next = next_edge;
} else {
aet = next_edge;
}
if (next_edge) {
next_edge->prev = prev_edge;
}
/* Copy bundle head state to the adjacent tail edge if required */
if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) {
if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) {
prev_edge->outp[BELOW] = edge->outp[BELOW];
prev_edge->bstate[BELOW] = UNBUNDLED;
if (prev_edge->prev) {
if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) {
prev_edge->bstate[BELOW] = BUNDLE_HEAD;
}
}
}
}
} else {
if (edge->top.y == yt) {
edge->xt = edge->top.x;
} else {
edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y);
}
}
}
if (scanbeam < sbt_entries) {
/* === SCANBEAM INTERIOR PROCESSING ============================== */
build_intersection_table(&it, aet, dy);
/* Process each node in the intersection table */
for (intersect = it; intersect; intersect = intersect->next) {
e0 = intersect->ie[0];
e1 = intersect->ie[1];
/* Only generate output for contributing intersections */
if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) &&
(e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) {
p = e0->outp[ABOVE];
q = e1->outp[ABOVE];
ix = intersect->point.x;
iy = intersect->point.y + yb;
in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) ||
(e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) ||
(!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] &&
e0->bside[CLIP] && e1->bside[CLIP]);
in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) ||
(e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) ||
(!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] &&
e0->bside[SUBJ] && e1->bside[SUBJ]);
switch (op) { // Determine quadrant occupancies
case GPC_DIFF:
case GPC_INT:
tr = (in[CLIP]) && (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) &&
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
case GPC_XOR:
tr = (in[CLIP]) ^ (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) ^
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
case GPC_UNION:
tr = (in[CLIP]) || (in[SUBJ]);
tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]);
br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]);
bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^
e0->bundle[ABOVE][CLIP]) ||
(in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^
e0->bundle[ABOVE][SUBJ]);
break;
}
vclass = tr + (tl << 1) + (br << 2) + (bl << 3);
switch (vclass) {
case EMN:
new_tristrip(&tlist, e1, ix, iy);
e0->outp[ABOVE] = e1->outp[ABOVE];
break;
case ERI:
if (p) {
gpc_p_edge(prev_edge, e0, ABOVE);
gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
e0->outp[ABOVE] = NULL;
}
break;
case ELI:
if (q) {
gpc_n_edge(next_edge, e1, ABOVE);
gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
e0->outp[ABOVE] = e1->outp[ABOVE];
e1->outp[ABOVE] = NULL;
}
break;
case EMX:
if (p && q) {
gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
e0->outp[ABOVE] = NULL;
e1->outp[ABOVE] = NULL;
}
break;
case IMN:
gpc_p_edge(prev_edge, e0, ABOVE);
gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
gpc_n_edge(next_edge, e1, ABOVE);
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
new_tristrip(&tlist, prev_edge, px, iy);
e1->outp[ABOVE] = prev_edge->outp[ABOVE];
gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
new_tristrip(&tlist, e0, ix, iy);
next_edge->outp[ABOVE] = e0->outp[ABOVE];
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
break;
case ILI:
if (p) {
gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
gpc_n_edge(next_edge, e1, ABOVE);
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
e1->outp[ABOVE] = e0->outp[ABOVE];
e0->outp[ABOVE] = NULL;
}
break;
case IRI:
if (q) {
gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
gpc_p_edge(prev_edge, e0, ABOVE);
gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
e0->outp[ABOVE] = e1->outp[ABOVE];
e1->outp[ABOVE] = NULL;
}
break;
case IMX:
if (p && q) {
gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
e0->outp[ABOVE] = NULL;
e1->outp[ABOVE] = NULL;
gpc_p_edge(prev_edge, e0, ABOVE);
gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
new_tristrip(&tlist, prev_edge, px, iy);
gpc_n_edge(next_edge, e1, ABOVE);
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
next_edge->outp[ABOVE] = prev_edge->outp[ABOVE];
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
}
break;
case IMM:
if (p && q) {
gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy);
gpc_vertex_create(e1, ABOVE, LEFT, ix, iy);
gpc_p_edge(prev_edge, e0, ABOVE);
gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy);
new_tristrip(&tlist, prev_edge, px, iy);
gpc_n_edge(next_edge, e1, ABOVE);
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
e1->outp[ABOVE] = prev_edge->outp[ABOVE];
gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy);
new_tristrip(&tlist, e0, ix, iy);
next_edge->outp[ABOVE] = e0->outp[ABOVE];
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy);
}
break;
case EMM:
if (p && q) {
gpc_vertex_create(e0, ABOVE, LEFT, ix, iy);
new_tristrip(&tlist, e1, ix, iy);
e0->outp[ABOVE] = e1->outp[ABOVE];
}
break;
default:
break;
} /* End of switch */
} /* End of contributing intersection conditional */
// Swap bundle sides in response to edge crossing
if (e0->bundle[ABOVE][CLIP]) {
e1->bside[CLIP] = !e1->bside[CLIP];
}
if (e1->bundle[ABOVE][CLIP]) {
e0->bside[CLIP] = !e0->bside[CLIP];
}
if (e0->bundle[ABOVE][SUBJ]) {
e1->bside[SUBJ] = !e1->bside[SUBJ];
}
if (e1->bundle[ABOVE][SUBJ]) {
e0->bside[SUBJ] = !e0->bside[SUBJ];
}
/* Swap e0 and e1 bundles in the AET */
prev_edge = e0->prev;
next_edge = e1->next;
if (e1->next) {
e1->next->prev = e0;
}
if (e0->bstate[ABOVE] == BUNDLE_HEAD) {
search = 1;
while (search) {
prev_edge = prev_edge->prev;
if (prev_edge) {
if (prev_edge->bundle[ABOVE][CLIP] ||
prev_edge->bundle[ABOVE][SUBJ] ||
(prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) {
search = 0;
}
} else {
search = 0;
}
}
}
if (!prev_edge) {
e1->next = aet;
aet = e0->next;
} else {
e1->next = prev_edge->next;
prev_edge->next = e0->next;
}
e0->next->prev = prev_edge;
e1->next->prev = e1;
e0->next = next_edge;
} /* End of IT loop*/
/* Prepare for next scanbeam */
for (edge = aet; edge; edge = next_edge) {
next_edge = edge->next;
succ_edge = edge->succ;
if ((edge->top.y == yt) && succ_edge) {
/* Replace AET edge by its successor */
succ_edge->outp[BELOW] = edge->outp[ABOVE];
succ_edge->bstate[BELOW] = edge->bstate[ABOVE];
succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
prev_edge = edge->prev;
if (prev_edge) {
prev_edge->next = succ_edge;
} else {
aet = succ_edge;
}
if (next_edge) {
next_edge->prev = succ_edge;
}
succ_edge->prev = prev_edge;
succ_edge->next = next_edge;
} else {
/* Update this edge */
edge->outp[BELOW] = edge->outp[ABOVE];
edge->bstate[BELOW] = edge->bstate[ABOVE];
edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP];
edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ];
edge->xb = edge->xt;
}
edge->outp[ABOVE] = NULL;
}
}
} /* === END OF SCANBEAM PROCESSING ================================== */
// Generate result tristrip from tlist
result->strip = NULL;
result->num_strips = count_tristrips(tlist);
if (result->num_strips > 0) {
gpc_malloc<gpc_vertex_list>(result->strip,
result->num_strips * sizeof(gpc_vertex_list),
const_cast<char *>("tristrip list creation"));
s = 0;
for (tn = tlist; tn; tn = tnn) {
tnn = tn->next;
if (tn->active > 2) {
/* Valid tristrip: copy the vertices and free the heap */
result->strip[s].num_vertices = tn->active;
gpc_malloc<gpc_vertex>(result->strip[s].vertex,
tn->active * sizeof(gpc_vertex),
const_cast<char *>("tristrip creation"));
v = 0;
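          /* The original GPC guards this swap behind its INVERT_TRISTRIPS
             option; it is hard-wired off here, so the LEFT list always
             leads. */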
if (0) {
lt = tn->v[RIGHT];
rt = tn->v[LEFT];
} else {
lt = tn->v[LEFT];
rt = tn->v[RIGHT];
}
while (lt || rt) {
if (lt) {
ltn = lt->next;
result->strip[s].vertex[v].x = lt->x;
result->strip[s].vertex[v].y = lt->y;
v++;
gpc_free<vertex_node>(lt);
lt = ltn;
}
if (rt) {
rtn = rt->next;
result->strip[s].vertex[v].x = rt->x;
result->strip[s].vertex[v].y = rt->y;
v++;
gpc_free<vertex_node>(rt);
rt = rtn;
}
}
s++;
} else {
/* Invalid tristrip: just free the heap */
for (lt = tn->v[LEFT]; lt; lt = ltn) {
ltn = lt->next;
gpc_free<vertex_node>(lt);
}
for (rt = tn->v[RIGHT]; rt; rt = rtn) {
rtn = rt->next;
gpc_free<vertex_node>(rt);
}
}
gpc_free<polygon_node>(tn);
}
}
// Tidy up
reset_it(&it);
reset_lmt(&lmt);
gpc_free<edge_node>(c_heap);
gpc_free<edge_node>(s_heap);
gpc_free<double>(sbt);
} // NOLINT
} // namespace gpc
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/***************************************************************************
*
* Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved
*
**************************************************************************/
/**
* @file include/gpc.h
* @author huhan02(com@baidu.com)
* @date 2015/12/18 13:52:10
* @brief
*
* @modified by sunyipeng
* @email sunyipeng@baidu.com
* @date 2018/6/12
**/
#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_
#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
namespace gpc {
typedef enum { // Set operation type
GPC_DIFF, // Difference
GPC_INT, // Intersection
GPC_XOR, // Exclusive or
GPC_UNION // Union
} gpc_op;
typedef struct { // Polygon vertex structure
double x; // Vertex x component
double y; // vertex y component
} gpc_vertex;
typedef struct { // Vertex list structure
int num_vertices; // Number of vertices in list
gpc_vertex *vertex; // Vertex array pointer
} gpc_vertex_list;
typedef struct { // Polygon set structure
int num_contours; // Number of contours in polygon
int *hole; // Hole external contour flags
gpc_vertex_list *contour; // Contour array pointer
} gpc_polygon;
typedef struct { // Tristrip set structure
int num_strips; // Number of tristrips
gpc_vertex_list *strip; // Tristrip array pointer
} gpc_tristrip;
typedef enum { LEFT, RIGHT } gpc_left_right;
typedef enum { ABOVE, BELOW } gpc_above_below;
typedef enum { CLIP, SUBJ } gpc_clip_subj;
typedef enum { /* Edge intersection classes */
NUL, /* Empty non-intersection */
EMX, /* External maximum */
ELI, /* External left intermediate */
TED, /* Top edge */
ERI, /* External right intermediate */
RED, /* Right edge */
IMM, /* Internal maximum and minimum */
IMN, /* Internal minimum */
EMN, /* External minimum */
EMM, /* External maximum and minimum */
LED, /* Left edge */
ILI, /* Internal left intermediate */
BED, /* Bottom edge */
IRI, /* Internal right intermediate */
IMX, /* Internal maximum */
FUL /* Full non-intersection */
} vertex_type;
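/* The clippers compute vclass = tr + (tl << 1) + (br << 2) + (bl << 3), so
   each enumerator's ordinal encodes which quadrants around the vertex are
   occupied (e.g. EMN == 8 corresponds to bl only). */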
typedef enum { /* Horizontal edge states */
NH, /* No horizontal edge */
BH, /* Bottom horizontal edge */
TH /* Top horizontal edge */
} h_state;
typedef enum { /* Edge bundle state */
UNBUNDLED, /* Isolated edge not within a bundle */
BUNDLE_HEAD, /* Bundle head node */
BUNDLE_TAIL /* Passive bundle tail node */
} bundle_state;
typedef struct v_shape { /* Internal vertex list datatype */
double x; /* X coordinate component */
double y; /* Y coordinate component */
struct v_shape *next; /* Pointer to next vertex in list */
} vertex_node;
typedef struct p_shape { /* Internal contour / tristrip type */
int active; /* Active flag / vertex count */
int hole; /* Hole / external contour flag */
vertex_node *v[2]; /* Left and right vertex list ptrs */
struct p_shape *next; /* Pointer to next polygon contour */
struct p_shape *proxy; /* Pointer to actual structure used */
} polygon_node;
typedef struct edge_shape {
gpc_vertex vertex; /* Piggy-backed contour vertex data */
gpc_vertex bot; /* Edge lower (x, y) coordinate */
gpc_vertex top; /* Edge upper (x, y) coordinate */
double xb; /* Scanbeam bottom x coordinate */
double xt; /* Scanbeam top x coordinate */
double dx; /* Change in x for a unit y increase */
int type; /* Clip / subject edge flag */
int bundle[2][2]; /* Bundle edge flags */
int bside[2]; /* Bundle left / right indicators */
bundle_state bstate[2]; /* Edge bundle state */
polygon_node *outp[2]; /* Output polygon / tristrip pointer */
struct edge_shape *prev; /* Previous edge in the AET */
struct edge_shape *next; /* Next edge in the AET */
struct edge_shape *pred; /* Edge connected at the lower end */
struct edge_shape *succ; /* Edge connected at the upper end */
struct edge_shape *next_bound; /* Pointer to next bound in LMT */
} edge_node;
inline bool gpc_eq(double a, double b) { return (fabs(a - b) <= 1e-6); }
// Note: this float overload mirrors gpc_eq (an approximate-equality test)
// rather than computing a previous index.
inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); }
inline int gpc_next_index(int i, int n) { return ((i + 1) % n); }
inline int gpc_optimal(gpc_vertex *v, int i, int n) {
return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y);
}
inline int gpc_fwd_min(edge_node *v, int i, int n) {
return (v[(i + 1) % n].vertex.y > v[i].vertex.y &&
v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y);
}
inline int gpc_not_fmax(edge_node *v, int i, int n) {
return (v[(i + 1) % n].vertex.y > v[i].vertex.y);
}
inline int gpc_rev_min(edge_node *v, int i, int n) {
return (v[(i + 1) % n].vertex.y >= v[i].vertex.y &&
v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
}
inline int gpc_not_rmax(edge_node *v, int i, int n) {
return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y);
}
// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j)
// {
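// Note: unlike the original P_EDGE macro, d is received by value and the i
// computation below stays commented out, so the caller's prev_edge pointer
// and px value are not updated by this helper.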
inline void gpc_p_edge(edge_node *d, edge_node *e, int p) {
d = e;
do {
d = d->prev;
} while (!d->outp[p]);
// i = d->bot.x + d->dx * (j - d->bot.y);
}
// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j)
// {
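// Note: as with gpc_p_edge, d is received by value and the i computation is
// omitted, so the caller's next_edge pointer and nx value are not updated.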
inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
d = e;
do {
d = d->next;
} while (!d->outp[p]);
// i = d->bot.x + d->dx * (j - d->bot.y);
}
template <typename T>
void gpc_malloc(T *&p, int b, char *s) {
if (b > 0) {
p = (T *)malloc(b);
if (!p) {
fprintf(stderr, "gpc malloc failure: %s\n", s);
exit(0);
}
} else {
p = NULL;
}
}
template <typename T>
void gpc_free(T *&p) {
if (p) {
free(p);
p = NULL;
}
}
/*
===========================================================================
Public Function Prototypes
===========================================================================
*/
void add_vertex(vertex_node **t, double x, double y);
void gpc_vertex_create(edge_node *e, int p, int s, double x, double y);
/*
void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags,
gpc_polygon *polygon);
void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags,
gpc_polygon *polygon);
*/
void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole);
void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
gpc_polygon *clip_polygon, gpc_polygon *result_polygon);
void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon,
gpc_polygon *clip_polygon,
gpc_tristrip *result_tristrip);
void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip);
void gpc_free_polygon(gpc_polygon *polygon);
void gpc_free_tristrip(gpc_tristrip *tristrip);
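/* Usage sketch (illustrative only): intersect two quads and free everything.
   PolyOverlapArea in poly_util.cc follows the same clip-then-free pattern.

   gpc_vertex subj_pts[4] = {{0, 0}, {4, 0}, {4, 4}, {0, 4}};
   gpc_vertex clip_pts[4] = {{2, 2}, {6, 2}, {6, 6}, {2, 6}};
   gpc_vertex_list subj_contour = {4, subj_pts};
   gpc_vertex_list clip_contour = {4, clip_pts};
   gpc_polygon subject = {0, NULL, NULL};
   gpc_polygon clip = {0, NULL, NULL};
   gpc_polygon result;
   gpc_add_contour(&subject, &subj_contour, 0);  // 0 = external contour
   gpc_add_contour(&clip, &clip_contour, 0);
   gpc_polygon_clip(GPC_INT, &subject, &clip, &result);  // 2 x 2 overlap square
   gpc_free_polygon(&subject);
   gpc_free_polygon(&clip);
   gpc_free_polygon(&result);
*/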
} // namespace gpc
#endif // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_
/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
...@@ -9,10 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0 ...@@ -9,10 +9,11 @@ http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/poly_util.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -20,9 +21,6 @@ namespace operators { ...@@ -20,9 +21,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
constexpr int64_t kOutputDim = 6;
constexpr int64_t kBBoxSize = 4;
class MultiClassNMSOp : public framework::OperatorWithKernel { class MultiClassNMSOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -42,10 +40,15 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { ...@@ -42,10 +40,15 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
"The rank of Input(BBoxes) must be 3."); "The rank of Input(BBoxes) must be 3.");
PADDLE_ENFORCE_EQ(score_dims.size(), 3, PADDLE_ENFORCE_EQ(score_dims.size(), 3,
"The rank of Input(Scores) must be 3."); "The rank of Input(Scores) must be 3.");
PADDLE_ENFORCE_EQ(box_dims[2], 4, PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 ||
"The 2nd dimension of Input(BBoxes) must be 4, " box_dims[2] == 24 || box_dims[2] == 32,
"represents the layout of coordinate " "The 2nd dimension of Input(BBoxes) must be 4 or 8, "
"[xmin, ymin, xmax, ymax]"); "represents the layout of coordinate "
"[xmin, ymin, xmax, ymax] or "
"4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
"8 points: [xi, yi] i= 1,2,...,8 or "
"12 points: [xi, yi] i= 1,2,...,12 or "
"16 points: [xi, yi] i= 1,2,...,16");
PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2], PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
"The 1st dimensiong of Input(BBoxes) must be equal to " "The 1st dimensiong of Input(BBoxes) must be equal to "
"3rd dimension of Input(Scores), which represents the " "3rd dimension of Input(Scores), which represents the "
...@@ -53,7 +56,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { ...@@ -53,7 +56,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
// Here the box_dims[0] is not the real dimension of output. // Here the box_dims[0] is not the real dimension of output.
// It will be rewritten in the computing kernel. // It will be rewritten in the computing kernel.
ctx->SetOutputDim("Out", {box_dims[1], 6}); ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
} }
protected: protected:
...@@ -128,6 +131,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2, ...@@ -128,6 +131,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
} }
} }
template <class T>
T PolyIoU(const T* box1, const T* box2, const size_t box_size,
const bool normalized) {
T bbox1_area = PolyArea<T>(box1, box_size, normalized);
T bbox2_area = PolyArea<T>(box2, box_size, normalized);
T inter_area = PolyOverlapArea<T>(box1, box2, box_size, normalized);
if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
    // If the coordinate values are invalid or any area is <= 0, return 0.
return T(0.);
} else {
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T> template <typename T>
class MultiClassNMSKernel : public framework::OpKernel<T> { class MultiClassNMSKernel : public framework::OpKernel<T> {
public: public:
...@@ -137,6 +155,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -137,6 +155,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
// The total boxes for each instance. // The total boxes for each instance.
int64_t num_boxes = bbox.dims()[0]; int64_t num_boxes = bbox.dims()[0];
// 4: [xmin ymin xmax ymax] // 4: [xmin ymin xmax ymax]
// 8: [x1 y1 x2 y2 x3 y3 x4 y4]
// 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16
int64_t box_size = bbox.dims()[1]; int64_t box_size = bbox.dims()[1];
std::vector<T> scores_data(num_boxes); std::vector<T> scores_data(num_boxes);
...@@ -154,8 +174,19 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -154,8 +174,19 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
for (size_t k = 0; k < selected_indices->size(); ++k) { for (size_t k = 0; k < selected_indices->size(); ++k) {
if (keep) { if (keep) {
const int kept_idx = (*selected_indices)[k]; const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size, T overlap = T(0.);
// 4: [xmin ymin xmax ymax]
if (box_size == 4) {
overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true); bbox_data + kept_idx * box_size, true);
}
// 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
if (box_size == 8 || box_size == 16 || box_size == 24 ||
box_size == 32) {
overlap =
PolyIoU<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, box_size, true);
}
keep = overlap <= adaptive_threshold; keep = overlap <= adaptive_threshold;
} else { } else {
break; break;
...@@ -228,7 +259,9 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -228,7 +259,9 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
const std::map<int, std::vector<int>>& selected_indices, const std::map<int, std::vector<int>>& selected_indices,
Tensor* outs) const { Tensor* outs) const {
int predict_dim = scores.dims()[1]; int64_t predict_dim = scores.dims()[1];
int64_t box_size = bboxes.dims()[1];
int64_t out_dim = bboxes.dims()[1] + 2;
auto* scores_data = scores.data<T>(); auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>(); auto* bboxes_data = bboxes.data<T>();
auto* odata = outs->data<T>(); auto* odata = outs->data<T>();
...@@ -240,11 +273,11 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -240,11 +273,11 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
const std::vector<int>& indices = it.second; const std::vector<int>& indices = it.second;
for (size_t j = 0; j < indices.size(); ++j) { for (size_t j = 0; j < indices.size(); ++j) {
int idx = indices[j]; int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize; const T* bdata = bboxes_data + idx * box_size;
odata[count * kOutputDim] = label; // label odata[count * out_dim] = label; // label
odata[count * kOutputDim + 1] = sdata[idx]; // score odata[count * out_dim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax // xmin, ymin, xmax, ymax or multi-points coordinates
std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
count++; count++;
} }
} }
...@@ -261,6 +294,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -261,6 +294,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
int64_t class_num = score_dims[1]; int64_t class_num = score_dims[1];
int64_t predict_dim = score_dims[2]; int64_t predict_dim = score_dims[2];
int64_t box_dim = boxes->dims()[2]; int64_t box_dim = boxes->dims()[2];
int64_t out_dim = boxes->dims()[2] + 2;
std::vector<std::map<int, std::vector<int>>> all_indices; std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0}; std::vector<size_t> batch_starts = {0};
...@@ -283,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> { ...@@ -283,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
T* od = outs->mutable_data<T>({1}, ctx.GetPlace()); T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
od[0] = -1; od[0] = -1;
} else { } else {
outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace()); outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
for (int64_t i = 0; i < batch_size; ++i) { for (int64_t i = 0; i < batch_size; ++i) {
Tensor ins_score = scores->Slice(i, i + 1); Tensor ins_score = scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim}); ins_score.Resize({class_num, predict_dim});
...@@ -311,10 +345,11 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -311,10 +345,11 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("BBoxes", AddInput("BBoxes",
"(Tensor) A 3-D Tensor with shape [N, M, 4] represents the " "(Tensor) A 3-D Tensor with shape "
"[N, M, 4 or 8 16 24 32] represents the "
"predicted locations of M bounding bboxes, N is the batch size. " "predicted locations of M bounding bboxes, N is the batch size. "
"Each bounding box has four coordinate values and the layout is " "Each bounding box has four coordinate values and the layout is "
"[xmin, ymin, xmax, ymax]."); "[xmin, ymin, xmax, ymax], when box size equals to 4.");
AddInput("Scores", AddInput("Scores",
"(Tensor) A 3-D Tensor with shape [N, C, M] represents the " "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
"predicted confidence predictions. N is the batch size, C is the " "predicted confidence predictions. N is the batch size, C is the "
...@@ -351,8 +386,12 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -351,8 +386,12 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", AddOutput("Out",
"(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
"detections. Each row has 6 values: " "detections. Each row has 6 values: "
"[label, confidence, xmin, ymin, xmax, ymax], No is the total " "[label, confidence, xmin, ymin, xmax, ymax] or "
"number of detections in this mini-batch. For each instance, " "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the "
"detections. Each row has 10 values: "
"[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the "
"total number of detections in this mini-batch."
"For each instance, "
"the offsets in first dimension are called LoD, the number of " "the offsets in first dimension are called LoD, the number of "
"offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
"no detected bbox."); "no detected bbox.");
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef POLY_UTIL_CC_
#define POLY_UTIL_CC_
#include "paddle/fluid/operators/detection/poly_util.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using gpc::gpc_polygon_clip;
using gpc::gpc_free_polygon;
template <class T>
void Array2PointVec(const T*& box, const size_t box_size,
std::vector<Point_<T>>& vec) {
size_t pts_num = box_size / 2;
vec.resize(pts_num);
for (size_t i = 0; i < pts_num; i++) {
vec.at(i).x = box[2 * i];
vec.at(i).y = box[2 * i + 1];
}
}
template <class T>
void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) {
size_t pts_num = box_size / 2;
poly.num_contours = 1;
poly.hole = (int*)malloc(sizeof(int));
poly.hole[0] = 0;
poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
poly.contour->num_vertices = pts_num;
poly.contour->vertex =
(gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
for (size_t i = 0; i < pts_num; ++i) {
poly.contour->vertex[i].x = box[2 * i];
poly.contour->vertex[i].y = box[2 * i + 1];
}
}
template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly) {
int pts_num = vec.size();
poly.num_contours = 1;
poly.hole = (int*)malloc(sizeof(int));
poly.hole[0] = 0;
poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
poly.contour->num_vertices = pts_num;
poly.contour->vertex =
(gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
for (size_t i = 0; i < pts_num; ++i) {
poly.contour->vertex[i].x = vec[i].x;
poly.contour->vertex[i].y = vec[i].y;
}
}
template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
std::vector<Point_<T>>& vec) {
int pts_num = contour.num_vertices;
vec.resize(pts_num);
for (int i = 0; i < pts_num; i++) {
vec.at(i).x = contour.vertex[i].x;
vec.at(i).y = contour.vertex[i].y;
}
}
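// GetContourArea evaluates the shoelace formula
//   area = |sum_i (x_i * y_(i+1) - y_i * x_(i+1))| / 2
// over the closed contour, so the result is non-negative for either winding.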
template <class T>
T GetContourArea(std::vector<Point_<T>>& vec) {
size_t pts_num = vec.size();
if (pts_num < 3) return T(0.);
T area = T(0.);
for (size_t i = 0; i < pts_num; ++i) {
area += vec[i].x * vec[(i + 1) % pts_num].y -
vec[i].y * vec[(i + 1) % pts_num].x;
}
return std::fabs(area / 2.0);
}
template <class T>
T PolyArea(const T* box, const size_t box_size, const bool normalized) {
  // If the coordinate values are invalid or the area is <= 0, return 0.
std::vector<Point_<T>> vec;
Array2PointVec<T>(box, box_size, vec);
return GetContourArea<T>(vec);
}
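// PolyOverlapArea clips the two boxes against each other with GPC_INT and
// sums the area of every contour in the intersection, which also handles the
// case where the overlap splits into several pieces.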
template <class T>
T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
const bool normalized) {
gpc::gpc_polygon poly1;
gpc::gpc_polygon poly2;
Array2Poly<T>(box1, box_size, poly1);
Array2Poly<T>(box2, box_size, poly2);
gpc::gpc_polygon respoly;
gpc::gpc_op op = gpc::GPC_INT;
gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
T inter_area = T(0.);
int contour_num = respoly.num_contours;
for (int i = 0; i < contour_num; ++i) {
std::vector<Point_<T>> resvec;
Poly2PointVec<T>(respoly.contour[i], resvec);
// inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f *
// (cv::arcLength(resvec, true));
inter_area += GetContourArea<T>(resvec);
}
gpc::gpc_free_polygon(&poly1);
gpc::gpc_free_polygon(&poly2);
gpc::gpc_free_polygon(&respoly);
return inter_area;
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef POLY_UTIL_H_
#define POLY_UTIL_H_
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/gpc.h"
namespace paddle {
namespace operators {
template <class T>
class Point_ {
public:
  // constructors: initialize the coordinates instead of leaving them unset
  Point_() : x(T(0)), y(T(0)) {}
  Point_(T _x, T _y) : x(_x), y(_y) {}
  Point_(const Point_& pt) : x(pt.x), y(pt.y) {}
  Point_& operator=(const Point_& pt) = default;
// conversion to another data type
// template<typename _T> operator Point_<_T>() const;
// conversion to the old-style C structures
// operator Vec<T, 2>() const;
// checks whether the point is inside the specified rectangle
// bool inside(const Rect_<T>& r) const;
T x; //!< x coordinate of the point
T y; //!< y coordinate of the point
};
template <class T>
void Array2PointVec(const T*& box, const size_t box_size,
std::vector<Point_<T>>& vec);
template <class T>
void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly);
template <class T>
void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon& poly);
template <class T>
void Poly2PointVec(const gpc::gpc_vertex_list& contour,
std::vector<Point_<T>>& vec);
template <class T>
T GetContourArea(std::vector<Point_<T>>& vec);
template <class T>
T PolyArea(const T* box, const size_t box_size, const bool normalized);
template <class T>
T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size,
const bool normalized);
} // namespace operators
} // namespace paddle
#include "paddle/fluid/operators/detection/poly_util.cc"
#endif // POLY_UTIL_H_
...@@ -41,9 +41,9 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> { ...@@ -41,9 +41,9 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
for (int id_w = 0; id_w < width; ++id_w) { for (int id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w; id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) { if (id_n % 2 == 0) {
out_data[id] = id_w - in_data[id]; out_data[id] = id_w * 4 - in_data[id];
} else { } else {
out_data[id] = id_h - in_data[id]; out_data[id] = id_h * 4 - in_data[id];
} }
} }
} }
......
...@@ -32,9 +32,9 @@ __global__ void PolygonBoxTransformKernel(const int n, const int h, const int w, ...@@ -32,9 +32,9 @@ __global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
if (id_n < n && id_h < h && id_w < w) { if (id_n < n && id_h < h && id_w < w) {
int id = id_n * h * w + w * id_h + id_w; int id = id_n * h * w + w * id_h + id_w;
if (id_n % 2 == 0) { if (id_n % 2 == 0) {
output[id] = id_w - input[id]; output[id] = id_w * 4 - input[id];
} else { } else {
output[id] = id_h - input[id]; output[id] = id_h * 4 - input[id];
} }
} }
} }
......
...@@ -86,7 +86,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -86,7 +86,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
platform::RecordEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
...@@ -143,7 +143,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -143,7 +143,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
...@@ -191,7 +191,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -191,7 +191,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method, p_ctx);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
...@@ -221,7 +221,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, ...@@ -221,7 +221,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE); req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -246,7 +246,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -246,7 +246,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -271,7 +271,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, ...@@ -271,7 +271,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
platform::RecordEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -301,7 +301,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, ...@@ -301,7 +301,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir); req.set_out_varname(dir);
platform::RecordEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method, nullptr);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......
...@@ -36,7 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -36,7 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg, ::grpc::ByteBuffer* msg,
const std::string& out_name) { const std::string& out_name) {
platform::RecordEvent record_event("serial", &ctx); platform::RecordRPCEvent record_event("serial", &ctx);
  // Default DestroyCallback does nothing; when using GPU,     // Default DestroyCallback does nothing; when using GPU,
  // the CPU buffer needs to be freed.                         // the CPU buffer needs to be freed.
DestroyCallback destroy_callback = [](void* backing) {}; DestroyCallback destroy_callback = [](void* backing) {};
...@@ -148,7 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -148,7 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var) { framework::Variable** var) {
platform::RecordEvent record_event("deserial", &ctx); platform::RecordRPCEvent record_event("deserial", &ctx);
operators::distributed::GRPCVariableResponse resp(scope, &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc_compute.h"
namespace paddle {
namespace operators {
void FusionSeqConvEltAddReluOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FusionSeqConvEltAddReluOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Filter"),
"Input(Filter) of FusionSeqConvEltAddReluOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Bias"),
"Input(Bias) of FusionSeqConvEltAddReluOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusionSeqConvEltAddReluOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("ColMat"),
"Output(ColMat) of FusionSeqConvEltAddReluOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto w_dims = ctx->GetInputDim("Filter");
int context_length = ctx->Attrs().Get<int>("contextLength");
PADDLE_ENFORCE(
ctx->Attrs().Get<int>("contextStride") == 1,
"Currently, FusionSeqConvEltAddReluOp only supports contextStride=1.");
  PADDLE_ENFORCE(x_dims.size() == 2 && w_dims.size() == 2,
                 "Input(X, Filter) should be 2-D tensors.");
  PADDLE_ENFORCE(w_dims[0] == context_length * x_dims[1],
                 "Filter's height should be context_length * "
                 "input_hidden_size.");
  PADDLE_ENFORCE_GT(context_length + ctx->Attrs().Get<int>("contextStart"), 0,
                    "contextStart should be greater than -contextLength.");
ctx->SetOutputDim("Out", {x_dims[0], w_dims[1]});
ctx->SetOutputDim("ColMat", {x_dims[0], w_dims[0]});
ctx->ShareLoD("X", "Out");
}
framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
void FusionSeqConvEltAddReluOpMaker::Make() {
AddInput("X",
"(LoDTensor) the input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X M), where T is the "
"total time steps in this mini-batch, M is the dim size of x.");
  // PaddingData is not supported yet; this should be guaranteed by the fusion pass.
AddInput("Filter",
"(Tensor) same as the input(Filter) of sequence conv op is an "
"learnable parameter."
"This is a tensor with shape (K, N), where K is the "
"context_length * dim size of x, N is the output feature size.");
AddInput("Bias",
"(Tensor) the learnable weights. shape (1, N), where N is the "
"output feature size");
  AddOutput(
      "Out",
      "(LoDTensor) the output(Out) is a LoDTensor, which supports "
      "variable-length output sequences. The underlying tensor in "
      "this LoDTensor is a matrix with shape (T, N), where T is the "
      "total time steps in this mini-batch and N is the output feature size.");
AddOutput("ColMat",
"(Tensor) (T, K), where T is where T is the "
"total time steps in this mini-batch, K is height of Filter")
.AsIntermediate();
AddAttr<int>("contextLength",
"(int) the contextLength of FusionSeqConvEltAddReluOp is the "
"height of the convolution kernel.")
.GreaterThan(0);
AddAttr<int>("contextStart",
"(int, default:0) the contextStart of FusionSeqConvEltAddReluOp "
"represents the beginning of the convolution of the number of "
"rows of sequence, which can be negative. The negative number "
"means to pad contextStart time-steps of zeros or learnable "
"parameters at the beginning of each instance. The positive "
"number means to skip contextStart time-steps of each "
"instance.")
.SetDefault(0);
  AddAttr<int>(
      "contextStride",
      "(int, default:1) the contextStride of FusionSeqConvEltAddReluOp "
      "represents the stride length of the convolution kernel. "
      "Currently, FusionSeqConvEltAddReluOp only supports "
      "contextStride=1.")
.SetDefault(1)
.GreaterThan(0);
AddComment(R"DOC(
Fusion Sequence Conv and ElementwiseAdd Operator.
)DOC");
}
template <typename T>
class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* w = ctx.Input<Tensor>("Filter");
auto* b = ctx.Input<Tensor>("Bias");
auto* y = ctx.Output<LoDTensor>("Out");
auto* col = ctx.Output<Tensor>("ColMat");
auto x_lod = x->lod();
auto x_dims = x->dims();
auto w_dims = w->dims();
PADDLE_ENFORCE_EQ(b->numel(), w_dims[1],
"bias size should be equal to output feature size.");
PADDLE_ENFORCE_EQ(x_lod.size(), 1UL,
"Only support one level sequence now.");
const T* x_data = x->data<T>();
const T* w_data = w->data<T>();
const T* b_data = b->data<T>();
T* y_data = y->mutable_data<T>(ctx.GetPlace());
T* col_data = col->mutable_data<T>(ctx.GetPlace());
int context_start = ctx.Attr<int>("contextStart");
int context_length = ctx.Attr<int>("contextLength");
int up_pad = std::max(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1);
// im2col
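    // Row t of ColMat concatenates the contextLength rows of X around
    // time-step t (offset by contextStart), with zeros at the padded
    // boundaries of each sequence.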
int src_mat_w = static_cast<int>(x_dims[1]);
int src_mat_w_sz = src_mat_w * sizeof(T);
int col_mat_w = static_cast<int>(w_dims[0]);
int col_mat_w_sz = col_mat_w * sizeof(T);
for (int i = 0; i < static_cast<int>(x_lod[0].size()) - 1; ++i) {
int st = x_lod[0][i];
int ed = x_lod[0][i + 1];
const T* src_data = x_data + st * src_mat_w;
T* dst_data = col_data + st * col_mat_w;
int seq_len = ed - st;
if (seq_len > up_pad + down_pad) {
// zero all up_pad and fill data
std::memset(dst_data, 0, up_pad * col_mat_w_sz);
dst_data = dst_data + up_pad * src_mat_w;
int copy_size = col_mat_w_sz - up_pad * src_mat_w_sz;
for (int j = 0; j < up_pad; ++j) {
// blas.VCOPY?
std::memcpy(dst_data, src_data, copy_size);
dst_data += (col_mat_w - src_mat_w);
copy_size += src_mat_w_sz;
}
// fill data
for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w;
src_data += src_mat_w;
}
// zero all down_pad and fill data
std::memset(dst_data, 0, down_pad * col_mat_w_sz);
copy_size -= src_mat_w_sz;
for (int j = 0; j < down_pad; ++j) {
std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w;
src_data += src_mat_w;
copy_size -= src_mat_w_sz;
}
} else {
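        // Short sequence (seq_len <= up_pad + down_pad): zero the whole block
        // first, then copy the overlapping rows from the top and the bottom
        // separately.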
PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1);
std::memset(dst_data, 0, seq_len * col_mat_w_sz);
dst_data = dst_data + up_pad * src_mat_w;
int zero_sz = up_pad * src_mat_w_sz;
int cur_src_sz = seq_len * src_mat_w_sz;
for (int j = 0; j < std::min(up_pad, seq_len); ++j) {
int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
std::memcpy(dst_data, src_data, copy_size);
dst_data += (col_mat_w - src_mat_w);
zero_sz -= src_mat_w_sz;
}
// from bottom
dst_data = col_data + ed * col_mat_w;
src_data = x_data + st * src_mat_w;
zero_sz = down_pad * src_mat_w_sz;
for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
copy_size);
dst_data -= col_mat_w;
zero_sz -= src_mat_w_sz;
}
}
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
math::FCCompute<DeviceContext, T>(blas, x_dims[0], w_dims[1], w_dims[0],
col_data, w_data, y_data, b_data, true);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp,
ops::FusionSeqConvEltAddReluOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu,
ops::FusionSeqConvEltAddReluKernel<float>,
ops::FusionSeqConvEltAddReluKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqConvEltAddReluOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0]; int64_t index_size = index.dims()[0];
auto src_dims = src.dims(); auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
const T* p_src = src.data<T>(); const T* p_src = src.data<T>();
const int* p_index = index.data<int>(); const int* p_index = index.data<int>();
...@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const size_t slice_bytes = slice_size * sizeof(T); const size_t slice_bytes = slice_size * sizeof(T);
for (int i = 0; i < index_size; ++i) { for (int64_t i = 0; i < index_size; ++i) {
int index_ = p_index[i]; int index_ = p_index[i];
memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
} }
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
...@@ -79,7 +79,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor<void> { ...@@ -79,7 +79,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
template <typename DeviceContext> template <typename DeviceContext>
template <typename T> template <typename T>
void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() { void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
math::ConcatGradFunctor<DeviceContext, T> func; math::SplitFunctor<DeviceContext, T> func;
func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0, func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
&prev_functor_->outputs_); &prev_functor_->outputs_);
} }
......
if (NOT WIN32) if (NOT WIN32)
add_subdirectory(detail) add_subdirectory(detail)
endif(NOT WIN32) endif(NOT WIN32)
function(math_library TARGET) function(math_library TARGET)
...@@ -35,7 +35,7 @@ function(math_library TARGET) ...@@ -35,7 +35,7 @@ function(math_library TARGET)
endfunction() endfunction()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
math_library(concat) math_library(concat_and_split)
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cross_entropy) math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
...@@ -43,8 +43,8 @@ math_library(depthwise_conv) ...@@ -43,8 +43,8 @@ math_library(depthwise_conv)
math_library(im2col) math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet. if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function) math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions) math_library(lstm_compute DEPS activation_functions)
endif (NOT WIN32) endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
...@@ -58,7 +58,7 @@ math_library(sequence_pooling DEPS math_function) ...@@ -58,7 +58,7 @@ math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function)
if (NOT WIN32) if (NOT WIN32)
math_library(matrix_bit_code) math_library(matrix_bit_code)
endif (NOT WIN32) endif (NOT WIN32)
math_library(unpooling) math_library(unpooling)
math_library(vol2col) math_library(vol2col)
...@@ -72,9 +72,9 @@ if(WITH_GPU) ...@@ -72,9 +72,9 @@ if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions) DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -67,7 +67,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -67,7 +67,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
* each dimension must be the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class SplitFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
...@@ -111,7 +111,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -111,7 +111,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
}; };
#define DEFINE_FUNCTOR(type) \ #define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CPUDeviceContext, type>; \ template class ConcatFunctor<platform::CPUDeviceContext, type>; \
template class ConcatGradFunctor<platform::CPUDeviceContext, type>; template class SplitFunctor<platform::CPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR); FOR_ALL_TYPES(DEFINE_FUNCTOR);
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -24,7 +24,7 @@ namespace operators { ...@@ -24,7 +24,7 @@ namespace operators {
namespace math { namespace math {
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, __global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols, const int output_rows, const int output_cols,
T* output) { T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, ...@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
} }
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs_data, const int fixed_in_col, __global__ void ConcatKernel(T** inputs_data, const int fixed_in_col,
const int out_rows, const int out_cols, const int out_rows, const int out_cols,
T* output_data) { T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -67,9 +67,9 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col, ...@@ -67,9 +67,9 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input_data, const int in_row, __global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int* out_cols, const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int curr_segment = 0; int curr_segment = 0;
int curr_offset = out_cols[0]; int curr_offset = out_cols[0];
...@@ -94,9 +94,9 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -94,9 +94,9 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
} }
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input_data, const int in_row, __global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int fixed_out_col, const int in_col, const int fixed_out_col,
T** outputs_data) { T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x / fixed_out_col; int split = tid_x / fixed_out_col;
...@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
dev_ins_data, in_col, out_row, out_col, output->data<T>()); dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else { } else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()), dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_row, out_col, output->data<T>()); out_row, out_col, output->data<T>());
} }
...@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
* each dimension must be the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class SplitFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
...@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data); input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, dev_outs_col_data, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), dev_out_gpu_data); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
...@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
#define DEFINE_FUNCTOR(type) \ #define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CUDADeviceContext, type>; \ template class ConcatFunctor<platform::CUDADeviceContext, type>; \
template class ConcatGradFunctor<platform::CUDADeviceContext, type> template class SplitFunctor<platform::CUDADeviceContext, type>
FOR_ALL_TYPES(DEFINE_FUNCTOR); FOR_ALL_TYPES(DEFINE_FUNCTOR);
......
...@@ -54,7 +54,7 @@ class ConcatFunctor { ...@@ -54,7 +54,7 @@ class ConcatFunctor {
* Output[1] = [[5,6]] * Output[1] = [[5,6]]
*/ */
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ConcatGradFunctor { class SplitFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& ref_inputs, const std::vector<const framework::Tensor*>& ref_inputs,
......
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testConcat() { void testConcat() {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
...@@ -30,20 +31,25 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M, ...@@ -30,20 +31,25 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
if (B == NULL) { if (B == NULL) {
return; return;
} }
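  // Fuse the bias add with ReLU via the jit kernels: vaddrelu computes
  // dst = max(B + dst, 0) for each output row; without ReLU, fall back to a
  // plain vadd per row.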
if (relu) {
const auto& vaddrelu = jitkernel::KernelPool::Instance()
.template Get<jitkernel::VAddReluKernel<T>>(N);
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
vaddrelu->Compute(B, dst, dst);
}
} else {
const auto& vadd = jitkernel::KernelPool::Instance()
.template Get<jitkernel::VAddKernel<T>>(N);
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) #pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif #endif
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N); T* dst = Y + i * N;
vadd->Compute(B, dst, dst);
}
} }
if (!relu) {
return;
}
// TODO(TJ): fuse relu
LOG(FATAL) << "Not implemented!";
} }
} // namespace math } // namespace math
......
...@@ -86,6 +86,12 @@ class VAddBiasKernel : public Kernel { ...@@ -86,6 +86,12 @@ class VAddBiasKernel : public Kernel {
virtual void Compute(const T a, const T *x, T *y) const = 0; virtual void Compute(const T a, const T *x, T *y) const = 0;
}; };
template <typename T>
class VAddReluKernel : public Kernel {
public:
virtual void Compute(const T *x, const T *y, T *z) const = 0;
};
template <typename T> template <typename T>
class VActKernel : public Kernel { class VActKernel : public Kernel {
public: public:
......
...@@ -378,11 +378,99 @@ class VIdentityKernelImpl : public VIdentityKernel<T> { ...@@ -378,11 +378,99 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
void Compute(const T* x, T* y) const override {} void Compute(const T* x, T* y) const override {}
}; };
/* VAddRelu JitKernel */
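// Scalar reference implementation: z[i] = max(x[i] + y[i], 0). The AVX
// specializations below handle 8 or 16 floats at once, plus a generic path
// with a scalar tail for the remainder.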
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddReluKernelImpl : public VAddReluKernel<T> {
public:
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx = _mm256_loadu_ps(x); \
__m256 tmpy = _mm256_loadu_ps(y); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \
_mm256_storeu_ps(z, tmpy); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ16>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(y); \
tmp0 = _mm256_add_ps(tmp0, tmp1); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp2 = _mm256_loadu_ps(y + 8); \
tmp1 = _mm256_add_ps(tmp1, tmp2); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(z, tmp0); \
_mm256_storeu_ps(z + 8, tmp1); \
}
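// Generic AVX path: process AVX_FLOAT_BLOCK (8) floats per iteration and
// handle the remaining elements with scalar code.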
#define INTRI_COMMON_FLOAT(isa, block) \
template <> \
VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
: VAddReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
} \
template <> \
void VAddReluKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmpx = _mm256_loadu_ps(x + i); \
__m256 tmpy = _mm256_loadu_ps(y + i); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, zeros); \
_mm256_storeu_ps(z + i, tmpy); \
} \
for (int i = this->end_; i < this->num_; ++i) { \
z[i] = x[i] + y[i]; \
z[i] = z[i] > 0 ? z[i] : 0; \
} \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_COMMON_FLOAT(jit::avx, kGT16);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_COMMON_FLOAT(jit::avx2, kGT16);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL(vmul, VMulKernel); REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel); REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vscal, VScalKernel); REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddb, VAddBiasKernel); REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel); REGISTER_JITKERNEL(vrelu, VReluKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(videntity, VIdentityKernel); REGISTER_JITKERNEL(videntity, VIdentityKernel);
} // namespace jitkernel } // namespace jitkernel
......
...@@ -27,13 +27,6 @@ limitations under the License. */ ...@@ -27,13 +27,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#ifdef __AVX__
namespace detail {
__m256 Exp(__m256 a);
} // namespace detail
#endif
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit; namespace jit = platform::jit;
...@@ -69,37 +62,186 @@ FOR_EACH_ISA(MKL_FLOAT, kGT16); ...@@ -69,37 +62,186 @@ FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK(MKL_DOUBLE); FOR_EACH_ISA_BLOCK(MKL_DOUBLE);
#endif #endif
#define INTRI8_FLOAT(isa) \ namespace detail {
#ifdef __AVX__
#define ALIGN32 __attribute__((aligned(32)))
#define _PS256_CONST(Name, Val) \
static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
Val, Val, Val, Val}
#define _PI256_CONST(Name, Val) \
static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
Val, Val, Val, Val}
_PI256_CONST(0x7f, 0x7f);
_PS256_CONST(one, 1.f);
_PS256_CONST(0p5, 0.5f);
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
typedef union imm_xmm_union {
__m256i imm;
__m128i xmm[2];
} imm_xmm_union;
#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
{ \
imm_xmm_union u ALIGN32; \
u.imm = imm_; \
xmm0_ = u.xmm[0]; \
xmm1_ = u.xmm[1]; \
}
#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
{ \
imm_xmm_union u ALIGN32; \
u.xmm[0] = xmm0_; \
u.xmm[1] = xmm1_; \
imm_ = u.imm; \
}
#define AVX2_BITOP_USING_SSE2(fn) \
static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \
    /* use SSE2 to emulate the AVX2 bit operation */                        \
__m128i x1, x2; \
__m256i ret; \
COPY_IMM_TO_XMM(x, x1, x2); \
x1 = _mm_##fn(x1, y); \
x2 = _mm_##fn(x2, y); \
COPY_XMM_TO_IMM(x1, x2, ret); \
return ret; \
}
#define AVX2_INTOP_USING_SSE2(fn) \
static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \
/* use SSE2 to perform the AVX2 integer operation */ \
__m128i x1, x2; \
__m128i y1, y2; \
__m256i ret; \
COPY_IMM_TO_XMM(x, x1, x2); \
COPY_IMM_TO_XMM(y, y1, y2); \
x1 = _mm_##fn(x1, y1); \
x2 = _mm_##fn(x2, y2); \
COPY_XMM_TO_IMM(x1, x2, ret); \
return ret; \
}
AVX2_BITOP_USING_SSE2(slli_epi32);
AVX2_INTOP_USING_SSE2(add_epi32);
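// exp(x) approximation in the Cephes style: clamp x to [exp_lo, exp_hi],
// split x = n * log(2) + g, evaluate a degree-5 polynomial in g, and finally
// scale the result by 2^n built from the float exponent bits.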
#define AVXEXP_BASE \
__m256 tmp = _mm256_setzero_ps(), fx; \
__m256 one = *reinterpret_cast<const __m256*>(_ps256_one); \
__m256i imm0; \
x = _mm256_min_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_hi)); \
x = _mm256_max_ps(x, *reinterpret_cast<const __m256*>(_ps256_exp_lo)); \
/* express exp(x) as exp(g + n*log(2)) */ \
fx = _mm256_mul_ps(x, \
*reinterpret_cast<const __m256*>(_ps256_cephes_LOG2EF)); \
fx = _mm256_add_ps(fx, *reinterpret_cast<const __m256*>(_ps256_0p5)); \
tmp = _mm256_floor_ps(fx); \
  /* if greater, subtract 1 */                                               \
__m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \
mask = _mm256_and_ps(mask, one); \
fx = _mm256_sub_ps(tmp, mask); \
tmp = _mm256_mul_ps(fx, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_C1)); \
__m256 z = _mm256_mul_ps( \
fx, *reinterpret_cast<const __m256*>(_ps256_cephes_exp_C2)); \
x = _mm256_sub_ps(x, tmp); \
x = _mm256_sub_ps(x, z); \
z = _mm256_mul_ps(x, x); \
__m256 y = *reinterpret_cast<const __m256*>(_ps256_cephes_exp_p0); \
y = _mm256_mul_ps(y, x); \
y = _mm256_add_ps(y, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_p1)); \
y = _mm256_mul_ps(y, x); \
y = _mm256_add_ps(y, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_p2)); \
y = _mm256_mul_ps(y, x); \
y = _mm256_add_ps(y, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_p3)); \
y = _mm256_mul_ps(y, x); \
y = _mm256_add_ps(y, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_p4)); \
y = _mm256_mul_ps(y, x); \
y = _mm256_add_ps(y, \
*reinterpret_cast<const __m256*>(_ps256_cephes_exp_p5)); \
y = _mm256_mul_ps(y, z); \
y = _mm256_add_ps(y, x); \
y = _mm256_add_ps(y, one); \
/* build 2^n */ \
imm0 = _mm256_cvttps_epi32(fx)
__m256 ExpAVX(__m256 x) {
AVXEXP_BASE;
// two AVX2 instructions using SSE2
imm0 = avx2_mm256_add_epi32(imm0,
*reinterpret_cast<const __m256i*>(_pi256_0x7f));
imm0 = avx2_mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
#endif
#ifdef __AVX2__
__m256 ExpAVX2(__m256 x) {
AVXEXP_BASE;
// two AVX2 instructions
imm0 = _mm256_add_epi32(imm0, *reinterpret_cast<const __m256i*>(_pi256_0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}
#endif
} // namespace detail
#define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \ const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, detail::Exp(tmp)); \ _mm256_storeu_ps(y, expisa(tmp)); \
} }
#define INTRI16_FLOAT(isa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \ void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \ const { \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = detail::Exp(tmp0); \ tmp0 = expisa(tmp0); \
tmp1 = detail::Exp(tmp1); \ tmp1 = expisa(tmp1); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#ifdef __AVX__ #ifdef __AVX__
INTRI8_FLOAT(jit::avx); INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx); INTRI16_FLOAT(jit::avx, detail::ExpAVX);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRI8_FLOAT(jit::avx2); INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2); INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
#endif #endif
#ifdef __AVX512F__ #ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f); INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f); INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
#endif #endif
// TODO(TJ): eq16 test and complete avx512 // TODO(TJ): eq16 test and complete avx512
...@@ -135,26 +277,27 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -135,26 +277,27 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
std::shared_ptr<const VExpKernel<T>> vexp_; std::shared_ptr<const VExpKernel<T>> vexp_;
}; };
#define INTRI_SIGMOID(tmp, min, max) \ #define INTRI_SIGMOID(tmp, min, max, expisa) \
tmp = _mm256_max_ps(tmp, min); \ tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \ tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
tmp = detail::Exp(tmp); \ tmp = expisa(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \ #define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \ const { \
/* TODO(TJ): try to use static const*/ \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \ INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
} }
#define INTRI16_FLOAT(isa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \ void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \
float* y) const { \ float* y) const { \
...@@ -162,13 +305,13 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -162,13 +305,13 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \ INTRI_SIGMOID(tmp0, min, max, expisa); \
INTRI_SIGMOID(tmp1, min, max); \ INTRI_SIGMOID(tmp1, min, max, expisa); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#define INTRI_GT8LT16_FLOAT(isa) \ #define INTRI_GT8LT16_FLOAT(isa, expisa) \
template <> \ template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \ VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \ : VSigmoidKernel<float>() { \
...@@ -184,7 +327,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -184,7 +327,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \ INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \ const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \ const float max_ = SIGMOID_THRESHOLD_MAX; \
...@@ -198,7 +341,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -198,7 +341,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
} \ } \
} }
#define INTRI_GT16_FLOAT(isa) \ #define INTRI_GT16_FLOAT(isa, expisa) \
template <> \ template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \ VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \ : VSigmoidKernel<float>() { \
...@@ -215,7 +358,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -215,7 +358,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \ __m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \ INTRI_SIGMOID(tmp, min, max, expisa); \
_mm256_storeu_ps(y + i, tmp); \ _mm256_storeu_ps(y + i, tmp); \
} \ } \
const float min_ = SIGMOID_THRESHOLD_MIN; \ const float min_ = SIGMOID_THRESHOLD_MIN; \
...@@ -231,22 +374,20 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> { ...@@ -231,22 +374,20 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
} }
#ifdef __AVX__ #ifdef __AVX__
INTRI8_FLOAT(jit::avx); INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx); INTRI16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT8LT16_FLOAT(jit::avx); INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT16_FLOAT(jit::avx); INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRI8_FLOAT(jit::avx2); INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2); INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
// INTRI_GT8LT16_FLOAT(jit::avx2); // maybe use avx at gt8lt16 and gt16
// INTRI_GT16_FLOAT(jit::avx2);
#endif #endif
#ifdef __AVX512F__ #ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f); INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f); INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
// INTRI_GT8LT16_FLOAT(jit::avx512f); // maybe use avx2 at gt8lt16 and gt16
// INTRI_GT16_FLOAT(jit::avx512f);
#endif #endif
#undef INTRI8_FLOAT #undef INTRI8_FLOAT
...@@ -280,36 +421,36 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -280,36 +421,36 @@ class VTanhKernelImpl : public VTanhKernel<T> {
std::shared_ptr<const VAddBiasKernel<T>> vaddbias_; std::shared_ptr<const VAddBiasKernel<T>> vaddbias_;
}; };
#define INTRI_VTANH(tmp) \ #define INTRI_VTANH(tmp, expisa) \
tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \
tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
tmp = detail::Exp(tmp); \ tmp = expisa(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa) \ #define INTRI8_FLOAT(isa, expisa) \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \ void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \ const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
} }
#define INTRI16_FLOAT(isa) \ #define INTRI16_FLOAT(isa, expisa) \
template <> \ template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \ void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \ const { \
__m256 tmp0 = _mm256_loadu_ps(x); \ __m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \ __m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \ INTRI_VTANH(tmp0, expisa); \
INTRI_VTANH(tmp1); \ INTRI_VTANH(tmp1, expisa); \
_mm256_storeu_ps(y, tmp0); \ _mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \ _mm256_storeu_ps(y + 8, tmp1); \
} }
#define INTRI_GT8LT16_FLOAT(isa) \ #define INTRI_GT8LT16_FLOAT(isa, expisa) \
template <> \ template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \ VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \ : VTanhKernel<float>() { \
...@@ -327,7 +468,7 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -327,7 +468,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \ void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \ float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \ __m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y, tmp); \ _mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \ x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \ y += AVX_FLOAT_BLOCK; \
...@@ -337,7 +478,7 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -337,7 +478,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
vaddbias_->Compute(-1.f, y, y); \ vaddbias_->Compute(-1.f, y, y); \
} }
#define INTRI_GT16_FLOAT(isa) \ #define INTRI_GT16_FLOAT(isa, expisa) \
template <> \ template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \ VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \ : VTanhKernel<float>() { \
...@@ -356,7 +497,7 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -356,7 +497,7 @@ class VTanhKernelImpl : public VTanhKernel<T> {
const { \ const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \ __m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \ INTRI_VTANH(tmp, expisa); \
_mm256_storeu_ps(y + i, tmp); \ _mm256_storeu_ps(y + i, tmp); \
} \ } \
x += this->end_; \ x += this->end_; \
...@@ -368,19 +509,19 @@ class VTanhKernelImpl : public VTanhKernel<T> { ...@@ -368,19 +509,19 @@ class VTanhKernelImpl : public VTanhKernel<T> {
} }
#ifdef __AVX__ #ifdef __AVX__
INTRI8_FLOAT(jit::avx); INTRI8_FLOAT(jit::avx, detail::ExpAVX);
INTRI16_FLOAT(jit::avx); INTRI16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT8LT16_FLOAT(jit::avx); INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
INTRI_GT16_FLOAT(jit::avx); INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRI8_FLOAT(jit::avx2); INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx2); INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
// maybe use avx at gt8lt16 and gt16 // maybe use avx at gt8lt16 and gt16
#endif #endif
#ifdef __AVX512F__ #ifdef __AVX512F__
INTRI8_FLOAT(jit::avx512f); INTRI8_FLOAT(jit::avx512f, detail::ExpAVX2);
INTRI16_FLOAT(jit::avx512f); INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
// maybe use avx at gt8lt16 and gt16 // maybe use avx at gt8lt16 and gt16
#endif #endif
......
...@@ -25,13 +25,18 @@ limitations under the License. */ ...@@ -25,13 +25,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#ifdef __AVX__ namespace jitkernel {
namespace detail { namespace detail {
__m256 Exp(__m256 a); #ifdef __AVX__
} // namespace detail __m256 ExpAVX(__m256 x);
#endif #endif
namespace jitkernel { #ifdef __AVX2__
__m256 ExpAVX2(__m256 x);
#endif
} // namespace detail
namespace jit = platform::jit; namespace jit = platform::jit;
#ifdef __AVX__ #ifdef __AVX__
...@@ -43,43 +48,72 @@ class AVXAct { ...@@ -43,43 +48,72 @@ class AVXAct {
virtual __m256 Compute(__m256 x) const = 0; virtual __m256 Compute(__m256 x) const = 0;
}; };
template <act_type type> template <act_type type, jit::cpu_isa_t isa>
class AVXActImpl : public AVXAct { class AVXActImpl : public AVXAct {
public: public:
__m256 Compute(__m256 x) const override { PADDLE_THROW("Unknown type!"); } __m256 Compute(__m256 x) const override { PADDLE_THROW("Unknown type!"); }
}; };
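// Note: this primary template is only a runtime fallback that aborts; the
// AVX_* macros below generate the real Compute() specializations for each
// (activation type, isa) pair.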
template <> #define AVX_SIGMOID(isa, expisa) \
__m256 AVXActImpl<kSigmoid>::Compute(__m256 x) const { template <> \
__m256 ones = _mm256_set1_ps(1.0f); __m256 AVXActImpl<kSigmoid, isa>::Compute(__m256 x) const { \
x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); __m256 ones = _mm256_set1_ps(1.0f); \
x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \
x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \
x = detail::Exp(x); x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \
x = _mm256_add_ps(ones, x); x = expisa(x); \
return _mm256_div_ps(ones, x); x = _mm256_add_ps(ones, x); \
} return _mm256_div_ps(ones, x); \
}
template <> #define AVX_TANH(isa, expisa) \
__m256 AVXActImpl<kTanh>::Compute(__m256 x) const { template <> \
__m256 ones = _mm256_set1_ps(1.0f); __m256 AVXActImpl<kTanh, isa>::Compute(__m256 x) const { \
x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); __m256 ones = _mm256_set1_ps(1.0f); \
x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \
x = detail::Exp(x); x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \
x = _mm256_add_ps(ones, x); x = expisa(x); \
x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); x = _mm256_add_ps(ones, x); \
return _mm256_sub_ps(x, ones); x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \
} return _mm256_sub_ps(x, ones); \
}
template <> #define AVX_RELU(isa) \
__m256 AVXActImpl<kRelu>::Compute(__m256 x) const { template <> \
return _mm256_max_ps(x, _mm256_setzero_ps()); __m256 AVXActImpl<kRelu, isa>::Compute(__m256 x) const { \
} return _mm256_max_ps(x, _mm256_setzero_ps()); \
}
#define AVX_IDENTITY(isa) \
template <> \
__m256 AVXActImpl<kIdentity, isa>::Compute(__m256 x) const { \
return x; \
}
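// The generated sigmoid/tanh kernels rely on the identities
//   sigmoid(x) = 1 / (1 + exp(-x)),  with x clipped to
//                [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX]
//   tanh(x)    = 2 / (1 + exp(-2x)) - 1,  with -2x capped at EXP_MAX_INPUT
// so each ISA only needs its own vectorized exp (expisa).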
#define FOR_EACH_AVX_ISA(macro_) \
macro_(jit::avx); \
macro_(jit::avx2); \
macro_(jit::avx512f)
FOR_EACH_AVX_ISA(AVX_RELU);
FOR_EACH_AVX_ISA(AVX_IDENTITY);
AVX_SIGMOID(jit::avx, detail::ExpAVX);
AVX_TANH(jit::avx, detail::ExpAVX);
#ifdef __AVX2__
AVX_SIGMOID(jit::avx2, detail::ExpAVX2);
AVX_SIGMOID(jit::avx512f, detail::ExpAVX2);
AVX_TANH(jit::avx2, detail::ExpAVX2);
AVX_TANH(jit::avx512f, detail::ExpAVX2);
#endif
#undef FOR_EACH_AVX_ISA
#undef AVX_IDENTITY
#undef AVX_RELU
#undef AVX_TANH
#undef AVX_SIGMOID
template <>
__m256 AVXActImpl<kIdentity>::Compute(__m256 x) const {
return x;
}
#endif #endif
template <typename T> template <typename T>
...@@ -119,23 +153,6 @@ class LSTMKernelImpl : public LSTMKernel<T> { ...@@ -119,23 +153,6 @@ class LSTMKernelImpl : public LSTMKernel<T> {
act_cell_d_ = GetActKernel<T>(act_cell, d); act_cell_d_ = GetActKernel<T>(act_cell, d);
vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d); vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d); vadd_d_ = KernelPool::Instance().template Get<VAddKernel<T>>(d);
#ifdef __AVX__
auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr<AVXAct> {
if (type == "sigmoid") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid>());
} else if (type == "relu") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu>());
} else if (type == "tanh") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh>());
} else if (type == "identity" || type == "") {
return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity>());
}
PADDLE_THROW("Not support type: %s", type);
};
avx_act_gate_ = GetAVXAct(act_gate);
avx_act_cand_ = GetAVXAct(act_cand);
avx_act_cell_ = GetAVXAct(act_cell);
#endif
} }
void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data,
...@@ -175,26 +192,61 @@ class LSTMKernelImpl : public LSTMKernel<T> { ...@@ -175,26 +192,61 @@ class LSTMKernelImpl : public LSTMKernel<T> {
#endif #endif
}; };
#define INTRI8_FLOAT(isa) \ #define INTRI8_FLOAT(isa) \
template <> \ template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \ LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl( \
float* gates, const float* ct_1, float* ct, float* ht, \ const std::string& act_gate, const std::string& act_cand, \
const float* wp_data, float* checked) const { \ const std::string& act_cell, int d) \
/* gates: W_ch, W_ih, W_fh, W_oh */ \ : LSTMKernel<float>() { \
__m256 c, i, f, o; \ auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr<AVXAct> { \
c = _mm256_loadu_ps(gates); \ if (type == "sigmoid") { \
i = _mm256_loadu_ps(gates + 8); \ return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>()); \
f = _mm256_loadu_ps(gates + 16); \ } else if (type == "relu") { \
o = _mm256_loadu_ps(gates + 24); \ return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>()); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \ } else if (type == "tanh") { \
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>()); \
i = _mm256_loadu_ps(ct_1); \ } else if (type == "identity" || type == "") { \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>()); \
f = _mm256_add_ps(c, f); \ } \
_mm256_storeu_ps(ct, f); \ PADDLE_THROW("Not support type: %s", type); \
/* H_t = act_cell(C_t) * ogated */ \ }; \
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ avx_act_gate_ = GetAVXAct(act_gate); \
_mm256_storeu_ps(ht, o); \ avx_act_cand_ = GetAVXAct(act_cand); \
avx_act_cell_ = GetAVXAct(act_cell); \
} \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
float* gates, const float* ct_1, float* ct, float* ht, \
const float* wp_data, float* checked) const { \
/* gates: W_ch, W_ih, W_fh, W_oh */ \
__m256 c, i, f, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
f = _mm256_loadu_ps(gates + 16); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
i = _mm256_loadu_ps(ct_1); \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \
f = _mm256_add_ps(c, f); \
_mm256_storeu_ps(ct, f); \
/* H_t = act_cell(C_t) * ogated */ \
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
} \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1( \
float* gates, float* ct, float* ht, const float* wp_data) const { \
__m256 c, i, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = igated * cgated*/ \
c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
_mm256_storeu_ps(ct, c); \
/* H_t = act_cell(C_t) * ogated */ \
o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
} }
// TODO(TJ): optimize keq16 // TODO(TJ): optimize keq16
......
...@@ -712,6 +712,63 @@ TEST(JitKernel, vadd) { ...@@ -712,6 +712,63 @@ TEST(JitKernel, vadd) {
} }
} }
void vaddrelu_ref(const int n, const float* x, const float* y, float* z) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
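// Scalar reference for the fused kernel: z[i] = max(x[i] + y[i], 0).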
void vaddrelu_better(
const std::shared_ptr<
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
const std::shared_ptr<
const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
const float* x, const float* y, float* z) {
vadd->Compute(x, y, z);
vrelu->Compute(z, z);
}
TEST(JitKernel, vaddrelu) {
namespace jit = paddle::operators::math::jitkernel;
for (int d : {7, 8, 15, 16, 30, 256, 512}) {
std::vector<float> x(d), y(d);
std::vector<float> zref(d), ztgt(d);
RandomVec<float>(d, x.data());
RandomVec<float>(d, y.data());
const auto& ker =
jit::KernelPool::Instance().template Get<jit::VAddReluKernel<float>>(d);
const auto& vadd =
jit::KernelPool::Instance().template Get<jit::VAddKernel<float>>(d);
const auto& vrelu =
jit::KernelPool::Instance().template Get<jit::VReluKernel<float>>(d);
const float* x_data = x.data();
const float* y_data = y.data();
float* ztgt_data = ztgt.data();
float* zref_data = zref.data();
auto trefs = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vaddrelu_ref(d, x_data, y_data, zref_data);
}
auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data);
}
auto tmkle = GetCurrentUS();
auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data);
}
auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
<< " us, better takes: " << (tmkle - tmkls) / repeat << " us, "
<< "tgt takes: " << (ttgte - ttgts) / repeat;
for (int i = 0; i < d; ++i) {
EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
}
}
}
TEST(JitKernel, pool) { TEST(JitKernel, pool) {
namespace jit = paddle::operators::math::jitkernel; namespace jit = paddle::operators::math::jitkernel;
const int frame_size = 4; const int frame_size = 4;
......
...@@ -229,7 +229,7 @@ TEST(BlockingQueue, speed_test_mode) { ...@@ -229,7 +229,7 @@ TEST(BlockingQueue, speed_test_mode) {
q1.Receive(&b); q1.Receive(&b);
EXPECT_EQ(b, i); EXPECT_EQ(b, i);
} }
EXPECT_EQ(q1.Size(), 0); EXPECT_EQ(q1.Size(), 0UL);
BlockingQueue<size_t> q2(queue_size, true); BlockingQueue<size_t> q2(queue_size, true);
for (size_t i = 0; i < queue_size; ++i) { for (size_t i = 0; i < queue_size; ++i) {
...@@ -237,7 +237,7 @@ TEST(BlockingQueue, speed_test_mode) { ...@@ -237,7 +237,7 @@ TEST(BlockingQueue, speed_test_mode) {
} }
for (size_t i = 0; i < queue_size; ++i) { for (size_t i = 0; i < queue_size; ++i) {
q2.Receive(&b); q2.Receive(&b);
EXPECT_EQ(b, 0); EXPECT_EQ(b, 0UL);
} }
EXPECT_EQ(q2.Size(), queue_size); EXPECT_EQ(q2.Size(), queue_size);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
class ROIAlignOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ROIAlignOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("ROIs"),
"Input(ROIs) of ROIAlignOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ROIAlignOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
PADDLE_ENFORCE(input_dims.size() == 4,
"The format of input tensor is NCHW.");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
"given as [[x1, y1, x2, y2], …].");
PADDLE_ENFORCE(rois_dims[1] == 4,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
"given as [[x1, y1, x2, y2], …].");
int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(pooled_height, 0,
"The pooled output height must be greater than 0");
PADDLE_ENFORCE_GT(pooled_width, 0,
"The pooled output width must be greater than 0");
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
"The spatial scale must be greater than 0");
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = input_dims[1];
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIAlignGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"The GRAD@Out of ROIAlignGradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
"The GRAD@X of ROIAlignGradOp should not be null.");
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor), "
"The input of ROIAlignOp. "
"The format of the input tensor is NCHW, where N is the batch size, "
"C is the number of input channels, "
"H is the height of the feature, and "
"W is the width of the feature.");
AddInput("ROIs",
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"It should be a 2-D LoDTensor of shape (num_rois, 4) "
"given as [[x1, y1, x2, y2], …]. "
"(x1, y1) is the top-left coordinate, and "
"(x2, y2) is the bottom-right coordinate.");
AddOutput("Out",
"(Tensor), "
"The output of ROIAlignOp is a 4-D tensor with shape "
"(num_rois, channels, pooled_h, pooled_w).");
AddAttr<float>("spatial_scale",
"(float, default 1.0), "
"Multiplicative spatial scale factor "
"to translate ROI coords from their input scale "
"to the scale used when pooling.")
.SetDefault(1.0);
AddAttr<int>("pooled_height",
"(int, default 1), "
"The pooled output height.")
.SetDefault(1);
AddAttr<int>("pooled_width",
"(int, default 1), "
"The pooled output width.")
.SetDefault(1);
AddAttr<int>("sampling_ratio",
"(int, default -1), "
"The number of sampling points in the interpolation grid. "
"If <= 0, grid points are adaptive to roi_width "
"and pooled_w, likewise for height.")
.SetDefault(-1);
AddComment(R"DOC(
**RoIAlign Operator**
Region of interest align (also known as RoI align) performs
bilinear interpolation on inputs of nonuniform sizes to obtain
fixed-size feature maps (e.g. 7*7).
Each region proposal is divided into equal-sized sections of
pooled_width by pooled_height; the RoI boundaries are kept at their
original, non-quantized locations.
In each RoI bin, the values at the regularly sampled locations are
computed directly through bilinear interpolation, and the output is
the mean of the sampled values.
This avoids the misalignment problem introduced by quantization.
)DOC");
}
};
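// Sketch of the per-bin computation implemented by the kernels in
// roi_align_op.h / roi_align_op.cu: for an output bin (ph, pw) of a RoI,
// sample roi_bin_grid_h x roi_bin_grid_w points at
//   y = roi_ymin + (ph + (iy + 0.5) / roi_bin_grid_h) * bin_size_h
//   x = roi_xmin + (pw + (ix + 0.5) / roi_bin_grid_w) * bin_size_w
// bilinearly interpolate the input feature map at each (y, x), and average
// the sampled values to produce Out[n, c, ph, pw].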
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp);
REGISTER_OP_CPU_KERNEL(
roi_align,
ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/roi_align_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
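// Grid-stride loop: the launch is capped at kNumMaxinumNumBlocks blocks of
// kNumCUDAThreads threads each, and every thread strides over the remaining
// elements, so any nthreads is covered.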
template <class T>
__device__ T BilinearInterpolate(const T* input_data, const int height,
const int width, T y, T x) {
if (y < -1.0 || y > height || x < -1.0 || x > width) {
return 0;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T v1 = input_data[y_low * width + x_low];
T v2 = input_data[y_low * width + x_high];
T v3 = input_data[y_high * width + x_low];
T v4 = input_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
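// BilinearInterpolate (above) weights the four neighbors of (y, x) by
//   w1 = (1 - ly) * (1 - lx), w2 = (1 - ly) * lx,
//   w3 = ly * (1 - lx),       w4 = ly * lx,
// with ly = y - y_low and lx = x - x_low; BilinearInterpolateGradient (below)
// reuses the same weights to scatter the output gradient back to the input.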
template <class T>
__device__ void BilinearInterpolateGradient(const int height, const int width,
T y, T x, T* w1, T* w2, T* w3,
T* w4, int* x_low, int* x_high,
int* y_low, int* y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) {
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
*y_low = static_cast<int>(y);
*x_low = static_cast<int>(x);
if (*y_low >= height - 1) {
*y_high = *y_low = height - 1;
y = static_cast<T>(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = *x_low = width - 1;
x = static_cast<T>(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low, lx = x - *x_low;
T hy = 1. - ly, hx = 1. - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <class T>
__global__ void GPUROIAlignForward(
const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
T roi_xmin = offset_input_rois[0] * spatial_scale;
T roi_ymin = offset_input_rois[1] * spatial_scale;
T roi_xmax = offset_input_rois[2] * spatial_scale;
T roi_ymax = offset_input_rois[3] * spatial_scale;
T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input_data =
input_data + (roi_batch_ind * channels + c) * height * width;
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w;
T output_val = 0;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = BilinearInterpolate(offset_input_data, height, width, y, x);
output_val += val;
}
}
output_val /= count;
output_data[i] = output_val;
}
}
template <typename T>
__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
const T* out_grad, const int num_rois,
const float spatial_scale,
const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width,
const int sampling_ratio,
int* roi_batch_id_data, T* input_grad) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels;
const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n];
T roi_xmin = offset_input_rois[0] * spatial_scale;
T roi_ymin = offset_input_rois[1] * spatial_scale;
T roi_xmax = offset_input_rois[2] * spatial_scale;
T roi_ymax = offset_input_rois[3] * spatial_scale;
T roi_width = max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = max(roi_ymax - roi_ymin, static_cast<T>(1.));
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_input_grad =
input_grad + (roi_batch_ind * channels + c) * height * width;
const T* offset_out_grad =
out_grad + (n * channels + c) * pooled_height * pooled_width;
const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4,
&x_low, &x_high, &y_low, &y_high);
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low,
diff1);
platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high,
diff2);
platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low,
diff3);
platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high,
diff4);
}
}
}
}
}
template <typename Place, typename T>
class GPUROIAlignOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<Tensor>("X");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* out = ctx.Output<Tensor>("Out");
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
int output_size = out->numel();
int blocks = NumBlocks(output_size);
int threads = kNumCUDAThreads;
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
Tensor roi_batch_id_list_gpu;
framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
&roi_batch_id_list_gpu);
GPUROIAlignForward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
height, width, pooled_height, pooled_width, sampling_ratio,
roi_batch_id_list_gpu.data<int>(),
out->mutable_data<T>(ctx.GetPlace()));
}
};
template <typename Place, typename T>
class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<Tensor>("X");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
int rois_num = rois->dims()[0];
int channels = in->dims()[1];
int height = in->dims()[2];
int width = in->dims()[3];
if (!in_grad) {
return;
}
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
Tensor roi_batch_id_list_gpu;
framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
&roi_batch_id_list_gpu);
in_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<Place, T> set_zero;
set_zero(ctx.cuda_device_context(), in_grad, static_cast<T>(0));
int output_grad_size = out_grad->numel();
int blocks = NumBlocks(output_grad_size);
int threads = kNumCUDAThreads;
if (output_grad_size > 0) {
GPUROIAlignBackward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
spatial_scale, channels, height, width, pooled_height, pooled_width,
sampling_ratio, roi_batch_id_list_gpu.data<int>(),
in_grad->mutable_data<T>(ctx.GetPlace()));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
roi_align,
ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUROIAlignOpKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
roi_align_grad,
ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kROISize = 4;
template <class T>
void PreCalcForBilinearInterpolate(
const platform::DeviceContext& ctx, const int height, const int width,
const int pooled_height, const int pooled_width, const int iy_upper,
const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) {
int pre_calc_index = 0;
int* pre_pos_data = pre_pos->mutable_data<int>(ctx.GetPlace());
T* pre_w_data = pre_w->mutable_data<T>(ctx.GetPlace());
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
// calculate y of sample points
T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
// calculate x of sample points
for (int ix = 0; ix < ix_upper; ix++) {
T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// deal with elements out of map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
for (int i = 0; i < kROISize; ++i) {
pre_pos_data[i + pre_calc_index * kROISize] = 0;
pre_w_data[i + pre_calc_index * kROISize] = 0;
}
pre_calc_index += 1;
continue;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low;
pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high;
pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low;
pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high;
pre_w_data[pre_calc_index * kROISize] = hy * hx;
pre_w_data[pre_calc_index * kROISize + 1] = hy * lx;
pre_w_data[pre_calc_index * kROISize + 2] = ly * hx;
pre_w_data[pre_calc_index * kROISize + 3] = ly * lx;
pre_calc_index += 1;
}
}
}
}
}
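// PreCalcForBilinearInterpolate (above) caches, for every sample point of
// every output bin, the four neighbor offsets (pre_pos) and bilinear weights
// (pre_w); the per-channel loop in CPUROIAlignOpKernel then only performs
// four multiply-adds per sample.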
template <class T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
const T out_grad_this_bin, const T count,
T* batch_grad_data) {
int x_low, y_low, x_high, y_high;
T w1, w2, w3, w4;
if (y < -1.0 || y > height || x < -1.0 || x > width) {
w1 = w2 = w3 = w4 = 0;
x_low = x_high = y_low = y_high = -1;
return;
}
y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x;
y_low = static_cast<int>(y);
x_low = static_cast<int>(x);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = static_cast<T>(x_low);
} else {
x_high = x_low + 1;
}
T ly = y - y_low, lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T diff1 = out_grad_this_bin * w1 / count;
T diff2 = out_grad_this_bin * w2 / count;
T diff3 = out_grad_this_bin * w3 / count;
T diff4 = out_grad_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
*(batch_grad_data + y_low * width + x_low) += diff1;
*(batch_grad_data + y_low * width + x_high) += diff2;
*(batch_grad_data + y_high * width + x_low) += diff3;
*(batch_grad_data + y_high * width + x_high) += diff4;
}
}
template <typename DeviceContext, typename T>
class CPUROIAlignOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
auto in_stride = framework::stride(in_dims);
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out->dims());
const T* input_data = in->data<T>();
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale;
T roi_ymin = rois_data[1] * spatial_scale;
T roi_xmax = rois_data[2] * spatial_scale;
T roi_ymax = rois_data[3] * spatial_scale;
T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* batch_data = input_data + roi_batch_id * in_stride[0];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w;
Tensor pre_pos;
Tensor pre_w;
int pre_size = count * out_stride[1];
pre_pos.Resize({pre_size, kROISize});
pre_w.Resize({pre_size, kROISize});
PreCalcForBilinearInterpolate(
dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w);
const int* pre_pos_data = pre_pos.data<int>();
const T* pre_w_data = pre_w.data<T>();
for (int c = 0; c < channels; c++) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
const int pool_index = ph * pooled_width + pw;
T output_val = 0;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
for (int i = 0; i < kROISize; i++) {
int pos = pre_pos_data[pre_calc_index * kROISize + i];
T w = pre_w_data[pre_calc_index * kROISize + i];
output_val += w * batch_data[pos];
}
pre_calc_index += 1;
}
}
output_val /= count;
output_data[pool_index] = output_val;
}
}
batch_data += in_stride[1];
output_data += out_stride[1];
}
rois_data += roi_stride[0];
}
}
};
template <typename DeviceContext, typename T>
class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto in_dims = in->dims();
if (!in_grad) {
return;
}
int channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
auto in_stride = framework::stride(in->dims());
auto roi_stride = framework::stride(rois->dims());
auto out_stride = framework::stride(out_grad->dims());
for (int n = 0; n < rois_num; ++n) {
int roi_batch_idx = roi_batch_id_data[n];
T roi_xmin = rois_data[0] * spatial_scale;
T roi_ymin = rois_data[1] * spatial_scale;
T roi_xmax = rois_data[2] * spatial_scale;
T roi_ymax = rois_data[3] * spatial_scale;
T roi_width = std::max(roi_xmax - roi_xmin, static_cast<T>(1.));
T roi_height = std::max(roi_ymax - roi_ymin, static_cast<T>(1.));
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
for (int c = 0; c < channels; ++c) {
T* batch_grad_data =
in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
const T* batch_out_grad_data =
out_grad_data + n * out_stride[0] + c * out_stride[1];
for (int ph = 0; ph < pooled_height; ++ph) {
for (int pw = 0; pw < pooled_width; ++pw) {
int pool_index = ph * pooled_width + pw;
T out_grad_this_bin = batch_out_grad_data[pool_index];
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height);
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
T count = roi_bin_grid_h * roi_bin_grid_w;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_xmin + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
bilinear_interpolate_gradient(height, width, y, x,
out_grad_this_bin, count,
batch_grad_data);
}
}
}
}
}
rois_data += roi_stride[0];
}
}
};
} // namespace operators
} // namespace paddle
...@@ -174,4 +174,4 @@ REGISTER_OP_CPU_KERNEL( ...@@ -174,4 +174,4 @@ REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_pool_grad, roi_pool_grad,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>, ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>); ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -249,4 +249,4 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -249,4 +249,4 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
roi_pool_grad, roi_pool_grad,
ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>, ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>); ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat_and_split.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -106,7 +106,7 @@ class SeqConcatGradKernel : public framework::OpKernel<T> { ...@@ -106,7 +106,7 @@ class SeqConcatGradKernel : public framework::OpKernel<T> {
} }
} }
math::ConcatGradFunctor<DeviceContext, T> functor; math::SplitFunctor<DeviceContext, T> functor;
std::vector<const framework::Tensor *> sliced_x_ptr; std::vector<const framework::Tensor *> sliced_x_ptr;
std::vector<framework::Tensor *> sliced_dx_ptr; std::vector<framework::Tensor *> sliced_dx_ptr;
for (auto &x : sliced_x) { for (auto &x : sliced_x) {
......
...@@ -50,7 +50,7 @@ class SequenceUnpadOp : public framework::OperatorWithKernel { ...@@ -50,7 +50,7 @@ class SequenceUnpadOp : public framework::OperatorWithKernel {
if (x_dims.size() == 2) { if (x_dims.size() == 2) {
out_dims_vec.push_back(1); out_dims_vec.push_back(1);
} else { } else {
for (size_t i = 2; i < x_dims.size(); ++i) { for (int i = 2; i < x_dims.size(); ++i) {
out_dims_vec.push_back(x_dims[i]); out_dims_vec.push_back(x_dims[i]);
} }
} }
......
...@@ -61,7 +61,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> { ...@@ -61,7 +61,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
if (x_t->dims().size() == 2) { if (x_t->dims().size() == 2) {
out_dims_vec.push_back(1); out_dims_vec.push_back(1);
} else { } else {
for (size_t i = 2; i < x_t->dims().size(); ++i) { for (int i = 2; i < x_t->dims().size(); ++i) {
out_dims_vec.push_back(x_t->dims()[i]); out_dims_vec.push_back(x_t->dims()[i]);
} }
} }
......
...@@ -111,11 +111,10 @@ Example: ...@@ -111,11 +111,10 @@ Example:
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
USE_CPU_ONLY_OP(concat);
REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker); REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(split, REGISTER_OP_CPU_KERNEL(
ops::SplitOpKernel<paddle::platform::CPUPlace, double>, split, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SplitOpKernel<paddle::platform::CPUPlace, float>, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::SplitOpKernel<paddle::platform::CPUPlace, int>); ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int>);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle { namespace paddle {
...@@ -28,18 +29,22 @@ class SplitOpKernel : public framework::OpKernel<T> { ...@@ -28,18 +29,22 @@ class SplitOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X"); auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out"); auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto in_stride = framework::stride_numel(in->dims()); int axis = ctx.Attr<int>("axis");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
size_t input_offset = 0; std::vector<const framework::Tensor*> shape_refer;
for (auto& out : outs) { for (size_t j = 0; j < outs.size(); ++j) {
out->mutable_data<T>(ctx.GetPlace()); outs[j]->mutable_data<T>(ctx.GetPlace());
auto out_stride = framework::stride_numel(out->dims()); shape_refer.emplace_back(outs[j]);
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(), }
out_stride, in->data<T>() + input_offset,
in_stride, out_stride[axis]); auto& dev_ctx = ctx.template device_context<DeviceContext>();
input_offset += out_stride[axis]; // Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && outs.size() < 10) {
StridedMemcpyWithAxis0<T>(dev_ctx, *in, shape_refer, &outs);
} else {
math::SplitFunctor<DeviceContext, T> functor;
functor(dev_ctx, *in, shape_refer, axis, &outs);
} }
} }
}; };
......
...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/strided_memcpy.h" #include "paddle/fluid/operators/detail/strided_memcpy.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -98,5 +99,26 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, ...@@ -98,5 +99,26 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
} }
} }
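// StridedMemcpyWithAxis0 splits `input` along axis 0 into `outputs`, issuing
// one strided copy per output; `shape_refer` supplies each output's shape, and
// a null output is skipped while its rows are still stepped over via
// input_offset.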
template <typename T>
inline void StridedMemcpyWithAxis0(
const platform::DeviceContext& dev_ctx, const framework::Tensor& input,
const std::vector<const framework::Tensor*>& shape_refer,
std::vector<framework::Tensor*>* outputs) {
const framework::DDim in_stride = stride_numel(input.dims());
const int axis = 0;
size_t input_offset = 0;
for (size_t i = 0; i < outputs->size(); ++i) {
auto out_stride = stride_numel(shape_refer[i]->dims());
auto out = outputs->at(i);
if (out != nullptr) {
StridedNumelCopyWithAxis<T>(dev_ctx, axis, out->data<T>(), out_stride,
input.data<T>() + input_offset, in_stride,
out_stride[axis]);
}
input_offset += out_stride[axis];
}
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -35,6 +35,16 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { ...@@ -35,6 +35,16 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
return it->second.get(); return it->second.get();
} }
const std::vector<const DeviceContext*>
DeviceContextPool::GetAllDeviceContexts() const {
std::vector<const DeviceContext*> all_device_ctx;
all_device_ctx.reserve(device_contexts_.size());
for (auto& dev_ctx : device_contexts_) {
all_device_ctx.emplace_back(dev_ctx.second.get());
}
return all_device_ctx;
}
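// Returns raw pointers to every pooled device context; ownership stays with
// the pool.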
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
......
...@@ -217,6 +217,9 @@ class DeviceContextPool { ...@@ -217,6 +217,9 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
platform::DeviceContext* Get(const platform::Place& place); platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return all the device contexts. */
const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
template <typename Place> template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) { const Place& place) {
......
...@@ -30,6 +30,8 @@ limitations under the License. */ ...@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() { ...@@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() {
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name,
const DeviceContext* dev_ctx) {
if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name, dev_ctx));
}
}
RecordBlock::RecordBlock(int block_id) RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
......
...@@ -87,6 +87,16 @@ struct RecordEvent { ...@@ -87,6 +87,16 @@ struct RecordEvent {
std::string full_name_; std::string full_name_;
}; };
class RecordRPCEvent {
public:
// dev_ctx can be set to nullptr if device is cpu.
RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordRPCEvent() {}
private:
std::unique_ptr<RecordEvent> event_;
};
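// RecordRPCEvent only creates the underlying RecordEvent when
// FLAGS_enable_rpc_profiler is set, so RPC profiling is a no-op by default.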
struct RecordBlock { struct RecordBlock {
explicit RecordBlock(int block_id); explicit RecordBlock(int block_id);
~RecordBlock(); ~RecordBlock();
......
...@@ -120,6 +120,7 @@ def __bootstrap__(): ...@@ -120,6 +120,7 @@ def __bootstrap__():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_period') read_env_flags.append('rpc_server_profile_period')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler')
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
...@@ -324,10 +324,19 @@ class LayerHelper(object): ...@@ -324,10 +324,19 @@ class LayerHelper(object):
raise ValueError("no Parameter name %s found" % name) raise ValueError("no Parameter name %s found" % name)
return param return param
def create_tmp_variable(self, dtype, stop_gradient=False): def create_variable_for_type_inference(self, dtype, stop_gradient=False):
"""Create a temporary variable whose type will be inferred later.
Note:
The default type will be set to LOD_TENSOR. However, when
the var is used as operator output, its type will be updated
based on operator's `VarTypeInference` implementation in
infer_var_type.
"""
return self.main_program.current_block().create_var( return self.main_program.current_block().create_var(
name=unique_name.generate(".".join([self.name, 'tmp'])), name=unique_name.generate(".".join([self.name, 'tmp'])),
dtype=dtype, dtype=dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False, persistable=False,
stop_gradient=stop_gradient) stop_gradient=stop_gradient)
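# Illustrative use (hypothetical op name 'some_op'), matching the call sites
# updated below:
#   out = helper.create_variable_for_type_inference(dtype=x.dtype)
#   helper.append_op(
#       type='some_op', inputs={'X': [x]}, outputs={'Out': [out]})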
...@@ -388,7 +397,7 @@ class LayerHelper(object): ...@@ -388,7 +397,7 @@ class LayerHelper(object):
b = self.create_parameter( b = self.create_parameter(
attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
tmp = self.create_tmp_variable(dtype=input_var.dtype) tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op( self.append_op(
type='elementwise_add', type='elementwise_add',
inputs={'X': [input_var], inputs={'X': [input_var],
...@@ -414,7 +423,7 @@ class LayerHelper(object): ...@@ -414,7 +423,7 @@ class LayerHelper(object):
tmp = input_var tmp = input_var
# NOTE(dzhwinter): some activations support inplace computation. # NOTE(dzhwinter): some activations support inplace computation.
if not core.IsInplace(act_type): if not core.IsInplace(act_type):
tmp = self.create_tmp_variable(dtype=input_var.dtype) tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -80,8 +80,8 @@ def split_lod_tensor(input, mask, level=0): ...@@ -80,8 +80,8 @@ def split_lod_tensor(input, mask, level=0):
""" """
helper = LayerHelper('split_lod_tensor', **locals()) helper = LayerHelper('split_lod_tensor', **locals())
out_true = helper.create_tmp_variable(dtype=input.dtype) out_true = helper.create_variable_for_type_inference(dtype=input.dtype)
out_false = helper.create_tmp_variable(dtype=input.dtype) out_false = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type='split_lod_tensor', type='split_lod_tensor',
inputs={ inputs={
...@@ -131,7 +131,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): ...@@ -131,7 +131,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
""" """
helper = LayerHelper('merge_lod_tensor', **locals()) helper = LayerHelper('merge_lod_tensor', **locals())
out = helper.create_tmp_variable(dtype=in_true.dtype) out = helper.create_variable_for_type_inference(dtype=in_true.dtype)
helper.append_op( helper.append_op(
type='merge_lod_tensor', type='merge_lod_tensor',
inputs={'X': x, inputs={'X': x,
...@@ -524,7 +524,7 @@ class StaticRNN(object): ...@@ -524,7 +524,7 @@ class StaticRNN(object):
if not isinstance(o, Variable): if not isinstance(o, Variable):
raise TypeError("step output takes a Variable") raise TypeError("step output takes a Variable")
tmp_o = self.helper.create_tmp_variable(dtype=o.dtype) tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype)
self.helper.append_op( self.helper.append_op(
type='rnn_memory_helper', type='rnn_memory_helper',
inputs={'X': [o]}, inputs={'X': [o]},
...@@ -606,7 +606,8 @@ class StaticRNN(object): ...@@ -606,7 +606,8 @@ class StaticRNN(object):
pre_memories.append(mem.pre_mem.name) pre_memories.append(mem.pre_mem.name)
mem_var = rnn_block.var(mem.mem.name) mem_var = rnn_block.var(mem.mem.name)
assert isinstance(mem_var, Variable) assert isinstance(mem_var, Variable)
new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype) new_mem = self.helper.create_variable_for_type_inference(
dtype=mem_var.dtype)
rnn_block.append_op( rnn_block.append_op(
type='rnn_memory_helper', type='rnn_memory_helper',
...@@ -813,7 +814,7 @@ def max_sequence_len(rank_table): ...@@ -813,7 +814,7 @@ def max_sequence_len(rank_table):
${out_comment}. ${out_comment}.
""" """
helper = LayerHelper("max_seqence_len", **locals()) helper = LayerHelper("max_seqence_len", **locals())
res = helper.create_tmp_variable(dtype="int64") res = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="max_sequence_len", type="max_sequence_len",
inputs={"RankTable": rank_table}, inputs={"RankTable": rank_table},
...@@ -884,7 +885,7 @@ def array_to_lod_tensor(x, table): ...@@ -884,7 +885,7 @@ def array_to_lod_tensor(x, table):
lod_tensor = fluid.layers.array_to_lod_tensor(array, table) lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
""" """
helper = LayerHelper("array_to_lod_tensor", **locals()) helper = LayerHelper("array_to_lod_tensor", **locals())
tmp = helper.create_tmp_variable(dtype=x.dtype) tmp = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type="array_to_lod_tensor", type="array_to_lod_tensor",
inputs={'X': x, inputs={'X': x,
...@@ -915,7 +916,7 @@ def increment(x, value=1.0, in_place=True): ...@@ -915,7 +916,7 @@ def increment(x, value=1.0, in_place=True):
""" """
helper = LayerHelper("increment", **locals()) helper = LayerHelper("increment", **locals())
if not in_place: if not in_place:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = x out = x
helper.append_op( helper.append_op(
...@@ -1012,7 +1013,7 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored): ...@@ -1012,7 +1013,7 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
""" """
helper = LayerHelper("less_than", **locals()) helper = LayerHelper("less_than", **locals())
if cond is None: if cond is None:
cond = helper.create_tmp_variable(dtype='bool') cond = helper.create_variable_for_type_inference(dtype='bool')
cond.stop_gradient = True cond.stop_gradient = True
attrs = dict() attrs = dict()
...@@ -1051,7 +1052,7 @@ def equal(x, y, cond=None, **ignored): ...@@ -1051,7 +1052,7 @@ def equal(x, y, cond=None, **ignored):
""" """
helper = LayerHelper("equal", **locals()) helper = LayerHelper("equal", **locals())
if cond is None: if cond is None:
cond = helper.create_tmp_variable(dtype='bool') cond = helper.create_variable_for_type_inference(dtype='bool')
cond.stop_gradient = True cond.stop_gradient = True
helper.append_op( helper.append_op(
...@@ -1098,7 +1099,7 @@ def array_read(array, i): ...@@ -1098,7 +1099,7 @@ def array_read(array, i):
array, array,
Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
raise TypeError("array should be a tensor array variable") raise TypeError("array should be a tensor array variable")
out = helper.create_tmp_variable(dtype=array.dtype) out = helper.create_variable_for_type_inference(dtype=array.dtype)
helper.append_op( helper.append_op(
type='read_from_array', type='read_from_array',
inputs={'X': [array], inputs={'X': [array],
...@@ -1133,7 +1134,7 @@ def shrink_memory(x, i, table): ...@@ -1133,7 +1134,7 @@ def shrink_memory(x, i, table):
usage. usage.
""" """
helper = LayerHelper('shrink_memory', **locals()) helper = LayerHelper('shrink_memory', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='shrink_rnn_memory', type='shrink_rnn_memory',
inputs={'X': [x], inputs={'X': [x],
...@@ -1170,7 +1171,7 @@ def array_length(array): ...@@ -1170,7 +1171,7 @@ def array_length(array):
""" """
helper = LayerHelper('array_length', **locals()) helper = LayerHelper('array_length', **locals())
tmp = helper.create_tmp_variable(dtype='int64') tmp = helper.create_variable_for_type_inference(dtype='int64')
tmp.stop_gradient = True tmp.stop_gradient = True
helper.append_op( helper.append_op(
type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]}) type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
...@@ -1590,7 +1591,7 @@ class DynamicRNN(object): ...@@ -1590,7 +1591,7 @@ class DynamicRNN(object):
self.mem_dict = dict() self.mem_dict = dict()
self.output_array = [] self.output_array = []
self.outputs = [] self.outputs = []
self.cond = self.helper.create_tmp_variable(dtype='bool') self.cond = self.helper.create_variable_for_type_inference(dtype='bool')
self.cond.stop_gradient = False self.cond.stop_gradient = False
self.while_op = While(self.cond) self.while_op = While(self.cond)
self.input_array = [] self.input_array = []
...@@ -1924,7 +1925,7 @@ def reorder_lod_tensor_by_rank(x, rank_table): ...@@ -1924,7 +1925,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
helper.is_instance('x', Variable) helper.is_instance('x', Variable)
helper.is_instance('rank_table', Variable) helper.is_instance('rank_table', Variable)
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='reorder_lod_tensor_by_rank', type='reorder_lod_tensor_by_rank',
inputs={'X': [x], inputs={'X': [x],
...@@ -1958,7 +1959,7 @@ def is_empty(x, cond=None, **ignored): ...@@ -1958,7 +1959,7 @@ def is_empty(x, cond=None, **ignored):
""" """
helper = LayerHelper("is_empty", **locals()) helper = LayerHelper("is_empty", **locals())
if cond is None: if cond is None:
cond = helper.create_tmp_variable(dtype='bool') cond = helper.create_variable_for_type_inference(dtype='bool')
cond.stop_gradient = True cond.stop_gradient = True
elif not isinstance(cond, Variable): elif not isinstance(cond, Variable):
raise TypeError("cond takes a variable") raise TypeError("cond takes a variable")
......
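The hunks above all make the same mechanical substitution: layer outputs are now allocated through `helper.create_variable_for_type_inference(...)` instead of `helper.create_tmp_variable(...)`. A minimal sketch of how a custom layer would follow the new pattern is below; the `double` layer and its use of the existing `scale` op are illustrative only, not part of this change.

```
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper

def double(x):
    # Hypothetical custom layer showing the new helper pattern.
    helper = LayerHelper('double', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type='scale',               # existing elementwise scale op
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'scale': 2.0})
    return out

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = double(x)
```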
...@@ -147,10 +147,11 @@ def rpn_target_assign(bbox_pred, ...@@ -147,10 +147,11 @@ def rpn_target_assign(bbox_pred,
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
# Assign target label to anchors # Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype='int32') loc_index = helper.create_variable_for_type_inference(dtype='int32')
score_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_variable_for_type_inference(dtype='int32')
target_label = helper.create_tmp_variable(dtype='int32') target_label = helper.create_variable_for_type_inference(dtype='int32')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) target_bbox = helper.create_variable_for_type_inference(
dtype=anchor_box.dtype)
helper.append_op( helper.append_op(
type="rpn_target_assign", type="rpn_target_assign",
inputs={ inputs={
...@@ -282,7 +283,8 @@ def detection_output(loc, ...@@ -282,7 +283,8 @@ def detection_output(loc,
scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) nmsed_outs = helper.create_variable_for_type_inference(
dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
inputs={'Scores': scores, inputs={'Scores': scores,
...@@ -314,7 +316,7 @@ def iou_similarity(x, y, name=None): ...@@ -314,7 +316,7 @@ def iou_similarity(x, y, name=None):
""" """
helper = LayerHelper("iou_similarity", **locals()) helper = LayerHelper("iou_similarity", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -351,7 +353,8 @@ def box_coder(prior_box, ...@@ -351,7 +353,8 @@ def box_coder(prior_box,
helper = LayerHelper("box_coder", **locals()) helper = LayerHelper("box_coder", **locals())
if name is None: if name is None:
output_box = helper.create_tmp_variable(dtype=prior_box.dtype) output_box = helper.create_variable_for_type_inference(
dtype=prior_box.dtype)
else: else:
output_box = helper.create_variable( output_box = helper.create_variable(
name=name, dtype=prior_box.dtype, persistable=False) name=name, dtype=prior_box.dtype, persistable=False)
...@@ -382,7 +385,7 @@ def polygon_box_transform(input, name=None): ...@@ -382,7 +385,7 @@ def polygon_box_transform(input, name=None):
""" """
helper = LayerHelper("polygon_box_transform", **locals()) helper = LayerHelper("polygon_box_transform", **locals())
if name is None: if name is None:
output = helper.create_tmp_variable(dtype=input.dtype) output = helper.create_variable_for_type_inference(dtype=input.dtype)
else: else:
output = helper.create_variable( output = helper.create_variable(
name=name, dtype=input.dtype, persistable=False) name=name, dtype=input.dtype, persistable=False)
...@@ -450,7 +453,7 @@ def detection_map(detect_res, ...@@ -450,7 +453,7 @@ def detection_map(detect_res,
helper = LayerHelper("detection_map", **locals()) helper = LayerHelper("detection_map", **locals())
def __create_var(type): def __create_var(type):
return helper.create_tmp_variable(dtype=type) return helper.create_variable_for_type_inference(dtype=type)
map_out = __create_var('float32') map_out = __create_var('float32')
accum_pos_count_out = out_states[0] if out_states else __create_var('int32') accum_pos_count_out = out_states[0] if out_states else __create_var('int32')
...@@ -557,8 +560,9 @@ def bipartite_match(dist_matrix, ...@@ -557,8 +560,9 @@ def bipartite_match(dist_matrix,
>>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou) >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
""" """
helper = LayerHelper('bipartite_match', **locals()) helper = LayerHelper('bipartite_match', **locals())
match_indices = helper.create_tmp_variable(dtype='int32') match_indices = helper.create_variable_for_type_inference(dtype='int32')
match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype) match_distance = helper.create_variable_for_type_inference(
dtype=dist_matrix.dtype)
helper.append_op( helper.append_op(
type='bipartite_match', type='bipartite_match',
inputs={'DistMat': dist_matrix}, inputs={'DistMat': dist_matrix},
...@@ -644,8 +648,8 @@ def target_assign(input, ...@@ -644,8 +648,8 @@ def target_assign(input,
gt, matched_indices, mismatch_value=0) gt, matched_indices, mismatch_value=0)
""" """
helper = LayerHelper('target_assign', **locals()) helper = LayerHelper('target_assign', **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
out_weight = helper.create_tmp_variable(dtype='float32') out_weight = helper.create_variable_for_type_inference(dtype='float32')
helper.append_op( helper.append_op(
type='target_assign', type='target_assign',
inputs={ inputs={
...@@ -816,9 +820,10 @@ def ssd_loss(location, ...@@ -816,9 +820,10 @@ def ssd_loss(location,
conf_loss = nn.reshape( conf_loss = nn.reshape(
x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
conf_loss.stop_gradient = True conf_loss.stop_gradient = True
neg_indices = helper.create_tmp_variable(dtype='int32') neg_indices = helper.create_variable_for_type_inference(dtype='int32')
dtype = matched_indices.dtype dtype = matched_indices.dtype
updated_matched_indices = helper.create_tmp_variable(dtype=dtype) updated_matched_indices = helper.create_variable_for_type_inference(
dtype=dtype)
helper.append_op( helper.append_op(
type='mine_hard_examples', type='mine_hard_examples',
inputs={ inputs={
...@@ -998,8 +1003,8 @@ def prior_box(input, ...@@ -998,8 +1003,8 @@ def prior_box(input,
max_sizes = [max_sizes] max_sizes = [max_sizes]
attrs['max_sizes'] = max_sizes attrs['max_sizes'] = max_sizes
box = helper.create_tmp_variable(dtype) box = helper.create_variable_for_type_inference(dtype)
var = helper.create_tmp_variable(dtype) var = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="prior_box", type="prior_box",
inputs={"Input": input, inputs={"Input": input,
...@@ -1337,8 +1342,8 @@ def anchor_generator(input, ...@@ -1337,8 +1342,8 @@ def anchor_generator(input,
'offset': offset 'offset': offset
} }
anchor = helper.create_tmp_variable(dtype) anchor = helper.create_variable_for_type_inference(dtype)
var = helper.create_tmp_variable(dtype) var = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="anchor_generator", type="anchor_generator",
inputs={"Input": input}, inputs={"Input": input},
...@@ -1384,7 +1389,7 @@ def roi_perspective_transform(input, ...@@ -1384,7 +1389,7 @@ def roi_perspective_transform(input,
""" """
helper = LayerHelper('roi_perspective_transform', **locals()) helper = LayerHelper('roi_perspective_transform', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="roi_perspective_transform", type="roi_perspective_transform",
inputs={"X": input, inputs={"X": input,
...@@ -1418,11 +1423,15 @@ def generate_proposal_labels(rpn_rois, ...@@ -1418,11 +1423,15 @@ def generate_proposal_labels(rpn_rois,
helper = LayerHelper('generate_proposal_labels', **locals()) helper = LayerHelper('generate_proposal_labels', **locals())
rois = helper.create_tmp_variable(dtype=rpn_rois.dtype) rois = helper.create_variable_for_type_inference(dtype=rpn_rois.dtype)
labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype) labels_int32 = helper.create_variable_for_type_inference(
bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype) dtype=gt_classes.dtype)
bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) bbox_targets = helper.create_variable_for_type_inference(
bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) dtype=rpn_rois.dtype)
bbox_inside_weights = helper.create_variable_for_type_inference(
dtype=rpn_rois.dtype)
bbox_outside_weights = helper.create_variable_for_type_inference(
dtype=rpn_rois.dtype)
helper.append_op( helper.append_op(
type="generate_proposal_labels", type="generate_proposal_labels",
...@@ -1504,8 +1513,10 @@ def generate_proposals(scores, ...@@ -1504,8 +1513,10 @@ def generate_proposals(scores,
""" """
helper = LayerHelper('generate_proposals', **locals()) helper = LayerHelper('generate_proposals', **locals())
rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype) rpn_rois = helper.create_variable_for_type_inference(
rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype) dtype=bbox_deltas.dtype)
rpn_roi_probs = helper.create_variable_for_type_inference(
dtype=scores.dtype)
helper.append_op( helper.append_op(
type="generate_proposals", type="generate_proposals",
inputs={ inputs={
......
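The detection layers touched above keep their public signatures; only the internal output allocation changes. A short usage sketch for two of them, assuming LoD inputs of `[xmin, ymin, xmax, ymax]` boxes with illustrative shapes:

```
import paddle.fluid as fluid

prior = fluid.layers.data(name='prior', shape=[4], dtype='float32', lod_level=1)
gt = fluid.layers.data(name='gt', shape=[4], dtype='float32', lod_level=1)

# IoU distance matrix, then a greedy bipartite matching on it.
iou = fluid.layers.iou_similarity(x=prior, y=gt)
matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
```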
...@@ -954,7 +954,7 @@ def read_file(reader): ...@@ -954,7 +954,7 @@ def read_file(reader):
""" """
helper = LayerHelper('read_file') helper = LayerHelper('read_file')
out = [ out = [
helper.create_tmp_variable( helper.create_variable_for_type_inference(
stop_gradient=True, dtype='float32') stop_gradient=True, dtype='float32')
for _ in range(len(reader.desc.shapes())) for _ in range(len(reader.desc.shapes()))
] ]
......
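`read_file` still returns one output variable per shape declared by the reader; those variables are simply created with the new helper. A hedged sketch, assuming a RecordIO file prepared elsewhere (the path and shapes below are placeholders):

```
import paddle.fluid as fluid

reader = fluid.layers.open_files(
    filenames=['./mnist.recordio'],       # placeholder path
    shapes=[[-1, 784], [-1, 1]],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
```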
...@@ -202,10 +202,12 @@ def generate_layer_fn(op_type): ...@@ -202,10 +202,12 @@ def generate_layer_fn(op_type):
out_var = out[0] if (isinstance(out, list) or out_var = out[0] if (isinstance(out, list) or
isinstance(out, tuple)) else out isinstance(out, tuple)) else out
else: else:
out_var = helper.create_tmp_variable(dtype=dtype) out_var = helper.create_variable_for_type_inference(dtype=dtype)
outputs[o_name] = [out_var] outputs[o_name] = [out_var]
for name in intermediate_output_names: for name in intermediate_output_names:
outputs[name] = [helper.create_tmp_variable(dtype=dtype)] outputs[name] = [
helper.create_variable_for_type_inference(dtype=dtype)
]
helper.append_op( helper.append_op(
type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
return helper.append_activation(out_var) return helper.append_activation(out_var)
...@@ -229,7 +231,7 @@ def generate_layer_fn_noattr(op_type): ...@@ -229,7 +231,7 @@ def generate_layer_fn_noattr(op_type):
def func(x, name=None): def func(x, name=None):
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
output = helper.create_tmp_variable(dtype=x.dtype) output = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output}) helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output})
return output return output
......
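The generated layer functions likewise only change how their output and intermediate variables are created; call sites are untouched. For example, assuming `exp` and `tanh` are among the generated layers:

```
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
# Both calls go through the generated wrappers; only the internal
# temporary-variable creation changed in this commit.
y = fluid.layers.tanh(fluid.layers.exp(x))
```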
...@@ -58,11 +58,11 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -58,11 +58,11 @@ def accuracy(input, label, k=1, correct=None, total=None):
""" """
helper = LayerHelper("accuracy", **locals()) helper = LayerHelper("accuracy", **locals())
topk_out, topk_indices = nn.topk(input, k=k) topk_out, topk_indices = nn.topk(input, k=k)
acc_out = helper.create_tmp_variable(dtype="float32") acc_out = helper.create_variable_for_type_inference(dtype="float32")
if correct is None: if correct is None:
correct = helper.create_tmp_variable(dtype="int64") correct = helper.create_variable_for_type_inference(dtype="int64")
if total is None: if total is None:
total = helper.create_tmp_variable(dtype="int64") total = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="accuracy", type="accuracy",
inputs={ inputs={
...@@ -124,8 +124,8 @@ def auc(input, ...@@ -124,8 +124,8 @@ def auc(input,
auc_out=fluid.layers.auc(input=prediction, label=label) auc_out=fluid.layers.auc(input=prediction, label=label)
""" """
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_variable_for_type_inference(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64") batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
# for batch auc # for batch auc
......
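`accuracy` and `auc` keep the same interface; when `correct`/`total` are not supplied they are created internally, as in the hunk above. A short sketch of the common case:

```
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
# correct/total accumulators are created internally when omitted.
acc = fluid.layers.accuracy(input=prediction, label=label, k=1)
```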
...@@ -96,6 +96,7 @@ __all__ = [ ...@@ -96,6 +96,7 @@ __all__ = [
'pad_constant_like', 'pad_constant_like',
'label_smooth', 'label_smooth',
'roi_pool', 'roi_pool',
'roi_align',
'dice_loss', 'dice_loss',
'image_resize', 'image_resize',
'image_resize_short', 'image_resize_short',
...@@ -241,7 +242,7 @@ def fc(input, ...@@ -241,7 +242,7 @@ def fc(input,
w = helper.create_parameter( w = helper.create_parameter(
attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="mul", type="mul",
inputs={"X": input_var, inputs={"X": input_var,
...@@ -254,7 +255,7 @@ def fc(input, ...@@ -254,7 +255,7 @@ def fc(input,
if len(mul_results) == 1: if len(mul_results) == 1:
pre_bias = mul_results[0] pre_bias = mul_results[0]
else: else:
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="sum", type="sum",
inputs={"X": mul_results}, inputs={"X": mul_results},
...@@ -313,7 +314,7 @@ def embedding(input, ...@@ -313,7 +314,7 @@ def embedding(input,
helper = LayerHelper('embedding', **locals()) helper = LayerHelper('embedding', **locals())
w = helper.create_parameter( w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_variable_for_type_inference(dtype)
padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
size[0] + padding_idx) size[0] + padding_idx)
helper.append_op( helper.append_op(
...@@ -355,7 +356,6 @@ def dynamic_lstm(input, ...@@ -355,7 +356,6 @@ def dynamic_lstm(input,
c_0(Variable): The initial cell state is an optional input, default is zero. c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` must be both provided or both omitted. batch size. `h_0` and `c_0` must be both provided or both omitted.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights. hidden-hidden weights.
...@@ -363,6 +363,11 @@ def dynamic_lstm(input, ...@@ -363,6 +363,11 @@ def dynamic_lstm(input,
W_{fh}, W_{oh}`} W_{fh}, W_{oh}`}
- The shape is (D x 4D), where D is the hidden - The shape is (D x 4D), where D is the hidden
size. size.
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The bias attribute for the learnable bias bias_attr (ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden weights, which contains two parts, input-hidden
bias weights and peephole connections weights if bias weights and peephole connections weights if
...@@ -375,6 +380,11 @@ def dynamic_lstm(input, ...@@ -375,6 +380,11 @@ def dynamic_lstm(input,
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}. W_{fc}, W_{oc}`}.
- The shape is (1 x 7D). - The shape is (1 x 7D).
If it is set to None or one attribute of ParamAttr,
dynamic_lstm will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
use_peepholes (bool): ${use_peepholes_comment} use_peepholes (bool): ${use_peepholes_comment}
is_reverse (bool): ${is_reverse_comment} is_reverse (bool): ${is_reverse_comment}
gate_activation (str): ${gate_activation_comment} gate_activation (str): ${gate_activation_comment}
...@@ -393,11 +403,11 @@ def dynamic_lstm(input, ...@@ -393,11 +403,11 @@ def dynamic_lstm(input,
hidden_dim = 512 hidden_dim = 512
forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
act=None, bias_attr=None) bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm( forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False) input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
""" """
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstm."
helper = LayerHelper('lstm', **locals()) helper = LayerHelper('lstm', **locals())
size = size // 4 size = size // 4
weight = helper.create_parameter( weight = helper.create_parameter(
...@@ -408,10 +418,10 @@ def dynamic_lstm(input, ...@@ -408,10 +418,10 @@ def dynamic_lstm(input,
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_tmp_variable(dtype) hidden = helper.create_variable_for_type_inference(dtype)
cell = helper.create_tmp_variable(dtype) cell = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_tmp_variable(dtype) batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'Weight': weight, 'Bias': bias} inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
batch_size = input.shape[0] batch_size = input.shape[0]
if h_0: if h_0:
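A fuller, runnable version of the updated `dynamic_lstm` docstring example shown above (dimensions are illustrative); note the projection `fc` now passes `bias_attr=False`, so the only bias is the one created inside the LSTM itself:

```
import paddle.fluid as fluid

emb_dim, hidden_dim = 256, 512   # illustrative sizes
seq = fluid.layers.data(name='seq', shape=[emb_dim], dtype='float32', lod_level=1)
forward_proj = fluid.layers.fc(input=seq, size=hidden_dim * 4, bias_attr=False)
forward, cell = fluid.layers.dynamic_lstm(
    input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
```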
...@@ -532,6 +542,11 @@ def dynamic_lstmp(input, ...@@ -532,6 +542,11 @@ def dynamic_lstmp(input,
size. size.
- Projection weight = {:math:`W_{rh}`}. - Projection weight = {:math:`W_{rh}`}.
- The shape of projection weight is (D x P). - The shape of projection weight is (D x P).
If it is set to None or one attribute of ParamAttr,
dynamic_lstmp will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|None): The bias attribute for the learnable bias bias_attr(ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden weights, which contains two parts, input-hidden
bias weights and peephole connections weights if bias weights and peephole connections weights if
...@@ -544,6 +559,11 @@ def dynamic_lstmp(input, ...@@ -544,6 +559,11 @@ def dynamic_lstmp(input,
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}. W_{fc}, W_{oc}`}.
- The shape is (1 x 7D). - The shape is (1 x 7D).
If it is set to None or one attribute of ParamAttr,
dynamic_lstmp will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized zero. Default: None.
use_peepholes(bool): Whether to enable diagonal/peephole connections, use_peepholes(bool): Whether to enable diagonal/peephole connections,
default `True`. default `True`.
is_reverse(bool): Whether to compute reversed LSTM, default `False`. is_reverse(bool): Whether to compute reversed LSTM, default `False`.
...@@ -588,6 +608,7 @@ def dynamic_lstmp(input, ...@@ -588,6 +608,7 @@ def dynamic_lstmp(input,
proj_activation="tanh") proj_activation="tanh")
""" """
assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
helper = LayerHelper('lstmp', **locals()) helper = LayerHelper('lstmp', **locals())
size = size // 4 size = size // 4
weight = helper.create_parameter( weight = helper.create_parameter(
...@@ -600,12 +621,12 @@ def dynamic_lstmp(input, ...@@ -600,12 +621,12 @@ def dynamic_lstmp(input,
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
projection = helper.create_tmp_variable(dtype) projection = helper.create_variable_for_type_inference(dtype)
cell = helper.create_tmp_variable(dtype) cell = helper.create_variable_for_type_inference(dtype)
ordered_proj0 = helper.create_tmp_variable(dtype) ordered_proj0 = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_tmp_variable(dtype) batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_tmp_variable(dtype) batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='lstmp', type='lstmp',
...@@ -730,10 +751,10 @@ def dynamic_gru(input, ...@@ -730,10 +751,10 @@ def dynamic_gru(input,
), 'The shape of h0 should be(batch_size, %d)' % size ), 'The shape of h0 should be(batch_size, %d)' % size
inputs['H0'] = h_0 inputs['H0'] = h_0
hidden = helper.create_tmp_variable(dtype) hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_tmp_variable(dtype) batch_gate = helper.create_variable_for_type_inference(dtype)
batch_reset_hidden_prev = helper.create_tmp_variable(dtype) batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
batch_hidden = helper.create_tmp_variable(dtype) batch_hidden = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='gru', type='gru',
...@@ -823,9 +844,9 @@ def gru_unit(input, ...@@ -823,9 +844,9 @@ def gru_unit(input,
weight = helper.create_parameter( weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
gate = helper.create_tmp_variable(dtype) gate = helper.create_variable_for_type_inference(dtype)
reset_hidden_pre = helper.create_tmp_variable(dtype) reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
updated_hidden = helper.create_tmp_variable(dtype) updated_hidden = helper.create_variable_for_type_inference(dtype)
inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight} inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
# create bias # create bias
if helper.bias_attr: if helper.bias_attr:
...@@ -875,10 +896,14 @@ def linear_chain_crf(input, label, param_attr=None): ...@@ -875,10 +896,14 @@ def linear_chain_crf(input, label, param_attr=None):
attr=helper.param_attr, attr=helper.param_attr,
shape=[size + 2, size], shape=[size + 2, size],
dtype=helper.input_dtype()) dtype=helper.input_dtype())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) alpha = helper.create_variable_for_type_inference(
emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) dtype=helper.input_dtype())
transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) emission_exps = helper.create_variable_for_type_inference(
log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype()) dtype=helper.input_dtype())
transition_exps = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
log_likelihood = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='linear_chain_crf', type='linear_chain_crf',
inputs={"Emission": [input], inputs={"Emission": [input],
...@@ -917,7 +942,8 @@ def crf_decoding(input, param_attr, label=None): ...@@ -917,7 +942,8 @@ def crf_decoding(input, param_attr, label=None):
""" """
helper = LayerHelper('crf_decoding', **locals()) helper = LayerHelper('crf_decoding', **locals())
transition = helper.get_parameter(param_attr.name) transition = helper.get_parameter(param_attr.name)
viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) viterbi_path = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='crf_decoding', type='crf_decoding',
inputs={"Emission": [input], inputs={"Emission": [input],
...@@ -941,9 +967,9 @@ def cos_sim(X, Y): ...@@ -941,9 +967,9 @@ def cos_sim(X, Y):
Variable: the output of cosine(X, Y). Variable: the output of cosine(X, Y).
""" """
helper = LayerHelper('cos_sim', **locals()) helper = LayerHelper('cos_sim', **locals())
out = helper.create_tmp_variable(dtype=X.dtype) out = helper.create_variable_for_type_inference(dtype=X.dtype)
xnorm = helper.create_tmp_variable(dtype=X.dtype) xnorm = helper.create_variable_for_type_inference(dtype=X.dtype)
ynorm = helper.create_tmp_variable(dtype=X.dtype) ynorm = helper.create_variable_for_type_inference(dtype=X.dtype)
helper.append_op( helper.append_op(
type='cos_sim', type='cos_sim',
inputs={'X': [X], inputs={'X': [X],
...@@ -987,8 +1013,9 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): ...@@ -987,8 +1013,9 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
""" """
helper = LayerHelper('dropout', **locals()) helper = LayerHelper('dropout', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) mask = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True)
if (seed is None or seed == 0) and helper.main_program.random_seed != 0: if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
seed = helper.main_program.random_seed seed = helper.main_program.random_seed
...@@ -1073,7 +1100,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): ...@@ -1073,7 +1100,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
helper = LayerHelper('cross_entropy', **locals()) helper = LayerHelper('cross_entropy', **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type='cross_entropy', type='cross_entropy',
inputs={'X': [input], inputs={'X': [input],
...@@ -1120,14 +1147,14 @@ def square_error_cost(input, label): ...@@ -1120,14 +1147,14 @@ def square_error_cost(input, label):
""" """
helper = LayerHelper('square_error_cost', **locals()) helper = LayerHelper('square_error_cost', **locals())
minus_out = helper.create_tmp_variable(dtype=input.dtype) minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type='elementwise_sub', type='elementwise_sub',
inputs={'X': [input], inputs={'X': [input],
'Y': [label]}, 'Y': [label]},
outputs={'Out': [minus_out]}) outputs={'Out': [minus_out]})
square_out = helper.create_tmp_variable(dtype=input.dtype) square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type='square', inputs={'X': [minus_out]}, type='square', inputs={'X': [minus_out]},
outputs={'Out': [square_out]}) outputs={'Out': [square_out]})
...@@ -1233,12 +1260,13 @@ def chunk_eval(input, ...@@ -1233,12 +1260,13 @@ def chunk_eval(input,
helper = LayerHelper("chunk_eval", **locals()) helper = LayerHelper("chunk_eval", **locals())
# prepare output # prepare output
precision = helper.create_tmp_variable(dtype="float32") precision = helper.create_variable_for_type_inference(dtype="float32")
recall = helper.create_tmp_variable(dtype="float32") recall = helper.create_variable_for_type_inference(dtype="float32")
f1_score = helper.create_tmp_variable(dtype="float32") f1_score = helper.create_variable_for_type_inference(dtype="float32")
num_infer_chunks = helper.create_tmp_variable(dtype="int64") num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64")
num_label_chunks = helper.create_tmp_variable(dtype="int64") num_label_chunks = helper.create_variable_for_type_inference(dtype="int64")
num_correct_chunks = helper.create_tmp_variable(dtype="int64") num_correct_chunks = helper.create_variable_for_type_inference(
dtype="int64")
helper.append_op( helper.append_op(
type="chunk_eval", type="chunk_eval",
...@@ -1269,7 +1297,8 @@ def sequence_conv(input, ...@@ -1269,7 +1297,8 @@ def sequence_conv(input,
padding=None, padding=None,
bias_attr=None, bias_attr=None,
param_attr=None, param_attr=None,
act=None): act=None,
name=None):
""" """
This function creates the op for sequence_conv, using the inputs and This function creates the op for sequence_conv, using the inputs and
other convolutional configurations for the filters and stride as given other convolutional configurations for the filters and stride as given
...@@ -1281,9 +1310,19 @@ def sequence_conv(input, ...@@ -1281,9 +1310,19 @@ def sequence_conv(input,
filter_size (int): the filter size (H and W). filter_size (int): the filter size (H and W).
filter_stride (int): stride of the filter. filter_stride (int): stride of the filter.
padding (bool): if True, add paddings. padding (bool): if True, add paddings.
bias_attr (ParamAttr|None): attributes for bias bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
param_attr (ParamAttr|None): attributes for parameter If it is set to False, no bias will be added to the output units.
act (str): the activation type If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns: Returns:
Variable: output of sequence_conv Variable: output of sequence_conv
...@@ -1294,7 +1333,7 @@ def sequence_conv(input, ...@@ -1294,7 +1333,7 @@ def sequence_conv(input,
filter_shape = [filter_size * input.shape[1], num_filters] filter_shape = [filter_size * input.shape[1], num_filters]
filter_param = helper.create_parameter( filter_param = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype) attr=helper.param_attr, shape=filter_shape, dtype=dtype)
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='sequence_conv', type='sequence_conv',
...@@ -1312,7 +1351,7 @@ def sequence_conv(input, ...@@ -1312,7 +1351,7 @@ def sequence_conv(input,
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): def sequence_softmax(input, use_cudnn=False, name=None):
""" """
This function computes the softmax activation among all time-steps for each This function computes the softmax activation among all time-steps for each
sequence. The dimension of each time-step should be 1. Thus, the shape of sequence. The dimension of each time-step should be 1. Thus, the shape of
...@@ -1332,10 +1371,10 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): ...@@ -1332,10 +1371,10 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
Args: Args:
input (Variable): The input variable which is a LoDTensor. input (Variable): The input variable which is a LoDTensor.
bias_attr (ParamAttr|None): attributes for bias
param_attr (ParamAttr|None): attributes for parameter
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
library is installed. Default: False library is installed. Default: False.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns: Returns:
Variable: output of sequence_softmax Variable: output of sequence_softmax
...@@ -1350,7 +1389,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): ...@@ -1350,7 +1389,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
""" """
helper = LayerHelper('sequence_softmax', **locals()) helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
softmax_out = helper.create_tmp_variable(dtype) softmax_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="sequence_softmax", type="sequence_softmax",
inputs={"X": input}, inputs={"X": input},
...@@ -1359,7 +1398,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): ...@@ -1359,7 +1398,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False):
return softmax_out return softmax_out
def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): def softmax(input, use_cudnn=True, name=None):
""" """
The input of the softmax operator is a tensor of any rank. The output tensor The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input. has the same shape as the input.
...@@ -1386,10 +1425,10 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): ...@@ -1386,10 +1425,10 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
Args: Args:
input (Variable): The input variable. input (Variable): The input variable.
bias_attr (ParamAttr): attributes for bias
param_attr (ParamAttr): attributes for parameter
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
library is installed. library is installed.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns: Returns:
Variable: output of softmax Variable: output of softmax
...@@ -1404,7 +1443,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): ...@@ -1404,7 +1443,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
""" """
helper = LayerHelper('softmax', **locals()) helper = LayerHelper('softmax', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
softmax_out = helper.create_tmp_variable(dtype) softmax_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="softmax", type="softmax",
inputs={"X": input}, inputs={"X": input},
...@@ -1495,14 +1534,23 @@ def conv2d(input, ...@@ -1495,14 +1534,23 @@ def conv2d(input,
convolution in Alex Krizhevsky's Deep CNN paper: when group=2, convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1 connected to the second half of the input channels. Default: groups=1.
param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
act (str): Activation type. Default: None act (str): Activation type, if it is set to None, activation is not appended.
Default: None
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically. Default: None
Returns: Returns:
Variable: The tensor variable storing the convolution and \ Variable: The tensor variable storing the convolution and \
...@@ -1520,7 +1568,7 @@ def conv2d(input, ...@@ -1520,7 +1568,7 @@ def conv2d(input,
""" """
num_channels = input.shape[1] num_channels = input.shape[1]
assert param_attr is not False, "param_attr should not be False here."
l_type = 'conv2d' l_type = 'conv2d'
if (num_channels == groups and num_filters % num_channels == 0 and if (num_channels == groups and num_filters % num_channels == 0 and
not use_cudnn): not use_cudnn):
...@@ -1548,7 +1596,8 @@ def conv2d(input, ...@@ -1548,7 +1596,8 @@ def conv2d(input,
filter_shape = [num_filters, int(num_filter_channels)] + filter_size filter_shape = [num_filters, int(num_filter_channels)] + filter_size
def _get_default_param_initializer(): def _get_default_param_initializer():
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 filter_elem_num = filter_size[0] * filter_size[1] * num_channels
std = (2.0 / filter_elem_num)**0.5
return Normal(0.0, std, 0) return Normal(0.0, std, 0)
filter_param = helper.create_parameter( filter_param = helper.create_parameter(
...@@ -1557,7 +1606,7 @@ def conv2d(input, ...@@ -1557,7 +1606,7 @@ def conv2d(input,
dtype=dtype, dtype=dtype,
default_initializer=_get_default_param_initializer()) default_initializer=_get_default_param_initializer())
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=l_type, type=l_type,
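The corrected default initializer above derives the standard deviation from the actual filter element count; the old formula squared only the first kernel dimension, so non-square kernels got the wrong fan-in. A worked example with an assumed 3x3 kernel over 64 input channels:

```
# std used for Normal(0.0, std) in conv2d's default parameter initializer.
filter_size = [3, 3]
num_channels = 64
filter_elem_num = filter_size[0] * filter_size[1] * num_channels  # 576
std = (2.0 / filter_elem_num) ** 0.5                              # ~0.0589
```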
...@@ -1659,13 +1708,22 @@ def conv3d(input, ...@@ -1659,13 +1708,22 @@ def conv3d(input,
the first half of the filters is only connected to the first half the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1 connected to the second half of the input channels. Default: groups=1
param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as param_attr. If it is set to None, the parameter
is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
:math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
act (str): Activation type. Default: None act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically. Default: None.
Returns: Returns:
Variable: The tensor variable storing the convolution and \ Variable: The tensor variable storing the convolution and \
...@@ -1683,7 +1741,7 @@ def conv3d(input, ...@@ -1683,7 +1741,7 @@ def conv3d(input,
""" """
l_type = 'conv3d' l_type = 'conv3d'
assert param_attr is not False, "param_attr should not be False here."
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -1708,7 +1766,9 @@ def conv3d(input, ...@@ -1708,7 +1766,9 @@ def conv3d(input,
filter_shape = [num_filters, num_filter_channels] + filter_size filter_shape = [num_filters, num_filter_channels] + filter_size
def _get_default_param_initializer(): def _get_default_param_initializer():
std = (2.0 / (filter_size[0]**3 * num_channels))**0.5 filter_elem_num = filter_size[0] * filter_size[1] * filter_size[
2] * num_channels
std = (2.0 / filter_elem_num)**0.5
return Normal(0.0, std, 0) return Normal(0.0, std, 0)
filter_param = helper.create_parameter( filter_param = helper.create_parameter(
...@@ -1717,7 +1777,7 @@ def conv3d(input, ...@@ -1717,7 +1777,7 @@ def conv3d(input,
dtype=dtype, dtype=dtype,
default_initializer=_get_default_param_initializer()) default_initializer=_get_default_param_initializer())
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=l_type, type=l_type,
...@@ -1796,8 +1856,8 @@ def sequence_pool(input, pool_type): ...@@ -1796,8 +1856,8 @@ def sequence_pool(input, pool_type):
""" """
helper = LayerHelper('sequence_pool', **locals()) helper = LayerHelper('sequence_pool', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
pool_out = helper.create_tmp_variable(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
max_index = helper.create_tmp_variable(dtype) max_index = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="sequence_pool", type="sequence_pool",
...@@ -1833,7 +1893,7 @@ def sequence_concat(input, name=None): ...@@ -1833,7 +1893,7 @@ def sequence_concat(input, name=None):
out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3]) out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
""" """
helper = LayerHelper('sequence_concat', **locals()) helper = LayerHelper('sequence_concat', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]}) type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
return out return out
...@@ -1960,7 +2020,7 @@ def sequence_slice(input, offset, length, name=None): ...@@ -1960,7 +2020,7 @@ def sequence_slice(input, offset, length, name=None):
""" """
helper = LayerHelper("sequence_slice", **locals()) helper = LayerHelper("sequence_slice", **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
offset.stop_gradient = True offset.stop_gradient = True
length.stop_gradient = True length.stop_gradient = True
...@@ -2046,7 +2106,7 @@ def pool2d(input, ...@@ -2046,7 +2106,7 @@ def pool2d(input,
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
pool_out = helper.create_tmp_variable(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=l_type, type=l_type,
...@@ -2114,7 +2174,7 @@ def pool3d(input, ...@@ -2114,7 +2174,7 @@ def pool3d(input,
l_type = "pool3d" l_type = "pool3d"
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
pool_out = helper.create_tmp_variable(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=l_type, type=l_type,
...@@ -2180,8 +2240,14 @@ def batch_norm(input, ...@@ -2180,8 +2240,14 @@ def batch_norm(input,
is_test(bool, Default False): Used for training or testing. is_test(bool, Default False): Used for training or testing.
momentum(float, Default 0.9): momentum(float, Default 0.9):
epsilon(float, Default 1e-05): epsilon(float, Default 1e-05):
param_attr(ParamAttr): The parameter attribute for Parameter `scale`. param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
data_layout(string, default NCHW): NCHW|NHWC data_layout(string, default NCHW): NCHW|NHWC
in_place(bool, Default False): Make the input and output of batch norm reuse memory. in_place(bool, Default False): Make the input and output of batch norm reuse memory.
name(string, Default None): A name for this layer(optional). If set None, the layer name(string, Default None): A name for this layer(optional). If set None, the layer
...@@ -2201,6 +2267,7 @@ def batch_norm(input, ...@@ -2201,6 +2267,7 @@ def batch_norm(input,
hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.batch_norm(input=hidden1) hidden2 = fluid.layers.batch_norm(input=hidden1)
""" """
assert bias_attr is not False, "bias_attr should not be False in batch_norm."
helper = LayerHelper('batch_norm', **locals()) helper = LayerHelper('batch_norm', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
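`batch_norm` now documents that `param_attr`/`bias_attr` may be None (defaults are created for you) and asserts that `bias_attr` is not False. A sketch with explicit attributes; the Constant initializers below are illustrative choices, not the defaults:

```
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
conv = fluid.layers.conv2d(input=x, num_filters=16, filter_size=3)
bn = fluid.layers.batch_norm(
    input=conv,
    param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)),
    bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(0.0)))
```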
...@@ -2250,10 +2317,13 @@ def batch_norm(input, ...@@ -2250,10 +2317,13 @@ def batch_norm(input,
mean_out = mean mean_out = mean
# variance and variance out share the same memory # variance and variance out share the same memory
variance_out = variance variance_out = variance
saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) saved_mean = helper.create_variable_for_type_inference(
saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) dtype=dtype, stop_gradient=True)
saved_variance = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
batch_norm_out = input if in_place else helper.create_tmp_variable(dtype) batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
dtype)
helper.append_op( helper.append_op(
type="batch_norm", type="batch_norm",
...@@ -2317,19 +2387,28 @@ def layer_norm(input, ...@@ -2317,19 +2387,28 @@ def layer_norm(input,
Args: Args:
input(Variable): The input tensor variable. input(Variable): The input tensor variable.
scale(bool): Whether to learn the adaptive gain :math:`g` after scale(bool): Whether to learn the adaptive gain :math:`g` after
normalization. normalization. Default True.
shift(bool): Whether to learn the adaptive bias :math:`b` after shift(bool): Whether to learn the adaptive bias :math:`b` after
normalization. normalization. Default True.
begin_norm_axis(bool): The normalization will be performed along begin_norm_axis(int): The normalization will be performed along
dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
Default 1.
epsilon(float): The small value added to the variance to prevent epsilon(float): The small value added to the variance to prevent
division by zero. division by zero. Default 1e-05.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`. gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`bias_attr` is None,
a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default None.
act(str): Activation to be applied to the output of layer normalization. act(str): Activation to be applied to the output of layer normalization.
name (str): The name of this layer. It is optional. Default None.
name(str): The name of this layer. It is optional. Default None, and a
unique name would be generated automatically.
Returns: Returns:
${y_comment} ${y_comment}
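With the clarified `layer_norm` arguments (`begin_norm_axis` is an int, and `param_attr`/`bias_attr` only take effect when `scale`/`shift` are enabled), a minimal usage sketch with illustrative shapes:

```
import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
# Normalize over all dimensions from axis 1 onward, with learnable gain/bias.
out = fluid.layers.layer_norm(
    input=data, scale=True, shift=True, begin_norm_axis=1)
```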
...@@ -2361,9 +2440,11 @@ def layer_norm(input, ...@@ -2361,9 +2440,11 @@ def layer_norm(input,
inputs['Bias'] = bias inputs['Bias'] = bias
# create output # create output
mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) mean_out = helper.create_variable_for_type_inference(
variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) dtype=dtype, stop_gradient=True)
layer_norm_out = helper.create_tmp_variable(dtype) variance_out = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
layer_norm_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="layer_norm", type="layer_norm",
...@@ -2470,15 +2551,22 @@ def conv2d_transpose(input, ...@@ -2470,15 +2551,22 @@ def conv2d_transpose(input,
when group=2, the first half of the filters is only connected to the when group=2, the first half of the filters is only connected to the
first half of the input channels, while the second half of the first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels. filters is only connected to the second half of the input channels.
Default: groups=1 Default: groups = 1.
param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
Default: None of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True.
act(str): Activation type. Default: None act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically. Default: None.
Returns: Returns:
Variable: The tensor variable storing the convolution transpose result. Variable: The tensor variable storing the convolution transpose result.
...@@ -2493,7 +2581,7 @@ def conv2d_transpose(input, ...@@ -2493,7 +2581,7 @@ def conv2d_transpose(input,
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
""" """
assert param_attr is not False, "param_attr should not be False in conv2d_transpose."
input_channel = input.shape[1] input_channel = input.shape[1]
op_type = 'conv2d_transpose' op_type = 'conv2d_transpose'
...@@ -2529,6 +2617,7 @@ def conv2d_transpose(input, ...@@ -2529,6 +2617,7 @@ def conv2d_transpose(input,
else: else:
filter_size = utils.convert_to_list(filter_size, 2, filter_size = utils.convert_to_list(filter_size, 2,
'conv2d_transpose.filter_size') 'conv2d_transpose.filter_size')
if output_size is None: if output_size is None:
output_size = [] output_size = []
elif isinstance(output_size, list) or isinstance(output_size, int): elif isinstance(output_size, list) or isinstance(output_size, int):
...@@ -2538,10 +2627,11 @@ def conv2d_transpose(input, ...@@ -2538,10 +2627,11 @@ def conv2d_transpose(input,
padding = utils.convert_to_list(padding, 2, 'padding') padding = utils.convert_to_list(padding, 2, 'padding')
groups = 1 if groups is None else groups groups = 1 if groups is None else groups
filter_shape = [input_channel, num_filters // groups] + filter_size filter_shape = [input_channel, num_filters // groups] + filter_size
img_filter = helper.create_parameter( img_filter = helper.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
pre_bias = helper.create_tmp_variable(dtype=input.dtype) pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type=op_type, type=op_type,
inputs={'Input': [input], inputs={'Input': [input],
...@@ -2650,12 +2740,19 @@ def conv3d_transpose(input, ...@@ -2650,12 +2740,19 @@ def conv3d_transpose(input,
first half of the input channels, while the second half of the first half of the input channels, while the second half of the
filters is only connected to the second half of the input channels. filters is only connected to the second half of the input channels.
Default: groups=1 Default: groups=1
param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
Default: None of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv3d_transpose
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None.
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
act(str): Activation type. Default: None act (str): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -2672,6 +2769,7 @@ def conv3d_transpose(input, ...@@ -2672,6 +2769,7 @@ def conv3d_transpose(input,
data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3) conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
""" """
assert param_attr is not False, "param_attr should not be False in conv3d_transpose."
l_type = "conv3d_transpose" l_type = "conv3d_transpose"
helper = LayerHelper(l_type, **locals()) helper = LayerHelper(l_type, **locals())
if not isinstance(input, Variable): if not isinstance(input, Variable):
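The 3-D variant follows the same attribute semantics; a brief sketch using the example shape from the docstring above, leaving `param_attr`/`bias_attr` as None so the defaults described there apply:

```
import paddle.fluid as fluid

vol = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
# None lets the layer create default ParamAttr objects:
# Xavier-initialized weights and a zero-initialized bias.
up3d = fluid.layers.conv3d_transpose(input=vol, num_filters=2, filter_size=3,
                                     param_attr=None, bias_attr=None)
```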
...@@ -2711,7 +2809,7 @@ def conv3d_transpose(input, ...@@ -2711,7 +2809,7 @@ def conv3d_transpose(input,
img_filter = helper.create_parameter( img_filter = helper.create_parameter(
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
pre_bias = helper.create_tmp_variable(dtype=input.dtype) pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type=l_type, type=l_type,
inputs={'Input': [input], inputs={'Input': [input],
...@@ -2790,7 +2888,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): ...@@ -2790,7 +2888,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
""" """
helper = LayerHelper('sequence_expand', input=x, **locals()) helper = LayerHelper('sequence_expand', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='sequence_expand', type='sequence_expand',
inputs={'X': x, inputs={'X': x,
...@@ -2856,7 +2954,7 @@ def sequence_expand_as(x, y, name=None): ...@@ -2856,7 +2954,7 @@ def sequence_expand_as(x, y, name=None):
""" """
helper = LayerHelper('sequence_expand_as', input=x, **locals()) helper = LayerHelper('sequence_expand_as', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='sequence_expand_as', type='sequence_expand_as',
inputs={'X': x, inputs={'X': x,
...@@ -2901,8 +2999,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): ...@@ -2901,8 +2999,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
helper = LayerHelper('sequence_pad', input=x, **locals()) helper = LayerHelper('sequence_pad', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
length = helper.create_tmp_variable(dtype) length = helper.create_variable_for_type_inference(dtype)
pad_value.stop_gradient = True pad_value.stop_gradient = True
length.stop_gradient = True length.stop_gradient = True
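`sequence_pad` builds two outputs, the padded batch and the pre-padding lengths, with gradients stopped on both the pad value and the lengths. A hedged usage sketch; the feature size and the use of `fluid.layers.assign` to wrap the pad value are assumptions based on the signature shown in the hunk header above:

```
import numpy
import paddle.fluid as fluid

# A batch of variable-length sequences (LoD level 1) with feature size 4.
x = fluid.layers.data(name='x', shape=[4], dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(
    input=numpy.array([0.0], dtype=numpy.float32))
# Pads every sequence to the length of the longest one in the batch
# (or to maxlen, if given); see the two outputs created above.
pad_result = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
```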
...@@ -2967,7 +3065,7 @@ def sequence_unpad(x, length, name=None): ...@@ -2967,7 +3065,7 @@ def sequence_unpad(x, length, name=None):
helper = LayerHelper('sequence_unpad', input=x, **locals()) helper = LayerHelper('sequence_unpad', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
length.stop_gradient = True length.stop_gradient = True
...@@ -3066,8 +3164,9 @@ def beam_search(pre_ids, ...@@ -3066,8 +3164,9 @@ def beam_search(pre_ids,
score_type = scores.dtype score_type = scores.dtype
id_type = ids.dtype id_type = ids.dtype
selected_scores = helper.create_tmp_variable(dtype=score_type) selected_scores = helper.create_variable_for_type_inference(
selected_ids = helper.create_tmp_variable(dtype=id_type) dtype=score_type)
selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
helper.append_op( helper.append_op(
type='beam_search', type='beam_search',
...@@ -3124,8 +3223,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): ...@@ -3124,8 +3223,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
ids, scores, beam_size=5, end_id=0) ids, scores, beam_size=5, end_id=0)
""" """
helper = LayerHelper('beam_search_decode', **locals()) helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
helper.append_op( helper.append_op(
type="beam_search_decode", type="beam_search_decode",
...@@ -3190,10 +3289,18 @@ def lstm_unit(x_t, ...@@ -3190,10 +3289,18 @@ def lstm_unit(x_t,
cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
shape M x S, M for batch size and S for size of lstm unit. shape M x S, M for batch size and S for size of lstm unit.
forget_bias (float): The forget bias of lstm unit. forget_bias (float): The forget bias of lstm unit.
param_attr (ParamAttr): The attributes of parameter weights, used to set param_attr(ParamAttr|None): The parameter attribute for the learnable
initializer, name etc. hidden-hidden weights.
bias_attr (ParamAttr): The attributes of bias weights, if not False, If it is set to None or one attribute of ParamAttr,
bias weights will be created and be set to default value. lstm_unit will create ParamAttr as param_attr.
If the Initializer of the param_attr is not set, the
parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The bias attribute for the learnable bias
weights. If it is set to False, no bias will be added
to the output units. If it is set to None or one attribute of ParamAttr,
lstm_unit will create ParamAttr as bias_attr.
If the Initializer of the bias_attr is not set,
the bias is initialized as zero. Default: None.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -3247,8 +3354,8 @@ def lstm_unit(x_t, ...@@ -3247,8 +3354,8 @@ def lstm_unit(x_t,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr) bias_attr=bias_attr)
dtype = x_t.dtype dtype = x_t.dtype
c = helper.create_tmp_variable(dtype) c = helper.create_variable_for_type_inference(dtype)
h = helper.create_tmp_variable(dtype) h = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='lstm_unit', type='lstm_unit',
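A usage sketch for `lstm_unit` with default attributes, so the ParamAttr objects described in the rewritten docstring are created automatically. The vocabulary and embedding sizes are illustrative, and the (hidden, cell) return order is an assumption based on the two output variables created above:

```
import paddle.fluid as fluid

dict_dim, emb_dim, hidden_dim = 128, 64, 512
ids = fluid.layers.data(name='step_ids', shape=[1], dtype='int64')
x_t = fluid.layers.embedding(input=ids, size=[dict_dim, emb_dim])
prev_hidden = fluid.layers.data(name='prev_hidden', shape=[hidden_dim],
                                dtype='float32')
prev_cell = fluid.layers.data(name='prev_cell', shape=[hidden_dim],
                              dtype='float32')
# param_attr/bias_attr left as None: default ParamAttr objects are created.
hidden_t, cell_t = fluid.layers.lstm_unit(x_t=x_t,
                                          hidden_t_prev=prev_hidden,
                                          cell_t_prev=prev_cell)
```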
...@@ -3302,7 +3409,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): ...@@ -3302,7 +3409,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
""" """
helper = LayerHelper('reduce_sum', **locals()) helper = LayerHelper('reduce_sum', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list): if dim is not None and not isinstance(dim, list):
dim = [dim] dim = [dim]
helper.append_op( helper.append_op(
...@@ -3359,7 +3466,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): ...@@ -3359,7 +3466,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0] fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
""" """
helper = LayerHelper('reduce_mean', **locals()) helper = LayerHelper('reduce_mean', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list): if dim is not None and not isinstance(dim, list):
dim = [dim] dim = [dim]
helper.append_op( helper.append_op(
...@@ -3414,7 +3521,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): ...@@ -3414,7 +3521,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0] fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
""" """
helper = LayerHelper('reduce_max', **locals()) helper = LayerHelper('reduce_max', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list): if dim is not None and not isinstance(dim, list):
dim = [dim] dim = [dim]
helper.append_op( helper.append_op(
...@@ -3469,7 +3576,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): ...@@ -3469,7 +3576,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0] fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
""" """
helper = LayerHelper('reduce_min', **locals()) helper = LayerHelper('reduce_min', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list): if dim is not None and not isinstance(dim, list):
dim = [dim] dim = [dim]
helper.append_op( helper.append_op(
...@@ -3525,7 +3632,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): ...@@ -3525,7 +3632,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0] fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
""" """
helper = LayerHelper('reduce_prod', **locals()) helper = LayerHelper('reduce_prod', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list): if dim is not None and not isinstance(dim, list):
dim = [dim] dim = [dim]
helper.append_op( helper.append_op(
...@@ -3585,7 +3692,7 @@ def split(input, num_or_sections, dim=-1, name=None): ...@@ -3585,7 +3692,7 @@ def split(input, num_or_sections, dim=-1, name=None):
dim], 'len(num_or_sections) must not be more than input.shape[dim].' dim], 'len(num_or_sections) must not be more than input.shape[dim].'
num = len(num_or_sections) num = len(num_or_sections)
outs = [ outs = [
helper.create_tmp_variable(dtype=helper.input_dtype()) helper.create_variable_for_type_inference(dtype=helper.input_dtype())
for i in range(num) for i in range(num)
] ]
helper.append_op( helper.append_op(
...@@ -3642,8 +3749,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): ...@@ -3642,8 +3749,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
axis = 0 axis = 0
helper = LayerHelper("l2_normalize", **locals()) helper = LayerHelper("l2_normalize", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
norm = helper.create_tmp_variable(dtype=x.dtype) norm = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type="norm", type="norm",
inputs={"X": x}, inputs={"X": x},
...@@ -3752,7 +3859,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): ...@@ -3752,7 +3859,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
__check_input(x, y) __check_input(x, y)
helper = LayerHelper('matmul', **locals()) helper = LayerHelper('matmul', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='matmul', type='matmul',
inputs={'X': x, inputs={'X': x,
...@@ -3823,8 +3930,8 @@ def topk(input, k, name=None): ...@@ -3823,8 +3930,8 @@ def topk(input, k, name=None):
top5_values, top5_indices = layers.topk(input, k=5) top5_values, top5_indices = layers.topk(input, k=5)
""" """
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype) values = helper.create_variable_for_type_inference(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64") indices = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="top_k", type="top_k",
inputs={"X": [input]}, inputs={"X": [input]},
...@@ -3882,8 +3989,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): ...@@ -3882,8 +3989,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
# remove some tokens from input and labels # remove some tokens from input and labels
if ignored_tokens is not None and len(ignored_tokens) > 0: if ignored_tokens is not None and len(ignored_tokens) > 0:
erased_input = helper.create_tmp_variable(dtype="int64") erased_input = helper.create_variable_for_type_inference(dtype="int64")
erased_label = helper.create_tmp_variable(dtype="int64") erased_label = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="sequence_erase", type="sequence_erase",
...@@ -3900,8 +4007,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): ...@@ -3900,8 +4007,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
label = erased_label label = erased_label
# edit distance op # edit distance op
edit_distance_out = helper.create_tmp_variable(dtype="int64") edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
sequence_num = helper.create_tmp_variable(dtype="int64") sequence_num = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="edit_distance", type="edit_distance",
inputs={"Hyps": [input], inputs={"Hyps": [input],
...@@ -3976,7 +4083,7 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -3976,7 +4083,7 @@ def ctc_greedy_decoder(input, blank, name=None):
_, topk_indices = topk(input, k=1) _, topk_indices = topk(input, k=1)
# ctc align op # ctc align op
ctc_out = helper.create_tmp_variable(dtype="int64") ctc_out = helper.create_variable_for_type_inference(dtype="int64")
helper.append_op( helper.append_op(
type="ctc_align", type="ctc_align",
inputs={"Input": [topk_indices]}, inputs={"Input": [topk_indices]},
...@@ -4026,8 +4133,8 @@ def warpctc(input, label, blank=0, norm_by_times=False): ...@@ -4026,8 +4133,8 @@ def warpctc(input, label, blank=0, norm_by_times=False):
""" """
helper = LayerHelper('warpctc', **locals()) helper = LayerHelper('warpctc', **locals())
loss_out = helper.create_tmp_variable(dtype=input.dtype) loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
grad_out = helper.create_tmp_variable(dtype=input.dtype) grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type='warpctc', type='warpctc',
inputs={'Logits': [input], inputs={'Logits': [input],
...@@ -4088,7 +4195,7 @@ def sequence_reshape(input, new_dim): ...@@ -4088,7 +4195,7 @@ def sequence_reshape(input, new_dim):
x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
""" """
helper = LayerHelper('sequence_reshape', **locals()) helper = LayerHelper('sequence_reshape', **locals())
out = helper.create_tmp_variable(helper.input_dtype()) out = helper.create_variable_for_type_inference(helper.input_dtype())
helper.append_op( helper.append_op(
type='sequence_reshape', type='sequence_reshape',
inputs={'X': [input]}, inputs={'X': [input]},
...@@ -4107,7 +4214,8 @@ def nce(input, ...@@ -4107,7 +4214,8 @@ def nce(input,
sample_weight=None, sample_weight=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
num_neg_samples=None): num_neg_samples=None,
name=None):
""" """
${comment} ${comment}
...@@ -4118,9 +4226,18 @@ def nce(input, ...@@ -4118,9 +4226,18 @@ def nce(input,
sample_weight (Variable|None): A Variable of shape [batch_size, 1] sample_weight (Variable|None): A Variable of shape [batch_size, 1]
storing a weight for each sample. The default weight for each storing a weight for each sample. The default weight for each
sample is 1.0. sample is 1.0.
param_attr (ParamAttr|None): attributes for parameter param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
bias_attr (ParamAttr|None): attributes for bias of nce. If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, nce
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None.
num_neg_samples (int): ${num_neg_samples_comment} num_neg_samples (int): ${num_neg_samples_comment}
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns: Returns:
Variable: The output nce loss. Variable: The output nce loss.
...@@ -4153,22 +4270,31 @@ def nce(input, ...@@ -4153,22 +4270,31 @@ def nce(input,
""" """
helper = LayerHelper('nce', **locals()) helper = LayerHelper('nce', **locals())
assert isinstance(input, Variable) assert isinstance(input, Variable)
dim = input.shape[1]
assert isinstance(label, Variable) assert isinstance(label, Variable)
dim = input.shape[1]
num_true_class = label.shape[1] num_true_class = label.shape[1]
w = helper.create_parameter( w = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=[num_total_classes, dim], shape=[num_total_classes, dim],
is_bias=False, is_bias=False,
dtype=input.dtype) dtype=input.dtype)
b = helper.create_parameter( inputs = {
attr=helper.bias_attr, 'Input': input,
shape=[num_total_classes, 1], 'Label': label,
is_bias=True, 'Weight': w,
dtype=input.dtype) 'SampleWeight': sample_weight if sample_weight is not None else []
cost = helper.create_tmp_variable(dtype=input.dtype) }
sample_logits = helper.create_tmp_variable(dtype=input.dtype) if helper.bias_attr:
sample_labels = helper.create_tmp_variable(dtype=label.dtype) b = helper.create_parameter(
attr=helper.bias_attr,
shape=[num_total_classes, 1],
is_bias=True,
dtype=input.dtype)
inputs['Bias'] = b
cost = helper.create_variable_for_type_inference(dtype=input.dtype)
sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)
if num_neg_samples is None: if num_neg_samples is None:
num_neg_samples = 10 num_neg_samples = 10
...@@ -4182,13 +4308,7 @@ def nce(input, ...@@ -4182,13 +4308,7 @@ def nce(input,
helper.append_op( helper.append_op(
type='nce', type='nce',
inputs={ inputs=inputs,
'Input': input,
'Label': label,
'Weight': w,
'Bias': b,
'SampleWeight': sample_weight if sample_weight is not None else []
},
outputs={ outputs={
'Cost': cost, 'Cost': cost,
'SampleLogits': sample_logits, 'SampleLogits': sample_logits,
...@@ -4198,7 +4318,12 @@ def nce(input, ...@@ -4198,7 +4318,12 @@ def nce(input,
return cost / (num_neg_samples + 1) return cost / (num_neg_samples + 1)
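The refactor above assembles the op's `inputs` dict incrementally, so the `Bias` input is created and wired only when `bias_attr` is truthy; passing `bias_attr=False` skips the bias parameter entirely. A hedged call sketch (the feature width, class count, and sample count are illustrative only):

```
import paddle.fluid as fluid

feats = fluid.layers.data(name='feats', shape=[32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# bias_attr=False: with the change above, no bias parameter is created and
# the 'Bias' key is simply absent from the op's inputs.
cost = fluid.layers.nce(input=feats,
                        label=label,
                        num_total_classes=10000,
                        num_neg_samples=10,
                        bias_attr=False)
```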
def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): def hsigmoid(input,
label,
num_classes,
param_attr=None,
bias_attr=None,
name=None):
""" """
The hierarchical sigmoid operator is used to accelerate the training The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a process of language model. This operator organizes the classes into a
...@@ -4219,11 +4344,17 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): ...@@ -4219,11 +4344,17 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
label (Variable): The tensor variable contains labels of training data. label (Variable): The tensor variable contains labels of training data.
It's a tensor with shape :math:`[N \\times 1]`. It's a tensor with shape :math:`[N \\times 1]`.
num_classes (int): The number of classes, must not be less than 2. num_classes (int): The number of classes, must not be less than 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
attribute for learnable parameters/weights of this layer. of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter will create ParamAttr as param_attr. If the Initializer of the param_attr
attribute for the bias of this layer. If it is set to False, no is not set, the parameter is initialized with Xavier. Default: None.
bias will be applied. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of hsigmoid.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, hsigmoid
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns: Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. The shape is [N, 1] Out: (Tensor) The cost of hierarchical sigmoid operator. The shape is [N, 1]
...@@ -4239,8 +4370,8 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): ...@@ -4239,8 +4370,8 @@ def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
helper = LayerHelper('hierarchical_sigmoid', **locals()) helper = LayerHelper('hierarchical_sigmoid', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
pre_out = helper.create_tmp_variable(dtype) pre_out = helper.create_variable_for_type_inference(dtype)
dim = input.shape[1] dim = input.shape[1]
if num_classes < 2: if num_classes < 2:
raise ValueError("num_classes must not be less than 2.") raise ValueError("num_classes must not be less than 2.")
...@@ -4300,8 +4431,8 @@ def transpose(x, perm, name=None): ...@@ -4300,8 +4431,8 @@ def transpose(x, perm, name=None):
(idx, perm[idx], len(x.shape))) (idx, perm[idx], len(x.shape)))
helper = LayerHelper('transpose', **locals()) helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype) out = helper.create_variable_for_type_inference(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype)
helper.append_op( helper.append_op(
type='transpose2', type='transpose2',
inputs={'X': [x]}, inputs={'X': [x]},
...@@ -4443,7 +4574,7 @@ def im2sequence(input, ...@@ -4443,7 +4574,7 @@ def im2sequence(input,
inputs["Y"] = input_image_size inputs["Y"] = input_image_size
attrs["out_stride"] = out_stride attrs["out_stride"] = out_stride
helper = LayerHelper('im2sequence', **locals()) helper = LayerHelper('im2sequence', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs) type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
return out return out
...@@ -4476,7 +4607,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None): ...@@ -4476,7 +4607,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
filter_shape = [future_context_size + 1, input.shape[1]] filter_shape = [future_context_size + 1, input.shape[1]]
filter_param = helper.create_parameter( filter_param = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype) attr=helper.param_attr, shape=filter_shape, dtype=dtype)
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='row_conv', type='row_conv',
inputs={'X': [input], inputs={'X': [input],
...@@ -4509,7 +4640,7 @@ def multiplex(inputs, index): ...@@ -4509,7 +4640,7 @@ def multiplex(inputs, index):
raise ValueError("inputs should be a list object and contains at least " raise ValueError("inputs should be a list object and contains at least "
"2 elements.") "2 elements.")
out = helper.create_tmp_variable(inputs[0].dtype) out = helper.create_variable_for_type_inference(inputs[0].dtype)
helper.append_op( helper.append_op(
type='multiplex', type='multiplex',
inputs={'X': inputs, inputs={'X': inputs,
...@@ -4580,8 +4711,8 @@ def softmax_with_cross_entropy(logits, ...@@ -4580,8 +4711,8 @@ def softmax_with_cross_entropy(logits,
logits=fc, label=label) logits=fc, label=label)
""" """
helper = LayerHelper('softmax_with_cross_entropy', **locals()) helper = LayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_tmp_variable(dtype=logits.dtype) softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_tmp_variable(dtype=logits.dtype) loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
helper.append_op( helper.append_op(
type='softmax_with_cross_entropy', type='softmax_with_cross_entropy',
inputs={'Logits': logits, inputs={'Logits': logits,
...@@ -4631,8 +4762,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): ...@@ -4631,8 +4762,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
""" """
helper = LayerHelper('smooth_l1_loss', **locals()) helper = LayerHelper('smooth_l1_loss', **locals())
diff = helper.create_tmp_variable(dtype=x.dtype) diff = helper.create_variable_for_type_inference(dtype=x.dtype)
loss = helper.create_tmp_variable(dtype=x.dtype) loss = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='smooth_l1_loss', type='smooth_l1_loss',
inputs={ inputs={
...@@ -4665,7 +4796,7 @@ def one_hot(input, depth): ...@@ -4665,7 +4796,7 @@ def one_hot(input, depth):
one_hot_label = layers.one_hot(input=label, depth=10) one_hot_label = layers.one_hot(input=label, depth=10)
""" """
helper = LayerHelper("one_hot", **locals()) helper = LayerHelper("one_hot", **locals())
one_hot_out = helper.create_tmp_variable(dtype='float32') one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
helper.append_op( helper.append_op(
type="one_hot", type="one_hot",
inputs={'X': input}, inputs={'X': input},
...@@ -4807,8 +4938,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4807,8 +4938,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape2", **locals()) helper = LayerHelper("reshape2", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
x_shape = helper.create_tmp_variable(dtype=x.dtype) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape2", type="reshape2",
inputs=inputs, inputs=inputs,
...@@ -4857,8 +4988,8 @@ def squeeze(input, axes, name=None): ...@@ -4857,8 +4988,8 @@ def squeeze(input, axes, name=None):
y = layers.squeeze(input=x, axes=[1]) y = layers.squeeze(input=x, axes=[1])
""" """
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type="squeeze2", type="squeeze2",
inputs={"X": input}, inputs={"X": input},
...@@ -4894,8 +5025,8 @@ def unsqueeze(input, axes, name=None): ...@@ -4894,8 +5025,8 @@ def unsqueeze(input, axes, name=None):
y = layers.unsqueeze(input=x, axes=[1]) y = layers.unsqueeze(input=x, axes=[1])
""" """
helper = LayerHelper("unsqueeze", **locals()) helper = LayerHelper("unsqueeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
type="unsqueeze2", type="unsqueeze2",
inputs={"X": input}, inputs={"X": input},
...@@ -4985,7 +5116,7 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -4985,7 +5116,7 @@ def lod_reset(x, y=None, target_lod=None):
out = layers.lod_reset(x=x, y=y) out = layers.lod_reset(x=x, y=y)
""" """
helper = LayerHelper("lod_reset", **locals()) helper = LayerHelper("lod_reset", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
if y is not None: if y is not None:
helper.append_op( helper.append_op(
type="lod_reset", inputs={'X': x, type="lod_reset", inputs={'X': x,
...@@ -5054,8 +5185,9 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): ...@@ -5054,8 +5185,9 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"dims of input must be 4(not %d), and it's order must be NCHW" % "dims of input must be 4(not %d), and it's order must be NCHW" %
(dims)) (dims))
mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) mid_out = helper.create_variable_for_type_inference(
lrn_out = helper.create_tmp_variable(dtype) dtype=dtype, stop_gradient=True)
lrn_out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="lrn", type="lrn",
inputs={"X": input}, inputs={"X": input},
...@@ -5120,7 +5252,7 @@ def pad(x, paddings, pad_value=0., name=None): ...@@ -5120,7 +5252,7 @@ def pad(x, paddings, pad_value=0., name=None):
""" """
helper = LayerHelper('pad', input=x, **locals()) helper = LayerHelper('pad', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='pad', type='pad',
inputs={'X': x}, inputs={'X': x},
...@@ -5200,7 +5332,7 @@ def pad_constant_like(x, y, pad_value=0., name=None): ...@@ -5200,7 +5332,7 @@ def pad_constant_like(x, y, pad_value=0., name=None):
""" """
helper = LayerHelper('pad_constant_like', input=x, **locals()) helper = LayerHelper('pad_constant_like', input=x, **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='pad_constant_like', type='pad_constant_like',
inputs={'X': x, inputs={'X': x,
...@@ -5265,7 +5397,7 @@ def label_smooth(label, ...@@ -5265,7 +5397,7 @@ def label_smooth(label,
raise ValueError("The value of epsilon must be between 0 and 1.") raise ValueError("The value of epsilon must be between 0 and 1.")
helper = LayerHelper("label_smooth", **locals()) helper = LayerHelper("label_smooth", **locals())
label.stop_gradient = True label.stop_gradient = True
smooth_label = helper.create_tmp_variable(dtype) smooth_label = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="label_smooth", type="label_smooth",
inputs={"X": label, inputs={"X": label,
...@@ -5297,8 +5429,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): ...@@ -5297,8 +5429,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
""" """
helper = LayerHelper('roi_pool', **locals()) helper = LayerHelper('roi_pool', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
pool_out = helper.create_tmp_variable(dtype) pool_out = helper.create_variable_for_type_inference(dtype)
argmaxes = helper.create_tmp_variable(dtype='int32') argmaxes = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op( helper.append_op(
type="roi_pool", type="roi_pool",
inputs={"X": input, inputs={"X": input,
...@@ -5313,6 +5445,54 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): ...@@ -5313,6 +5445,54 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
return pool_out return pool_out
@templatedoc()
def roi_align(input,
rois,
pooled_height=1,
pooled_width=1,
spatial_scale=1.0,
sampling_ratio=-1,
name=None):
"""
${comment}
Args:
input (Variable): ${x_comment}
rois (Variable): ROIs (Regions of Interest) to pool over.
pooled_height (integer): ${pooled_height_comment} Default: 1
pooled_width (integer): ${pooled_width_comment} Default: 1
spatial_scale (float): ${spatial_scale_comment} Default: 1.0
sampling_ratio (integer): ${sampling_ratio_comment} Default: -1
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
Returns:
Variable: ${out_comment}.
Examples:
.. code-block:: python
align_out = fluid.layers.roi_align(input=x,
rois=rois,
pooled_height=7,
pooled_width=7,
spatial_scale=0.5,
sampling_ratio=-1)
"""
helper = LayerHelper('roi_align', **locals())
dtype = helper.input_dtype()
align_out = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="roi_align",
inputs={"X": input,
"ROIs": rois},
outputs={"Out": align_out},
attrs={
"pooled_height": pooled_height,
"pooled_width": pooled_width,
"spatial_scale": spatial_scale,
"sampling_ratio": sampling_ratio
})
return align_out
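Unlike `roi_pool`, `roi_align` is generally described (in the Mask R-CNN formulation) as avoiding coordinate quantization: each output bin averages several bilinearly interpolated samples, with `sampling_ratio` controlling how many samples per bin (-1 meaning an adaptive count). Whether this op follows that formulation exactly is an assumption here; a tiny NumPy sketch of bilinear sampling at one fractional point:

```
import numpy as np

def bilinear_sample(feat, y, x):
    """Bilinearly interpolate a 2-D feature map at fractional point (y, x)."""
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1 = min(y0 + 1, feat.shape[0] - 1)
    x1 = min(x0 + 1, feat.shape[1] - 1)
    ly, lx = y - y0, x - x0
    return (feat[y0, x0] * (1 - ly) * (1 - lx) +
            feat[y0, x1] * (1 - ly) * lx +
            feat[y1, x0] * ly * (1 - lx) +
            feat[y1, x1] * ly * lx)

feat = np.arange(16, dtype=np.float32).reshape(4, 4)
print(bilinear_sample(feat, 1.5, 2.25))  # 8.25, a weighted mix of 4 neighbours
```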
def dice_loss(input, label, epsilon=0.00001): def dice_loss(input, label, epsilon=0.00001):
""" """
Dice loss for comparing the similarity of two batch of data, Dice loss for comparing the similarity of two batch of data,
...@@ -5423,7 +5603,7 @@ def image_resize(input, ...@@ -5423,7 +5603,7 @@ def image_resize(input,
out_h = int(input.shape[2] * scale) out_h = int(input.shape[2] * scale)
out_w = int(input.shape[3] * scale) out_w = int(input.shape[3] * scale)
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type=resample_methods[resample], type=resample_methods[resample],
inputs=inputs, inputs=inputs,
...@@ -5532,7 +5712,7 @@ def gather(input, index): ...@@ -5532,7 +5712,7 @@ def gather(input, index):
""" """
helper = LayerHelper('gather', **locals()) helper = LayerHelper('gather', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="gather", type="gather",
inputs={"X": input, inputs={"X": input,
...@@ -5572,7 +5752,7 @@ def scatter(input, index, updates, name=None): ...@@ -5572,7 +5752,7 @@ def scatter(input, index, updates, name=None):
""" """
helper = LayerHelper('scatter', **locals()) helper = LayerHelper('scatter', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="scatter", type="scatter",
inputs={"X": input, inputs={"X": input,
...@@ -5632,7 +5812,7 @@ def sequence_scatter(input, index, updates, name=None): ...@@ -5632,7 +5812,7 @@ def sequence_scatter(input, index, updates, name=None):
""" """
helper = LayerHelper('sequence_scatter', **locals()) helper = LayerHelper('sequence_scatter', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="sequence_scatter", type="sequence_scatter",
inputs={"X": input, inputs={"X": input,
...@@ -5662,7 +5842,7 @@ def random_crop(x, shape, seed=None): ...@@ -5662,7 +5842,7 @@ def random_crop(x, shape, seed=None):
""" """
helper = LayerHelper("random_crop", **locals()) helper = LayerHelper("random_crop", **locals())
dtype = x.dtype dtype = x.dtype
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
if seed is None: if seed is None:
seed = np.random.randint(-65536, 65536) seed = np.random.randint(-65536, 65536)
op_attrs = {"shape": shape} op_attrs = {"shape": shape}
...@@ -5708,7 +5888,7 @@ def log(x, name=None): ...@@ -5708,7 +5888,7 @@ def log(x, name=None):
""" """
helper = LayerHelper('log', **locals()) helper = LayerHelper('log', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
return out return out
...@@ -5739,7 +5919,7 @@ def relu(x, name=None): ...@@ -5739,7 +5919,7 @@ def relu(x, name=None):
""" """
helper = LayerHelper('relu', **locals()) helper = LayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
return out return out
...@@ -5778,9 +5958,9 @@ def mean_iou(input, label, num_classes): ...@@ -5778,9 +5958,9 @@ def mean_iou(input, label, num_classes):
""" """
helper = LayerHelper('mean_iou', **locals()) helper = LayerHelper('mean_iou', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
out_mean_iou = helper.create_tmp_variable(dtype='float32') out_mean_iou = helper.create_variable_for_type_inference(dtype='float32')
out_wrong = helper.create_tmp_variable(dtype='int32') out_wrong = helper.create_variable_for_type_inference(dtype='int32')
out_correct = helper.create_tmp_variable(dtype='int32') out_correct = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op( helper.append_op(
type="mean_iou", type="mean_iou",
inputs={"Predictions": input, inputs={"Predictions": input,
...@@ -5872,7 +6052,7 @@ def crop(x, shape=None, offsets=None, name=None): ...@@ -5872,7 +6052,7 @@ def crop(x, shape=None, offsets=None, name=None):
if offsets is None: if offsets is None:
offsets = [0] * len(x.shape) offsets = [0] * len(x.shape)
out = helper.create_tmp_variable(x.dtype) out = helper.create_variable_for_type_inference(x.dtype)
ipts = {'X': x} ipts = {'X': x}
attrs = {} attrs = {}
if isinstance(shape, Variable): if isinstance(shape, Variable):
...@@ -5952,7 +6132,7 @@ def rank_loss(label, left, right, name=None): ...@@ -5952,7 +6132,7 @@ def rank_loss(label, left, right, name=None):
if not (isinstance(right, Variable)): if not (isinstance(right, Variable)):
raise ValueError("The right should be a Variable") raise ValueError("The right should be a Variable")
out = helper.create_tmp_variable("float32") out = helper.create_variable_for_type_inference("float32")
helper.append_op( helper.append_op(
type='rank_loss', type='rank_loss',
...@@ -5998,8 +6178,8 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): ...@@ -5998,8 +6178,8 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
raise ValueError("The left should be a Variable.") raise ValueError("The left should be a Variable.")
if not isinstance(right, Variable): if not isinstance(right, Variable):
raise ValueError("The right should be a Variable.") raise ValueError("The right should be a Variable.")
out = helper.create_tmp_variable(left.dtype) out = helper.create_variable_for_type_inference(left.dtype)
act = helper.create_tmp_variable(left.dtype) act = helper.create_variable_for_type_inference(left.dtype)
helper.append_op( helper.append_op(
type='margin_rank_loss', type='margin_rank_loss',
inputs={"Label": label, inputs={"Label": label,
...@@ -6084,7 +6264,7 @@ def pad2d(input, ...@@ -6084,7 +6264,7 @@ def pad2d(input,
helper = LayerHelper('pad2d', **locals()) helper = LayerHelper('pad2d', **locals())
dtype = helper.input_dtype(input_param_name='input') dtype = helper.input_dtype(input_param_name='input')
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='pad2d', type='pad2d',
inputs={'X': input}, inputs={'X': input},
...@@ -6113,7 +6293,7 @@ def elu(x, alpha=1.0, name=None): ...@@ -6113,7 +6293,7 @@ def elu(x, alpha=1.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('elu', **locals()) helper = LayerHelper('elu', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='elu', type='elu',
inputs={'X': x}, inputs={'X': x},
...@@ -6136,7 +6316,7 @@ def relu6(x, threshold=6.0, name=None): ...@@ -6136,7 +6316,7 @@ def relu6(x, threshold=6.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('relu6', **locals()) helper = LayerHelper('relu6', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='relu6', type='relu6',
inputs={'X': x}, inputs={'X': x},
...@@ -6159,7 +6339,7 @@ def pow(x, factor=1.0, name=None): ...@@ -6159,7 +6339,7 @@ def pow(x, factor=1.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('pow', **locals()) helper = LayerHelper('pow', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='pow', type='pow',
inputs={'X': x}, inputs={'X': x},
...@@ -6183,7 +6363,7 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): ...@@ -6183,7 +6363,7 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('stanh', **locals()) helper = LayerHelper('stanh', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='stanh', type='stanh',
inputs={'X': x}, inputs={'X': x},
...@@ -6208,7 +6388,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): ...@@ -6208,7 +6388,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('hard_sigmoid', **locals()) helper = LayerHelper('hard_sigmoid', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='hard_sigmoid', type='hard_sigmoid',
inputs={'X': x}, inputs={'X': x},
...@@ -6232,7 +6412,7 @@ def swish(x, beta=1.0, name=None): ...@@ -6232,7 +6412,7 @@ def swish(x, beta=1.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('swish', **locals()) helper = LayerHelper('swish', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='swish', type='swish',
inputs={'X': x}, inputs={'X': x},
...@@ -6284,7 +6464,7 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -6284,7 +6464,7 @@ def prelu(x, mode, param_attr=None, name=None):
dtype='float32', dtype='float32',
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0))
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type="prelu", type="prelu",
inputs={"X": x, inputs={"X": x,
...@@ -6308,7 +6488,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): ...@@ -6308,7 +6488,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('brelu', **locals()) helper = LayerHelper('brelu', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='brelu', type='brelu',
inputs={'X': x}, inputs={'X': x},
...@@ -6331,7 +6511,7 @@ def leaky_relu(x, alpha=0.02, name=None): ...@@ -6331,7 +6511,7 @@ def leaky_relu(x, alpha=0.02, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('leaky_relu', **locals()) helper = LayerHelper('leaky_relu', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='leaky_relu', type='leaky_relu',
inputs={'X': x}, inputs={'X': x},
...@@ -6353,7 +6533,7 @@ def soft_relu(x, threshold=40.0, name=None): ...@@ -6353,7 +6533,7 @@ def soft_relu(x, threshold=40.0, name=None):
output(${out_type}): ${out_comment} output(${out_type}): ${out_comment}
""" """
helper = LayerHelper('soft_relu', **locals()) helper = LayerHelper('soft_relu', **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='soft_relu', type='soft_relu',
inputs={'X': x}, inputs={'X': x},
...@@ -6420,8 +6600,8 @@ def flatten(x, axis=1, name=None): ...@@ -6420,8 +6600,8 @@ def flatten(x, axis=1, name=None):
if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0: if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
raise ValueError("The axis should be a int, and in range [0, rank(x)]") raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype) out = helper.create_variable_for_type_inference(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype)
helper.append_op( helper.append_op(
type='flatten2', type='flatten2',
inputs={"X": x}, inputs={"X": x},
...@@ -6467,7 +6647,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): ...@@ -6467,7 +6647,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
""" """
helper = LayerHelper('sequence_enumerate', **locals()) helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True) out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
helper.append_op( helper.append_op(
type='sequence_enumerate', type='sequence_enumerate',
inputs={'X': input}, inputs={'X': input},
...@@ -6507,9 +6688,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): ...@@ -6507,9 +6688,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
helper = LayerHelper('sequence_mask', **locals()) helper = LayerHelper('sequence_mask', **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
else: else:
out = helper.create_tmp_variable(dtype=dtype, name=name) out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
helper.append_op( helper.append_op(
type='sequence_mask', type='sequence_mask',
...@@ -6552,7 +6733,7 @@ def stack(x, axis=0): ...@@ -6552,7 +6733,7 @@ def stack(x, axis=0):
if not isinstance(x, list) and not isinstance(x, tuple): if not isinstance(x, list) and not isinstance(x, tuple):
x = [x] x = [x]
out = helper.create_tmp_variable(x[0].dtype) out = helper.create_variable_for_type_inference(x[0].dtype)
helper.append_op( helper.append_op(
type='stack', inputs={'X': x}, outputs={'Y': out}, type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis}) attrs={'axis': axis})
...@@ -6590,7 +6771,7 @@ def unstack(x, axis=0, num=None): ...@@ -6590,7 +6771,7 @@ def unstack(x, axis=0, num=None):
outs = [] outs = []
for _ in range(num): for _ in range(num):
outs.append(helper.create_tmp_variable(x.dtype)) outs.append(helper.create_variable_for_type_inference(x.dtype))
helper.append_op( helper.append_op(
type='unstack', type='unstack',
...@@ -6642,7 +6823,7 @@ def expand(x, expand_times, name=None): ...@@ -6642,7 +6823,7 @@ def expand(x, expand_times, name=None):
""" """
helper = LayerHelper('expand', input=x, **locals()) helper = LayerHelper('expand', input=x, **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='expand', type='expand',
inputs={'X': x}, inputs={'X': x},
...@@ -6681,7 +6862,7 @@ def uniform_random_batch_size_like(input, ...@@ -6681,7 +6862,7 @@ def uniform_random_batch_size_like(input,
""" """
helper = LayerHelper('uniform_random_batch_size_like', **locals()) helper = LayerHelper('uniform_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op( helper.append_op(
type='uniform_random_batch_size_like', type='uniform_random_batch_size_like',
...@@ -6718,7 +6899,7 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): ...@@ -6718,7 +6899,7 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
""" """
helper = LayerHelper('gaussian_random', **locals()) helper = LayerHelper('gaussian_random', **locals())
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op( helper.append_op(
type='gaussian_random', type='gaussian_random',
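A minimal end-to-end sketch for `gaussian_random`: the layer adds an op that draws a fresh sample of the requested shape on every execution. The executor boilerplate below assumes the standard Fluid program/executor workflow:

```
import paddle.fluid as fluid

# A [2, 3] float32 sample from N(0, 1), redrawn on every run.
noise = fluid.layers.gaussian_random(shape=[2, 3], mean=0.0, std=1.0)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
sample, = exe.run(fluid.default_main_program(), fetch_list=[noise])
```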
...@@ -6753,7 +6934,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): ...@@ -6753,7 +6934,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
""" """
helper = LayerHelper('sampling_id', **locals()) helper = LayerHelper('sampling_id', **locals())
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
helper.append_op( helper.append_op(
type='sampling_id', type='sampling_id',
inputs={'X': x}, inputs={'X': x},
...@@ -6792,7 +6973,7 @@ def gaussian_random_batch_size_like(input, ...@@ -6792,7 +6973,7 @@ def gaussian_random_batch_size_like(input,
""" """
helper = LayerHelper('gaussian_random_batch_size_like', **locals()) helper = LayerHelper('gaussian_random_batch_size_like', **locals())
out = helper.create_tmp_variable(dtype) out = helper.create_variable_for_type_inference(dtype)
c_dtype = convert_np_dtype_to_dtype_(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype)
helper.append_op( helper.append_op(
type='gaussian_random_batch_size_like', type='gaussian_random_batch_size_like',
...@@ -6824,7 +7005,8 @@ def sum(x): ...@@ -6824,7 +7005,8 @@ def sum(x):
""" """
helper = LayerHelper('sum', **locals()) helper = LayerHelper('sum', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) out = helper.create_variable_for_type_inference(
dtype=helper.input_dtype('x'))
helper.append_op( helper.append_op(
type='sum', type='sum',
inputs={'X': x}, inputs={'X': x},
...@@ -6851,7 +7033,8 @@ def slice(input, axes, starts, ends): ...@@ -6851,7 +7033,8 @@ def slice(input, axes, starts, ends):
""" """
helper = LayerHelper('slice', **locals()) helper = LayerHelper('slice', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) out = helper.create_variable_for_type_inference(
dtype=helper.input_dtype('input'))
helper.append_op( helper.append_op(
type='slice', type='slice',
inputs={'Input': input}, inputs={'Input': input},
...@@ -6877,7 +7060,8 @@ def shape(input): ...@@ -6877,7 +7060,8 @@ def shape(input):
""" """
helper = LayerHelper('shape', **locals()) helper = LayerHelper('shape', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) out = helper.create_variable_for_type_inference(
dtype=helper.input_dtype('input'))
helper.append_op( helper.append_op(
type='shape', inputs={'Input': input}, outputs={'Out': out}) type='shape', inputs={'Input': input}, outputs={'Out': out})
...@@ -6894,7 +7078,7 @@ def _elementwise_op(helper): ...@@ -6894,7 +7078,7 @@ def _elementwise_op(helper):
use_mkldnn = helper.kwargs.get('use_mkldnn', False) use_mkldnn = helper.kwargs.get('use_mkldnn', False)
name = helper.kwargs.get('name', None) name = helper.kwargs.get('name', None)
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -6928,7 +7112,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -6928,7 +7112,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
helper = LayerHelper('scale', **locals()) helper = LayerHelper('scale', **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
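Several layers below (`scale`, `clip`, `clip_by_norm`, `mean`, `mul`, `sigmoid_cross_entropy_with_logits`, `maxout`) share the output-naming branch shown above: without a name, an anonymous inference-typed variable is created; with a name, a non-persistable variable under that name is created instead. A small call sketch for `scale`; the output name `scaled_x` is illustrative:

```
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3], dtype='float32')
# y = 2*x + 1, with the bias applied after scaling; naming the output makes
# the resulting variable addressable as 'scaled_x' in the program.
y = fluid.layers.scale(x, scale=2.0, bias=1.0, bias_after_scale=True,
                       name='scaled_x')
```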
...@@ -6994,7 +7178,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): ...@@ -6994,7 +7178,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
if out is None: if out is None:
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7102,7 +7286,7 @@ def clip(x, min, max, name=None): ...@@ -7102,7 +7286,7 @@ def clip(x, min, max, name=None):
helper = LayerHelper("clip", **locals()) helper = LayerHelper("clip", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7134,7 +7318,7 @@ def clip_by_norm(x, max_norm, name=None): ...@@ -7134,7 +7318,7 @@ def clip_by_norm(x, max_norm, name=None):
helper = LayerHelper("clip_by_norm", **locals()) helper = LayerHelper("clip_by_norm", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7164,7 +7348,7 @@ def mean(x, name=None): ...@@ -7164,7 +7348,7 @@ def mean(x, name=None):
helper = LayerHelper("mean", **locals()) helper = LayerHelper("mean", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7194,7 +7378,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): ...@@ -7194,7 +7378,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
helper = LayerHelper("mul", **locals()) helper = LayerHelper("mul", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7228,7 +7412,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): ...@@ -7228,7 +7412,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7258,7 +7442,7 @@ def maxout(x, groups, name=None): ...@@ -7258,7 +7442,7 @@ def maxout(x, groups, name=None):
helper = LayerHelper("maxout", **locals()) helper = LayerHelper("maxout", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
...@@ -7297,7 +7481,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): ...@@ -7297,7 +7481,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
helper = LayerHelper("affine_channel", **locals()) helper = LayerHelper("affine_channel", **locals())
if name is None: if name is None:
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
else: else:
out = helper.create_variable( out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
......
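The hunks above uniformly replace `helper.create_tmp_variable` with `helper.create_variable_for_type_inference` when a layer allocates its output variable. A minimal sketch of the resulting pattern, assuming the standard `LayerHelper` API; `my_unary_op` is a hypothetical operator name used only for illustration:

```python
from paddle.fluid.layer_helper import LayerHelper

def my_unary_op(x, name=None):
    # Hypothetical layer wrapper following the pattern used throughout this diff.
    helper = LayerHelper('my_unary_op', **locals())
    if name is None:
        # Output exists only for shape/dtype inference; it is never fetched by name.
        out = helper.create_variable_for_type_inference(dtype=x.dtype)
    else:
        out = helper.create_variable(name=name, dtype=x.dtype, persistable=False)
    helper.append_op(type='my_unary_op', inputs={'X': x}, outputs={'Out': out})
    return out
```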
...@@ -152,7 +152,7 @@ def cast(x, dtype): ...@@ -152,7 +152,7 @@ def cast(x, dtype):
result = fluid.layers.cast(x=data, dtype='float64') result = fluid.layers.cast(x=data, dtype='float64')
""" """
helper = LayerHelper('cast', **locals()) helper = LayerHelper('cast', **locals())
out = helper.create_tmp_variable(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
helper.append_op( helper.append_op(
type='cast', type='cast',
inputs={'X': [x]}, inputs={'X': [x]},
...@@ -184,7 +184,7 @@ def concat(input, axis=0, name=None): ...@@ -184,7 +184,7 @@ def concat(input, axis=0, name=None):
out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
""" """
helper = LayerHelper('concat', **locals()) helper = LayerHelper('concat', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='concat', type='concat',
inputs={'X': input}, inputs={'X': input},
...@@ -221,7 +221,8 @@ def sums(input, out=None): ...@@ -221,7 +221,8 @@ def sums(input, out=None):
""" """
helper = LayerHelper('sum', **locals()) helper = LayerHelper('sum', **locals())
if out is None: if out is None:
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_variable_for_type_inference(
dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='sum', type='sum',
inputs={'X': input}, inputs={'X': input},
...@@ -252,7 +253,7 @@ def assign(input, output=None): ...@@ -252,7 +253,7 @@ def assign(input, output=None):
""" """
helper = LayerHelper('assign', **locals()) helper = LayerHelper('assign', **locals())
if output is None: if output is None:
output = helper.create_tmp_variable(dtype=input.dtype) output = helper.create_variable_for_type_inference(dtype=input.dtype)
if isinstance(input, Variable): if isinstance(input, Variable):
helper.append_op( helper.append_op(
type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
...@@ -311,7 +312,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): ...@@ -311,7 +312,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
helper = LayerHelper("fill_constant", **locals()) helper = LayerHelper("fill_constant", **locals())
if out is None: if out is None:
out = helper.create_tmp_variable(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
helper.append_op( helper.append_op(
type='fill_constant', type='fill_constant',
inputs={}, inputs={},
...@@ -358,7 +359,7 @@ def fill_constant_batch_size_like(input, ...@@ -358,7 +359,7 @@ def fill_constant_batch_size_like(input,
${out_comment}. ${out_comment}.
""" """
helper = LayerHelper("fill_constant_batch_size_like", **locals()) helper = LayerHelper("fill_constant_batch_size_like", **locals())
out = helper.create_tmp_variable(dtype=dtype) out = helper.create_variable_for_type_inference(dtype=dtype)
helper.append_op( helper.append_op(
type='fill_constant_batch_size_like', type='fill_constant_batch_size_like',
inputs={'Input': input}, inputs={'Input': input},
...@@ -396,7 +397,7 @@ def argmin(x, axis=0): ...@@ -396,7 +397,7 @@ def argmin(x, axis=0):
out = fluid.layers.argmin(x=in, axis=-1) out = fluid.layers.argmin(x=in, axis=-1)
""" """
helper = LayerHelper("arg_min", **locals()) helper = LayerHelper("arg_min", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64) out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
helper.append_op( helper.append_op(
type='arg_min', type='arg_min',
inputs={'X': x}, inputs={'X': x},
...@@ -427,7 +428,7 @@ def argmax(x, axis=0): ...@@ -427,7 +428,7 @@ def argmax(x, axis=0):
out = fluid.layers.argmax(x=in, axis=-1) out = fluid.layers.argmax(x=in, axis=-1)
""" """
helper = LayerHelper("arg_max", **locals()) helper = LayerHelper("arg_max", **locals())
out = helper.create_tmp_variable(VarDesc.VarType.INT64) out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
helper.append_op( helper.append_op(
type='arg_max', type='arg_max',
inputs={'X': x}, inputs={'X': x},
...@@ -477,8 +478,10 @@ def argsort(input, axis=-1, name=None): ...@@ -477,8 +478,10 @@ def argsort(input, axis=-1, name=None):
out, indices = fluid.layers.argsort(input, axis=0) out, indices = fluid.layers.argsort(input, axis=0)
""" """
helper = LayerHelper("argsort", **locals()) helper = LayerHelper("argsort", **locals())
out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True) out = helper.create_variable_for_type_inference(
ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True) dtype=input.dtype, stop_gradient=True)
ids = helper.create_variable_for_type_inference(
VarDesc.VarType.INT64, stop_gradient=True)
helper.append_op( helper.append_op(
type='argsort', type='argsort',
inputs={'X': input}, inputs={'X': input},
...@@ -562,7 +565,7 @@ def reverse(x, axis): ...@@ -562,7 +565,7 @@ def reverse(x, axis):
if isinstance(axis, int): if isinstance(axis, int):
axis = [axis] axis = [axis]
helper = LayerHelper("reverse", **locals()) helper = LayerHelper("reverse", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='reverse', type='reverse',
inputs={'Input': x}, inputs={'Input': x},
...@@ -654,7 +657,7 @@ def has_inf(x): ...@@ -654,7 +657,7 @@ def has_inf(x):
Variable: The tensor variable storing the output, only a bool value. Variable: The tensor variable storing the output, only a bool value.
""" """
helper = LayerHelper("isinf", **locals()) helper = LayerHelper("isinf", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
return out return out
...@@ -670,7 +673,7 @@ def has_nan(x): ...@@ -670,7 +673,7 @@ def has_nan(x):
Variable: The tensor variable storing the output, only a bool value. Variable: The tensor variable storing the output, only a bool value.
""" """
helper = LayerHelper("isnan", **locals()) helper = LayerHelper("isnan", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
return out return out
...@@ -687,6 +690,6 @@ def isfinite(x): ...@@ -687,6 +690,6 @@ def isfinite(x):
Variable: The tensor variable storing the output, contains a bool value. Variable: The tensor variable storing the output, contains a bool value.
""" """
helper = LayerHelper("isfinite", **locals()) helper = LayerHelper("isfinite", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
return out return out
...@@ -64,23 +64,33 @@ def simple_img_conv_pool(input, ...@@ -64,23 +64,33 @@ def simple_img_conv_pool(input,
average-pooling. Default :math:`max`. average-pooling. Default :math:`max`.
global_pooling (bool): Whether to use the global pooling. If global_pooling = true, global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
pool_size and pool_padding will be ignored. Default False pool_size and pool_padding will be ignored. Default False
conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a conv_stride (int|list|tuple): The stride size of the conv2d Layer. If stride is a
list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise, list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1. the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is conv_padding (int|list|tuple): The padding size of the conv2d Layer. If padding is
a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W). a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0. Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is conv_dilation (int|list|tuple): The dilation size of the conv2d Layer. If dilation is
a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W). a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1. Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
conv_groups (int): The groups number of the Conv2d Layer. According to grouped conv_groups (int): The groups number of the conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2, convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1 connected to the second half of the input channels. Default: groups=1.
param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
act (str): Activation type for Conv2d. Default: None will create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`.
Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
act (str): Activation type for conv2d, if it is set to None, activation is not
appended. Default: None.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
......
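A minimal usage sketch of `fluid.nets.simple_img_conv_pool` consistent with the argument descriptions above; the layer name and sizes are illustrative only:

```python
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.simple_img_conv_pool(
    input=img,
    num_filters=20,      # number of conv2d filters
    filter_size=5,       # conv2d kernel size
    pool_size=2,         # pooling window
    pool_stride=2,       # pooling stride
    act='relu',          # activation appended to the conv2d output
    param_attr=None,     # conv2d creates a default weight attribute
    bias_attr=None)      # conv2d creates a default bias attribute
```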
...@@ -151,7 +151,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): ...@@ -151,7 +151,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
decay = block.create_var( decay = block.create_var(
dtype="float32", dtype="float32",
shape=param.shape, shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS) type=core.VarDesc.VarType.LOD_TENSOR)
block.append_op( block.append_op(
type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
block.append_op( block.append_op(
...@@ -228,7 +228,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -228,7 +228,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
decay = block.create_var( decay = block.create_var(
dtype="float32", dtype="float32",
shape=param.shape, shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS) type=core.VarDesc.VarType.LOD_TENSOR)
block.append_op( block.append_op(
type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
block.append_op( block.append_op(
...@@ -237,6 +237,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -237,6 +237,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
'Ids': idx}, 'Ids': idx},
outputs={'Out': decay}, outputs={'Out': decay},
attrs={'is_sparse': True}) attrs={'is_sparse': True})
param = decay
# Append sign op # Append sign op
block.append_op( block.append_op(
......
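For context, the weight decay handled here is usually attached either per parameter through `ParamAttr` or globally on the optimizer. A hedged sketch assuming the public `fluid.regularizer` API; the coefficient values are arbitrary:

```python
import paddle.fluid as fluid

# Per-parameter: this weight attribute carries its own L2 decay.
fc_w_attr = fluid.ParamAttr(
    name='fc_w',
    regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4))

# Global: parameters without their own regularizer fall back to this one.
sgd = fluid.optimizer.SGD(
    learning_rate=0.01,
    regularization=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4))
```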
set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory") set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import random
from op_test import OpTest
from test_seq_conv import seqconv
class TestSeqConvEltAddRelu(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'fusion_seqconv_eltadd_relu'
self.lod = [[6, 4]]
self.in_fea_size = 16
self.out_fea_size = 8
self.context_length = 4
self.context_stride = 1
self.context_start = 0
self.set_conf()
assert self.context_stride == 1
T = sum(self.lod[0])
x = np.random.uniform(-1, 1, [T, self.in_fea_size]).astype('float32')
w = np.random.uniform(
-1, 1, [self.in_fea_size * self.context_length,
self.out_fea_size]).astype('float32')
b = np.random.uniform(-2, 1, [1, self.out_fea_size]).astype('float32')
out = seqconv(x, self.lod, w, self.context_length, self.context_start)
out = np.maximum(out + b, 0)
self.inputs = {'X': (x, self.lod), 'Filter': w, 'Bias': b}
self.attrs = {
'contextStart': self.context_start,
'contextLength': self.context_length,
'contextStride': self.context_stride
}
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
class TestSeqConvEltAddReluBS1(TestSeqConvEltAddRelu):
def set_conf(self):
self.lod = [[10]]
class TestSeqConvEltAddReluBS1Case2(TestSeqConvEltAddRelu):
def set_conf(self):
self.lod = [[2]]
class TestSeqConvEltAddReluCase1(TestSeqConvEltAddRelu):
def set_conf(self):
self.lod = [[3, 5, 1, 6]]
self.context_length = 3
self.context_start = -2
class TestSeqConvEltAddReluCase2(TestSeqConvEltAddRelu):
def set_conf(self):
self.lod = [[10, 1, 2, 4, 1, 5, 6]]
self.in_fea_size = 2
self.context_length = 4
self.context_start = -1
class TestSeqConvEltAddReluCase3(TestSeqConvEltAddRelu):
def set_conf(self):
self.lod = [[10, 1, 2, 4, 1, 5, 6]]
self.context_length = 5
self.context_start = -4
if __name__ == '__main__':
unittest.main()
...@@ -465,6 +465,16 @@ class TestBook(unittest.TestCase): ...@@ -465,6 +465,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
print(str(program)) print(str(program))
def test_roi_align(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
rois = layers.data(
name="rois", shape=[4], dtype="float32", lod_level=1)
output = layers.roi_align(x, rois, 14, 14, 0.5, 2)
self.assertIsNotNone(output)
print(str(program))
def test_resize_bilinear(self): def test_resize_bilinear(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -37,7 +37,7 @@ def PolygonBoxRestore(input): ...@@ -37,7 +37,7 @@ def PolygonBoxRestore(input):
indexes = indexes.repeat( indexes = indexes.repeat(
[batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w] [batch_size], axis=0) # [batch_size, geo_channels/2, 2, h, w]
return indexes.reshape( return indexes.reshape(
input.shape) - input # [batch_size, geo_channels, h, w] input.shape) * 4 - input # [batch_size, geo_channels, h, w]
class TestPolygonBoxRestoreOp(OpTest): class TestPolygonBoxRestoreOp(OpTest):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
import sys
from op_test import OpTest
class TestROIAlignOp(OpTest):
def set_data(self):
self.init_test_case()
self.make_rois()
self.calc_roi_align()
self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
self.attrs = {
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width,
'sampling_ratio': self.sampling_ratio
}
self.outputs = {'Out': self.out_data}
def init_test_case(self):
self.batch_size = 3
self.channels = 3
self.height = 8
self.width = 6
# n, c, h, w
self.x_dim = (self.batch_size, self.channels, self.height, self.width)
self.spatial_scale = 1.0 / 2.0
self.pooled_height = 2
self.pooled_width = 2
self.sampling_ratio = -1
self.x = np.random.random(self.x_dim).astype('float32')
def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
bin_size_h, bin_size_w):
count = roi_bin_grid_h * roi_bin_grid_w
bilinear_pos = np.zeros(
[self.channels, self.pooled_height, self.pooled_width, count, 4],
np.float32)
bilinear_w = np.zeros(
[self.pooled_height, self.pooled_width, count, 4], np.float32)
for ph in range(self.pooled_width):
for pw in range(self.pooled_height):
c = 0
for iy in range(roi_bin_grid_h):
y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \
bin_size_h / roi_bin_grid_h
for ix in range(roi_bin_grid_w):
x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \
bin_size_w / roi_bin_grid_w
if y < -1.0 or y > self.height or \
x < -1.0 or x > self.width:
continue
if y <= 0:
y = 0
if x <= 0:
x = 0
y_low = int(y)
x_low = int(x)
if y_low >= self.height - 1:
y = y_high = y_low = self.height - 1
else:
y_high = y_low + 1
if x_low >= self.width - 1:
x = x_high = x_low = self.width - 1
else:
x_high = x_low + 1
ly = y - y_low
lx = x - x_low
hy = 1 - ly
hx = 1 - lx
for ch in range(self.channels):
bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low,
x_low]
bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low,
x_high]
bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high,
x_low]
bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high,
x_high]
bilinear_w[ph, pw, c, 0] = hy * hx
bilinear_w[ph, pw, c, 1] = hy * lx
bilinear_w[ph, pw, c, 2] = ly * hx
bilinear_w[ph, pw, c, 3] = ly * lx
c = c + 1
return bilinear_pos, bilinear_w
def calc_roi_align(self):
self.out_data = np.zeros(
(self.rois_num, self.channels, self.pooled_height,
self.pooled_width)).astype('float32')
for i in range(self.rois_num):
roi = self.rois[i]
roi_batch_id = int(roi[0])
x_i = self.x[roi_batch_id]
roi_xmin = roi[1] * self.spatial_scale
roi_ymin = roi[2] * self.spatial_scale
roi_xmax = roi[3] * self.spatial_scale
roi_ymax = roi[4] * self.spatial_scale
roi_width = max(roi_xmax - roi_xmin, 1)
roi_height = max(roi_ymax - roi_ymin, 1)
bin_size_h = float(roi_height) / float(self.pooled_height)
bin_size_w = float(roi_width) / float(self.pooled_width)
roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
math.ceil(roi_height / self.pooled_height)
roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \
math.ceil(roi_width / self.pooled_width)
count = int(roi_bin_grid_h * roi_bin_grid_w)
pre_size = count * self.pooled_width * self.pooled_height
bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin,
int(roi_bin_grid_h),
int(roi_bin_grid_w),
bin_size_h, bin_size_w)
for ch in range(self.channels):
align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1)
output_val = align_per_bin.mean(axis=-1)
self.out_data[i, ch, :, :] = output_val
def make_rois(self):
rois = []
self.rois_lod = [[]]
for bno in range(self.batch_size):
self.rois_lod[0].append(bno + 1)
for i in range(bno + 1):
x1 = np.random.random_integers(
0, self.width // self.spatial_scale - self.pooled_width)
y1 = np.random.random_integers(
0, self.height // self.spatial_scale - self.pooled_height)
x2 = np.random.random_integers(x1 + self.pooled_width,
self.width // self.spatial_scale)
y2 = np.random.random_integers(
y1 + self.pooled_height, self.height // self.spatial_scale)
roi = [bno, x1, y1, x2, y2]
rois.append(roi)
self.rois_num = len(rois)
self.rois = np.array(rois).astype("float32")
def setUp(self):
self.op_type = "roi_align"
self.set_data()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
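The reference computation above accumulates four bilinear weights (`hy*hx`, `hy*lx`, `ly*hx`, `ly*lx`) per sampling point. A small self-contained sketch of the same interpolation for a single point, added for clarity; it is not part of the test:

```python
import numpy as np

def bilinear_sample(feat, x, y):
    # Interpolate feat at fractional (x, y) from its four integer neighbours.
    h, w = feat.shape
    x_low, y_low = int(x), int(y)
    x_high, y_high = min(x_low + 1, w - 1), min(y_low + 1, h - 1)
    lx, ly = x - x_low, y - y_low
    hx, hy = 1.0 - lx, 1.0 - ly
    return (hy * hx * feat[y_low, x_low] + hy * lx * feat[y_low, x_high] +
            ly * hx * feat[y_high, x_low] + ly * lx * feat[y_high, x_high])

# The centre of a 2x2 map interpolates to the mean of its four values (1.5).
print(bilinear_sample(np.arange(4, dtype=np.float32).reshape(2, 2), 0.5, 0.5))
```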
...@@ -20,6 +20,53 @@ import random ...@@ -20,6 +20,53 @@ import random
from op_test import OpTest from op_test import OpTest
def seqconv(x,
lod,
filter,
context_length,
context_start,
padding_trainable=False,
padding_data=None):
[T, M] = x.shape
col = np.zeros((T, context_length * M)).astype('float32')
offset = [0]
for seq_len in lod[0]:
offset.append(offset[-1] + seq_len)
begin_pad = np.max([0, -context_start])
for i in range(len(offset) - 1):
for j in range(context_length):
in_begin = offset[i] + context_start + j
in_end = offset[i + 1] + context_start + j
out_begin = offset[i]
out_end = offset[i + 1]
if in_begin < offset[i]:
pad_size = np.min(
[offset[i] - in_begin, offset[i + 1] - offset[i]])
if padding_trainable:
sub_w = padding_data[j:j + pad_size, :]
col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
M] = sub_w
out_begin = offset[i] + pad_size
in_begin = offset[i]
if in_end > offset[i + 1]:
pad_size = np.min(
[in_end - offset[i + 1], offset[i + 1] - offset[i]])
if padding_trainable:
sub_w = padding_data[begin_pad + context_start + j -
pad_size:begin_pad + context_start +
j, :]
col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
M] = sub_w
in_end = offset[i + 1]
out_end = offset[i + 1] - pad_size
if in_end <= in_begin:
continue
in_sub = x[in_begin:in_end, :]
col[out_begin:out_end, j * M:(j + 1) * M] += in_sub
return np.dot(col, filter)
class TestSeqProject(OpTest): class TestSeqProject(OpTest):
def setUp(self): def setUp(self):
self.init_test_case() self.init_test_case()
...@@ -66,57 +113,9 @@ class TestSeqProject(OpTest): ...@@ -66,57 +113,9 @@ class TestSeqProject(OpTest):
'paddingTrainable': self.padding_trainable, 'paddingTrainable': self.padding_trainable,
'contextStride': self.context_stride 'contextStride': self.context_stride
} }
out = np.zeros( out = seqconv(x, self.lod, w, self.context_length, self.context_start,
(self.input_size[0], self.output_represention)).astype('float32') self.padding_trainable, self.pad_data)
self.outputs = {'Out': out} self.outputs = {'Out': out}
self.compute()
def compute(self):
x, lod = self.inputs['X']
filter = self.inputs['Filter']
pading_data = self.pad_data
out = np.zeros((self.input_size[0], self.context_length *
self.input_size[1])).astype('float32')
offset = [0]
for seq_len in lod[0]:
offset.append(offset[-1] + seq_len)
begin_pad = np.max([0, -self.context_start])
for i in range(len(offset) - 1):
for j in range(self.context_length):
in_begin = offset[i] + self.context_start + j
in_end = offset[i + 1] + self.context_start + j
out_begin = offset[i]
out_end = offset[i + 1]
if in_begin < offset[i]:
pad_size = np.min(
[offset[i] - in_begin, offset[i + 1] - offset[i]])
if self.padding_trainable:
sub_w = pading_data[j:j + pad_size, :]
out[offset[i]:offset[i] + pad_size, j * self.input_size[
1]:(j + 1) * self.input_size[1]] = sub_w
out_begin = offset[i] + pad_size
in_begin = offset[i]
if in_end > offset[i + 1]:
pad_size = np.min(
[in_end - offset[i + 1], offset[i + 1] - offset[i]])
if self.padding_trainable:
sub_w = pading_data[begin_pad + self.context_start + j -
pad_size:begin_pad +
self.context_start + j, :]
out[offset[i + 1] - pad_size:offset[i + 1], j * self.
input_size[1]:(j + 1) * self.input_size[1]] = sub_w
in_end = offset[i + 1]
out_end = offset[i + 1] - pad_size
if in_end <= in_begin:
continue
in_sub = x[in_begin:in_end, :]
out[out_begin:out_end, j * self.input_size[1]:(j + 1) *
self.input_size[1]] += in_sub
np.dot(out, filter, out=self.outputs['Out'])
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
......
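The shared `seqconv` reference extracted above is an im2col expansion followed by a matmul; with `context_length=1` and `context_start=0` it degenerates to a plain matrix product, which gives a quick sanity check. A sketch only, assuming `seqconv` is importable from `test_seq_conv` as in the fused test:

```python
import numpy as np
from test_seq_conv import seqconv

x = np.random.rand(5, 3).astype('float32')   # T=5 time steps, M=3 features
w = np.random.rand(3, 2).astype('float32')   # (context_length * M) x out_size
lod = [[2, 3]]                                # two sequences of length 2 and 3
assert np.allclose(seqconv(x, lod, w, 1, 0), np.dot(x, w), atol=1e-5)
```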
...@@ -30,7 +30,6 @@ class TestSliceVar(unittest.TestCase): ...@@ -30,7 +30,6 @@ class TestSliceVar(unittest.TestCase):
var = program.global_block().create_var( var = program.global_block().create_var(
name=str(random.randint(10000, 99999)), name=str(random.randint(10000, 99999)),
persistable=True, persistable=True,
# dtype=core.VarDesc.VarType.LOD_TENSOR,
shape=shape) shape=shape)
var_list.append(var) var_list.append(var)
blocks = slice_variable(var_list, 10, min_size) blocks = slice_variable(var_list, 10, min_size)
......
...@@ -74,7 +74,7 @@ class InferenceTranspiler(object): ...@@ -74,7 +74,7 @@ class InferenceTranspiler(object):
''' '''
Transpile the program fusing elementwise_add into conv for MKLDNN Transpile the program fusing elementwise_add into conv for MKLDNN
program. Elementwise add following convolution OP can be fused by adding program. Elementwise add following convolution OP can be fused by adding
'fuse_eltwise' attribute to convolution OP and replacing its output 'fuse_residual_connection' attribute to convolution OP and replacing its output
Tensor with second parameter of elementwise_add. Tensor with second parameter of elementwise_add.
The result of fuse is: The result of fuse is:
- before: - before:
...@@ -92,7 +92,8 @@ class InferenceTranspiler(object): ...@@ -92,7 +92,8 @@ class InferenceTranspiler(object):
if current_op.type in ['conv2d']: if current_op.type in ['conv2d']:
next_op = self.block.ops[i + 1] next_op = self.block.ops[i + 1]
if next_op.type == 'elementwise_add': if next_op.type == 'elementwise_add':
self._fuse_conv_eltwise(current_op, next_op) self._fuse_conv_eltwise(i, current_op, next_op)
self.block._remove_op(i + 1) # Remove old conv
self.block._remove_op(i + 1) # Remove elementwise_add self.block._remove_op(i + 1) # Remove elementwise_add
i = i + 1 i = i + 1
self._adjust_input() self._adjust_input()
...@@ -444,7 +445,7 @@ class InferenceTranspiler(object): ...@@ -444,7 +445,7 @@ class InferenceTranspiler(object):
outputs={"Output": out_var}, outputs={"Output": out_var},
attrs=attrs) attrs=attrs)
def _fuse_conv_eltwise(self, conv_op, eltwise_op): def _fuse_conv_eltwise(self, index, conv_op, eltwise_op):
''' '''
fuse the conv op with elementwise_add fuse the conv op with elementwise_add
...@@ -454,9 +455,30 @@ class InferenceTranspiler(object): ...@@ -454,9 +455,30 @@ class InferenceTranspiler(object):
:type eltwise_op: Operator :type eltwise_op: Operator
''' '''
conv_op._set_attr("fuse_eltwise", True) eltwise_input = "X"
self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] if eltwise_op.input("X")[0] == conv_op.output("Output")[0]:
self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0] eltwise_input = "Y"
residual_var = self.block.vars[eltwise_op.input(eltwise_input)[0]]
out_var = self.block.vars[eltwise_op.output("Out")[0]]
filter_var = self.block.vars[conv_op.input("Filter")[0]]
in_var = self.block.vars[conv_op.input("Input")[0]]
bias_var = self.block.vars[conv_op.input("Bias")[0]]
conv_op._set_attr("fuse_residual_connection", True)
attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
self.block._insert_op(
index,
type="conv2d",
inputs={
"Input": in_var,
"Filter": filter_var,
"Bias": bias_var,
"ResidualData": residual_var
},
outputs={"Output": out_var},
attrs=attrs)
def _adjust_input(self): def _adjust_input(self):
for i in range(len(self.block.ops)): for i in range(len(self.block.ops)):
......
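A hedged usage sketch of the fusion path touched here: build a program containing a conv2d followed by an elementwise_add, then run `InferenceTranspiler` over it. The fusion is assumed to take effect only when MKL-DNN is enabled (e.g. `FLAGS_use_mkldnn=1`); shapes and names are illustrative:

```python
import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name='x', shape=[3, 8, 8], dtype='float32')
    conv = fluid.layers.conv2d(x, num_filters=3, filter_size=3, padding=1)
    out = fluid.layers.elementwise_add(conv, x)   # residual-style add after conv

place = fluid.CPUPlace()
fluid.Executor(place).run(startup)
t = fluid.transpiler.InferenceTranspiler()
t.transpile(main, place)   # rewrites conv2d + elementwise_add into one fused conv2d
```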