Commit 9c9ad7d4 authored by dzhwinter

Merge remote-tracking branch 'origin/develop' into feature/ir_inplace_pass

test=develop
@@ -52,8 +52,8 @@ function(op_library TARGET)
     endif()
     if(WITH_MKLDNN)
       string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
-        list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
+        list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
       endif()
     endif()
   else()
......
@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
@@ -142,10 +142,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
 paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1))
+paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -322,9 +322,10 @@ paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_class
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
+paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
@@ -361,6 +362,9 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
 paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
......
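For orientation, a minimal Python sketch of the new align_corners/align_mode switches on the resize layers listed above (input shape and out_shape are illustrative assumptions, not part of the diff):

# Hedged sketch: exercising the new align_corners / align_mode arguments.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
y = fluid.layers.resize_bilinear(x, out_shape=[64, 64], align_corners=True, align_mode=1)
z = fluid.layers.resize_nearest(x, out_shape=[64, 64], align_corners=False)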
@@ -10,8 +10,22 @@ function(pass_library TARGET DEST)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
+  set(targetPrefix "")
+
+  # Get optional argument
+  set(extraMacroArgs ${ARGN})
+  list(LENGTH extraMacroArgs numExtraMacroArgs)
+  if(numExtraMacroArgs GREATER 0)
+    list(GET extraMacroArgs 0 targetPrefix)
+  endif()
+
   cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+  if(targetPrefix)
+    cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+  else()
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+  endif()
   # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
   if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
     message(STATUS "add pass ${TARGET} ${DEST}")
@@ -51,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
+pass_library(identity_scale_op_clean_pass base)

 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -62,11 +77,11 @@ foreach (index RANGE 3 6)
 endforeach()

 if(WITH_MKLDNN)
-  pass_library(mkldnn_placement_pass base)
-  pass_library(depthwise_conv_mkldnn_pass base)
-  pass_library(conv_bias_mkldnn_fuse_pass inference)
-  pass_library(conv_relu_mkldnn_fuse_pass inference)
-  pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+  pass_library(mkldnn_placement_pass base mkldnn)
+  pass_library(depthwise_conv_mkldnn_pass base mkldnn)
+  pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
+  pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
+  pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
 endif()

 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -86,7 +101,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
-  cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
-  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-  cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
+  cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+  cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("identity_scale_op_clean", graph.get());
// pre_op -> scale_in -> scale_op -> scale_out
// ->
// pre_op -> scale_out
GraphPatternDetector detector;
auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
auto scale_in = detector.mutable_pattern()
->NewNode("scale_in")
->assert_is_op_input("scale")
->AsIntermediate();
auto scale_op = detector.mutable_pattern()
->NewNode("scale_fuse")
->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale");
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
GraphPatternDetector::handle_t handler = [&](
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* scale_op_var = subgraph.at(scale_op);
Node* scale_in_var = subgraph.at(scale_in);
Node* scale_out_var = subgraph.at(scale_out);
Node* pre_op_var = subgraph.at(pre_op);
// Link pre_op directly to scale_out
const std::string scale_in_name = scale_in_var->Name();
const std::string scale_out_name = scale_out_var->Name();
// Remove links in graph
GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
// Modify proto message
auto* pre_op_desc = pre_op_var->Op();
for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
auto* arguments = parameter.mutable_arguments();
auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
PADDLE_ENFORCE(it != arguments->end());
*it = scale_out_name;
}
IR_NODE_LINK_TO(pre_op_var, scale_out_var);
};
detector(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(identity_scale_op_clean_pass,
paddle::framework::ir::IdentityScaleOpCleanPass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class IdentityScaleOpCleanPass : public FusePassBase {
protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
private:
virtual ~IdentityScaleOpCleanPass() = default;
};
} // namespace ir
} // namespace framework
} // namespace paddle
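To make the pattern in the pass concrete: a scale op with scale=1.0 and bias=0.0 is a pure identity, and this pass deletes it, rewiring the producer's output directly to scale_out. A hedged Python sketch of the exact no-op this cleans away (layer names and shapes are illustrative assumptions):

# Hedged sketch: building the identity-scale pattern removed at inference time.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
y = fluid.layers.fc(x, size=32)                 # the "pre_op" in the pattern comment
z = fluid.layers.scale(y, scale=1.0, bias=0.0)  # identity scale; cleaned by the pass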
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
 #include <functional>
 #include <string>
 #include <vector>
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include <functional>
 #include <list>
 #include <map>
......
@@ -15,8 +15,8 @@
 #include <gtest/gtest.h>
 #include <string>

-#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"

 namespace paddle {
 namespace framework {
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
......
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"

 namespace paddle {
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include <gtest/gtest.h>
......
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
 #include <string>

 namespace paddle {
......
@@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument,
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
     }

-    // graph_ = pass->Apply(std::move(graph_));
     pre_pass = pass_name;

     passes_.emplace_back(std::move(pass));
@@ -97,8 +96,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
   PADDLE_ENFORCE(graph.get());
   // Apply all the passes
   for (const auto &pass : passes_) {
-    if (pass->Type() == "graph_viz_pass") continue;
-    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
+    if (pass->Type() != "graph_viz_pass") {
+      PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
+    }
     graph = pass->Apply(std::move(graph));
   }
   return std::move(graph);
......
@@ -318,4 +318,9 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
   return config;
 }

+void AnalysisConfig::SwitchIrDebug(int x) {
+  ir_debug_ = x;
+  Update();
+}
+
 }  // namespace paddle
@@ -196,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) {
   AnalysisConfig config(FLAGS_dirname);
   config.DisableGpu();
   config.EnableMemoryOptim(true);
-  config.pass_builder()->TurnOnDebug();
+  config.SwitchIrDebug();

   auto native_predictor =
       CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
......
@@ -140,9 +140,12 @@ struct AnalysisConfig {
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }

-  /** Control whther to debug IR graph analysis phase.
+  /** \brief Control whether to debug IR graph analysis phase.
+   *
+   * This will generate DOT files for visualizing the computation graph after
+   * each analysis pass applied.
    */
-  void SwitchIrDebug(int x = true) { ir_debug_ = x; }
+  void SwitchIrDebug(int x = true);

   /** Turn on MKLDNN.
    */
......
@@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy {
         "conv_bn_fuse_pass",             //
         "conv_eltwiseadd_bn_fuse_pass",  //
         "is_test_pass",                  //
+        "identity_scale_op_clean_pass",  //
     });
     use_gpu_ = false;
   }
@@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy {
   GpuPassStrategy() : PassStrategy({}) {
     passes_.assign({
         "infer_clean_graph_pass",                    //
+        "identity_scale_op_clean_pass",              //
         "conv_affine_channel_fuse_pass",             //
         "conv_eltwiseadd_affine_channel_fuse_pass",  //
         "conv_bn_fuse_pass",                         //
......
@@ -128,9 +128,9 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
         "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)

-# bert, max_len=20
-set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert20")
-download_model_and_data(${BERT_INSTALL_DIR} "bert_model.tar.gz" "bert_data_len20.txt.tar.gz")
+# bert, max_len=20, embedding_dim=128
+set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
+download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL)

 # anakin
......
@@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
-  cfg->pass_builder()->TurnOnDebug();
+  cfg->SwitchIrDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
......
@@ -69,7 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Text_Classification, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg.pass_builder()->TurnOnDebug();
+  cfg.SwitchIrDebug();
   std::vector<PaddleTensor> outputs;

   std::vector<std::vector<PaddleTensor>> input_slots_all;
......
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-#cc_binary(visualizer SRCS visualizer.cc DEPS analysis
-#  paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
+cc_binary(visualizer SRCS visualizer.cc DEPS analysis
+  paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/activation_op.h"
 #include <string>
-#include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"

 namespace paddle {
......
@@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("selected_scores",
               "A LoDTensor containing the accumulated scores corresponding to "
               "Output(selected_ids).");
+    AddOutput(
+        "parent_idx",
+        "A Tensor preserving each selected id's parent index in pre_ids.");

     // Attributes stored in AttributeMap
     AddAttr<int>("level", "the level of LoDTensor");
......
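A hedged Python sketch of how the new output surfaces through the beam_search layer; only return_parent_idx comes from this diff, the surrounding topk/log plumbing is an illustrative assumption:

# Hedged sketch: requesting the new parent_idx output from beam_search.
import paddle.fluid as fluid

pre_ids = fluid.layers.data(name='pre_ids', shape=[1], dtype='int64', lod_level=2)
pre_scores = fluid.layers.data(name='pre_scores', shape=[1], dtype='float32', lod_level=2)
probs = fluid.layers.data(name='probs', shape=[10000], dtype='float32', lod_level=1)
topk_scores, topk_indices = fluid.layers.topk(probs, k=4)
accu_scores = fluid.layers.elementwise_add(
    x=fluid.layers.log(topk_scores),
    y=fluid.layers.reshape(pre_scores, shape=[-1]),
    axis=0)
selected_ids, selected_scores, parent_idx = fluid.layers.beam_search(
    pre_ids, pre_scores, topk_indices, accu_scores, beam_size=4, end_id=1,
    return_parent_idx=True)  # parent_idx indexes each selected id's parent in pre_ids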
@@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
+    auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
+    PADDLE_ENFORCE_NOT_NULL(parent_idx);

     math::BeamSearchFunctor<DeviceContext, T> alg;
     alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
-        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
-        is_accumulated);
+        ids, scores, selected_ids, selected_scores, parent_idx, level,
+        beam_size, end_id, is_accumulated);
   }
 };
......
@@ -31,6 +31,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
     polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
+detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
+detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)

 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
......
@@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
     }
   }
 }

+template <class T>
+void ClipTiledBoxes(const platform::DeviceContext& ctx,
+                    const framework::Tensor& im_info,
+                    const framework::Tensor& input_boxes,
+                    framework::Tensor* out) {
+  T* out_data = out->mutable_data<T>(ctx.GetPlace());
+  const T* im_info_data = im_info.data<T>();
+  const T* input_boxes_data = input_boxes.data<T>();
+  T zero(0);
+  T im_w = round(im_info_data[1] / im_info_data[2]);
+  T im_h = round(im_info_data[0] / im_info_data[2]);
+  for (int64_t i = 0; i < input_boxes.numel(); ++i) {
+    if (i % 4 == 0) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+    } else if (i % 4 == 1) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+    } else if (i % 4 == 2) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+    } else {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
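For intuition, a hedged numpy transcription of ClipTiledBoxes (function name and test values are illustrative, not from the diff):

# Hedged numpy sketch of the clipping rule above: even offsets are x
# coordinates clamped to [0, im_w - 1], odd offsets are y coordinates
# clamped to [0, im_h - 1]; im_w/im_h are recovered from ImInfo = (h, w, scale).
import numpy as np

def clip_tiled_boxes(boxes, im_info):
    h, w, scale = im_info
    im_w, im_h = round(w / scale), round(h / scale)
    out = boxes.astype(np.float64).reshape(-1, 4)
    out[:, 0::2] = np.clip(out[:, 0::2], 0, im_w - 1)  # xmin, xmax
    out[:, 1::2] = np.clip(out[:, 1::2], 0, im_h - 1)  # ymin, ymax
    return out.reshape(boxes.shape)

print(clip_tiled_boxes(np.array([[-5., 10., 900., 700.]]), (600., 800., 1.0)))
# [[  0.  10. 799. 599.]]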
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class BoxClipOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of BoxClipOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
"Input(ImInfo) of BoxClipOp should not be null.");
auto input_box_dims = ctx->GetInputDim("Input");
auto im_info_dims = ctx->GetInputDim("ImInfo");
if (ctx->IsRuntime()) {
auto input_box_size = input_box_dims.size();
PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4,
"The last dimension of Input must be 4");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(Input) in BoxClipOp must be 2");
PADDLE_ENFORCE_EQ(im_info_dims[1], 3,
"The last dimension of ImInfo must be 3");
}
ctx->ShareDim("Input", /*->*/ "Output");
ctx->ShareLoD("Input", /*->*/ "Output");
}
};
class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
"(LoDTensor) "
"Input is a LoDTensor with shape [..., 4] holds 4 points"
"in last dimension in format [xmin, ymin, xmax, ymax]");
AddInput("ImInfo",
"(Tensor) Information for image reshape is in shape (N, 3), "
"in format (height, width, im_scale)");
AddOutput("Output",
"(LoDTensor) "
"Output is a LoDTensor with the same shape as Input"
"and it is the result after clip");
AddComment(R"DOC(
This operator clips input boxes to original input images.
For each input box, the formula is given as follows:
$$xmin = \max(\min(xmin, im_w - 1), 0)$$
$$ymin = \max(\min(ymin, im_h - 1), 0)$$
$$xmax = \max(\min(xmax, im_w - 1), 0)$$
$$ymax = \max(\min(ymax, im_h - 1), 0)$$
where im_w and im_h are computed from ImInfo, the formula is given as follows:
$$im_w = \text{round}(width / im_scale)$$
$$im_h = \text{round}(height / im_scale)$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
box_clip, ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int ImInfoSize = 3;
template <typename T, int BlockSize>
static __global__ void GPUBoxClip(const T *input, const size_t *lod,
const size_t width, const T *im_info,
T *output) {
T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] /
im_info[blockIdx.x * ImInfoSize + 2]);
T im_h = round(im_info[blockIdx.x * ImInfoSize] /
im_info[blockIdx.x * ImInfoSize + 2]);
for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width;
i += BlockSize) {
int idx = lod[blockIdx.x] * width + i;
T im_size = (idx % 2 == 0) ? im_w : im_h;
output[idx] = max(min(input[idx], im_size - 1), T(0.));
}
}
template <typename DeviceContext, typename T>
class GPUBoxClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto *input = context.Input<LoDTensor>("Input");
auto *im_info = context.Input<Tensor>("ImInfo");
auto *output = context.Output<LoDTensor>("Output");
const int64_t num = input->dims()[0];
const int64_t bbox_width = input->numel() / num;
auto lod = input->lod();
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto &dev_ctx = context.template device_context<DeviceContext>();
auto stream = dev_ctx.stream();
const size_t batch_size = lod.back().size() - 1;
T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
bbox_width, im_info->data<T>(), output_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>,
ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class BoxClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input_box = context.Input<LoDTensor>("Input");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* output_box = context.Output<LoDTensor>("Output");
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
output_box->mutable_data<T>(context.GetPlace());
if (input_box->lod().size()) {
PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
auto box_lod = input_box->lod().back();
int64_t n = static_cast<int64_t>(box_lod.size() - 1);
for (int i = 0; i < n; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice);
}
}
};
} // namespace operators
} // namespace paddle
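A hedged end-to-end sketch of the Python layer these kernels back (tensor names, shapes, and lod_level are illustrative assumptions):

# Hedged sketch: calling the new box_clip layer.
import paddle.fluid as fluid

boxes = fluid.layers.data(name='boxes', shape=[8, 4], dtype='float32', lod_level=1)
im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')
clipped = fluid.layers.box_clip(input=boxes, im_info=im_info)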
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/detection/box_coder_op.h"
+#include <vector>

 namespace paddle {
 namespace operators {
@@ -32,32 +33,57 @@ class BoxCoderOp : public framework::OperatorWithKernel {
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
-                        "The rank of Input of PriorBoxVar must be 2");
+                        "The rank of Input PriorBox must be 2");
       PADDLE_ENFORCE_EQ(prior_box_dims[1], 4,
                         "The shape of PriorBox is [N, 4]");
       if (ctx->HasInput("PriorBoxVar")) {
         auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+        PADDLE_ENFORCE(
+            prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
+            "The rank of Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
+        if (prior_box_var_dims.size() == 1) {
+          PADDLE_ENFORCE_EQ(
+              prior_box_var_dims[0], 4,
+              "The 1st dimension of Input(PriorBoxVar) should be 4 "
+              "when the rank is 1.");
+        } else {
+          PADDLE_ENFORCE_EQ(
+              prior_box_dims, prior_box_var_dims,
+              "The dimension of Input(PriorBoxVar) should be equal to "
+              "the dimension of Input(PriorBox) when the rank is 2.");
+        }
       }
+    }

-      auto code_type =
-          GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
-      if (code_type == BoxCodeType::kEncodeCenterSize) {
-        PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                          "The rank of Input of TargetBox must be 2");
-        PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                          "The shape of TargetBox is [M, 4]");
-      } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-        PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
-                          "The rank of Input of TargetBox must be 3");
-        PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-        PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    int axis = ctx->Attrs().Get<int>("axis");
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                        "The rank of Input TargetBox must be 2");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                        "The shape of TargetBox is [M, 4]");
+      ctx->SetOutputDim(
+          "OutputBox",
+          framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+                        "The rank of Input TargetBox must be 3");
+      if (axis == 0) {
+        PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+      } else if (axis == 1) {
+        PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
+      } else {
+        PADDLE_THROW("axis must be 0 or 1.");
       }
+      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+      ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
     }

-    ctx->SetOutputDim(
-        "OutputBox",
-        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
-    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+    if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) {
+      ctx->ShareLoD("PriorBox", /*->*/ "OutputBox");
+    } else {
+      ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+    }
   }
 };
@@ -100,6 +126,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
               "(bool, default true) "
               "whether to treat the priorbox as a normalized box")
         .SetDefault(true);
+    AddAttr<int>("axis",
+                 "(int, default 0) "
+                 "which axis in PriorBox to broadcast for box decode, "
+                 "for example, if axis is 0 and TargetBox has shape "
+                 "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox "
+                 "will broadcast to [N, M, 4] for decoding. It is only valid "
+                 "when code type is decode_center_size.")
+        .SetDefault(0)
+        .InEnum({0, 1});
+    AddAttr<std::vector<float>>(
+        "variance",
+        "(vector<float>, default {}) "
+        "variance of prior box with shape [4]. PriorBoxVar and variance can "
+        "not be provided at the same time.")
+        .SetDefault(std::vector<float>{});
     AddOutput("OutputBox",
               "(LoDTensor or Tensor) "
               "When code_type is 'encode_center_size', the output tensor of "
@@ -138,7 +179,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
 and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
 priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
 `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
 encoded/decoded coordinates, width and height.
+
+During Box Decoding, two modes for broadcast are supported. Say target box has
+shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior
+box will broadcast to target box along the assigned axis.
 )DOC");
   }
 };
......
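To see what the decode path computes per box, here is a hedged numpy transcription of the decode_center_size math for a single prior/target pair (the tail of the kernel is elided in this diff, so the max-corner handling below follows the pattern of the min-corner lines shown; variance defaults to 1, matching the no-PriorBoxVar case):

# Hedged numpy sketch of decode_center_size with per-coordinate variance.
import numpy as np

def decode_center_size(prior, target, var=(1., 1., 1., 1.), normalized=True):
    pw = prior[2] - prior[0] + (not normalized)
    ph = prior[3] - prior[1] + (not normalized)
    pcx, pcy = prior[0] + pw / 2, prior[1] + ph / 2
    tw = np.exp(var[2] * target[2]) * pw
    th = np.exp(var[3] * target[3]) * ph
    tcx = var[0] * target[0] * pw + pcx
    tcy = var[1] * target[1] * ph + pcy
    return [tcx - tw / 2, tcy - th / 2,
            tcx + tw / 2 - (not normalized), tcy + th / 2 - (not normalized)]

# Identity deltas decode back to the prior box itself:
print(decode_center_size([10., 10., 50., 30.], [0., 0., 0., 0.]))
# [10.0, 10.0, 50.0, 30.0]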
@@ -9,6 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/box_coder_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -16,11 +19,11 @@ namespace paddle {
 namespace operators {

 template <typename T>
-__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       const bool normalized, T* output) {
+__global__ void EncodeCenterSizeKernel(
+    const T* prior_box_data, const T* prior_box_var_data,
+    const T* target_box_data, const int row, const int col, const int len,
+    const bool normalized, const T prior_box_var_size, const float* variance,
+    const int var_size, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
@@ -30,11 +33,9 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     T prior_box_height = prior_box_data[col_idx * len + 3] -
                          prior_box_data[col_idx * len + 1] +
                          (normalized == false);
-    T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
+    T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width / 2;
+    T prior_box_center_y =
+        prior_box_data[col_idx * len + 1] + prior_box_height / 2;

     T target_box_center_x =
         (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
@@ -55,58 +56,73 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
     output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
     if (prior_box_var_data) {
-      output[idx * len] /= prior_box_var_data[col_idx * len];
-      output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1];
-      output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2];
-      output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3];
+      int prior_var_offset = 0;
+      if (prior_box_var_size == 2) {
+        prior_var_offset = col_idx * len;
+      }
+      output[idx * len] /= prior_box_var_data[prior_var_offset];
+      output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
+      output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
+      output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3];
+    } else if (var_size == 4) {
+      for (int k = 0; k < 4; ++k) {
+        output[idx * len + k] /= static_cast<T>(variance[k]);
+      }
     }
   }
 }

 template <typename T>
-__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       const bool normalized, T* output) {
+__global__ void DecodeCenterSizeKernel(
+    const T* prior_box_data, const T* prior_box_var_data,
+    const T* target_box_data, const int row, const int col, const int len,
+    const bool normalized, const T prior_box_var_size, const float* variance,
+    const int var_size, const int axis, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int prior_box_offset = 0;
   if (idx < row * col) {
     const int col_idx = idx % col;
-    T prior_box_width = prior_box_data[col_idx * len + 2] -
-                        prior_box_data[col_idx * len] + (normalized == false);
-    T prior_box_height = prior_box_data[col_idx * len + 3] -
-                         prior_box_data[col_idx * len + 1] +
-                         (normalized == false);
-    T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
+    const int row_idx = idx / col;
+    prior_box_offset = axis == 0 ? col_idx * len : row_idx * len;
+    T prior_box_width = prior_box_data[prior_box_offset + 2] -
+                        prior_box_data[prior_box_offset] +
+                        (normalized == false);
+    T prior_box_height = prior_box_data[prior_box_offset + 3] -
+                         prior_box_data[prior_box_offset + 1] +
+                         (normalized == false);
+    T prior_box_center_x =
+        prior_box_data[prior_box_offset] + prior_box_width / 2;
+    T prior_box_center_y =
+        prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
     T target_box_width, target_box_height;
     T target_box_center_x, target_box_center_y;
+    T box_var_x = T(1), box_var_y = T(1);
+    T box_var_w = T(1), box_var_h = T(1);
     if (prior_box_var_data) {
-      target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
-                             target_box_data[idx * len + 2]) *
-                         prior_box_width;
-      target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
-                              target_box_data[idx * len + 3]) *
-                          prior_box_height;
-      target_box_center_x = prior_box_var_data[col_idx * len] *
-                                target_box_data[idx * len] * prior_box_width +
-                            prior_box_center_x;
-      target_box_center_y = prior_box_var_data[col_idx * len + 1] *
-                                target_box_data[idx * len + 1] *
-                                prior_box_height +
-                            prior_box_center_y;
-    } else {
-      target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width;
-      target_box_height =
-          exp(target_box_data[idx * len + 3]) * prior_box_height;
-      target_box_center_x =
-          target_box_data[idx * len] * prior_box_width + prior_box_center_x;
-      target_box_center_y = target_box_data[idx * len + 1] * prior_box_height +
-                            prior_box_center_y;
+      int prior_var_offset = 0;
+      if (prior_box_var_size == 2) {
+        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
+      }
+      box_var_x = prior_box_var_data[prior_var_offset];
+      box_var_y = prior_box_var_data[prior_var_offset + 1];
+      box_var_w = prior_box_var_data[prior_var_offset + 2];
+      box_var_h = prior_box_var_data[prior_var_offset + 3];
+    } else if (var_size == 4) {
+      box_var_x = static_cast<T>(variance[0]);
+      box_var_y = static_cast<T>(variance[1]);
+      box_var_w = static_cast<T>(variance[2]);
+      box_var_h = static_cast<T>(variance[3]);
     }
+    target_box_width =
+        exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width;
+    target_box_height =
+        exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height;
+    target_box_center_x =
+        box_var_x * target_box_data[idx * len] * prior_box_width +
+        prior_box_center_x;
+    target_box_center_y =
+        box_var_y * target_box_data[idx * len + 1] * prior_box_height +
+        prior_box_center_y;

     output[idx * len] = target_box_center_x - target_box_width / 2;
     output[idx * len + 1] = target_box_center_y - target_box_height / 2;
@@ -127,36 +143,64 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
     const T* prior_box_data = prior_box->data<T>();
     const T* target_box_data = target_box->data<T>();
     const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    auto prior_box_var_size = 0;
+    if (prior_box_var) {
+      PADDLE_ENFORCE(variance.empty(),
+                     "Input 'PriorBoxVar' and attribute 'variance' should not "
+                     "be used at the same time.");
+      prior_box_var_data = prior_box_var->data<T>();
+      prior_box_var_size = prior_box_var->dims().size();
+    }
+    if (!(variance.empty())) {
+      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
+                     "Size of attribute 'variance' should be 4");
+    }

     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                         "Only support 1 level of LoD.");
     }
+    const int var_size = static_cast<int>(variance.size());
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
+    int axis = context.Attr<int>("axis");
auto row = target_box->dims()[0]; auto row = target_box->dims()[0];
auto col = prior_box->dims()[0]; auto col = prior_box->dims()[0];
if (code_type == BoxCodeType::kDecodeCenterSize) {
col = target_box->dims()[1];
}
auto len = prior_box->dims()[1]; auto len = prior_box->dims()[1];
int block = 512; int block = 512;
int grid = (row * col + block - 1) / block; int grid = (row * col + block - 1) / block;
auto& device_ctx = context.cuda_device_context(); auto& device_ctx = context.cuda_device_context();
auto& allocator =
platform::DeviceTemporaryAllocator::Instance().Get(device_ctx);
int bytes = var_size * sizeof(float);
auto dev_var = allocator.Allocate(bytes);
float* dev_var_data = reinterpret_cast<float*>(dev_var->ptr());
auto cplace = platform::CPUPlace();
const auto gplace = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes,
device_ctx.stream());
output_box->mutable_data<T>({row, col, len}, context.GetPlace()); output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>(); T* output = output_box->data<T>();
auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
bool normalized = context.Attr<bool>("box_normalized");
if (code_type == BoxCodeType::kEncodeCenterSize) { if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>( EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len, prior_box_data, prior_box_var_data, target_box_data, row, col, len,
normalized, output); normalized, prior_box_var_size, dev_var_data, var_size, output);
} else if (code_type == BoxCodeType::kDecodeCenterSize) { } else if (code_type == BoxCodeType::kDecodeCenterSize) {
DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>( DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, row, col, len, prior_box_data, prior_box_var_data, target_box_data, row, col, len,
normalized, output); normalized, prior_box_var_size, dev_var_data, var_size, axis, output);
} }
} }
}; };
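Taken together, the two kernels above implement mutually inverse center-size transforms. With prior box center $(p_x, p_y)$ and size $(p_w, p_h)$, target box $(g_x, g_y, g_w, g_h)$, and per-coordinate variance $(v_x, v_y, v_w, v_h)$ taken either from the PriorBoxVar input or from the new variance attribute (and 1 when neither is given), encoding produces

$$
t_x = \frac{g_x - p_x}{v_x p_w}, \quad
t_y = \frac{g_y - p_y}{v_y p_h}, \quad
t_w = \frac{1}{v_w} \log\frac{g_w}{p_w}, \quad
t_h = \frac{1}{v_h} \log\frac{g_h}{p_h}
$$

and DecodeCenterSizeKernel applies the inverse, e.g. $g_x = v_x t_x p_w + p_x$ and $g_w = p_w \exp(v_w t_w)$.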
......
@@ -11,6 +11,7 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -34,7 +35,8 @@ class BoxCoderKernel : public framework::OpKernel<T> {
   void EncodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, T* output) const {
+                        const bool normalized,
+                        const std::vector<float> variance, T* output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = prior_box->dims()[0];
     int64_t len = prior_box->dims()[1];
@@ -53,10 +55,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         T prior_box_height = prior_box_data[j * len + 3] -
                              prior_box_data[j * len + 1] +
                              (normalized == false);
-        T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
         T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+            prior_box_data[j * len + 1] + prior_box_height / 2;
         T target_box_center_x =
             (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
@@ -78,10 +79,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
         if (prior_box_var) {
-          output[offset] /= prior_box_var_data[j * len];
-          output[offset + 1] /= prior_box_var_data[j * len + 1];
-          output[offset + 2] /= prior_box_var_data[j * len + 2];
-          output[offset + 3] /= prior_box_var_data[j * len + 3];
+          int prior_var_offset = 0;
+          if (prior_box_var->dims().size() == 2) {
+            prior_var_offset = j * len;
+          }
+          output[offset] /= prior_box_var_data[prior_var_offset];
+          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
+          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
+          output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
+        } else if (!(variance.empty())) {
+          for (int k = 0; k < 4; ++k) {
+            output[offset + k] /= static_cast<T>(variance[k]);
+          }
         }
       }
     }
@@ -89,58 +98,71 @@ class BoxCoderKernel : public framework::OpKernel<T> {
   void DecodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, T* output) const {
+                        const bool normalized, const int axis,
+                        const std::vector<float> variance, T* output) const {
     int64_t row = target_box->dims()[0];
-    int64_t col = prior_box->dims()[0];
-    int64_t len = prior_box->dims()[1];
+    int64_t col = target_box->dims()[1];
+    int64_t len = target_box->dims()[2];
     auto* target_box_data = target_box->data<T>();
     auto* prior_box_data = prior_box->data<T>();
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    int prior_box_offset = 0;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        T prior_box_width = prior_box_data[j * len + 2] -
-                            prior_box_data[j * len] + (normalized == false);
-        T prior_box_height = prior_box_data[j * len + 3] -
-                             prior_box_data[j * len + 1] +
-                             (normalized == false);
-        T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-        T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+        if (axis == 0) {
+          prior_box_offset = j * len;
+        } else if (axis == 1) {
+          prior_box_offset = i * len;
+        }
+        T prior_box_width = prior_box_data[prior_box_offset + 2] -
+                            prior_box_data[prior_box_offset] +
+                            (normalized == false);
+        T prior_box_height = prior_box_data[prior_box_offset + 3] -
+                             prior_box_data[prior_box_offset + 1] +
+                             (normalized == false);
+        T prior_box_center_x =
+            prior_box_data[prior_box_offset] + prior_box_width / 2;
+        T prior_box_center_y =
+            prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
         T target_box_center_x = 0, target_box_center_y = 0;
         T target_box_width = 0, target_box_height = 0;
+        T box_var_x = T(1), box_var_y = T(1);
+        T box_var_w = T(1), box_var_h = T(1);
         if (prior_box_var) {
-          target_box_center_x = prior_box_var_data[j * len] *
-                                target_box_data[offset] * prior_box_width +
-                                prior_box_center_x;
-          target_box_center_y = prior_box_var_data[j * len + 1] *
-                                target_box_data[offset + 1] *
-                                prior_box_height +
-                                prior_box_center_y;
-          target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                      target_box_data[offset + 2]) *
-                             prior_box_width;
-          target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                       target_box_data[offset + 3]) *
-                              prior_box_height;
-        } else {
-          target_box_center_x =
-              target_box_data[offset] * prior_box_width + prior_box_center_x;
-          target_box_center_y = target_box_data[offset + 1] * prior_box_height +
-                                prior_box_center_y;
-          target_box_width =
-              std::exp(target_box_data[offset + 2]) * prior_box_width;
-          target_box_height =
-              std::exp(target_box_data[offset + 3]) * prior_box_height;
+          int prior_var_offset = 0;
+          if (prior_box_var->dims().size() == 2) {
+            if (axis == 0)
+              prior_var_offset = j * len;
+            else if (axis == 1)
+              prior_var_offset = i * len;
+          }
+          box_var_x = prior_box_var_data[prior_var_offset];
+          box_var_y = prior_box_var_data[prior_var_offset + 1];
+          box_var_w = prior_box_var_data[prior_var_offset + 2];
+          box_var_h = prior_box_var_data[prior_var_offset + 3];
+        } else if (!(variance.empty())) {
+          box_var_x = static_cast<T>(variance[0]);
+          box_var_y = static_cast<T>(variance[1]);
+          box_var_w = static_cast<T>(variance[2]);
+          box_var_h = static_cast<T>(variance[3]);
         }
+        target_box_center_x =
+            box_var_x * target_box_data[offset] * prior_box_width +
+            prior_box_center_x;
+        target_box_center_y =
+            box_var_y * target_box_data[offset + 1] * prior_box_height +
+            prior_box_center_y;
+        target_box_width =
+            std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
+        target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) *
+                            prior_box_height;
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
@@ -157,26 +179,40 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
+    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
+    const int axis = context.Attr<int>("axis");
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
                         "Only support 1 level of LoD.");
     }
+    if (prior_box_var) {
+      PADDLE_ENFORCE(variance.empty(),
+                     "Input 'PriorBoxVar' and attribute 'variance' should not"
+                     "be used at the same time.");
+    }
+    if (!(variance.empty())) {
+      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
+                     "Size of attribute 'variance' should be 4");
+    }
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
     auto row = target_box->dims()[0];
     auto col = prior_box->dims()[0];
+    if (code_type == BoxCodeType::kDecodeCenterSize) {
+      col = target_box->dims()[1];
+    }
     auto len = prior_box->dims()[1];
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    bool normalized = context.Attr<bool>("box_normalized");
     T* output = output_box->data<T>();
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
-                       output);
+                       variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized,
-                       output);
+      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
+                       variance, output);
     }
   }
 };
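As a cross-check of the transform implemented by EncodeCenterSize and DecodeCenterSize, the following minimal standalone sketch (hypothetical values; it assumes normalized corner-form boxes and the attribute-style 4-element variance) encodes one target box against one prior box and decodes it back:

#include <cmath>
#include <cstdio>

int main() {
  // Prior and ground-truth boxes as [xmin, ymin, xmax, ymax], normalized.
  float p[4] = {0.1f, 0.1f, 0.5f, 0.5f};
  float g[4] = {0.2f, 0.2f, 0.6f, 0.7f};
  float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};  // the 'variance' attribute
  float pw = p[2] - p[0], ph = p[3] - p[1];
  float pcx = p[0] + pw / 2, pcy = p[1] + ph / 2;
  float gw = g[2] - g[0], gh = g[3] - g[1];
  float gcx = g[0] + gw / 2, gcy = g[1] + gh / 2;
  // Encode: center offsets normalized by the prior size, then divided by
  // the variance, as EncodeCenterSize does.
  float t[4] = {(gcx - pcx) / pw / var[0], (gcy - pcy) / ph / var[1],
                std::log(gw / pw) / var[2], std::log(gh / ph) / var[3]};
  // Decode: the exact inverse, multiplying by the variance again.
  float dcx = var[0] * t[0] * pw + pcx, dcy = var[1] * t[1] * ph + pcy;
  float dw = std::exp(var[2] * t[2]) * pw, dh = std::exp(var[3] * t[3]) * ph;
  // Prints the original corners back: 0.200 0.200 0.600 0.700
  std::printf("%.3f %.3f %.3f %.3f\n", dcx - dw / 2, dcy - dh / 2,
              dcx + dw / 2, dcy + dh / 2);
  return 0;
}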
......
@@ -9,7 +9,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/yolov3_loss_op.h"
+#include "paddle/fluid/operators/detection/yolov3_loss_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
@@ -29,23 +29,33 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                    "Input(GTLabel) of Yolov3LossOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Loss"),
                    "Output(Loss) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ObjectnessMask"),
+        "Output(ObjectnessMask) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"),
+                   "Output(GTMatchMask) of Yolov3LossOp should not be null.");
     auto dim_x = ctx->GetInputDim("X");
     auto dim_gtbox = ctx->GetInputDim("GTBox");
     auto dim_gtlabel = ctx->GetInputDim("GTLabel");
     auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    int anchor_num = anchors.size() / 2;
+    auto anchor_mask = ctx->Attrs().Get<std::vector<int>>("anchor_mask");
+    int mask_num = anchor_mask.size();
     auto class_num = ctx->Attrs().Get<int>("class_num");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
     PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
                       "Input(X) dim[3] and dim[4] should be equal.");
-    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
-                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
-                      "+ class_num)).");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], mask_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
     PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
                       "Input(GTBox) should be a 3-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 4");
     PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
-                      "Input(GTBox) should be a 2-D tensor");
+                      "Input(GTLabel) should be a 2-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
                       "Input(GTBox) and Input(GTLabel) dim[0] should be same");
     PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
@@ -54,11 +64,22 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                       "Attr(anchors) length should be greater than 0.");
     PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
                       "Attr(anchors) length should be an even integer.");
+    for (size_t i = 0; i < anchor_mask.size(); i++) {
+      PADDLE_ENFORCE_LT(
+          anchor_mask[i], anchor_num,
+          "Attr(anchor_mask) should not crossover Attr(anchors).");
+    }
     PADDLE_ENFORCE_GT(class_num, 0,
                       "Attr(class_num) should be an integer greater than 0.");
-    std::vector<int64_t> dim_out({1});
+    std::vector<int64_t> dim_out({dim_x[0]});
     ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
+    std::vector<int64_t> dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]});
+    ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask));
+    std::vector<int64_t> dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]});
+    ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask));
   }
 protected:
@@ -73,11 +94,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of YOLO v3 loss operator, "
+             "The input tensor of YOLOv3 loss operator, "
              "This is a 4-D tensor with shape of [N, C, H, W]."
              "H and W should be same, and the second dimension(C) stores"
              "box locations, confidence score and classification one-hot"
-             "key of each anchor box");
+             "keys of each anchor box");
     AddInput("GTBox",
              "The input tensor of ground truth boxes, "
              "This is a 3-D tensor with shape of [N, max_box_num, 5], "
@@ -89,32 +110,39 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("GTLabel",
              "The input tensor of ground truth label, "
              "This is a 2-D tensor with shape of [N, max_box_num], "
-             "and each element shoudl be an integer to indicate the "
+             "and each element should be an integer to indicate the "
              "box class id.");
     AddOutput("Loss",
               "The output yolov3 loss tensor, "
-              "This is a 1-D tensor with shape of [1]");
+              "This is a 1-D tensor with shape of [N]");
+    AddOutput("ObjectnessMask",
+              "This is an intermediate tensor with shape of [N, M, H, W], "
+              "M is the number of anchor masks. This parameter caches the "
+              "mask for calculating objectness loss in the gradient kernel.")
+        .AsIntermediate();
+    AddOutput("GTMatchMask",
+              "This is an intermediate tensor with shape of [N, B], "
+              "B is the max box number of GT boxes. This parameter caches "
+              "the matched mask index of each GT box for gradient calculation.")
+        .AsIntermediate();
     AddAttr<int>("class_num", "The number of classes to predict.");
     AddAttr<std::vector<int>>("anchors",
                               "The anchor width and height, "
-                              "it will be parsed pair by pair.");
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<std::vector<int>>("anchor_mask",
+                              "The mask index of anchors used in "
+                              "current YOLOv3 loss calculation.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YOLOv3 loss "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and third YOLOv3 loss operators.")
+        .SetDefault(32);
     AddAttr<float>("ignore_thresh",
-                   "The ignore threshold to ignore confidence loss.");
-    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>(
-        "loss_weight_conf_target",
-        "The weight of confidence score loss in locations with target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_conf_notarget",
-                   "The weight of confidence score loss in locations without "
-                   "target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
-        .SetDefault(1.0);
+                   "The ignore threshold to ignore confidence loss.")
+        .SetDefault(0.7);
     AddComment(R"DOC(
 This operator generates the yolov3 loss from given predictions and ground
 truth boxes.
@@ -147,17 +175,28 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
 thresh, the confidence score loss of this anchor box will be ignored.
 Therefore, the yolov3 loss consists of three major parts: box location loss,
-confidence score loss, and classification loss. The MSE loss is used for
-box location, and binary cross entropy loss is used for confidence score
-loss and classification loss.
+confidence score loss, and classification loss. The L2 loss is used for
+box coordinates (w, h), and sigmoid cross entropy loss is used for box
+coordinates (x, y), confidence score loss and classification loss.
+
+Each ground truth box finds the best matching anchor box among all anchors;
+the prediction at this anchor box incurs all three parts of the loss, while
+predictions at anchor boxes with no matched GT box only incur the
+objectness loss.
+
+In order to trade off box coordinate losses between big and small boxes,
+box coordinate losses are multiplied by a scale weight, calculated as
+follows:
+
+$$
+weight_{box} = 2.0 - t_w * t_h
+$$
+
 The final loss is represented as follows:
 $$
-loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
-     + \loss_weight_{conf_target} * loss_{conf_target}
-     + \loss_weight_{conf_notarget} * loss_{conf_notarget}
-     + \loss_weight_{class} * loss_{class}
+loss = (loss_{xy} + loss_{wh}) * weight_{box}
+     + loss_{conf} + loss_{class}
 $$
 )DOC");
   }
@@ -196,6 +235,8 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("GTBox", Input("GTBox"));
     op->SetInput("GTLabel", Input("GTLabel"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
+    op->SetInput("GTMatchMask", Output("GTMatchMask"));
     op->SetAttrMap(Attrs());
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T>
static inline bool LessEqualZero(T x) {
return x < 1e-6;
}
template <typename T>
static T SigmoidCrossEntropy(T x, T label) {
return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x)));
}
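// This is the numerically stable form of binary cross entropy with logits,
// -label * log(sigmoid(x)) - (1 - label) * log(1 - sigmoid(x)).
// Since log(sigmoid(x)) = -log(1 + exp(-x)) and
// log(1 - sigmoid(x)) = -x - log(1 + exp(-x)), the sum collapses to
// max(x, 0) - x * label + log(1 + exp(-|x|)), which never overflows exp().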
template <typename T>
static T L2Loss(T x, T y) {
return 0.5 * (y - x) * (y - x);
}
template <typename T>
static T SigmoidCrossEntropyGrad(T x, T label) {
return 1.0 / (1.0 + std::exp(-x)) - label;
}
template <typename T>
static T L2LossGrad(T x, T y) {
return x - y;
}
static int GetMaskIndex(std::vector<int> mask, int val) {
for (size_t i = 0; i < mask.size(); i++) {
if (mask[i] == val) {
return i;
}
}
return -1;
}
template <typename T>
struct Box {
T x, y, w, h;
};
template <typename T>
static inline T sigmoid(T x) {
return 1.0 / (1.0 + std::exp(-x));
}
template <typename T>
static inline Box<T> GetYoloBox(const T* x, std::vector<int> anchors, int i,
int j, int an_idx, int grid_size,
int input_size, int index, int stride) {
Box<T> b;
b.x = (i + sigmoid<T>(x[index])) / grid_size;
b.y = (j + sigmoid<T>(x[index + stride])) / grid_size;
b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size;
b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size;
return b;
}
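// GetYoloBox implements the standard YOLOv3 decoding: with cell offset
// (i, j), grid size S, anchor (p_w, p_h) in pixels, and network input size
// input_size, the normalized box is
//   b_x = (i + sigmoid(t_x)) / S,   b_y = (j + sigmoid(t_y)) / S,
//   b_w = p_w * exp(t_w) / input_size,   b_h = p_h * exp(t_h) / input_size.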
template <typename T>
static inline Box<T> GetGtBox(const T* gt, int batch, int max_boxes, int idx) {
Box<T> b;
b.x = gt[(batch * max_boxes + idx) * 4];
b.y = gt[(batch * max_boxes + idx) * 4 + 1];
b.w = gt[(batch * max_boxes + idx) * 4 + 2];
b.h = gt[(batch * max_boxes + idx) * 4 + 3];
return b;
}
template <typename T>
static inline T BoxOverlap(T c1, T w1, T c2, T w2) {
T l1 = c1 - w1 / 2.0;
T l2 = c2 - w2 / 2.0;
T left = l1 > l2 ? l1 : l2;
T r1 = c1 + w1 / 2.0;
T r2 = c2 + w2 / 2.0;
T right = r1 < r2 ? r1 : r2;
return right - left;
}
template <typename T>
static inline T CalcBoxIoU(Box<T> b1, Box<T> b2) {
T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w);
T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h);
T inter_area = (w < 0 || h < 0) ? 0.0 : w * h;
T union_area = b1.w * b1.h + b2.w * b2.h - inter_area;
return inter_area / union_area;
}
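// Worked example: two axis-aligned unit squares whose centers are 0.5 apart
// in x overlap over width 0.5 and height 1, so inter = 0.5,
// union = 1 + 1 - 0.5 = 1.5, and CalcBoxIoU returns 1/3.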
static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num,
int an_stride, int stride, int entry) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
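// The input X is laid out as [N, an_num * (5 + class_num), H, W] with
// stride = H * W and an_stride = (5 + class_num) * stride; entries 0..3 are
// the box offsets, entry 4 is objectness, and entries 5.. are class logits.
// For example, with class_num = 80 and H = W = 13, the objectness logit of
// anchor 2 at cell (row 6, col 7) of batch 0 is at
// GetEntryIndex(0, 2, 6 * 13 + 7, an_num, 85 * 169, 169, 4).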
template <typename T>
static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
std::vector<int> anchors, int an_idx,
int box_idx, int gi, int gj, int grid_size,
int input_size, int stride) {
T tx = gt.x * grid_size - gi;
T ty = gt.y * grid_size - gj;
T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
T scale = (2.0 - gt.w * gt.h);
loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
}
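// The targets above invert GetYoloBox: tx and ty are the sigmoid-space
// offsets of the GT center inside cell (gi, gj), while tw and th are log
// ratios of the GT size to the matched anchor. The (2.0 - gt.w * gt.h)
// factor up-weights coordinate losses for small boxes, since gt.w and gt.h
// are normalized to the [0, 1] image size.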
template <typename T>
static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
Box<T> gt, std::vector<int> anchors,
int an_idx, int box_idx, int gi, int gj,
int grid_size, int input_size, int stride) {
T tx = gt.x * grid_size - gi;
T ty = gt.y * grid_size - gj;
T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
T scale = (2.0 - gt.w * gt.h);
input_grad[box_idx] =
SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
input_grad[box_idx + stride] =
SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
input_grad[box_idx + 2 * stride] =
L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
input_grad[box_idx + 3 * stride] =
L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
}
template <typename T>
static inline void CalcLabelLoss(T* loss, const T* input, const int index,
const int label, const int class_num,
const int stride) {
for (int i = 0; i < class_num; i++) {
T pred = input[index + i * stride];
loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
}
}
template <typename T>
static inline void CalcLabelLossGrad(T* input_grad, const T loss,
const T* input, const int index,
const int label, const int class_num,
const int stride) {
for (int i = 0; i < class_num; i++) {
T pred = input[index + i * stride];
input_grad[index + i * stride] =
SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
}
}
template <typename T>
static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
const int n, const int an_num, const int h,
const int w, const int stride,
const int an_stride) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
T obj = objness[k * w + l];
if (obj > 1e-5) {
// positive sample: obj = 1
loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
} else if (obj > -0.5) {
// negative sample: obj = 0
loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
}
}
}
objness += stride;
input += an_stride;
}
}
}
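// objness holds one of three values per anchor cell: 1 for positive samples
// (a matched GT box), 0 for negatives, and -1 for predictions whose best IoU
// with any GT exceeds ignore_thresh; the obj > -0.5 test above skips only
// the ignored ones.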
template <typename T>
static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
const T* input, const T* objness,
const int n, const int an_num,
const int h, const int w,
const int stride, const int an_stride) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
T obj = objness[k * w + l];
if (obj > 1e-5) {
input_grad[k * w + l] =
SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
} else if (obj > -0.5) {
input_grad[k * w + l] =
SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
}
}
}
objness += stride;
input += an_stride;
input_grad += an_stride;
}
}
}
template <typename T>
static void inline GtValid(bool* valid, const T* gtbox, const int n,
const int b) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < b; j++) {
if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) {
valid[j] = false;
} else {
valid[j] = true;
}
}
valid += b;
gtbox += b * 4;
}
}
template <typename T>
class Yolov3LossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* loss = ctx.Output<Tensor>("Loss");
auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio");
const int n = input->dims()[0];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int an_num = anchors.size() / 2;
const int mask_num = anchor_mask.size();
const int b = gt_box->dims()[1];
int input_size = downsample_ratio * h;
const int stride = h * w;
const int an_stride = (class_num + 5) * stride;
const T* input_data = input->data<T>();
const T* gt_box_data = gt_box->data<T>();
const int* gt_label_data = gt_label->data<int>();
T* loss_data = loss->mutable_data<T>({n}, ctx.GetPlace());
memset(loss_data, 0, loss->numel() * sizeof(T));
T* obj_mask_data =
objness_mask->mutable_data<T>({n, mask_num, h, w}, ctx.GetPlace());
memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T));
int* gt_match_mask_data =
gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
// compute the valid GT box mask once to avoid duplicated checks below
Tensor gt_valid_mask;
bool* gt_valid_mask_data =
gt_valid_mask.mutable_data<bool>({n, b}, ctx.GetPlace());
GtValid<T>(gt_valid_mask_data, gt_box_data, n, b);
for (int i = 0; i < n; i++) {
for (int j = 0; j < mask_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
// each predicted box finds its best-matching gt box; if the overlap is
// bigger than ignore_thresh, the objectness loss is ignored.
int box_idx =
GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0);
Box<T> pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j],
h, input_size, box_idx, stride);
T best_iou = 0;
for (int t = 0; t < b; t++) {
if (!gt_valid_mask_data[i * b + t]) {
continue;
}
Box<T> gt = GetGtBox(gt_box_data, i, b, t);
T iou = CalcBoxIoU(pred, gt);
if (iou > best_iou) {
best_iou = iou;
}
}
// If the best IoU is bigger than ignore_thresh,
// ignore the objectness loss.
if (best_iou > ignore_thresh) {
int obj_idx = (i * mask_num + j) * stride + k * w + l;
obj_mask_data[obj_idx] = static_cast<T>(-1);
}
// all losses should be calculated if the best IoU
// is bigger than truth_thresh, but currently
// truth_thresh is fixed at the unreachable value 1.0.
}
}
}
for (int t = 0; t < b; t++) {
if (!gt_valid_mask_data[i * b + t]) {
gt_match_mask_data[i * b + t] = -1;
continue;
}
Box<T> gt = GetGtBox(gt_box_data, i, b, t);
int gi = static_cast<int>(gt.x * w);
int gj = static_cast<int>(gt.y * h);
Box<T> gt_shift = gt;
gt_shift.x = 0.0;
gt_shift.y = 0.0;
T best_iou = 0.0;
int best_n = 0;
// each gt box finds its best-matching anchor box as the positive sample;
// for the positive sample, all losses should be calculated, and for
// other samples, only the objectness loss is required.
for (int an_idx = 0; an_idx < an_num; an_idx++) {
Box<T> an_box;
an_box.x = 0.0;
an_box.y = 0.0;
an_box.w = anchors[2 * an_idx] / static_cast<T>(input_size);
an_box.h = anchors[2 * an_idx + 1] / static_cast<T>(input_size);
float iou = CalcBoxIoU<T>(an_box, gt_shift);
if (iou > best_iou) {
best_iou = iou;
best_n = an_idx;
}
}
int mask_idx = GetMaskIndex(anchor_mask, best_n);
gt_match_mask_data[i * b + t] = mask_idx;
if (mask_idx >= 0) {
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0);
CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
box_idx, gi, gj, h, input_size, stride);
int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
obj_mask_data[obj_idx] = 1.0;
int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5);
CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
class_num, stride);
}
}
}
CalcObjnessLoss<T>(loss_data, input_data + 4 * stride, obj_mask_data, n,
mask_num, h, w, stride, an_stride);
}
};
template <typename T>
class Yolov3LossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
auto* gt_match_mask = ctx.Input<Tensor>("GTMatchMask");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
int class_num = ctx.Attr<int>("class_num");
int downsample_ratio = ctx.Attr<int>("downsample_ratio");
const int n = input_grad->dims()[0];
const int c = input_grad->dims()[1];
const int h = input_grad->dims()[2];
const int w = input_grad->dims()[3];
const int mask_num = anchor_mask.size();
const int b = gt_match_mask->dims()[1];
int input_size = downsample_ratio * h;
const int stride = h * w;
const int an_stride = (class_num + 5) * stride;
const T* input_data = input->data<T>();
const T* gt_box_data = gt_box->data<T>();
const int* gt_label_data = gt_label->data<int>();
const T* loss_grad_data = loss_grad->data<T>();
const T* obj_mask_data = objness_mask->data<T>();
const int* gt_match_mask_data = gt_match_mask->data<int>();
T* input_grad_data =
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
for (int i = 0; i < n; i++) {
for (int t = 0; t < b; t++) {
int mask_idx = gt_match_mask_data[i * b + t];
if (mask_idx >= 0) {
Box<T> gt = GetGtBox(gt_box_data, i, b, t);
int gi = static_cast<int>(gt.x * w);
int gj = static_cast<int>(gt.y * h);
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0);
CalcBoxLocationLossGrad<T>(
input_grad_data, loss_grad_data[i], input_data, gt, anchors,
anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride);
int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5);
CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
label_idx, label, class_num, stride);
}
}
}
CalcObjnessLossGrad<T>(input_grad_data + 4 * stride, loss_grad_data,
input_data + 4 * stride, obj_mask_data, n, mask_num,
h, w, stride, an_stride);
}
};
} // namespace operators
} // namespace paddle
@@ -114,4 +114,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
+    ops::DropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
     ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() {
 }
 template <typename T>
-static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n,
-                    int k) {
+static void fc_relu(const T* x, const T* w, const T* b, T* y,
+                    const jit::matmul_attr_t& attr) {
   auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
   auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n);
-  matmul(x, w, y, m, n, k);
+      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+  matmul(x, w, y, &attr);
   T* dst = y;
-  for (int i = 0; i < m; ++i) {
-    addbias_relu(b, dst, dst, n);
-    dst += n;
+  for (int i = 0; i < attr.m; ++i) {
+    addbias_relu(b, dst, dst, attr.n);
+    dst += attr.n;
   }
 }
@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
     auto i_dims = in->dims();
     auto w_dims = weights[0]->dims();
-    int m = i_dims[0];
-    int n = w_dims[1];
-    int k = w_dims[0];
-    relus[0]->Resize({m, n});
+    jit::matmul_attr_t attr;
+    attr.m = i_dims[0];
+    attr.n = w_dims[1];
+    attr.k = w_dims[0];
+    relus[0]->Resize({attr.m, attr.n});
     fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
-            relus[0]->mutable_data<T>(place), m, n, k);
+            relus[0]->mutable_data<T>(place), attr);
     for (int i = 1; i < weight_sz - 1; ++i) {
       auto i_dims = relus[i - 1]->dims();
       auto w_dims = weights[i]->dims();
-      int m = i_dims[0];
-      int n = w_dims[1];
-      int k = w_dims[0];
-      relus[i]->Resize({m, n});
+      attr.m = i_dims[0];
+      attr.n = w_dims[1];
+      attr.k = w_dims[0];
+      relus[i]->Resize({attr.m, attr.n});
       fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
-              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k);
+              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
     }
     auto i_dims_last = relus[weight_sz - 2]->dims();
     auto w_dims_last = weights[weight_sz - 1]->dims();
-    m = i_dims_last[0];
-    n = w_dims_last[1];
-    k = w_dims_last[0];
+    attr.m = i_dims_last[0];
+    attr.n = w_dims_last[1];
+    attr.k = w_dims_last[0];
     fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
-            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n,
-            k);
+            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
+            attr);
   }
 };
......
@@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     auto x_dims = x->dims();
     auto y_dims = y->dims();
-    int m = x_dims[0];
-    int k = x_dims[1];
-    int n = y_dims[1];
-    int o_numel = m * n;
+    jit::matmul_attr_t attr;
+    attr.m = x_dims[0];
+    attr.k = x_dims[1];
+    attr.n = y_dims[1];
+    int o_numel = attr.m * attr.n;
     auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
+                                                                       attr.k);
     auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
+                                                                       attr.n);
     auto vsquare_xy =
         jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
     auto vsub =
@@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     auto vscal =
         jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
     auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
     const T* x_data = x->data<T>();
     const T* y_data = y->data<T>();
@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     T* squared_xy_data = squared_xy->mutable_data<T>(place);
     T* o_data = out->mutable_data<T>(place);
-    matmul(x_data, y_data, squared_xy_data, m, n, k);
+    matmul(x_data, y_data, squared_xy_data, &attr);
     vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
-    vsquare_x(x_data, squared_x_data, m * k);
-    vsquare_y(y_data, squared_y_data, k * n);
-    matmul(squared_x_data, squared_y_data, o_data, m, n, k);
+    vsquare_x(x_data, squared_x_data, attr.m * attr.k);
+    vsquare_y(y_data, squared_y_data, attr.k * attr.n);
+    matmul(squared_x_data, squared_y_data, o_data, &attr);
     vsub(squared_xy_data, o_data, o_data, o_numel);
     vscal(&scalar, o_data, o_data, o_numel);
......
@@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
     output->mutable_data<T>(ctx.GetPlace());
+    if (x->numel() == 0) return;
     GPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *x = ctx.Input<Tensor>("X");
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
+    if (dO->numel() == 0) return;
     GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
@@ -61,11 +60,14 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
                         ops::GatherOpCUDAKernel<double>,
                         ops::GatherOpCUDAKernel<int64_t>,
-                        ops::GatherOpCUDAKernel<int>);
+                        ops::GatherOpCUDAKernel<int>,
+                        ops::GatherOpCUDAKernel<plat::float16>);
 REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
                         ops::GatherGradOpCUDAKernel<double>,
                         ops::GatherGradOpCUDAKernel<int64_t>,
-                        ops::GatherGradOpCUDAKernel<int>);
+                        ops::GatherGradOpCUDAKernel<int>,
+                        ops::GatherGradOpCUDAKernel<plat::float16>);
@@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
     output->mutable_data<T>(ctx.GetPlace());
+    if (x->numel() == 0) return;
     CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
+    if (dO->numel() == 0) return;
     ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
......
@@ -82,6 +82,18 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
                   "bilinear interpolation and \"nearest\" for nearest "
                   "neighbor interpolation.")
         .SetDefault("bilinear");
+    AddAttr<bool>(
+        "align_corners",
+        "an optional bool. Defaults to True. "
+        "If True, the centers of the 4 corner pixels of the input and output "
+        "tensors are aligned, preserving the values at the corner pixels; "
+        "if False, they are not aligned.")
+        .SetDefault(true);
+    AddAttr<int>("align_mode",
+                 "(int, default \'1\'), optional for bilinear interpolation, "
+                 "can be \'0\' for src_idx = scale*(dst_idx+0.5)-0.5 , "
+                 "can be \'1\' for src_idx = scale*dst_idx .")
+        .SetDefault(1);
     AddComment(R"DOC(
 This operator samples input X to given output shape by using specified
 interpolation method, the interpolation methods can be \"nearest\"
@@ -98,6 +110,64 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
 to perform linear interpolation first in one direction, and then
 again in the other direction.
+
+Align_corners and align_mode are optional parameters; the calculation
+method of interpolation can be selected by them.
+
+Example:
+
+  For scale:
+
+    if align_corners = True and out_{size} > 1 :
+      scale_{factor} = (in_{size} - 1.0) / (out_{size} - 1.0)
+    else:
+      scale_{factor} = float(in_{size} / out_{size})
+
+  Nearest neighbor interpolation:
+
+    if align_corners = False :
+      input : (N, C, H_in, W_in)
+      output: (N, C, H_out, W_out) where:
+      H_out = \left \lfloor {H_{in} * scale_{factor}} \right \rfloor
+      W_out = \left \lfloor {W_{in} * scale_{factor}} \right \rfloor
+    else (align_corners = True) :
+      input : (N, C, H_in, W_in)
+      output: (N, C, H_out, W_out) where:
+      H_out = round(H_{in} * scale_{factor})
+      W_out = round(W_{in} * scale_{factor})
+
+  Bilinear interpolation:
+
+    if align_corners = False , align_mode = 0 :
+      input : (N, C, H_in, W_in)
+      output: (N, C, H_out, W_out) where:
+      H_out = (H_{in} + 0.5) * scale_{factor} - 0.5
+      W_out = (W_{in} + 0.5) * scale_{factor} - 0.5
+    else:
+      input : (N, C, H_in, W_in)
+      output: (N, C, H_out, W_out) where:
+      H_out = H_{in} * scale_{factor}
+      W_out = W_{in} * scale_{factor}
+
 For details of nearest neighbor interpolation, please refer to Wikipedia:
 https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
......
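The align_corners and align_mode combinations documented above differ only in how an output index is mapped back to a source index. A minimal standalone sketch (hypothetical sizes, mirroring the index rules of the CUDA kernels that follow) prints the mapping for a 4-to-8 upsample:

#include <algorithm>
#include <cstdio>

int main() {
  const int in_h = 4, out_h = 8;
  // align_corners = true uses (in - 1) / (out - 1); otherwise in / out.
  const float ratio_corners = (in_h - 1.f) / (out_h - 1.f);
  const float ratio = static_cast<float>(in_h) / out_h;
  for (int dst = 0; dst < out_h; ++dst) {
    // Nearest neighbor, align_corners = true: round to the closest pixel.
    int nn = static_cast<int>(ratio_corners * dst + 0.5f);
    // Nearest neighbor, align_corners = false: truncate.
    int nn_nc = static_cast<int>(ratio * dst);
    // Bilinear, align_corners = false, align_mode = 0: half-pixel centers,
    // clamped at the low edge like KeBilinearInterpFw.
    int lo = std::max(static_cast<int>(ratio * (dst + 0.5f) - 0.5f), 0);
    std::printf("dst=%d nearest=%d nearest_nc=%d bilinear_lo=%d\n", dst, nn,
                nn_nc, lo);
  }
  return 0;
}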
@@ -23,7 +23,8 @@ __global__ void KeNearestNeighborInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
     const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
@@ -35,10 +36,14 @@ __global__ void KeNearestNeighborInterpFw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);

     out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
                   in_img_idy * in_img_w + in_img_idx];
@@ -50,7 +55,8 @@ __global__ void KeNearestNeighborInterpBw(
     T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
     const size_t input_w, const T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
@@ -62,10 +68,14 @@ __global__ void KeNearestNeighborInterpBw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);

     T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
                     in_img_idy * in_img_w + in_img_idx];
@@ -79,10 +89,12 @@ __global__ void KeBilinearInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
     const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (; tid < nthreads; tid += stride) {
     int out_id_h = tid / output_w;
     int out_id_w = tid % output_w;
@@ -91,15 +103,23 @@ __global__ void KeBilinearInterpFw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
     int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy
+                            : ratio_h * out_img_idy - in_img_idy;
     T h2lambda = 1.f - h1lambda;

     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratio_w * out_img_idx;
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
     int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx
+                            : ratio_w * out_img_idx - in_img_idx;
     T w2lambda = 1.f - w1lambda;

     const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
@@ -118,10 +138,12 @@ __global__ void KeBilinearInterpBw(
     T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
     const size_t input_w, const T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratio_w) {
+    const size_t num_channels, const T ratio_h, const T ratio_w,
+    const bool align_corners, const int align_mode) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (; tid < nthreads; tid += stride) {
     int out_id_h = tid / output_w;
     int out_id_w = tid % output_w;
@@ -130,15 +152,22 @@ __global__ void KeBilinearInterpBw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
+    int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5
+                                : ratio_h * out_img_idy;
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
     int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy
+                            : ratio_h * out_img_idy - in_img_idy;
     T h2lambda = 1.f - h1lambda;

     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratio_w * out_img_idx;
+    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
+                                : ratio_w * out_img_idx;
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
     int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx
+                            : ratio_w * out_img_idx - in_img_idx;
     T w2lambda = 1.f - w1lambda;

     T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
@@ -175,6 +204,9 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
       out_w = size_data[1];
     }

+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
+
     int n = input->dims()[0];
     int c = input->dims()[1];
     int in_h = input->dims()[2];
@@ -188,10 +220,16 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
     int in_chw = c * in_hw;
     int out_chw = c * out_hw;

-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }

     if (in_h == out_h && in_w == out_w) {
       framework::TensorCopy(*input, ctx.GetPlace(), output);
@@ -206,12 +244,12 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
       KeNearestNeighborInterpFw<
          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-          out_chw, c, ratio_h, ratio_w);
+          out_chw, c, ratio_h, ratio_w, align_corners);
     } else if ("bilinear" == interp_method) {
       KeBilinearInterpFw<
          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-          out_chw, c, ratio_h, ratio_w);
+          out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
@@ -234,6 +272,10 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
     auto out_size = ctx.Input<Tensor>("OutSize");
+
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
+
     if (out_size != nullptr) {
       Tensor sizes;
       framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
@@ -252,10 +294,16 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
     int in_chw = c * in_hw;
     int out_chw = c * out_hw;

-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }

     if (in_h == out_h && in_w == out_w) {
       framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
@@ -270,12 +318,12 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
       KeNearestNeighborInterpBw<
          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
-          out_w, n, out_chw, c, ratio_h, ratio_w);
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners);
     } else if ("bilinear" == interp_method) {
       KeBilinearInterpBw<
          T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
-          out_w, n, out_chw, c, ratio_h, ratio_w);
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
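Both kernels above now derive the scale ratio from align_corners instead of always using the corner-aligned formula. A sketch of the rule with one worked value (the helper name is illustrative, not part of the patch):

    float ScaleRatio(int in, int out, bool align_corners) {
      if (out <= 1) return 0.f;  // degenerate axis, as in the kernels above
      return align_corners ? static_cast<float>(in - 1) / (out - 1)
                           : static_cast<float>(in) / out;
    }
    // in_h = 4, out_h = 8: align_corners -> 3.f / 7 ~= 0.43, else 4.f / 8 = 0.5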
...
@@ -26,14 +26,17 @@ template <typename T>
 static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
                                        const float ratio_h, const float ratio_w,
                                        const int n, const int c,
-                                       const int out_h, const int out_w) {
+                                       const int out_h, const int out_w,
+                                       const bool align_corners) {
   auto input_t = EigenTensor<T, 4>::From(input);
   auto output_t = EigenTensor<T, 4>::From(*output);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
     for (int l = 0; l < out_w; l++) {
-      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
@@ -48,20 +51,29 @@ template <typename T>
 static void BilinearInterpolation(const Tensor& input, Tensor* output,
                                   const float ratio_h, const float ratio_w,
                                   const int in_h, const int in_w, const int n,
-                                  const int c, const int out_h,
-                                  const int out_w) {
+                                  const int c, const int out_h, const int out_w,
+                                  const bool align_corners,
+                                  const bool align_mode) {
   auto input_t = EigenTensor<T, 4>::From(input);
   auto output_t = EigenTensor<T, 4>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = static_cast<int>(ratio_h * k);
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float d_n = ratio_h * k - y_n;
+    float d_n =
+        align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
     float d_s = 1.f - d_n;

     for (int l = 0; l < out_w; l++) {
-      int x_w = static_cast<int>(ratio_w * l);
+      int x_w = (align_mode == 0 && !align_corners)
+                    ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                    : static_cast<int>(ratio_w * l);
+      x_w = (x_w > 0) ? x_w : 0;
       int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float d_w = ratio_w * l - x_w;
+      float d_w =
+          align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
       float d_e = 1.f - d_w;

       for (int i = 0; i < n; i++) {    // loop for batches
@@ -78,19 +90,20 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
     }
   }
 }

 template <typename T>
-static void NearestNeighborInterpolateGrad(const Tensor& output_grad,
-                                           Tensor* input_grad,
-                                           const float ratio_h,
-                                           const float ratio_w, const int n,
-                                           const int c, const int out_h,
-                                           const int out_w) {
+static void NearestNeighborInterpolateGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int n, const int c, const int out_h,
+    const int out_w, const bool align_corners) {
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);

   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
     for (int l = 0; l < out_w; l++) {
-      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
@@ -106,19 +119,28 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
                                       Tensor* input_grad, const float ratio_h,
                                       const float ratio_w, const int in_h,
                                       const int in_w, const int n, const int c,
-                                      const int out_h, const int out_w) {
+                                      const int out_h, const int out_w,
+                                      const bool align_corners,
+                                      const int align_mode) {
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = static_cast<int>(ratio_h * k);
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float d_n = ratio_h * k - y_n;
+    float d_n =
+        align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
     float d_s = 1.f - d_n;

     for (int l = 0; l < out_w; l++) {
-      int x_w = static_cast<int>(ratio_w * l);
+      int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                           : static_cast<int>(ratio_w * l);
+      x_w = (x_w > 0) ? x_w : 0;
       int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float d_w = ratio_w * l - x_w;
+      float d_w =
+          align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
       float d_e = 1.f - d_w;

       for (int i = 0; i < n; i++) {    // loop for batches
@@ -134,7 +156,6 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
     }
   }
 }
-
 template <typename T>
 class InterpolateKernel : public framework::OpKernel<T> {
  public:
@@ -151,6 +172,8 @@ class InterpolateKernel : public framework::OpKernel<T> {
       out_h = out_size_data[0];
       out_w = out_size_data[1];
     }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");

     const int n = input->dims()[0];
     const int c = input->dims()[1];
@@ -168,17 +191,24 @@ class InterpolateKernel : public framework::OpKernel<T> {
       return;
     }

-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }

     if ("bilinear" == interp_method) {
       BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
-                               c, out_h, out_w);
+                               c, out_h, out_w, align_corners, align_mode);
     } else if ("nearest" == interp_method) {
       NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
-                                    out_h, out_w);
+                                    out_h, out_w, align_corners);
     }
   }
 };
@@ -200,6 +230,8 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
       out_h = out_size_data[0];
       out_w = out_size_data[1];
     }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");

     const int n = input->dims()[0];
     const int c = input->dims()[1];
@@ -217,17 +249,26 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
       return;
     }

-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }

     if ("bilinear" == interp_method) {
       BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
-                                   in_h, in_w, n, c, out_h, out_w);
+                                   in_h, in_w, n, c, out_h, out_w,
+                                   align_corners, align_mode);
     } else if ("nearest" == interp_method) {
       NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
-                                        ratio_w, n, c, out_h, out_w);
+                                        ratio_w, n, c, out_h, out_w,
+                                        align_corners);
     }
   }
 };
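When align_mode == 0 and align_corners is false, the bilinear paths above switch to the half-pixel mapping src = ratio * (dst + 0.5) - 0.5, with the lower source index clamped at zero. A compact C++ sketch of that coordinate rule (illustrative helper, not part of the patch):

    #include <algorithm>

    void BilinearSrcCoord(int dst, float ratio, bool align_flag, int in_size,
                          int* lo, int* hi, float* lambda_hi) {
      float src = align_flag ? ratio * (dst + 0.5f) - 0.5f : ratio * dst;
      *lo = std::max(static_cast<int>(src), 0);  // clamped lower index
      *hi = std::min(*lo + 1, in_size - 1);      // neighboring upper index
      *lambda_hi = src - *lo;                    // interpolation weight of *hi
    }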
...
@@ -93,6 +93,7 @@ std::vector<int> TestSizes() {
 template <typename KernelTuples, typename... Args>
 struct BenchFunc {
   // return this function avg time
+  // TODO(TJ): clear cache every time
   double operator()(const typename KernelTuples::func_type tgt, Args... args) {
     for (int i = 0; i < FLAGS_burning; ++i) {
       tgt(args...);
@@ -172,6 +173,9 @@ void BenchXYZNKernel() {
     RandomVec<T>(d, y_data);
     BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
                                                      y.data<T>(), z_data, d);
+    // test inplace
+    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
+                                                     z_data, d);
   }
 }
@@ -311,8 +315,9 @@ void BenchMatMulKernel() {
       const T* a_data = a.data<T>();
       const T* b_data = b.data<T>();
       T* c_data = c.mutable_data<T>(PlaceType());
-      BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
-                                                         c_data, m, n, k);
+      const jit::matmul_attr_t attr{m, n, k};
+      BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
+                                                         c_data, &attr);
     }
   }
 }
...
@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET)
 endfunction()

 # use gen jitcode kernel by name
+USE_JITKERNEL_GEN(kMatMul)
 USE_JITKERNEL_GEN(kVMul)
 USE_JITKERNEL_GEN(kVAdd)
 USE_JITKERNEL_GEN(kVSub)
...
@@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
   class name##Creator : public JitCodeCreator<int> {                   \
    public:                                                             \
     bool UseMe(const int& attr) const override {                       \
-      return platform::MayIUse(platform::avx);                         \
+      return platform::MayIUse(platform::avx) && attr <= 1024;         \
     }                                                                  \
     size_t CodeSize(const int& d) const override {                     \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                         \
...
@@ -61,6 +61,7 @@ class VXXJitCode : public JitCode {
       base += "_Vec";
     }
     base += (with_relu_ ? "_Relu" : "");
+    base += "_D" + std::to_string(num_);
     return base.c_str();
   }
   void genCode() override;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/matmul.h"
#include <stddef.h> // offsetof
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void MatMulJitCode::genCode() {
preCode();
int block, rest;
const auto groups = packed_groups(n_, k_, &block, &rest);
PADDLE_ENFORCE_GT(groups.front(), 0);
const int block_len = sizeof(float) * block;
const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
const int w_reg_idx = x_reg_idx - 1;
// from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t,
// packed_weight)]);
mov(reg_ptr_wgt, param_y);
size_t z_offset = 0;
size_t wgt_offset = 0;
for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0;
for (int k = 0; k < k_; ++k) {
vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
// clean
if (k == 0) {
for (int i = 0; i < groups[g]; ++i) {
vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
}
}
for (int i = 0; i < groups[g]; ++i) {
vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
wgt_offset += block_len;
}
// last one, save
if (k == k_ - 1) {
for (int i = 0; i < groups[g]; ++i) {
// only rest save should be careful
if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
break;
}
vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
}
}
x_offset += sizeof(float);
}
z_offset += block_len * groups[g];
}
if (rest != 0) {
// below should refine with mask
int reg_idx = groups.back() - 1;
z_offset = (n_ - rest) * sizeof(float);
int inner_block = 8;
while (rest > 0) {
if (rest >= 8) {
inner_block = 8;
vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
// shift zmm of inner_block, change reg_idx if update
} else if (rest >= 4) {
inner_block = 4;
vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
} else if (rest >= 2) {
inner_block = 2;
vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
} else {
inner_block = 1;
vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
}
z_offset += inner_block * sizeof(float);
rest -= inner_block;
}
}
postCode();
}
class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
public:
bool UseMe(const matmul_attr_t& attr) const override {
return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
}
size_t CodeSize(const matmul_attr_t& attr) const override {
int block = YMM_FLOAT_BLOCK;
if (platform::MayIUse(platform::avx512f)) {
block = ZMM_FLOAT_BLOCK;
}
return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
}
std::unique_ptr<GenBase> CreateJitCode(
const matmul_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.m, 0);
PADDLE_ENFORCE_GT(attr.n, 0);
PADDLE_ENFORCE_GT(attr.k, 0);
return make_unique<MatMulJitCode>(attr, CodeSize(attr));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
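For orientation, what the generated code computes is a vector-matrix product (the creator above restricts it to m == 1). A scalar reference of the same loop structure, ignoring the packed weight layout and SIMD blocking (illustrative only, not part of the patch):

    // z[1 x n] = x[1 x k] * y[k x n]; the JIT emits vbroadcastss for x[i]
    // and vfmadd231ps across the n-blocks held in zmm registers.
    void MatMulM1Ref(const float* x, const float* y, float* z, int n, int k) {
      for (int j = 0; j < n; ++j) z[j] = 0.f;
      for (int i = 0; i < k; ++i) {
        const float xi = x[i];
        for (int j = 0; j < n; ++j) {
          z[j] += xi * y[i * n + j];
        }
      }
    }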
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <stdlib.h> // for malloc and free
#include <string>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class MatMulJitCode : public JitCode {
public:
explicit MatMulJitCode(const matmul_attr_t& attr,
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
this->genCode();
}
virtual const char* name() const {
std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
std::to_string(k_);
return base.c_str();
}
void genCode() override;
private:
int m_, n_, k_;
reg64_t param_x{abi_param1};
reg64_t param_y{abi_param2};
reg64_t param_z{abi_param3};
reg64_t param_attr{abi_param4};
reg64_t reg_tmp{rax};
reg64_t reg_ptr_wgt{r10};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
@@ -16,6 +16,8 @@
 #include <fstream>
 #include <iostream>
 #include <sstream>
+#include <vector>
+#include "paddle/fluid/platform/cpu_info.h"

 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }

+std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
+  int block;
+  int max_num_regs;
+  if (platform::MayIUse(platform::avx512f)) {
+    block = ZMM_FLOAT_BLOCK;
+    max_num_regs = 32;
+  } else {
+    block = YMM_FLOAT_BLOCK;
+    max_num_regs = 16;
+  }
+  // one for x, one for y, others for z
+  const int max_used_regs_for_n = max_num_regs - 2;
+  const int aligned_n = n % block == 0 ? n : (n / block + 1) * block;
+  const int num_block = aligned_n / block;
+  const int num_groups = num_block / max_used_regs_for_n;
+  std::vector<int> groups(num_groups, max_used_regs_for_n);
+  int rest_num_regs = num_block % max_used_regs_for_n;
+  if (rest_num_regs != 0) {
+    groups.push_back(rest_num_regs);
+  }
+  if (block_out) {
+    *block_out = block;
+  }
+  if (rest_out) {
+    *rest_out = n % block;
+  }
+  return groups;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
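A worked example of packed_groups, assuming an AVX-only machine (so block = YMM_FLOAT_BLOCK = 8 and max_num_regs = 16, leaving 14 registers for z):

    int block = 0, rest = 0;
    // n = 150: aligned_n = 152, num_block = 19 -> one full group of 14
    // register blocks plus a tail group of 5; rest = 150 % 8 = 6 floats.
    auto groups =
        paddle::operators::jit::packed_groups(150, /*k=*/1, &block, &rest);
    // groups == {14, 5}, block == 8, rest == 6 on the assumed hardware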
@@ -16,6 +16,7 @@
 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"

 DECLARE_bool(dump_jitcode);
@@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator {
   virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
 };

+// unify the method of packed groups
+// output the packed groups which used in weights, the block size and rest size
+std::vector<int> packed_groups(int n, int k, int* block = nullptr,
+                               int* rest = nullptr);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
@@ -14,6 +14,8 @@
 #include "paddle/fluid/operators/jit/helper.h"
 #include <algorithm>  // tolower
+#include <numeric>
+#include <string>
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
@@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) {
   return kNone;
 }

+template <>
+void pack_weights<float>(const float* src, float* dst, int n, int k) {
+  int block, rest;
+  const auto groups = packed_groups(n, k, &block, &rest);
+  std::for_each(groups.begin(), groups.end(), [&](int i) {
+    PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
+  });
+  int sum = std::accumulate(groups.begin(), groups.end(), 0);
+  std::memset(dst, 0, k * sum * block * sizeof(float));
+  PADDLE_ENFORCE_GE(sum * block, n,
+                    "The packed n should be equal to or larger than n");
+
+  const int block_len = sizeof(float) * block;
+  int n_offset = 0;
+
+  for (size_t g = 0; g < groups.size(); ++g) {
+    const float* from = src + n_offset;
+    for (int j = 0; j < k; ++j) {
+      size_t copy_sz = groups[g] * block_len;
+      if (g == groups.size() - 1 && rest != 0) {
+        copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float);
+      }
+      std::memcpy(dst, from + j * n, copy_sz);
+      dst += groups[g] * block;
+    }
+    n_offset += groups[g] * block;
+  }
+}
+
+template <typename T>
+typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
+    const T* src, T* dst, int n, int k) {
+  PADDLE_THROW("Only support pack with float type.");
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
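A sketch of how a caller might prepare a packed buffer with these helpers (buffer sizing mirrors the memset above; weight, n and k are assumed to be in scope):

    int block = 0, rest = 0;
    const auto groups =
        paddle::operators::jit::packed_groups(n, k, &block, &rest);
    const int sum = std::accumulate(groups.begin(), groups.end(), 0);
    std::vector<float> packed(static_cast<size_t>(k) * sum * block, 0.f);
    paddle::operators::jit::pack_weights<float>(weight, packed.data(), n, k);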
@@ -118,26 +118,33 @@ typename KernelTuples::func_type Get(
   return GetRefer<KT, KernelTuples>();
 }

-template <KernelType KT, typename KernelTuples>
-class KernelFuncsCache {
+template <KernelType KT, typename KernelTuples, typename PlaceType>
+class KernelFuncs {
  public:
-  KernelFuncsCache() = default;
-  static KernelFuncsCache& Instance() {
-    static thread_local KernelFuncsCache<KT, KernelTuples> g_func_cache;
+  KernelFuncs() = default;
+  static KernelFuncs& Cache() {
+    static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
     return g_func_cache;
   }

   bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }

-  typename KernelTuples::func_type At(int key) { return funcs_.at(key); }
-
   void Insert(int key, typename KernelTuples::func_type func) {
     funcs_.emplace(key, func);
   }

+  typename KernelTuples::func_type At(int key) {
+    if (Has(key)) {
+      return funcs_.at(key);
+    }
+    auto func = Get<KT, KernelTuples, PlaceType>(key);
+    Insert(key, func);
+    return func;
+  }
+
  private:
   std::unordered_map<int, typename KernelTuples::func_type> funcs_;
-  DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
+  DISABLE_COPY_AND_ASSIGN(KernelFuncs);
 };

 const char* to_string(KernelType kt);
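Typical use of the refactored cache (a sketch; d, x, y, z are assumed to be in scope and namespace qualifiers shortened for readability): the first At(d) for a given size resolves the kernel through Get and memoizes it in the thread-local map, so later calls are a plain hash lookup:

    auto vadd = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<float>,
                                 platform::CPUPlace>::Cache()
                    .At(d);
    vadd(x, y, z, d);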
@@ -152,17 +159,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
      << (attr.use_peephole ? "True" : "False") << "]";
   return os;
 }

 inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
   os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
      << "],act_cand[" << to_string(attr.act_cand) << "]";
   return os;
 }

 inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
   os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
      << to_string(attr.type) << "]";
   return os;
 }

+inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
+  os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
+  return os;
+}
+
+// expose the method to pack matmul weight
+template <typename T>
+void pack_weights(const T* src, T* dst, int n, int k);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
@@ -145,11 +145,19 @@ struct SeqPoolTuples {
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };

+typedef struct matmul_attr_s {
+  int m, n, k;
+  void* packed_weight{nullptr};
+  matmul_attr_s() = default;
+  explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr)
+      : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {}
+} matmul_attr_t;
+
 template <typename T>
 struct MatMulTuples {
   typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int, int);
+  typedef matmul_attr_t attr_type;
+  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };

 template <typename T>
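Calling a MatMul kernel through the new attribute struct (a sketch with illustrative values; a_data, b_data and c_data are assumed buffers of matching sizes):

    jit::matmul_attr_t attr{2, 3, 4};  // C(2x3) = A(2x4) * B(4x3)
    auto matmul = jit::Get<jit::kMatMul, jit::MatMulTuples<float>,
                           platform::CPUPlace>(attr);
    matmul(a_data, b_data, c_data, &attr);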
...
@@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
   return (key << pool_type_shift) + static_cast<int>(attr.type);
 }

+template <>
+size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
+  size_t key = attr.m;
+  constexpr int shift = 21;
+  return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
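The key packs all three dimensions into one size_t: m in the top bits, n shifted by 21, k in the low bits, so distinct shapes collide only if a dimension reaches 2^21. For example (assuming 64-bit size_t):

    // m = 1, n = 16, k = 4:
    size_t key = (size_t(1) << 42) + (size_t(16) << 21) + 4;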
@@ -49,49 +49,16 @@ void VTanh(const T* x, T* y, int n) {
 }

 void Softmax(const T* x, T* y, int n, int bs) {
-  typename XRNTuples<T>::func_type compute_hmax{nullptr};
-  typename XRNTuples<T>::func_type compute_hsum{nullptr};
-  typename AXYNTuples<T>::func_type compute_vscal{nullptr};
-  typename AXYNTuples<T>::func_type compute_vaddbias{nullptr};
-  typename XYNTuples<T>::func_type compute_vexp{nullptr};
-
-  if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) {
-    compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax);
-  } else {
-    compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) {
-    compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum);
-  } else {
-    compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) {
-    compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n,
-                                                               compute_vscal);
-  } else {
-    compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) {
-    compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert(
-        n, compute_vaddbias);
-  } else {
-    compute_vaddbias =
-        KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) {
-    compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp);
-  } else {
-    compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n);
-  }
+  auto compute_hmax =
+      KernelFuncs<kHMax, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_hsum =
+      KernelFuncs<kHSum, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vscal =
+      KernelFuncs<kVScal, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vaddbias =
+      KernelFuncs<kVAddBias, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vexp =
+      KernelFuncs<kVExp, XYNTuples<T>, platform::CPUPlace>::Cache().At(n);

   for (int i = 0; i < bs; ++i) {
     T scalar;
...
@@ -25,17 +25,19 @@ namespace more {
 namespace mkl {

 template <>
-void MatMul<float>(const float* a, const float* b, float* c, int m, int n,
-                   int k) {
-  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.f, a, k, b, n, 0.f, c, n);
+void MatMul<float>(const float* a, const float* b, float* c,
+                   const matmul_attr_t* attr) {
+  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
+                                 attr->n, 0.f, c, attr->n);
 }

 template <>
-void MatMul<double>(const double* a, const double* b, double* c, int m, int n,
-                    int k) {
-  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.0, a, k, b, n, 0.0, c, n);
+void MatMul<double>(const double* a, const double* b, double* c,
+                    const matmul_attr_t* attr) {
+  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
+                                 attr->n, 0.0, c, attr->n);
 }

 template <>
@@ -127,11 +129,6 @@ void ASum<double>(const double* x, double* res, int n) {
 }

 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
-template <>
-bool MatMulKernel<float>::UseMe(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
 template <>
 bool VMulKernel<float>::UseMe(const int& d) const {
   return platform::MayIUse(platform::avx512f) && d > 512;
@@ -139,7 +136,7 @@ bool VMulKernel<float>::UseMe(const int& d) const {

 template <>
 bool VAddKernel<float>::UseMe(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return platform::MayIUse(platform::avx) && d > 512;
 }

 template <>
@@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }

+template <>
+bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
+  return platform::MayIUse(platform::avx);
+}
+
+template <>
+bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
+  return true;
+}
+
 template <>
 bool SoftmaxKernel<float>::UseMe(const int& d) const {
   // tuned on avx2
@@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const {
     return true;                                                             \
   }

-AWALYS_USE_ME_WITH_DOUBLE(MatMul);
 AWALYS_USE_ME_WITH_DOUBLE(VMul);
 AWALYS_USE_ME_WITH_DOUBLE(VAdd);
 AWALYS_USE_ME_WITH_DOUBLE(VScal);
...
@@ -26,7 +26,7 @@ namespace more {
 namespace mkl {

 template <typename T>
-void MatMul(const T* a, const T* b, T* c, int m, int n, int k);
+void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr);

 template <typename T>
 void VMul(const T* x, const T* y, T* z, int n);
...
@@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
 // A(M,K) * B(K,N) = C(M,N)
 template <typename T>
-void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
+void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) {
+  int M = attr->m;
+  int N = attr->n;
+  int K = attr->k;
   for (int m = 0; m < M; ++m) {
     const T* pa = A + m * K;
     T* pc = C + m * N;
     for (int n = 0; n < N; ++n) {
       const T* pb = B + n;
-      T sum = static_cast<T>(0);
-      for (int k = 0; k < K; ++k) {
-        sum += (pa[k] * pb[k * N]);
+      pc[n] = pa[0] * pb[0];
+      for (int k = 1; k < K; ++k) {
+        pc[n] += pa[k] * pb[k * N];
       }
-      *(pc + n) = sum;
     }
   }
 }
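The rewritten reference seeds each output element with the k = 0 term and then accumulates directly into C, dropping the temporary sum. With the new attribute struct it would be invoked along these lines (a sketch; buffers and dimensions assumed to be in scope):

    jit::matmul_attr_t attr{m, n, k};
    paddle::operators::jit::refer::MatMul<float>(A, B, C, &attr);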
...
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"

-static double acc = 1e-5;
+DEFINE_double(acc, 1e-5, "Test accuracy threshold.");

 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
@@ -39,7 +39,7 @@ template <typename T>
 void ExpectEQ(const T* target, const T* refer, int n) {
   if (std::is_floating_point<T>::value) {
     for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], acc);
+      EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
     }
   } else {
     for (int i = 0; i < n; ++i) {
@@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
 template <typename T>
 struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, int, int, int> {
+                         std::vector<T>,
+                         typename jit::MatMulTuples<T>::attr_type> {
   void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
                   const std::vector<T>& a, const std::vector<T>& b,
-                  const std::vector<T>& cref, int m, int n, int k) {
+                  const std::vector<T>& cref,
+                  const typename jit::MatMulTuples<T>::attr_type& attr) {
     EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(a.size(), static_cast<size_t>(m * k));
-    EXPECT_EQ(b.size(), static_cast<size_t>(k * n));
-    EXPECT_EQ(cref.size(), static_cast<size_t>(m * n));
+    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
+    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
+    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
     std::vector<T> c(cref.size());
     const T* a_data = a.data();
     const T* b_data = b.data();
     const T* cref_data = cref.data();
     T* c_data = c.data();
-    tgt(a_data, b_data, c_data, m, n, k);
-    ExpectEQ<T>(c_data, cref_data, m * n);
+    tgt(a_data, b_data, c_data, &attr);
+    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
   }
 };
@@ -383,8 +385,8 @@ void TestAXYNKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestXRNKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  acc = 1e-4;
+  auto last_acc = FLAGS_acc;
+  FLAGS_acc = 1e-4;
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
     EXPECT_TRUE(ref != nullptr);
@@ -395,7 +397,7 @@ void TestXRNKernel() {
     TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
                                                                       ref_res);
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }

 template <jit::KernelType KT, typename T, typename PlaceType>
@@ -535,9 +537,10 @@ void TestSeqPoolKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  // TODO(intel): this should be acc issue of MKL
-  acc = 1e-3;
+  auto last_acc = FLAGS_acc;
+  // TODO(intel): fix MKL acc issue
+  // https://github.com/PaddlePaddle/Paddle/issues/15447
+  FLAGS_acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
@@ -549,13 +552,14 @@ void TestMatMulKernel() {
         const T* a_data = a.data();
         const T* b_data = b.data();
         T* c_data = c.data();
-        ref(a_data, b_data, c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        ref(a_data, b_data, c_data, &attr);
         TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(k, a, b, c, m, n, k);
+                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
       }
     }
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }

 template <jit::KernelType KT, typename T, typename PlaceType>
...
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace operators {
@@ -193,8 +194,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
-                        ops::LookupTableCUDAKernel<double>);
+                        ops::LookupTableCUDAKernel<double>,
+                        ops::LookupTableCUDAKernel<plat::float16>);
 REGISTER_OP_CUDA_KERNEL(lookup_table_grad,
                         ops::LookupTableGradCUDAKernel<float>,
-                        ops::LookupTableGradCUDAKernel<double>);
+                        ops::LookupTableGradCUDAKernel<double>,
+                        ops::LookupTableGradCUDAKernel<plat::float16>);
@@ -29,8 +29,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor *ids,
                   const framework::LoDTensor *scores,
                   framework::LoDTensor *selected_ids,
-                  framework::LoDTensor *selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated) {
+                  framework::LoDTensor *selected_scores,
+                  framework::Tensor *parent_idx, size_t level, size_t beam_size,
+                  int end_id, bool is_accumulated) {
     auto abs_lod = framework::ToAbsOffset(scores->lod());
     auto &high_level = abs_lod[level];
@@ -57,11 +58,13 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
         std::vector<int64_t>({static_cast<int>(num_instances), 1}));
     selected_ids->Resize(dims);
     selected_scores->Resize(dims);
+    parent_idx->Resize({static_cast<int64_t>(num_instances)});

     auto *selected_ids_data =
         selected_ids->mutable_data<int64_t>(platform::CPUPlace());
     auto *selected_scores_data =
         selected_scores->mutable_data<float>(platform::CPUPlace());
+    auto *parent_idx_data = parent_idx->mutable_data<int>(platform::CPUPlace());

     // fill in data
     std::vector<size_t> low_level;
@@ -69,6 +72,7 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
     for (auto &items : selected_items) {
       low_level.push_back(low_offset);
       for (auto &item : items) {
+        parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
        selected_ids_data[low_offset] = item.id;
        selected_scores_data[low_offset] = item.score;
        low_offset++;
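The new parent_idx output records, for each selected candidate, the index of the source beam it extends, so a decoder can gather prefix states without re-walking the LoD. An illustration with made-up values:

    // If beam 0 keeps two candidates and beam 1 keeps one, the flattened
    // outputs line up as:
    //   selected_ids = [id_a, id_b, id_c]
    //   parent_idx   = [0,    0,    1   ]  // source beam of each candidate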
...
@@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
 }
 __device__ __forceinline__ void WriteBack(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    Triple* top_beam_local, const int seq_offset_start,
-    const int seq_offset_end, const int selected_seq_start,
-    const int selected_seq_length) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, Triple* top_beam_local,
+    const int seq_offset_start, const int seq_offset_end,
+    const int selected_seq_start, const int selected_seq_length) {
   const int tid = threadIdx.x;  // use 1 thread only for each sequence
   int global_index = selected_seq_start;
   for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
@@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack(
       selected_ids[global_index] =
           static_cast<int64_t>(top_beam_local[local_index].id);
       selected_scores[global_index] = top_beam_local[local_index].score;
+      parent_idx[global_index] = static_cast<int>(global_offset);
       global_index++;
     }
 }
@@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack(
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __device__ void BeamSearchDetails(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_offset_start, const int seq_offset_end,
-    const int seq_width, int beam_size, int end_id, bool is_accumulated,
-    int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_offset_start,
+    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
+    bool is_accumulated, int num_used_threads) {
   __shared__ Triple top_beam[MaxLength];
   int num_items = 0;
@@ -228,15 +229,15 @@ __device__ void BeamSearchDetails(
       selected_offsets[0] = 0;
     }
-    WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
-              seq_offset_start, seq_offset_end, selected_seq_start,
-              selected_seq_length);
+    WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets,
+              top_beam_local, seq_offset_start, seq_offset_end,
+              selected_seq_start, selected_seq_length);
   }
 }
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
-                                 size_t* selected_offsets,
+                                 int* parent_idx, size_t* selected_offsets,
                                  const int64_t* pre_ids,
                                  const float* pre_scores, const int64_t* ids,
                                  const float* scores, const size_t* seq_offsets,
@@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
   int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 template <int MaxLength, int MaxThreadsPerSeq>
 __global__ void BeamSearchKernelSingle(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_length, const int seq_width,
-    int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_length,
+    const int seq_width, int beam_size, int end_id, bool is_accumulated,
+    int num_used_threads) {
   const int seq_offset_start = 0;
   const int seq_offset_end = seq_length;
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 static inline int GetNumUsedThreads(const int max_threads_per_seq,
@@ -300,8 +302,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
                  const framework::LoDTensor* ids,
                  const framework::LoDTensor* scores,
                  framework::LoDTensor* selected_ids,
-                 framework::LoDTensor* selected_scores, size_t level,
-                 size_t beam_size, int end_id, bool is_accumulated) {
+                 framework::LoDTensor* selected_scores,
+                 framework::Tensor* parent_idx, size_t level, size_t beam_size,
+                 int end_id, bool is_accumulated) {
    auto abs_lod = framework::ToAbsOffset(scores->lod());
    const int64_t* pre_ids_data = pre_ids->data<int64_t>();
@@ -322,6 +325,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
        selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
    float* selected_scores_data =
        selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
+   int* parent_idx_data = parent_idx->mutable_data<int>(
+       {static_cast<int64_t>(num_seqs * beam_size)}, context.GetPlace());
    framework::LoD selected_lod(2);
    selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
@@ -339,9 +344,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
        CUDA_LAUNCH_KERNEL_HELPER(
            BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
                1, kMaxThreadsPerSeq, 0, context.stream()>>>(
-               selected_ids_data, selected_scores_data, selected_offsets,
-               pre_ids_data, pre_scores_data, ids_data, scores_data,
-               seq_length, static_cast<int>(seq_width),
+               selected_ids_data, selected_scores_data, parent_idx_data,
+               selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+               scores_data, seq_length, static_cast<int>(seq_width),
                static_cast<int>(beam_size), static_cast<int>(end_id),
                is_accumulated, num_used_threads));
      }
@@ -357,9 +362,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
        CUDA_LAUNCH_KERNEL_HELPER(
            BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
                1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
-               selected_ids_data, selected_scores_data, selected_offsets,
-               pre_ids_data, pre_scores_data, ids_data, scores_data,
-               seq_offsets, static_cast<int>(num_seqs),
+               selected_ids_data, selected_scores_data, parent_idx_data,
+               selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+               scores_data, seq_offsets, static_cast<int>(num_seqs),
                static_cast<int>(seq_width), static_cast<int>(beam_size),
                end_id, is_accumulated, num_used_threads));
      }
@@ -379,6 +384,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
        {static_cast<int64_t>(selected_lod[1].back()), 1});
    selected_ids->Resize(final_selected_dims);
    selected_scores->Resize(final_selected_dims);
+   parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
  }
 }
};
......
@@ -104,14 +104,12 @@ class BeamSearchFunctor {
   * Return false if all the input tensors are empty; in a machine translation
   * task that means no candidates are provided, and the task will stop running.
   */
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor* pre_ids,
-                  const framework::LoDTensor* pre_scores,
-                  const framework::LoDTensor* ids,
-                  const framework::LoDTensor* scores,
-                  framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated);
+  void operator()(
+      const DeviceContext& context, const framework::LoDTensor* pre_ids,
+      const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
+      const framework::LoDTensor* scores, framework::LoDTensor* selected_ids,
+      framework::LoDTensor* selected_scores, framework::Tensor* parent_idx,
+      size_t level, size_t beam_size, int end_id, bool is_accumulated);
};
}  // namespace math
......
@@ -93,13 +93,14 @@ void TestBeamSearch() {
  paddle::framework::LoDTensor selected_ids;
  paddle::framework::LoDTensor selected_scores;
+ paddle::framework::LoDTensor parent_idx;
  size_t level = 0;
  size_t beam_size = 2;
  int end_id = 0;
  paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
  beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
-            &selected_scores, level, beam_size, end_id, true);
+            &selected_scores, &parent_idx, level, beam_size, end_id, true);
  ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
......
@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
    return;
  }
  if (relu) {
-   auto compute =
-       jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+   auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
+                                   platform::CPUPlace>::Cache()
+                      .At(N);
    for (int i = 0; i < M; i++) {
      T* dst = Y + i * N;
      compute(B, dst, dst, N);
    }
  } else {
-   auto compute =
-       jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+   auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
+                                   platform::CPUPlace>::Cache()
+                      .At(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
......
@@ -82,8 +82,9 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
    const int kClassDim = 1;
    // 2D data. Batch x C
    auto compute_softmax =
-       jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>(
-           in_dims[kClassDim]);
+       jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>,
+                        platform::CPUPlace>::Cache()
+           .At(in_dims[kClassDim]);
    compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
  }
};
......
@@ -31,6 +31,8 @@ std::map<std::string,
             std::shared_ptr<std::unordered_map<
                 std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphBridge::NG_NODE_MAP = {
+       {"conv2d", NG_OPS::BuildConv2dNode},
+       {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
        {"fill_constant", NG_OPS::BuildFillConstantNode},
@@ -38,6 +40,8 @@ std::map<std::string,
        {"mean_grad", NG_OPS::BuildMeanGradNode},
        {"mul", NG_OPS::BuildMulNode},
        {"mul_grad", NG_OPS::BuildMulGradNode},
+       {"pool2d", NG_OPS::BuildPool2dNode},
+       {"pool2d_grad", NG_OPS::BuildPool2dGradNode},
        {"softmax", NG_OPS::BuildSoftmaxNode},
        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
        {"scale", NG_OPS::BuildScaleNode},
......
@@ -22,10 +22,12 @@ limitations under the License. */
#pragma once
#include "ops/binary_unnary_op.h"
+#include "ops/conv2d_op.h"
#include "ops/elementwise_add_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h"
+#include "ops/pool2d_op.h"
#include "ops/scale_op.h"
#include "ops/softmax_op.h"
#include "ops/top_k_op.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
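// GroupedConvolution emulates a grouped conv with plain nGraph ops: for each
// group it slices the matching input channels and the matching filters,
// convolves each pair separately, and concatenates the per-group results
// along the channel axis (axis 1).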
std::shared_ptr<ngraph::Node> GroupedConvolution(
const std::shared_ptr<ngraph::Node>& data_batch,
const std::shared_ptr<ngraph::Node>& filters, const ngraph::Strides strides,
const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
size_t groups) {
auto& data_shape = data_batch->get_shape();
auto& filter_shape = filters->get_shape();
ngraph::NodeVector ng_slices;
for (size_t i = 0; i < groups; ++i) {
size_t channel_step = filter_shape.at(1);
const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
const std::vector<size_t> upper_bound{data_shape.at(0),
(i + 1) * channel_step,
data_shape.at(2), data_shape.at(3)};
auto data_slice = std::make_shared<ngraph::op::Slice>(
data_batch, lower_bound, upper_bound);
size_t filter_step = filter_shape.at(0) / groups;
const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
const std::vector<size_t> filter_upper_bound{
(i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
filter_shape.at(3)};
auto filter_slice = std::make_shared<ngraph::op::Slice>(
filters, filter_lower_bound, filter_upper_bound);
auto ng_conv = std::make_shared<ngraph::op::Convolution>(
data_slice, filter_slice, strides, dilations, paddings, paddings);
ng_slices.push_back(ng_conv);
}
size_t concat_axis = 1;
return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
}
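// The two gradient helpers below follow the same per-group slicing scheme:
// data, filters, and the output gradient are sliced group by group, a
// backprop convolution is built per slice, and the partial results are
// concatenated back (along axis 0 for the filter gradient, along axis 1 for
// the data gradient).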
std::shared_ptr<ngraph::Node> GroupedGradConvolutionFilter(
const std::shared_ptr<ngraph::Node>& data_batch,
const std::shared_ptr<ngraph::Node>& filters,
const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
size_t groups) {
auto& data_shape = data_batch->get_shape();
auto& filter_shape = filters->get_shape();
auto& out_shape = doutput->get_shape();
ngraph::NodeVector ng_slices;
for (size_t i = 0; i < groups; ++i) {
size_t channel_step = filter_shape.at(1);
const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
const std::vector<size_t> upper_bound{data_shape.at(0),
(i + 1) * channel_step,
data_shape.at(2), data_shape.at(3)};
auto data_slice = std::make_shared<ngraph::op::Slice>(
data_batch, lower_bound, upper_bound);
size_t filter_step = data_shape.at(0);
const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
const std::vector<size_t> filter_upper_bound{
(i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
filter_shape.at(3)};
auto filter_slice = std::make_shared<ngraph::op::Slice>(
filters, filter_lower_bound, filter_upper_bound);
const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
const std::vector<size_t> oupper_bound{out_shape.at(0),
(i + 1) * filter_step,
out_shape.at(2), out_shape.at(3)};
auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
oupper_bound);
auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
data_slice, filter_slice->get_shape(), out_slice, strides, dilations,
paddings, paddings, ngraph::Strides{1, 1});
ng_slices.push_back(ng_conv);
}
size_t concat_axis = 0;
return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
}
std::shared_ptr<ngraph::Node> GroupedGradConvolutionData(
const std::shared_ptr<ngraph::Node>& data_batch,
const std::shared_ptr<ngraph::Node>& filters,
const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
size_t groups) {
auto& data_shape = data_batch->get_shape();
auto& filter_shape = filters->get_shape();
auto& out_shape = doutput->get_shape();
ngraph::NodeVector ng_slices;
for (size_t i = 0; i < groups; ++i) {
size_t channel_step = filter_shape.at(1);
const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
const std::vector<size_t> upper_bound{data_shape.at(0),
(i + 1) * channel_step,
data_shape.at(2), data_shape.at(3)};
auto data_slice = std::make_shared<ngraph::op::Slice>(
data_batch, lower_bound, upper_bound);
size_t filter_step = data_shape.at(0);
const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
const std::vector<size_t> filter_upper_bound{
(i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
filter_shape.at(3)};
auto filter_slice = std::make_shared<ngraph::op::Slice>(
filters, filter_lower_bound, filter_upper_bound);
const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
const std::vector<size_t> oupper_bound{out_shape.at(0),
(i + 1) * filter_step,
out_shape.at(2), out_shape.at(3)};
auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
oupper_bound);
auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropData>(
data_slice->get_shape(), filter_slice, out_slice, strides, dilations,
paddings, paddings, ngraph::Strides{1, 1});
ng_slices.push_back(ng_conv);
}
size_t concat_axis = 1;
return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
}
void BuildConv2dNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
static_cast<size_t>(strides.at(1))};
const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
static_cast<size_t>(dilations.at(1))};
const ngraph::CoordinateDiff ng_paddings{
static_cast<std::ptrdiff_t>(paddings.at(0)),
static_cast<std::ptrdiff_t>(paddings.at(1))};
  int groups = op_attrs.Get<int>("groups");
  PADDLE_ENFORCE_GE(groups, 1, "conv groups needs to be no less than 1");
std::shared_ptr<ngraph::Node> result;
if (groups == 1) {
result = std::make_shared<ngraph::op::Convolution>(
input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings);
} else {
result = GroupedConvolution(input, filters, ng_strides, ng_dilations,
ng_paddings, groups);
}
paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map);
}
void BuildConv2dGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
auto doutput =
paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map);
int groups = op_attrs.Get<int>("groups");
std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
static_cast<size_t>(strides.at(1))};
const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
static_cast<size_t>(dilations.at(1))};
const ngraph::CoordinateDiff ng_paddings{
static_cast<std::ptrdiff_t>(paddings.at(0)),
static_cast<std::ptrdiff_t>(paddings.at(1))};
std::shared_ptr<ngraph::Node> dfilter;
std::shared_ptr<ngraph::Node> dinput;
if (groups == 1) {
dfilter = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
input, filter->get_shape(), doutput, ng_strides, ng_dilations,
ng_paddings, ng_paddings, ngraph::Strides{1, 1});
dinput = std::make_shared<ngraph::op::ConvolutionBackpropData>(
input->get_shape(), filter, doutput, ng_strides, ng_dilations,
ng_paddings, ng_paddings, ngraph::Strides{1, 1});
} else {
dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides,
ng_dilations, ng_paddings, groups);
dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides,
ng_dilations, ng_paddings, groups);
}
paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, ngb_node_map);
paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildPool2dNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto x_shape = x->get_shape();
std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
"Handling 2d pooling only");
if (op_attrs.Get<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(x_shape.at(i + 2));
}
}
ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
static_cast<size_t>(paddings.at(1))};
ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
static_cast<size_t>(paddings.at(1))};
ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
static_cast<size_t>(ksize.at(1))};
ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
static_cast<size_t>(strides.at(1))};
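  // ComputeCeiledOutput below returns the floor-mode output size for one
  // spatial dim; under ceil_mode the desired (already-inferred) shape of Out
  // is compared against it, and ng_padding_above is enlarged wherever an
  // extra output element has to be produced.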
auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) {
return (in - k + 2 * p) / s + 1;
};
if (op_attrs.Get<bool>("ceil_mode")) {
auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
    auto dummy_shape = dummy_out->get_shape();
for (size_t i = 0; i < ng_padding_above.size(); ++i) {
auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i],
paddings[i], strides[i]);
      if (desired_size != dummy_shape[i + 2]) {
ng_padding_above[i] += strides[i];
}
}
}
bool padding_exclusive = op_attrs.Get<bool>("exclusive");
if (pooling_type == "max") {
auto pool2d = std::make_shared<ngraph::op::MaxPool>(
x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above);
paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
} else if (pooling_type == "avg") {
std::shared_ptr<ngraph::Node> pool2d;
if (op_attrs.Get<bool>("adaptive")) {
auto ComputeAdaptive = [](size_t in, size_t k) {
return std::floor(in / k);
};
      ng_strides[0] = x_shape.size() == 4
                          ? ComputeAdaptive(x_shape[2], ksize[0])
                          : ng_strides[0];
      ng_strides[1] = x_shape.size() == 4
                          ? ComputeAdaptive(x_shape[3], ksize[1])
                          : ng_strides[1];
pool2d =
std::make_shared<ngraph::op::AvgPool>(x, ng_ksize_shape, ng_strides);
} else {
pool2d = std::make_shared<ngraph::op::AvgPool>(
x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above,
!padding_exclusive);
}
paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
} else {
PADDLE_THROW("Support max and avg pooling only");
}
}
void BuildPool2dGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto x_shape = x->get_shape();
std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
"Handling 2d pooling only");
if (op_attrs.Get<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(x_shape.at(i + 2));
}
}
ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
static_cast<size_t>(paddings.at(1))};
ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
static_cast<size_t>(paddings.at(1))};
ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
static_cast<size_t>(ksize.at(1))};
ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
static_cast<size_t>(strides.at(1))};
bool padding_exclusive = op_attrs.Get<bool>("exclusive");
if (pooling_type == "max") {
auto pool2d_grad = std::make_shared<ngraph::op::MaxPoolBackprop>(
x, dout, out, ng_ksize_shape, ng_strides, ng_padding_below,
ng_padding_above);
paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
} else if (pooling_type == "avg") {
std::shared_ptr<ngraph::Node> pool2d_grad;
if (op_attrs.Get<bool>("adaptive")) {
auto ComputeAdaptive = [](size_t in, size_t k) {
return std::floor(in / k);
};
      ng_strides[0] = x_shape.size() == 4
                          ? ComputeAdaptive(x_shape[2], ksize[0])
                          : ng_strides[0];
      ng_strides[1] = x_shape.size() == 4
                          ? ComputeAdaptive(x_shape[3], ksize[1])
                          : ng_strides[1];
pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
ng_padding_above, !padding_exclusive);
} else {
pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
ng_padding_above, !padding_exclusive);
}
paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
} else {
PADDLE_THROW("Support max and avg pooling only");
}
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
@@ -359,6 +359,7 @@ class ReshapeGradInplaceInToOut : public framework::InplaceInToOut {
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
+namespace plat = paddle::platform;
REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>,
@@ -388,16 +389,20 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
+                               int64_t, ops::ReshapeKernel, plat::float16,
+                               ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel, plat::float16,
                                ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
+                               int64_t, ops::ReshapeKernel, plat::float16,
+                               ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel, plat::float16,
                                ops::ReshapeGradKernel);
#endif
@@ -54,6 +54,9 @@ class SliceOp : public framework::OperatorWithKernel {
      out_dims[axes[i]] = end - start;
    }
    ctx->SetOutputDim("Out", out_dims);
+   if (axes[0] != 0) {
+     ctx->ShareLoD("Input", /*->*/ "Out");
+   }
  }
 protected:
......
@@ -17,13 +17,16 @@
namespace plat = paddle::platform;
namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
-                        ops::StackKernel<plat::CUDADeviceContext, double>,
-                        ops::StackKernel<plat::CUDADeviceContext, int>,
-                        ops::StackKernel<plat::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    stack, ops::StackKernel<plat::CUDADeviceContext, float>,
+    ops::StackKernel<plat::CUDADeviceContext, double>,
+    ops::StackKernel<plat::CUDADeviceContext, int>,
+    ops::StackKernel<plat::CUDADeviceContext, int64_t>,
+    ops::StackKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(stack_grad,
-                        ops::StackGradKernel<plat::CUDADeviceContext, float>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, double>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, int>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    stack_grad, ops::StackGradKernel<plat::CUDADeviceContext, float>,
+    ops::StackGradKernel<plat::CUDADeviceContext, double>,
+    ops::StackGradKernel<plat::CUDADeviceContext, int>,
+    ops::StackGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::StackGradKernel<plat::CUDADeviceContext, plat::float16>);
@@ -15,19 +15,27 @@ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h"
namespace ops = paddle::operators;
+namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-   ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
+   ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
+   ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
+   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
+   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    transpose2,
    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-   ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
+   ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
+   ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    transpose2_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
+   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
+   ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array5 = Eigen::DSizes<int64_t, 5>;
template <typename T>
static inline bool isZero(T x) {
return fabs(x) < 1e-6;
}
template <typename T>
static inline T sigmoid(T x) {
return 1.0 / (exp(-1.0 * x) + 1.0);
}
template <typename T>
static inline T CalcMaskPointNum(const Tensor& mask) {
auto mask_t = EigenVector<int>::Flatten(mask);
T count = 0.0;
for (int i = 0; i < mask_t.dimensions()[0]; i++) {
if (mask_t(i)) {
count += 1.0;
}
}
return count;
}
template <typename T>
static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y,
const Tensor& mask) {
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
T error_sum = 0.0;
T points = 0.0;
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
error_sum += pow(x_t(i) - y_t(i), 2);
points += 1;
}
}
return (error_sum / points);
}
template <typename T>
static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y,
const Tensor& mask, T mf) {
auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf;
}
}
}
template <typename T>
static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y,
const Tensor& mask) {
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
T error_sum = 0.0;
T points = 0.0;
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
error_sum +=
-1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i)));
points += 1;
}
}
return (error_sum / points);
}
template <typename T>
static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x,
const Tensor& y, const Tensor& mask,
T mf) {
auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf;
}
}
}
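// Decodes the raw network output: each anchor owns (5 + class_num) channels
// laid out as x, y, w, h, objectness, then class scores; x, y, objectness,
// and the class scores are squashed through a sigmoid, while w and h are
// kept raw.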
template <typename T>
static void CalcPredResult(const Tensor& input, Tensor* pred_conf,
Tensor* pred_class, Tensor* pred_x, Tensor* pred_y,
Tensor* pred_w, Tensor* pred_h, const int anchor_num,
const int class_num) {
const int n = input.dims()[0];
const int h = input.dims()[2];
const int w = input.dims()[3];
const int box_attr_num = 5 + class_num;
auto input_t = EigenTensor<T, 4>::From(input);
auto pred_conf_t = EigenTensor<T, 4>::From(*pred_conf);
auto pred_class_t = EigenTensor<T, 5>::From(*pred_class);
auto pred_x_t = EigenTensor<T, 4>::From(*pred_x);
auto pred_y_t = EigenTensor<T, 4>::From(*pred_y);
auto pred_w_t = EigenTensor<T, 4>::From(*pred_w);
auto pred_h_t = EigenTensor<T, 4>::From(*pred_h);
for (int i = 0; i < n; i++) {
for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
for (int j = 0; j < h; j++) {
for (int k = 0; k < w; k++) {
pred_x_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx, j, k));
pred_y_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k));
pred_w_t(i, an_idx, j, k) =
input_t(i, box_attr_num * an_idx + 2, j, k);
pred_h_t(i, an_idx, j, k) =
input_t(i, box_attr_num * an_idx + 3, j, k);
pred_conf_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k));
for (int c = 0; c < class_num; c++) {
pred_class_t(i, an_idx, j, k, c) =
sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k));
}
}
}
}
}
}
template <typename T>
static T CalcBoxIoU(std::vector<T> box1, std::vector<T> box2) {
T b1_x1 = box1[0] - box1[2] / 2;
T b1_x2 = box1[0] + box1[2] / 2;
T b1_y1 = box1[1] - box1[3] / 2;
T b1_y2 = box1[1] + box1[3] / 2;
T b2_x1 = box2[0] - box2[2] / 2;
T b2_x2 = box2[0] + box2[2] / 2;
T b2_y1 = box2[1] - box2[3] / 2;
T b2_y2 = box2[1] + box2[3] / 2;
T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1);
T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1);
T inter_rect_x1 = std::max(b1_x1, b2_x1);
T inter_rect_y1 = std::max(b1_y1, b2_y1);
T inter_rect_x2 = std::min(b1_x2, b2_x2);
T inter_rect_y2 = std::min(b1_y2, b2_y2);
T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast<T>(0.0)) *
std::max(inter_rect_y2 - inter_rect_y1, static_cast<T>(0.0));
return inter_area / (b1_area + b2_area - inter_area);
}
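// Matches every ground-truth box to the anchor whose shape yields the highest
// IoU, fills the object/no-object masks, and builds the per-cell regression
// and classification targets (tx, ty, tw, th, tconf, tclass). Anchors whose
// IoU exceeds ignore_thresh are additionally cleared from the no-object mask.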
template <typename T>
static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label,
const float ignore_thresh, std::vector<int> anchors,
const int grid_size, Tensor* obj_mask,
Tensor* noobj_mask, Tensor* tx, Tensor* ty,
Tensor* tw, Tensor* th, Tensor* tconf,
Tensor* tclass) {
const int n = gt_box.dims()[0];
const int b = gt_box.dims()[1];
const int anchor_num = anchors.size() / 2;
auto gt_box_t = EigenTensor<T, 3>::From(gt_box);
auto gt_label_t = EigenTensor<int, 2>::From(gt_label);
auto obj_mask_t = EigenTensor<int, 4>::From(*obj_mask).setConstant(0);
auto noobj_mask_t = EigenTensor<int, 4>::From(*noobj_mask).setConstant(1);
auto tx_t = EigenTensor<T, 4>::From(*tx).setConstant(0.0);
auto ty_t = EigenTensor<T, 4>::From(*ty).setConstant(0.0);
auto tw_t = EigenTensor<T, 4>::From(*tw).setConstant(0.0);
auto th_t = EigenTensor<T, 4>::From(*th).setConstant(0.0);
auto tconf_t = EigenTensor<T, 4>::From(*tconf).setConstant(0.0);
auto tclass_t = EigenTensor<T, 5>::From(*tclass).setConstant(0.0);
for (int i = 0; i < n; i++) {
for (int j = 0; j < b; j++) {
if (isZero<T>(gt_box_t(i, j, 0)) && isZero<T>(gt_box_t(i, j, 1)) &&
isZero<T>(gt_box_t(i, j, 2)) && isZero<T>(gt_box_t(i, j, 3))) {
continue;
}
int cur_label = gt_label_t(i, j);
T gx = gt_box_t(i, j, 0) * grid_size;
T gy = gt_box_t(i, j, 1) * grid_size;
T gw = gt_box_t(i, j, 2) * grid_size;
T gh = gt_box_t(i, j, 3) * grid_size;
int gi = static_cast<int>(gx);
int gj = static_cast<int>(gy);
T max_iou = static_cast<T>(0);
T iou;
int best_an_index = -1;
std::vector<T> gt_box_shape({0, 0, gw, gh});
for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
std::vector<T> anchor_shape({0, 0, static_cast<T>(anchors[2 * an_idx]),
static_cast<T>(anchors[2 * an_idx + 1])});
iou = CalcBoxIoU<T>(gt_box_shape, anchor_shape);
if (iou > max_iou) {
max_iou = iou;
best_an_index = an_idx;
}
if (iou > ignore_thresh) {
noobj_mask_t(i, an_idx, gj, gi) = 0;
}
}
obj_mask_t(i, best_an_index, gj, gi) = 1;
noobj_mask_t(i, best_an_index, gj, gi) = 0;
tx_t(i, best_an_index, gj, gi) = gx - gi;
ty_t(i, best_an_index, gj, gi) = gy - gj;
tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]);
th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]);
tclass_t(i, best_an_index, gj, gi, cur_label) = 1;
tconf_t(i, best_an_index, gj, gi) = 1;
}
}
}
static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand,
const Tensor& obj_mask) {
const int n = obj_mask_expand->dims()[0];
const int an_num = obj_mask_expand->dims()[1];
const int h = obj_mask_expand->dims()[2];
const int w = obj_mask_expand->dims()[3];
const int class_num = obj_mask_expand->dims()[4];
auto obj_mask_expand_t = EigenTensor<int, 5>::From(*obj_mask_expand);
auto obj_mask_t = EigenTensor<int, 4>::From(obj_mask);
obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1))
.broadcast(Array5(1, 1, 1, 1, class_num));
}
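// Scatters every partial gradient back into the raw input layout; for the
// sigmoid-activated channels (x, y, objectness, class scores) the chain-rule
// factor s * (1 - s) is applied, and each term is scaled by its loss weight
// and by the incoming loss gradient `loss`.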
template <typename T>
static void AddAllGradToInputGrad(
Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y,
const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x,
const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h,
const Tensor& grad_conf_target, const Tensor& grad_conf_notarget,
const Tensor& grad_class, const int class_num, const float loss_weight_xy,
const float loss_weight_wh, const float loss_weight_conf_target,
const float loss_weight_conf_notarget, const float loss_weight_class) {
const int n = pred_x.dims()[0];
const int an_num = pred_x.dims()[1];
const int h = pred_x.dims()[2];
const int w = pred_x.dims()[3];
const int attr_num = class_num + 5;
auto grad_t = EigenTensor<T, 4>::From(*grad).setConstant(0.0);
auto pred_x_t = EigenTensor<T, 4>::From(pred_x);
auto pred_y_t = EigenTensor<T, 4>::From(pred_y);
auto pred_conf_t = EigenTensor<T, 4>::From(pred_conf);
auto pred_class_t = EigenTensor<T, 5>::From(pred_class);
auto grad_x_t = EigenTensor<T, 4>::From(grad_x);
auto grad_y_t = EigenTensor<T, 4>::From(grad_y);
auto grad_w_t = EigenTensor<T, 4>::From(grad_w);
auto grad_h_t = EigenTensor<T, 4>::From(grad_h);
auto grad_conf_target_t = EigenTensor<T, 4>::From(grad_conf_target);
auto grad_conf_notarget_t = EigenTensor<T, 4>::From(grad_conf_notarget);
auto grad_class_t = EigenTensor<T, 5>::From(grad_class);
for (int i = 0; i < n; i++) {
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
grad_t(i, j * attr_num, k, l) =
grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) *
(1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy;
grad_t(i, j * attr_num + 1, k, l) =
grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) *
(1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy;
grad_t(i, j * attr_num + 2, k, l) =
grad_w_t(i, j, k, l) * loss * loss_weight_wh;
grad_t(i, j * attr_num + 3, k, l) =
grad_h_t(i, j, k, l) * loss * loss_weight_wh;
grad_t(i, j * attr_num + 4, k, l) =
grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
(1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target;
grad_t(i, j * attr_num + 4, k, l) +=
grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
(1.0 - pred_conf_t(i, j, k, l)) * loss *
loss_weight_conf_notarget;
for (int c = 0; c < class_num; c++) {
grad_t(i, j * attr_num + 5 + c, k, l) =
grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) *
(1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class;
}
}
}
}
}
}
template <typename T>
class Yolov3LossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* loss = ctx.Output<Tensor>("Loss");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh");
float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
float loss_weight_conf_notarget =
ctx.Attr<float>("loss_weight_conf_notarget");
float loss_weight_class = ctx.Attr<float>("loss_weight_class");
const int n = input->dims()[0];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int an_num = anchors.size() / 2;
Tensor pred_x, pred_y, pred_w, pred_h;
Tensor pred_conf, pred_class;
pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
&pred_w, &pred_h, an_num, class_num);
Tensor obj_mask, noobj_mask;
Tensor tx, ty, tw, th, tconf, tclass;
obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
&noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
Tensor obj_mask_expand;
obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
ctx.GetPlace());
ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
T loss_x = CalcMSEWithMask<T>(pred_x, tx, obj_mask);
T loss_y = CalcMSEWithMask<T>(pred_y, ty, obj_mask);
T loss_w = CalcMSEWithMask<T>(pred_w, tw, obj_mask);
T loss_h = CalcMSEWithMask<T>(pred_h, th, obj_mask);
T loss_conf_target = CalcBCEWithMask<T>(pred_conf, tconf, obj_mask);
T loss_conf_notarget = CalcBCEWithMask<T>(pred_conf, tconf, noobj_mask);
T loss_class = CalcBCEWithMask<T>(pred_class, tclass, obj_mask_expand);
auto* loss_data = loss->mutable_data<T>({1}, ctx.GetPlace());
loss_data[0] = loss_weight_xy * (loss_x + loss_y) +
loss_weight_wh * (loss_w + loss_h) +
loss_weight_conf_target * loss_conf_target +
loss_weight_conf_notarget * loss_conf_notarget +
loss_weight_class * loss_class;
}
};
template <typename T>
class Yolov3LossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
const T loss = output_grad->data<T>()[0];
float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
float loss_weight_conf_notarget =
ctx.Attr<float>("loss_weight_conf_notarget");
float loss_weight_class = ctx.Attr<float>("loss_weight_class");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int an_num = anchors.size() / 2;
Tensor pred_x, pred_y, pred_w, pred_h;
Tensor pred_conf, pred_class;
pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
&pred_w, &pred_h, an_num, class_num);
Tensor obj_mask, noobj_mask;
Tensor tx, ty, tw, th, tconf, tclass;
obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
&noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
Tensor obj_mask_expand;
obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
ctx.GetPlace());
ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
Tensor grad_x, grad_y, grad_w, grad_h;
Tensor grad_conf_target, grad_conf_notarget, grad_class;
grad_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_conf_target.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_conf_notarget.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
T obj_mf = CalcMaskPointNum<int>(obj_mask);
T noobj_mf = CalcMaskPointNum<int>(noobj_mask);
T obj_expand_mf = CalcMaskPointNum<int>(obj_mask_expand);
CalcMSEGradWithMask<T>(&grad_x, pred_x, tx, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_y, pred_y, ty, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_w, pred_w, tw, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_h, pred_h, th, obj_mask, obj_mf);
CalcBCEGradWithMask<T>(&grad_conf_target, pred_conf, tconf, obj_mask,
obj_mf);
CalcBCEGradWithMask<T>(&grad_conf_notarget, pred_conf, tconf, noobj_mask,
noobj_mf);
CalcBCEGradWithMask<T>(&grad_class, pred_class, tclass, obj_mask_expand,
obj_expand_mf);
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
AddAllGradToInputGrad<T>(
input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y,
grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class,
class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target,
loss_weight_conf_notarget, loss_weight_class);
}
};
} // namespace operators
} // namespace paddle
@@ -22,6 +22,8 @@ from . import op_frequence
from .op_frequence import *
from . import quantize
from .quantize import *
+from . import int8_inference
+from .int8_inference import *
from . import reader
from .reader import *
from . import slim
@@ -34,6 +36,7 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
__all__ += quantize.__all__
+__all__ += int8_inference.__all__
__all__ += reader.__all__
__all__ += slim.__all__
__all__ += utils.__all__
# Offline INT8 Calibration Tool
PaddlePaddle supports offline INT8 calibration to accelerate inference. This document explains how to enable INT8 calibration and shows the resulting accuracy for ResNet-50 and MobileNet-V1.
## 0. Prerequisite
You need the PaddlePaddle 1.3 (or later) Python package: `pip install paddlepaddle==1.3`.
## 1. How to generate INT8 model
You can refer to the unit test in [test_calibration.py](../tests/test_calibration.py). Basically, there are three steps (a combined sketch follows the list):
* Construct calibration object.
```python
calibrator = int8_utility.Calibrator( # Step 1
program=infer_program, # required, FP32 program
pretrained_model=model_path, # required, FP32 pretrained model
algo=algo, # required, calibration algorithm; default is max, the alternative is KL (Kullback–Leibler divergence)
exe=exe, # required, executor
output=int8_model, # required, INT8 model
feed_var_names=feed_dict, # required, feed dict
fetch_list=fetch_targets) # required, fetch targets
```
* Call `calibrator.sample_data()` after each executor run.
```python
_, acc1, _ = exe.run(
program,
feed={feed_dict[0]: image,
feed_dict[1]: label},
fetch_list=fetch_targets)
calibrator.sample_data() # Step 2
```
* Call `calibrator.save_int8_model()` after sampling over the specified number of iterations (e.g., iterations = 50).
```python
calibrator.save_int8_model() # Step 3
```
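Putting the three steps together, a minimal calibration loop could look like the sketch below. It assumes the `Calibrator` import used by the unit test, and `test_reader` stands in for your own FP32 data reader; `infer_program`, `model_path`, `int8_model`, `feed_dict`, and `fetch_targets` come from your FP32 model setup as above.
```python
import paddle.fluid as fluid
import paddle.fluid.contrib.int8_inference.utility as int8_utility

exe = fluid.Executor(fluid.CPUPlace())
calibrator = int8_utility.Calibrator(  # Step 1
    program=infer_program,
    pretrained_model=model_path,
    algo='KL',  # or 'max' (the default)
    exe=exe,
    output=int8_model,
    feed_var_names=feed_dict,
    fetch_list=fetch_targets)
for batch_id, (image, label) in enumerate(test_reader()):
    exe.run(infer_program,
            feed={feed_dict[0]: image,
                  feed_dict[1]: label},
            fetch_list=fetch_targets)
    calibrator.sample_data()  # Step 2
    if batch_id == 49:  # sample 50 batches, then stop
        break
calibrator.save_int8_model()  # Step 3
```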
## 2. How to run INT8 model
You can load the INT8 model with the load_inference_model [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference in the same way as [FP32](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/eval.py "FP32").
```python
[infer_program, feed_dict,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
```
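Running the loaded program then follows the usual executor pattern; a minimal sketch, assuming `image` holds a preprocessed input batch:
```python
results = exe.run(infer_program,
                  feed={feed_dict[0]: image},
                  fetch_list=fetch_targets)
```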
## 3. Result
We provide the accuracy results measured on [Intel® Xeon® Gold Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake 6148).
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| ------------ | ------------ | ------------ | ------------ | ------------ |
| ResNet-50 | Small | 72.00% | 72.00% | 0.00% |
| MobileNet-V1 | Small | 62.00% | 62.00% | 0.00% |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.17% | 0.46% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.49% | 0.29% |
Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of the [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset").
Notes:
* The accuracy measurement requires a model with a `label` input.
* The theoretical INT8 speedup is ~1.33X on an Intel® Xeon® Skylake server: 4x more input is processed at the cost of 3x more instructions, and 4/3 ≈ 1.33 (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")).
## 4. How to reproduce the results
* Small dataset
```bash
python python/paddle/fluid/contrib/tests/test_calibration.py
```
* Full dataset
```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
```
...@@ -11,3 +11,10 @@ ...@@ -11,3 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
from . import utility
from .utility import *
__all__ = utility.__all__
...@@ -11,11 +11,15 @@ ...@@ -11,11 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.fluid.core as core
from paddle.fluid import core
import numpy as np import numpy as np
import math import math
import os import os
import paddle.fluid as fluid from paddle.fluid.executor import global_scope
from paddle.fluid import io
__all__ = ['Calibrator']
class Calibrator(object): class Calibrator(object):
...@@ -76,8 +80,7 @@ class Calibrator(object): ...@@ -76,8 +80,7 @@ class Calibrator(object):
''' '''
for i in self.sampling_program.list_vars(): for i in self.sampling_program.list_vars():
if i.name in self.sampling_vars: if i.name in self.sampling_vars:
np_data = np.array(fluid.global_scope().find_var(i.name) np_data = np.array(global_scope().find_var(i.name).get_tensor())
.get_tensor())
if i.name not in self._sampling_data: if i.name not in self._sampling_data:
self._sampling_data[i.name] = [] self._sampling_data[i.name] = []
self._sampling_data[i.name].append(np_data) self._sampling_data[i.name].append(np_data)
...@@ -86,9 +89,9 @@ class Calibrator(object): ...@@ -86,9 +89,9 @@ class Calibrator(object):
''' '''
Save the quantized model to the disk. Save the quantized model to the disk.
''' '''
fluid.io.save_inference_model(self.output, self.feed_var_names, io.save_inference_model(self.output, self.feed_var_names,
self.fetch_list, self.exe, self.fetch_list, self.exe,
self.sampling_program) self.sampling_program)
def __display_debug(self): def __display_debug(self):
if self.debug: if self.debug:
......
...@@ -19,15 +19,12 @@ import sys ...@@ -19,15 +19,12 @@ import sys
import random import random
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import argparse
import functools import functools
import contextlib import contextlib
import paddle.fluid.profiler as profiler
from paddle.dataset.common import download from paddle.dataset.common import download
from PIL import Image, ImageEnhance from PIL import Image, ImageEnhance
import math import math
sys.path.append('..') import paddle.fluid.contrib.int8_inference.utility as int8_utility
import int8_inference.utility as int8_utility
random.seed(0) random.seed(0)
np.random.seed(0) np.random.seed(0)
...@@ -43,7 +40,7 @@ img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) ...@@ -43,7 +40,7 @@ img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 # TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator
def resize_short(img, target_size): def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1]) percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent)) resized_width = int(round(img.size[0] * percent))
...@@ -123,16 +120,37 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -123,16 +120,37 @@ class TestCalibrationForResnet50(unittest.TestCase):
self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
self.int8_download) self.int8_download)
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' data_urls = []
data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d' data_md5s = []
self.data_cache_folder = self.download_data(data_url, data_md5, "data") self.data_cache_folder = ''
if os.environ.get('DATASET') == 'full':
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
)
data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
self.data_cache_folder = self.download_data(data_urls, data_md5s,
"full_data", False)
else:
data_urls.append(
'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
)
data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
self.data_cache_folder = self.download_data(data_urls, data_md5s,
"small_data", False)
# reader/decorator.py requires the relative path to the data folder # reader/decorator.py requires the relative path to the data folder
cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
self.data_cache_folder) self.data_cache_folder)
os.system(cmd) os.system(cmd)
self.iterations = 50 self.batch_size = 1
self.sample_iterations = 50
self.infer_iterations = 50000 if os.environ.get(
'DATASET') == 'full' else 50
def cache_unzipping(self, target_folder, zip_path): def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder): if not os.path.exists(target_folder):
...@@ -140,20 +158,44 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -140,20 +158,44 @@ class TestCalibrationForResnet50(unittest.TestCase):
zip_path) zip_path)
os.system(cmd) os.system(cmd)
def download_data(self, data_url, data_md5, folder_name): def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
download(data_url, self.int8_download, data_md5)
data_cache_folder = os.path.join(self.cache_folder, folder_name) data_cache_folder = os.path.join(self.cache_folder, folder_name)
file_name = data_url.split('/')[-1] zip_path = ''
zip_path = os.path.join(self.cache_folder, file_name) if os.environ.get('DATASET') == 'full':
file_names = []
for i in range(0, len(data_urls)):
download(data_urls[i], self.int8_download, data_md5s[i])
file_names.append(data_urls[i].split('/')[-1])
zip_path = os.path.join(self.cache_folder,
'full_imagenet_val.tar.gz')
if not os.path.exists(zip_path):
cat_command = 'cat'
for file_name in file_names:
cat_command += ' ' + os.path.join(self.cache_folder,
file_name)
cat_command += ' > ' + zip_path
os.system(cat_command)
if os.environ.get('DATASET') != 'full' or is_model:
download(data_urls[0], self.int8_download, data_md5s[0])
file_name = data_urls[0].split('/')[-1]
zip_path = os.path.join(self.cache_folder, file_name)
print('Data is downloaded at {0}'.format(zip_path))
self.cache_unzipping(data_cache_folder, zip_path) self.cache_unzipping(data_cache_folder, zip_path)
return data_cache_folder return data_cache_folder
def download_resnet50_model(self): def download_model(self):
# resnet50 fp32 data # resnet50 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' data_urls = [
data_md5 = '4a5194524823d9b76da6e738e1367881' 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
self.model_cache_folder = self.download_data(data_url, data_md5, ]
data_md5s = ['4a5194524823d9b76da6e738e1367881']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
"resnet50_fp32") "resnet50_fp32")
self.model = "ResNet-50"
self.algo = "direct"
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
...@@ -169,17 +211,17 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -169,17 +211,17 @@ class TestCalibrationForResnet50(unittest.TestCase):
t = fluid.transpiler.InferenceTranspiler() t = fluid.transpiler.InferenceTranspiler()
t.transpile(infer_program, fluid.CPUPlace()) t.transpile(infer_program, fluid.CPUPlace())
val_reader = paddle.batch(val(), batch_size=1) val_reader = paddle.batch(val(), self.batch_size)
iterations = self.infer_iterations
if generate_int8: if generate_int8:
int8_model = os.path.join(os.getcwd(), "calibration_out") int8_model = os.path.join(os.getcwd(), "calibration_out")
iterations = self.sample_iterations
if os.path.exists(int8_model): if os.path.exists(int8_model):
os.system("rm -rf " + int8_model) os.system("rm -rf " + int8_model)
os.system("mkdir " + int8_model) os.system("mkdir " + int8_model)
print("Start calibration ...")
calibrator = int8_utility.Calibrator( calibrator = int8_utility.Calibrator(
program=infer_program, program=infer_program,
pretrained_model=model_path, pretrained_model=model_path,
...@@ -191,6 +233,7 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -191,6 +233,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
test_info = [] test_info = []
cnt = 0 cnt = 0
periods = []
for batch_id, data in enumerate(val_reader()): for batch_id, data in enumerate(val_reader()):
image = np.array( image = np.array(
[x[0].reshape(image_shape) for x in data]).astype("float32") [x[0].reshape(image_shape) for x in data]).astype("float32")
...@@ -202,21 +245,28 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -202,21 +245,28 @@ class TestCalibrationForResnet50(unittest.TestCase):
if op.has_attr("use_mkldnn"): if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True) op._set_attr("use_mkldnn", True)
t1 = time.time()
_, acc1, _ = exe.run( _, acc1, _ = exe.run(
running_program, running_program,
feed={feed_dict[0]: image, feed={feed_dict[0]: image,
feed_dict[1]: label}, feed_dict[1]: label},
fetch_list=fetch_targets) fetch_list=fetch_targets)
t2 = time.time()
period = t2 - t1
periods.append(period)
if generate_int8: if generate_int8:
calibrator.sample_data() calibrator.sample_data()
test_info.append(np.mean(acc1) * len(data)) test_info.append(np.mean(acc1) * len(data))
cnt += len(data) cnt += len(data)
if batch_id != self.iterations - 1: if (batch_id + 1) % 100 == 0:
continue print("{0} images,".format(batch_id + 1))
sys.stdout.flush()
break if (batch_id + 1) == iterations:
break
if generate_int8: if generate_int8:
calibrator.save_int8_model() calibrator.save_int8_model()
...@@ -225,32 +275,49 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -225,32 +275,49 @@ class TestCalibrationForResnet50(unittest.TestCase):
"Calibration is done and the corresponding files are generated at {}". "Calibration is done and the corresponding files are generated at {}".
format(os.path.abspath("calibration_out"))) format(os.path.abspath("calibration_out")))
else: else:
return np.sum(test_info) / cnt throughput = cnt / np.sum(periods)
latency = np.average(periods)
acc1 = np.sum(test_info) / cnt
return (throughput, latency, acc1)
def test_calibration(self): def test_calibration(self):
self.download_resnet50_model() self.download_model()
fp32_acc1 = self.run_program(self.model_cache_folder + "/model") print("Start FP32 inference for {0} on {1} images ...".format(
self.run_program(self.model_cache_folder + "/model", True) self.model, self.infer_iterations))
int8_acc1 = self.run_program("calibration_out") (fp32_throughput, fp32_latency,
fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
print("Start INT8 calibration for {0} on {1} images ...").format(
self.model, self.sample_iterations)
self.run_program(
self.model_cache_folder + "/model", True, algo=self.algo)
print("Start INT8 inference for {0} on {1} images ...").format(
self.model, self.infer_iterations)
(int8_throughput, int8_latency,
int8_acc1) = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1) delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01) self.assertLess(delta_value, 0.01)
print(
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, fp32_throughput, fp32_latency,
fp32_acc1))
print(
"INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
format(self.model, self.batch_size, int8_throughput, int8_latency,
int8_acc1))
sys.stdout.flush()
class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
def download_mobilenetv1_model(self): def download_model(self):
# mobilenetv1 fp32 data # mobilenetv1 fp32 data
data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' data_urls = [
data_md5 = '13892b0716d26443a8cdea15b3c6438b' 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
self.model_cache_folder = self.download_data(data_url, data_md5, ]
data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
self.model_cache_folder = self.download_data(data_urls, data_md5s,
"mobilenetv1_fp32") "mobilenetv1_fp32")
self.model = "MobileNet-V1"
def test_calibration(self): self.algo = "KL"
self.download_mobilenetv1_model()
fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
self.run_program(self.model_cache_folder + "/model", True, algo='KL')
int8_acc1 = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1)
self.assertLess(delta_value, 0.01)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -366,17 +366,40 @@ class TruncatedNormalInitializer(Initializer): ...@@ -366,17 +366,40 @@ class TruncatedNormalInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
# to be compatible with fp16 initializers
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(
['truncated_gaussian_random', 'tmp'])),
shape=var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
else:
out_dtype = var.dtype
out_var = var
op = block._prepend_op( op = block._prepend_op(
type="truncated_gaussian_random", type="truncated_gaussian_random",
outputs={"Out": var}, outputs={"Out": out_var},
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": out_dtype,
"mean": self._mean, "mean": self._mean,
"std": self._std_dev, "std": self._std_dev,
"seed": self._seed "seed": self._seed
}, },
stop_gradient=True) stop_gradient=True)
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
outputs={"Out": var},
attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype})
var.op = op var.op = op
return op return op
......
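The change above works around the missing FP16 kernel for `truncated_gaussian_random`: values are generated into a temporary FP32 variable and then cast to FP16. A minimal NumPy/SciPy sketch of the same pattern, as an illustration rather than Paddle's kernel:
```python
# Sketch only: sample a truncated normal in FP32, then cast to FP16,
# mirroring the prepend-init-then-cast sequence in the diff above.
import numpy as np
from scipy.stats import truncnorm

def truncated_normal_fp16(shape, mean=0.0, std=1.0, seed=0):
    rng = np.random.RandomState(seed)
    a, b = -2.0, 2.0  # truncate at two standard deviations
    fp32_sample = truncnorm.rvs(
        a, b, loc=mean, scale=std, size=shape,
        random_state=rng).astype(np.float32)
    return fp32_sample.astype(np.float16)  # final cast plays the role of the cast op

print(truncated_normal_fp16((2, 3)).dtype)  # float16
```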
...@@ -22,9 +22,10 @@ import shutil ...@@ -22,9 +22,10 @@ import shutil
import six import six
from functools import reduce from functools import reduce
from paddle.fluid import layers
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.evaluator import Evaluator from paddle.fluid.evaluator import Evaluator
from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
from . import core from . import core
__all__ = [ __all__ = [
...@@ -939,6 +940,17 @@ def save_inference_model(dirname, ...@@ -939,6 +940,17 @@ def save_inference_model(dirname,
we save the original program as inference model.", we save the original program as inference model.",
RuntimeWarning) RuntimeWarning)
# Fix the bug that an activation op's output used as a target would be pruned.
# Note: the extra scale ops will affect the inference performance.
# TODO(Superjomn) add an IR pass to remove 1-scale op.
with program_guard(main_program):
uniq_target_vars = []
for var in target_vars:
if isinstance(var, Variable):
var1 = layers.scale(var, 1.)
uniq_target_vars.append(var1)
target_vars = uniq_target_vars
# when a pserver and a trainer running on the same machine, mkdir may conflict # when a pserver and a trainer running on the same machine, mkdir may conflict
try: try:
os.makedirs(dirname) os.makedirs(dirname)
......
...@@ -49,6 +49,7 @@ __all__ = [ ...@@ -49,6 +49,7 @@ __all__ = [
'box_coder', 'box_coder',
'polygon_box_transform', 'polygon_box_transform',
'yolov3_loss', 'yolov3_loss',
'box_clip',
'multiclass_nms', 'multiclass_nms',
] ]
...@@ -346,19 +347,107 @@ def box_coder(prior_box, ...@@ -346,19 +347,107 @@ def box_coder(prior_box,
target_box, target_box,
code_type="encode_center_size", code_type="encode_center_size",
box_normalized=True, box_normalized=True,
name=None): name=None,
axis=0):
""" """
${comment} **Box Coder Layer**
Encode/Decode the target bounding box with the priorbox information.
The encoding schema is described below:
.. math::
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = \log(|tw / pw|) / pwv
oh = \log(|th / ph|) / phv
The decoding schema is described below:
.. math::
ox = (pw * pxv * tx + px) - tw / 2
oy = (ph * pyv * ty + py) - th / 2
ow = \exp(pwv * tw) * pw + tw / 2
oh = \exp(phv * th) * ph + th / 2
where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates,
width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote
the priorbox's (anchor) center coordinates, width and height. `pxv`,
`pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`,
`ow`, `oh` denote the encoded/decoded coordinates, width and height.
During Box Decoding, two modes for broadcast are supported. Say target
box has shape [N, M, 4], and the shape of prior box can be [N, 4] or
[M, 4]. Then prior box will broadcast to target box along the
assigned axis.
Args: Args:
prior_box(${prior_box_type}): ${prior_box_comment} prior_box(Variable): Box list prior_box is a 2-D Tensor with shape
prior_box_var(${prior_box_var_type}): ${prior_box_var_comment} [M, 4] holds M boxes, each box is represented as
target_box(${target_box_type}): ${target_box_comment} [xmin, ymin, xmax, ymax], [xmin, ymin] is the
code_type(${code_type_type}): ${code_type_comment} left top coordinate of the anchor box, if the
box_normalized(${box_normalized_type}): ${box_normalized_comment} input is image feature map, they are close to
the origin of the coordinate system. [xmax, ymax]
is the right bottom coordinate of the anchor box.
prior_box_var(Variable|list): prior_box_var supports two types of input.
One is a variable with shape [M, 4] holding M groups.
The other is a list consisting of 4 elements
shared by all boxes.
target_box(Variable): This input can be a 2-D LoDTensor with shape
[N, 4] when code_type is 'encode_center_size'.
This input also can be a 3-D Tensor with shape
[N, M, 4] when code_type is 'decode_center_size'.
Each box is represented as
[xmin, ymin, xmax, ymax]. This tensor can
contain LoD information to represent a batch
of inputs.
code_type(string): The code type used with the target box. It can be
encode_center_size or decode_center_size
box_normalized(bool): Whether to treat the prior box as a normalized box.
Set True by default.
name(string): The name of box coder.
axis(int): Which axis in PriorBox to broadcast for box decode,
for example, if axis is 0 and TargetBox has shape
[N, M, 4] and PriorBox has shape [M, 4], then PriorBox
will broadcast to [N, M, 4] for decoding. It is only valid
when code type is decode_center_size. Set 0 by default.
Returns: Returns:
output_box(${output_box_type}): ${output_box_comment} output_box(Variable): When code_type is 'encode_center_size', the
output tensor of box_coder_op with shape
[N, M, 4] representing the result of N target
boxes encoded with M Prior boxes and variances.
When code_type is 'decode_center_size',
N represents the batch size and M represents
the number of decoded boxes.
Examples:
.. code-block:: python
prior_box = fluid.layers.data(name='prior_box',
shape=[512, 4],
dtype='float32',
append_batch_size=False)
target_box = fluid.layers.data(name='target_box',
shape=[512,81,4],
dtype='float32',
append_batch_size=False)
output = fluid.layers.box_coder(prior_box=prior_box,
prior_box_var=[0.1,0.1,0.2,0.2],
target_box=target_box,
code_type="decode_center_size",
box_normalized=False,
axis=1)
""" """
helper = LayerHelper("box_coder", **locals()) helper = LayerHelper("box_coder", **locals())
...@@ -369,15 +458,22 @@ def box_coder(prior_box, ...@@ -369,15 +458,22 @@ def box_coder(prior_box,
output_box = helper.create_variable( output_box = helper.create_variable(
name=name, dtype=prior_box.dtype, persistable=False) name=name, dtype=prior_box.dtype, persistable=False)
inputs = {"PriorBox": prior_box, "TargetBox": target_box}
attrs = {
"code_type": code_type,
"box_normalized": box_normalized,
"axis": axis
}
if isinstance(prior_box_var, Variable):
inputs['PriorBoxVar'] = prior_box_var
elif isinstance(prior_box_var, list):
attrs['variance'] = prior_box_var
else:
raise TypeError("Input variance of box_coder must be Variable or lisz")
helper.append_op( helper.append_op(
type="box_coder", type="box_coder",
inputs={ inputs=inputs,
"PriorBox": prior_box, attrs=attrs,
"PriorBoxVar": prior_box_var,
"TargetBox": target_box
},
attrs={"code_type": code_type,
"box_normalized": box_normalized},
outputs={"OutputBox": output_box}) outputs={"OutputBox": output_box})
return output_box return output_box
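For intuition, the encoding schema quoted in the docstring translates directly into array arithmetic. Below is a NumPy sketch of `encode_center_size` under the assumption of normalized boxes (width is simply `xmax - xmin`); it illustrates the formulas, not the actual box_coder kernel.
```python
import numpy as np

def encode_center_size(prior_box, prior_var, target_box):
    """prior_box, prior_var: [M, 4]; target_box: [N, 4] -> encoded [N, M, 4]."""
    px = (prior_box[:, 0] + prior_box[:, 2]) / 2    # prior centers/sizes, [M]
    py = (prior_box[:, 1] + prior_box[:, 3]) / 2
    pw = prior_box[:, 2] - prior_box[:, 0]
    ph = prior_box[:, 3] - prior_box[:, 1]
    tx = (target_box[:, 0] + target_box[:, 2]) / 2  # target centers/sizes, [N]
    ty = (target_box[:, 1] + target_box[:, 3]) / 2
    tw = target_box[:, 2] - target_box[:, 0]
    th = target_box[:, 3] - target_box[:, 1]
    # Broadcast [N, 1] against [M] to get the [N, M] encodings.
    ox = (tx[:, None] - px) / pw / prior_var[:, 0]
    oy = (ty[:, None] - py) / ph / prior_var[:, 1]
    ow = np.log(np.abs(tw[:, None] / pw)) / prior_var[:, 2]
    oh = np.log(np.abs(th[:, None] / ph)) / prior_var[:, 3]
    return np.stack([ox, oy, ow, oh], axis=-1)      # [N, M, 4]

prior = np.array([[0., 0., 10., 10.]], dtype=np.float32)
var = np.array([[0.1, 0.1, 0.2, 0.2]], dtype=np.float32)
target = np.array([[2., 2., 8., 8.]], dtype=np.float32)
print(encode_center_size(prior, var, target).shape)  # (1, 1, 4)
```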
...@@ -413,13 +509,10 @@ def yolov3_loss(x, ...@@ -413,13 +509,10 @@ def yolov3_loss(x,
gtbox, gtbox,
gtlabel, gtlabel,
anchors, anchors,
anchor_mask,
class_num, class_num,
ignore_thresh, ignore_thresh,
loss_weight_xy=None, downsample_ratio,
loss_weight_wh=None,
loss_weight_conf_target=None,
loss_weight_conf_notarget=None,
loss_weight_class=None,
name=None): name=None):
""" """
${comment} ${comment}
...@@ -431,16 +524,13 @@ def yolov3_loss(x, ...@@ -431,16 +524,13 @@ def yolov3_loss(x,
and x, y, w, h should be relative value of input image. and x, y, w, h should be relative value of input image.
N is the batch number and B is the max box number in N is the batch number and B is the max box number in
an image. an image.
gtlabel (Variable): class id of ground truth boxes, shoud be ins shape gtlabel (Variable): class id of ground truth boxes, should be in shape
of [N, B]. of [N, B].
anchors (list|tuple): ${anchors_comment} anchors (list|tuple): ${anchors_comment}
anchor_mask (list|tuple): ${anchor_mask_comment}
class_num (int): ${class_num_comment} class_num (int): ${class_num_comment}
ignore_thresh (float): ${ignore_thresh_comment} ignore_thresh (float): ${ignore_thresh_comment}
loss_weight_xy (float|None): ${loss_weight_xy_comment} downsample_ratio (int): ${downsample_ratio_comment}
loss_weight_wh (float|None): ${loss_weight_wh_comment}
loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment}
loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment}
loss_weight_class (float|None): ${loss_weight_class_comment}
name (string): the name of yolov3 loss name (string): the name of yolov3 loss
Returns: Returns:
...@@ -460,9 +550,10 @@ def yolov3_loss(x, ...@@ -460,9 +550,10 @@ def yolov3_loss(x,
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 anchor_mask = [0, 1, 2]
anchors=anchors, ignore_thresh=0.5) loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
anchor_mask=anchor_mask, class_num=80, ignore_thresh=0.5, downsample_ratio=32)
""" """
helper = LayerHelper('yolov3_loss', **locals()) helper = LayerHelper('yolov3_loss', **locals())
...@@ -474,6 +565,8 @@ def yolov3_loss(x, ...@@ -474,6 +565,8 @@ def yolov3_loss(x,
raise TypeError("Input gtlabel of yolov3_loss must be Variable") raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple): if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple") raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple")
if not isinstance(class_num, int): if not isinstance(class_num, int):
raise TypeError("Attr class_num of yolov3_loss must be an integer") raise TypeError("Attr class_num of yolov3_loss must be an integer")
if not isinstance(ignore_thresh, float): if not isinstance(ignore_thresh, float):
...@@ -486,31 +579,29 @@ def yolov3_loss(x, ...@@ -486,31 +579,29 @@ def yolov3_loss(x,
loss = helper.create_variable( loss = helper.create_variable(
name=name, dtype=x.dtype, persistable=False) name=name, dtype=x.dtype, persistable=False)
objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
attrs = { attrs = {
"anchors": anchors, "anchors": anchors,
"anchor_mask": anchor_mask,
"class_num": class_num, "class_num": class_num,
"ignore_thresh": ignore_thresh, "ignore_thresh": ignore_thresh,
"downsample_ratio": downsample_ratio,
} }
if loss_weight_xy is not None and isinstance(loss_weight_xy, float):
self.attrs['loss_weight_xy'] = loss_weight_xy
if loss_weight_wh is not None and isinstance(loss_weight_wh, float):
self.attrs['loss_weight_wh'] = loss_weight_wh
if loss_weight_conf_target is not None and isinstance(
loss_weight_conf_target, float):
self.attrs['loss_weight_conf_target'] = loss_weight_conf_target
if loss_weight_conf_notarget is not None and isinstance(
loss_weight_conf_notarget, float):
self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget
if loss_weight_class is not None and isinstance(loss_weight_class, float):
self.attrs['loss_weight_class'] = loss_weight_class
helper.append_op( helper.append_op(
type='yolov3_loss', type='yolov3_loss',
inputs={"X": x, inputs={
"GTBox": gtbox, "X": x,
"GTLabel": gtlabel}, "GTBox": gtbox,
outputs={'Loss': loss}, "GTLabel": gtlabel,
},
outputs={
'Loss': loss,
'ObjectnessMask': objectness_mask,
'GTMatchMask': gt_match_mask
},
attrs=attrs) attrs=attrs)
return loss return loss
...@@ -1965,6 +2056,54 @@ def generate_proposals(scores, ...@@ -1965,6 +2056,54 @@ def generate_proposals(scores,
return rpn_rois, rpn_roi_probs return rpn_rois, rpn_roi_probs
def box_clip(input, im_info, name=None):
"""
Clip the box into the size given by im_info
For each input box, the formula is given as follows:
.. code-block:: text
xmin = max(min(xmin, im_w - 1), 0)
ymin = max(min(ymin, im_h - 1), 0)
xmax = max(min(xmax, im_w - 1), 0)
ymax = max(min(ymax, im_h - 1), 0)
where im_w and im_h are computed from im_info:
.. code-block:: text
im_h = round(height / scale)
im_w = round(width / scale)
Args:
input(variable): The input box, the last dimension is 4.
im_info(variable): The information of image with shape [N, 3] with
layout (height, width, scale). height and width
are the input size, and scale is the ratio of the input
size to the original size.
name (str): The name of this layer. It is optional.
Returns:
Variable: The clipped tensor variable.
Examples:
.. code-block:: python
boxes = fluid.layers.data(
name='data', shape=[8, 4], dtype='float32', lod_level=1)
im_info = fluid.layers.data(name='im_info', shape=[3])
out = fluid.layers.box_clip(
input=boxes, im_info=im_info)
"""
helper = LayerHelper("box_clip", **locals())
output = helper.create_variable_for_type_inference(dtype=input.dtype)
inputs = {"Input": input, "ImInfo": im_info}
helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output})
return output
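The clipping formulas above map directly onto array operations; here is a NumPy sketch, as an illustration of the rule rather than the op itself:
```python
import numpy as np

def box_clip_np(boxes, im_info):
    """boxes: [..., 4] as [xmin, ymin, xmax, ymax]; im_info: (height, width, scale)."""
    height, width, scale = im_info
    im_h = int(round(height / scale))
    im_w = int(round(width / scale))
    out = boxes.copy()
    out[..., 0::2] = np.clip(boxes[..., 0::2], 0, im_w - 1)  # xmin, xmax
    out[..., 1::2] = np.clip(boxes[..., 1::2], 0, im_h - 1)  # ymin, ymax
    return out

boxes = np.array([[-3., 2., 40., 50.]], dtype=np.float32)
print(box_clip_np(boxes, (34., 34., 1.)))  # [[0., 2., 33., 33.]]
```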
def multiclass_nms(bboxes, def multiclass_nms(bboxes,
scores, scores,
score_threshold, score_threshold,
...@@ -2042,9 +2181,11 @@ def multiclass_nms(bboxes, ...@@ -2042,9 +2181,11 @@ def multiclass_nms(bboxes,
(After version 1.3, when no boxes detected, the lod is changed (After version 1.3, when no boxes detected, the lod is changed
from {0} to {1}) from {0} to {1})
Examples: Examples:
.. code-block:: python .. code-block:: python
boxes = fluid.layers.data(name='bboxes', shape=[81, 4], boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
scores = fluid.layers.data(name='scores', shape=[81], scores = fluid.layers.data(name='scores', shape=[81],
......
...@@ -932,7 +932,7 @@ def dynamic_gru(input, ...@@ -932,7 +932,7 @@ def dynamic_gru(input,
create ParamAttr as param_attr. If the Initializer of the param_attr create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None. is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations. the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate, If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one reset gate and candidate calculations. If it is set to None or one
...@@ -1073,7 +1073,7 @@ def gru_unit(input, ...@@ -1073,7 +1073,7 @@ def gru_unit(input,
create ParamAttr as param_attr. If the Initializer of the param_attr create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None. is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations. the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate, If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one reset gate and candidate calculations. If it is set to None or one
...@@ -3877,7 +3877,8 @@ def beam_search(pre_ids, ...@@ -3877,7 +3877,8 @@ def beam_search(pre_ids,
end_id, end_id,
level=0, level=0,
is_accumulated=True, is_accumulated=True,
name=None): name=None,
return_parent_idx=False):
""" """
Beam search is a classical algorithm for selecting candidate words in a Beam search is a classical algorithm for selecting candidate words in a
machine translation task. machine translation task.
...@@ -3933,10 +3934,16 @@ def beam_search(pre_ids, ...@@ -3933,10 +3934,16 @@ def beam_search(pre_ids,
accumulated scores. accumulated scores.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
return_parent_idx(bool): Whether to return an extra Tensor variable
preserving the selected_ids' parent indices in pre_ids
in output, which can be used to gather cell states at
the next time step.
Returns: Returns:
Variable: The LodTensor pair containing the selected ids and the \ Variable: The LodTensor tuple containing the selected ids and the \
corresponding scores. corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
an extra Tensor variable preserving the selected_ids' parent indices \
is included.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -3969,6 +3976,11 @@ def beam_search(pre_ids, ...@@ -3969,6 +3976,11 @@ def beam_search(pre_ids,
selected_scores = helper.create_variable_for_type_inference( selected_scores = helper.create_variable_for_type_inference(
dtype=score_type) dtype=score_type)
selected_ids = helper.create_variable_for_type_inference(dtype=id_type) selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
# parent_idx is a tensor used to gather cell states at the next time
# step. Though lod in selected_ids can also be used to gather by
# sequence_expand, it is not efficient.
# gather_op's index input only supports int32 dtype currently
parent_idx = helper.create_variable_for_type_inference(dtype="int32")
helper.append_op( helper.append_op(
type='beam_search', type='beam_search',
...@@ -3976,6 +3988,7 @@ def beam_search(pre_ids, ...@@ -3976,6 +3988,7 @@ def beam_search(pre_ids,
outputs={ outputs={
'selected_ids': selected_ids, 'selected_ids': selected_ids,
'selected_scores': selected_scores, 'selected_scores': selected_scores,
'parent_idx': parent_idx
}, },
attrs={ attrs={
# TODO(ChunweiYan) to assure other value support # TODO(ChunweiYan) to assure other value support
...@@ -3984,8 +3997,10 @@ def beam_search(pre_ids, ...@@ -3984,8 +3997,10 @@ def beam_search(pre_ids,
'end_id': end_id, 'end_id': end_id,
'is_accumulated': is_accumulated, 'is_accumulated': is_accumulated,
}) })
if return_parent_idx:
return selected_ids, selected_scores return selected_ids, selected_scores, parent_idx
else:
return selected_ids, selected_scores
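When `return_parent_idx` is True, the extra tensor is typically consumed by a gather to reorder decoder states between time steps; a hedged usage sketch, assuming `pre_ids`, `pre_scores`, `ids`, `scores`, and a cell-state variable `pre_cell` are already defined by the surrounding decoder:
```python
selected_ids, selected_scores, parent_idx = fluid.layers.beam_search(
    pre_ids, pre_scores, ids, scores,
    beam_size=4, end_id=1, return_parent_idx=True)
# Reorder the cell state so each surviving beam continues from its parent.
next_cell = fluid.layers.gather(pre_cell, parent_idx)
```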
def beam_search_decode(ids, scores, beam_size, end_id, name=None): def beam_search_decode(ids, scores, beam_size, end_id, name=None):
...@@ -5403,7 +5418,7 @@ def transpose(x, perm, name=None): ...@@ -5403,7 +5418,7 @@ def transpose(x, perm, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
# use append_batch_size=False to avoid prepending extra # use append_batch_size=False to avoid prepending extra
# batch size in shape # batch size in shape
x = fluid.layers.data(name='x', shape=[5, 10, 15], x = fluid.layers.data(name='x', shape=[5, 10, 15],
dtype='float32', append_batch_size=False) dtype='float32', append_batch_size=False)
...@@ -5920,7 +5935,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): ...@@ -5920,7 +5935,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
than :attr:`shape`. than :attr:`shape`.
act (str): The non-linear activation to be applied to the reshaped tensor act (str): The non-linear activation to be applied to the reshaped tensor
variable. variable.
inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
operators. If this flag is set :attr:`True`, reuse input operators. If this flag is set :attr:`True`, reuse input
:attr:`x` to reshape, which will change the shape of :attr:`x` to reshape, which will change the shape of
tensor variable :attr:`x` and might cause errors when tensor variable :attr:`x` and might cause errors when
...@@ -6581,7 +6596,9 @@ def image_resize(input, ...@@ -6581,7 +6596,9 @@ def image_resize(input,
scale=None, scale=None,
name=None, name=None,
resample='BILINEAR', resample='BILINEAR',
actual_shape=None): actual_shape=None,
align_corners=True,
align_mode=1):
""" """
**Resize a Batch of Images** **Resize a Batch of Images**
...@@ -6594,6 +6611,80 @@ def image_resize(input, ...@@ -6594,6 +6611,80 @@ def image_resize(input,
'NEAREST' : Nearest neighbor interpolation 'NEAREST' : Nearest neighbor interpolation
Nearest neighbor interpolation is performed in both the 3rd dimension
(the height direction) and the 4th dimension (the width direction)
of the input tensor.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
Align_corners and align_mode are optional parameters; the calculation method
of interpolation can be selected by them.
Example:
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = \left \lfloor H_{in} * scale_{factor} \right \rfloor
W_out = \left \lfloor W_{in} * scale_{factor} \right \rfloor
else:
align_corners = True
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation.
Args: Args:
input (Variable): The input tensor of image resize layer, input (Variable): The input tensor of image resize layer,
This is a 4-D tensor of the shape This is a 4-D tensor of the shape
...@@ -6623,6 +6714,13 @@ def image_resize(input, ...@@ -6623,6 +6714,13 @@ def image_resize(input,
set, otherwise errors would occur in graph set, otherwise errors would occur in graph
constructing stage. constructing stage.
Default: None Default: None
align_corners(bool) : An optional bool. If True, the centers of the 4 corner pixels of the
input and output tensors are aligned, preserving the values at the
corner pixels.
Default: True
align_mode(int) : An optional flag for bilinear interpolation. It can be '0'
for src_idx = scale*(dst_idx+0.5)-0.5, or '1' for
src_idx = scale*dst_idx.
Returns: Returns:
Variable: The output is a 4-D tensor of the shape Variable: The output is a 4-D tensor of the shape
...@@ -6635,6 +6733,8 @@ def image_resize(input, ...@@ -6635,6 +6733,8 @@ def image_resize(input,
or 'NEAREST' currently. or 'NEAREST' currently.
ValueError: One of out_shape and scale must not be None. ValueError: One of out_shape and scale must not be None.
ValueError: out_shape length should be 2. ValueError: out_shape length should be 2.
TypeError: align_corners should be a bool value
ValueError: align_mode can only be '0' or '1'
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -6650,6 +6750,12 @@ def image_resize(input, ...@@ -6650,6 +6750,12 @@ def image_resize(input,
"The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently." "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
) )
resample_type = resample_methods[resample] resample_type = resample_methods[resample]
if not isinstance(align_corners, bool):
raise TypeError("Attr align_corners should be a bool value")
if align_mode != 0 and align_mode != 1:
raise ValueError("align_mode can only be 0 or 1")
if out_shape is None and scale is None: if out_shape is None and scale is None:
raise ValueError("One of out_shape and scale must not be None.") raise ValueError("One of out_shape and scale must not be None.")
helper = LayerHelper('{}_interp'.format(resample_type), **locals()) helper = LayerHelper('{}_interp'.format(resample_type), **locals())
...@@ -6689,9 +6795,13 @@ def image_resize(input, ...@@ -6689,9 +6795,13 @@ def image_resize(input,
type='{}_interp'.format(resample_type), type='{}_interp'.format(resample_type),
inputs=inputs, inputs=inputs,
outputs={"Out": out}, outputs={"Out": out},
attrs={"out_h": out_h, attrs={
"out_w": out_w, "out_h": out_h,
"interp_method": resample_type}) "out_w": out_w,
"interp_method": resample_type,
"align_corners": align_corners,
"align_mode": align_mode
})
return out return out
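The scale and output-size rules quoted in the docstring are easy to check in plain Python; the following sketch is an illustration of those rules only:
```python
import math

def scale_factor(in_size, out_size, align_corners):
    # "For scale" rule: align the corner samples when requested.
    if align_corners and out_size > 1:
        return (in_size - 1.0) / (out_size - 1.0)
    return float(in_size) / out_size

def nearest_out_size(in_size, scale, align_corners):
    # Nearest neighbor: floor without corner alignment, round with it.
    if align_corners:
        return int(round(in_size * scale))
    return int(math.floor(in_size * scale))

print(nearest_out_size(10, 0.37, False), nearest_out_size(10, 0.37, True))  # 3 4
```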
...@@ -6700,7 +6810,9 @@ def resize_bilinear(input, ...@@ -6700,7 +6810,9 @@ def resize_bilinear(input,
out_shape=None, out_shape=None,
scale=None, scale=None,
name=None, name=None,
actual_shape=None): actual_shape=None,
align_corners=True,
align_mode=1):
""" """
Resize input by performing bilinear interpolation based on given Resize input by performing bilinear interpolation based on given
output shape which specified by actual_shape, out_shape and scale output shape which specified by actual_shape, out_shape and scale
...@@ -6715,6 +6827,47 @@ def resize_bilinear(input, ...@@ -6715,6 +6827,47 @@ def resize_bilinear(input,
For details of bilinear interpolation, please refer to Wikipedia: For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation https://en.wikipedia.org/wiki/Bilinear_interpolation
Align_corners and align_mode are optional parameters; the calculation
method of interpolation can be selected by them.
Example:
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Args: Args:
input(${x_type}): ${x_comment}. input(${x_type}): ${x_comment}.
...@@ -6738,6 +6891,8 @@ def resize_bilinear(input, ...@@ -6738,6 +6891,8 @@ def resize_bilinear(input,
set, otherwise errors would occur in graph set, otherwise errors would occur in graph
constructing stage. constructing stage.
Default: None Default: None
align_corners(bool): ${align_corners_comment}
align_mode(int): ${align_mode_comment}
Returns: Returns:
${out_comment}. ${out_comment}.
...@@ -6748,7 +6903,8 @@ def resize_bilinear(input, ...@@ -6748,7 +6903,8 @@ def resize_bilinear(input,
out = fluid.layers.resize_bilinear(input, out_shape=[12, 12]) out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape) return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape,
align_corners, align_mode)
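With `align_corners=False`, the align_mode attribute selects between two source-index mappings for bilinear sampling; a small sketch of the rule stated in the Args section, for illustration:
```python
def bilinear_src_index(dst_index, scale, align_mode):
    # align_mode 0: half-pixel centers; align_mode 1 (default): plain scaling.
    if align_mode == 0:
        return scale * (dst_index + 0.5) - 0.5
    return scale * dst_index
```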
@templatedoc(op_type="nearest_interp") @templatedoc(op_type="nearest_interp")
...@@ -6756,13 +6912,48 @@ def resize_nearest(input, ...@@ -6756,13 +6912,48 @@ def resize_nearest(input,
out_shape=None, out_shape=None,
scale=None, scale=None,
name=None, name=None,
actual_shape=None): actual_shape=None,
align_corners=True):
""" """
Resize input by performing nearest neighbor interpolation in both the Resize input by performing nearest neighbor interpolation in both the
3rd dimension (in height direction) and the 4th dimension (in width 3rd dimension (in height direction) and the 4th dimension (in width
direction) based on the given output shape, which is specified by actual_shape, direction) based on the given output shape, which is specified by actual_shape,
out_shape and scale in priority order. out_shape and scale in priority order.
Example:
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = \left \lfloor H_{in} * scale_{factor} \right \rfloor
W_out = \left \lfloor W_{in} * scale_{factor} \right \rfloor
else:
align_corners = True
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
For details of nearest neighbor interpolation, please refer to Wikipedia: For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
...@@ -6789,6 +6980,7 @@ def resize_nearest(input, ...@@ -6789,6 +6980,7 @@ def resize_nearest(input,
set, otherwise errors would occur in graph set, otherwise errors would occur in graph
constructing stage. constructing stage.
Default: None Default: None
align_corners(bool): ${align_corners_comment}
Returns: Returns:
${out_comment}. ${out_comment}.
...@@ -6799,7 +6991,8 @@ def resize_nearest(input, ...@@ -6799,7 +6991,8 @@ def resize_nearest(input,
out = fluid.layers.resize_nearest(input, out_shape=[12, 12]) out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
""" """
return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape) return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape,
align_corners)
def image_resize_short(input, out_short_len, resample='BILINEAR'): def image_resize_short(input, out_short_len, resample='BILINEAR'):
......
...@@ -135,7 +135,7 @@ def thresholded_relu(x, threshold=None): ...@@ -135,7 +135,7 @@ def thresholded_relu(x, threshold=None):
if val is not None: if val is not None:
kwargs[name] = val kwargs[name] = val
_thresholded_relu_(**kwargs) return _thresholded_relu_(**kwargs)
thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """ thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
......
...@@ -50,6 +50,19 @@ class TestDetection(unittest.TestCase): ...@@ -50,6 +50,19 @@ class TestDetection(unittest.TestCase):
self.assertEqual(out.shape[-1], 6) self.assertEqual(out.shape[-1], 6)
print(str(program)) print(str(program))
def test_box_coder_api(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[4], dtype='float32')
y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
bcoder = layers.box_coder(
prior_box=x,
prior_box_var=[0.1, 0.2, 0.1, 0.2],
target_box=y,
code_type='encode_center_size')
self.assertIsNotNone(bcoder)
print(str(program))
def test_detection_api(self): def test_detection_api(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
...@@ -463,12 +476,23 @@ class TestYoloDetection(unittest.TestCase): ...@@ -463,12 +476,23 @@ class TestYoloDetection(unittest.TestCase):
x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13],
0.5) [0, 1], 10, 0.7, 32)
self.assertIsNotNone(loss) self.assertIsNotNone(loss)
class TestBoxClip(unittest.TestCase):
def test_box_clip(self):
program = Program()
with program_guard(program):
input_box = layers.data(
name='input_box', shape=[7, 4], dtype='float32', lod_level=1)
im_info = layers.data(name='im_info', shape=[3], dtype='float32')
out = layers.box_clip(input_box, im_info)
self.assertIsNotNone(out)
class TestMulticlassNMS(unittest.TestCase): class TestMulticlassNMS(unittest.TestCase):
def test_multiclass_nms(self): def test_multiclass_nms(self):
program = Program() program = Program()
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# The MKLDNN tests are skipped when the MKLDNN flag is OFF
if(NOT WITH_MKLDNN)
foreach(src ${TEST_OPS})
if(${src} MATCHES ".*_mkldnn_op$")
list(REMOVE_ITEM TEST_OPS ${src})
endif()
endforeach()
endif(NOT WITH_MKLDNN)
if(NOT WITH_DISTRIBUTE) if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op) list(REMOVE_ITEM TEST_OPS test_recv_op)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
...@@ -123,3 +114,7 @@ endif() ...@@ -123,3 +114,7 @@ endif()
if (WITH_NGRAPH) if (WITH_NGRAPH)
add_subdirectory(ngraph) add_subdirectory(ngraph)
endif() endif()
if (WITH_MKLDNN)
add_subdirectory(mkldnn)
endif()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -17,9 +17,9 @@ from __future__ import print_function ...@@ -17,9 +17,9 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
from scipy.special import expit from scipy.special import expit
from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
class TestMKLDNNReluDim2(TestRelu): class TestMKLDNNReluDim2(TestRelu):
......
...@@ -19,9 +19,9 @@ import numpy as np ...@@ -19,9 +19,9 @@ import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid import paddle.fluid as fluid
from op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
from paddle.fluid.framework import grad_var_name from paddle.fluid.framework import grad_var_name
from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining): class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3 from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
class TestMKLDNNConcatOp(TestConcatOp): class TestMKLDNNConcatOp(TestConcatOp):
......
...@@ -18,8 +18,8 @@ import unittest ...@@ -18,8 +18,8 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
-from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
def conv2d_forward_refer(input, filter, group, conv_param):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv2dOp):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
class TestMKLDNN(TestConv2dTransposeOp):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
+from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv3dOp):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
class TestDeQuantizeOp(OpTest):
...
@@ -16,8 +16,8 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
-from op_test import OpTest
-from test_elementwise_add_op import *
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_elementwise_add_op import *
'''
Some tests differ from the tests defined in test_elementwise_add_op.py
because MKLDNN does not support tensors of number of dimensions 3.
...
@@ -15,10 +15,10 @@
from __future__ import print_function
import unittest
import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
-from test_elementwise_mul_op import *
+from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
def fully_connected_naive(input, weights, bias_data=None):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_gaussian_random_op import TestGaussianRandomOp
+from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp
class TestMKLDNN(TestGaussianRandomOp):
...
@@ -15,7 +15,7 @@
from __future__ import print_function
import unittest
-from test_lrn_op import TestLRNOp
+from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp
class TestLRNMKLDNNOp(TestLRNOp):
...
@@ -19,8 +19,8 @@ import unittest
import numpy as np
import paddle.fluid.core as core
-from op_test import OpTest
-from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
class TestPool2dMKLDNNInt8_Op(TestPool2D_Op):
...
@@ -15,7 +15,7 @@
from __future__ import print_function
import unittest
-from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
def create_test_mkldnn_class(parent):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
class TestQuantizeOp(OpTest):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_sum_op import TestSumOp
+from paddle.fluid.tests.unittests.test_sum_op import TestSumOp
class TestMKLDNN(TestSumOp):
...
@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
-from test_transpose_op import TestTransposeOp
+from paddle.fluid.tests.unittests.test_transpose_op import TestTransposeOp
class TestTransposeMKLDNN(TestTransposeOp):
...
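All of the hunks above make the same mechanical change: bare sibling imports become package-qualified imports, presumably so these MKLDNN tests keep resolving their base classes after being moved out of the top-level unittests directory. A minimal sketch of the pattern (module names as in the hunks above):

# Before: only resolvable when tests/unittests itself is on sys.path.
# from op_test import OpTest
# After: resolvable from any working directory where paddle is importable.
from paddle.fluid.tests.unittests.op_test import OpTest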
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_conv2d_op import *
class TestNGRAPH(TestConv2dOp):
    def init_kernel_type(self):
        super(TestNGRAPH, self).init_kernel_type()
class TestNGRAPHWithPad(TestWithPad):
    def init_kernel_type(self):
        super(TestNGRAPHWithPad, self).init_kernel_type()
class TestNGRAPHWithStride(TestWithStride):
    def init_kernel_type(self):
        super(TestNGRAPHWithStride, self).init_kernel_type()
class TestNGRAPHWithGroup(TestWithGroup):
    def init_kernel_type(self):
        super(TestNGRAPHWithGroup, self).init_kernel_type()
class TestNGRAPHWith1x1(TestWith1x1):
    def init_kernel_type(self):
        super(TestNGRAPHWith1x1, self).init_kernel_type()
class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
    def init_kernel_type(self):
        super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid.tests.unittests.test_pool2d_op import *
class TestNGRAPHPool2D_Op(TestPool2D_Op):
    def init_test_case(self):
        super(TestNGRAPHPool2D_Op, self).init_test_case()
class TestNGRAPHCase1(TestCase1):
    def init_test_case(self):
        super(TestNGRAPHCase1, self).init_test_case()
class TestNGRAPHCase2(TestCase2):
    def init_test_case(self):
        super(TestNGRAPHCase2, self).init_test_case()
class TestNGRAPHCase3(TestCase3):
    def init_pool_type(self):
        super(TestNGRAPHCase3, self).init_pool_type()
class TestNGRAPHCase4(TestCase4):
    def init_pool_type(self):
        super(TestNGRAPHCase4, self).init_pool_type()
class TestNGRAPHCase5(TestCase5):
    def init_pool_type(self):
        super(TestNGRAPHCase5, self).init_pool_type()
if __name__ == '__main__':
    unittest.main()
@@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase):
        self._create_pre_ids()
        self.scope.var('selected_ids')
        self.scope.var('selected_scores')
+        self.scope.var('parent_idx')
    def test_run(self):
        op = Operator(
@@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase):
            scores='scores',
            selected_ids='selected_ids',
            selected_scores='selected_scores',
+            parent_idx='parent_idx',
            level=0,
            beam_size=2,
            end_id=0, )
        op.run(self.scope, core.CPUPlace())
        selected_ids = self.scope.find_var("selected_ids").get_tensor()
        selected_scores = self.scope.find_var("selected_scores").get_tensor()
+        parent_idx = self.scope.find_var("parent_idx").get_tensor()
        self.assertTrue(
            np.allclose(
                np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
@@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase):
                np.array(selected_scores),
                np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
        self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
+        self.assertTrue(
+            np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
    def _create_pre_ids(self):
        np_data = np.array([[1, 2, 3, 4]], dtype='int64')
...
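The new parent_idx output records, for each selected candidate, the row index of its source in the pre-step beam state; the assertion above expects [0, 1, 2, 3]. A minimal sketch of how a decoder could use it (array names here are hypothetical, not part of the test):

import numpy as np

prev_hidden = np.random.rand(4, 8)      # one row per pre-step candidate
parent_idx = np.array([0, 1, 2, 3])     # as asserted in the test above
next_hidden = prev_hidden[parent_idx]   # gather states of the surviving beams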
@@ -20,7 +20,13 @@ from op_test import OpTest
import paddle.fluid.core as core
-def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
+def bilinear_interp_np(input,
+                       out_h,
+                       out_w,
+                       out_size=None,
+                       actual_shape=None,
+                       align_corners=True,
+                       align_mode=0):
    """bilinear interpolation implement in shape [N, C, H, W]"""
    if out_size is not None:
        out_h = out_size[0]
@@ -29,25 +35,45 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
        out_h = actual_shape[0]
        out_w = actual_shape[1]
    batch_size, channel, in_h, in_w = input.shape
+    ratio_h = ratio_w = 0.0
    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    else:
-        ratio_h = 0.0
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
-    else:
-        ratio_w = 0.0
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
    out = np.zeros((batch_size, channel, out_h, out_w))
    for i in range(out_h):
-        h = int(ratio_h * i)
+        if (align_mode == 0 and not align_corners):
+            h = int(ratio_h * (i + 0.5) - 0.5)
+        else:
+            h = int(ratio_h * i)
+        h = max(0, h)
        hid = 1 if h < in_h - 1 else 0
-        h1lambda = ratio_h * i - h
+        if (align_mode == 0 and not align_corners):
+            h1lambda = ratio_h * (i + 0.5) - 0.5 - h
+        else:
+            h1lambda = ratio_h * i - h
        h2lambda = 1.0 - h1lambda
        for j in range(out_w):
-            w = int(ratio_w * j)
+            if (align_mode == 0 and not align_corners):
+                w = int(ratio_w * (j + 0.5) - 0.5)
+            else:
+                w = int(ratio_w * j)
+            w = max(0, w)
            wid = 1 if w < in_w - 1 else 0
-            w1lambda = ratio_w * j - w
+            if (align_mode == 0 and not align_corners):
+                w1lambda = ratio_w * (j + 0.5) - 0.5 - w
+            else:
+                w1lambda = ratio_w * j - w
            w2lambda = 1.0 - w1lambda
            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
@@ -66,7 +92,8 @@ class TestBilinearInterpOp(OpTest):
        input_np = np.random.random(self.input_shape).astype("float32")
        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape)
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
@@ -75,7 +102,9 @@ class TestBilinearInterpOp(OpTest):
        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
        }
        self.outputs = {'Out': output_np}
@@ -91,6 +120,8 @@ class TestBilinearInterpOp(OpTest):
        self.out_h = 2
        self.out_w = 2
        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase1(TestBilinearInterpOp):
@@ -99,6 +130,8 @@ class TestBilinearInterpCase1(TestBilinearInterpOp):
        self.input_shape = [4, 1, 7, 8]
        self.out_h = 1
        self.out_w = 1
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase2(TestBilinearInterpOp):
@@ -107,6 +140,8 @@ class TestBilinearInterpCase2(TestBilinearInterpOp):
        self.input_shape = [3, 3, 9, 6]
        self.out_h = 12
        self.out_w = 12
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase3(TestBilinearInterpOp):
@@ -115,6 +150,8 @@ class TestBilinearInterpCase3(TestBilinearInterpOp):
        self.input_shape = [1, 1, 128, 64]
        self.out_h = 64
        self.out_w = 128
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase4(TestBilinearInterpOp):
@@ -124,6 +161,8 @@ class TestBilinearInterpCase4(TestBilinearInterpOp):
        self.out_h = 1
        self.out_w = 1
        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase5(TestBilinearInterpOp):
@@ -133,6 +172,8 @@ class TestBilinearInterpCase5(TestBilinearInterpOp):
        self.out_h = 12
        self.out_w = 12
        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase6(TestBilinearInterpOp):
@@ -142,6 +183,8 @@ class TestBilinearInterpCase6(TestBilinearInterpOp):
        self.out_h = 64
        self.out_w = 128
        self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpActualShape(TestBilinearInterpOp):
@@ -151,6 +194,8 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp):
        self.out_h = 64
        self.out_w = 32
        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpOpUint8(OpTest):
@@ -162,14 +207,17 @@ class TestBilinearInterpOpUint8(OpTest):
        input_np = np.random.randint(
            low=0, high=256, size=self.input_shape).astype("uint8")
        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape)
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
        }
        self.outputs = {'Out': output_np}
@@ -181,6 +229,8 @@ class TestBilinearInterpOpUint8(OpTest):
        self.input_shape = [1, 3, 9, 6]
        self.out_h = 10
        self.out_w = 9
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
@@ -189,6 +239,8 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
        self.input_shape = [2, 3, 128, 64]
        self.out_h = 120
        self.out_w = 50
+        self.align_corners = True
+        self.align_mode = 1
class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
@@ -198,6 +250,26 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
        self.out_h = 5
        self.out_w = 13
        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
if __name__ == "__main__":
...
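The reference implementation above distinguishes two source-coordinate conventions: with align_corners=True the corner pixels of input and output coincide, while align_corners=False with align_mode=0 uses a half-pixel-shifted mapping. A condensed sketch of just that mapping, extracted from bilinear_interp_np (the function name src_coord is ours):

def src_coord(dst_i, ratio, align_corners, align_mode):
    # Half-pixel convention: sample positions shift by 0.5, clamped at 0.
    if align_mode == 0 and not align_corners:
        return max(ratio * (dst_i + 0.5) - 0.5, 0.0)
    # Corner-aligned (or align_mode == 1) convention.
    return ratio * dst_i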
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
import copy
def box_clip(input_box, im_info, output_box):
    im_w = round(im_info[1] / im_info[2])
    im_h = round(im_info[0] / im_info[2])
    output_box[:, :, 0] = np.maximum(
        np.minimum(input_box[:, :, 0], im_w - 1), 0)
    output_box[:, :, 1] = np.maximum(
        np.minimum(input_box[:, :, 1], im_h - 1), 0)
    output_box[:, :, 2] = np.maximum(
        np.minimum(input_box[:, :, 2], im_w - 1), 0)
    output_box[:, :, 3] = np.maximum(
        np.minimum(input_box[:, :, 3], im_h - 1), 0)
def batch_box_clip(input_boxes, im_info, lod):
    n = input_boxes.shape[0]
    m = input_boxes.shape[1]
    output_boxes = np.zeros((n, m, 4), dtype=np.float32)
    cur_offset = 0
    for i in range(len(lod)):
        box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :],
                 im_info[i, :],
                 output_boxes[cur_offset:(cur_offset + lod[i]), :, :])
        cur_offset += lod[i]
    return output_boxes
class TestBoxClipOp(OpTest):
    def test_check_output(self):
        self.check_output()
    def setUp(self):
        self.op_type = "box_clip"
        lod = [[1, 2, 3]]
        input_boxes = np.random.random((6, 10, 4)) * 5
        im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]])
        output_boxes = batch_box_clip(input_boxes, im_info, lod[0])
        self.inputs = {
            'Input': (input_boxes.astype('float32'), lod),
            'ImInfo': im_info.astype('float32'),
        }
        self.outputs = {'Output': output_boxes}
if __name__ == '__main__':
    unittest.main()
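A quick hand-checked example of the clipping semantics above: with im_info = [5, 8, 1.], the valid coordinate range is x in [0, 7] and y in [0, 4] (image width/height minus one, after dividing out the scale).

import numpy as np

boxes = np.array([[[-1., 2., 9., 6.]]], dtype=np.float32)
clipped = np.zeros_like(boxes)
box_clip(boxes, np.array([5., 8., 1.]), clipped)
# clipped -> [[[0., 2., 7., 4.]]]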
@@ -21,80 +21,80 @@ import math
from op_test import OpTest
-def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
-              box_normalized):
-    prior_box_x = (
-        (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
-    prior_box_y = (
-        (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
-    prior_box_width = (
-        (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
-    prior_box_height = (
-        (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
-    prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
-                                          prior_box_var.shape[1])
-    if not box_normalized:
-        prior_box_height = prior_box_height + 1
-        prior_box_width = prior_box_width + 1
-    if (code_type == "EncodeCenterSize"):
-        target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
-            target_box.shape[0], 1)
-        target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
-            target_box.shape[0], 1)
-        target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
-            target_box.shape[0], 1)
-        target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
-            target_box.shape[0], 1)
-        if not box_normalized:
-            target_box_height = target_box_height + 1
-            target_box_width = target_box_width + 1
-        output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
-                prior_box_var[:,:,0]
-        output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
-                prior_box_var[:,:,1]
-        output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
-                prior_box_var[:,:,2]
-        output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
-                prior_box_var[:,:,3]
-    elif (code_type == "DecodeCenterSize"):
-        target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
-                       prior_box_width + prior_box_x
-        target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
-                       prior_box_height + prior_box_y
-        target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
-                           prior_box_width
-        target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
-                            prior_box_height
-        output_box[:, :, 0] = target_box_x - target_box_width / 2
-        output_box[:, :, 1] = target_box_y - target_box_height / 2
-        output_box[:, :, 2] = target_box_x + target_box_width / 2
-        output_box[:, :, 3] = target_box_y + target_box_height / 2
-        if not box_normalized:
-            output_box[:, :, 2] = output_box[:, :, 2] - 1
-            output_box[:, :, 3] = output_box[:, :, 3] - 1
+def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
+    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_x = pb_w * 0.5 + p_box[:, 0]
+    pb_y = pb_h * 0.5 + p_box[:, 1]
+    shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1)
+    pb_w = pb_w.reshape(shape)
+    pb_h = pb_h.reshape(shape)
+    pb_x = pb_x.reshape(shape)
+    pb_y = pb_y.reshape(shape)
+    if pb_v.ndim == 2:
+        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
+    if pb_v.ndim == 1:
+        tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
+        tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
+        tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w
+        tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h
+    else:
+        tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x
+        tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y
+        tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w
+        tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h
+    output_box[:, :, 0] = tb_x - tb_w / 2
+    output_box[:, :, 1] = tb_y - tb_h / 2
+    output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm)
+    output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm)
+def box_encoder(t_box, p_box, pb_v, output_box, norm):
+    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_x = pb_w * 0.5 + p_box[:, 0]
+    pb_y = pb_h * 0.5 + p_box[:, 1]
+    shape = (1, p_box.shape[0])
+    pb_w = pb_w.reshape(shape)
+    pb_h = pb_h.reshape(shape)
+    pb_x = pb_x.reshape(shape)
+    pb_y = pb_y.reshape(shape)
+    if pb_v.ndim == 2:
+        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
+    tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1)
+    tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1)
+    tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm)
+    tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm)
+    if pb_v.ndim == 1:
+        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0]
+        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1]
+        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2]
+        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3]
+    else:
+        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0]
+        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1]
+        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2]
+        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3]
-def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
-                    box_normalized):
-    n = target_box.shape[0]
-    m = prior_box.shape[0]
+def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0):
+    n = t_box.shape[0]
+    m = p_box.shape[0]
+    if code_type == "DecodeCenterSize":
+        m = t_box.shape[1]
    output_box = np.zeros((n, m, 4), dtype=np.float32)
    cur_offset = 0
    for i in range(len(lod)):
        if (code_type == "EncodeCenterSize"):
-            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :],
-                      prior_box, prior_box_var,
-                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      code_type, box_normalized)
+            box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v,
+                        output_box[cur_offset:(cur_offset + lod[i]), :, :],
+                        norm)
        elif (code_type == "DecodeCenterSize"):
-            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      prior_box, prior_box_var,
-                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      code_type, box_normalized)
+            box_decoder(t_box, p_box, pb_v, output_box, norm, axis)
        cur_offset += lod[i]
    return output_box
@@ -106,9 +106,35 @@ class TestBoxCoderOp(OpTest):
    def setUp(self):
        self.op_type = "box_coder"
        lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.random.random((10, 4)).astype('float32')
-        target_box = np.random.random((5, 10, 4)).astype('float32')
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.random.random((81, 4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized)
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
+        self.outputs = {'OutputBox': output_box}
+class TestBoxCoderOpWithOneRankVar(OpTest):
+    def test_check_output(self):
+        self.check_output()
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
        code_type = "DecodeCenterSize"
        box_normalized = False
        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -133,9 +159,9 @@ class TestBoxCoderOpWithoutBoxVar(OpTest):
    def setUp(self):
        self.op_type = "box_coder"
        lod = [[0, 1, 2, 3, 4, 5]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.ones((10, 4)).astype('float32')
-        target_box = np.random.random((5, 10, 4)).astype('float32')
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.ones((81, 4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
        code_type = "DecodeCenterSize"
        box_normalized = False
        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -158,10 +184,10 @@ class TestBoxCoderOpWithLoD(OpTest):
    def setUp(self):
        self.op_type = "box_coder"
-        lod = [[4, 8, 8]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.random.random((10, 4)).astype('float32')
-        target_box = np.random.random((20, 4)).astype('float32')
+        lod = [[10, 20, 20]]
+        prior_box = np.random.random((20, 4)).astype('float32')
+        prior_box_var = np.random.random((20, 4)).astype('float32')
+        target_box = np.random.random((50, 4)).astype('float32')
        code_type = "EncodeCenterSize"
        box_normalized = True
        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -176,5 +202,63 @@ class TestBoxCoderOpWithLoD(OpTest):
        self.outputs = {'OutputBox': output_box}
+class TestBoxCoderOpWithAxis(OpTest):
+    def test_check_output(self):
+        self.check_output()
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((30, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((30, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        axis = 1
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized, axis)
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False,
+            'axis': axis
+        }
+        self.outputs = {'OutputBox': output_box}
+class TestBoxCoderOpWithVariance(OpTest):
+    def test_check_output(self):
+        self.check_output()
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((30, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((30, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        axis = 1
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized, axis)
+        self.inputs = {
+            'PriorBox': prior_box,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False,
+            'variance': prior_box_var.astype(np.float).flatten(),
+            'axis': axis
+        }
+        self.outputs = {'OutputBox': output_box}
if __name__ == '__main__':
    unittest.main()
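For a single prior box p = [x1, y1, x2, y2], target offsets t, and per-coordinate variance pv, the normalized (norm=True) decode path above reduces to the following sketch (decode_one is our condensation, not part of the test):

import numpy as np

def decode_one(t, p, pv):
    pw, ph = p[2] - p[0], p[3] - p[1]          # prior width/height
    px, py = p[0] + 0.5 * pw, p[1] + 0.5 * ph  # prior center
    cx = pv[0] * t[0] * pw + px                # decoded center x
    cy = pv[1] * t[1] * ph + py                # decoded center y
    w = np.exp(pv[2] * t[2]) * pw              # decoded width
    h = np.exp(pv[3] * t[3]) * ph              # decoded height
    return np.array([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2])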
@@ -268,9 +268,6 @@ class TestImperativePtbRnn(unittest.TestCase):
                sgd.minimize(dy_loss)
                for param in ptb_model.parameters():
                    dy_param_updated[param.name] = param._numpy()
-                # print("dy_loss is {}".format(dy_loss._numpy()))
-                # print("last_hidden is {}".format(last_hidden._numpy()))
-                # print("last_cell is {}".format(last_cell._numpy()))
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
...
@@ -83,7 +83,8 @@ class TestBook(unittest.TestCase):
        self.assertEqual(feed_var_names, ["x", "y"])
        self.assertEqual(len(fetch_vars), 1)
-        self.assertEqual(str(fetch_vars[0]), str(avg_cost))
+        print("fetch %s" % str(fetch_vars[0]))
+        self.assertTrue("scale" in str(fetch_vars[0]))
        self.assertEqual(expected, actual)
...
@@ -24,7 +24,8 @@ def nearest_neighbor_interp_np(X,
                               out_h,
                               out_w,
                               out_size=None,
-                               actual_shape=None):
+                               actual_shape=None,
+                               align_corners=True):
    """nearest neighbor interpolation implement in shape [N, C, H, W]"""
    if out_size is not None:
        out_h = out_size[0]
@@ -35,17 +36,31 @@ def nearest_neighbor_interp_np(X,
    n, c, in_h, in_w = X.shape
    ratio_h = ratio_w = 0.0
-    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    if (out_h > 1):
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if (out_w > 1):
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
    out = np.zeros((n, c, out_h, out_w))
-    for i in range(out_h):
-        in_i = int(ratio_h * i + 0.5)
-        for j in range(out_w):
-            in_j = int(ratio_w * j + 0.5)
-            out[:, :, i, j] = X[:, :, in_i, in_j]
+    if align_corners:
+        for i in range(out_h):
+            in_i = int(ratio_h * i + 0.5)
+            for j in range(out_w):
+                in_j = int(ratio_w * j + 0.5)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
+    else:
+        for i in range(out_h):
+            in_i = int(ratio_h * i)
+            for j in range(out_w):
+                in_j = int(ratio_w * j)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
    return out.astype(X.dtype)
@@ -59,7 +74,8 @@ class TestNearestInterpOp(OpTest):
        input_np = np.random.random(self.input_shape).astype("float32")
        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
-                                               self.out_size, self.actual_shape)
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
@@ -68,7 +84,8 @@ class TestNearestInterpOp(OpTest):
        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
        }
        self.outputs = {'Out': output_np}
@@ -84,6 +101,7 @@ class TestNearestInterpOp(OpTest):
        self.out_h = 2
        self.out_w = 2
        self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
class TestNearestNeighborInterpCase1(TestNearestInterpOp):
@@ -92,6 +110,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp):
        self.input_shape = [4, 1, 7, 8]
        self.out_h = 1
        self.out_w = 1
+        self.align_corners = True
class TestNearestNeighborInterpCase2(TestNearestInterpOp):
@@ -100,6 +119,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp):
        self.input_shape = [3, 3, 9, 6]
        self.out_h = 12
        self.out_w = 12
+        self.align_corners = True
class TestNearestNeighborInterpCase3(TestNearestInterpOp):
@@ -108,6 +128,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp):
        self.input_shape = [1, 1, 128, 64]
        self.out_h = 64
        self.out_w = 128
+        self.align_corners = True
class TestNearestNeighborInterpCase4(TestNearestInterpOp):
@@ -117,6 +138,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp):
        self.out_h = 1
        self.out_w = 1
        self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
class TestNearestNeighborInterpCase5(TestNearestInterpOp):
@@ -126,6 +148,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp):
        self.out_h = 12
        self.out_w = 12
        self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
class TestNearestNeighborInterpCase6(TestNearestInterpOp):
@@ -135,6 +158,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp):
        self.out_h = 64
        self.out_w = 128
        self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
@@ -144,6 +168,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
        self.out_h = 64
        self.out_w = 32
        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
class TestNearestInterpOpUint8(OpTest):
@@ -155,14 +180,16 @@ class TestNearestInterpOpUint8(OpTest):
        input_np = np.random.randint(
            low=0, high=256, size=self.input_shape).astype("uint8")
        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
-                                               self.out_size, self.actual_shape)
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
        self.inputs = {'X': input_np}
        if self.out_size is not None:
            self.inputs['OutSize'] = self.out_size
        self.attrs = {
            'out_h': self.out_h,
            'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners
        }
        self.outputs = {'Out': output_np}
@@ -174,6 +201,7 @@ class TestNearestInterpOpUint8(OpTest):
        self.input_shape = [1, 3, 9, 6]
        self.out_h = 10
        self.out_w = 9
+        self.align_corners = True
class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
@@ -182,6 +210,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
        self.input_shape = [2, 3, 128, 64]
        self.out_h = 120
        self.out_w = 50
+        self.align_corners = True
class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
@@ -191,6 +220,12 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
        self.out_h = 5
        self.out_w = 13
        self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+    def set_align_corners(self):
+        self.align_corners = False
if __name__ == "__main__":
...
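The nearest-neighbor reference above boils down to one index rule: round to the nearest source pixel when corners are aligned, truncate otherwise. In sketch form (nearest_src_index is our condensation of the two loops):

def nearest_src_index(dst_i, ratio, align_corners):
    return int(ratio * dst_i + 0.5) if align_corners else int(ratio * dst_i)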
@@ -16,174 +16,179 @@ from __future__ import division
import unittest
import numpy as np
+from scipy.special import logit
+from scipy.special import expit
from op_test import OpTest
from paddle.fluid import core
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
-def mse(x, y, num):
-    return ((y - x)**2).sum() / num
-def bce(x, y, mask):
-    x = x.reshape((-1))
-    y = y.reshape((-1))
-    mask = mask.reshape((-1))
-    error_sum = 0.0
-    count = 0
-    for i in range(x.shape[0]):
-        if mask[i] > 0:
-            error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i])
-            count += 1
-    return error_sum / (-1.0 * count)
-def box_iou(box1, box2):
-    b1_x1 = box1[0] - box1[2] / 2
-    b1_x2 = box1[0] + box1[2] / 2
-    b1_y1 = box1[1] - box1[3] / 2
-    b1_y2 = box1[1] + box1[3] / 2
-    b2_x1 = box2[0] - box2[2] / 2
-    b2_x2 = box2[0] + box2[2] / 2
-    b2_y1 = box2[1] - box2[3] / 2
-    b2_y2 = box2[1] + box2[3] / 2
-    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
-    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
-    inter_rect_x1 = max(b1_x1, b2_x1)
-    inter_rect_y1 = max(b1_y1, b2_y1)
-    inter_rect_x2 = min(b1_x2, b2_x2)
-    inter_rect_y2 = min(b1_y2, b2_y2)
-    inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max(
-        inter_rect_y2 - inter_rect_y1, 0)
-    return inter_area / (b1_area + b2_area + inter_area)
-def build_target(gtboxs, gtlabel, attrs, grid_size):
-    n, b, _ = gtboxs.shape
-    ignore_thresh = attrs["ignore_thresh"]
-    anchors = attrs["anchors"]
-    class_num = attrs["class_num"]
-    an_num = len(anchors) // 2
-    obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32')
-    tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tcls = np.zeros(
-        (n, an_num, grid_size, grid_size, class_num)).astype('float32')
+def l2loss(x, y):
+    return 0.5 * (y - x) * (y - x)
+def sce(x, label):
+    sigmoid_x = expit(x)
+    term1 = label * np.log(sigmoid_x)
+    term2 = (1.0 - label) * np.log(1.0 - sigmoid_x)
+    return -term1 - term2
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+def batch_xywh_box_iou(box1, box2):
+    b1_left = box1[:, :, 0] - box1[:, :, 2] / 2
+    b1_right = box1[:, :, 0] + box1[:, :, 2] / 2
+    b1_top = box1[:, :, 1] - box1[:, :, 3] / 2
+    b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2
+    b2_left = box2[:, :, 0] - box2[:, :, 2] / 2
+    b2_right = box2[:, :, 0] + box2[:, :, 2] / 2
+    b2_top = box2[:, :, 1] - box2[:, :, 3] / 2
+    b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2
+    left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :])
+    right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :])
+    top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :])
+    bottom = np.minimum(b1_bottom[:, :, np.newaxis],
+                        b2_bottom[:, np.newaxis, :])
+    inter_w = np.clip(right - left, 0., 1.)
+    inter_h = np.clip(bottom - top, 0., 1.)
+    inter_area = inter_w * inter_h
+    b1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
+    b2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
+    union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area
+    return inter_area / union
+def YOLOv3Loss(x, gtbox, gtlabel, attrs):
+    n, c, h, w = x.shape
+    b = gtbox.shape[1]
+    anchors = attrs['anchors']
+    an_num = len(anchors) // 2
+    anchor_mask = attrs['anchor_mask']
+    mask_num = len(anchor_mask)
+    class_num = attrs["class_num"]
+    ignore_thresh = attrs['ignore_thresh']
+    downsample = attrs['downsample']
+    input_size = downsample * h
+    x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+    loss = np.zeros((n)).astype('float32')
+    pred_box = x[:, :, :, :, :4].copy()
+    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
+    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
+    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
+    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
+    x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
+                                 np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
+                                 class_num)
+    mask_anchors = []
+    for m in anchor_mask:
+        mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors])
+    anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1))
+    anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1))
+    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
+    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
+    pred_box = pred_box.reshape((n, -1, 4))
+    pred_obj = x[:, :, :, :, 4].reshape((n, -1))
+    objness = np.zeros(pred_box.shape[:2]).astype('float32')
+    ious = batch_xywh_box_iou(pred_box, gtbox)
+    ious_max = np.max(ious, axis=-1)
+    objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness),
+                       objness)
+    gtbox_shift = gtbox.copy()
+    gtbox_shift[:, :, 0] = 0
+    gtbox_shift[:, :, 1] = 0
+    anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)]
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
+    anchor_boxes = np.concatenate(
+        [np.zeros_like(anchors_s), anchors_s], axis=-1)
+    anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1))
+    ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes)
+    iou_matches = np.argmax(ious, axis=-1)
+    gt_matches = iou_matches.copy()
    for i in range(n):
        for j in range(b):
-            if gtboxs[i, j, :].sum() == 0:
+            if gtbox[i, j, 2:].sum() == 0:
+                gt_matches[i, j] = -1
                continue
-            gt_label = gtlabel[i, j]
-            gx = gtboxs[i, j, 0] * grid_size
-            gy = gtboxs[i, j, 1] * grid_size
-            gw = gtboxs[i, j, 2] * grid_size
-            gh = gtboxs[i, j, 3] * grid_size
-            gi = int(gx)
-            gj = int(gy)
-            gtbox = [0, 0, gw, gh]
-            max_iou = 0
-            for k in range(an_num):
-                anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]]
-                iou = box_iou(gtbox, anchor_box)
-                if iou > max_iou:
-                    max_iou = iou
-                    best_an_index = k
-                if iou > ignore_thresh:
-                    noobj_mask[i, best_an_index, gj, gi] = 0
-            obj_mask[i, best_an_index, gj, gi] = 1
-            noobj_mask[i, best_an_index, gj, gi] = 0
-            tx[i, best_an_index, gj, gi] = gx - gi
-            ty[i, best_an_index, gj, gi] = gy - gj
-            tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 *
-                                                               best_an_index])
-            th[i, best_an_index, gj, gi] = np.log(
-                gh / anchors[2 * best_an_index + 1])
-            tconf[i, best_an_index, gj, gi] = 1
-            tcls[i, best_an_index, gj, gi, gt_label] = 1
-    return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask)
-def YoloV3Loss(x, gtbox, gtlabel, attrs):
-    n, c, h, w = x.shape
-    an_num = len(attrs['anchors']) // 2
-    class_num = attrs["class_num"]
-    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
-    pred_x = sigmoid(x[:, :, :, :, 0])
-    pred_y = sigmoid(x[:, :, :, :, 1])
-    pred_w = x[:, :, :, :, 2]
-    pred_h = x[:, :, :, :, 3]
-    pred_conf = sigmoid(x[:, :, :, :, 4])
-    pred_cls = sigmoid(x[:, :, :, :, 5:])
-    tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target(
-        gtbox, gtlabel, attrs, x.shape[2])
-    obj_mask_expand = np.tile(
-        np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num'])))
-    loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum())
-    loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum())
-    loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum())
-    loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum())
-    loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask)
-    loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask,
-                             noobj_mask)
-    loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand,
-                     obj_mask_expand)
-    return attrs['loss_weight_xy'] * (loss_x + loss_y) \
-            + attrs['loss_weight_wh'] * (loss_w + loss_h) \
-            + attrs['loss_weight_conf_target'] * loss_conf_target \
-            + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \
-            + attrs['loss_weight_class'] * loss_class
+            if iou_matches[i, j] not in anchor_mask:
+                gt_matches[i, j] = -1
+                continue
+            an_idx = anchor_mask.index(iou_matches[i, j])
+            gt_matches[i, j] = an_idx
+            gi = int(gtbox[i, j, 0] * w)
+            gj = int(gtbox[i, j, 1] * h)
+            tx = gtbox[i, j, 0] * w - gi
+            ty = gtbox[i, j, 1] * w - gj
+            tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
+            th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
+            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3])
+            loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
+            loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale
+            objness[i, an_idx * h * w + gj * w + gi] = 1.0
+            for label_idx in range(class_num):
+                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
+                               float(label_idx == gtlabel[i, j]))
+        for j in range(mask_num * h * w):
+            if objness[i, j] > 0:
+                loss[i] += sce(pred_obj[i, j], 1.0)
+            elif objness[i, j] == 0:
+                loss[i] += sce(pred_obj[i, j], 0.0)
+    return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \
+            gt_matches.astype('int32'))
class TestYolov3LossOp(OpTest):
    def setUp(self):
-        self.loss_weight_xy = 1.0
-        self.loss_weight_wh = 1.0
-        self.loss_weight_conf_target = 1.0
-        self.loss_weight_conf_notarget = 1.0
-        self.loss_weight_class = 1.0
        self.initTestCase()
        self.op_type = 'yolov3_loss'
-        x = np.random.random(size=self.x_shape).astype('float32')
+        x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32'))
        gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
-        gtlabel = np.random.randint(0, self.class_num,
-                                    self.gtbox_shape[:2]).astype('int32')
+        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
+        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
+        gtbox = gtbox * gtmask[:, :, np.newaxis]
+        gtlabel = gtlabel * gtmask
        self.attrs = {
            "anchors": self.anchors,
+            "anchor_mask": self.anchor_mask,
            "class_num": self.class_num,
            "ignore_thresh": self.ignore_thresh,
-            "loss_weight_xy": self.loss_weight_xy,
-            "loss_weight_wh": self.loss_weight_wh,
-            "loss_weight_conf_target": self.loss_weight_conf_target,
-            "loss_weight_conf_notarget": self.loss_weight_conf_notarget,
-            "loss_weight_class": self.loss_weight_class,
+            "downsample": self.downsample,
        }
-        self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel}
+        self.inputs = {
+            'X': x,
+            'GTBox': gtbox.astype('float32'),
+            'GTLabel': gtlabel.astype('int32'),
+        }
+        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
        self.outputs = {
-            'Loss': np.array(
-                [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32')
+            'Loss': loss,
+            'ObjectnessMask': objness,
+            "GTMatchMask": gt_matches
        }
    def test_check_output(self):
@@ -196,19 +201,16 @@ class TestYolov3LossOp(OpTest):
            place, ['X'],
            'Loss',
            no_grad_set=set(["GTBox", "GTLabel"]),
-            max_relative_error=0.06)
+            max_relative_error=0.3)
    def initTestCase(self):
-        self.anchors = [10, 13, 12, 12]
-        self.class_num = 10
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        self.anchor_mask = [1, 2]
+        self.class_num = 5
        self.ignore_thresh = 0.5
-        self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7)
-        self.gtbox_shape = (5, 10, 4)
-        self.loss_weight_xy = 2.5
-        self.loss_weight_wh = 0.8
-        self.loss_weight_conf_target = 1.5
-        self.loss_weight_conf_notarget = 0.5
-        self.loss_weight_class = 1.2
+        self.downsample = 32
+        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
+        self.gtbox_shape = (3, 5, 4)
if __name__ == "__main__":
...
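batch_xywh_box_iou above takes boxes in [cx, cy, w, h] form, normalized to [0, 1], and broadcasts an (n, m1, 4) set against an (n, m2, 4) set into an (n, m1, m2) IoU matrix. A small sanity check (values chosen by hand):

import numpy as np

a = np.array([[[0.5, 0.5, 0.2, 0.2]]])                       # (1, 1, 4)
b = np.array([[[0.5, 0.5, 0.2, 0.2], [0.1, 0.1, 0.2, 0.2]]]) # (1, 2, 4)
iou = batch_xywh_box_iou(a, b)                               # shape (1, 1, 2)
# iou[0, 0, 0] == 1.0 (identical boxes); iou[0, 0, 1] == 0.0 (disjoint boxes)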
@@ -17,3 +17,4 @@ from __future__ import print_function
from .program_utils import *
from .ufind import *
from .checkport import *
+from .vars_distributed import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid.framework import Variable
class VarStruct(object):
"""
record part properties of a Variable in python.
"""
def __init__(self, name, shape, dtype, type, lod_level, persistable):
self.name = name
self.shape = shape
self.dtype = dtype
self.type = type
self.lod_level = lod_level
self.persistable = persistable
class VarDistributed(object):
"""
a class to record the var distributed on parameter servers.
the class will record the relationship between origin var and slice var.
the slice var's properties, such as type/shape/offset/endpoint.
"""
def __init__(self,
origin_var,
slice_var,
is_slice=None,
block_id=None,
offset=None,
vtype=None,
endpoint=None):
"""
Args:
origin_var(Variable|VarStruct): origin var properties
slice_var(Variable|VarStruct): slice var properties
is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
block_id(int|None): the number about the slice var.
offset(int|None): if the slice var is sliced, offset is the numel before the var.
vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
"""
if isinstance(origin_var, Variable):
self.origin = self.__create_var_struct(origin_var)
else:
self.origin = origin_var
if isinstance(slice_var, Variable):
self.slice = self.__create_var_struct(slice_var)
else:
self.slice = slice_var
if self.equal(self.origin, self.slice):
self.is_slice = False
self.block_id = 0
self.offset = 0
else:
self.is_slice = True
self.block_id = 0
self.offset = 0
if is_slice is not None:
self.is_slice = is_slice
if block_id is not None:
self.block_id = block_id
if offset is not None:
self.offset = offset
self.vtype = vtype
self.endpoint = endpoint
@staticmethod
def __create_var_struct(var):
return VarStruct(var.name, var.shape, var.dtype, var.type,
var.lod_level, var.persistable)
@staticmethod
def equal(var1, var2):
"""
the two var is equal or not.
Returns:
bool: equal will return True else False
"""
assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
return var1.name == var2.name and \
var1.type == var2.type and \
var1.shape == var2.shape and \
var1.dtype == var2.dtype and \
var1.lod_level == var2.lod_level and \
var1.persistable == var2.persistable
def __str__(self):
origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
format(i="{", e="}", name=self.origin.name, type=self.origin.type,
shape=self.origin.shape, dtype=self.origin.dtype)
slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
".slice({is_slice}).block({block_id}).offset({offset})". \
format(i="{", e="}", name=self.slice.name, type=self.slice.type,
shape=self.slice.shape, dtype=self.slice.dtype,
is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
self.vtype, origin_var_str, slice_var_str, self.endpoint)
class VarsDistributed(object):
"""
a gather about VarDistributed with many methods to find distributed vars.
through the class, we can get overview about the distributed parameters on parameter servers.
this class may centralized and convenient for developer to manage and get variable's distribute.
other module can also use this to find variables such io.py.
"""
def __init__(self):
self.distributed_vars = []
def add_distributed_var(self,
origin_var,
slice_var,
is_slice=None,
block_id=None,
offset=None,
vtype=None,
endpoint=None):
"""
add distributed var in this.
Args:
origin_var(Variable|VarStruct): origin var properties
slice_var(Variable|VarStruct): slice var properties
is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
block_id(int|None): the number about the slice var.
offset(int|None): if the slice var is sliced, offset is the numel before the var.
vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
Returns:
None
"""
self.distributed_vars.append(
VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
vtype, endpoint))
def get_distributed_var_by_slice(self, var_name):
"""
get distributed var by conditions.
Args:
var_name(str): slice var name, such as "w.traier0.block1"
Returns:
VarDistributed: distributed var.
"""
for dist_var in self.distributed_vars:
if dist_var.slice.name == var_name:
return dist_var
return None
@staticmethod
def equal(var1, var2):
"""
the two var is equal or not.
Returns:
bool: equal will return True else False
"""
return var1.name == var2.name and \
var1.type == var2.type and \
var1.shape == var2.shape and \
var1.dtype == var2.dtype and \
var1.lod_level == var2.lod_level and \
var1.persistable == var2.persistable
def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
"""
get distributed var by conditions.
Args:
origin_var_name(str):
endpoint(str): the parameter endpoint, such as "127.0.0.1:1001"
Returns:
VarDistributed: distributed var.
"""
for dist_var in self.distributed_vars:
if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
return dist_var
return None
def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
"""
get distributed vars by conditions.
Args:
vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
groupby(bool|False): group by origin var or not.
Returns:
list: distributed var list.
dict: distributed var map when groupby=True
"""
vtype_vars = []
for var in self.distributed_vars:
if var.vtype in vtypes:
vtype_vars.append(var)
if not groupby:
return vtype_vars
params_map = {}
for var in vtype_vars:
origin_var_name = var.origin.name
if origin_var_name in params_map:
optimizers = params_map.get(origin_var_name)
else:
optimizers = []
optimizers.append(var)
params_map[origin_var_name] = optimizers
return params_map
def get_distributed_vars_by_ep(self, endpoint, vtype=None):
"""
get distributed vars by conditions.
Args:
endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
Returns:
list: distributed var list.
"""
endpoint_vars = []
for var in self.distributed_vars:
if var.endpoint == endpoint:
endpoint_vars.append(var)
if not vtype:
return endpoint_vars
vtype_vars = []
for var in endpoint_vars:
if var.vtype == vtype:
vtype_vars.append(var)
return vtype_vars
def overview(self):
"""
get the overview string about all params on all parameter servers.
Returns:
Str: overview string.
"""
vars_str = []
for var in self.distributed_vars:
vars_str.append(str(var))
return "\n".join(vars_str)
...@@ -30,19 +30,23 @@ Steps to transpile pserver: ...@@ -30,19 +30,23 @@ Steps to transpile pserver:
5. add listen_and_serv op 5. add listen_and_serv op
""" """
import sys
import math import math
import numpy as np from functools import reduce
import collections import collections
import six
import logging import logging
import numpy as np
from .ps_dispatcher import RoundRobin, PSDispatcher from .ps_dispatcher import RoundRobin, PSDispatcher
from .. import core, framework, unique_name from .. import core, framework, unique_name
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, Block, \ default_startup_program, Block, Parameter, grad_var_name
Parameter, Variable, grad_var_name from .details import wait_server_ready, UnionFind, VarStruct, VarsDistributed
from .details import * from .details import delete_ops, find_op_by_output_arg
from ..distribute_lookup_table import find_distributed_lookup_table from ..distribute_lookup_table import find_distributed_lookup_table
from functools import reduce
LOOKUP_TABLE_TYPE = "lookup_table" LOOKUP_TABLE_TYPE = "lookup_table"
LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
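The module docstring above lists the pserver transpile steps; for orientation, here is a hedged sketch of how the transpiler is typically driven (method names follow the fluid distribute-transpiler API of this era, but treat every endpoint and argument as an assumed placeholder):

import paddle.fluid as fluid

# Hypothetical single-trainer, single-pserver setup.
t = fluid.DistributeTranspiler()
t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=1)
pserver_prog = t.get_pserver_program("127.0.0.1:6174")
startup_prog = t.get_startup_program("127.0.0.1:6174", pserver_prog)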
...@@ -62,260 +66,6 @@ def log(*args): ...@@ -62,260 +66,6 @@ def log(*args):
print(args) print(args)
class VarBlock: class VarBlock:
def __init__(self, varname, offset, size): def __init__(self, varname, offset, size):
self.varname = varname self.varname = varname
......
...@@ -109,6 +109,7 @@ packages=['paddle', ...@@ -109,6 +109,7 @@ packages=['paddle',
'paddle.fluid.contrib', 'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize', 'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.int8_inference',
'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.reader',
'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim',
'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.core',
......
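The setup.py hunk above adds paddle.fluid.contrib.int8_inference to the hand-maintained packages list; a subpackage missing from this list is silently excluded from the built wheel. A hedged alternative, assuming the standard setuptools API, is automatic discovery:

from setuptools import find_packages

# Pick up every paddle subpackage that contains an __init__.py,
# instead of enumerating each new contrib module by hand.
packages = find_packages(include=['paddle', 'paddle.*'])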