Unverified commit 74037cc1, authored by Kaipeng Deng, committed by GitHub

Merge branch 'develop' into yolo_box

...@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
- build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0]
......
...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
- "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
+ "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
...@@ -91,7 +91,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497'))
- paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d'))
+ paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464'))
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb'))
paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9'))
...@@ -293,6 +293,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+ paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
...@@ -300,6 +301,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+ paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
+ paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
...@@ -327,7 +330,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
- paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
+ paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
......
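The API.spec change above adds two arguments to paddle.fluid.layers.yolov3_loss: gtscore (an optional per-box confidence input, default None) and use_label_smooth (default True). A minimal usage sketch follows; the argument names come from the ArgSpec above, while the data-layer names, shapes, anchors and class count are illustrative placeholders only:

import paddle.fluid as fluid

# Hypothetical input layers; only the yolov3_loss argument names are taken from the spec above.
x = fluid.layers.data(name='yolo_feat', shape=[255, 13, 13], dtype='float32')
gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32')
gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32')
gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32')

loss = fluid.layers.yolov3_loss(
    x=x,
    gtbox=gt_box,
    gtlabel=gt_label,
    anchors=[10, 13, 16, 30, 33, 23],
    anchor_mask=[0, 1, 2],
    class_num=80,
    ignore_thresh=0.7,
    downsample_ratio=32,
    gtscore=gt_score,          # new optional input added by this change
    use_label_smooth=True)     # new flag added by this change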
...@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
+ #include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
...@@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("sequential_execution_pass");
}
+ // Add op fusion.
+ if (strategy.sync_batch_norm_) {
+ AppendPass("sync_batch_norm_pass");
+ }
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
...@@ -227,6 +233,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} // namespace framework
} // namespace paddle
+ USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
......
...@@ -77,6 +77,8 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false};
+ bool sync_batch_norm_{false};
bool memory_optimize_{true};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
......
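For reference, the new sync_batch_norm_ flag above is the C++ switch behind the Python-side build strategy. A rough sketch of enabling it from user code, assuming the option is exposed in Python as build_strategy.sync_batch_norm and that the tiny model below is purely illustrative:

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(image, num_filters=8, filter_size=3)
bn = fluid.layers.batch_norm(conv)      # rewritten to sync_batch_norm by the pass when enabled
loss = fluid.layers.reduce_mean(bn)

build_strategy = fluid.BuildStrategy()
# Assumed Python name of the sync_batch_norm_ flag; when set, the
# sync_batch_norm_pass rewrites batch_norm ops so that mean/variance
# statistics are synchronized across devices.
build_strategy.sync_batch_norm = True

compiled_prog = fluid.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)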
...@@ -16,6 +16,7 @@
#include <algorithm>
#include <deque>
#include <iterator>
+ #include <memory>
#include <stack>
#include <string>
#include <unordered_map>
...@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
+ // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
+ // ProgramDescs.
+ // The operations related to BlockDesc or ProgramDesc should perform on Graph
+ // or Node directly!
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
......
...@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType();
// only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR ||
- type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
......
...@@ -24,6 +24,7 @@
#include <sstream>
#include <string>
#include <type_traits>
+ #include <unordered_set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
...@@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
+ // FIXME(liuwei1031): Graph is not aware of the existence of
+ // BlockDescs and ProgramDescs.
+ // The operations related to BlockDesc or ProgramDesc should perform
+ // on Graph or Node directly!
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
......
...@@ -34,11 +34,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
+ DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
#endif
DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
- DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle {
namespace framework {
...@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool force_disable_gc) {
platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
- #ifdef PADDLE_WITH_NGRAPH
- if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
- #endif
auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
...@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
+ #ifdef PADDLE_WITH_NGRAPH
+ if (FLAGS_use_ngraph) {
+ paddle::operators::NgraphEngine::FuseNgraphOps(
+ ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
+ }
+ #endif
return ctx;
}
......
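The executor change above removes the EnableNgraph() call from Run() and instead fuses nGraph ops once in Prepare(), still guarded by the FLAGS_use_ngraph gflag that now lives inside the PADDLE_WITH_NGRAPH block. A hedged sketch of switching the flag on from Python, assuming the build was compiled with nGraph support and that FLAGS_use_ngraph is picked up from the environment at startup like other Paddle FLAGS_* variables:

import os

# Set the gflag before paddle.fluid is imported so the framework sees it at startup.
os.environ['FLAGS_use_ngraph'] = 'true'

import paddle.fluid as fluid  # imported after the flag is set on purpose

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())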
...@@ -46,6 +46,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
+ pass_library(cpu_quantize_squash_pass inference)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
...@@ -66,6 +67,7 @@ pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
+ pass_library(sync_batch_norm_pass base)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
...@@ -100,6 +102,8 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
+ cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+ cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void CPUQuantizeSquashPass::FindNodesToKeep(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
GraphPatternDetector gpd;
patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"};
deq_any_pattern();
int found_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);
if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
(*nodes_keep_counter)[dequant_out] = 1;
else
(*nodes_keep_counter)[dequant_out] += 1;
found_count++;
};
gpd(graph, handler);
AddStatis(found_count);
}
void CPUQuantizeSquashPass::Squash(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
GraphPatternDetector gpd;
patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
squash_pattern();
int found_squash_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "squash requantize-quantize ops pair";
GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);
auto* next_op_desc = next_op->Op();
float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
nodes_keep_counter->end());
// check if dequantize op should be kept or removed, decrease the counter
bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
if (dequant_scale == quant_scale) {
// squash dequantize-quantize to nothing
auto quant_out_var_name = quant_out->Name();
auto next_op_inputs = next_op_desc->InputNames();
for (const auto& name : next_op_inputs) {
auto var_name = next_op_desc->Input(name)[0];
if (var_name.compare(quant_out_var_name) == 0) {
next_op_desc->SetInput(
name, std::vector<std::string>({dequant_in->Name()}));
break;
}
}
if (keep_dequant)
GraphSafeRemoveNodes(graph, {quant_op, quant_out});
else
GraphSafeRemoveNodes(graph,
{dequant_op, quant_op, dequant_out, quant_out});
IR_NODE_LINK_TO(dequant_in, next_op);
found_squash_count++;
} else {
// squash dequantize-quantize to requantize op
OpDesc desc;
desc.SetType("requantize");
desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
desc.SetAttr("Scale_in", dequant_scale);
desc.SetAttr("Scale_out", quant_scale);
auto requant_op = g->CreateOpNode(&desc);
if (keep_dequant)
GraphSafeRemoveNodes(graph, {quant_op});
else
GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});
IR_NODE_LINK_TO(dequant_in, requant_op);
IR_NODE_LINK_TO(requant_op, quant_out);
found_squash_count++;
}
};
gpd(graph, handler);
AddStatis(found_squash_count);
PrettyLogDetail("--- squashed %d dequantize-quantize pairs",
found_squash_count);
}
std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("cpu_quantize_squash_pass", graph.get());
std::unordered_map<const Node*, int> nodes_keep_counter;
FindNodesToKeep(graph.get(), &nodes_keep_counter);
Squash(graph.get(), &nodes_keep_counter);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_quantize_squash_pass,
paddle::framework::ir::CPUQuantizeSquashPass);
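In short, Squash() deletes a dequantize-quantize pair outright when both ops carry the same Scale, and otherwise collapses the pair into a single requantize op that records both scales. A small Python-style sketch of that decision rule, illustrative only and not part of the pass itself:

def squash_decision(dequant_scale, quant_scale):
    """Mirrors the branch taken inside CPUQuantizeSquashPass::Squash (illustration only)."""
    if dequant_scale == quant_scale:
        # Scales match: the pair cancels out, both ops are removed and the
        # consumer of quant_out is rewired to read dequant_in directly.
        return ('drop_pair', None)
    # Scales differ: replace the pair with one requantize op carrying both scales.
    return ('requantize', {'Scale_in': dequant_scale, 'Scale_out': quant_scale})

print(squash_decision(1.2345, 1.2345))  # ('drop_pair', None)
print(squash_decision(1.2345, 21.0))    # ('requantize', {'Scale_in': 1.2345, 'Scale_out': 21.0})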
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Squash dequantize->quantize pair pattern into requantize op
*/
class CPUQuantizeSquashPass : public FusePassBase {
public:
virtual ~CPUQuantizeSquashPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
/*
* For each dequantize's output find the number of operators it is an input to
*/
void FindNodesToKeep(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
/*
* Squash dequantize-quantize ops pairs into requantize or nothing
*/
void Squash(Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
const std::string name_scope_{"squash"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn,
float scale = 0) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
if (type == "conv2d") {
op->SetInput("Input", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
op->SetOutput("Output", {outputs[0]});
} else if (type == "quantize") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("Scale", scale);
} else if (type == "dequantize") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("Scale", scale);
}
}
// (a,w1,b1)->Conv1->d
// d->Dequant->e
// e->Quant->f
// (f,w2,b2)->Conv2->i
ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
ProgramDesc prog;
for (auto& v : std::initializer_list<std::string>(
{"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v.find("w") == 0 || v.find("b") == 0) {
var->SetPersistable(true);
}
}
SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);
return prog;
}
static const std::initializer_list<std::string> variable_names{
"a", "b", "c", "d", "e", "f", "g", "h"};
// a->Conv1->b
// b->Dequant->c
//
// c->Quant1->d and d->Conv2->e
//
// c->Conv3->f
//
// c->Quant2->g and g->Conv4->h
//
ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
float scale3) {
ProgramDesc prog;
for (auto& v : variable_names) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);
SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);
SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);
return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
const char* var_name) {
auto x = scope->Var(var_name);
auto tensor = x->GetMutable<LoDTensor>();
tensor->mutable_data(place, proto::VarType::FP32,
::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
// Init scope, as it is used in pass
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
exe.CreateVariables(prog, 0, true, &scope);
for (auto& v : variable_names) {
InitTensorHolder(&scope, place, v.c_str());
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
}
TEST(CpuQuantizeSquashPass, equal_scales) {
auto scale = 1.2345f;
auto use_mkldnn = true;
// Remove 4 nodes: Dequant, Quant, e, f
auto remove_nodes = 4;
MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
}
TEST(CpuQuantizeSquashPass, inequal_scales) {
auto scale1 = 1.2345f;
auto scale2 = 21.0f;
auto use_mkldnn = true;
// Remove 3 nodes: Dequant, Quant, e
// Insert 1 node: requantize
auto remove_nodes = 2;
MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
}
TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
// Delete both quantize ops,
// bypass dequantize in both branches,
// insert requantize on one branch
auto scale = 1.2345f;
auto scale2 = 21.0f;
auto use_mkldnn = true;
// Remove 3 nodes: Quant1, Quant2, g
// Insert 1 node: requantize
auto remove_nodes = 2;
MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_squash_pass);
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
- #include <unordered_set>
+ #include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
...@@ -152,6 +152,39 @@ void Graph::ResolveHazard(
}
}
+ std::shared_ptr<Graph> Graph::Clone() {
+ auto cloned_graph = std::make_shared<Graph>(this->program_);
+ cloned_graph->ReleaseNodes();
+ cloned_graph->num_node_created_ = 0;
+ std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
+ for (auto *n : this->node_set_) {
+ ir::Node *cloned_node = nullptr;
+ if (n->IsCtrlVar()) {
+ cloned_node = cloned_graph->CreateControlDepVar();
+ } else if (!n->var_desc_ && !n->op_desc_) { // empty node
+ cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
+ } else if (n->IsVar()) {
+ cloned_node = cloned_graph->CreateVarNode(n->Var());
+ } else if (n->IsOp()) {
+ cloned_node = cloned_graph->CreateOpNode(n->Op());
+ }
+ if (cloned_node) {
+ origin_to_cloned[n] = cloned_node;
+ } else {
+ PADDLE_THROW("The cloned node's type is not supported!");
+ }
+ }
+ for (auto *n : this->node_set_) {
+ for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
+ origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
+ }
+ for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
+ origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
+ }
+ }
+ return cloned_graph;
+ }
bool IsControlDepVar(const ir::Node &var) {
return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
......
...@@ -17,6 +17,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
+ #include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
...@@ -199,7 +200,12 @@ class Graph {
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
- const ProgramDesc &OriginProgram() const { return program_; }
+ const ProgramDesc &OriginProgram() const {
+ LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
+ "quite different from OriginProgram. So, please avoid "
+ "using the `OriginProgram()` method!";
+ return program_;
+ }
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
...@@ -212,6 +218,10 @@ class Graph {
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+ // Create a new and duplicated graph.
+ // WARN: The method only clones the graph structure, not its attributes.
+ std::shared_ptr<Graph> Clone();
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
......
...@@ -1301,6 +1301,51 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var;
}
+ PDNode *patterns::DequantQuantAny::operator()() {
+ auto *dequant_in = pattern->NewNode(dequant_in_repr())
+ ->AsInput()
+ ->assert_is_op_input("dequantize", "Input");
+ auto *dequant_op =
+ pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+ auto *dequant_out = pattern->NewNode(dequant_out_repr())
+ ->AsOutput()
+ ->assert_is_op_output("dequantize", "Output");
+ auto *quant_op = pattern->NewNode(quant_op_repr())
+ ->assert_is_op("quantize")
+ ->AsIntermediate();
+ auto *quant_out = pattern->NewNode(quant_out_repr())
+ ->AsOutput()
+ ->assert_is_op_output("quantize");
+ auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+ dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
+ quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
+ next_op->LinksFrom({quant_out});
+ return quant_out;
+ }
+ PDNode *patterns::DequantAny::operator()() {
+ auto *dequant_op =
+ pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+ auto *dequant_out = pattern->NewNode(dequant_out_repr())
+ ->AsOutput()
+ ->assert_is_op_output("dequantize", "Output");
+ auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+ dequant_op->LinksTo({dequant_out});
+ next_op->LinksFrom({dequant_out});
+ return dequant_out;
+ }
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
......
...@@ -18,8 +18,11 @@
#include <gtest/gtest_prod.h>
#endif
+ #include <memory>
#include <numeric>
#include <string>
+ #include <unordered_map>
+ #include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
...@@ -766,6 +769,34 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out); // Out
};
+ // Dequantize + Quantize + anyOP
+ // This pattern is used for squashing the dequantize-quantize pairs.
+ struct DequantQuantAny : public PatternBase {
+ DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
+ : PatternBase(pattern, name_scope, "dequant_quant_any") {}
+ PDNode* operator()();
+ PATTERN_DECL_NODE(dequant_in);
+ PATTERN_DECL_NODE(dequant_op);
+ PATTERN_DECL_NODE(dequant_out);
+ PATTERN_DECL_NODE(quant_op);
+ PATTERN_DECL_NODE(quant_out);
+ PATTERN_DECL_NODE(next_op);
+ };
+ // Dequantize + anyOP
+ // This quantize is used for getting number of ops the Dequantize's
+ // output is an input to.
+ struct DequantAny : public PatternBase {
+ DequantAny(PDPattern* pattern, const std::string& name_scope)
+ : PatternBase(pattern, name_scope, "dequant_any") {}
+ PDNode* operator()();
+ PATTERN_DECL_NODE(dequant_op);
+ PATTERN_DECL_NODE(dequant_out);
+ PATTERN_DECL_NODE(next_op);
+ };
struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
......
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
+ #include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <memory>
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Use synchronous batch norm";
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->Type() == "batch_norm") {
op->SetType("sync_batch_norm");
}
if (op->Type() == "batch_norm_grad") {
op->SetType("sync_batch_norm_grad");
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class SyncBatchNormPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
}
// (a, conv_w)->conv2d->b
// (b, bn_scale, bn_bias, mean, var)->batch_norm
// ->(c, mean, var, save_mean, save_inv_var)
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
"bn_bias", "mean", "var", "c",
"save_mean", "save_inv_var"})) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
v == "var") {
var->SetPersistable(true);
}
}
SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
std::vector<std::string>({"b"}));
SetOp(&prog, "batch_norm", "bn",
std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
std::vector<std::string>(
{"c", "mean", "var", "save_mean", "save_inv_var"}));
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "bn") {
ASSERT_EQ(op->Type(), "sync_batch_norm");
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(sync_batch_norm_pass);
...@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
- throw;
+ throw std::move(exception);
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
- throw;
+ throw std::move(exception);
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
...@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
- throw;
+ throw std::move(exception);
} catch (...) {
std::rethrow_exception(std::current_exception());
}
...@@ -926,8 +926,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_);
}
- RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
- this->InferShape(&infer_shape_ctx);
+ if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+ RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+ this->InferShape(&infer_shape_ctx);
+ }
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(
......
...@@ -62,6 +62,15 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
/// Variables with this suffix are the new Gradient.
constexpr char kNewGradSuffix[] = "@NEWGRAD@";
+ /// If an Op has this attribute, all its kernels should calculate output
+ /// variable's shape in the corresponding Compute() function. And
+ /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
+ /// function in its runtime for speedup.
+ /// TODO(luotao): Note that this temporal attribute would be deleted after all
+ /// ops contain it.
+ constexpr char kAllKernelsMustComputeRuntimeShape[] =
+ "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
// define some kernel priority
/* Define multiple kernel type fallback order*/
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
......
...@@ -14,8 +14,10 @@ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
+ #include <memory>
#include <string>
#include <tuple>
+ #include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
...@@ -181,13 +183,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
- ParallelExecutor::ParallelExecutor(
- const std::vector<platform::Place> &places,
- const std::unordered_set<std::string> &bcast_vars,
- const std::string &loss_var_name, Scope *scope,
- const std::vector<Scope *> &local_scopes,
- const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
- ir::Graph *graph)
+ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+ const std::vector<std::string> &bcast_vars,
+ const std::string &loss_var_name,
+ Scope *scope,
+ const std::vector<Scope *> &local_scopes,
+ const ExecutionStrategy &exec_strategy,
+ const BuildStrategy &build_strategy,
+ ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
...@@ -250,13 +253,41 @@ ParallelExecutor::ParallelExecutor(
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_));
+ std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
+ dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
+ // Initialize device context's nccl comm
+ // Note, more than one ParallelExecutor with same place, the nccl comm will
+ // be rewrite and there will be some problem.
+ for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
+ auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
+ platform::DeviceContextPool &pool =
+ platform::DeviceContextPool::Instance();
+ auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+ pool.Get(member_->places_[dev_id]));
+ dev_ctx->set_nccl_comm(nccl_ctx.comm());
+ }
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
- if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
- BCastParamsToDevices(bcast_vars);
+ // broadcast parameters from the 0th device to others:
+ auto need_broadcast = [&]() -> bool {
+ if (build_strategy.num_trainers_ > 1) {
+ // 1. num_tariners would be grater than 1 for nccl distributed training.
+ return true;
+ } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+ // 2. Only one trainer process, but ParallelExecutor hold multiple
+ // devices.
+ return true;
+ }
+ return false;
+ };
+ if (need_broadcast()) {
+ BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
}
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
...@@ -338,7 +369,7 @@ ParallelExecutor::ParallelExecutor(
}
void ParallelExecutor::BCastParamsToDevices(
- const std::unordered_set<std::string> &vars) const {
+ const std::vector<std::string> &vars, int trainer_id) const {
// the initializing bcast, all vars would be bcast from device(0).
for (auto &var : vars) {
framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
...@@ -362,7 +393,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto place = member_->places_[i];
void *buffer;
- if (i == 0) {
+ if (i == 0 && trainer_id == 0) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......
...@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
+ #include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
+ #include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
...@@ -45,7 +47,7 @@ class ParallelExecutor {
public:
explicit ParallelExecutor(const std::vector<platform::Place> &places,
- const std::unordered_set<std::string> &bcast_vars,
+ const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy,
...@@ -70,7 +72,10 @@ class ParallelExecutor {
const std::string &fetched_var_name);
private:
- void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+ // broadcast the parameters from the 0th device.
+ // trainer_id the trainer index in nccl distributed training.
+ void BCastParamsToDevices(const std::vector<std::string> &vars,
+ int trainer_id = 0) const;
bool EnableParallelGraphExecution(const ir::Graph &graph,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const;
......
...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) ...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -24,9 +22,11 @@ ...@@ -24,9 +22,11 @@
#endif #endif
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { ...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
} // namespace legacy } // namespace legacy
namespace allocation { namespace allocation {
LegacyMemMonitor GPUMemMonitor; LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_); auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
} }
void LegacyAllocator::Free(Allocation *allocation) { void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor( boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()), legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place()); allocation->place());
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation; delete allocation;
} }
......
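legacy_allocator.cc now reports each allocation to platform::MemEvenRecorder when it is created and removes the record when it is freed, which is what the new profiler dependency in the CMake file above is for. A self-contained sketch of that push/pop bookkeeping pattern (the MemRecorder class below is a stand-in, not the Paddle recorder):

#include <cstdlib>
#include <iostream>
#include <map>
#include <mutex>

// Stand-in recorder: tracks live allocations keyed by the allocation handle,
// mirroring the PushMemRecord / PopMemRecord calls added above.
class MemRecorder {
 public:
  void Push(const void* key, std::size_t size) {
    std::lock_guard<std::mutex> guard(mu_);
    live_[key] = size;
  }
  void Pop(const void* key) {
    std::lock_guard<std::mutex> guard(mu_);
    live_.erase(key);
  }
  std::size_t LiveBytes() const {
    std::lock_guard<std::mutex> guard(mu_);
    std::size_t total = 0;
    for (const auto& kv : live_) total += kv.second;
    return total;
  }

 private:
  mutable std::mutex mu_;
  std::map<const void*, std::size_t> live_;
};

int main() {
  MemRecorder recorder;
  void* p = std::malloc(256);
  recorder.Push(p, 256);                      // on AllocateImpl
  std::cout << recorder.LiveBytes() << "\n";  // 256
  recorder.Pop(p);                            // on Free
  std::free(p);
  std::cout << recorder.LiveBytes() << "\n";  // 0
  return 0;
}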
...@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE) ...@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif() endif()
register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU) if (WITH_GPU)
# warpctc_op needs cudnn 7 above
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
else() else()
...@@ -58,6 +58,8 @@ if (WITH_GPU) ...@@ -58,6 +58,8 @@ if (WITH_GPU)
op_library(conv_fusion_op) op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
endif() endif()
op_library(sync_batch_norm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
else() else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$ ...@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC"; )DOC";
class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator");
AddComment(R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC");
}
};
class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator");
AddComment(R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC");
}
};
class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator");
AddComment(R"DOC(
Arctangent Activation Operator.
$$out = \tan^{-1}(x)$$
)DOC");
}
};
class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -543,7 +587,10 @@ namespace ops = paddle::operators; ...@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \ __macro(SoftShrink, softshrink); \
__macro(Abs, abs); \ __macro(Abs, abs); \
__macro(Cos, cos); \ __macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \ __macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \ __macro(Round, round); \
__macro(Log, log); \ __macro(Log, log); \
__macro(Square, square); \ __macro(Square, square); \
......
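The three new op makers document the element-wise forward definitions out = cos^-1(x), sin^-1(x) and tan^-1(x). A tiny standalone check of those definitions in plain C++, independent of the operator framework:

#include <cmath>
#include <cstdio>

int main() {
  const double xs[] = {-0.9, -0.5, 0.0, 0.5, 0.9};
  for (double x : xs) {
    // Element-wise forward definitions from the op documentation above.
    std::printf("x=%5.2f  acos=%8.5f  asin=%8.5f  atan=%8.5f\n",
                x, std::acos(x), std::asin(x), std::atan(x));
  }
  return 0;
}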
...@@ -39,9 +39,8 @@ namespace operators { ...@@ -39,9 +39,8 @@ namespace operators {
Please refer to layer_helper.py for details. Please refer to layer_helper.py for details.
*/ */
static std::unordered_set<std::string> InplaceOpSet = { static std::unordered_set<std::string> InplaceOpSet = {
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
};
static bool IsInplace(const std::string& op) { static bool IsInplace(const std::string& op) {
bool inplace = InplaceOpSet.count(op); bool inplace = InplaceOpSet.count(op);
...@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> { ...@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct Acos {
HOSTDEVICE T operator()(const T& val) const { return acos(val); }
};
template <>
struct Acos<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(acos(static_cast<float>(val)));
}
};
// Acos(x) = acos(x)
template <typename T>
struct AcosFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Acos<T>());
}
};
// acos'(x) = -1/sqrt(1-x^2)
template <typename T>
struct AcosGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
-dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Asin {
HOSTDEVICE T operator()(const T& val) const { return asin(val); }
};
template <>
struct Asin<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(asin(static_cast<float>(val)));
}
};
// Asin(x) = asin(x)
template <typename T>
struct AsinFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Asin<T>());
}
};
// asin'(x) = 1/sqrt(1-x^2)
template <typename T>
struct AsinGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Atan {
HOSTDEVICE T operator()(const T& val) const { return atan(val); }
};
template <>
struct Atan<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(atan(static_cast<float>(val)));
}
};
// Atan(x) = atan(x)
template <typename T>
struct AtanFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Atan<T>());
}
};
// atan'(x) = 1 / (1 + x^2)
template <typename T>
struct AtanGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
}
};
// round(x) = [x] // round(x) = [x]
template <typename T> template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> { struct RoundFunctor : public BaseActivationFunctor<T> {
...@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \
......
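The new grad functors implement acos'(x) = -1/sqrt(1-x^2), asin'(x) = 1/sqrt(1-x^2) and atan'(x) = 1/(1+x^2). A quick finite-difference sanity check of those formulas (standalone C++, not the Eigen-based functors above):

#include <cmath>
#include <cstdio>
#include <functional>

// Central finite difference approximation of f'(x).
static double NumericGrad(const std::function<double(double)>& f, double x) {
  const double h = 1e-6;
  return (f(x + h) - f(x - h)) / (2.0 * h);
}

int main() {
  const double x = 0.3;
  const double acos_ana = -1.0 / std::sqrt(1.0 - x * x);  // AcosGradFunctor
  const double asin_ana = 1.0 / std::sqrt(1.0 - x * x);   // AsinGradFunctor
  const double atan_ana = 1.0 / (1.0 + x * x);            // AtanGradFunctor
  std::printf("acos' analytic=%.6f numeric=%.6f\n", acos_ana,
              NumericGrad([](double v) { return std::acos(v); }, x));
  std::printf("asin' analytic=%.6f numeric=%.6f\n", asin_ana,
              NumericGrad([](double v) { return std::asin(v); }, x));
  std::printf("atan' analytic=%.6f numeric=%.6f\n", atan_ana,
              NumericGrad([](double v) { return std::atan(v); }, x));
  return 0;
}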
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -22,147 +24,150 @@ limitations under the License. */ ...@@ -22,147 +24,150 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class BatchNormOp : public framework::OperatorWithKernel { void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
public: PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null.");
using framework::OperatorWithKernel::OperatorWithKernel; PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of ConvOp should not be null.");
void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Bias"),
PADDLE_ENFORCE(ctx->HasInput("X"), ""); "Input(Bias) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); PADDLE_ENFORCE(ctx->HasInput("Mean"),
PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); "Input(Mean) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); PADDLE_ENFORCE(ctx->HasInput("Variance"),
PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); "Input(Variance) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); PADDLE_ENFORCE(ctx->HasOutput("Y"),
PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); "Output(Y) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); bool is_test = ctx->Attrs().Get<bool>("is_test");
PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); if (!is_test) {
PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); PADDLE_ENFORCE(ctx->HasOutput("MeanOut"),
"Output(MeanOut) of ConvOp should not be null.");
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"),
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], "Output(VarianceOut) of ConvOp should not be null.");
"Mean and MeanOut should share the same memory"); PADDLE_ENFORCE(ctx->HasOutput("SavedMean"),
PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], "Output(SavedMean) of ConvOp should not be null.");
ctx->Outputs("VarianceOut")[0], PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"),
"Variance and VarianceOut should share the same memory"); "Output(SavedVariance) of ConvOp should not be null.");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int64_t C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y");
} }
protected: // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
framework::OpKernelType GetExpectedKernelType( PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
const framework::ExecutionContext &ctx) const override { "Mean and MeanOut should share the same memory");
auto input_data_type = ctx.Input<Tensor>("X")->type(); PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
// By default, the type of the scale, bias, mean, "Variance and VarianceOut should share the same memory");
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor). const auto x_dims = ctx->GetInputDim("X");
auto bn_param_type = framework::proto::VarType::FP32; const DataLayout data_layout = framework::StringToDataLayout(
if (input_data_type == framework::proto::VarType::FP64) { ctx->Attrs().Get<std::string>("data_layout"));
bn_param_type = framework::proto::VarType::FP64;
} PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(), "Input X must have 2 to 5 dimensions.");
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(), const int64_t C =
"Bias input should be of float type"); (data_layout == DataLayout::kNCHW ? x_dims[1]
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(), : x_dims[x_dims.size() - 1]);
"Mean input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(), PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
"Variance input should be of float type"); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout; ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y");
}
framework::OpKernelType BatchNormOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = ctx.Input<Tensor>("X")->type();
// By default, the type of the scale, bias, mean,
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor).
auto bn_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
bn_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
"Bias input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
"Mean input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
"Variance input should be of float type");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
} }
}; #endif
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
public: library);
void Make() override { }
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false " void BatchNormOpMaker::Make() {
"for training. Some layers may run faster when this is true.") AddAttr<bool>("is_test",
.SetDefault(false); "(bool, default false) Set to true for inference only, false "
AddAttr<float>("momentum", "").SetDefault(0.9); "for training. Some layers may run faster when this is true.")
AddAttr<float>("epsilon", "") .SetDefault(false);
.SetDefault(1e-5) AddAttr<float>("momentum", "").SetDefault(0.9);
.AddCustomChecker([](const float &epsilon) { AddAttr<float>("epsilon", "")
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, .SetDefault(1e-5)
"'epsilon' should be between 0.0 and 0.001."); .AddCustomChecker([](const float &epsilon) {
}); PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
AddAttr<std::string>("data_layout", "").SetDefault("NCHW"); "'epsilon' should be between 0.0 and 0.001.");
AddInput("X", "The input tensor"); });
AddInput("Scale", AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
"Scale is a 1-dimensional tensor of size C " AddInput("X", "The input tensor");
"that is applied to the output"); AddInput("Scale",
AddInput("Bias", "Scale is a 1-dimensional tensor of size C "
"Bias is a 1-dimensional tensor of size C " "that is applied to the output");
"that is applied to the output"); AddInput("Bias",
AddInput("Mean", "Bias is a 1-dimensional tensor of size C "
"The global mean (for training) or " "that is applied to the output");
"estimated mean (for testing)"); AddInput("Mean",
AddInput("Variance", "The global mean (for training) or "
"The global variance (for training) " "estimated mean (for testing)");
"or estimated Variance (for testing)"); AddInput("Variance",
AddOutput("Y", "result after normalization"); "The global variance (for training) "
AddOutput("MeanOut", "or estimated Variance (for testing)");
"Share memory with Mean. " AddOutput("Y", "result after normalization");
"Store the global mean when training"); AddOutput("MeanOut",
AddOutput("VarianceOut", "Share memory with Mean. "
"Share memory with Variance. " "Store the global mean when training");
"Store the global Variance when training"); AddOutput("VarianceOut",
AddOutput("SavedMean", "Share memory with Variance. "
"Mean of the current mini batch, " "Store the global Variance when training");
"will apply to output when training") AddOutput("SavedMean",
.AsIntermediate(); "Mean of the current mini batch, "
AddOutput("SavedVariance", "will apply to output when training")
"Variance of the current mini batch, " .AsIntermediate();
"will apply to output when training") AddOutput("SavedVariance",
.AsIntermediate(); "Variance of the current mini batch, "
AddAttr<bool>("use_mkldnn", "will apply to output when training")
"(bool, default false) Only used in mkldnn kernel") .AsIntermediate();
.SetDefault(false); AddAttr<bool>("use_mkldnn",
AddAttr<bool>("fuse_with_relu", "(bool, default false) Only used in mkldnn kernel")
"(bool, default false) Only used in mkldnn kernel") .SetDefault(false);
.SetDefault(false); AddAttr<bool>("fuse_with_relu",
AddAttr<bool>("use_global_stats", "(bool, default false) Only used in mkldnn kernel")
"(bool, default false) Whether to use global mean and " .SetDefault(false);
"variance. In inference or test mode, set use_global_stats " AddAttr<bool>("use_global_stats",
"to true or is_test true. the behavior is equivalent. " "(bool, default false) Whether to use global mean and "
"In train mode, when setting use_global_stats True, the " "variance. In inference or test mode, set use_global_stats "
"global mean and variance are also used during train time, " "to true or is_test true. the behavior is equivalent. "
"the BN acts as scaling and shiffting.") "In train mode, when setting use_global_stats True, the "
.SetDefault(false); "global mean and variance are also used during train time, "
AddComment(R"DOC( "the BN acts as scaling and shiffting.")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization. Batch Normalization.
Batch Norm has been implemented as discussed in the paper: Batch Norm has been implemented as discussed in the paper:
...@@ -173,17 +178,7 @@ The required data format for this layer is one of the following: ...@@ -173,17 +178,7 @@ The required data format for this layer is one of the following:
2. NCHW `[batch, in_channels, in_height, in_width]` 2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC"); )DOC");
} }
};
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename T> template <typename T>
class BatchNormKernel<platform::CPUDeviceContext, T> class BatchNormKernel<platform::CPUDeviceContext, T>
...@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
} }
}; };
class BatchNormGradOp : public framework::OperatorWithKernel { void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
public: // check input
using framework::OperatorWithKernel::OperatorWithKernel; PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
// check input "Input(Y@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); "Input(SavedMean) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
"Input(Y@GRAD) should not be null."); "Input(SavedVariance) should not be null");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
"Input(SavedMean) should not be null."); // check output
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
"Input(SavedVariance) should not be null"); if (ctx->HasOutput(framework::GradVarName("Scale"))) {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
// check output "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); "null at same time");
if (ctx->HasOutput(framework::GradVarName("Scale"))) { }
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be " if (use_global_stats) {
"null at same time"); PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
} "Using global stats during training is not supported "
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats"); "in gradient op kernel of batch_norm_mkldnn_op now.");
if (use_global_stats) { }
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now.");
}
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout( const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout")); ctx->Attrs().Get<std::string>("data_layout"));
const int C = const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
(data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
if (ctx->HasOutput(framework::GradVarName("Scale"))) { if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
} }
}
protected: framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const {
const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y"));
const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) {
if (var == nullptr) { PADDLE_THROW("can't find Y@GRAD");
PADDLE_THROW("can't find Y@GRAD"); }
} const Tensor *t = nullptr;
const Tensor *t = nullptr; if (var->IsType<Tensor>()) {
if (var->IsType<Tensor>()) { t = &var->Get<Tensor>();
t = &var->Get<Tensor>(); } else if (var->IsType<LoDTensor>()) {
} else if (var->IsType<LoDTensor>()) { t = &var->Get<LoDTensor>();
t = &var->Get<LoDTensor>(); }
} if (t == nullptr) {
if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD");
PADDLE_THROW("can't find Y@GRAD"); }
}
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain; framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
ctx.GetPlace(), layout, library); layout, library);
} }
};
template <typename T> template <typename T>
class BatchNormGradKernel<platform::CPUDeviceContext, T> class BatchNormGradKernel<platform::CPUDeviceContext, T>
...@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
} }
}; };
class BatchNormGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
public: auto *op = new framework::OpDesc();
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; op->SetType(GradOpType());
op->SetInput("X", Input("X"));
protected: op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDesc(); op->SetInput("Scale", Input("Scale"));
op->SetType("batch_norm_grad"); op->SetInput("Bias", Input("Bias"));
op->SetInput("X", Input("X")); op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("SavedVariance", Output("SavedVariance"));
op->SetInput("Scale", Input("Scale")); // used when setting use_global_stats True during training
op->SetInput("Bias", Input("Bias")); if (boost::get<bool>(GetAttr("use_global_stats"))) {
op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("Mean", Output("MeanOut"));
op->SetInput("SavedVariance", Output("SavedVariance")); op->SetInput("Variance", Output("VarianceOut"));
}
// used when setting use_global_stats True during training
if (boost::get<bool>(GetAttr("use_global_stats"))) {
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
}
op->SetAttrMap(Attrs()); op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
};
class BatchNormInplaceInToOut : public framework::InplaceInToOut { class BatchNormInplaceInToOut : public framework::InplaceInToOut {
public: public:
...@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { ...@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker)
ops::BatchNormInplaceInToOut); // ops::BatchNormInplaceInToOut);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp)
ops::BatchNormGradInplaceInToOut); // ops::BatchNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>, batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
......
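BatchNormOpMaker::Make keeps the custom range check on epsilon (between 0.0 and 0.001). A minimal illustration of that checker pattern, with a plain std::function standing in for the framework's attribute-checker machinery (SetEpsilon and FloatChecker are made-up names for this example):

#include <functional>
#include <iostream>
#include <stdexcept>

// Hypothetical stand-in for an attribute checker: validates a value when it is set.
using FloatChecker = std::function<void(const float&)>;

void SetEpsilon(float value, const FloatChecker& checker) {
  checker(value);  // throws if the value is out of range
  std::cout << "epsilon accepted: " << value << "\n";
}

int main() {
  FloatChecker epsilon_checker = [](const float& epsilon) {
    if (!(epsilon >= 0.0f && epsilon <= 0.001f)) {
      throw std::invalid_argument("'epsilon' should be between 0.0 and 0.001.");
    }
  };
  SetEpsilon(1e-5f, epsilon_checker);    // accepted, matches the default above
  try {
    SetEpsilon(0.01f, epsilon_checker);  // rejected by the range check
  } catch (const std::exception& e) {
    std::cout << "rejected: " << e.what() << "\n";
  }
  return 0;
}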
...@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>; ...@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T> template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType; using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
int *N, int *C, int *H, int *W, int *D) {
*N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}
template <typename T> template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T> class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
} }
}; };
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T, int BlockDim, framework::DataLayout layout> template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias( static __global__ void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean, const T *dy, const T *x, const BatchNormParamType<T> *mean,
...@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias( ...@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias(
} }
} }
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T> template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T> class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<< KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>( grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data, d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(), epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>()); d_bias->data<BatchNormParamType<T>>());
} }
} else { } else {
...@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
running_var_data, epsilon, C, H * W, num, d_x->data<T>()); running_var_data, epsilon, C, H * W, num, d_x->data<T>());
} }
if (d_scale && d_bias) { if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<< KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>( grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data, d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(), epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>()); d_bias->data<BatchNormParamType<T>>());
} }
} }
......
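KeBNBackwardData, moved below KeBNBackwardScaleBias above, computes dx[i] = dy[i] * scale[c] / sqrt(var[c] + epsilon), where the channel index c is (i / HxW) % C for NCHW and i % C for NHWC. A CPU reference of that per-element logic, useful for checking the layout-dependent indexing (plain C++, not the CUDA kernel):

#include <cmath>
#include <cstdio>
#include <vector>

enum class Layout { kNCHW, kNHWC };

// CPU reference of the per-element data-gradient formula used by the kernel.
void BNBackwardData(const std::vector<float>& dy, const std::vector<float>& scale,
                    const std::vector<float>& variance, double epsilon, int C,
                    int HxW, Layout layout, std::vector<float>* dx) {
  const int num = static_cast<int>(dy.size());
  dx->resize(num);
  for (int i = 0; i < num; ++i) {
    const int c = (layout == Layout::kNCHW) ? (i / HxW) % C : i % C;
    const float inv_var =
        1.0f / std::sqrt(variance[c] + static_cast<float>(epsilon));
    (*dx)[i] = dy[i] * scale[c] * inv_var;
  }
}

int main() {
  const int N = 1, C = 2, HxW = 3;
  std::vector<float> dy(N * C * HxW, 1.0f);
  std::vector<float> scale = {2.0f, 4.0f};
  std::vector<float> var = {1.0f, 3.0f};
  std::vector<float> dx;
  BNBackwardData(dy, scale, var, 1e-5, C, HxW, Layout::kNCHW, &dx);
  for (float v : dx) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}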
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,17 +38,84 @@ template <typename T> ...@@ -35,17 +38,84 @@ template <typename T>
using ConstEigenVectorArrayMap = using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>; Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
class BatchNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override;
virtual std::string GradOpType() const {
return this->ForwardOpType() + "_grad";
}
};
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override;
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormGradKernel : public framework::OpKernel<T> { class BatchNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override;
}; };
inline void ExtractNCWHD(const framework::DDim &dims,
const DataLayout &data_layout, int *N, int *C, int *H,
int *W, int *D) {
*N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
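ExtractNCWHD, now an inline helper in the header (presumably so other kernels such as the new sync_batch_norm op can reuse it), recovers N, C, H, W and D from the input dims for either layout. A standalone copy of that logic with a quick check on NCHW and NHWC shapes (std::vector<int> stands in for framework::DDim):

#include <cstdio>
#include <vector>

enum class Layout { kNCHW, kNHWC };

// Standalone copy of the ExtractNCWHD logic above, with std::vector as dims.
void ExtractNCWHD(const std::vector<int>& dims, Layout layout, int* N, int* C,
                  int* H, int* W, int* D) {
  *N = dims[0];
  if (dims.size() == 2) {
    *C = dims[1];
    *H = *W = *D = 1;
  } else {
    *C = layout == Layout::kNCHW ? dims[1] : dims[dims.size() - 1];
    *H = layout == Layout::kNCHW ? dims[2] : dims[1];
    *W = dims.size() > 3 ? (layout == Layout::kNCHW ? dims[3] : dims[2]) : 1;
    *D = dims.size() > 4 ? (layout == Layout::kNCHW ? dims[4] : dims[3]) : 1;
  }
}

int main() {
  int N, C, H, W, D;
  ExtractNCWHD({8, 3, 32, 32}, Layout::kNCHW, &N, &C, &H, &W, &D);
  std::printf("NCHW: N=%d C=%d H=%d W=%d D=%d\n", N, C, H, W, D);
  ExtractNCWHD({8, 32, 32, 3}, Layout::kNHWC, &N, &C, &H, &W, &D);
  std::printf("NHWC: N=%d C=%d H=%d W=%d D=%d\n", N, C, H, W, D);
  return 0;
}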
...@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/cross_entropy_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class CrossEntropyOp : public framework::OperatorWithKernel { class CrossEntropyOpBase : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
...@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ...@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
"Input(X) and Input(Label) shall have the same shape " "Input(X) and Input(Label) shall have the same shape "
"except the last dimension."); "except the last dimension.");
} }
if (ctx->Attrs().Get<bool>("soft_label")) {
if (IsSoftLabel(ctx)) {
if (check) { if (check) {
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of " "If Attr(soft_label) == true, the last dimension of "
...@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ...@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context()); ctx.device_context());
} }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return ctx->Attrs().Get<bool>("soft_label");
}
}; };
class CrossEntropyGradientOp : public framework::OperatorWithKernel { class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) shoudl be not null."); "Input(Y@GRAD) shoudl be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null."); "Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = GetXDim(ctx);
auto label_dims = ctx->GetInputDim("Label"); auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
int rank = x_dims.size(); int rank = x_dims.size();
...@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"The Input(X) and Input(Y@Grad) should have the same " "The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension."); "shape except the last dimension.");
} }
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, if (IsSoftLabel(ctx)) {
"The last dimension of Input(Y@Grad) should be 1.");
if (ctx->Attrs().Get<bool>("soft_label")) {
if (check) { if (check) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
x_dims[rank - 1], label_dims[rank - 1], x_dims[rank - 1], label_dims[rank - 1],
...@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"Input(Label) should be 1."); "Input(Label) should be 1.");
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X"));
} }
protected: protected:
...@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
// is determined by its input "X". // is determined by its input "X".
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(
ctx.device_context()); ctx.Input<Tensor>(framework::GradVarName("Y"))->type(),
ctx.device_context());
}
virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
return ctx->GetInputDim("X");
}
virtual const char* VarNameWithXLoD() const { return "X"; }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return ctx->Attrs().Get<bool>("soft_label");
}
};
class CrossEntropyOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
} }
}; };
...@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X. ...@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X.
} }
}; };
class CrossEntropyOpInferVarType class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
: public framework::PassInDtypeAndVarTypeToOutput { public:
using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
CrossEntropyGradientOpBase::InferShape(ctx);
}
};
class CrossEntropyOp2 : public CrossEntropyOpBase {
public:
using CrossEntropyOpBase::CrossEntropyOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
CrossEntropyOpBase::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("MatchX"),
"Output(MatchX) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto x_dims_vec = framework::vectorize(x_dims);
x_dims_vec.push_back(0);
ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec));
x_dims[x_dims.size() - 1] = 1;
ctx->SetOutputDim("MatchX", x_dims);
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected: protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType() bool IsSoftLabel(framework::InferShapeContext* ctx) const override {
const override { return false;
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}}; }
};
class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
public:
using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist");
CrossEntropyGradientOpBase::InferShape(ctx);
}
protected:
virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
auto x_shape = ctx->GetInputDim("XShape");
return framework::DDim(x_shape.Get(), x_shape.size() - 1);
}
virtual const char* VarNameWithXLoD() const { return "XShape"; }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return false;
} }
}; };
class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, default Tensor<float>), a tensor whose last dimension "
"size is equal to the number of classes. This input is a "
"probability computed by the previous operator, which is almost "
"always the result of a softmax operator.");
AddInput(
"Label",
"(Tensor), the tensor which represents the ground truth. It has the "
"same shape with 'X' except the last dimension. One hot Tensor.");
AddOutput("Y",
"(Tensor, default Tensor<float>), a tensor whose shape is same "
"with 'X' except that the last dimension size is 1. It "
"represents the cross entropy loss.");
AddOutput("XShape", "Temporaily variable to save shape and LoD of X.");
AddOutput("MatchX",
"X value that matches label, used for gradient computation.");
AddAttr<int>("ignore_index",
"(int, default -100), Specifies a target value that is"
"ignored and does not contribute to the input gradient."
"Only valid if soft_label is set to False")
.SetDefault(-100);
AddComment(R"DOC(
Hard-label CrossEntropy Operator.
The inputs 'X' and 'Label' are first logically flattened to 2-D matrices.
The matrix's second dimension (row length) is the same as the last dimension of
the original input, and the first dimension (column length) is the product of
all other original dimensions. The cross entropy computation then takes place
on each row of the flattened matrices.
Only hard labels are supported.
Both the input X and Label can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input X.
)DOC");
}
};
class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("cross_entropy_grad2");
op->SetInput("Label", Input("Label"));
op->SetInput("MatchX", Output("MatchX"));
op->SetInput("XShape", Output("XShape"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext; using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
ops::CrossEntropyOpInferVarType, ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
...@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, ...@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
REGISTER_OP_CPU_KERNEL(cross_entropy_grad, REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CPUCtx, float>, ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
ops::CrossEntropyGradientOpKernel<CPUCtx, double>); ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2,
ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType,
ops::CrossEntropyGradOpDescMaker2);
REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2);
REGISTER_OP_CPU_KERNEL(cross_entropy2,
ops::CrossEntropyOpKernel2<CPUCtx, float>,
ops::CrossEntropyOpKernel2<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad2,
ops::CrossEntropyGradientOpKernel2<CPUCtx, float>,
ops::CrossEntropyGradientOpKernel2<CPUCtx, double>);
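CrossEntropyOp2, registered above as cross_entropy2, adds two extra outputs: XShape (the input shape with a trailing 0, used to carry shape and LoD to the backward pass) and MatchX (the picked probability per sample, with the last dimension collapsed to 1), as its InferShape shows. A small sketch of that shape derivation (std::vector<int64_t> stands in for framework::DDim):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {4, 8, 10};  // e.g. [batch, seq, classes]

  // XShape: the input dims with a trailing 0, mirroring CrossEntropyOp2::InferShape.
  std::vector<int64_t> xshape = x_dims;
  xshape.push_back(0);

  // MatchX: same as X except the last dimension becomes 1.
  std::vector<int64_t> match_x = x_dims;
  match_x.back() = 1;

  auto print = [](const char* name, const std::vector<int64_t>& d) {
    std::printf("%s: [", name);
    for (size_t i = 0; i < d.size(); ++i)
      std::printf("%lld%s", static_cast<long long>(d[i]),
                  i + 1 < d.size() ? ", " : "");
    std::printf("]\n");
  };
  print("XShape", xshape);  // [4, 8, 10, 0]
  print("MatchX", match_x); // [4, 8, 1]
  return 0;
}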
...@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>, cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>, ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>); ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(cross_entropy2,
ops::CrossEntropyOpKernel2<CUDACtx, float>,
ops::CrossEntropyOpKernel2<CUDACtx, double>,
ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
...@@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  }
};
template <typename T>
struct HardLabelCrossEntropyForwardFunctor {
HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x,
const int64_t* label,
int64_t ignore_index,
int64_t feature_size)
: x_(x),
y_(y),
match_x_(match_x),
label_(label),
ignore_index_(ignore_index),
feature_size_(feature_size) {}
HOSTDEVICE void operator()(int64_t idx) const {
auto label = label_[idx];
if (label != ignore_index_) {
auto match_x = x_[idx * feature_size_ + label];
y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
match_x_[idx] = match_x;
} else {
y_[idx] = 0;
match_x_[idx] = 0; // any value is ok
}
}
const T* x_;
T* y_;
T* match_x_;
const int64_t* label_;
int64_t ignore_index_;
int64_t feature_size_;
};
template <typename T>
struct HardLabelCrossEntropyBackwardFunctor {
HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x,
const int64_t* label,
int64_t ignore_index,
int64_t feature_size)
: dx_(dx),
dy_(dy),
match_x_(match_x),
label_(label),
ignore_index_(ignore_index),
feature_size_(feature_size) {}
HOSTDEVICE void operator()(int64_t idx) const {
auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_;
auto label = label_[row_idx];
if (label == col_idx && label != ignore_index_) {
dx_[idx] = -dy_[row_idx] / match_x_[row_idx];
} else {
dx_[idx] = 0;
}
}
T* dx_;
const T* dy_;
const T* match_x_;
const int64_t* label_;
int64_t ignore_index_;
int64_t feature_size_;
};
template <typename DeviceContext, typename T>
class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* label = ctx.Input<Tensor>("Label");
auto* y = ctx.Output<Tensor>("Y");
auto* match_x = ctx.Output<Tensor>("MatchX");
auto& x_dims = x->dims();
auto feature_size = x_dims[x_dims.size() - 1];
auto batch_size = framework::product(x->dims()) / feature_size;
auto* p_x = x->data<T>();
auto* p_label = label->data<int64_t>();
auto* p_y = y->mutable_data<T>(ctx.GetPlace());
auto* p_match_x = match_x->mutable_data<T>(ctx.GetPlace());
auto ignore_index = ctx.Attr<int>("ignore_index");
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), batch_size);
for_range(HardLabelCrossEntropyForwardFunctor<T>(
p_x, p_y, p_match_x, p_label, ignore_index, feature_size));
}
};
template <typename DeviceContext, typename T>
class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto* match_x = ctx.Input<Tensor>("MatchX");
auto* label = ctx.Input<Tensor>("Label");
auto* p_dx = dx->mutable_data<T>(ctx.GetPlace());
auto* p_dy = dy->data<T>();
auto* p_match_x = match_x->data<T>();
auto* p_label = label->data<int64_t>();
int64_t ignore_index = ctx.Attr<int>("ignore_index");
int rank = dx->dims().size();
int64_t feature_size = dx->dims()[rank - 1];
int64_t batch_size = framework::product(dx->dims()) / feature_size;
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(),
batch_size * feature_size);
for_range(HardLabelCrossEntropyBackwardFunctor<T>(
p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size));
}
};
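A standalone sketch (not part of the patch) of the per-sample arithmetic the cross_entropy2 forward and backward kernels above implement. The probability vector, label and upstream gradient are made-up illustration values; `x` is assumed to already hold probabilities, which is what the operator expects.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x = {0.1, 0.7, 0.2};  // probabilities of one sample
  int64_t label = 1;                        // hard label index
  double dy = 1.0;                          // upstream gradient of Y

  // Forward: y = -log(x[label]); match_x caches x[label] for the backward.
  double match_x = x[label];
  double y = -std::log(match_x);

  // Backward: dx[j] = -dy / match_x if j == label, otherwise 0.
  std::vector<double> dx(x.size(), 0.0);
  dx[label] = -dy / match_x;

  std::printf("y = %f, dx = [%f %f %f]\n", y, dx[0], dx[1], dx[2]);
  return 0;
}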
}  // namespace operators
}  // namespace paddle
...@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string &type) {
  if (type == "encode_center_size") {
    return BoxCodeType::kEncodeCenterSize;
  } else if (type == "decode_center_size") {
...@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
 public:
  void EncodeCenterSize(const framework::Tensor *target_box,
                        const framework::Tensor *prior_box,
                        const framework::Tensor *prior_box_var,
                        const bool normalized,
                        const std::vector<float> variance, T *output) const {
    int64_t row = target_box->dims()[0];
    int64_t col = prior_box->dims()[0];
    int64_t len = prior_box->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
size_t offset = i * col * len + j * len;
T prior_box_width = prior_box_data[j * len + 2] - T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false); prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] - T prior_box_height = prior_box_data[j * len + 3] -
...@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data[i * len + 1] + target_box_data[i * len + 1] +
(normalized == false); (normalized == false);
size_t offset = i * col * len + j * len;
output[offset] = output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width; (target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] = output[offset + 1] =
...@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std::log(std::fabs(target_box_width / prior_box_width)); std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] = output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)); std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) { }
int prior_var_offset = j * len; }
output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; if (prior_box_var) {
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; const T *prior_box_var_data = prior_box_var->data<T>();
output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; #ifdef PADDLE_WITH_MKLML
} else if (!(variance.empty())) { #pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= static_cast<T>(variance[k]); output[offset + k] /= static_cast<T>(variance[k]);
} }
} }
} }
} }
} }
template <int axis, int var_size> template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box, void DecodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor* prior_box, const framework::Tensor *prior_box,
const framework::Tensor* prior_box_var, const framework::Tensor *prior_box_var,
const bool normalized, std::vector<float> variance, const bool normalized, std::vector<float> variance,
T* output) const { T *output) const {
int64_t row = target_box->dims()[0]; int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1]; int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2]; int64_t len = target_box->dims()[2];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
T var_data[4] = {1., 1., 1., 1.};
T *var_ptr = var_data;
size_t offset = i * col * len + j * len; size_t offset = i * col * len + j * len;
prior_box_offset = axis == 0 ? j * len : i * len; int prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] - T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] + prior_box_data[prior_box_offset] +
(normalized == false); (normalized == false);
...@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_width = 0, target_box_height = 0; T target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len; int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) { if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
4 * sizeof(T)); 4 * sizeof(T));
} else if (var_size == 1) { } else if (var_size == 1) {
var_ptr = reinterpret_cast<T*>(variance.data()); var_ptr = reinterpret_cast<T *>(variance.data());
} }
T box_var_x = *var_ptr; T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1); T box_var_y = *(var_ptr + 1);
...@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
} }
} }
  void Compute(const framework::ExecutionContext &context) const override {
    auto *prior_box = context.Input<framework::Tensor>("PriorBox");
    auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto *output_box = context.Output<framework::Tensor>("OutputBox");
    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
    const int axis = context.Attr<int>("axis");
    if (target_box->lod().size()) {
...@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
    T *output = output_box->data<T>();
    if (code_type == BoxCodeType::kEncodeCenterSize) {
      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
                       variance, output);
......
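For reference, a minimal standalone sketch of the center-size encoding that EncodeCenterSize computes for one target/prior pair, assuming normalized boxes and a per-coordinate `variance` attribute; the box values are made up.

#include <cmath>
#include <cstdio>

int main() {
  double p[4] = {0.1, 0.1, 0.5, 0.5};  // prior box  [xmin, ymin, xmax, ymax]
  double t[4] = {0.2, 0.2, 0.6, 0.7};  // target box [xmin, ymin, xmax, ymax]
  double variance[4] = {0.1, 0.1, 0.2, 0.2};

  double pw = p[2] - p[0], ph = p[3] - p[1];
  double pcx = p[0] + pw / 2, pcy = p[1] + ph / 2;
  double tw = t[2] - t[0], th = t[3] - t[1];
  double tcx = t[0] + tw / 2, tcy = t[1] + th / 2;

  // Offsets of the target center relative to the prior, plus log size ratios,
  // each divided by its variance entry.
  double out[4] = {(tcx - pcx) / pw, (tcy - pcy) / ph,
                   std::log(std::fabs(tw / pw)), std::log(std::fabs(th / ph))};
  for (int k = 0; k < 4; ++k) out[k] /= variance[k];

  std::printf("encoded: %f %f %f %f\n", out[0], out[1], out[2], out[3]);
  return 0;
}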
...@@ -10,6 +10,7 @@
limitations under the License. */
#include "paddle/fluid/operators/detection/yolov3_loss_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
...@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_GT(class_num, 0,
                      "Attr(class_num) should be an integer greater then 0.");
if (ctx->HasInput("GTScore")) {
auto dim_gtscore = ctx->GetInputDim("GTScore");
PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
"Input(GTScore) should be a 2-D tensor");
PADDLE_ENFORCE_EQ(
dim_gtscore[0], dim_gtbox[0],
"Input(GTBox) and Input(GTScore) dim[0] should be same");
PADDLE_ENFORCE_EQ(
dim_gtscore[1], dim_gtbox[1],
"Input(GTBox) and Input(GTScore) dim[1] should be same");
}
    std::vector<int64_t> dim_out({dim_x[0]});
    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
...@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
             "This is a 2-D tensor with shape of [N, max_box_num], "
             "and each element should be an integer to indicate the "
             "box class id.");
AddInput("GTScore",
"The score of GTLabel, This is a 2-D tensor in same shape "
"GTLabel, and score values should in range (0, 1). This "
"input is for GTLabel score can be not 1.0 in image mixup "
"augmentation.")
.AsDispensable();
AddOutput("Loss", AddOutput("Loss",
"The output yolov3 loss tensor, " "The output yolov3 loss tensor, "
"This is a 1-D tensor with shape of [N]"); "This is a 1-D tensor with shape of [N]");
...@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("ignore_thresh", AddAttr<float>("ignore_thresh",
"The ignore threshold to ignore confidence loss.") "The ignore threshold to ignore confidence loss.")
.SetDefault(0.7); .SetDefault(0.7);
AddAttr<bool>("use_label_smooth",
"Whether to use label smooth. Default True.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
This operator generates yolov3 loss based on given predict result and ground This operator generates yolov3 loss based on given predict result and ground
truth boxes. truth boxes.
...@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
loss = (loss_{xy} + loss_{wh}) * weight_{box} loss = (loss_{xy} + loss_{wh}) * weight_{box}
+ loss_{conf} + loss_{class} + loss_{conf} + loss_{class}
$$ $$
When :attr:`use_label_smooth` is set to :attr:`True`, the classification
target will be smoothed when calculating the classification loss: the target of
positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and the
target of negative samples will be smoothed to :math:`1.0 / class\_num`.

When :attr:`GTScore` is given, i.e. the mixup score of the ground truth
boxes, all losses incurred by a ground truth box will be multiplied by its
mixup score.
)DOC"); )DOC");
} }
}; };
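A small sketch (illustration only, with made-up numbers) of the two knobs the maker above adds: the smoothed classification targets derived from class_num, and the per-box mixup score that scales every loss term of that box.

#include <cstdio>

int main() {
  int class_num = 80;
  bool use_label_smooth = true;
  double score = 0.7;  // mixup score of one ground truth box

  double label_pos = use_label_smooth ? 1.0 - 1.0 / class_num : 1.0;
  double label_neg = use_label_smooth ? 1.0 / class_num : 0.0;

  // Any sigmoid cross entropy term produced for this box is multiplied by
  // its mixup score before being accumulated into the loss.
  double some_term = 0.693;
  double weighted_term = some_term * score;

  std::printf("pos target = %f, neg target = %f, weighted term = %f\n",
              label_pos, label_neg, weighted_term);
  return 0;
}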
...@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
    op->SetInput("X", Input("X"));
    op->SetInput("GTBox", Input("GTBox"));
    op->SetInput("GTLabel", Input("GTLabel"));
    op->SetInput("GTScore", Input("GTScore"));
    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
    op->SetInput("GTMatchMask", Output("GTMatchMask"));
...@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetOutput(framework::GradVarName("GTBox"), {});
    op->SetOutput(framework::GradVarName("GTLabel"), {});
    op->SetOutput(framework::GradVarName("GTScore"), {});
    return std::unique_ptr<framework::OpDesc>(op);
  }
};
......
...@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) {
}

template <typename T>
static T L1Loss(T x, T y) {
  return std::abs(y - x);
}

template <typename T>
...@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) {
}

template <typename T>
static T L1LossGrad(T x, T y) {
  return x > y ? 1.0 : -1.0;
}
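The patch swaps the squared (L2) penalty on the tw/th regression targets for an absolute (L1) one. A tiny standalone comparison of the two losses and their gradients with respect to x, using arbitrary values:

#include <cmath>
#include <cstdio>

static double L2Loss(double x, double y) { return 0.5 * (y - x) * (y - x); }
static double L2LossGrad(double x, double y) { return x - y; }
static double L1Loss(double x, double y) { return std::abs(y - x); }
static double L1LossGrad(double x, double y) { return x > y ? 1.0 : -1.0; }

int main() {
  double x = 1.5, y = 1.0;  // prediction and target
  std::printf("L2: loss = %f, grad = %f\n", L2Loss(x, y), L2LossGrad(x, y));
  std::printf("L1: loss = %f, grad = %f\n", L1Loss(x, y), L1LossGrad(x, y));
  return 0;
}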
static int GetMaskIndex(std::vector<int> mask, int val) {
...@@ -121,47 +121,49 @@ template <typename T>
static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
                                std::vector<int> anchors, int an_idx,
                                int box_idx, int gi, int gj, int grid_size,
                                int input_size, int stride, T score) {
  T tx = gt.x * grid_size - gi;
  T ty = gt.y * grid_size - gj;
  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);

  T scale = (2.0 - gt.w * gt.h) * score;
  loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
  loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
  loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
  loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
}
template <typename T>
static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
                                    Box<T> gt, std::vector<int> anchors,
                                    int an_idx, int box_idx, int gi, int gj,
                                    int grid_size, int input_size, int stride,
                                    T score) {
  T tx = gt.x * grid_size - gi;
  T ty = gt.y * grid_size - gj;
  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);

  T scale = (2.0 - gt.w * gt.h) * score;
  input_grad[box_idx] =
      SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
  input_grad[box_idx + stride] =
      SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
  input_grad[box_idx + 2 * stride] =
      L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
  input_grad[box_idx + 3 * stride] =
      L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
}
template <typename T>
static inline void CalcLabelLoss(T* loss, const T* input, const int index,
                                 const int label, const int class_num,
                                 const int stride, const T pos, const T neg,
                                 T score) {
  for (int i = 0; i < class_num; i++) {
    T pred = input[index + i * stride];
    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
  }
}

...@@ -169,11 +171,13 @@ template <typename T>
static inline void CalcLabelLossGrad(T* input_grad, const T loss,
                                     const T* input, const int index,
                                     const int label, const int class_num,
                                     const int stride, const T pos, const T neg,
                                     T score) {
  for (int i = 0; i < class_num; i++) {
    T pred = input[index + i * stride];
    input_grad[index + i * stride] =
        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
        loss;
  }
}
...@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
      for (int l = 0; l < w; l++) {
        T obj = objness[k * w + l];
        if (obj > 1e-5) {
          // positive sample: obj = mixup score
          loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
        } else if (obj > -0.5) {
          // negative sample: obj = 0
          loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
...@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
        T obj = objness[k * w + l];
        if (obj > 1e-5) {
          input_grad[k * w + l] =
              SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj *
              loss[i];
        } else if (obj > -0.5) {
          input_grad[k * w + l] =
              SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
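A sketch of how an objectness-mask entry is read by the two loops above; the third branch is inferred from the thresholds, so treat it as an assumption: matched anchors store the mixup score, plain negatives store 0, and entries pushed below -0.5 are skipped.

#include <cstdio>

int main() {
  double objs[3] = {0.7, 0.0, -1.0};
  for (double obj : objs) {
    if (obj > 1e-5) {
      std::printf("obj=%4.1f -> positive, CE against 1.0, weighted by %.1f\n",
                  obj, obj);
    } else if (obj > -0.5) {
      std::printf("obj=%4.1f -> negative, CE against 0.0\n", obj);
    } else {
      std::printf("obj=%4.1f -> ignored, contributes no loss\n", obj);
    }
  }
  return 0;
}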
...@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("X");
    auto* gt_box = ctx.Input<Tensor>("GTBox");
    auto* gt_label = ctx.Input<Tensor>("GTLabel");
    auto* gt_score = ctx.Input<Tensor>("GTScore");
    auto* loss = ctx.Output<Tensor>("Loss");
    auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
    auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
...@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    int class_num = ctx.Attr<int>("class_num");
    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
    const int n = input->dims()[0];
    const int h = input->dims()[2];
...@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    const int stride = h * w;
    const int an_stride = (class_num + 5) * stride;
T label_pos = 1.0;
T label_neg = 0.0;
if (use_label_smooth) {
label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
label_neg = 1.0 / static_cast<T>(class_num);
}
    const T* input_data = input->data<T>();
    const T* gt_box_data = gt_box->data<T>();
    const int* gt_label_data = gt_label->data<int>();
...@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    int* gt_match_mask_data =
        gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
    // GTScore defaults to 1.0 for every box when it is not provided; the
    // default tensor is declared at function scope so its buffer stays alive
    // while gt_score_data is used below.
    const T* gt_score_data;
    Tensor gtscore;
    if (!gt_score) {
      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
      math::SetConstant<platform::CPUDeviceContext, T>()(
          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
          static_cast<T>(1.0));
      gt_score = &gtscore;
      gt_score_data = gtscore.data<T>();
    } else {
      gt_score_data = gt_score->data<T>();
    }
// calc valid gt box mask, avoid calc duplicately in following code // calc valid gt box mask, avoid calc duplicately in following code
Tensor gt_valid_mask; Tensor gt_valid_mask;
bool* gt_valid_mask_data = bool* gt_valid_mask_data =
...@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
int mask_idx = GetMaskIndex(anchor_mask, best_n); int mask_idx = GetMaskIndex(anchor_mask, best_n);
gt_match_mask_data[i * b + t] = mask_idx; gt_match_mask_data[i * b + t] = mask_idx;
if (mask_idx >= 0) { if (mask_idx >= 0) {
T score = gt_score_data[i * b + t];
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0); an_stride, stride, 0);
CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n, CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
box_idx, gi, gj, h, input_size, stride); box_idx, gi, gj, h, input_size, stride, score);
int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
obj_mask_data[obj_idx] = 1.0; obj_mask_data[obj_idx] = score;
int label = gt_label_data[i * b + t]; int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5); an_stride, stride, 5);
CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label, CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
class_num, stride); class_num, stride, label_pos, label_neg, score);
} }
} }
} }
...@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox"); auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel"); auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* gt_score = ctx.Input<Tensor>("GTScore");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X")); auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss")); auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask"); auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
...@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask"); auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
const int n = input_grad->dims()[0]; const int n = input_grad->dims()[0];
const int c = input_grad->dims()[1]; const int c = input_grad->dims()[1];
...@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
const int stride = h * w; const int stride = h * w;
const int an_stride = (class_num + 5) * stride; const int an_stride = (class_num + 5) * stride;
T label_pos = 1.0;
T label_neg = 0.0;
if (use_label_smooth) {
label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
label_neg = 1.0 / static_cast<T>(class_num);
}
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* gt_box_data = gt_box->data<T>(); const T* gt_box_data = gt_box->data<T>();
const int* gt_label_data = gt_label->data<int>(); const int* gt_label_data = gt_label->data<int>();
...@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace()); input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
    // Same default-GTScore handling as in the forward kernel: keep the
    // all-ones tensor at function scope so gt_score_data remains valid.
    const T* gt_score_data;
    Tensor gtscore;
    if (!gt_score) {
      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
      math::SetConstant<platform::CPUDeviceContext, T>()(
          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
          static_cast<T>(1.0));
      gt_score = &gtscore;
      gt_score_data = gtscore.data<T>();
    } else {
      gt_score_data = gt_score->data<T>();
    }
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
for (int t = 0; t < b; t++) { for (int t = 0; t < b; t++) {
int mask_idx = gt_match_mask_data[i * b + t]; int mask_idx = gt_match_mask_data[i * b + t];
if (mask_idx >= 0) { if (mask_idx >= 0) {
T score = gt_score_data[i * b + t];
Box<T> gt = GetGtBox(gt_box_data, i, b, t); Box<T> gt = GetGtBox(gt_box_data, i, b, t);
int gi = static_cast<int>(gt.x * w); int gi = static_cast<int>(gt.x * w);
int gj = static_cast<int>(gt.y * h); int gj = static_cast<int>(gt.y * h);
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0); an_stride, stride, 0);
CalcBoxLocationLossGrad<T>( CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
input_grad_data, loss_grad_data[i], input_data, gt, anchors, input_data, gt, anchors,
anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); anchor_mask[mask_idx], box_idx, gi, gj, h,
input_size, stride, score);
int label = gt_label_data[i * b + t]; int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5); an_stride, stride, 5);
CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data, CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
label_idx, label, class_num, stride); label_idx, label, class_num, stride, label_pos,
label_neg, score);
} }
} }
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/expand_op.h"
#include <memory>
#include <vector>
namespace paddle {
...@@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel {
  }
};
class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("expand_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
                  ops::ExpandGradOpDescMaker);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
REGISTER_OP_CPU_KERNEL(
    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
...@@ -76,6 +77,63 @@ $$Out = \frac{scale*X}{ max_range }$$
  }
};
class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasInputs("Scales"),
"Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class FakeChannelWiseDequantizeMaxAbsOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input with float-32/64 type is the "
"low precision tensor.");
AddInput("Scales",
"(Tensors) The scales in quantization stage. "
"Now, `Scales` is a vector with at most two tensors. "
"If Scales has two elements, the second tensor should only have "
"one value.")
.AsDuplicable();
AddOutput("Out",
"(Tensor) The output is the dequantized high "
"precision tensor.");
AddAttr<std::vector<int>>(
"quant_bits",
"Quantization bit numbers in quantization stage. "
"The size of `quant_bits` should be equal to the size of `Scales`.")
.SetDefault({8});
AddComment(R"DOC(
FakeChannelWiseDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$.
Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula.
Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization.
)DOC");
}
};
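A worked numeric sketch of the dequantization formula in the DOC string above for a single channel with one 8-bit scale (values are made up):

#include <cmath>
#include <cstdio>

int main() {
  int quant_bits = 8;
  double max_range = std::pow(2, quant_bits - 1) - 1;  // 127
  double scale_c = 0.5;  // abs-max of this channel before quantization
  double quantized[3] = {127, -64, 32};

  // Out_c = X_c * Scale_c / max_range
  for (double q : quantized) {
    std::printf("%4.0f -> %f\n", q, q * scale_c / max_range);
  }
  return 0;
}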
}  // namespace operators
}  // namespace paddle
...@@ -88,3 +146,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
                       ops::FakeDequantizeMaxAbsKernel<CPU, float>,
                       ops::FakeDequantizeMaxAbsKernel<CPU, double>);
REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsOp,
ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
...@@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
                        ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
                        ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
...@@ -45,5 +46,42 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
  }
};
template <typename DeviceContext, typename T>
class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>("X");
auto scales = ctx.MultiInput<framework::Tensor>("Scales");
auto* out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
"The number of first scale values must be the same with "
"first dimension value of Input(X).");
auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
int max_range = std::pow(2, quant_bits[0] - 1) - 1;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
auto dequant = DequantizeFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
dequant(dev_ctx, &one_channel_in, &one_channel_scale,
static_cast<T>(max_range), &one_channel_out);
}
if (scales.size() == 2) {
PADDLE_ENFORCE_EQ(
scales[1]->numel(), 1,
"The second scale tensor should only have one value at now.");
max_range = std::pow(2, quant_bits[1] - 1) - 1;
dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
}
}
};
}  // namespace operators
}  // namespace paddle
...@@ -134,6 +134,60 @@ $$Out = round(X/scale * range)$$
  }
};
class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutScales"),
"Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScales", "(Tensor) Current channel wise scale");
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c})$$
In above three formulas, the range value of c is as follow:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
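A minimal sketch of the per-channel abs-max quantization described in the DOC string above, for one channel and bit_length = 8 (values are made up):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> x_c = {0.5, -0.25, 0.1};  // one channel of X
  int bit_length = 8;
  double range = std::pow(2, bit_length - 1) - 1;  // 127

  // scale_c = max(abs(X_c))
  double scale_c = 0.0;
  for (double v : x_c) scale_c = std::max(scale_c, std::abs(v));

  // Out_c = round(X_c * range / scale_c)
  for (double v : x_c) {
    std::printf("%6.2f -> %4.0f\n", v, std::round(v * range / scale_c));
  }
  return 0;
}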
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
 public:
  FakeQuantizeRangeAbsMaxOp(const std::string& type,
...@@ -218,3 +272,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
                       ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
...@@ -174,5 +174,7 @@ namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
                        ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
                        ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
...@@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
  }
};
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scales = context.Output<framework::Tensor>("OutScales");
T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel = in->Slice(i, i + 1);
const T* one_channel_data = one_channel.data<T>();
find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
&out_scales_data[i]);
}
auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
&one_channel_out);
}
}
};
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
 public:
......
...@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    if (ctx->IsRuntime()) {
      return;
    }
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
...@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(boolean, default false) "
              "Sparse update.")
        .SetDefault(false);
    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
        .SetDefault(true);
    AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.
......
...@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      auto *ids = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
      // runtime shape
      d_table->set_height(table_dim[0]);
      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
......
...@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}
  void InferShape(framework::InferShapeContext *ctx) const override {
    if (ctx->IsRuntime()) {
      return;
    }
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of HashOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC");
    AddAttr<int>("num_hash", "").SetDefault(1);
    AddAttr<int>("mod_by", "").SetDefault(100000);
    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
        .SetDefault(true);
  }
};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "math.h" // NOLINT
namespace paddle {
namespace operators {
inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
return static_cast<platform::float16>(::expf(static_cast<float>(x)));
}
inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }
inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }
inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
return static_cast<platform::float16>(::logf(static_cast<float>(x)));
}
inline HOSTDEVICE float real_log(float x) { return ::logf(x); }
inline HOSTDEVICE double real_log(double x) { return ::log(x); }
} // namespace operators
} // namespace paddle
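The header above exists so that float16 inputs get promoted to float, computed with the single-precision libm routine, and narrowed back. A standalone illustration of that overload pattern with a stand-in 16-bit wrapper (`half` here is a toy struct, not platform::float16):

#include <cmath>
#include <cstdio>

struct half {
  float v;
  explicit half(float f) : v(f) {}
  explicit operator float() const { return v; }
};

inline float real_log(float x) { return std::log(x); }
inline half real_log(half x) { return half(std::log(static_cast<float>(x))); }

int main() {
  half h(2.7182818f);
  std::printf("log(e) ~= %f\n", static_cast<float>(real_log(h)));
  return 0;
}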
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
...@@ -20,17 +21,6 @@ namespace paddle {
namespace operators {
namespace math {
namespace {
__device__ __forceinline__ float real_log(float x) { return logf(x); }
__device__ __forceinline__ double real_log(double x) { return log(x); }
__device__ __forceinline__ platform::float16 real_log(
    const platform::float16& val) {
  return static_cast<platform::float16>(logf(static_cast<float>(val)));
}
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                   const int N, const int D,
...@@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
    Y[blockIdx.x] = -val;
  }
}
}  // namespace
template <typename T>
class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
......
...@@ -29,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
...@@ -42,44 +41,75 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
  for (int i = 0; i < dims.size(); ++i) {
    int k = dims[i];
    k = k == 0 ? 1 : k;
    sp.emplace_back(k);
  }
  return sp;
}
static framework::DDim Shape2Ddim(const ngraph::Shape& shape) {
std::vector<int64_t> dims;
for (size_t i = 0; i < shape.size(); ++i) {
int64_t k = shape[i];
dims.emplace_back(k);
}
return framework::make_ddim(dims);
}
static std::map<framework::proto::VarType::Type, ngraph::element::Type> static std::map<framework::proto::VarType::Type, ngraph::element::Type>
pd2ng_type_map = { pd2ng_type_map = {
{framework::proto::VarType::FP32, ngraph::element::f32}, {framework::proto::VarType::FP32, ngraph::element::f32},
{framework::proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::FP64, ngraph::element::f64},
{framework::proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::INT32, ngraph::element::i32},
{framework::proto::VarType::INT64, ngraph::element::i64}, {framework::proto::VarType::INT64, ngraph::element::i64},
{framework::proto::VarType::BOOL, ngraph::element::boolean}, {framework::proto::VarType::BOOL, ngraph::element::boolean}};
};
static std::map<ngraph::element::Type, framework::proto::VarType::Type>
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> ng2pd_type_map = {
NgraphEngine::func_cache_ = {}; {ngraph::element::f32, framework::proto::VarType::FP32},
{ngraph::element::f64, framework::proto::VarType::FP64},
{ngraph::element::i32, framework::proto::VarType::INT32},
{ngraph::element::i64, framework::proto::VarType::INT64},
{ngraph::element::boolean, framework::proto::VarType::BOOL}};
std::vector<std::string> NgraphEngine::feed_vars = {};
std::vector<std::string> NgraphEngine::fetch_vars = {};
framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
std::unordered_map<std::string,
std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
NgraphEngine::t_in_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ = std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU"); ngraph::runtime::Backend::create("CPU");
static std::vector<std::vector<int>> NgraphOpIntervals( static std::vector<std::vector<int>> NgraphOpIntervals(
framework::BlockDesc* block) { std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
NgraphEngine::feed_vars.clear();
NgraphEngine::fetch_vars.clear();
std::vector<std::vector<int>> intervals; std::vector<std::vector<int>> intervals;
auto ops = block->AllOps();
int size = ops.size(); int size = ops->size();
int left = 0; int left = 0;
while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { while (left < size && ops->at(left)->Type() != framework::kFeedOpType) {
++left; ++left;
} }
if (left == size) { if (left == size) {
return intervals; return intervals;
} }
while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
for (auto& var_name_item : ops->at(left)->Outputs()) {
for (auto& var_name : var_name_item.second) {
NgraphEngine::feed_vars.emplace_back(var_name);
}
}
++left; ++left;
} }
int right = left; int right = left;
while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
++right; ++right;
} }
if (right == size) { if (right == size) {
...@@ -87,85 +117,124 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -87,85 +117,124 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
} }
if (left >= right) return intervals; if (left >= right) return intervals;
int index = right;
while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
for (auto& var_name_item : ops->at(index)->Inputs()) {
for (auto& var_name : var_name_item.second) {
NgraphEngine::fetch_vars.emplace_back(var_name);
}
}
++index;
}
// (left, right - 1) represents indices between feed and fetch // (left, right - 1) represents indices between feed and fetch
int pivot = left; int pivot = left;
while (pivot < right) { while (pivot < right) {
auto op_type = ops.at(pivot)->Type(); auto op_type = ops->at(pivot)->Type();
if (NgraphBridge::isRegister(op_type)) { if (NgraphBridge::isRegister(op_type)) {
++pivot; ++pivot;
} else { } else {
int start = pivot, end = start; int start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(!NgraphBridge::isRegister(ops.at(pivot)->Type()))) { (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) {
++pivot; ++pivot;
++end; ++end;
} }
std::vector<int> interval = {start, end}; std::vector<int> interval = {start, end};
intervals.push_back(interval); intervals.emplace_back(interval);
} }
} // end while } // end while
return intervals; return intervals;
} }
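// Replace the ops in [interval[0], interval[1]) with a single ngraph_engine op
// carrying the interval, the engine key and the serialized graph as attributes.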
static void SubstituteNgraphOp(framework::BlockDesc* block, static void SubstituteNgraphOp(
std::string block_str, std::vector<std::unique_ptr<framework::OperatorBase>>* ops,
std::vector<int> interval) { std::string engine_key, std::string block_str, std::vector<int> interval) {
framework::ProgramDesc program; framework::OpDesc ng_op_desc(nullptr);
block->RemoveOp(interval.at(0), interval.at(1)); ng_op_desc.SetType("ngraph_engine");
auto* ng_op = block->InsertOp(interval.at(0)); ng_op_desc.SetAttr("interval", interval);
ng_op->SetType("ngraph_engine"); ng_op_desc.SetAttr("engine_key", engine_key);
ng_op->SetAttr("interval", interval); ng_op_desc.SetAttr("graph", block_str);
ng_op->SetAttr("graph", block_str);
ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]);
ops->insert(ops->begin() + interval[0],
framework::OpRegistry::CreateOp(ng_op_desc));
} }
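// Serialize a list of op descs into a standalone BlockDesc proto string.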
// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 std::string SerializedBlock(const std::vector<framework::OpDesc*>& op_descs) {
void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { framework::proto::BlockDesc block_proto;
#ifdef PADDLE_WITH_NGRAPH framework::BlockDesc block_desc(nullptr, &block_proto);
VLOG(4) << "use_ngraph=True"; block_desc.Proto()->set_parent_idx(-1);
for (size_t bid = 0; bid < program.Size(); ++bid) { block_desc.Proto()->set_idx(0);
// TODO(baojun-nervana): Remove the const_cast
auto* block = for (auto* op_desc : op_descs) {
const_cast<framework::ProgramDesc&>(program).MutableBlock(bid); auto* op = block_desc.AppendOp();
std::string block_str = block->Proto()->SerializeAsString(); *op->Proto() = *op_desc->Proto();
auto intervals = NgraphOpIntervals(block); }
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { return block_desc.Proto()->SerializeAsString();
SubstituteNgraphOp(block, block_str, *it); }
}
std::string GenerateEngineKey(const framework::BlockDesc& bdesc) {
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
for (auto& op_desc : bdesc.AllOps()) {
auto* op = block_desc.AppendOp();
*op->Proto() = *op_desc->Proto();
}
auto engine_key = std::to_string(
std::hash<std::string>()(block_desc.Proto()->SerializeAsString()));
return engine_key;
}
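// Engine key: hash of the concatenated input/output variable names plus the op count.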
std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
const std::vector<std::string>& engine_outputs,
int size) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += std::to_string(size);
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
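// Record the enclosing block, locate the op intervals ngraph can handle, and fuse
// each interval (in reverse order so earlier indices stay valid) into an ngraph_engine op.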
void NgraphEngine::FuseNgraphOps(
const framework::BlockDesc& block_desc,
std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
NgraphEngine::p_bdesc = &block_desc;
auto intervals = NgraphOpIntervals(ops);
std::string engine_key =
GenerateEngineKey(feed_vars, fetch_vars, ops->size());
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
SubstituteNgraphOp(ops, engine_key, "", *it);
} }
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
} }
NgraphEngine::NgraphEngine(const framework::Scope& scope, NgraphEngine::NgraphEngine(const framework::Scope& scope,
const platform::Place& place, const platform::Place& place,
const std::string& serialized_graph, const framework::ExecutionContext& ctx)
const std::vector<int>& interval)
: scope_(scope), place_(place) { : scope_(scope), place_(place) {
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
std::string engine_key = ctx.Attr<std::string>("engine_key");
var_in_node_map_ = std::make_shared< var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>(); std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared< var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>(); std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + GetNgFunction(engine_key, interval);
serialized_graph;
framework::proto::BlockDesc bdesc;
bdesc.ParseFromString(serialized_graph);
framework::BlockDesc block(nullptr, &bdesc);
Prepare(block, interval);
BuildNgIO();
GetNgFunction();
} }
void NgraphEngine::Prepare(const framework::BlockDesc& block, void NgraphEngine::Prepare(const std::vector<int>& interval) {
const std::vector<int>& interval) { for (auto& var : p_bdesc->AllVars()) {
for (auto& var : block.AllVars()) {
if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
var->GetType() == framework::proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR ||
var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
...@@ -192,108 +261,57 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block, ...@@ -192,108 +261,57 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block,
} }
} }
auto ops_desc = block.AllOps(); std::vector<paddle::framework::OpDesc*> ops_desc;
int idx = interval[0]; for (auto op_desc : p_bdesc->AllOps()) {
while (idx < interval[1]) { ops_desc.emplace_back(op_desc);
auto op_desc = ops_desc.at(idx);
auto op = framework::OpRegistry::CreateOp(*op_desc);
fused_ops_.push_back(std::move(op));
++idx;
}
while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
auto op_desc = ops_desc.at(idx);
for (auto& var_name_item : op_desc->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
++idx;
}
while (idx < static_cast<int>(ops_desc.size()) &&
ops_desc.at(idx)->Type() == framework::kFetchOpType) {
std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
fetches_.insert(fetch_target_name);
++idx;
}
if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
ng_op_state_ = OpState::FULL;
} }
for (auto* op_desc : ops_desc) { for (auto op_desc : ops_desc) {
if (op_desc->Type().find("_grad") != std::string::npos) { if (op_desc->Type().find("_grad") != std::string::npos) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN this->is_test_ = false;
: OpState::PARTIAL_TRAIN;
break; break;
} }
} }
if (ng_op_state_ != OpState::FULL_TRAIN && if (interval[0] > 0 &&
ng_op_state_ != OpState::PARTIAL_TRAIN) { ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST interval[1] < static_cast<int>(ops_desc.size()) &&
: OpState::PARTIAL_TEST; ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
this->op_state_ = OpState::FULL;
} }
}
void NgraphEngine::GetNgInputShape( if (this->op_state_ == OpState::FULL) {
std::shared_ptr<framework::OperatorBase> op) { this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); } else {
op->RuntimeInferShape(scope_, place_, ctx); this->op_state_ =
for (auto& var_name_item : op->Inputs()) { this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN;
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
// auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
} }
}
void NgraphEngine::BuildNgNodes() { int idx = interval[0];
for (auto& op : fused_ops_) { while (idx < interval[1]) {
for (auto& var_name_item : op->Outputs()) { this->fused_ops_.emplace_back(
framework::OpRegistry::CreateOp(*(ops_desc[idx])));
++idx;
}
while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
auto op_desc = ops_desc.at(idx);
for (auto& var_name_item : op_desc->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
if (var_node_map_->find(var_name) == var_node_map_->end()) { this->post_op_inputs_.insert(var_name);
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
} }
} }
++idx;
} }
NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) { BuildNgIO(ops_desc, interval);
ngb.BuildNgNode(op);
}
} }
void NgraphEngine::BuildNgIO() { void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
const std::vector<int>& interval) {
std::unordered_set<std::string> inputs; std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs; std::unordered_set<std::string> outputs;
for (int i = interval[0]; i < interval[1]; ++i) {
for (auto& op : fused_ops_) { auto op = ops_desc[i];
for (auto& var_name_item : op->Inputs()) { for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
inputs.insert(var_name); inputs.insert(var_name);
...@@ -302,15 +320,11 @@ void NgraphEngine::BuildNgIO() { ...@@ -302,15 +320,11 @@ void NgraphEngine::BuildNgIO() {
std::find(var_in_.begin(), var_in_.end(), var_name) == std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) { var_in_.end()) {
// fill var_in here to keep lhs and rhs order // fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name); this->var_in_.emplace_back(var_name);
} }
} }
} }
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) { for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet", "op %s has more than 1 output - Not handling yet",
...@@ -322,172 +336,278 @@ void NgraphEngine::BuildNgIO() { ...@@ -322,172 +336,278 @@ void NgraphEngine::BuildNgIO() {
} }
// var_out.clear(); // var_out.clear();
for (auto& op : fused_ops_) { for (int i = interval[0]; i < interval[1]; ++i) {
auto op = ops_desc[i];
for (auto& var_name_item : op->Outputs()) { for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet", "op %s has more than 1 output - Not handling yet",
op->Type()); op->Type());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) { switch (this->op_state_) {
case OpState::PARTIAL_TEST: case OpState::PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) { find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
var_out_.push_back(var_name); fetch_vars.end()) {
this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::FULL_TEST: case OpState::FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) { if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
var_out_.push_back(var_name); fetch_vars.end()) {
this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::PARTIAL_TRAIN: case OpState::PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
fetch_vars.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::FULL_TRAIN: case OpState::FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
fetch_vars.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
break; break;
default: default:
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
} }
} }
} }
for (size_t i = 0; i < var_in_.size(); ++i) {
auto var_name = var_in_[i];
if (persistables_.find(var_name) == persistables_.end()) {
var_in_updates_.emplace_back(i);
}
}
} }
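// Create ngraph Parameter nodes for the engine inputs from the runtime tensor shapes.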
void NgraphEngine::BuildNgFunction() { void NgraphEngine::GetNgInputShape() {
for (auto& var_name : var_in_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
auto ng_type = var_type_map_[var_name];
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
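// Create Parameter nodes for output vars not mapped yet, then let NgraphBridge
// build an ngraph node for every fused op.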
void NgraphEngine::BuildNgNodes() {
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
for (auto& var_name : var_name_item.second) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_[var_name];
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
}
}
NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
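// Run paddle's RuntimeInferShape on every fused op so output tensors get their
// shapes before the ngraph nodes are built.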
void NgraphEngine::RunInferShape() {
for (auto& op : fused_ops_) {
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
}
}
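// Assemble the ngraph Function: prepare ops/vars for the interval, infer shapes,
// create input parameters, build the nodes, then wire up the output and input vectors.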
void NgraphEngine::BuildNgFunction(const std::vector<int>& interval) {
Prepare(interval);
RunInferShape();
GetNgInputShape();
BuildNgNodes(); BuildNgNodes();
ngraph_function_ = nullptr; ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs; ngraph::NodeVector func_outputs;
ngraph::ParameterVector func_inputs; ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) { for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo)); func_outputs.emplace_back(var_node_map_->at(vo));
} }
for (auto& vi : var_in_) { for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm = std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>( std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi)); var_in_node_map_->at(vi));
func_inputs.push_back(prm); func_inputs.emplace_back(prm);
} }
ngraph_function_ = ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs); std::make_shared<ngraph::Function>(func_outputs, func_inputs);
} }
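// Compute a cache key from the feed tensor shapes, the interval and the engine key;
// reuse the cached ngraph function while the persistable variables are unchanged,
// otherwise rebuild it and refresh the cache.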
void NgraphEngine::GetNgFunction() { void NgraphEngine::GetNgFunction(std::string engine_key,
bool cache_on = true; const std::vector<int>& interval) {
if (cache_on) { bool use_cache = true;
std::string input_shape_str; if (use_cache) {
for (auto& var_name : var_in_) { this->func_cache_key_ = "";
auto shape = var_node_map_->at(var_name)->get_shape(); for (int i = 0; i < std::min(static_cast<int>(feed_vars.size()), 10); ++i) {
for (size_t i = 0; i < shape.size(); ++i) { auto* var = scope_.FindVar(feed_vars[i]);
input_shape_str += std::to_string(shape.at(i)); if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto dims = tensor_pd->dims();
for (int j = 0; j < dims.size(); ++j) {
func_cache_key_ += std::to_string(dims[j]);
}
} }
} }
func_cache_key_ = input_shape_str + func_cache_key_; func_cache_key_ += std::to_string(interval[0]) + "_" +
if (func_cache_.find(func_cache_key_) != func_cache_.end()) { std::to_string(interval[1]) + engine_key;
ngraph_function_ = func_cache_.at(func_cache_key_); func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
} else {
BuildNgFunction(); if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
func_cache_[func_cache_key_] = ngraph_function_; if (engine_cache[func_cache_key_].persistables.size() == 0) {
engine_cache.clear();
t_in_cache_.clear();
} else {
auto var_name = engine_cache[func_cache_key_].persistables.begin();
framework::Variable* var = scope_.FindVar(*var_name);
if (var != pre_var_ptr) {
engine_cache.clear();
t_in_cache_.clear();
}
pre_var_ptr = var;
}
}
if (engine_cache.find(func_cache_key_) == engine_cache.end()) {
BuildNgFunction(interval);
engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_;
engine_cache[func_cache_key_].persistables = this->persistables_;
engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
engine_cache[func_cache_key_].var_in = this->var_in_;
engine_cache[func_cache_key_].var_out = this->var_out_;
engine_cache[func_cache_key_].is_test = this->is_test_;
} }
} else { } else {
BuildNgFunction(); BuildNgFunction(interval);
} }
} }
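// Execute the (cached) ngraph function: wrap paddle input/output tensors as ngraph
// tensors, reusing cached input tensors for inference, then compile on the backend and run.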
void NgraphEngine::Run(const framework::Scope& scope, void NgraphEngine::Run(const framework::Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in; std::shared_ptr<ngraph::Function> ng_func;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out; const std::set<std::string>* p_persistables;
const std::vector<size_t>* p_var_in_updates;
const std::vector<std::string>* p_var_in;
const std::vector<std::string>* p_var_out;
bool is_test;
bool use_cache = true;
if (use_cache) {
PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(),
"Cannot find cached data to run ngraph function");
ng_func = engine_cache[func_cache_key_].ngraph_function;
p_persistables = &(engine_cache[func_cache_key_].persistables);
p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
p_var_in = &(engine_cache[func_cache_key_].var_in);
p_var_out = &(engine_cache[func_cache_key_].var_out);
is_test = engine_cache[func_cache_key_].is_test;
} else {
ng_func = ngraph_function_;
p_persistables = &this->persistables_;
p_var_in_updates = &this->var_in_updates_;
p_var_in = &this->var_in_;
p_var_out = &this->var_out_;
is_test = this->is_test_;
}
for (size_t i = 0; i < var_in_.size(); ++i) { std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
auto vi = var_in_.at(i); std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti; auto m_parameters = ng_func->get_parameters();
auto* var = scope.FindVar(vi); auto m_results = ng_func->get_results();
if (var && var->IsType<framework::LoDTensor>()) { if (is_test && use_cache &&
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), p_t_in = &(t_in_cache_[func_cache_key_]);
"Ensure ngraph tensor layout align with paddle tensor"); for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
auto ng_type = var_type_map_.at(vi); int index = p_var_in_updates->at(i);
if (ng_type == ngraph::element::f32) { auto vi = p_var_in->at(index);
auto pd_arr = tensor_pd->mutable_data<float>(place); auto sp = m_parameters[index]->get_shape();
ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); auto ng_type = m_parameters[index]->get_element_type();
} else if (ng_type == ngraph::element::i32) { std::shared_ptr<ngraph::runtime::Tensor> ti;
const int* arr = tensor_pd->data<int>(); auto* var = scope.FindVar(vi);
ti = backend_->create_tensor(ngraph::element::i32, sp, if (var && var->IsType<framework::LoDTensor>()) {
const_cast<int*>(arr)); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
} else if (ng_type == ngraph::element::i64) { void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
auto pd_arr = tensor_pd->mutable_data<int64_t>(place); ti = backend_->create_tensor(ng_type, sp, pd_arr);
ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); (*p_t_in)[index] = ti;
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else { } else {
PADDLE_THROW("Data type not handling for var %s", vi); PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
} }
}
} else {
if (is_test && use_cache) {
p_t_in = &(t_in_cache_[func_cache_key_]);
} else { } else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi); p_t_in = &t_in;
} }
bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
ng_op_state_ == OpState::FULL_TEST) for (size_t i = 0; i < p_var_in->size(); ++i) {
? true auto vi = p_var_in->at(i);
: false; auto sp = m_parameters[i]->get_shape();
bool is_persistable = auto ng_type = m_parameters[i]->get_element_type();
(persistables_.find(vi) != persistables_.end()) ? true : false; std::shared_ptr<ngraph::runtime::Tensor> ti;
if (is_test && is_persistable) { auto* var = scope.FindVar(vi);
ti->set_stale(false); if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
ti = backend_->create_tensor(ng_type, sp, pd_arr);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
bool is_persistable =
(p_persistables->find(vi) != p_persistables->end()) ? true : false;
if (is_test && is_persistable) {
ti->set_stale(false);
}
(*p_t_in).emplace_back(ti);
} }
t_in.push_back(ti);
} }
for (size_t i = 0; i < var_out_.size(); ++i) { std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
auto vo = var_out_[i]; for (size_t i = 0; i < p_var_out->size(); ++i) {
auto vo = p_var_out->at(i);
auto* var = scope.FindVar(vo); auto* var = scope.FindVar(vo);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<framework::LoDTensor>()) { if (var && var->IsType<framework::LoDTensor>()) {
auto sp = m_results[i]->get_shape();
var->GetMutable<framework::LoDTensor>()->Resize(Shape2Ddim(sp));
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims(); auto ng_type = m_results[i]->get_element_type();
ngraph::Shape sp = Ddim2Shape(dd); void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
auto ng_type = var_type_map_.at(vo); std::shared_ptr<ngraph::runtime::Tensor> to =
if (ng_type == ngraph::element::f32) { backend_->create_tensor(ng_type, sp, pd_arr);
auto pd_arr = tensor_pd->mutable_data<float>(place); t_out.emplace_back(to);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::i32) {
auto pd_arr = tensor_pd->mutable_data<int>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", vo);
}
t_out.push_back(to);
} else { } else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vo); PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
} }
} }
auto handle = backend_->compile(ngraph_function_); auto handle = backend_->compile(ng_func);
handle->call_with_validate(t_out, t_in); handle->call_with_validate(t_out, *p_t_in);
} // NgraphEngine::Run } // NgraphEngine::Run
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#include <memory>
#include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */ ...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */
UNKNOWN /* Output all for debug purpose */ UNKNOWN /* Output all for debug purpose */
}; };
// cached engine state reused across runs
struct EngineCache {
std::shared_ptr<ngraph::Function> ngraph_function;
std::set<std::string> persistables;
std::vector<std::string> var_in;
std::vector<std::string> var_out;
std::vector<size_t> var_in_updates;
bool is_test = true;
};
// perform graph build through bridge and execute computation // perform graph build through bridge and execute computation
class NgraphEngine { class NgraphEngine {
public: public:
explicit NgraphEngine(const framework::Scope& scope, explicit NgraphEngine(const framework::Scope& scope,
const platform::Place& place, const platform::Place& place,
const std::string& serialized_graph, const framework::ExecutionContext& ctx);
const std::vector<int>& interval);
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static void EnableNgraph(const framework::ProgramDesc& program); static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars;
static void FuseNgraphOps(
const framework::BlockDesc& prog,
std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
private: private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> static std::unordered_map<std::string, EngineCache> engine_cache;
func_cache_; static std::unordered_map<
std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
t_in_cache_;
static framework::Variable* pre_var_ptr;
const framework::Scope& scope_; const framework::Scope& scope_;
const platform::Place& place_; const platform::Place& place_;
std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_; std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_; std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_; std::set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_; std::unordered_set<std::string> post_op_inputs_;
OpState ng_op_state_ = OpState::UNKNOWN; OpState op_state_ = OpState::UNKNOWN;
bool is_test_{true};
std::string func_cache_key_; std::string func_cache_key_;
// ngraph backend, e.g. CPU // ngraph backend, e.g. CPU
...@@ -66,6 +90,8 @@ class NgraphEngine { ...@@ -66,6 +90,8 @@ class NgraphEngine {
std::vector<std::string> var_in_; std::vector<std::string> var_in_;
// var_name of outputs from fetch in order // var_name of outputs from fetch in order
std::vector<std::string> var_out_; std::vector<std::string> var_out_;
// non-persistable var_in
std::vector<size_t> var_in_updates_;
// map input vars to nodes // map input vars to nodes
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...@@ -74,20 +100,23 @@ class NgraphEngine { ...@@ -74,20 +100,23 @@ class NgraphEngine {
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_; var_node_map_;
// prepare info for nraph engine // prepare info needed by the ngraph engine
void Prepare(const framework::BlockDesc& block, void Prepare(const std::vector<int>& interval);
const std::vector<int>& interval); // get ngraph engine input and output list
void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
const std::vector<int>& interval);
// get ngraph input and define ngraph input parameters // get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op); void GetNgInputShape();
// Call ngraph bridge to map ops // Call ngraph bridge to map ops
void BuildNgNodes(); void BuildNgNodes();
// get the ngraph input and output var list // run paddle RuntimeInferShape to get the tensor shape
void BuildNgIO(); void RunInferShape();
// build ngraph function call // build ngraph function call
void BuildNgFunction(); void BuildNgFunction(const std::vector<int>& interval);
// Check cache for ngraph function or otherwise build the function // Check cache for ngraph function or otherwise build the function
void GetNgFunction(); void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDispensable(); AddInput("Xs", "A list of inputs.").AsDispensable();
AddOutput("Ys", "A list of outputs").AsDispensable(); AddOutput("Ys", "A list of outputs").AsDispensable();
AddAttr<std::string>("graph", "the graph."); AddAttr<std::string>("graph", "the graph.");
AddAttr<std::string>("engine_key", "the engine hash key.");
AddAttr<std::vector<int>>("interval", "op interval supported by ngraph"); AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
AddComment("ngraph engine operator."); AddComment("ngraph engine operator.");
} }
......
...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> { ...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope(); auto& scope = ctx.scope();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); NgraphEngine ngraph_engine(scope, place, ctx);
ngraph_engine.Run(scope, place); ngraph_engine.Run(scope, place);
} }
}; };
......
...@@ -219,14 +219,6 @@ class ReshapeKernel { ...@@ -219,14 +219,6 @@ class ReshapeKernel {
std::vector<int>(shape_data, shape_data + shape_tensor->numel()); std::vector<int>(shape_data, shape_data + shape_tensor->numel());
out_dims = ReshapeOp::ValidateShape(shape, in->dims()); out_dims = ReshapeOp::ValidateShape(shape, in->dims());
} }
if (!in->lod().empty()) {
PADDLE_ENFORCE_EQ(
out_dims[0], in->dims()[0],
"Reshape operator cannot reshape an input sequence batch "
"into an output sequence batch that has a different "
"number of time steps. Please consider using "
"sequence_reshape op.");
}
out->mutable_data(ctx.GetPlace(), in->type()); out->mutable_data(ctx.GetPlace(), in->type());
framework::TensorCopy( framework::TensorCopy(
......
...@@ -15,13 +15,12 @@ limitations under the License. */ ...@@ -15,13 +15,12 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE float real_exp(double x) { return exp(x); }
template <typename T> template <typename T>
struct SeluFunctor { struct SeluFunctor {
SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
......
...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { ...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("X"), ctx->HasInput("X"),
"Input(X) of SequecceEnumerate operator should not be null."); "Input(X) of SequecceEnumerate operator should not be null.");
...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.") AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0); .SetDefault(0);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Sequence Enumerate Operator. Sequence Enumerate Operator.
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <cub/cub.cuh> // NOLINT #include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
namespace paddle { namespace paddle {
...@@ -21,9 +22,6 @@ namespace operators { ...@@ -21,9 +22,6 @@ namespace operators {
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
__device__ __forceinline__ float real_exp(float x) { return expf(x); }
__device__ __forceinline__ double real_exp(double x) { return exp(x); }
template <typename T, int BlockDim> template <typename T, int BlockDim>
using BlockReduce = cub::BlockReduce<T, BlockDim>; using BlockReduce = cub::BlockReduce<T, BlockDim>;
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "cub/cub.cuh" #include "cub/cub.cuh"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
...@@ -21,11 +22,6 @@ namespace operators { ...@@ -21,11 +22,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE float real_exp(double x) { return exp(x); }
static HOSTDEVICE float real_log(float x) { return logf(x); }
static HOSTDEVICE float real_log(double x) { return log(x); }
static constexpr int kNumCUDAThreads = 512; static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kNumMaxinumNumBlocks = 4096;
......
...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
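// Scatter each element of the output gradient back into the zero-filled input
// gradient, shifting its coordinates by the slice offsets.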
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
const int* out_dims, const int* in_dims,
const int* offsets, int64_t n,
paddle::platform::float16* d_in) {
int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
if (out_idx < n) {
int coords[D] = {0};
for (int i = D - 1; i >= 0; --i) {
coords[i] = out_idx % out_dims[i];
out_idx /= out_dims[i];
coords[i] += offsets[i];
}
int64_t in_idx = 0;
for (int i = 0; i < D - 1; ++i) {
in_idx += coords[i] * in_dims[i + 1];
}
in_idx += coords[D - 1];
d_in[in_idx] = d_out[out_idx];
}
}
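// float16 specialization of the slice gradient: zero the input gradient, then launch
// Padding<rank> with the dims and offsets staged in thrust device vectors.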
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
: public framework::OpKernel<paddle::platform::float16> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
auto out_dims = d_out->dims();
auto in_dims = d_in->dims();
int rank = out_dims.size();
std::vector<int> offsets(rank, 0);
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
for (size_t i = 0; i < starts.size(); ++i) {
if (starts[i] < 0) {
starts[i] += in_dims[axes[i]];
}
offsets[axes[i]] = std::max(starts[i], 0);
}
math::SetConstant<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
set_zero;
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
int64_t numel = d_out->numel();
dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
auto stream = ctx.cuda_device_context().stream();
auto out_shape = framework::vectorize2int(out_dims);
thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
auto in_shape = framework::vectorize2int(in_dims);
thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
switch (rank) {
case 1:
Padding<1><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 2:
Padding<2><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 3:
Padding<3><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 4:
Padding<4><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 5:
Padding<5><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 6:
Padding<6><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>, slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice_grad, slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
namespace ops = paddle::operators;
REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T, int BlockDim, framework::DataLayout layout>
__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
for (int k = blockIdx.x; k < C; k += gridDim.x) {
T x_sum = 0;
T x2_sum = 0;
for (int i = threadIdx.x; i < N * M; i += BlockDim) {
int id = layout == framework::DataLayout::kNCHW
? (i / M) * C * M + k * M + i % M
: i * C + k;
T x_in = x[id];
x_sum += x_in;
x2_sum += x_in * x_in;
}
__syncthreads();
T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
mean_var[k] = out / (N * M);
}
out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
mean_var[k + C] = out / (N * M);
}
}
if (blockIdx.x == 0 && threadIdx.x == 0) {
mean_var[2 * C] = static_cast<T>(1.0);
}
}
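// After the NCCL all-reduce, divide by the device count to get the global mean and
// variance, store the saved mean / inverse stddev, and update the moving averages in place.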
template <typename T>
__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev,
const int C, const T momentum,
const double epsilon, T *sv_mean_data,
T *sv_inv_var_data, T *moving_means,
T *moving_variances) {
// sync stats across multiple devices
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < C; i += stride) {
T mean = means[i] / (*num_dev);
T var = variances[i] / (*num_dev);
var = var - mean * mean;
// sync stats
sv_mean_data[i] = mean;
sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon);
variances[i] = var;
// moving stats
moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum);
moving_variances[i] =
moving_variances[i] * momentum + var * (1. - momentum);
}
}
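// Normalize x with the synchronized mean/variance and apply the per-channel scale and bias.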
template <typename T, framework::DataLayout layout>
static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias,
const T *mean, const T *variance,
const double epsilon, const int C,
const int M, const int num, T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c];
}
}
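// Forward kernel: compute local stats, ncclAllReduce them across devices, derive the
// synchronized mean/variance and moving averages, then normalize the input.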
template <typename DeviceContext, typename T>
class SyncBatchNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = framework::StringToDataLayout(layout_str);
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
PADDLE_ENFORCE(
!use_global_stats,
"sync_batch_norm doesn't support to set use_global_stats True. ",
"Please use batch_norm in this case.");
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
int x_numel = x->numel();
const T *x_d = x->data<T>();
const T *s_d = ctx.Input<Tensor>("Scale")->data<T>();
const T *b_d = ctx.Input<Tensor>("Bias")->data<T>();
auto *y = ctx.Output<Tensor>("Y");
T *y_d = y->mutable_data<T>(ctx.GetPlace());
const T *mean_data = nullptr;
const T *var_data = nullptr;
auto &dev_ctx = ctx.cuda_device_context();
auto stream = dev_ctx.stream();
auto *comm = dev_ctx.nccl_comm();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr};
if (is_test) {
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
mean_data = est_mean->data<T>();
var_data = est_var->data<T>();
} else {
auto &allocator =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
// x, x^2, 1, here 1 is used to calc device num
// device num can also be obtained from platform::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(T);
alloc_ptr = allocator.Allocate(bytes);
T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());
const int threads = 256;
int grid = std::min(C, (max_threads + threads - 1) / threads);
if (layout == framework::DataLayout::kNCHW) {
KeLocalStats<
T, threads,
framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
x_d, N, H * W * D, C, stats);
} else {
KeLocalStats<
T, threads,
framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
x_d, N, H * W * D, C, stats);
}
Tensor c_g_st;
T *c_g_st_d = c_g_st.mutable_data<T>({2 * C + 1}, platform::CPUPlace());
auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);
int dtype = platform::ToNCCLDataType(x->type());
// In-place operation
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
// moving mean/variance
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
T *est_mean_data = mean_out->mutable_data<T>(ctx.GetPlace());
T *est_var_data = variance_out->mutable_data<T>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
T *sv_mean_data = saved_mean->mutable_data<T>(ctx.GetPlace());
T *sv_inv_var_data = saved_inv_variance->mutable_data<T>(ctx.GetPlace());
// Note, Input('Mean')/Input('Variance') share variable with
// Output('MeanOut')/Output('VarianceOut')
KeSyncAndMovingStats<T><<<(C + block - 1) / block, block, 0, stream>>>(
stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data,
sv_inv_var_data, est_mean_data, est_var_data);
mean_data = sv_mean_data;
var_data = stats + C;
}
int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
if (layout == framework::DataLayout::kNCHW) {
KeNormAffine<T,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
y_d);
} else {
KeNormAffine<T,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
y_d);
}
}
};
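// Backward per-device statistics: for each channel accumulate sum(dy) and
// sum(dy * (x - mean)), with a device counter in the last slot (as in the forward pass).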
template <typename T, const int BlockDim, framework::DataLayout layout>
__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means,
int N, int M, int C, T *sum_dy_prod) {
typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
for (int k = blockIdx.x; k < C; k += gridDim.x) {
T sum1 = 0;
T sum2 = 0;
T mean = means[k];
for (int i = threadIdx.x; i < N * M; i += blockDim.x) {
int id = layout == framework::DataLayout::kNCHW
? (i / M) * C * M + k * M + i % M
: i * C + k;
T g = dy[id];
sum1 += g;
sum2 += g * (x[id] - mean);
}
__syncthreads();
T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
sum_dy_prod[k] = out;
}
out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
sum_dy_prod[k + C] = out;
}
}
if (blockIdx.x == 0 && threadIdx.x == 0) {
sum_dy_prod[2 * C] = static_cast<T>(1.0);
}
}
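// dScale[c] = sum(dy * (x - mean)) * inv_std and dBias[c] = sum(dy), block-reduced per channel.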
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x,
const T *mean,
const T *inv_variance,
const double epsilon, const int N,
const int C, const int HxW,
T *dscale, T *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
T ds_sum = static_cast<T>(0);
T db_sum = static_cast<T>(0);
T inv_var_i = inv_variance[i];
T mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int id = layout == framework::DataLayout::kNCHW
? ((j / HxW) * C + i) * HxW + (j % HxW)
: j * outer_size + i;
ds_sum += dy[id] * (x[id] - mean_i);
db_sum += dy[id];
}
__syncthreads();
double os = BlockReduce(temp_storage)
.Reduce(static_cast<double>(ds_sum), cub::Sum());
__syncthreads();
double ob = BlockReduce(temp_storage)
.Reduce(static_cast<double>(db_sum), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
dscale[i] = static_cast<T>(os * inv_var_i);
dbias[i] = static_cast<T>(ob);
}
__syncthreads();
}
}
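// dX combines three terms: dy scaled by gamma * inv_std, the gradient through the mean,
// and the gradient through the variance, each averaged over the channel's elements across devices.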
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta,
const T *mean, const T *inv_variance,
const T *g_sum_dy,
const T *g_sum_dy_prod,
const T *num_dev, const double epsilon,
const int C, const int HxW,
const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
T scale = static_cast<T>(C) / num;
T dev_num = num_dev[0];
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
T inv_var = inv_variance[c];
T s_d = beta[c];
T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var *
(inv_var * inv_var);
T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var;
dx[i] =
dy[i] * s_d * inv_var + gmean * scale + gvar * scale * (x[i] - mean[c]);
}
}
// Deriving the Gradient for the Backward Pass of Batch Normalization
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
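// A sketch added here for reference (not part of the linked derivation): writing M
// for the total number of elements reduced per channel across all devices, the
// kernels above and the Compute() below implement the standard batch-norm gradients
//   dbias_c  = sum_i dy_i
//   dscale_c = inv_var_c * sum_i dy_i * (x_i - mean_c)
//   dx_i     = scale_c * inv_var_c * (dy_i - (1/M) * sum_j dy_j
//              - (x_i - mean_c) * inv_var_c^2 * (1/M) * sum_j dy_j * (x_j - mean_c))
// KeBackwardLocalStats produces the per-device sums of dy and dy * (x - mean),
// ncclAllReduce combines them across devices, and KeBNBackwardData consumes the
// global sums; KeBNBackwardScaleBias reduces the per-device dscale/dbias terms.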
template <typename DeviceContext, typename T>
class SyncBatchNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = framework::StringToDataLayout(layout_str);
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 2 and 5");
int N, C, H, W, D;
ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C);
std::vector<int> dims;
std::vector<int> strides;
if (layout == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
const T *x_d = x->data<T>();
const T *dy_d = d_y->data<T>();
auto &dev_ctx = ctx.cuda_device_context();
auto stream = dev_ctx.stream();
auto *comm = dev_ctx.nccl_comm();
const T *saved_mean = ctx.Input<Tensor>("SavedMean")->data<T>();
const T *saved_inv_var = ctx.Input<Tensor>("SavedVariance")->data<T>();
auto &allocator =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
const int bytes = (C * 2 + 1) * sizeof(T);
auto alloc_ptr = allocator.Allocate(bytes);
T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());
const int threads = 256;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
int grid = std::min(C, (max_threads + threads - 1) / threads);
int x_numel = x->numel();
int fsize = H * W * D;
if (layout == framework::DataLayout::kNCHW) {
KeBackwardLocalStats<
T, threads,
framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
dy_d, x_d, saved_mean, N, fsize, C, stats);
} else {
KeBackwardLocalStats<
T, threads,
framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
dy_d, x_d, saved_mean, N, fsize, C, stats);
}
int dtype = platform::ToNCCLDataType(x->type());
// In-place operation
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
const int block = 512;
int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
if (layout == framework::DataLayout::kNCHW) {
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, threads,
framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
d_scale->data<T>(), d_bias->data<T>());
}
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
d_x->data<T>());
}
} else {
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, threads,
framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
d_scale->data<T>(), d_bias->data<T>());
}
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
d_x->data<T>());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
sync_batch_norm_grad,
ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>);
...@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool( ...@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool(
for (auto& p : places) { for (auto& p : places) {
set.insert(p); set.insert(p);
} }
for (auto& p : set) { for (auto& p : set) {
if (platform::is_cpu_place(p)) { if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
...@@ -317,6 +316,7 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -317,6 +316,7 @@ CUDADeviceContext::~CUDADeviceContext() {
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
} }
Place CUDADeviceContext::GetPlace() const { return place_; } Place CUDADeviceContext::GetPlace() const { return place_; }
......
...@@ -265,6 +265,12 @@ class CUDADeviceContext : public DeviceContext { ...@@ -265,6 +265,12 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const; cudaStream_t stream() const;
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
template <typename Callback> template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) { void RecordEvent(cudaEvent_t ev, Callback callback) {
callback(); callback();
...@@ -289,6 +295,13 @@ class CUDADeviceContext : public DeviceContext { ...@@ -289,6 +295,13 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<CublasHandleHolder> cublas_handle_; std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_; std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provide fast collectives over multiple GPUs
// both within and across nodes.
// However, this communicator is only used for collectives over multiple GPUs
// within a single node.
ncclComm_t nccl_comm_{nullptr};
int compute_capability_; int compute_capability_;
int runtime_version_; int runtime_version_;
int driver_version_; int driver_version_;
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list> #include <forward_list>
...@@ -30,6 +29,8 @@ limitations under the License. */ ...@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) {
if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " cannot be traced.";
return;
}
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
nullptr;
if (local_mem_info_record == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
mem_info_record_.emplace_front();
local_mem_info_record = &mem_info_record_.front();
}
local_mem_info_record->emplace_front(MemInfoRecord{
start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) { int64_t thread_id, uint32_t correlation_id) {
...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : mem_info_record_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear(); for (auto &tmp : active_kind_records_) tmp.clear();
} }
...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty()) if (correlations_.empty()) {
for (auto &tmp : correlations_pairs) for (auto &tmp : correlations_pairs) {
for (auto &pair : tmp) correlations_[pair.first] = pair.second; for (auto &pair : tmp) correlations_[pair.first] = pair.second;
}
}
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_) { for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
} }
for (auto &tmp : active_kind_records_) { for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) { for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) {
auto *event = profile_pb.add_mem_events();
event->set_device_id(0);
if (platform::is_cpu_place(r.place)) {
event->set_place(proto::MemEvent::CPUPlace);
} else if (platform::is_gpu_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPlace);
event->set_device_id(
boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else {
PADDLE_THROW("The current place is not supported.");
}
event->set_alloc_in(r.alloc_in);
event->set_free_in(r.free_in);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_bytes(r.bytes);
event->set_thread_id(r.thread_id);
}
}
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, profile_f.open(profile_path,
std::ios::out | std::ios::trunc | std::ios::binary); std::ios::out | std::ios::trunc | std::ios::binary);
...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_; std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
...@@ -575,7 +626,7 @@ Event *CurAnnotation() { ...@@ -575,7 +626,7 @@ Event *CurAnnotation() {
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() { std::string CurAnnotationName() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name(); return annotation_stack.back()->name();
} }
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -47,6 +48,7 @@ class DeviceTracer { ...@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord { struct CPURecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -54,6 +56,7 @@ class DeviceTracer { ...@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t device_id; int64_t device_id;
int64_t thread_id; int64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -63,6 +66,17 @@ class DeviceTracer { ...@@ -63,6 +66,17 @@ class DeviceTracer {
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
struct MemInfoRecord {
uint64_t start_ns;
uint64_t end_ns;
size_t bytes;
Place place;
int64_t thread_id;
std::string alloc_in;
std::string free_in;
};
struct ActiveKindRecord { struct ActiveKindRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -71,6 +85,7 @@ class DeviceTracer { ...@@ -71,6 +85,7 @@ class DeviceTracer {
int64_t thread_id; int64_t thread_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -97,6 +112,12 @@ class DeviceTracer { ...@@ -97,6 +112,12 @@ class DeviceTracer {
int64_t thread_id, int64_t thread_id,
uint32_t correlation_id) = 0; uint32_t correlation_id) = 0;
virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
size_t bytes, const Place& place,
const std::string& alloc_in,
const std::string& free_in,
int64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
......
...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -64,5 +66,36 @@ class Event { ...@@ -64,5 +66,36 @@ class Event {
#endif #endif
#endif #endif
}; };
class MemEvent {
public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Place place, int64_t thread_id, const std::string& annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
bytes_(bytes),
place_(place),
thread_id_(thread_id),
annotation_(annotation) {}
const EventType& type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
int64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; }
private:
EventType type_;
uint64_t start_ns_ = 0;
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
int64_t thread_id_;
std::string annotation_;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string.h> // for strdup #include <string.h> // for strdup
#include <algorithm> #include <algorithm>
#include <memory>
#include <set>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
...@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
platform::DeviceTemporaryAllocator::Init(); platform::DeviceTemporaryAllocator::Init();
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads); platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif #endif
......
...@@ -16,9 +16,11 @@ ...@@ -16,9 +16,11 @@
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
#include <memory>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <typeindex> #include <typeindex>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
...@@ -78,6 +80,8 @@ struct NCCLContext { ...@@ -78,6 +80,8 @@ struct NCCLContext {
cudaStream_t stream() const { return ctx_->stream(); } cudaStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
int device_id() const { int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device; return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
} }
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -21,6 +20,8 @@ limitations under the License. */ ...@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random> #include <random>
#include <string> #include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); ...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu; std::mutex profiler_mu;
...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0; ...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex // The global mutex
static std::mutex g_all_event_lists_mutex; static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads // The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists; static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList<Event>> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); } static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
std::forward_list<std::vector<Event>> event_blocks; static std::mutex g_all_mem_event_lists_mutex;
}; static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id) ...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
const EventType& Event::type() const { return type_; } const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
} }
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0; return gpu_ns_ / 1000000.0;
#else #else
...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const { ...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif #endif
} }
inline EventList& GetEventList() { inline EventList<MemEvent> &GetMemEventList() {
if (!g_mem_event_list) {
g_mem_event_list = std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
g_mem_thread_id = g_mem_next_thread_id++;
g_all_mem_event_lists.emplace_front(g_mem_event_list);
}
return *g_mem_event_list;
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
}
inline EventList<Event> &GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList<Event>>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id); RecoreCurThreadId(g_thread_id);
...@@ -131,26 +124,26 @@ inline EventList& GetEventList() { ...@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name) { void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event* PushEvent(const std::string& name) { Event *PushEvent(const std::string &name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name) { void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name) RecordEvent::RecordEvent(const std::string &name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
name_ = name; name_ = name;
Event* e = PushEvent(name_); Event *e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); SetCurAnnotation(e);
} }
...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name) ...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() { ...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent(name_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name) { MemEvenRecorder MemEvenRecorder::recorder;
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE(events.count(ptr) == 0, "");
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr may not be present in address_memevent_
if (iter != events.end()) {
events.erase(iter);
}
}
void MemEvenRecorder::Flush() {
std::lock_guard<std::mutex> guard(mtx_);
address_memevent_.clear();
}
MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
size_t bytes)
: place_(place),
bytes_(bytes),
start_ns_(PosixInNsec()),
alloc_in_(CurAnnotationName()) {
PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
DeviceTracer *tracer = GetDeviceTracer();
end_ns_ = PosixInNsec();
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
RecordRPCEvent::RecordRPCEvent(const std::string &name) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name)); event_.reset(new platform::RecordEvent(name));
} }
...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) { ...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice(); SynchronizeAllDevice();
GetDeviceTracer()->Reset(); GetDeviceTracer()->Reset();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
(*it)->Clear(); (*it)->Clear();
} }
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
(*it)->Clear();
}
} }
std::vector<std::vector<Event>> GetAllEvents() { std::vector<std::vector<Event>> GetAllEvents() {
...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
std::vector<std::vector<MemEvent>> GetMemEvents() {
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
std::vector<std::vector<MemEvent>> result;
for (auto &it : g_all_mem_event_lists) {
result.emplace_back((*it).Reduce());
}
return result;
}
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
std::string name; std::string name;
...@@ -263,8 +319,8 @@ struct EventItem { ...@@ -263,8 +319,8 @@ struct EventItem {
}; };
// Print results // Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
const std::string& sorted_domain, const size_t name_width, const std::string &sorted_domain, const size_t name_width,
const size_t data_width, bool merge_thread) { const size_t data_width, bool merge_thread) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << "Ratio." << std::endl; << std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) { for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) { for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j]; const EventItem &event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time; << std::setw(data_width) << event_item.total_time;
...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} }
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events, void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread, bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) { EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return; if (merge_thread && events.size() < 2) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) { switch (sorted_by) {
case EventSortingKey::kCalls: case EventSortingKey::kCalls:
sorted_domain = "number of calls"; sorted_domain = "number of calls";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.calls > b.calls; return a.calls > b.calls;
}; };
break; break;
case EventSortingKey::kTotal: case EventSortingKey::kTotal:
sorted_domain = "total time"; sorted_domain = "total time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.total_time > b.total_time; return a.total_time > b.total_time;
}; };
break; break;
case EventSortingKey::kMin: case EventSortingKey::kMin:
sorted_domain = "minimum time"; sorted_domain = "minimum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.min_time > b.min_time; return a.min_time > b.min_time;
}; };
break; break;
case EventSortingKey::kMax: case EventSortingKey::kMax:
sorted_domain = "maximum time"; sorted_domain = "maximum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.max_time > b.max_time; return a.max_time > b.max_time;
}; };
break; break;
case EventSortingKey::kAve: case EventSortingKey::kAve:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.ave_time > b.ave_time; return a.ave_time > b.ave_time;
}; };
break; break;
case EventSortingKey::kGPUTime: case EventSortingKey::kGPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.gpu_time > b.gpu_time; return a.gpu_time > b.gpu_time;
}; };
break; break;
case EventSortingKey::kCPUTime: case EventSortingKey::kCPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.cpu_time > b.cpu_time; return a.cpu_time > b.cpu_time;
}; };
break; break;
...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
const std::vector<std::vector<Event>>* analyze_events; const std::vector<std::vector<Event>> *analyze_events;
std::vector<std::vector<Event>> merged_events_list; std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) { if (merge_thread) {
std::vector<Event> merged_events; std::vector<Event> merged_events;
...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
} }
// average time // average time
for (auto& item : event_items) { for (auto &item : event_items) {
item.ave_time = item.total_time / item.calls; item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total; item.ratio = item.total_time / total;
} }
...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread); merge_thread);
} }
struct MemoryProfierReport {
size_t alloc_times{0};
size_t alloc_size{0};
size_t free_times{0};
size_t free_size{0};
};
// Print results
void PrintMemProfiler(
const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
&annotation_report,
const size_t name_width, const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Memory Profiling Report "
<< "<-------------------------\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Alloc Calls" << std::setw(data_width) << "Size(MB)"
<< std::setw(data_width) << "Free Calls" << std::setw(data_width)
<< "Size(MB)" << std::endl;
for (auto &tmp : annotation_report) {
for (auto &e : tmp.second) {
auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
std::cout << std::setw(name_width) << event_name;
std::cout << std::setw(data_width) << e.second.alloc_times;
std::cout << std::setw(data_width)
<< e.second.alloc_size / (1024.0 * 1024.0);
std::cout << std::setw(data_width) << e.second.free_times;
std::cout << std::setw(data_width)
<< e.second.free_size / (1024.0 * 1024.0) << std::endl;
}
}
std::cout << std::endl;
}
// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
if (g_state == ProfilerState::kDisabled) return;
// place, annotation, alloc times, alloc size
std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
annotation_report;
for (auto &tmp : events) {
for (auto &e : tmp) {
if (e.type() == EventType::kPushRange) {
annotation_report[e.place()][e.annotation()].alloc_times += 1;
annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
} else if (e.type() == EventType::kPopRange) {
annotation_report[e.place()][e.annotation()].free_times += 1;
annotation_report[e.place()][e.annotation()].free_size += e.bytes();
}
}
}
PrintMemProfiler(annotation_report, 55, 18);
}
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string &profile_path) {
SynchronizeAllDevice(); SynchronizeAllDevice();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_");
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key); ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
ParseMemEvents(all_mem_events);
}
ResetProfiler(); ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
......
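The memory report added above is only printed when VLOG level 5 is active (see the VLOG_IS_ON(5) guard in DisableProfiler). A minimal sketch of exercising it from Python follows; exporting GLOG_v=5 before launching the process is an assumption about how the verbosity is usually raised, and the profiler context manager is the existing paddle.fluid.profiler API.

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# Tiny program whose allocations will show up in the memory profiling report.
x = fluid.layers.data(name='x', shape=[784], dtype='float32')
y = fluid.layers.fc(input=x, size=10)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Run with GLOG_v=5 exported so ParseMemEvents() prints the memory table when
# the profiler is disabled at the end of the `with` block.
with profiler.profiler('All', 'total', '/tmp/profile'):
    for _ in range(5):
        exe.run(fluid.default_main_program(),
                feed={'x': np.random.random((8, 784)).astype('float32')},
                fetch_list=[y])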
...@@ -15,10 +15,17 @@ limitations under the License. */ ...@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -34,8 +41,41 @@ enum ProfilerState { ...@@ -34,8 +41,41 @@ enum ProfilerState {
void Mark(const std::string& name); void Mark(const std::string& name);
Event* PushEvent(const std::string& name); void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
private:
struct RecordMemEvent {
RecordMemEvent(const Place& place, size_t bytes);
~RecordMemEvent();
Place place_;
size_t bytes_;
uint64_t start_ns_;
uint64_t end_ns_;
std::string alloc_in_;
std::string free_in_;
};
static MemEvenRecorder recorder;
std::map<Place,
std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
address_memevent_;
std::mutex mtx_;
MemEvenRecorder() {}
DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
};
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
...@@ -87,6 +127,41 @@ enum EventSortingKey { ...@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime kGPUTime
}; };
template <typename T>
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(T);
constexpr static size_t kEventAlign = alignof(T);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
T* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<T>> event_blocks;
};
// Enable the profiling function. // Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
......
...@@ -34,8 +34,25 @@ message Event { ...@@ -34,8 +34,25 @@ message Event {
optional string detail_info = 9; optional string detail_info = 9;
} }
message MemEvent {
enum Place {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
optional uint64 bytes = 3;
optional Place place = 4;
optional uint64 thread_id = 5;
optional uint32 device_id = 6;
optional string alloc_in = 7;
optional string free_in = 8;
}
message Profile { message Profile {
repeated Event events = 1; repeated Event events = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
repeated MemEvent mem_events = 4;
} }
\ No newline at end of file
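For reference, a hedged sketch of reading the new mem_events back from the binary profile written by GenProfile(); the profiler_pb2 module name and its import path are assumptions about how this .proto gets generated, while the field names come from the message above.

# Hypothetical reader; assumes a profiler_pb2 module generated from this proto.
from paddle.fluid.proto.profiler import profiler_pb2

profile = profiler_pb2.Profile()
with open('/tmp/profile', 'rb') as f:
    profile.ParseFromString(f.read())

for ev in profile.mem_events:
    print(ev.alloc_in, ev.free_in, ev.bytes, ev.end_ns - ev.start_ns)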
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -54,12 +55,14 @@ void BindGraph(py::module *m) { ...@@ -54,12 +55,14 @@ void BindGraph(py::module *m) {
"The graph is a Directed Acyclic Single Static Assignment Graph, see " "The graph is a Directed Acyclic Single Static Assignment Graph, see "
"`paddle::ir::Graph` for details.") "`paddle::ir::Graph` for details.")
.def(py::init<const ProgramDesc &>()) .def(py::init<const ProgramDesc &>())
.def("clone", &Graph::Clone)
.def("has", &Graph::Has) .def("has", &Graph::Has)
.def("get_int", &Graph::Get<int>) .def("get_int", &Graph::Get<int>)
.def("get_float", &Graph::Get<float>) .def("get_float", &Graph::Get<float>)
.def("get_double", &Graph::Get<double>) .def("get_double", &Graph::Get<double>)
.def("get_string", &Graph::Get<std::string>) .def("get_string", &Graph::Get<std::string>)
.def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>) .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>,
return_value_policy::reference)
.def("set", [](Graph &self, const std::string &attr_name, .def("set", [](Graph &self, const std::string &attr_name,
int attr) { return self.Set(attr_name, new int(attr)); }) int attr) { return self.Set(attr_name, new int(attr)); })
.def("set", .def("set",
...@@ -103,7 +106,8 @@ void BindGraph(py::module *m) { ...@@ -103,7 +106,8 @@ void BindGraph(py::module *m) {
.def("retrieve_node", &Graph::RetrieveNode, .def("retrieve_node", &Graph::RetrieveNode,
return_value_policy::reference) return_value_policy::reference)
.def("resolve_hazard", &Graph::ResolveHazard) .def("resolve_hazard", &Graph::ResolveHazard)
.def("origin_program_desc", &Graph::OriginProgram); .def("origin_program_desc", &Graph::OriginProgram,
return_value_policy::reference);
} }
void BindNode(py::module *m) { void BindNode(py::module *m) {
......
...@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() { ...@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() {
#endif #endif
} }
bool IsCompiledWithNGRAPH() {
#ifndef PADDLE_WITH_NGRAPH
return false;
#else
return true;
#endif
}
bool IsCompiledWithBrpc() { bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE #ifndef PADDLE_WITH_DISTRIBUTE
return false; return false;
...@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); }); [](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
...@@ -1221,6 +1230,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1221,6 +1230,21 @@ All parameter, weight, gradient are variables in Paddle.
it will save GPU memory and may make the execution faster. it will save GPU memory and may make the execution faster.
This options is only available in GPU devices. This options is only available in GPU devices.
Default False)DOC") Default False)DOC")
.def_property(
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.sync_batch_norm_ = b;
},
R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
synchronous batch normalization which synchronizes the mean
and variance through multi-devices in training phase.
Current implementation doesn't support FP16 training and CPU.
And only synchronous on one machine, not all machines.
Default False)DOC")
.def_property( .def_property(
"memory_optimize", "memory_optimize",
[](const BuildStrategy &self) { return self.memory_optimize_; }, [](const BuildStrategy &self) { return self.memory_optimize_; },
...@@ -1242,7 +1266,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1242,7 +1266,7 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"); cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::string &, const std::vector<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &, Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>()) const BuildStrategy &, ir::Graph *>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
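Taken together with the compiler.py change further below, a sketch of how the new option would typically be switched on from Python (the program, loss, and devices are placeholders, not part of this patch):

import paddle.fluid as fluid

# ... build a program containing fluid.layers.batch_norm layers; `loss` is the
# mean loss variable of that program (placeholder for this sketch).
build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True  # property added by this patch

compiled_prog = fluid.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)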
...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() { ...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
if [ "${APPROVALS}" == "TRUE" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
fi
else else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() { ...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}" echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
else else
echo "You must have panyx0718 approval for the api change! ${API_FILE}" echo "You must have panyx0718 approval for the api change! ${API_FILE}"
fi fi
......
...@@ -125,7 +125,7 @@ def __bootstrap__(): ...@@ -125,7 +125,7 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system() sysstr = platform.system()
read_env_flags = [ read_env_flags = [
'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
...@@ -143,6 +143,9 @@ def __bootstrap__(): ...@@ -143,6 +143,9 @@ def __bootstrap__():
if core.is_compiled_with_mkldnn(): if core.is_compiled_with_mkldnn():
read_env_flags.append('use_mkldnn') read_env_flags.append('use_mkldnn')
if core.is_compiled_with_ngraph():
read_env_flags.append('use_ngraph')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
......
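Since use_ngraph is now only registered as a readable flag when the wheel was built with NGRAPH, a guarded sketch of enabling it could look like this; setting the FLAGS_ environment variable before the first fluid import is the usual way these flags are consumed, and the variable name is taken from the list above.

import os
# Must be set before paddle.fluid is imported so __bootstrap__ can pick it up.
os.environ.setdefault('FLAGS_use_ngraph', 'true')

import paddle.fluid as fluid
from paddle.fluid import core

if not core.is_compiled_with_ngraph():
    print('FLAGS_use_ngraph has no effect: this build was compiled without NGRAPH')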
...@@ -223,22 +223,27 @@ class CompiledProgram(object): ...@@ -223,22 +223,27 @@ class CompiledProgram(object):
tps), "num_trainers == len(end_points)" tps), "num_trainers == len(end_points)"
self._build_strategy.trainers_endpoints = tps self._build_strategy.trainers_endpoints = tps
if self._build_strategy.sync_batch_norm:
self._build_strategy.enable_sequential_execution = True
self._persistable_vars = [] self._persistable_vars = []
for block_id in range(self._program_desc.num_blocks()): for node in self._graph.nodes():
bdesc = self._program_desc.block(block_id) if node.is_var() and node.var() is not None and node.var().persistable() and \
self._persistable_vars.extend([ node.var().type() != core.VarDesc.VarType.RAW:
cpt.to_text(v.name()) for v in bdesc.all_vars() self._persistable_vars.append(cpt.to_text(node.name()))
if v.persistable() and v.type() != core.VarDesc.VarType.RAW
])
places = list(map(_place_obj, self._places)) places = list(map(_place_obj, self._places))
# ParallelExecutor would broadcast all the parameters during initializing.
return core.ParallelExecutor(places, # The parameters of each process should be in the same order for the data-parallelism
set(self._persistable_vars), # distributed training to keep the broadcast correct.
cpt.to_text(self._loss_name) self._persistable_vars = list(set(self._persistable_vars))
if self._loss_name else six.u(''), scope, self._persistable_vars.sort()
self._local_scopes, self._exec_strategy,
self._build_strategy, self._graph) return core.ParallelExecutor(
places, self._persistable_vars,
cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._exec_strategy, self._build_strategy, self._graph)
def _compile_inference(self): def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config) return core.create_paddle_predictor(self._infer_config)
......
...@@ -13,58 +13,92 @@ ...@@ -13,58 +13,92 @@
# limitations under the license. # limitations under the license.
from __future__ import print_function from __future__ import print_function
import os
import six
import unittest import unittest
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import six
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph
from paddle.fluid import core from paddle.fluid import core
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CPU_NUM"] = "1"
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') def conv_block():
img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data conv_pool_1 = fluid.nets.simple_img_conv_pool(
for _ in six.moves.xrange(num): input=img,
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) filter_size=5,
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) num_filters=20,
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') pool_size=2,
fc = fluid.layers.fc(input=hidden, size=10) pool_stride=2,
loss = fluid.layers.cross_entropy(input=fc, label=label) act="relu")
loss = fluid.layers.mean(loss) conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
return loss conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return [img, label], avg_loss
class TestGraph(unittest.TestCase): class TestGraph(unittest.TestCase):
def test_graph_functions(self): def graph_apis(self, use_cuda=False, for_ci=True):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
loss = residual_block(2) feeds, loss = conv_block()
opt = fluid.optimizer.Adam(learning_rate=0.001) opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss) opt.minimize(loss)
graph = IrGraph(core.Graph(main.desc), for_test=False) graph = IrGraph(core.Graph(main.desc), for_test=False)
backup_graph = graph.clone()
self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
backup_binary = fluid.CompiledProgram(
backup_graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
iters = 5
batch_size = 8
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
def train(binary):
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(binary,
feed=feeder.feed(data),
fetch_list=[loss.name])
print('{}: {}'.format('loss', loss_v))
train(origin_binary)
train(backup_binary)
marked_nodes = set() marked_nodes = set()
for op in graph.all_op_nodes(): for op in graph.all_op_nodes():
if op.name().find('conv2d') > -1: if op.name().find('conv2d') > -1:
marked_nodes.add(op) marked_nodes.add(op)
graph.draw('.', 'residual', marked_nodes) if not for_ci:
graph.draw('.', 'residual', marked_nodes)
backup_marked_nodes = set()
for op in backup_graph.all_op_nodes():
if op.name().find('conv2d') > -1:
backup_marked_nodes.add(op)
backup_graph.draw('.', 'backup', backup_marked_nodes)
self.assertFalse(graph.has_circle()) self.assertFalse(graph.has_circle())
self.assertEqual(graph.graph_num(), 1) self.assertEqual(graph.graph_num(), 1)
nodes = graph.topology_sort() nodes = graph.topology_sort()
...@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase): ...@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase):
graph.safe_remove_nodes(marked_nodes) graph.safe_remove_nodes(marked_nodes)
self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes)) self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
def test_graph_apis_cpu(self):
self.graph_apis(use_cuda=False, for_ci=True)
def test_graph_apis_cuda(self):
if fluid.core.is_compiled_with_cuda():
self.graph_apis(use_cuda=True, for_ci=True)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# see the license for the specific language governing permissions and # see the license for the specific language governing permissions and
# limitations under the license. # limitations under the license.
import os
import unittest import unittest
import random import random
import numpy as np import numpy as np
...@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass ...@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid import core from paddle.fluid import core
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CPU_NUM"] = "1"
def linear_fc(num): def linear_fc(num):
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
...@@ -249,7 +253,11 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -249,7 +253,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes) test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
quantized_main_program = main_graph.to_program() build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
quantized_test_program = test_graph.to_program() quantized_test_program = test_graph.to_program()
iters = 5 iters = 5
batch_size = 8 batch_size = 8
...@@ -264,7 +272,7 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -264,7 +272,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
for _ in range(iters): for _ in range(iters):
data = next(train_reader()) data = next(train_reader())
loss_v = exe.run(program=quantized_main_program, loss_v = exe.run(binary,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
if not for_ci: if not for_ci:
......
...@@ -2002,6 +2002,19 @@ class IrGraph(object): ...@@ -2002,6 +2002,19 @@ class IrGraph(object):
self.graph = graph self.graph = graph
self._for_test = for_test self._for_test = for_test
def clone(self):
"""
Create a new and duplicated IrGraph.
Warns:
The method only clones the graph structure, not its attributes.
Returns:
IrGraph: A new and duplicated graph.
"""
g = self.graph.clone()
return IrGraph(g, self._for_test)
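# Editorial usage sketch (hedged, mirroring the new test_graph.py case earlier
# in this diff):
#   graph = IrGraph(core.Graph(main.desc), for_test=False)
#   backup = graph.clone()
#   assert len(graph.all_nodes()) == len(backup.all_nodes())
# Only the graph structure is duplicated; graph-level attributes are not.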
def is_test(self): def is_test(self):
""" """
If the graph is used for testing, the function returns true. Otherwise, returns false. If the graph is used for testing, the function returns true. Otherwise, returns false.
...@@ -2232,10 +2245,10 @@ class IrGraph(object): ...@@ -2232,10 +2245,10 @@ class IrGraph(object):
Notes: the `graph` cannot contain a circle. Notes: the `graph` cannot contain a circle.
Returns: Returns:
set(IrNode): nodes in topology order. list(IrNode): nodes in topology order.
""" """
ordered_nodes = core.topology_sort(self.graph) ordered_nodes = core.topology_sort(self.graph)
return {IrNode(n) for n in ordered_nodes} return [IrNode(n) for n in ordered_nodes]
def build_adjacency_list(self): def build_adjacency_list(self):
""" """
...@@ -2303,7 +2316,7 @@ class IrGraph(object): ...@@ -2303,7 +2316,7 @@ class IrGraph(object):
""" """
Convert the graph into a Program. Convert the graph into a Program.
Notes: When the graph includes backward operator nodes, the WARN: When the graph includes backward operator nodes, the
conversion process may fail. Usually, this function is conversion process may fail. Usually, this function is
only used to convert a test graph. only used to convert a test graph.
......
...@@ -22,7 +22,8 @@ from . import layers ...@@ -22,7 +22,8 @@ from . import layers
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
class Conv2D(layers.Layer): class Conv2D(layers.Layer):
...@@ -468,3 +469,137 @@ class Embedding(layers.Layer): ...@@ -468,3 +469,137 @@ class Embedding(layers.Layer):
}) })
return out return out
class GRUUnit(layers.Layer):
"""
**GRU unit layer**
if origin_mode is True, then the equation of a gru step is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
if origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
The inputs of the GRU unit include :math:`z_t` and :math:`h_{t-1}`. In terms
of the equations above, :math:`z_t` is split into three parts -
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
implement a full GRU unit operator for an input, a fully
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Args:
input (Variable): The fc transformed input value of current step.
name_scope (str): See base class.
hidden (Variable): The hidden value of gru unit from previous step.
size (integer): The input dimension value.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(D \\times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized to zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
Default: 'sigmoid'
Returns:
tuple: The hidden value, reset-hidden value and gate values.
"""
def __init__(self,
name_scope,
size,
param_attr=None,
bias_attr=None,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False,
dtype='float32'):
super(GRUUnit, self).__init__(name_scope)
activation_dict = dict(
identity=0,
sigmoid=1,
tanh=2,
relu=3, )
self._activation = activation_dict[activation]
self._gate_activation = activation_dict[gate_activation]
self._dtype = dtype
size = size // 3
# create weight
self._weight = self.create_parameter(
attr=param_attr, shape=[size, 3 * size], dtype=dtype)
# create bias
bias_size = [1, 3 * size]
self._bias = self.create_parameter(
attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
def forward(self, input, hidden):
inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
if self._bias:
inputs['Bias'] = self._bias
gate = self._helper.create_variable_for_type_inference(self._dtype)
reset_hidden_pre = self._helper.create_variable_for_type_inference(
self._dtype)
updated_hidden = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type='gru_unit',
inputs=inputs,
outputs={
'Gate': gate,
'ResetHiddenPrev': reset_hidden_pre,
'Hidden': updated_hidden,
},
attrs={
'activation': self._activation,
'gate_activation': self._gate_activation,
})
return updated_hidden, reset_hidden_pre, gate
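# Editorial usage sketch (not part of the original patch; import paths and the
# `_numpy()` call are taken from other files touched in this diff): drive the
# imperative GRUUnit with a step input of width 3*D and a previous hidden
# state of width D, as the new test_gru_unit case in test_layers.py does.
def _gru_unit_usage_sketch():
    import numpy as np
    import paddle.fluid as fluid

    D = 5
    step_input = np.random.rand(3, 3 * D).astype('float32')
    prev_hidden = np.random.rand(3, D).astype('float32')
    with fluid.imperative.base.guard(place=fluid.CPUPlace()):
        gru = GRUUnit('gru', size=D * 3)
        hidden, reset_hidden_pre, gate = gru(
            fluid.imperative.base.to_variable(step_input),
            fluid.imperative.base.to_variable(prev_hidden))
        return hidden._numpy()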
...@@ -516,6 +516,8 @@ def yolov3_loss(x, ...@@ -516,6 +516,8 @@ def yolov3_loss(x,
class_num, class_num,
ignore_thresh, ignore_thresh,
downsample_ratio, downsample_ratio,
gtscore=None,
use_label_smooth=True,
name=None): name=None):
""" """
${comment} ${comment}
...@@ -534,28 +536,35 @@ def yolov3_loss(x, ...@@ -534,28 +536,35 @@ def yolov3_loss(x,
class_num (int): ${class_num_comment} class_num (int): ${class_num_comment}
ignore_thresh (float): ${ignore_thresh_comment} ignore_thresh (float): ${ignore_thresh_comment}
downsample_ratio (int): ${downsample_ratio_comment} downsample_ratio (int): ${downsample_ratio_comment}
name (string): the name of yolov3 loss name (string): the name of yolov3 loss. Default None.
gtscore (Variable): mixup score of ground truth boxes, should be in shape
of [N, B]. Default None.
use_label_smooth (bool): ${use_label_smooth_comment}
Returns: Returns:
Variable: A 1-D tensor with shape [1], the value of yolov3 loss Variable: A 1-D tensor with shape [N], the value of yolov3 loss
Raises: Raises:
TypeError: Input x of yolov3_loss must be Variable TypeError: Input x of yolov3_loss must be Variable
TypeError: Input gtbox of yolov3_loss must be Variable" TypeError: Input gtbox of yolov3_loss must be Variable
TypeError: Input gtlabel of yolov3_loss must be Variable" TypeError: Input gtlabel of yolov3_loss must be Variable
TypeError: Input gtscore of yolov3_loss must be None or Variable
TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr anchors of yolov3_loss must be list or tuple
TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr class_num of yolov3_loss must be an integer
TypeError: Attr ignore_thresh of yolov3_loss must be a float number TypeError: Attr ignore_thresh of yolov3_loss must be a float number
TypeError: Attr use_label_smooth of yolov3_loss must be a bool value
Examples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel,
gtscore=gtscore, anchors=anchors,
anchor_mask=anchor_mask, class_num=80, anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32) ignore_thresh=0.7, downsample_ratio=32)
""" """
...@@ -567,6 +576,8 @@ def yolov3_loss(x, ...@@ -567,6 +576,8 @@ def yolov3_loss(x,
raise TypeError("Input gtbox of yolov3_loss must be Variable") raise TypeError("Input gtbox of yolov3_loss must be Variable")
if not isinstance(gtlabel, Variable): if not isinstance(gtlabel, Variable):
raise TypeError("Input gtlabel of yolov3_loss must be Variable") raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if gtscore is not None and not isinstance(gtscore, Variable):
raise TypeError("Input gtscore of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple): if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple") raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
...@@ -576,6 +587,9 @@ def yolov3_loss(x, ...@@ -576,6 +587,9 @@ def yolov3_loss(x,
if not isinstance(ignore_thresh, float): if not isinstance(ignore_thresh, float):
raise TypeError( raise TypeError(
"Attr ignore_thresh of yolov3_loss must be a float number") "Attr ignore_thresh of yolov3_loss must be a float number")
if not isinstance(use_label_smooth, bool):
raise TypeError(
"Attr use_label_smooth of yolov3_loss must be a bool value")
if name is None: if name is None:
loss = helper.create_variable_for_type_inference(dtype=x.dtype) loss = helper.create_variable_for_type_inference(dtype=x.dtype)
...@@ -586,21 +600,26 @@ def yolov3_loss(x, ...@@ -586,21 +600,26 @@ def yolov3_loss(x,
objectness_mask = helper.create_variable_for_type_inference(dtype='int32') objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
inputs = {
"X": x,
"GTBox": gtbox,
"GTLabel": gtlabel,
}
if gtscore:
inputs["GTScore"] = gtscore
attrs = { attrs = {
"anchors": anchors, "anchors": anchors,
"anchor_mask": anchor_mask, "anchor_mask": anchor_mask,
"class_num": class_num, "class_num": class_num,
"ignore_thresh": ignore_thresh, "ignore_thresh": ignore_thresh,
"downsample_ratio": downsample_ratio, "downsample_ratio": downsample_ratio,
"use_label_smooth": use_label_smooth,
} }
helper.append_op( helper.append_op(
type='yolov3_loss', type='yolov3_loss',
inputs={ inputs=inputs,
"X": x,
"GTBox": gtbox,
"GTLabel": gtlabel,
},
outputs={ outputs={
'Loss': loss, 'Loss': loss,
'ObjectnessMask': objectness_mask, 'ObjectnessMask': objectness_mask,
......
...@@ -1432,6 +1432,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1432,6 +1432,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
predict = fluid.layers.fc(input=net, size=classdim, act='softmax') predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
if not soft_label:
return cross_entropy2(input, label, ignore_index)
helper = LayerHelper('cross_entropy', **locals()) helper = LayerHelper('cross_entropy', **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
...@@ -1444,6 +1446,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1444,6 +1446,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
return out return out
def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
helper = LayerHelper('cross_entropy2', **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype)
xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
match_x = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type='cross_entropy2',
inputs={'X': [input],
'Label': [label]},
outputs={'Y': [out],
'MatchX': [match_x],
'XShape': [xshape]},
attrs={'ignore_index': ignore_index})
return out
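# Editorial sketch (illustrative only; it mirrors the reference implementation
# in the new test_cross_entropy2_op.py later in this diff): with hard labels
# the cross_entropy2 op returns -log of the predicted probability that matches
# each label, and rows whose label equals ignore_index contribute zero loss.
def _cross_entropy2_numpy_reference(probs, label, ignore_index=kIgnoreIndex):
    import numpy as np
    out = np.zeros(label.shape, dtype=probs.dtype)
    for i in range(label.shape[0]):
        if label[i, 0] == ignore_index:
            continue  # ignored rows keep a zero loss
        out[i, 0] = -np.log(probs[i, label[i, 0]])
    return out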
def bpr_loss(input, label, name=None): def bpr_loss(input, label, name=None):
""" """
Bayesian Personalized Ranking Loss Operator. Bayesian Personalized Ranking Loss Operator.
...@@ -2904,11 +2922,17 @@ def batch_norm(input, ...@@ -2904,11 +2922,17 @@ def batch_norm(input,
y_i &\\gets \\gamma \\hat{x_i} + \\beta y_i &\\gets \\gamma \\hat{x_i} + \\beta
Args: Args:
input(variable): The input variable which is a LoDTensor. input(variable): The rank of input variable can be 2, 3, 4, 5.
act(string, Default None): Activation type, linear|relu|prelu|... act(string, Default None): Activation type, linear|relu|prelu|...
is_test(bool, Default False): Used for training or training. is_test (bool, Default False): A flag indicating whether it is in
momentum(float, Default 0.9): test phase or not.
epsilon(float, Default 1e-05): momentum(float, Default 0.9): The value used for the moving_mean and
moving_var computation. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
epsilon(float, Default 1e-05): A value added to the denominator for
numerical stability. Default is 1e-5.
param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as param_attr. If the Initializer of the param_attr will create ParamAttr as param_attr. If the Initializer of the param_attr
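# Editorial sketch (illustrative only): the moving-statistics update described
# in the momentum docstring above, written out for one mini-batch.
def _moving_stats_update_sketch(moving_mean, moving_var, batch_mean, batch_var,
                                momentum=0.9):
    new_moving_mean = moving_mean * momentum + batch_mean * (1.0 - momentum)
    new_moving_var = moving_var * momentum + batch_var * (1.0 - momentum)
    return new_moving_mean, new_moving_var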
...@@ -2966,15 +2990,8 @@ def batch_norm(input, ...@@ -2966,15 +2990,8 @@ def batch_norm(input,
shape=param_shape, shape=param_shape,
dtype=dtype, dtype=dtype,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0))
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.param_attr.learning_rate == 0.:
scale.stop_gradient = True
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.bias_attr.learning_rate == 0.:
bias.stop_gradient = True
mean = helper.create_parameter( mean = helper.create_parameter(
attr=ParamAttr( attr=ParamAttr(
......
...@@ -23,6 +23,7 @@ __activations_noattr__ = [ ...@@ -23,6 +23,7 @@ __activations_noattr__ = [
'logsigmoid', 'logsigmoid',
'exp', 'exp',
'tanh', 'tanh',
'atan',
'tanh_shrink', 'tanh_shrink',
'softshrink', 'softshrink',
'sqrt', 'sqrt',
...@@ -30,6 +31,8 @@ __activations_noattr__ = [ ...@@ -30,6 +31,8 @@ __activations_noattr__ = [
'ceil', 'ceil',
'floor', 'floor',
'cos', 'cos',
'acos',
'asin',
'sin', 'sin',
'round', 'round',
'reciprocal', 'reciprocal',
......
...@@ -476,8 +476,17 @@ class TestYoloDetection(unittest.TestCase): ...@@ -476,8 +476,17 @@ class TestYoloDetection(unittest.TestCase):
x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], gtscore = layers.data(name='gtscore', shape=[10], dtype='float32')
[0, 1], 10, 0.7, 32) loss = layers.yolov3_loss(
x,
gtbox,
gtlabel, [10, 13, 30, 13], [0, 1],
10,
0.7,
32,
gtscore=gtscore,
use_label_smooth=False)
self.assertIsNotNone(loss) self.assertIsNotNone(loss)
def test_yolo_box(self): def test_yolo_box(self):
......
...@@ -22,6 +22,7 @@ import six ...@@ -22,6 +22,7 @@ import six
import time import time
import itertools import itertools
import collections import collections
from collections import defaultdict
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase): ...@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place) outs, _ = self._calc_output(place)
return outs return outs
def _calc_output(self, place, parallel=False, no_check_set=None): def _create_var_from_numpy(self, value):
if isinstance(value, tuple):
data = value[0]
lod = value[1]
v = fluid.imperative.base.to_variable(value=data)
v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
return v
else:
return fluid.imperative.base.to_variable(value)
def _calc_imperative_output(self, place, parallel=False, no_check_set=None):
with fluid.imperative.base.guard(place=place):
block = fluid.default_main_program().global_block()
# prepare input variable
inputs = defaultdict(list)
for name, np_value in six.iteritems(self.inputs):
if not isinstance(np_value, list):
np_value = [np_value]
for i in range(len(np_value)):
inputs[name].append(
self._create_var_from_numpy(np_value[i]))
# prepare output variable
outputs = defaultdict(list)
for name, np_value in six.iteritems(self.outputs):
if not isinstance(np_value, list):
np_value = [np_value]
for i in range(len(np_value)):
value = np_value[i]
if isinstance(value, tuple):
v = block.create_var(
name="%s_out%d" % (name, i),
dtype=value[0].dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
v._ivar.value().get_tensor(
).set_recursive_sequence_lengths(value[1])
else:
v = block.create_var(
name="%s_out%d" % (name, i),
dtype=value.dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
outputs[name].append(v)
block.append_op(
type=self.op_type,
inputs=inputs,
outputs=outputs,
attrs=self.attrs)
return outputs
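# Editorial note (descriptive comment, not in the original patch): the
# imperative path above rebuilds the op under fluid.imperative.base.guard,
# converts the same numpy inputs to imperative variables, creates LoDTensor
# output variables, and returns them so that check_output_with_place can
# compare the imperative results against the static-graph run when
# check_imperative=True.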
def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
self._append_ops(block) self._append_ops(block)
...@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase): ...@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase):
place, place,
atol, atol,
no_check_set=None, no_check_set=None,
equal_nan=False): equal_nan=False,
check_imperative=False):
if check_imperative:
imperative_outs = self._calc_imperative_output(
place, no_check_set=no_check_set)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
continue continue
...@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase): ...@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase):
type(sub_out)) type(sub_out))
for item in sub_out: for item in sub_out:
sub_out_name, expect = item[0], item[1] sub_out_name, expect = item[0], item[1]
if check_imperative:
imperative_actual = imperative_outs[sub_out_name][0]
imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor())
idx = find_actual(sub_out_name, fetch_list) idx = find_actual(sub_out_name, fetch_list)
actual = outs[idx] actual = outs[idx]
actual_t = np.array(actual) actual_t = np.array(actual)
...@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase): ...@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if check_imperative:
self.assertTrue(
np.allclose(
imperative_actual_t,
expect_t,
atol=atol,
equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " +
str(place) + " in imperative mode")
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual( self.assertListEqual(
actual.recursive_sequence_lengths(), expect[1], actual.recursive_sequence_lengths(), expect[1],
"Output (" + sub_out_name + "Output (" + sub_out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative:
self.assertListEqual(
imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode")
else: else:
if check_imperative:
imperative_actual = imperative_outs[out_name][0]
imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor())
idx = find_actual(out_name, fetch_list) idx = find_actual(out_name, fetch_list)
actual = outs[idx] actual = outs[idx]
actual_t = np.array(actual) actual_t = np.array(actual)
...@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase): ...@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase):
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t) + " in class " + self.__class__.__name__)
if check_imperative:
self.assertTrue(
np.allclose(
imperative_actual_t,
expect_t,
atol=atol,
equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" +
str(imperative_actual_t) + " in class " +
self.__class__.__name__)
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative:
self.assertListEqual(
imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode")
def _get_places(self): def _get_places(self):
if self.dtype == np.float16: if self.dtype == np.float16:
...@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase): ...@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False): def check_output(self,
atol=1e-5,
no_check_set=None,
equal_nan=False,
check_imperative=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol, no_check_set, equal_nan) self.check_output_with_place(place, atol, no_check_set, equal_nan,
check_imperative)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -100,6 +100,23 @@ class TestTanh(TestActivation): ...@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAtan(TestActivation):
def setUp(self):
self.op_type = "atan"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.arctan(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestTanhShrink(TestActivation): class TestTanhShrink(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
...@@ -248,6 +265,23 @@ class TestCos(TestActivation): ...@@ -248,6 +265,23 @@ class TestCos(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAcos(TestActivation):
def setUp(self):
self.op_type = "acos"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arccos(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestSin(TestActivation): class TestSin(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "sin" self.op_type = "sin"
...@@ -265,6 +299,23 @@ class TestSin(TestActivation): ...@@ -265,6 +299,23 @@ class TestSin(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAsin(TestActivation):
def setUp(self):
self.op_type = "asin"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arcsin(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestRound(TestActivation): class TestRound(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs) ...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestCeil, grad_check=False)
create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False)
create_test_act_fp16_class(TestCos, grad_atol=0.85) create_test_act_fp16_class(TestCos, grad_atol=0.85)
create_test_act_fp16_class(TestAcos, grad_atol=0.85)
create_test_act_fp16_class(TestSin) create_test_act_fp16_class(TestSin)
create_test_act_fp16_class(TestAsin)
create_test_act_fp16_class(TestAtan)
create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestRelu)
create_test_act_fp16_class(TestGelu) create_test_act_fp16_class(TestGelu)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import unittest
import numpy as np
import six
class CrossEntropy2OpTestBase(OpTest):
def initParameters(self):
return [32, 64], 'float32', -100
def calc_output(self, logits, label, ignore_index):
ret = np.zeros(shape=label.shape, dtype=logits.dtype)
match_x = np.zeros(shape=label.shape, dtype=logits.dtype)
for idx in six.moves.range(label.shape[0]):
if label[idx] == ignore_index:
continue
match_x[idx] = logits[idx][label[idx]]
ret[idx] = -np.log(match_x[idx])
return ret, match_x
def setUp(self):
self.shape, self.dtype, self.ignore_index = self.initParameters()
self.op_type = 'cross_entropy2'
feature_size = int(self.shape[-1])
batch_size = int(np.prod(self.shape) / feature_size)
logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
label = np.random.random_integers(
low=0, high=feature_size - 1,
size=self.shape[0:-1] + [1]).astype('int64')
outputs, match_x = self.calc_output(
np.reshape(logits, [batch_size, feature_size]),
np.reshape(label, [batch_size, 1]), self.ignore_index)
self.inputs = {'X': logits, 'Label': label}
self.outputs = {
'Y': np.reshape(outputs, label.shape),
'MatchX': np.reshape(match_x, label.shape),
'XShape': np.zeros(
shape=logits.shape, dtype=logits.dtype)
}
self.attrs = {'ignore_index': self.ignore_index}
def test_check_output(self):
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(
inputs_to_check=['X'],
output_names=['Y'],
no_grad_set=['XShape', 'MatchX', 'Label'])
class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
def initParameters(self):
return [32, 64], 'float64', 3
class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
def initParameters(self):
return [4, 8, 16, 32], 'float32', -100
class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
def initParameters(self):
return [4, 8, 16, 32], 'float32', 3
if __name__ == '__main__':
unittest.main()
...@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ...@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase): ...@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
......
...@@ -31,6 +31,80 @@ def dequantize_max_abs(x, scale, max_range): ...@@ -31,6 +31,80 @@ def dequantize_max_abs(x, scale, max_range):
return y return y
def channel_wise_quantize_max_abs(x, quant_bit=8):
scales = []
for i in range(x.shape[0]):
scales.append(np.max(np.abs(x[i])).astype("float32"))
y = x.copy()
max_range = math.pow(2, quant_bit - 1) - 1
for i, scale in enumerate(scales):
y[i] = np.round(y[i] / scale * max_range)
return y, scales
def channel_wise_dequantize_max_abs(x,
scales,
quant_bits,
activation_scale=None):
y = x.copy()
for i in range(x.shape[0]):
y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
if activation_scale is not None:
y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
return y
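# Editorial sketch (illustrative only, reusing the helpers above): an 8-bit
# per-channel quantize/dequantize round trip; the per-element reconstruction
# error is at most half a quantization step (scale / (2 * 127)) for that
# channel, so the assertion below is conservative.
def _channel_wise_round_trip_example():
    x = np.random.randn(3, 2, 4, 4).astype("float32")
    yq, scales = channel_wise_quantize_max_abs(x, quant_bit=8)
    ydq = channel_wise_dequantize_max_abs(yq, scales, quant_bits=[8])
    max_step = max(s / 127.0 for s in scales)
    assert np.abs(x - ydq).max() <= max_step
    return ydq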
class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
def set_args(self):
self.quant_bits = [8, 8]
self.data_type = "float32"
self.activation_scale = 0.7861
def setUp(self):
self.set_args()
self.op_type = "fake_channel_wise_dequantize_max_abs"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
self.activation_scale)
self.inputs = {
'X': yq,
'Scales': [("scales0", np.array(scales).astype(self.data_type)),
("scales1", np.array(
[self.activation_scale]).astype(self.data_type))]
}
self.attrs = {'quant_bits': self.quant_bits}
self.outputs = {'Out': ydq}
def test_check_output(self):
self.check_output()
class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
def set_args(self):
self.quant_bits = [8]
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "fake_channel_wise_dequantize_max_abs"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
self.inputs = {
'X': yq,
'Scales': [("scales0", np.array(scales).astype(self.data_type))]
}
self.attrs = {'quant_bits': self.quant_bits}
self.outputs = {'Out': ydq}
def test_check_output(self):
self.check_output()
class TestFakeDequantizeMaxAbsOp(OpTest): class TestFakeDequantizeMaxAbsOp(OpTest):
def set_args(self): def set_args(self):
self.num_bits = 8 self.num_bits = 8
......
...@@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest): ...@@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest):
self.check_output() self.check_output()
class TestFakeChannelWiseQuantizeOp(OpTest):
def setUp(self):
self.op_type = "fake_channel_wise_quantize_abs_max"
self.attrs = {'bit_length': 8}
self.inputs = {
'X': np.random.random((4, 3, 64, 64)).astype("float32"),
}
scales = []
for i in range(self.inputs['X'].shape[0]):
scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
outputs = self.inputs['X'].copy()
for i, scale in enumerate(scales):
outputs[i] = np.round(outputs[i] / scale * (
(1 << (self.attrs['bit_length'] - 1)) - 1))
self.outputs = {
'Out': outputs,
'OutScales': np.array(scales).astype("float32"),
}
def test_check_output(self):
self.check_output()
class TestFakeQuantizeRangeAbsMaxOp(OpTest): class TestFakeQuantizeRangeAbsMaxOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fake_quantize_range_abs_max" self.op_type = "fake_quantize_range_abs_max"
......
...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest): ...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8) self.check_output(atol=1e-8, check_imperative=True)
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
......
...@@ -112,6 +112,47 @@ class TestLayer(LayerTest): ...@@ -112,6 +112,47 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, static_ret2))
def test_gru_unit(self):
lod = [[2, 4, 3]]
D = 5
T = sum(lod[0])
N = len(lod[0])
input = np.random.rand(T, 3 * D).astype('float32')
hidden_input = np.random.rand(T, D).astype('float32')
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3)
static_ret = self.get_static_graph_result(
feed={'x': input,
'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate])
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3)
gru = nn.GRUUnit('gru', size=D * 3)
updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
static_ret2 = self.get_static_graph_result(
feed={'x': input,
'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate])
with self.dynamic_graph():
gru = nn.GRUUnit('gru', size=D * 3)
dy_ret = gru(
base.to_variable(input), base.to_variable(hidden_input))
for i in range(len(static_ret)):
self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
class TestBook(unittest.TestCase): class TestBook(unittest.TestCase):
def test_fit_a_line(self): def test_fit_a_line(self):
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
...@@ -63,5 +64,28 @@ class TestCase2(TestSliceOp): ...@@ -63,5 +64,28 @@ class TestCase2(TestSliceOp):
self.out = self.input[-3:3, 0:100, :, 2:-1] self.out = self.input[-3:3, 0:100, :, 2:-1]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
self.out = self.input[-3:3, 0:100, :, 2:-1]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'], 'Out', max_relative_error=0.006)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import os
import six
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler
class TestSyncBatchNormOpTraining(unittest.TestCase):
def setUp(self):
#self.dtype = np.float32
self.dtype = np.float64
self.N = 32
self.C = 16
self.H = 64
self.W = 32
self.dshape = [self.N, self.C, self.H, self.W]
def build_program(self,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
main = fluid.Program()
startup = fluid.Program()
main.random_seed = seed
startup.random_seed = seed
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='input',
shape=self.dshape,
dtype=self.dtype,
append_batch_size=False)
conv = fluid.layers.conv2d(
input=data,
num_filters=32,
filter_size=1,
param_attr=fluid.ParamAttr(name='conv2d_weight'),
bias_attr=False,
use_cudnn=False)
bn = fluid.layers.batch_norm(
conv,
param_attr=fluid.ParamAttr(name='bn_scale'),
bias_attr=fluid.ParamAttr(name='bn_bias'),
moving_mean_name='bn_moving_mean',
moving_variance_name='bn_moving_variance',
data_layout=layout,
is_test=only_forward)
sigmoid = fluid.layers.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
if not sync_bn:
out = out / core.get_cuda_device_count()
if not only_forward:
sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
sgd_opt.backward(out)
return main, startup, [out, conv, bn]
def compare(self, place, layout, only_forward):
seed = 10
os.environ['FLAGS_cudnn_deterministic'] = "1"
data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
# Single-GPU, N = 32 per GPU
main, startup, outs = self.build_program(place, layout, seed, False,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
]
fetch_names += others
bn_fetches = exe.run(program=main,
feed={'input': data},
fetch_list=fetch_names)
#####################################################################
# Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
main, startup, outs = self.build_program(place, layout, seed, True,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
]
fetch_names += others
for nm in fetch_names:
fv = fluid.framework._get_var(str(nm), program=main)
fv.persistable = True
build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
comp_prog = compiler.CompiledProgram(main).with_data_parallel(
outs[0].name if not only_forward else None,
build_strategy=build_strategy)
sync_bn_fetches = exe.run(program=comp_prog,
feed={'input': data},
fetch_list=fetch_names)
for i in six.moves.xrange(1, len(sync_bn_fetches)):
bn_val = bn_fetches[i]
sync_bn_val = sync_bn_fetches[i]
if sync_bn_val.shape != bn_val.shape:
sync_bn_val = sync_bn_val[:bn_val.shape[0]]
self.assertTrue(
np.allclose(
bn_val, sync_bn_val, atol=1e-3),
"Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " +
str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
def test_train(self):
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, False)
def test_infer(self):
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, True)
if __name__ == '__main__':
unittest.main()
...@@ -23,8 +23,8 @@ from op_test import OpTest ...@@ -23,8 +23,8 @@ from op_test import OpTest
from paddle.fluid import core from paddle.fluid import core
def l2loss(x, y): def l1loss(x, y):
return 0.5 * (y - x) * (y - x) return abs(x - y)
def sce(x, label): def sce(x, label):
...@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2): ...@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2):
return inter_area / union return inter_area / union
def YOLOv3Loss(x, gtbox, gtlabel, attrs): def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
n, c, h, w = x.shape n, c, h, w = x.shape
b = gtbox.shape[1] b = gtbox.shape[1]
anchors = attrs['anchors'] anchors = attrs['anchors']
...@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ...@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
mask_num = len(anchor_mask) mask_num = len(anchor_mask)
class_num = attrs["class_num"] class_num = attrs["class_num"]
ignore_thresh = attrs['ignore_thresh'] ignore_thresh = attrs['ignore_thresh']
downsample = attrs['downsample'] downsample_ratio = attrs['downsample_ratio']
input_size = downsample * h use_label_smooth = attrs['use_label_smooth']
input_size = downsample_ratio * h
x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
loss = np.zeros((n)).astype('float32') loss = np.zeros((n)).astype('float32')
label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
label_neg = 1.0 / class_num if use_label_smooth else 0.0
pred_box = x[:, :, :, :, :4].copy() pred_box = x[:, :, :, :, :4].copy()
grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
class_num)
mask_anchors = [] mask_anchors = []
for m in anchor_mask: for m in anchor_mask:
mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
...@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ...@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
ty = gtbox[i, j, 1] * w - gj ty = gtbox[i, j, 1] * w - gj
tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j]
loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale
loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale
objness[i, an_idx * h * w + gj * w + gi] = 1.0 objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
for label_idx in range(class_num): for label_idx in range(class_num):
loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos
float(label_idx == gtlabel[i, j])) if label_idx == gtlabel[i, j] else
label_neg) * gtscore[i, j]
for j in range(mask_num * h * w): for j in range(mask_num * h * w):
if objness[i, j] > 0: if objness[i, j] > 0:
loss[i] += sce(pred_obj[i, j], 1.0) loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j]
elif objness[i, j] == 0: elif objness[i, j] == 0:
loss[i] += sce(pred_obj[i, j], 0.0) loss[i] += sce(pred_obj[i, j], 0.0)
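Putting the pieces of the hunk above together, each matched ground-truth box now contributes coordinate, size, and class terms that are all weighted by its gtscore, and the matched cell's objectness target becomes gtscore instead of a hard 1.0. A standalone sketch with scalar placeholders (all values made up for illustration):

```python
import numpy as np

def sce(x, label):
    s = 1.0 / (1.0 + np.exp(-x))
    return -label * np.log(s) - (1.0 - label) * np.log(1.0 - s)

def l1loss(x, y):
    return abs(x - y)

class_num, gt_label, gt_score = 5, 2, 0.7
gt_w, gt_h = 0.2, 0.3                            # normalized ground-truth width/height
px, py, pw, ph = 0.1, -0.2, 0.05, 0.4            # raw predictions at the matched cell
tx, ty, tw, th = 0.4, 0.6, -0.1, 0.2             # regression targets for that cell
p_cls = np.array([0.3, -0.1, 1.2, 0.0, -0.5])    # raw class logits at the cell
label_pos, label_neg = 1.0 - 1.0 / class_num, 1.0 / class_num

scale = (2.0 - gt_w * gt_h) * gt_score           # small boxes weighted up, then by gtscore
loss = sce(px, tx) * scale + sce(py, ty) * scale
loss += l1loss(pw, tw) * scale + l1loss(ph, th) * scale
for label_idx in range(class_num):
    target = label_pos if label_idx == gt_label else label_neg
    loss += sce(p_cls[label_idx], target) * gt_score
```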
...@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest): ...@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest):
"anchor_mask": self.anchor_mask, "anchor_mask": self.anchor_mask,
"class_num": self.class_num, "class_num": self.class_num,
"ignore_thresh": self.ignore_thresh, "ignore_thresh": self.ignore_thresh,
"downsample": self.downsample, "downsample_ratio": self.downsample_ratio,
"use_label_smooth": self.use_label_smooth,
} }
self.inputs = { self.inputs = {
...@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest): ...@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest):
'GTBox': gtbox.astype('float32'), 'GTBox': gtbox.astype('float32'),
'GTLabel': gtlabel.astype('int32'), 'GTLabel': gtlabel.astype('int32'),
} }
loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
gtscore = np.ones(self.gtbox_shape[:2]).astype('float32')
if self.gtscore:
gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32')
self.inputs['GTScore'] = gtscore
loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore,
self.attrs)
self.outputs = { self.outputs = {
'Loss': loss, 'Loss': loss,
'ObjectnessMask': objness, 'ObjectnessMask': objness,
...@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest): ...@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest):
def test_check_output(self): def test_check_output(self):
place = core.CPUPlace() place = core.CPUPlace()
self.check_output_with_place(place, atol=1e-3) self.check_output_with_place(place, atol=2e-3)
def test_check_grad_ignore_gtbox(self): def test_check_grad_ignore_gtbox(self):
place = core.CPUPlace() place = core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2)
place, ['X'],
'Loss', def initTestCase(self):
no_grad_set=set(["GTBox", "GTLabel"]), self.anchors = [
max_relative_error=0.3) 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5
self.ignore_thresh = 0.7
self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4)
self.gtscore = True
self.use_label_smooth = True
class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp):
def initTestCase(self):
self.anchors = [
10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5
self.ignore_thresh = 0.7
self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4)
self.gtscore = True
self.use_label_smooth = False
class TestYolov3LossNoGTScore(TestYolov3LossOp):
def initTestCase(self): def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23] self.anchors = [
self.anchor_mask = [1, 2] 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5 self.class_num = 5
self.ignore_thresh = 0.5 self.ignore_thresh = 0.7
self.downsample = 32 self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4) self.gtbox_shape = (3, 5, 4)
self.gtscore = False
self.use_label_smooth = True
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -30,6 +30,6 @@ if error: ...@@ -30,6 +30,6 @@ if error:
'''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI: '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
1. cd ${paddle_path}, compile paddle; 1. cd ${paddle_path}, compile paddle;
2. pip install build/python/dist/(build whl package); 2. pip install build/python/dist/(build whl package);
3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"''' 3. run "python tools/print_signatures.py paddle.fluid,paddle.reader > paddle/fluid/API.spec"'''
) )
sys.exit(1) sys.exit(1)
...@@ -51,6 +51,8 @@ def visit_member(parent_name, member): ...@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
all = (args, doc) all = (args, doc)
member_dict[cur_name] = all member_dict[cur_name] = all
except TypeError: # special for PyBind method except TypeError: # special for PyBind method
if cur_name in check_modules_list:
return
member_dict[cur_name] = " ".join([ member_dict[cur_name] = " ".join([
line.strip() for line in pydoc.render_doc(member).split('\n') line.strip() for line in pydoc.render_doc(member).split('\n')
if "->" in line if "->" in line
...@@ -78,6 +80,7 @@ def visit_all_module(mod): ...@@ -78,6 +80,7 @@ def visit_all_module(mod):
visit_member(mod.__name__, instance) visit_member(mod.__name__, instance)
check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
modules = sys.argv[1].split(",") modules = sys.argv[1].split(",")
for m in modules: for m in modules:
visit_all_module(importlib.import_module(m)) visit_all_module(importlib.import_module(m))
......
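The one-character fix in the printed command above matters because print_signatures.py splits its module argument on a bare comma; with the old space, a stray leading blank would survive the split and the subsequent import would fail. A minimal illustration:

```python
modules = "paddle.fluid, paddle.reader".split(",")
# -> ['paddle.fluid', ' paddle.reader']; importing ' paddle.reader' fails
#    because of the leading space.
modules = "paddle.fluid,paddle.reader".split(",")
# -> ['paddle.fluid', 'paddle.reader'], which import_module handles cleanly.
```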
...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object): ...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
event['args'] = args event['args'] = args
self._events.append(event) self._events.append(event)
def emit_counter(self, category, name, pid, timestamp, counter, value):
"""Emits a record for a single counter.
Args:
category: The event category as string
name: The event name as string
pid: Identifier of the process generating this event as integer
            timestamp: The timestamp of this event as long integer
            counter: Name of the counter as string
            value: Value of the counter as integer
"""
event = self._create_event('C', category, name, pid, 0, timestamp)
event['args'] = {counter: value}
self._events.append(event)
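For reference, the record appended by emit_counter looks roughly like the following in the resulting Chrome trace, assuming _create_event fills the usual ph/cat/name/pid/tid/ts fields (the concrete values here are made up; the memory tracker below passes 0 as the counter key and the running total as the value):

```python
event = {
    'ph': 'C',              # 'C' marks a counter event in chrome://tracing
    'cat': 'Memory',
    'name': 'Memory',
    'pid': 3,               # lane allocated for the (host, device, place)
    'tid': 0,
    'ts': 1190000,          # timestamp; nanoseconds are kept as-is
    'args': {0: 16777216},  # counter key -> current value in bytes
}
```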
def format_to_string(self, pretty=False): def format_to_string(self, pretty=False):
"""Formats the chrome trace to a string. """Formats the chrome trace to a string.
...@@ -117,6 +133,7 @@ class Timeline(object): ...@@ -117,6 +133,7 @@ class Timeline(object):
self._profile_dict = profile_dict self._profile_dict = profile_dict
self._pid = 0 self._pid = 0
self._devices = dict() self._devices = dict()
self._mem_devices = dict()
self._chrome_trace = _ChromeTraceFormatter() self._chrome_trace = _ChromeTraceFormatter()
def _allocate_pid(self): def _allocate_pid(self):
...@@ -143,6 +160,47 @@ class Timeline(object): ...@@ -143,6 +160,47 @@ class Timeline(object):
self._devices[(k, event.device_id, "GPUKernel")] = pid self._devices[(k, event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("%s:gpu:%d" % self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid) (k, event.device_id), pid)
if not hasattr(profile_pb, "mem_events"):
continue
for mevent in profile_pb.mem_events:
if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
if (k, mevent.device_id, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "GPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:gpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
if (k, mevent.device_id, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "CPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
if (k, mevent.device_id, "CUDAPinnedPlace"
) not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id,
"CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
(k, 0), pid)
if (k, 0, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "GPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
(k, 0), pid)
if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
def _allocate_events(self): def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict): for k, profile_pb in six.iteritems(self._profile_dict):
...@@ -155,7 +213,7 @@ class Timeline(object): ...@@ -155,7 +213,7 @@ class Timeline(object):
args = {'name': event.name} args = {'name': event.name}
if event.memcopy.bytes > 0: if event.memcopy.bytes > 0:
args['mem_bytes'] = event.memcopy.bytes args['mem_bytes'] = event.memcopy.bytes
if event.detail_info: if hasattr(event, "detail_info") and event.detail_info:
args['detail_info'] = event.detail_info args['detail_info'] = event.detail_info
# TODO(panyx0718): Chrome tracing only handles ms. However, some # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here. # ops takes micro-seconds. Hence, we keep the ns here.
...@@ -163,9 +221,59 @@ class Timeline(object): ...@@ -163,9 +221,59 @@ class Timeline(object):
event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
event.sub_device_id, 'Op', event.name, args) event.sub_device_id, 'Op', event.name, args)
def _allocate_memory_event(self):
if not hasattr(profiler_pb2, "MemEvent"):
return
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
end_profiler = 0
for mevent in profile_pb.mem_events:
crt_info = dict()
crt_info['time'] = mevent.start_ns
crt_info['size'] = mevent.bytes
if mevent.place in place_to_str:
place = place_to_str[mevent.place]
else:
place = "UnDefine"
crt_info['place'] = place
pid = self._mem_devices[(k, mevent.device_id, place)]
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
mem_list.append(crt_info)
crt_info = dict()
crt_info['place'] = place
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
crt_info['time'] = mevent.end_ns
crt_info['size'] = -mevent.bytes
mem_list.append(crt_info)
end_profiler = max(end_profiler, crt_info['time'])
mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
i = 0
total_size = 0
while i < len(mem_list):
total_size += mem_list[i]['size']
while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
i + 1]['time']:
total_size += mem_list[i + 1]['size']
i += 1
self._chrome_trace.emit_counter(
"Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
0, total_size)
i += 1
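The loop above turns the sorted (+bytes at start_ns, -bytes at end_ns) entries into a running total, merging entries that share a timestamp into a single counter sample. A tiny standalone check of that logic (made-up sizes and times):

```python
mem_list = [
    {'time': 0, 'size': 1024},
    {'time': 0, 'size': 2048},
    {'time': 10, 'size': -1024},
    {'time': 20, 'size': -2048},
]
mem_list.sort(key=lambda tmp: tmp.get('time', 0))
samples, total_size, i = [], 0, 0
while i < len(mem_list):
    total_size += mem_list[i]['size']
    while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[i + 1]['time']:
        total_size += mem_list[i + 1]['size']
        i += 1
    samples.append((mem_list[i]['time'], total_size))
    i += 1
assert samples == [(0, 3072), (10, 2048), (20, 0)]
```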
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
self._allocate_events() self._allocate_events()
self._allocate_memory_event()
return self._chrome_trace.format_to_string() return self._chrome_trace.format_to_string()
......