PaddlePaddle/Paddle, commit 74037cc1 (unverified)

    Merge branch 'develop' into yolo_box

Authored by Kaipeng Deng on Mar 15, 2019; committed via GitHub on Mar 15, 2019.
Parents: 0ff9a403, 92b9ce34
Showing 100 changed files with 4,527 additions and 911 deletions (+4527, -911).
Changed files:

benchmark/fluid/fluid_benchmark.py  (+0, -1)
cmake/operators.cmake  (+1, -1)
paddle/fluid/API.spec  (+5, -2)
paddle/fluid/framework/details/build_strategy.cc  (+7, -0)
paddle/fluid/framework/details/build_strategy.h  (+2, -0)
paddle/fluid/framework/details/inplace_op_pass.cc  (+5, -0)
paddle/fluid/framework/details/memory_optimize_helper.cc  (+0, -1)
paddle/fluid/framework/details/memory_optimize_pass.cc  (+5, -0)
paddle/fluid/framework/executor.cc  (+7, -4)
paddle/fluid/framework/ir/CMakeLists.txt  (+4, -0)
paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc  (+146, -0)
paddle/fluid/framework/ir/cpu_quantize_squash_pass.h  (+58, -0)
paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc  (+179, -0)
paddle/fluid/framework/ir/graph.cc  (+34, -1)
paddle/fluid/framework/ir/graph.h  (+11, -1)
paddle/fluid/framework/ir/graph_pattern_detector.cc  (+45, -0)
paddle/fluid/framework/ir/graph_pattern_detector.h  (+31, -0)
paddle/fluid/framework/ir/node.h  (+1, -0)
paddle/fluid/framework/ir/sync_batch_norm_pass.cc  (+45, -0)
paddle/fluid/framework/ir/sync_batch_norm_pass.h  (+32, -0)
paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc  (+80, -0)
paddle/fluid/framework/operator.cc  (+7, -5)
paddle/fluid/framework/operator.h  (+9, -0)
paddle/fluid/framework/parallel_executor.cc  (+42, -11)
paddle/fluid/framework/parallel_executor.h  (+7, -2)
paddle/fluid/memory/allocation/CMakeLists.txt  (+1, -1)
paddle/fluid/memory/allocation/legacy_allocator.cc  (+8, -4)
paddle/fluid/operators/CMakeLists.txt  (+4, -2)
paddle/fluid/operators/activation_op.cc  (+47, -0)
paddle/fluid/operators/activation_op.h  (+100, -3)
paddle/fluid/operators/batch_norm_op.cc  (+228, -246)
paddle/fluid/operators/batch_norm_op.cu  (+19, -39)
paddle/fluid/operators/batch_norm_op.h  (+72, -2)
paddle/fluid/operators/cross_entropy_op.cc  (+168, -19)
paddle/fluid/operators/cross_entropy_op.cu  (+10, -0)
paddle/fluid/operators/cross_entropy_op.h  (+120, -0)
paddle/fluid/operators/detection/box_coder_op.h  (+52, -37)
paddle/fluid/operators/detection/yolov3_loss_op.cc  (+33, -0)
paddle/fluid/operators/detection/yolov3_loss_op.h  (+79, -26)
paddle/fluid/operators/expand_op.cc  (+18, -1)
paddle/fluid/operators/fake_dequantize_op.cc  (+66, -0)
paddle/fluid/operators/fake_dequantize_op.cu  (+4, -0)
paddle/fluid/operators/fake_dequantize_op.h  (+38, -0)
paddle/fluid/operators/fake_quantize_op.cc  (+61, -0)
paddle/fluid/operators/fake_quantize_op.cu  (+2, -0)
paddle/fluid/operators/fake_quantize_op.h  (+33, -0)
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc  (+2, -3)
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  (+2, -0)
paddle/fluid/operators/hash_op.cc  (+2, -3)
paddle/fluid/operators/math.h  (+42, -0)
paddle/fluid/operators/math/cross_entropy.cu  (+1, -12)
paddle/fluid/operators/ngraph/ngraph_engine.cc  (+362, -242)
paddle/fluid/operators/ngraph/ngraph_engine.h  (+45, -16)
paddle/fluid/operators/ngraph/ngraph_engine_op.cc  (+1, -0)
paddle/fluid/operators/ngraph/ngraph_engine_op.h  (+1, -3)
paddle/fluid/operators/reshape_op.cc  (+0, -8)
paddle/fluid/operators/selu_op.h  (+2, -3)
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc  (+2, -3)
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu  (+1, -3)
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu  (+1, -5)
paddle/fluid/operators/slice_op.cu  (+122, -2)
paddle/fluid/operators/sync_batch_norm_op.cc  (+20, -0)
paddle/fluid/operators/sync_batch_norm_op.cu  (+452, -0)
paddle/fluid/platform/device_context.cc  (+1, -1)
paddle/fluid/platform/device_context.h  (+13, -0)
paddle/fluid/platform/device_tracer.cc  (+55, -4)
paddle/fluid/platform/device_tracer.h  (+21, -0)
paddle/fluid/platform/event.h  (+33, -0)
paddle/fluid/platform/init.cc  (+3, -0)
paddle/fluid/platform/nccl_helper.h  (+4, -0)
paddle/fluid/platform/profiler.cc  (+190, -67)
paddle/fluid/platform/profiler.h  (+76, -1)
paddle/fluid/platform/profiler.proto  (+17, -0)
paddle/fluid/pybind/ir.cc  (+6, -2)
paddle/fluid/pybind/pybind.cc  (+25, -1)
paddle/scripts/paddle_build.sh  (+6, -2)
python/paddle/fluid/__init__.py  (+4, -1)
python/paddle/fluid/compiler.py  (+18, -13)
python/paddle/fluid/contrib/slim/tests/test_graph.py  (+72, -31)
python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py  (+10, -2)
python/paddle/fluid/framework.py  (+16, -3)
python/paddle/fluid/imperative/nn.py  (+136, -1)
python/paddle/fluid/layers/detection.py  (+31, -12)
python/paddle/fluid/layers/nn.py  (+28, -11)
python/paddle/fluid/layers/ops.py  (+3, -0)
python/paddle/fluid/tests/test_detection.py  (+11, -2)
python/paddle/fluid/tests/unittests/op_test.py  (+112, -4)
python/paddle/fluid/tests/unittests/test_activation_op.py  (+54, -0)
python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py  (+82, -0)
python/paddle/fluid/tests/unittests/test_dist_transpiler.py  (+10, -10)
python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py  (+74, -0)
python/paddle/fluid/tests/unittests/test_fake_quantize_op.py  (+24, -0)
python/paddle/fluid/tests/unittests/test_gru_op.py  (+1, -1)
python/paddle/fluid/tests/unittests/test_layers.py  (+41, -0)
python/paddle/fluid/tests/unittests/test_slice_op.py  (+24, -0)
python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py  (+159, -0)
python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py  (+70, -28)
tools/diff_api.py  (+1, -1)
tools/print_signatures.py  (+3, -0)
tools/timeline.py  (+109, -1)
benchmark/fluid/fluid_benchmark.py  (+0, -1)

```diff
@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy().ReduceStrategy.AllReduce
-    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
     avg_loss = train_args[0]
```
cmake/operators.cmake  (+1, -1)

```diff
@@ -110,7 +110,7 @@ function(op_library TARGET)
   # Define operators that don't need pybind here.
   foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()
```
paddle/fluid/API.spec  (+5, -2)

```diff
@@ -91,7 +91,7 @@
 paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
 paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
 paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497'))
-paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d'))
+paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464'))
 paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab'))
 paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb'))
 paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9'))
@@ -293,6 +293,7 @@
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
 paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
 paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
@@ -300,6 +301,8 @@
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
 paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
+paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
 paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
 paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
 paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
@@ -327,7 +330,7 @@
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
 paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
```
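Per the new ArgSpec, yolov3_loss gains a `gtscore` input and a `use_label_smooth` switch (defaults None and True). Below is a minimal sketch of calling the updated layer; the shapes and anchor values are illustrative only, and just the argument list is taken from the ArgSpec above:

```python
import paddle.fluid as fluid

# Illustrative shapes/anchors; only the argument names come from the new ArgSpec.
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gt_box = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
gt_label = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
gt_score = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')

loss = fluid.layers.yolov3_loss(
    x=x, gtbox=gt_box, gtlabel=gt_label,
    anchors=[10, 13, 16, 30, 33, 23],
    anchor_mask=[0, 1, 2],
    class_num=80,
    ignore_thresh=0.7,
    downsample_ratio=32,
    gtscore=gt_score,        # new in this commit
    use_label_smooth=True)   # new in this commit
```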
paddle/fluid/framework/details/build_strategy.cc  (+7, -0)

```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <memory>
 #include <utility>
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
@@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("sequential_execution_pass");
     }

+    // Add op fusion.
+    if (strategy.sync_batch_norm_) {
+      AppendPass("sync_batch_norm_pass");
+    }
+
     // Add op fusion.
     if (strategy.fuse_relu_depthwise_conv_) {
       AppendPass("fuse_relu_depthwise_conv_pass");
@@ -227,6 +233,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 }  // namespace framework
 }  // namespace paddle

+USE_PASS(sync_batch_norm_pass);
 USE_PASS(fuse_relu_depthwise_conv_pass);
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
```
paddle/fluid/framework/details/build_strategy.h  (+2, -0)

```diff
@@ -77,6 +77,8 @@ struct BuildStrategy {
   bool fuse_relu_depthwise_conv_{false};

+  bool sync_batch_norm_{false};
+
   bool memory_optimize_{true};
   // TODO(dzhwinter):
   // make enable_inplace, memory_optimize_
```
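This new `sync_batch_norm_` flag is what makes the pass builder above append `sync_batch_norm_pass`. A hedged sketch of how a user would enable it from Python, assuming the pybind.cc change in this commit (+25, -1, not shown on this page) exposes the C++ field as `sync_batch_norm`:

```python
import paddle.fluid as fluid

# Assumption: BuildStrategy::sync_batch_norm_ is exposed to Python as
# `sync_batch_norm` by this commit's pybind.cc change.
build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True

# With the flag set, the appended pass rewrites every batch_norm /
# batch_norm_grad op to sync_batch_norm / sync_batch_norm_grad before
# execution (see sync_batch_norm_pass.cc later in this diff).
```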
paddle/fluid/framework/details/inplace_op_pass.cc  (+5, -0)

```diff
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <deque>
 #include <iterator>
 #include <memory>
 #include <stack>
 #include <string>
 #include <unordered_map>
@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                           ir::Graph* graph) const {
   VLOG(4) << "Try to inplace op " << op->Name();
+  // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
+  // ProgramDescs.
+  // The operations related to BlockDesc or ProgramDesc should perform on Graph
+  // or Node directly!
   PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
                  "op_desc is nullptr");
   // some pre-requirments need to meet if the op want to inplaced.
```
paddle/fluid/framework/details/memory_optimize_helper.cc  (+0, -1)

```diff
@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
   // only these types holds bulk of gpu memory
   if (!(type == proto::VarType::LOD_TENSOR ||
         type == proto::VarType::SELECTED_ROWS ||
         type == proto::VarType::LOD_TENSOR_ARRAY)) {
     return false;
   }
```
paddle/fluid/framework/details/memory_optimize_pass.cc  (+5, -0)

```diff
@@ -24,6 +24,7 @@
 #include <sstream>
 #include <string>
 #include <type_traits>
 #include <unordered_set>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
@@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
       // immediately to make the subblock variable reuse strategy take
       // effect. Because it is a single op in graph. No need to
       // update the ir nodes.
+      // FIXME(liuwei1031): Graph is not aware of the existence of
+      // BlockDescs and ProgramDescs.
+      // The operations related to BlockDesc or ProgramDesc should perform
+      // on Graph or Node directly!
       sub_op_desc->Rename(var->Name(), cache->Name());
       if (sub_op_desc->Block() != nullptr &&
           sub_op_desc->Block()->HasVar(var->Name())) {
```
paddle/fluid/framework/executor.cc  (+7, -4)

```diff
@@ -34,11 +34,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 #endif

 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
-DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");

 namespace paddle {
 namespace framework {
@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool force_disable_gc) {
   platform::RecordBlock b(block_id);
   if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
-#endif
   auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) {
+    paddle::operators::NgraphEngine::FuseNgraphOps(
+        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
+  }
+#endif
   return ctx;
 }
```
paddle/fluid/framework/ir/CMakeLists.txt  (+4, -0)

```diff
@@ -46,6 +46,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
+pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -66,6 +67,7 @@ pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
+pass_library(sync_batch_norm_pass base)
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -100,6 +102,8 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
+cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 if (WITH_MKLDNN)
     cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
```
paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc  (new file, +146)

```cpp
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace framework {
namespace ir {

using string::PrettyLogDetail;

void CPUQuantizeSquashPass::FindNodesToKeep(
    Graph* graph,
    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
  GraphPatternDetector gpd;
  patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"};
  deq_any_pattern();

  int found_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);

    if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
      (*nodes_keep_counter)[dequant_out] = 1;
    else
      (*nodes_keep_counter)[dequant_out] += 1;

    found_count++;
  };
  gpd(graph, handler);
  AddStatis(found_count);
}

void CPUQuantizeSquashPass::Squash(
    Graph* graph,
    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
  GraphPatternDetector gpd;
  patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
  squash_pattern();

  int found_squash_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "squash requantize-quantize ops pair";

    GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);

    auto* next_op_desc = next_op->Op();
    float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
    float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
    PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
                   nodes_keep_counter->end());

    // check if dequantize op should be kept or removed, decrease the counter
    bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;

    if (dequant_scale == quant_scale) {
      // squash dequantize-quantize to nothing
      auto quant_out_var_name = quant_out->Name();
      auto next_op_inputs = next_op_desc->InputNames();
      for (const auto& name : next_op_inputs) {
        auto var_name = next_op_desc->Input(name)[0];
        if (var_name.compare(quant_out_var_name) == 0) {
          next_op_desc->SetInput(
              name, std::vector<std::string>({dequant_in->Name()}));
          break;
        }
      }

      if (keep_dequant)
        GraphSafeRemoveNodes(graph, {quant_op, quant_out});
      else
        GraphSafeRemoveNodes(graph,
                             {dequant_op, quant_op, dequant_out, quant_out});

      IR_NODE_LINK_TO(dequant_in, next_op);

      found_squash_count++;
    } else {
      // squash dequantize-quantize to requantize op
      OpDesc desc;
      desc.SetType("requantize");
      desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
      desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
      desc.SetAttr("Scale_in", dequant_scale);
      desc.SetAttr("Scale_out", quant_scale);

      auto requant_op = g->CreateOpNode(&desc);

      if (keep_dequant)
        GraphSafeRemoveNodes(graph, {quant_op});
      else
        GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});

      IR_NODE_LINK_TO(dequant_in, requant_op);
      IR_NODE_LINK_TO(requant_op, quant_out);

      found_squash_count++;
    }
  };
  gpd(graph, handler);
  AddStatis(found_squash_count);
  PrettyLogDetail("---    squashed %d dequantize-quantize pairs",
                  found_squash_count);
}

std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("cpu_quantize_squash_pass", graph.get());

  std::unordered_map<const Node*, int> nodes_keep_counter;
  FindNodesToKeep(graph.get(), &nodes_keep_counter);
  Squash(graph.get(), &nodes_keep_counter);

  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(cpu_quantize_squash_pass,
              paddle::framework::ir::CPUQuantizeSquashPass);
```
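The squash is sound because a dequantize followed by a quantize is a single rescaling: the pair collapses to one requantize with the ratio Scale_out/Scale_in, and to the identity when the two scales match. A minimal NumPy sketch of that arithmetic, assuming the usual multiply-on-quantize, divide-on-dequantize convention (the helper names here are hypothetical, not Paddle APIs):

```python
import numpy as np

def quantize(x_fp32, scale):
    # FP32 -> INT8, assuming quantize multiplies by its scale.
    return np.clip(np.round(x_fp32 * scale), -128, 127).astype(np.int8)

def dequantize(x_int8, scale):
    # INT8 -> FP32, assuming dequantize divides by its scale.
    return x_int8.astype(np.float32) / scale

def requantize(x_int8, scale_in, scale_out):
    # The fused op the pass inserts: one rescaling, no FP32 intermediate.
    return np.clip(np.round(x_int8 * (scale_out / scale_in)),
                   -128, 127).astype(np.int8)

x = np.array([-50, 0, 25, 100], dtype=np.int8)
s_in, s_out = 1.2345, 21.0

# dequantize -> quantize pair vs. the squashed requantize op:
paired = quantize(dequantize(x, s_in), s_out)
fused = requantize(x, s_in, s_out)
assert np.array_equal(paired, fused)

# When both scales are equal the pair is the identity, so the pass
# removes it entirely instead of inserting a requantize.
assert np.array_equal(quantize(dequantize(x, s_in), s_in), x)
```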
paddle/fluid/framework/ir/cpu_quantize_squash_pass.h  (new file, +58)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

/*
 * Squash dequantize->quantize pair pattern into requantize op
 */
class CPUQuantizeSquashPass : public FusePassBase {
 public:
  virtual ~CPUQuantizeSquashPass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

  /*
   * For each dequantize's output find the number of operators it is an input to
   */
  void FindNodesToKeep(
      Graph* graph,
      std::unordered_map<const Node*, int>* nodes_keep_counter) const;

  /*
   * Squash dequantize-quantize ops pairs into requantize or nothing
   */
  void Squash(Graph* graph,
              std::unordered_map<const Node*, int>* nodes_keep_counter) const;

  const std::string name_scope_{"squash"};
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```
paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc  (new file, +179)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs, bool use_mkldnn,
           float scale = 0) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  op->SetAttr("use_mkldnn", use_mkldnn);
  op->SetAttr("name", name);
  if (type == "conv2d") {
    op->SetInput("Input", {inputs[0]});
    if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
    if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
    op->SetOutput("Output", {outputs[0]});
  } else if (type == "quantize") {
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("Scale", scale);
  } else if (type == "dequantize") {
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("Scale", scale);
  }
}

// (a,w1,b1)->Conv1->d
// d->Dequant->e
// e->Quant->f
// (f,w2,b2)->Conv2->i
ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
  ProgramDesc prog;
  for (auto& v : std::initializer_list<std::string>(
           {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    if (v.find("w") == 0 || v.find("b") == 0) {
      var->SetPersistable(true);
    }
  }

  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
  SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
  SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);

  return prog;
}

static const std::initializer_list<std::string> variable_names{
    "a", "b", "c", "d", "e", "f", "g", "h"};

// a->Conv1->b
// b->Dequant->c
//
// c->Quant1->d and d->Conv2->e
//
// c->Conv3->f
//
// c->Quant2->g and g->Conv4->h
//
ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
                              float scale3) {
  ProgramDesc prog;
  for (auto& v : variable_names) {
    prog.MutableBlock(0)->Var(v);
  }

  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
  SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);
  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);
  SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);

  return prog;
}

void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto x = scope->Var(var_name);
  auto tensor = x->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32,
                       ::paddle::memory::Allocator::kDefault, 1);
}

void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

  // Init scope, as it is used in pass
  auto place = paddle::platform::CPUPlace();
  NaiveExecutor exe{place};
  Scope scope;
  exe.CreateVariables(prog, 0, true, &scope);

  for (auto& v : variable_names) {
    InitTensorHolder(&scope, place, v.c_str());
  }

  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));

  auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");

  int original_nodes_num = graph->Nodes().size();

  graph = pass->Apply(std::move(graph));

  int current_nodes_num = graph->Nodes().size();

  EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
}

TEST(CpuQuantizeSquashPass, equal_scales) {
  auto scale = 1.2345f;
  auto use_mkldnn = true;
  // Remove 4 nodes: Dequant, Quant, e, f
  auto remove_nodes = 4;
  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);

  use_mkldnn = !use_mkldnn;
  MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
}

TEST(CpuQuantizeSquashPass, inequal_scales) {
  auto scale1 = 1.2345f;
  auto scale2 = 21.0f;
  auto use_mkldnn = true;
  // Remove 3 nodes: Dequant, Quant, e
  // Insert 1 node: requantize
  auto remove_nodes = 2;
  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);

  use_mkldnn = !use_mkldnn;
  MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
}

TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
  // Delete both quantize ops,
  // bypass dequantize in both branches,
  // insert requantize on one branch
  auto scale = 1.2345f;
  auto scale2 = 21.0f;
  auto use_mkldnn = true;
  // Remove 3 nodes: Quant1, Quant2, g
  // Insert 1 node: requantize
  auto remove_nodes = 2;
  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);

  use_mkldnn = !use_mkldnn;
  MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(cpu_quantize_squash_pass);
```
paddle/fluid/framework/ir/graph.cc  (+34, -1)

```diff
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <algorithm>
-#include <unordered_set>
+#include <unordered_map>

 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -152,6 +152,39 @@ void Graph::ResolveHazard(
   }
 }

+std::shared_ptr<Graph> Graph::Clone() {
+  auto cloned_graph = std::make_shared<Graph>(this->program_);
+  cloned_graph->ReleaseNodes();
+  cloned_graph->num_node_created_ = 0;
+  std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
+  for (auto *n : this->node_set_) {
+    ir::Node *cloned_node = nullptr;
+    if (n->IsCtrlVar()) {
+      cloned_node = cloned_graph->CreateControlDepVar();
+    } else if (!n->var_desc_ && !n->op_desc_) {  // empty node
+      cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
+    } else if (n->IsVar()) {
+      cloned_node = cloned_graph->CreateVarNode(n->Var());
+    } else if (n->IsOp()) {
+      cloned_node = cloned_graph->CreateOpNode(n->Op());
+    }
+    if (cloned_node) {
+      origin_to_cloned[n] = cloned_node;
+    } else {
+      PADDLE_THROW("The cloned node's type is not supported!");
+    }
+  }
+  for (auto *n : this->node_set_) {
+    for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
+      origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
+    }
+    for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
+      origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
+    }
+  }
+  return cloned_graph;
+}
+
 bool IsControlDepVar(const ir::Node &var) {
   return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
 }
```
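Graph::Clone() works in two passes: first map every original node to a fresh copy, then rewire all edges through that map so no clone ever points back into the source graph. A framework-free sketch of the same idea (Node here is a stand-in, not Paddle's ir::Node):

```python
class Node:
    def __init__(self, name):
        self.name = name
        self.inputs = []
        self.outputs = []

def clone_graph(nodes):
    # Pass 1: copy every node, remembering original -> clone.
    origin_to_cloned = {n: Node(n.name) for n in nodes}
    # Pass 2: rewire edges entirely through the map.
    for n in nodes:
        origin_to_cloned[n].inputs = [origin_to_cloned[i] for i in n.inputs]
        origin_to_cloned[n].outputs = [origin_to_cloned[o] for o in n.outputs]
    return list(origin_to_cloned.values())

a, b = Node("a"), Node("b")
a.outputs.append(b)
b.inputs.append(a)
ca, cb = clone_graph([a, b])
assert ca.outputs[0] is cb and cb.inputs[0] is ca  # clones link to clones
```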
paddle/fluid/framework/ir/graph.h  (+11, -1)

```diff
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/node.h"
@@ -199,7 +200,12 @@ class Graph {
   // WARN: After a series of passes, the current graph can be quite
   // different from OriginProgram. Caller shouldn't assume much from
   // the returned OriginProgram.
-  const ProgramDesc &OriginProgram() const { return program_; }
+  const ProgramDesc &OriginProgram() const {
+    LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
+                    "quite different from OriginProgram. So, please avoid "
+                    "using the `OriginProgram()` method!";
+    return program_;
+  }

   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
@@ -212,6 +218,10 @@ class Graph {
   void ResolveHazard(
       const std::map<std::string, std::vector<ir::Node *>> &var_nodes);

+  // Create a new and duplicated graph.
+  // WARN: The method only clones the graph structure, not its attributes.
+  std::shared_ptr<Graph> Clone();
+
  private:
   std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
       const ProgramDesc &program);
```
paddle/fluid/framework/ir/graph_pattern_detector.cc  (+45, -0)

```diff
@@ -1301,6 +1301,45 @@ PDNode *patterns::ConvAffineChannel::operator()(
   return ac_out_var;
 }

+PDNode *patterns::DequantQuantAny::operator()() {
+  auto *dequant_in = pattern->NewNode(dequant_in_repr())
+                         ->AsInput()
+                         ->assert_is_op_input("dequantize", "Input");
+
+  auto *dequant_op =
+      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+
+  auto *dequant_out = pattern->NewNode(dequant_out_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("dequantize", "Output");
+
+  auto *quant_op = pattern->NewNode(quant_op_repr())
+                       ->assert_is_op("quantize")
+                       ->AsIntermediate();
+
+  auto *quant_out = pattern->NewNode(quant_out_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("quantize");
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+
+  dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
+  quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
+  next_op->LinksFrom({quant_out});
+
+  return quant_out;
+}
+
+PDNode *patterns::DequantAny::operator()() {
+  auto *dequant_op =
+      pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
+
+  auto *dequant_out = pattern->NewNode(dequant_out_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("dequantize", "Output");
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+
+  dequant_op->LinksTo({dequant_out});
+  next_op->LinksFrom({dequant_out});
+
+  return dequant_out;
+}
+
 // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
 // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
 // ...
```
paddle/fluid/framework/ir/graph_pattern_detector.h  (+31, -0)

```diff
@@ -18,8 +18,11 @@
 #include <gtest/gtest_prod.h>
 #endif

 #include <memory>
 #include <numeric>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>

 #include "paddle/fluid/framework/ir/graph.h"
@@ -766,6 +769,34 @@ struct ConvAffineChannel : public PatternBase {
   PATTERN_DECL_NODE(ac_out);  // Out
 };

+// Dequantize + Quantize + anyOP
+// This pattern is used for squashing the dequantize-quantize pairs.
+struct DequantQuantAny : public PatternBase {
+  DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_quant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_in);
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(quant_op);
+  PATTERN_DECL_NODE(quant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+// Dequantize + anyOP
+// This quantize is used for getting number of ops the Dequantize's
+// output is an input to.
+struct DequantAny : public PatternBase {
+  DequantAny(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "dequant_any") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(dequant_op);
+  PATTERN_DECL_NODE(dequant_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
 struct TransposeFlattenConcat : public PatternBase {
   TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
```
paddle/fluid/framework/ir/node.h  (+1, -0)

```diff
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once

 #include <memory>
 #include <string>
 #include <typeindex>
 #include <typeinfo>
```
paddle/fluid/framework/ir/sync_batch_norm_pass.cc  (new file, +45)

```cpp
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <memory>
#include <string>
#include <utility>

namespace paddle {
namespace framework {
namespace ir {

std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Use synchronous batch norm";
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
      if (op->Type() == "batch_norm") {
        op->SetType("sync_batch_norm");
      }
      if (op->Type() == "batch_norm_grad") {
        op->SetType("sync_batch_norm_grad");
      }
    }
  }
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
```
paddle/fluid/framework/ir/sync_batch_norm_pass.h  (new file, +32)

```cpp
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <memory>

#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

class SyncBatchNormPass : public Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```
paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc  (new file, +80)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <gtest/gtest.h>

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  op->SetAttr("name", name);
  op->SetInput("X", inputs);
  op->SetOutput("Out", outputs);
}

// (a, conv_w)->conv2d->b
// (b, bn_scale, bn_bias, mean, var)->batch_norm
// ->(c, mean, var, save_mean, save_inv_var)
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;
  for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
                                           "bn_bias", "mean", "var", "c",
                                           "save_mean", "save_inv_var"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
        v == "var") {
      var->SetPersistable(true);
    }
  }

  SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
        std::vector<std::string>({"b"}));
  SetOp(&prog, "batch_norm", "bn",
        std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
        std::vector<std::string>(
            {"c", "mean", "var", "save_mean", "save_inv_var"}));
  return prog;
}

TEST(IsTestPass, basic) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
  graph = pass->Apply(std::move(graph));
  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
      auto* op = node->Op();
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "bn") {
        ASSERT_EQ(op->Type(), "sync_batch_norm");
      }
    }
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(sync_batch_norm_pass);
```
paddle/fluid/framework/operator.cc  (+7, -5)

```diff
@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
     if (Attrs().count("sub_block") != 0) {
-      throw;
+      throw std::move(exception);
     }

     auto& callstack = Attr<std::vector<std::string>>(
         OpProtoAndCheckerMaker::OpCreationCallstackAttrName());

     if (callstack.empty()) {
-      throw;
+      throw std::move(exception);
     }
     std::ostringstream sout;
     sout << "Invoke operator " << Type() << " error.\n";
@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     sout << "C++ Callstacks: \n";
     sout << exception.err_str_;
     exception.err_str_ = sout.str();
-    throw;
+    throw std::move(exception);
   } catch (...) {
     std::rethrow_exception(std::current_exception());
   }
@@ -926,8 +926,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }

-  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
-  this->InferShape(&infer_shape_ctx);
+  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+    this->InferShape(&infer_shape_ctx);
+  }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
   kernel_iter->second(
```
paddle/fluid/framework/operator.h  (+9, -0)

```diff
@@ -62,6 +62,15 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";

+/// If an Op has this attribute, all its kernels should calculate output
+/// variable's shape in the corresponding Compute() function. And
+/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
+/// function in its runtime for speedup.
+/// TODO(luotao): Note that this temporal attribute would be deleted after all
+/// ops contain it.
+constexpr char kAllKernelsMustComputeRuntimeShape[] =
+    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
```
paddle/fluid/framework/parallel_executor.cc  (+42, -11)

```diff
@@ -14,8 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>

 #include "paddle/fluid/framework/ir/graph_helper.h"
@@ -181,13 +183,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }

 ParallelExecutor::ParallelExecutor(
     const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
+    const std::vector<std::string> &bcast_vars,
     const std::string &loss_var_name, Scope *scope,
     const std::vector<Scope *> &local_scopes,
     const ExecutionStrategy &exec_strategy,
     const BuildStrategy &build_strategy, ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -250,13 +253,41 @@ ParallelExecutor::ParallelExecutor(
     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
         member_->places_, nccl_id, build_strategy.num_trainers_,
         build_strategy.trainer_id_));
+
+    std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
+    dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
+    // Initialize device context's nccl comm
+    // Note, more than one ParallelExecutor with same place, the nccl comm will
+    // be rewrite and there will be some problem.
+    for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
+      auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          pool.Get(member_->places_[dev_id]));
+      dev_ctx->set_nccl_comm(nccl_ctx.comm());
+    }
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   }
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_tariners would be grater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor hold multiple
+      // devices.
+      return true;
+    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }
   // Startup Program has been run. All local scopes has correct parameters.

   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
@@ -338,7 +369,7 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
   // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +393,7 @@ void ParallelExecutor::BCastParamsToDevices(
         auto place = member_->places_[i];
         void *buffer;

-        if (i == 0) {
+        if (i == 0 && trainer_id == 0) {
           buffer = const_cast<void *>(main_tensor.data<void>());
         } else {
           auto local_scope = member_->local_scopes_[i];
```
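The constructor now broadcasts parameters in two cases rather than one: multi-trainer NCCL jobs (num_trainers_ > 1) and a single process driving several devices. A small Python restatement of the new need_broadcast decision, for reference only:

```python
def need_broadcast(num_trainers, num_local_scopes, user_passed_scopes):
    """Mirror of the need_broadcast lambda added to ParallelExecutor."""
    if num_trainers > 1:
        # nccl distributed training: every trainer must receive parameters.
        return True
    if num_local_scopes != 1 and not user_passed_scopes:
        # one trainer process, but ParallelExecutor holds multiple devices.
        return True
    return False

assert need_broadcast(2, 1, []) is True   # distributed job
assert need_broadcast(1, 4, []) is True   # one process, four devices
assert need_broadcast(1, 1, []) is False  # nothing to broadcast
```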
paddle/fluid/framework/parallel_executor.h
浏览文件 @
74037cc1
...
...
@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
...
...
@@ -45,7 +47,7 @@ class ParallelExecutor {
public:
explicit
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
std
::
vector
<
std
::
string
>
&
bcast_vars
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
...
...
@@ -70,7 +72,10 @@ class ParallelExecutor {
                        const std::string &fetched_var_name);

 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
  // broadcast the parameters from the 0th device.
  // trainer_id is the trainer index in nccl distributed training.
  void BCastParamsToDevices(const std::vector<std::string> &vars,
                            int trainer_id = 0) const;
  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                    const ExecutionStrategy &exec_strategy,
                                    const BuildStrategy &build_strategy) const;
...
...
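Because trainer_id defaults to 0 in the new vector-based overload, existing single-trainer call sites keep working unchanged. A small sketch of that pattern, with a hypothetical stand-in for the broadcast entry point:

// Illustrative only: how a defaulted trainer_id preserves old call sites.
#include <iostream>
#include <string>
#include <vector>

void BCastParamsToDevices(const std::vector<std::string> &vars,
                          int trainer_id = 0) {
  for (const auto &v : vars) {
    std::cout << "bcast " << v << " from device 0, trainer " << trainer_id
              << "\n";
  }
}

int main() {
  std::vector<std::string> params{"fc_0.w_0", "fc_0.b_0"};
  BCastParamsToDevices(params);     // single-trainer call, trainer_id == 0
  BCastParamsToDevices(params, 2);  // distributed call with explicit id
}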
paddle/fluid/memory/allocation/CMakeLists.txt
...
...
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
...
...
paddle/fluid/memory/allocation/legacy_allocator.cc
...
...
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory>
#include <string>
#include <utility>
...
...
@@ -24,9 +22,11 @@
#endif
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
...
...
@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
}  // namespace legacy
namespace allocation {
LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
  void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
  return new Allocation(ptr, size, place_);
  auto *tmp_alloc = new Allocation(ptr, size, place_);
  platform::MemEvenRecorder::Instance().PushMemRecord(
      static_cast<void *>(tmp_alloc), place_, size);
  return tmp_alloc;
}
void LegacyAllocator::Free(Allocation *allocation) {
  boost::apply_visitor(
      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
      allocation->place());
  platform::MemEvenRecorder::Instance().PopMemRecord(
      static_cast<void *>(allocation), place_);
  delete allocation;
}
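The hunk brackets every allocation with a push/pop pair on the profiler's recorder, keyed by the Allocation pointer. A sketch of the same pairing with a mock recorder (MemEvenRecorder itself is Paddle's profiler singleton; the mock below only mirrors the call shape):

// Standalone illustration of record-on-alloc / unrecord-on-free pairing.
#include <cstdlib>
#include <iostream>

struct MockRecorder {
  static MockRecorder &Instance() {
    static MockRecorder r;
    return r;
  }
  void PushMemRecord(const void *key, std::size_t size) {
    std::cout << "alloc " << key << " size " << size << "\n";
  }
  void PopMemRecord(const void *key) { std::cout << "free " << key << "\n"; }
};

int main() {
  std::size_t size = 256;
  void *ptr = std::malloc(size);
  MockRecorder::Instance().PushMemRecord(ptr, size);  // right after allocation
  // ... use the buffer ...
  MockRecorder::Instance().PopMemRecord(ptr);  // right before release
  std::free(ptr);
}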
...
...
paddle/fluid/operators/CMakeLists.txt
...
...
@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE)
    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU)
    # warpctc_op needs cudnn 7 above
    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
    else()
...
...
@@ -58,6 +58,8 @@ if (WITH_GPU)
        op_library(conv_fusion_op)
        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
    endif()
    op_library(sync_batch_norm_op)
    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
...
...
paddle/fluid/operators/activation_op.cc
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA
...
...
@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC"
;
class
AcosOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of acos operator"
);
AddOutput
(
"Out"
,
"Output of acos operator"
);
AddComment
(
R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC"
);
}
};
class
AsinOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of asin operator"
);
AddOutput
(
"Out"
,
"Output of asin operator"
);
AddComment
(
R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC"
);
}
};
class
AtanOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of atan operator"
);
AddOutput
(
"Out"
,
"Output of atan operator"
);
AddComment
(
R"DOC(
Arctanh Activation Operator.
$$out = \tanh^{-1}(x)$$
)DOC"
);
}
};
class
LeakyReluOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
...
...
@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \
__macro(Abs, abs); \
__macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
...
...
paddle/fluid/operators/activation_op.h
...
...
@@ -39,9 +39,8 @@ namespace operators {
Please refer to the layer_helper.py and get the details.
*/
static std::unordered_set<std::string> InplaceOpSet = {
    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
};
    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
static bool IsInplace(const std::string &op) {
  bool inplace = InplaceOpSet.count(op);
...
...
@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
  }
};
template <typename T>
struct Acos {
  HOSTDEVICE T operator()(const T& val) const { return acos(val); }
};
template <>
struct Acos<platform::float16> {
  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
    return platform::float16(acos(static_cast<float>(val)));
  }
};
// Acos(x) = acos(x)
template <typename T>
struct AcosFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out>
  void operator()(Device d, X x, Out out) const {
    out.device(d) = x.unaryExpr(Acos<T>());
  }
};
// acos'(x) = -1/sqrt(1-x^2)
template <typename T>
struct AcosGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) =
        -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
  }
};
template <typename T>
struct Asin {
  HOSTDEVICE T operator()(const T& val) const { return asin(val); }
};
template <>
struct Asin<platform::float16> {
  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
    return platform::float16(asin(static_cast<float>(val)));
  }
};
// Asin(x) = asin(x)
template <typename T>
struct AsinFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out>
  void operator()(Device d, X x, Out out) const {
    out.device(d) = x.unaryExpr(Asin<T>());
  }
};
// asin'(x) = 1/sqrt(1-x^2)
template <typename T>
struct AsinGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) =
        dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
  }
};
template <typename T>
struct Atan {
  HOSTDEVICE T operator()(const T& val) const { return atan(val); }
};
template <>
struct Atan<platform::float16> {
  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
    return platform::float16(atan(static_cast<float>(val)));
  }
};
// Atan(x) = atan(x)
template <typename T>
struct AtanFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out>
  void operator()(Device d, X x, Out out) const {
    out.device(d) = x.unaryExpr(Atan<T>());
  }
};
// atan'(x) = 1 / (1 + x^2)
template <typename T>
struct AtanGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
  }
};
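The gradient formulas in the comments above can be sanity-checked numerically. A standalone check with central finite differences (plain <cmath>, no Paddle or Eigen dependencies):

// Verifies acos' = -1/sqrt(1-x^2), asin' = 1/sqrt(1-x^2), atan' = 1/(1+x^2).
#include <cmath>
#include <cstdio>

int main() {
  const double h = 1e-6, x = 0.3;
  double acos_fd = (std::acos(x + h) - std::acos(x - h)) / (2 * h);
  double asin_fd = (std::asin(x + h) - std::asin(x - h)) / (2 * h);
  double atan_fd = (std::atan(x + h) - std::atan(x - h)) / (2 * h);
  std::printf("acos': fd=%.6f closed=%.6f\n", acos_fd,
              -1.0 / std::sqrt(1.0 - x * x));
  std::printf("asin': fd=%.6f closed=%.6f\n", asin_fd,
              1.0 / std::sqrt(1.0 - x * x));
  std::printf("atan': fd=%.6f closed=%.6f\n", atan_fd, 1.0 / (1.0 + x * x));
}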
// round(x) = [x]
template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> {
...
...
@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
...
...
paddle/fluid/operators/batch_norm_op.cc
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
...
...
@@ -22,147 +24,150 @@ limitations under the License. */
namespace paddle {
namespace operators {
class BatchNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "");
    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
    PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
    PADDLE_ENFORCE(ctx->HasInput("Mean"), "");
    PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), "");
    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), "");
    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
    // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
    PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                      "Mean and MeanOut should share the same memory");
    PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0],
                      ctx->Outputs("VarianceOut")[0],
                      "Variance and VarianceOut should share the same memory");
    const auto x_dims = ctx->GetInputDim("X");
    const DataLayout data_layout = framework::StringToDataLayout(
        ctx->Attrs().Get<std::string>("data_layout"));
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "Input X must have 2 to 5 dimensions.");
    const int64_t C =
        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
    ctx->SetOutputDim("Y", x_dims);
    ctx->SetOutputDim("MeanOut", {C});
    ctx->SetOutputDim("VarianceOut", {C});
    ctx->SetOutputDim("SavedMean", {C});
    ctx->SetOutputDim("SavedVariance", {C});
    ctx->ShareLoD("X", "Y");
void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("X"),
                 "Input(X) of BatchNormOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Scale"),
                 "Input(Scale) of BatchNormOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Bias"),
                 "Input(Bias) of BatchNormOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Mean"),
                 "Input(Mean) of BatchNormOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Variance"),
                 "Input(Variance) of BatchNormOp should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Y"),
                 "Output(Y) of BatchNormOp should not be null.");
  bool is_test = ctx->Attrs().Get<bool>("is_test");
  if (!is_test) {
    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"),
                   "Output(MeanOut) of BatchNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"),
                   "Output(VarianceOut) of BatchNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"),
                   "Output(SavedMean) of BatchNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"),
                   "Output(SavedVariance) of BatchNormOp should not be null.");
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type = ctx.Input<Tensor>("X")->type();
    // By default, the type of the scale, bias, mean,
    // and var tensors should be float (for float or float16 input tensor)
    // or double (for double input tensor).
    auto bn_param_type = framework::proto::VarType::FP32;
    if (input_data_type == framework::proto::VarType::FP64) {
      bn_param_type = framework::proto::VarType::FP64;
    }
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
                      "Scale input should be of float type");
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
                      "Bias input should be of float type");
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
                      "Mean input should be of float type");
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
                      "Variance input should be of float type");
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
  // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
  PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                    "Mean and MeanOut should share the same memory");
  PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
                    "Variance and VarianceOut should share the same memory");
  const auto x_dims = ctx->GetInputDim("X");
  const DataLayout data_layout = framework::StringToDataLayout(
      ctx->Attrs().Get<std::string>("data_layout"));
  PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                 "Input X must have 2 to 5 dimensions.");
  const int64_t C =
      (data_layout == DataLayout::kNCHW ? x_dims[1]
                                        : x_dims[x_dims.size() - 1]);
  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
  ctx->SetOutputDim("Y", x_dims);
  ctx->SetOutputDim("MeanOut", {C});
  ctx->SetOutputDim("VarianceOut", {C});
  ctx->SetOutputDim("SavedMean", {C});
  ctx->SetOutputDim("SavedVariance", {C});
  ctx->ShareLoD("X", "Y");
}
framework::OpKernelType BatchNormOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  auto input_data_type = ctx.Input<Tensor>("X")->type();
  // By default, the type of the scale, bias, mean,
  // and var tensors should be float (for float or float16 input tensor)
  // or double (for double input tensor).
  auto bn_param_type = framework::proto::VarType::FP32;
  if (input_data_type == framework::proto::VarType::FP64) {
    bn_param_type = framework::proto::VarType::FP64;
  }
  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
                    "Scale input should be of float type");
  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
                    "Bias input should be of float type");
  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
                    "Mean input should be of float type");
  PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
                    "Variance input should be of float type");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::LibraryType library = framework::LibraryType::kPlain;
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
  if (library == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
  }
#endif
    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                   library);
  if (library == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
  }
};
#endif
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    AddAttr<float>("momentum", "").SetDefault(0.9);
    AddAttr<float>("epsilon", "")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                         "'epsilon' should be between 0.0 and 0.001.");
        });
    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
    AddInput("X", "The input tensor");
    AddInput("Scale",
             "Scale is a 1-dimensional tensor of size C "
             "that is applied to the output");
    AddInput("Bias",
             "Bias is a 1-dimensional tensor of size C "
             "that is applied to the output");
    AddInput("Mean",
             "The global mean (for training) or "
             "estimated mean (for testing)");
    AddInput("Variance",
             "The global variance (for training) "
             "or estimated Variance (for testing)");
    AddOutput("Y", "result after normalization");
    AddOutput("MeanOut",
              "Share memory with Mean. "
              "Store the global mean when training");
    AddOutput("VarianceOut",
              "Share memory with Variance. "
              "Store the global Variance when training");
    AddOutput("SavedMean",
              "Mean of the current mini batch, "
              "will apply to output when training")
        .AsIntermediate();
    AddOutput("SavedVariance",
              "Variance of the current mini batch, "
              "will apply to output when training")
        .AsIntermediate();
    AddAttr<bool>("use_mkldnn",
                  "(bool, default false) Only used in mkldnn kernel")
        .SetDefault(false);
    AddAttr<bool>("fuse_with_relu",
                  "(bool, default false) Only used in mkldnn kernel")
        .SetDefault(false);
    AddAttr<bool>("use_global_stats",
                  "(bool, default false) Whether to use global mean and "
                  "variance. In inference or test mode, setting "
                  "use_global_stats to true or is_test to true is equivalent. "
                  "In train mode, when setting use_global_stats True, the "
                  "global mean and variance are also used during train time, "
                  "and the BN acts as scaling and shifting.")
        .SetDefault(false);
    AddComment(R"DOC(
  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                 library);
}
void BatchNormOpMaker::Make() {
  AddAttr<bool>("is_test",
                "(bool, default false) Set to true for inference only, false "
                "for training. Some layers may run faster when this is true.")
      .SetDefault(false);
  AddAttr<float>("momentum", "").SetDefault(0.9);
  AddAttr<float>("epsilon", "")
      .SetDefault(1e-5)
      .AddCustomChecker([](const float &epsilon) {
        PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                       "'epsilon' should be between 0.0 and 0.001.");
      });
  AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
  AddInput("X", "The input tensor");
  AddInput("Scale",
           "Scale is a 1-dimensional tensor of size C "
           "that is applied to the output");
  AddInput("Bias",
           "Bias is a 1-dimensional tensor of size C "
           "that is applied to the output");
  AddInput("Mean",
           "The global mean (for training) or "
           "estimated mean (for testing)");
  AddInput("Variance",
           "The global variance (for training) "
           "or estimated Variance (for testing)");
  AddOutput("Y", "result after normalization");
  AddOutput("MeanOut",
            "Share memory with Mean. "
            "Store the global mean when training");
  AddOutput("VarianceOut",
            "Share memory with Variance. "
            "Store the global Variance when training");
  AddOutput("SavedMean",
            "Mean of the current mini batch, "
            "will apply to output when training")
      .AsIntermediate();
  AddOutput("SavedVariance",
            "Variance of the current mini batch, "
            "will apply to output when training")
      .AsIntermediate();
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<bool>("fuse_with_relu",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<bool>("use_global_stats",
                "(bool, default false) Whether to use global mean and "
                "variance. In inference or test mode, setting "
                "use_global_stats to true or is_test to true is equivalent. "
                "In train mode, when setting use_global_stats True, the "
                "global mean and variance are also used during train time, "
                "and the BN acts as scaling and shifting.")
      .SetDefault(false);
  AddComment(R"DOC(
Batch Normalization.
Batch Norm has been implemented as discussed in the paper:
...
...
@@ -173,17 +178,7 @@ The required data format for this layer is one of the following:
2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
  }
};
class BatchNormOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
  }
};
}
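Every shape check above hinges on the same channel rule: C is dims[1] under NCHW and the trailing dimension under NHWC. A tiny standalone illustration:

// Worked example of the channel-count rule used throughout these checks.
#include <cstdio>
#include <vector>

long ChannelDim(const std::vector<long> &dims, bool nchw) {
  return nchw ? dims[1] : dims[dims.size() - 1];
}

int main() {
  std::vector<long> nchw{8, 3, 32, 32};  // N, C, H, W
  std::vector<long> nhwc{8, 32, 32, 3};  // N, H, W, C
  std::printf("NCHW C=%ld, NHWC C=%ld\n", ChannelDim(nchw, true),
              ChannelDim(nhwc, false));  // both print 3
}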
template <typename T>
class BatchNormKernel<platform::CPUDeviceContext, T>
...
...
@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
}
};
class BatchNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    // check input
    PADDLE_ENFORCE(ctx->HasInput("X"));
    PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
                   "Input(SavedMean) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
                   "Input(SavedVariance) should not be null");
    // check output
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
                     "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
                     "null at same time");
    }
    const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
    if (use_global_stats) {
      PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
                     "Using global stats during training is not supported "
                     "in gradient op kernel of batch_norm_mkldnn_op now.");
    }
void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
  // check input
  PADDLE_ENFORCE(ctx->HasInput("X"));
  PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                 "Input(Y@GRAD) should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
                 "Input(SavedMean) should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
                 "Input(SavedVariance) should not be null");
  // check output
  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
                   "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
                   "null at same time");
  }
  const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
  if (use_global_stats) {
    PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
                   "Using global stats during training is not supported "
                   "in gradient op kernel of batch_norm_mkldnn_op now.");
  }
  const auto x_dims = ctx->GetInputDim("X");
  const DataLayout data_layout = framework::StringToDataLayout(
      ctx->Attrs().Get<std::string>("data_layout"));
  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
                                                  : x_dims[x_dims.size() - 1]);
  ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
  }
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }
    const Tensor *t = nullptr;
    if (var->IsType<Tensor>()) {
      t = &var->Get<Tensor>();
    } else if (var->IsType<LoDTensor>()) {
      t = &var->Get<LoDTensor>();
    }
    if (t == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }
framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
  if (var == nullptr) {
    PADDLE_THROW("can't find Y@GRAD");
  }
  const Tensor *t = nullptr;
  if (var->IsType<Tensor>()) {
    t = &var->Get<Tensor>();
  } else if (var->IsType<LoDTensor>()) {
    t = &var->Get<LoDTensor>();
  }
  if (t == nullptr) {
    PADDLE_THROW("can't find Y@GRAD");
  }
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::LibraryType library = framework::LibraryType::kPlain;
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
  if (library == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
  }
#endif
  return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                 ctx.GetPlace(), layout, library);
}
template <typename T>
class BatchNormGradKernel<platform::CPUDeviceContext, T>
...
...
@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
}
};
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto *op = new framework::OpDesc();
    op->SetType("batch_norm_grad");
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
    op->SetInput("Scale", Input("Scale"));
    op->SetInput("Bias", Input("Bias"));
    op->SetInput("SavedMean", Output("SavedMean"));
    op->SetInput("SavedVariance", Output("SavedVariance"));
    // used when setting use_global_stats True during training
    if (boost::get<bool>(GetAttr("use_global_stats"))) {
      op->SetInput("Mean", Output("MeanOut"));
      op->SetInput("Variance", Output("VarianceOut"));
    }
std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
  auto *op = new framework::OpDesc();
  op->SetType(GradOpType());
  op->SetInput("X", Input("X"));
  op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
  op->SetInput("Scale", Input("Scale"));
  op->SetInput("Bias", Input("Bias"));
  op->SetInput("SavedMean", Output("SavedMean"));
  op->SetInput("SavedVariance", Output("SavedVariance"));
  // used when setting use_global_stats True during training
  if (boost::get<bool>(GetAttr("use_global_stats"))) {
    op->SetInput("Mean", Output("MeanOut"));
    op->SetInput("Variance", Output("VarianceOut"));
  }
  op->SetAttrMap(Attrs());
  op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
  op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
  op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
  return std::unique_ptr<framework::OpDesc>(op);
}
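Switching SetType("batch_norm_grad") to SetType(GradOpType()) lets a derived maker reuse the whole Apply() body for another forward op; with sync_batch_norm added in this commit, overriding the forward-type hook is presumably all a subclass needs. A minimal sketch of that pattern (names illustrative, not the real maker classes):

// Template-method style: the grad type is derived from a virtual hook.
#include <iostream>
#include <string>

struct GradMakerBase {
  virtual ~GradMakerBase() = default;
  virtual std::string ForwardOpType() const { return "batch_norm"; }
  virtual std::string GradOpType() const { return ForwardOpType() + "_grad"; }
};

struct SyncGradMaker : GradMakerBase {
  std::string ForwardOpType() const override { return "sync_batch_norm"; }
};

int main() {
  GradMakerBase base;
  SyncGradMaker sync;
  std::cout << base.GradOpType() << "\n";  // batch_norm_grad
  std::cout << sync.GradOpType() << "\n";  // sync_batch_norm_grad
}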
class BatchNormInplaceInToOut : public framework::InplaceInToOut {
 public:
...
...
@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut {
namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker,
                  ops::BatchNormInplaceInToOut);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
                  ops::BatchNormGradInplaceInToOut);
                  ops::BatchNormOpInferVarType,
                  ops::BatchNormGradMaker)  // ops::BatchNormInplaceInToOut);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp)
// ops::BatchNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL(
    batch_norm,
    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/batch_norm_op.cu
...
...
@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
                  int *N, int *C, int *H, int *W, int *D) {
  *N = dims[0];
  if (dims.size() == 2) {
    *C = dims[1];
    *H = 1;
    *W = 1;
    *D = 1;
  } else {
    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
    *W = dims.size() > 3
             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
             : 1;
    *D = dims.size() > 4
             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
             : 1;
  }
}
template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
...
...
@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
};
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
                                        const BatchNormParamType<T> *scale,
                                        const BatchNormParamType<T> *variance,
                                        const double epsilon, const int C,
                                        const int HxW, const int num, T *dx) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < num; i += stride) {
    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
                           scale[c] * inv_var);
  }
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(
    const T *dy, const T *x, const BatchNormParamType<T> *mean,
...
...
@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias(
  }
}
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
                                        const BatchNormParamType<T> *scale,
                                        const BatchNormParamType<T> *variance,
                                        const double epsilon, const int C,
                                        const int HxW, const int num, T *dx) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < num; i += stride) {
    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
                           scale[c] * inv_var);
  }
}
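KeBNBackwardData iterates with a grid-stride loop: each thread starts at its global id and advances by the total thread count, so any launch configuration covers all num elements exactly once. A CPU sketch of the same indexing:

// Simulates blockDim.x * gridDim.x threads; every index is hit once.
#include <cstdio>
#include <vector>

int main() {
  const int num = 10, threads_per_block = 4, blocks = 2;
  const int total_threads = threads_per_block * blocks;  // the stride
  std::vector<int> hits(num, 0);
  for (int gid = 0; gid < total_threads; ++gid) {  // one pass per "thread"
    for (int i = gid; i < num; i += total_threads) {
      ++hits[i];
    }
  }
  for (int h : hits) std::printf("%d ", h);  // prints ten 1s
  std::printf("\n");
}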
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
...
...
@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
      KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
          grid2, block, 0, dev_ctx.stream()>>>(
          d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
          epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
          epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
          d_bias->data<BatchNormParamType<T>>());
    }
  } else {
...
...
@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
            running_var_data, epsilon, C, H * W, num, d_x->data<T>());
      }
      if (d_scale && d_bias) {
        KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
        KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
            grid2, block, 0, dev_ctx.stream()>>>(
            d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
            epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
            epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
            d_bias->data<BatchNormParamType<T>>());
      }
    }
...
...
paddle/fluid/operators/batch_norm_op.h
...
...
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -35,17 +38,84 @@ template <typename T>
using ConstEigenVectorArrayMap =
    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};
class BatchNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override;
  virtual std::string GradOpType() const {
    return this->ForwardOpType() + "_grad";
  }
};
class BatchNormOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
  }
};
template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class BatchNormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};
inline void ExtractNCWHD(const framework::DDim &dims,
                         const DataLayout &data_layout, int *N, int *C, int *H,
                         int *W, int *D) {
  *N = dims[0];
  if (dims.size() == 2) {
    *C = dims[1];
    *H = 1;
    *W = 1;
    *D = 1;
  } else {
    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
    *W = dims.size() > 3
             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
             : 1;
    *D = dims.size() > 4
             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
             : 1;
  }
}
}  // namespace operators
}  // namespace paddle
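Two fallbacks in ExtractNCWHD are easy to miss: a 2-D input degenerates to H = W = D = 1, and under NHWC the channel comes from the trailing axis. A standalone walk-through of the same logic with plain vectors:

// Mirrors ExtractNCWHD above, without the framework::DDim dependency.
#include <cstdio>
#include <vector>

void Extract(const std::vector<int> &dims, bool nchw, int *N, int *C, int *H,
             int *W, int *D) {
  *N = dims[0];
  if (dims.size() == 2) {
    *C = dims[1];
    *H = *W = *D = 1;
  } else {
    *C = nchw ? dims[1] : dims[dims.size() - 1];
    *H = nchw ? dims[2] : dims[1];
    *W = dims.size() > 3 ? (nchw ? dims[3] : dims[2]) : 1;
    *D = dims.size() > 4 ? (nchw ? dims[4] : dims[3]) : 1;
  }
}

int main() {
  int N, C, H, W, D;
  Extract({8, 3, 32, 32}, true, &N, &C, &H, &W, &D);   // NCHW
  std::printf("NCHW: N=%d C=%d H=%d W=%d D=%d\n", N, C, H, W, D);
  Extract({8, 32, 32, 3}, false, &N, &C, &H, &W, &D);  // NHWC
  std::printf("NHWC: N=%d C=%d H=%d W=%d D=%d\n", N, C, H, W, D);
}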
paddle/fluid/operators/cross_entropy_op.cc
...
...
@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include <memory>
#include <string>
#include <unordered_map>
namespace paddle {
namespace operators {
class CrossEntropyOp : public framework::OperatorWithKernel {
class CrossEntropyOpBase : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
    auto x_dims = ctx->GetInputDim("X");
...
...
@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension."
);
}
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"soft_label"
))
{
if
(
IsSoftLabel
(
ctx
))
{
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
x_dims
[
rank
-
1
],
label_dims
[
rank
-
1
],
"If Attr(soft_label) == true, the last dimension of "
...
...
@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.device_context());
  }
  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
    return ctx->Attrs().Get<bool>("soft_label");
  }
};
class CrossEntropyGradientOp : public framework::OperatorWithKernel {
class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
  void InferShape(framework::InferShapeContext* ctx) const {
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should be not null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_dims = GetXDim(ctx);
    auto label_dims = ctx->GetInputDim("Label");
    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
    int rank = x_dims.size();
...
...
@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension."
);
}
PADDLE_ENFORCE_EQ
(
dy_dims
[
rank
-
1
],
1
,
"The last dimension of Input(Y@Grad) should be 1."
);
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"soft_label"
))
{
if
(
IsSoftLabel
(
ctx
))
{
if
(
check
)
{
PADDLE_ENFORCE_EQ
(
x_dims
[
rank
-
1
],
label_dims
[
rank
-
1
],
...
...
@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"Input(Label) should be 1."
);
}
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
ctx
->
ShareLoD
(
"X"
,
framework
::
GradVarName
(
"X"
));
PADDLE_ENFORCE_EQ
(
dy_dims
[
rank
-
1
],
1
,
"The last dimension of Input(Y@Grad) should be 1."
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
ctx
->
ShareLoD
(
VarNameWithXLoD
(),
framework
::
GradVarName
(
"X"
));
}
protected:
...
...
@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  // is determined by its input "X".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.device_context());
    return framework::OpKernelType(
        ctx.Input<Tensor>(framework::GradVarName("Y"))->type(),
        ctx.device_context());
  }
  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
    return ctx->GetInputDim("X");
  }
  virtual const char* VarNameWithXLoD() const { return "X"; }
  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
    return ctx->Attrs().Get<bool>("soft_label");
  }
};
class CrossEntropyOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
  }
};
...
...
@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X.
}
};
class CrossEntropyOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
 public:
  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    CrossEntropyGradientOpBase::InferShape(ctx);
  }
};
class CrossEntropyOp2 : public CrossEntropyOpBase {
 public:
  using CrossEntropyOpBase::CrossEntropyOpBase;
  void InferShape(framework::InferShapeContext* ctx) const override {
    CrossEntropyOpBase::InferShape(ctx);
    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
                   "Output(XShape) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("MatchX"),
                   "Output(MatchX) should be not null.");
    auto x_dims = ctx->GetInputDim("X");
    auto x_dims_vec = framework::vectorize(x_dims);
    x_dims_vec.push_back(0);
    ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec));
    x_dims[x_dims.size() - 1] = 1;
    ctx->SetOutputDim("MatchX", x_dims);
    ctx->ShareLoD("X", /*->*/ "XShape");
  }

 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
  bool IsSoftLabel(framework::InferShapeContext* ctx) const override {
    return false;
  }
};
class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
 public:
  using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist");
    CrossEntropyGradientOpBase::InferShape(ctx);
  }

 protected:
  virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
    auto x_shape = ctx->GetInputDim("XShape");
    return framework::DDim(x_shape.Get(), x_shape.size() - 1);
  }
  virtual const char* VarNameWithXLoD() const { return "XShape"; }
  virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
    return false;
  }
};
class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, default Tensor<float>), a tensor whose last dimension "
             "size is equal to the number of classes. This input is a "
             "probability computed by the previous operator, which is almost "
             "always the result of a softmax operator.");
    AddInput("Label",
             "(Tensor), the tensor which represents the ground truth. It has "
             "the same shape with 'X' except the last dimension. One hot "
             "Tensor.");
    AddOutput("Y",
              "(Tensor, default Tensor<float>), a tensor whose shape is same "
              "with 'X' except that the last dimension size is 1. It "
              "represents the cross entropy loss.");
    AddOutput("XShape", "Temporary variable to save shape and LoD of X.");
    AddOutput("MatchX",
              "X value that matches label, used for gradient computation.");
    AddAttr<int>("ignore_index",
                 "(int, default -100), Specifies a target value that is "
                 "ignored and does not contribute to the input gradient. "
                 "Only valid if soft_label is set to False")
        .SetDefault(-100);
    AddComment(R"DOC(
Hard-label CrossEntropy Operator.
The input 'X' and 'Label' will first be logically flattened to 2-D matrices.
The matrix's second dimension (row length) is the same as the original last
dimension, and the first dimension (column length) is the product of all the
other original dimensions. The softmax computation then takes place on each
row of the flattened matrices.
Only hard labels are supported.
Both the input X and Label can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input X.
)DOC");
  }
};
class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("cross_entropy_grad2");
    op->SetInput("Label", Input("Label"));
    op->SetInput("MatchX", Output("MatchX"));
    op->SetInput("XShape", Output("XShape"));
    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;
  }
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
                  ops::CrossEntropyOpInferVarType,
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
                  ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
...
...
@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
                       ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
                       ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2,
                  ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType,
                  ops::CrossEntropyGradOpDescMaker2);
REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2);
REGISTER_OP_CPU_KERNEL(cross_entropy2,
                       ops::CrossEntropyOpKernel2<CPUCtx, float>,
                       ops::CrossEntropyOpKernel2<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad2,
                       ops::CrossEntropyGradientOpKernel2<CPUCtx, float>,
                       ops::CrossEntropyGradientOpKernel2<CPUCtx, double>);
paddle/fluid/operators/cross_entropy_op.cu
...
...
@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL(
    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(cross_entropy2,
                        ops::CrossEntropyOpKernel2<CUDACtx, float>,
                        ops::CrossEntropyOpKernel2<CUDACtx, double>,
                        ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
    ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
    ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
paddle/fluid/operators/cross_entropy_op.h
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
...
...
@@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
}
};
template <typename T>
struct HardLabelCrossEntropyForwardFunctor {
  HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x,
                                      const int64_t* label,
                                      int64_t ignore_index,
                                      int64_t feature_size)
      : x_(x),
        y_(y),
        match_x_(match_x),
        label_(label),
        ignore_index_(ignore_index),
        feature_size_(feature_size) {}
  HOSTDEVICE void operator()(int64_t idx) const {
    auto label = label_[idx];
    if (label != ignore_index_) {
      auto match_x = x_[idx * feature_size_ + label];
      y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
      match_x_[idx] = match_x;
    } else {
      y_[idx] = 0;
      match_x_[idx] = 0;  // any value is ok
    }
  }
  const T* x_;
  T* y_;
  T* match_x_;
  const int64_t* label_;
  int64_t ignore_index_;
  int64_t feature_size_;
};
template <typename T>
struct HardLabelCrossEntropyBackwardFunctor {
  HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x,
                                       const int64_t* label,
                                       int64_t ignore_index,
                                       int64_t feature_size)
      : dx_(dx),
        dy_(dy),
        match_x_(match_x),
        label_(label),
        ignore_index_(ignore_index),
        feature_size_(feature_size) {}
  HOSTDEVICE void operator()(int64_t idx) const {
    auto row_idx = idx / feature_size_;
    auto col_idx = idx % feature_size_;
    auto label = label_[row_idx];
    if (label == col_idx && label != ignore_index_) {
      dx_[idx] = -dy_[row_idx] / match_x_[row_idx];
    } else {
      dx_[idx] = 0;
    }
  }
  T* dx_;
  const T* dy_;
  const T* match_x_;
  const int64_t* label_;
  int64_t ignore_index_;
  int64_t feature_size_;
};
template <typename DeviceContext, typename T>
class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* label = ctx.Input<Tensor>("Label");
    auto* y = ctx.Output<Tensor>("Y");
    auto* match_x = ctx.Output<Tensor>("MatchX");
    auto& x_dims = x->dims();
    auto feature_size = x_dims[x_dims.size() - 1];
    auto batch_size = framework::product(x->dims()) / feature_size;
    auto* p_x = x->data<T>();
    auto* p_label = label->data<int64_t>();
    auto* p_y = y->mutable_data<T>(ctx.GetPlace());
    auto* p_match_x = match_x->mutable_data<T>(ctx.GetPlace());
    auto ignore_index = ctx.Attr<int>("ignore_index");
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(), batch_size);
    for_range(HardLabelCrossEntropyForwardFunctor<T>(
        p_x, p_y, p_match_x, p_label, ignore_index, feature_size));
  }
};
template <typename DeviceContext, typename T>
class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* match_x = ctx.Input<Tensor>("MatchX");
    auto* label = ctx.Input<Tensor>("Label");
    auto* p_dx = dx->mutable_data<T>(ctx.GetPlace());
    auto* p_dy = dy->data<T>();
    auto* p_match_x = match_x->data<T>();
    auto* p_label = label->data<int64_t>();
    int64_t ignore_index = ctx.Attr<int>("ignore_index");
    int rank = dx->dims().size();
    int64_t feature_size = dx->dims()[rank - 1];
    int64_t batch_size = framework::product(dx->dims()) / feature_size;
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(),
        batch_size * feature_size);
    for_range(HardLabelCrossEntropyBackwardFunctor<T>(
        p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size));
  }
};
}  // namespace operators
}  // namespace paddle
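The two functors implement hard-label cross entropy as y = -log(x[label]) with match_x cached for the backward pass, which places -dy/match_x at the label column and zero elsewhere. A standalone numeric check of one row:

// Plain-C++ reproduction of the functor math; no Paddle dependencies.
#include <cmath>
#include <cstdio>

int main() {
  const double x[4] = {0.1, 0.2, 0.6, 0.1};  // one softmax row
  const long label = 2, ignore_index = -100;
  double y = 0, match_x = 0;
  if (label != ignore_index) {
    match_x = x[label];       // cached for the backward pass
    y = -std::log(match_x);   // forward loss
  }
  std::printf("y = %.6f (expect -log(0.6) = 0.510826)\n", y);
  const double dy = 1.0;
  double dx[4];
  for (long col = 0; col < 4; ++col) {
    dx[col] = (col == label) ? -dy / match_x : 0.0;  // backward rule
  }
  std::printf("dx = [%.3f, %.3f, %.6f, %.3f]\n", dx[0], dx[1], dx[2], dx[3]);
}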
paddle/fluid/operators/detection/box_coder_op.h
...
...
@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
  if (type == "encode_center_size") {
    return BoxCodeType::kEncodeCenterSize;
  } else if (type == "decode_center_size") {
...
...
@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
 public:
  void EncodeCenterSize(const framework::Tensor* target_box,
                        const framework::Tensor* prior_box,
                        const framework::Tensor* prior_box_var,
                        const bool normalized,
                        const std::vector<float> variance, T* output) const {
    int64_t row = target_box->dims()[0];
    int64_t col = prior_box->dims()[0];
    int64_t len = prior_box->dims()[1];
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    const T* prior_box_var_data = nullptr;
    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
        size_t offset = i * col * len + j * len;
        T prior_box_width = prior_box_data[j * len + 2] -
                            prior_box_data[j * len] + (normalized == false);
        T prior_box_height = prior_box_data[j * len + 3] -
...
...
@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
            target_box_data[i * len + 1] + (normalized == false);
        size_t offset = i * col * len + j * len;
        output[offset] = (target_box_center_x - prior_box_center_x) /
                         prior_box_width;
        output[offset + 1] =
...
...
@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
            std::log(std::fabs(target_box_width / prior_box_width));
        output[offset + 3] =
            std::log(std::fabs(target_box_height / prior_box_height));
        if (prior_box_var) {
          int prior_var_offset = j * len;
          output[offset] /= prior_box_var_data[prior_var_offset];
          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
          output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
        } else if (!(variance.empty())) {
      }
    }
    if (prior_box_var) {
      const T* prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
      for (int64_t i = 0; i < row; ++i) {
        for (int64_t j = 0; j < col; ++j) {
          for (int k = 0; k < 4; ++k) {
            size_t offset = i * col * len + j * len;
            int prior_var_offset = j * len;
            output[offset + k] /= prior_box_var_data[prior_var_offset + k];
          }
        }
      }
    } else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
      for (int64_t i = 0; i < row; ++i) {
        for (int64_t j = 0; j < col; ++j) {
          for (int k = 0; k < 4; ++k) {
            size_t offset = i * col * len + j * len;
            output[offset + k] /= static_cast<T>(variance[k]);
          }
        }
      }
    }
  }
template
<
int
axis
,
int
var_size
>
void
DecodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
void
DecodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
const
bool
normalized
,
std
::
vector
<
float
>
variance
,
T
*
output
)
const
{
T
*
output
)
const
{
int64_t
row
=
target_box
->
dims
()[
0
];
int64_t
col
=
target_box
->
dims
()[
1
];
int64_t
len
=
target_box
->
dims
()[
2
];
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
const
T
*
prior_box_var_data
=
nullptr
;
if
(
var_size
==
2
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
int
prior_box_offset
=
0
;
T
var_data
[
4
]
=
{
1.
,
1.
,
1.
,
1.
};
T
*
var_ptr
=
var_data
;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
T
var_data
[
4
]
=
{
1.
,
1.
,
1.
,
1.
};
T
*
var_ptr
=
var_data
;
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
prior_box_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
int
prior_box_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
T
prior_box_width
=
prior_box_data
[
prior_box_offset
+
2
]
-
prior_box_data
[
prior_box_offset
]
+
(
normalized
==
false
);
...
...
@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T
target_box_width
=
0
,
target_box_height
=
0
;
int
prior_var_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
if
(
var_size
==
2
)
{
std
::
memcpy
(
var_ptr
,
prior_box_var
_data
+
prior_var_offset
,
std
::
memcpy
(
var_ptr
,
prior_box_var
->
data
<
T
>
()
+
prior_var_offset
,
4
*
sizeof
(
T
));
}
else
if
(
var_size
==
1
)
{
var_ptr
=
reinterpret_cast
<
T
*>
(
variance
.
data
());
var_ptr
=
reinterpret_cast
<
T
*>
(
variance
.
data
());
}
T
box_var_x
=
*
var_ptr
;
T
box_var_y
=
*
(
var_ptr
+
1
);
...
...
@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
prior_box
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBox"
);
auto
*
prior_box_var
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBoxVar"
);
auto
*
target_box
=
context
.
Input
<
framework
::
LoDTensor
>
(
"TargetBox"
);
auto
*
output_box
=
context
.
Output
<
framework
::
Tensor
>
(
"OutputBox"
);
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
prior_box
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBox"
);
auto
*
prior_box_var
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBoxVar"
);
auto
*
target_box
=
context
.
Input
<
framework
::
LoDTensor
>
(
"TargetBox"
);
auto
*
output_box
=
context
.
Output
<
framework
::
Tensor
>
(
"OutputBox"
);
std
::
vector
<
float
>
variance
=
context
.
Attr
<
std
::
vector
<
float
>>
(
"variance"
);
const
int
axis
=
context
.
Attr
<
int
>
(
"axis"
);
if
(
target_box
->
lod
().
size
())
{
...
...
@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box
->
mutable_data
<
T
>
({
row
,
col
,
len
},
context
.
GetPlace
());
T
*
output
=
output_box
->
data
<
T
>
();
T
*
output
=
output_box
->
data
<
T
>
();
if
(
code_type
==
BoxCodeType
::
kEncodeCenterSize
)
{
EncodeCenterSize
(
target_box
,
prior_box
,
prior_box_var
,
normalized
,
variance
,
output
);
...
...
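To make the encode arithmetic above concrete, here is a standalone sketch of the center-size encoding for a single (target, prior) pair, mirroring the formulas in the loop body. It assumes normalized == true and no variance division; the box values are invented for illustration.

#include <cmath>
#include <cstdio>

// Boxes in [xmin, ymin, xmax, ymax] form, already normalized.
int main() {
  float prior[4] = {0.1f, 0.1f, 0.3f, 0.3f};
  float target[4] = {0.15f, 0.15f, 0.35f, 0.35f};

  float pw = prior[2] - prior[0];               // prior width:  0.2
  float ph = prior[3] - prior[1];               // prior height: 0.2
  float pcx = prior[0] + pw / 2;                // prior center x: 0.2
  float pcy = prior[1] + ph / 2;                // prior center y: 0.2

  float tw = target[2] - target[0];             // target width:  0.2
  float th = target[3] - target[1];             // target height: 0.2
  float tcx = (target[0] + target[2]) / 2;      // target center x: 0.25
  float tcy = (target[1] + target[3]) / 2;      // target center y: 0.25

  float out[4] = {(tcx - pcx) / pw,             // 0.25
                  (tcy - pcy) / ph,             // 0.25
                  std::log(std::fabs(tw / pw)),    // 0 (same width)
                  std::log(std::fabs(th / ph))};   // 0 (same height)
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}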
paddle/fluid/operators/detection/yolov3_loss_op.cc

@@ -10,6 +10,7 @@
 limitations under the License. */

 #include "paddle/fluid/operators/detection/yolov3_loss_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
...
@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(class_num, 0,
                       "Attr(class_num) should be an integer greater than 0.");

+    if (ctx->HasInput("GTScore")) {
+      auto dim_gtscore = ctx->GetInputDim("GTScore");
+      PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
+                        "Input(GTScore) should be a 2-D tensor");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[0], dim_gtbox[0],
+          "Input(GTBox) and Input(GTScore) dim[0] should be same");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[1], dim_gtbox[1],
+          "Input(GTBox) and Input(GTScore) dim[1] should be same");
+    }
+
     std::vector<int64_t> dim_out({dim_x[0]});
     ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
...
@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
              "This is a 2-D tensor with shape of [N, max_box_num], "
              "and each element should be an integer to indicate the "
              "box class id.");
+    AddInput("GTScore",
+             "The score of GTLabel. This is a 2-D tensor with the same shape "
+             "as GTLabel, and score values should be in range (0, 1). This "
+             "input is provided because the GTLabel score may not be 1.0 "
+             "under image mixup augmentation.")
+        .AsDispensable();
     AddOutput("Loss",
               "The output yolov3 loss tensor, "
               "This is a 1-D tensor with shape of [N]");
...
@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("ignore_thresh",
                    "The ignore threshold to ignore confidence loss.")
         .SetDefault(0.7);
+    AddAttr<bool>("use_label_smooth",
+                  "Whether to use label smooth. Default True.")
+        .SetDefault(true);
     AddComment(R"DOC(
         This operator generates yolov3 loss based on given predict result and ground
         truth boxes.
...
@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
         loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class}
         $$

+        While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
+        target will be smoothed when calculating classification loss: the target of
+        positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and the target of
+        negative samples will be smoothed to :math:`1.0 / class\_num`.
+
+        While :attr:`GTScore` is given, which means the mixup score of ground truth
+        boxes, all losses incurred by a ground truth box will be multiplied by its
+        mixup score.
         )DOC");
   }
 };
...
@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("X", Input("X"));
     op->SetInput("GTBox", Input("GTBox"));
     op->SetInput("GTLabel", Input("GTLabel"));
+    op->SetInput("GTScore", Input("GTScore"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
     op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
     op->SetInput("GTMatchMask", Output("GTMatchMask"));
...
@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     op->SetOutput(framework::GradVarName("GTBox"), {});
     op->SetOutput(framework::GradVarName("GTLabel"), {});
+    op->SetOutput(framework::GradVarName("GTScore"), {});
     return std::unique_ptr<framework::OpDesc>(op);
   }
 };
...
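A quick standalone check of the smoothed classification targets described in the DOC block above, for a hypothetical class_num of 80 (COCO-sized):

#include <cstdio>

int main() {
  int class_num = 80;
  // Positive target: 1.0 - 1.0 / class_num; negative target: 1.0 / class_num.
  float label_pos = 1.0f - 1.0f / static_cast<float>(class_num);  // 0.9875
  float label_neg = 1.0f / static_cast<float>(class_num);         // 0.0125
  std::printf("pos target: %g, neg target: %g\n", label_pos, label_neg);
  return 0;
}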
paddle/fluid/operators/detection/yolov3_loss_op.h

@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) {
 }

 template <typename T>
-static T L2Loss(T x, T y) {
-  return 0.5 * (y - x) * (y - x);
+static T L1Loss(T x, T y) {
+  return std::abs(y - x);
 }

 template <typename T>
...
@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) {
 }

 template <typename T>
-static T L2LossGrad(T x, T y) {
-  return x - y;
+static T L1LossGrad(T x, T y) {
+  return x > y ? 1.0 : -1.0;
 }

 static int GetMaskIndex(std::vector<int> mask, int val) {
...
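The switch from L2 to L1 above changes both the loss value and the gradient shape; a minimal standalone comparison, assuming the same scalar signatures as the helpers in this header (the L2 variants reproduce the removed code for contrast):

#include <cmath>
#include <cstdio>

template <typename T> static T L1Loss(T x, T y) { return std::abs(y - x); }
// Subgradient of |y - x| with respect to x: +1 when x > y, else -1.
template <typename T> static T L1LossGrad(T x, T y) { return x > y ? T(1) : T(-1); }
template <typename T> static T L2Loss(T x, T y) { return T(0.5) * (y - x) * (y - x); }
template <typename T> static T L2LossGrad(T x, T y) { return x - y; }

int main() {
  float x = 2.0f, y = 0.5f;
  // L2: loss = 1.125, grad = 1.5 (grows with the error)
  // L1: loss = 1.5,   grad = 1.0 (bounded, more robust to outliers)
  std::printf("L2: loss %g, grad %g\n", L2Loss(x, y), L2LossGrad(x, y));
  std::printf("L1: loss %g, grad %g\n", L1Loss(x, y), L1LossGrad(x, y));
  return 0;
}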
@@ -121,47 +121,49 @@ template <typename T>
 static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
                                 std::vector<int> anchors, int an_idx,
                                 int box_idx, int gi, int gj, int grid_size,
-                                int input_size, int stride) {
+                                int input_size, int stride, T score) {
   T tx = gt.x * grid_size - gi;
   T ty = gt.y * grid_size - gj;
   T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
   T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);

-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
   loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
   loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
 }

 template <typename T>
 static void CalcBoxLocationLossGrad(T* input_grad, const T loss,
                                     const T* input, Box<T> gt,
                                     std::vector<int> anchors, int an_idx,
                                     int box_idx, int gi, int gj,
-                                    int grid_size, int input_size, int stride) {
+                                    int grid_size, int input_size, int stride,
+                                    T score) {
   T tx = gt.x * grid_size - gi;
   T ty = gt.y * grid_size - gj;
   T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
   T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);

-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
   input_grad[box_idx] =
       SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
   input_grad[box_idx + stride] =
       SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
   input_grad[box_idx + 2 * stride] =
-      L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
   input_grad[box_idx + 3 * stride] =
-      L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
 }

 template <typename T>
 static inline void CalcLabelLoss(T* loss, const T* input, const int index,
                                  const int label, const int class_num,
-                                 const int stride) {
+                                 const int stride, const T pos, const T neg,
+                                 T score) {
   for (int i = 0; i < class_num; i++) {
     T pred = input[index + i * stride];
-    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
+    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
   }
 }
...
@@ -169,11 +171,13 @@ template <typename T>
 static inline void CalcLabelLossGrad(T* input_grad, const T loss,
                                      const T* input, const int index,
                                      const int label, const int class_num,
-                                     const int stride) {
+                                     const int stride, const T pos,
+                                     const T neg, T score) {
   for (int i = 0; i < class_num; i++) {
     T pred = input[index + i * stride];
     input_grad[index + i * stride] =
-        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
+        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
+        loss;
   }
 }
...
@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
       for (int l = 0; l < w; l++) {
         T obj = objness[k * w + l];
         if (obj > 1e-5) {
-          // positive sample: obj = 1
-          loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
+          // positive sample: obj = mixup score
+          loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
         } else if (obj > -0.5) {
           // negative sample: obj = 0
           loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
...
@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
         T obj = objness[k * w + l];
         if (obj > 1e-5) {
           input_grad[k * w + l] =
-              SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
+              SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj * loss[i];
         } else if (obj > -0.5) {
           input_grad[k * w + l] =
               SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
...
@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* gt_box = ctx.Input<Tensor>("GTBox");
     auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
     auto* loss = ctx.Output<Tensor>("Loss");
     auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
     auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
...
@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     int class_num = ctx.Attr<int>("class_num");
     float ignore_thresh = ctx.Attr<float>("ignore_thresh");
     int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");

     const int n = input->dims()[0];
     const int h = input->dims()[2];
...
@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     const int stride = h * w;
     const int an_stride = (class_num + 5) * stride;

+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
+
     const T* input_data = input->data<T>();
     const T* gt_box_data = gt_box->data<T>();
     const int* gt_label_data = gt_label->data<int>();
...
@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
     int* gt_match_mask_data =
         gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());

+    const T* gt_score_data;
+    if (!gt_score) {
+      Tensor gtscore;
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
+
     // calc valid gt box mask, avoid calc duplicately in following code
     Tensor gt_valid_mask;
     bool* gt_valid_mask_data =
...
@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
           int mask_idx = GetMaskIndex(anchor_mask, best_n);
           gt_match_mask_data[i * b + t] = mask_idx;
           if (mask_idx >= 0) {
+            T score = gt_score_data[i * b + t];
             int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                         an_stride, stride, 0);
             CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors,
                                    best_n, box_idx, gi, gj, h, input_size,
-                                   stride);
+                                   stride, score);

             int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
-            obj_mask_data[obj_idx] = 1.0;
+            obj_mask_data[obj_idx] = score;

             int label = gt_label_data[i * b + t];
             int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                           an_stride, stride, 5);
             CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
-                             class_num, stride);
+                             class_num, stride, label_pos, label_neg, score);
           }
         }
       }
...
@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* gt_box = ctx.Input<Tensor>("GTBox");
     auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
...
@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
     int class_num = ctx.Attr<int>("class_num");
     int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");

     const int n = input_grad->dims()[0];
     const int c = input_grad->dims()[1];
...
@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     const int stride = h * w;
     const int an_stride = (class_num + 5) * stride;

+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
+
     const T* input_data = input->data<T>();
     const T* gt_box_data = gt_box->data<T>();
     const int* gt_label_data = gt_label->data<int>();
...
@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
     input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
     memset(input_grad_data, 0, input_grad->numel() * sizeof(T));

+    const T* gt_score_data;
+    if (!gt_score) {
+      Tensor gtscore;
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
+
     for (int i = 0; i < n; i++) {
       for (int t = 0; t < b; t++) {
         int mask_idx = gt_match_mask_data[i * b + t];
         if (mask_idx >= 0) {
+          T score = gt_score_data[i * b + t];
           Box<T> gt = GetGtBox(gt_box_data, i, b, t);
           int gi = static_cast<int>(gt.x * w);
           int gj = static_cast<int>(gt.y * h);

           int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                       an_stride, stride, 0);
-          CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
-                                     input_data, gt, anchors,
-                                     anchor_mask[mask_idx], box_idx, gi, gj, h,
-                                     input_size, stride);
+          CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
+                                     input_data, gt, anchors,
+                                     anchor_mask[mask_idx], box_idx, gi, gj, h,
+                                     input_size, stride, score);

           int label = gt_label_data[i * b + t];
           int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                         an_stride, stride, 5);
           CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
-                               label_idx, label, class_num, stride);
+                               label_idx, label, class_num, stride, label_pos,
+                               label_neg, score);
         }
       }
     }
...
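Taken together, the changes above weight every per-box loss term by the ground-truth mixup score: the objectness mask now stores the score instead of a hard 1.0, and the confidence loss for a positive cell is scaled by it. A toy illustration of that weighting, reusing the numerically stable sigmoid cross-entropy form (assumed here; the header's own SigmoidCrossEntropy body is defined earlier in the file):

#include <cmath>
#include <cstdio>

// x is a logit, label in [0, 1]; stable form of -label*log(s) - (1-label)*log(1-s).
template <typename T>
static T SigmoidCrossEntropy(T x, T label) {
  return (x > 0 ? x : T(0)) - x * label +
         std::log(T(1) + std::exp(-std::abs(x)));
}

int main() {
  float logit = 0.8f;
  float full = SigmoidCrossEntropy(logit, 1.0f);           // obj = 1.0 (no mixup)
  float mixed = SigmoidCrossEntropy(logit, 1.0f) * 0.6f;   // obj = mixup score 0.6
  std::printf("unweighted: %g, score-weighted: %g\n", full, mixed);
  return 0;
}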
paddle/fluid/operators/expand_op.cc

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/expand_op.h"
+#include <memory>
 #include <vector>

 namespace paddle {
...
@@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel {
   }
 };

+class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("expand_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ExpandGradOpDescMaker);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
...
paddle/fluid/operators/fake_dequantize_op.cc

@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/fake_dequantize_op.h"
 #include <string>
+#include <vector>

 namespace paddle {
 namespace operators {
...
@@ -76,6 +77,63 @@ $$Out = \frac{scale*X}{max\_range}$$
   }
 };

+class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInputs("Scales"),
+                   "Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
+
+    ctx->ShareDim("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeChannelWiseDequantizeMaxAbsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddInput("Scales",
+             "(Tensors) The scales in quantization stage. "
+             "Now, `Scales` is a vector with at most two tensors. "
+             "If Scales has two elements, the second tensor should only have "
+             "one value.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<std::vector<int>>(
+        "quant_bits",
+        "Quantization bit numbers in quantization stage. "
+        "The size of `quant_bits` should be equal to the size of `Scales`.")
+        .SetDefault({8});
+
+    AddComment(R"DOC(
+FakeChannelWiseDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
+
+$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
+
+In the above formula, the range of $c$ is $0 \leq c <$ the channel number of $X$.
+
+Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula.
+
+Notes: In general, per-channel quantization is only applied to weights, while activations use per-layer quantization.
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
...
@@ -88,3 +146,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
 REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
                        ops::FakeDequantizeMaxAbsKernel<CPU, float>,
                        ops::FakeDequantizeMaxAbsKernel<CPU, double>);
+
+REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
+                  ops::FakeChannelWiseDequantizeMaxAbsOp,
+                  ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
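A worked instance of the dequantize formula in the DOC block above, assuming a single quantization stage with quant_bits = {8} (so the divisor is 2^7 - 1 = 127) and an invented per-channel scale:

#include <cmath>
#include <cstdio>

int main() {
  int quant_bits = 8;
  float max_range = std::pow(2.0f, quant_bits - 1) - 1;  // 127
  float scale_c = 0.5f;  // abs-max of this channel at quantization time
  float x_c = 64.0f;     // a quantized value in channel c
  // Out_c = X_c * Scale_c / (2^(quant_bits-1) - 1)
  float out_c = x_c * scale_c / max_range;  // ~0.252, close to the original 0.25
  std::printf("dequantized: %g\n", out_c);
  return 0;
}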
paddle/fluid/operators/fake_dequantize_op.cu

@@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
                         ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
                         ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_channel_wise_dequantize_max_abs,
+    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
+    ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
paddle/fluid/operators/fake_dequantize_op.h

@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
...
@@ -45,5 +46,42 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
   }
 };

+template <typename DeviceContext, typename T>
+class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto scales = ctx.MultiInput<framework::Tensor>("Scales");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
+                      "The number of first scale values must be the same with "
+                      "first dimension value of Input(X).");
+
+    auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
+    int max_range = std::pow(2, quant_bits[0] - 1) - 1;
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    out->mutable_data<T>(dev_ctx.GetPlace());
+
+    auto dequant = DequantizeFunctor<DeviceContext, T>();
+    for (int64_t i = 0; i < in->dims()[0]; i++) {
+      framework::Tensor one_channel_in = in->Slice(i, i + 1);
+      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+      framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
+      dequant(dev_ctx, &one_channel_in, &one_channel_scale,
+              static_cast<T>(max_range), &one_channel_out);
+    }
+
+    if (scales.size() == 2) {
+      PADDLE_ENFORCE_EQ(
+          scales[1]->numel(), 1,
+          "The second scale tensor should only have one value at now.");
+      max_range = std::pow(2, quant_bits[1] - 1) - 1;
+      dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/fake_quantize_op.cc

@@ -134,6 +134,60 @@ $$Out = round(X/scale * range)$$
   }
 };

+class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeChannelWiseQuantizeOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("OutScales"),
+        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>("X")->type(), ctx.GetPlace());
+  }
+};
+
+class FakeChannelWiseQuantizeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized low level tensor, "
+              "but also saved as float data type.");
+    AddOutput("OutScales", "(Tensor) Current channel wise scale");
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddComment(R"DOC(
+The scale of the FakeChannelWiseQuantize operator is a vector.
+In detail, each channel of the input X has its own scale value.
+
+$$scale_c = max(abs(X_c))$$
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out_c = round(\frac{X_c * range}{scale_c})$$
+
+In the above three formulas, the range of $c$ is $0 \leq c <$ the channel number of $X$.
+)DOC");
+  }
+};
+
 class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  public:
   FakeQuantizeRangeAbsMaxOp(const std::string& type,
...
@@ -218,3 +272,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
                        ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
+
+REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
+                  ops::FakeChannelWiseQuantizeAbsMaxOp,
+                  ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
+                       ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
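And the forward direction, matching the three formulas in the FakeChannelWiseQuantizeAbsMax DOC block above (bit_length = 8 gives range = 127; channel values are invented):

#include <cmath>
#include <cstdio>

int main() {
  int bit_length = 8;
  float range = std::pow(2.0f, bit_length - 1) - 1;  // 127
  float channel[3] = {-0.5f, 0.25f, 0.1f};

  float scale_c = 0.0f;  // scale_c = max(abs(X_c))
  for (float v : channel) scale_c = std::fmax(scale_c, std::fabs(v));

  for (float v : channel) {  // Out_c = round(X_c * range / scale_c)
    std::printf("%g -> %g\n", v, std::round(v * range / scale_c));
  }
  return 0;  // prints -127, 64 (63.5 rounds away from zero), 25
}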
paddle/fluid/operators/fake_quantize_op.cu

@@ -174,5 +174,7 @@ namespace ops = paddle::operators;
 using CUDA = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
                         ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
+                        ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
                         ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
paddle/fluid/operators/fake_quantize_op.h

@@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
   }
 };

+template <typename DeviceContext, typename T>
+class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scales = context.Output<framework::Tensor>("OutScales");
+    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
+    out->mutable_data<T>(context.GetPlace());
+
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
+    for (int64_t i = 0; i < in->dims()[0]; i++) {
+      framework::Tensor one_channel = in->Slice(i, i + 1);
+      const T* one_channel_data = one_channel.data<T>();
+      find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
+                   &out_scales_data[i]);
+    }
+    auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
+    for (int64_t i = 0; i < in->dims()[0]; i++) {
+      framework::Tensor one_channel_in = in->Slice(i, i + 1);
+      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
+      clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
+                 &one_channel_out);
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
  public:
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc

@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
     PADDLE_ENFORCE(ctx->HasInput("W"),
                    "Input W of FusedEmbeddingSeqPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Ids"),
...
@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(boolean, default false) "
               "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
     AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h

@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto* ids = context.Input<LoDTensor>("Ids");
       auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
       auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      // runtime shape
+      d_table->set_height(table_dim[0]);

       auto* ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
...
paddle/fluid/operators/hash_op.cc

@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}

   void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of HashOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
...
@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC");
     AddAttr<int>("num_hash", "").SetDefault(1);
     AddAttr<int>("mod_by", "").SetDefault(100000);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
   }
 };
...
paddle/fluid/operators/math.h (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"

#include "math.h"  // NOLINT

namespace paddle {
namespace operators {

inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
  return static_cast<platform::float16>(::expf(static_cast<float>(x)));
}

inline HOSTDEVICE float real_exp(float x) { return ::expf(x); }

inline HOSTDEVICE double real_exp(double x) { return ::exp(x); }

inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
  return static_cast<platform::float16>(::logf(static_cast<float>(x)));
}

inline HOSTDEVICE float real_log(float x) { return ::logf(x); }

inline HOSTDEVICE double real_log(double x) { return ::log(x); }

}  // namespace operators
}  // namespace paddle
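These overloads exist so kernels can call one name for float, double, and float16 alike and let overload resolution pick the precision. A host-side sketch of the same pattern (platform::float16 omitted, since it needs the Paddle headers):

#include <cmath>
#include <cstdio>

// Same overload idea as paddle/fluid/operators/math.h, minus float16.
inline float real_log(float x) { return ::logf(x); }
inline double real_log(double x) { return ::log(x); }

template <typename T>
T neg_log_likelihood(T p) {
  return -real_log(p);  // the right-precision log is chosen at compile time
}

int main() {
  std::printf("%f\n", neg_log_likelihood(0.5f));  // float path, via logf
  std::printf("%f\n", neg_log_likelihood(0.5));   // double path, via log
  return 0;
}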
paddle/fluid/operators/math/cross_entropy.cu

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
...
@@ -20,17 +21,6 @@ namespace paddle {
 namespace operators {
 namespace math {

-namespace {
-__device__ __forceinline__ float real_log(float x) { return logf(x); }
-
-__device__ __forceinline__ double real_log(double x) { return log(x); }
-
-__device__ __forceinline__ platform::float16 real_log(
-    const platform::float16& val) {
-  return static_cast<platform::float16>(logf(static_cast<float>(val)));
-}
-
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
...
@@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
     Y[blockIdx.x] = -val;
   }
 }
-}  // namespace

 template <typename T>
 class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
...
paddle/fluid/operators/ngraph/ngraph_engine.cc

@@ -29,7 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
...
@@ -42,44 +41,75 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
   for (int i = 0; i < dims.size(); ++i) {
     int k = dims[i];
     k = k == 0 ? 1 : k;
-    sp.push_back(k);
+    sp.emplace_back(k);
   }
   return sp;
 }

+static framework::DDim Shape2Ddim(const ngraph::Shape& shape) {
+  std::vector<int64_t> dims;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    int64_t k = shape[i];
+    dims.emplace_back(k);
+  }
+  return framework::make_ddim(dims);
+}
+
 static std::map<framework::proto::VarType::Type, ngraph::element::Type>
     pd2ng_type_map = {
         {framework::proto::VarType::FP32, ngraph::element::f32},
         {framework::proto::VarType::FP64, ngraph::element::f64},
         {framework::proto::VarType::INT32, ngraph::element::i32},
         {framework::proto::VarType::INT64, ngraph::element::i64},
-        {framework::proto::VarType::BOOL, ngraph::element::boolean},
-};
-
-std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphEngine::func_cache_ = {};
+        {framework::proto::VarType::BOOL, ngraph::element::boolean}};
+
+static std::map<ngraph::element::Type, framework::proto::VarType::Type>
+    ng2pd_type_map = {
+        {ngraph::element::f32, framework::proto::VarType::FP32},
+        {ngraph::element::f64, framework::proto::VarType::FP64},
+        {ngraph::element::i32, framework::proto::VarType::INT32},
+        {ngraph::element::i64, framework::proto::VarType::INT64},
+        {ngraph::element::boolean, framework::proto::VarType::BOOL}};
+
+std::vector<std::string> NgraphEngine::feed_vars = {};
+std::vector<std::string> NgraphEngine::fetch_vars = {};
+framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
+const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
+
+std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
+std::unordered_map<std::string,
+                   std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+    NgraphEngine::t_in_cache_ = {};

 std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");

 static std::vector<std::vector<int>> NgraphOpIntervals(
-    framework::BlockDesc* block) {
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
+  NgraphEngine::feed_vars.clear();
+  NgraphEngine::fetch_vars.clear();
   std::vector<std::vector<int>> intervals;
-  auto ops = block->AllOps();
-  int size = ops.size();
+
+  int size = ops->size();
   int left = 0;
-  while (left < size && ops.at(left)->Type() != framework::kFeedOpType) {
+  while (left < size && ops->at(left)->Type() != framework::kFeedOpType) {
     ++left;
   }
   if (left == size) {
     return intervals;
   }
-  while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
+  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
+    for (auto& var_name_item : ops->at(left)->Outputs()) {
+      for (auto& var_name : var_name_item.second) {
+        NgraphEngine::feed_vars.emplace_back(var_name);
+      }
+    }
     ++left;
   }

   int right = left;
-  while (right < size && ops.at(right)->Type() != framework::kFetchOpType) {
+  while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
     ++right;
   }
   if (right == size) {
@@ -87,85 +117,124 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   }
   if (left >= right) return intervals;

+  int index = right;
+  while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
+    for (auto& var_name_item : ops->at(index)->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        NgraphEngine::fetch_vars.emplace_back(var_name);
+      }
+    }
+    ++index;
+  }
+
   // (left, right - 1) represents indices between feed and fetch
   int pivot = left;
   while (pivot < right) {
-    auto op_type = ops.at(pivot)->Type();
+    auto op_type = ops->at(pivot)->Type();
     if (NgraphBridge::isRegister(op_type)) {
       ++pivot;
     } else {
       int start = pivot, end = start;
       while (pivot < right &&
-             (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
+             (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) {
         ++pivot;
         ++end;
       }
       std::vector<int> interval = {start, end};
-      intervals.push_back(interval);
+      intervals.emplace_back(interval);
     }
   }  // end while
   return intervals;
 }

-static void SubstituteNgraphOp(framework::BlockDesc* block,
-                               std::string block_str,
-                               std::vector<int> interval) {
-  framework::ProgramDesc program;
-  block->RemoveOp(interval.at(0), interval.at(1));
-  auto* ng_op = block->InsertOp(interval.at(0));
-  ng_op->SetType("ngraph_engine");
-  ng_op->SetAttr("interval", interval);
-  ng_op->SetAttr("graph", block_str);
+static void SubstituteNgraphOp(
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops,
+    std::string engine_key, std::string block_str, std::vector<int> interval) {
+  framework::OpDesc ng_op_desc(nullptr);
+  ng_op_desc.SetType("ngraph_engine");
+  ng_op_desc.SetAttr("interval", interval);
+  ng_op_desc.SetAttr("engine_key", engine_key);
+  ng_op_desc.SetAttr("graph", block_str);
+
+  ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]);
+  ops->insert(ops->begin() + interval[0],
+              framework::OpRegistry::CreateOp(ng_op_desc));
 }

-// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089
-void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) {
-#ifdef PADDLE_WITH_NGRAPH
-  VLOG(4) << "use_ngraph=True";
-  for (size_t bid = 0; bid < program.Size(); ++bid) {
-    // TODO(baojun-nervana): Remove the const_cast
-    auto* block =
-        const_cast<framework::ProgramDesc&>(program).MutableBlock(bid);
-    std::string block_str = block->Proto()->SerializeAsString();
-    auto intervals = NgraphOpIntervals(block);
-    for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-      SubstituteNgraphOp(block, block_str, *it);
-    }
-  }
+std::string SerializedBlock(const std::vector<framework::OpDesc*>& op_descs) {
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+
+  for (auto* op_desc : op_descs) {
+    auto* op = block_desc.AppendOp();
+    *op->Proto() = *op_desc->Proto();
+  }
+  return block_desc.Proto()->SerializeAsString();
+}
+
+std::string GenerateEngineKey(const framework::BlockDesc& bdesc) {
+  framework::proto::BlockDesc block_proto;
+  framework::BlockDesc block_desc(nullptr, &block_proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+
+  for (auto& op_desc : bdesc.AllOps()) {
+    auto* op = block_desc.AppendOp();
+    *op->Proto() = *op_desc->Proto();
+  }
+  auto engine_key = std::to_string(
+      std::hash<std::string>()(block_desc.Proto()->SerializeAsString()));
+  return engine_key;
+}
+
+std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
+                              const std::vector<std::string>& engine_outputs,
+                              int size) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  engine_hash_key += std::to_string(size);
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
+
+void NgraphEngine::FuseNgraphOps(
+    const framework::BlockDesc& block_desc,
+    std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
+#ifdef PADDLE_WITH_NGRAPH
+  NgraphEngine::p_bdesc = &block_desc;
+  auto intervals = NgraphOpIntervals(ops);
+  std::string engine_key =
+      GenerateEngineKey(feed_vars, fetch_vars, ops->size());
+  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+    SubstituteNgraphOp(ops, engine_key, "", *it);
+  }
 #else
   LOG(WARNING)
       << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
 #endif
 }

 NgraphEngine::NgraphEngine(const framework::Scope& scope,
                            const platform::Place& place,
-                           const std::string& serialized_graph,
-                           const std::vector<int>& interval)
+                           const framework::ExecutionContext& ctx)
     : scope_(scope), place_(place) {
+  std::string serialized_graph = ctx.Attr<std::string>("graph");
+  auto interval = ctx.Attr<std::vector<int>>("interval");
+  std::string engine_key = ctx.Attr<std::string>("engine_key");
+
   var_in_node_map_ = std::make_shared<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();

   var_node_map_ = std::make_shared<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();

-  func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) +
-                    serialized_graph;
-
-  framework::proto::BlockDesc bdesc;
-  bdesc.ParseFromString(serialized_graph);
-  framework::BlockDesc block(nullptr, &bdesc);
-
-  Prepare(block, interval);
-
-  BuildNgIO();
-
-  GetNgFunction();
+  GetNgFunction(engine_key, interval);
 }

-void NgraphEngine::Prepare(const framework::BlockDesc& block,
-                           const std::vector<int>& interval) {
-  for (auto& var : block.AllVars()) {
+void NgraphEngine::Prepare(const std::vector<int>& interval) {
+  for (auto& var : p_bdesc->AllVars()) {
     if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR ||
           var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
...
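The cache key scheme introduced above concatenates input names, output names, and the op count, then hashes once; a standalone sketch of that GenerateEngineKey shape:

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Mirrors the string-concat-then-hash pattern used for engine_key above.
std::string MakeEngineKey(const std::vector<std::string>& inputs,
                          const std::vector<std::string>& outputs, int size) {
  std::string key;
  for (const auto& name : inputs) key += name;
  for (const auto& name : outputs) key += name;
  key += std::to_string(size);
  return std::to_string(std::hash<std::string>()(key));
}

int main() {
  // Hypothetical feed/fetch variable names and op count.
  auto k = MakeEngineKey({"feed_x", "feed_y"}, {"fetch_out"}, 12);
  std::printf("engine key: %s\n", k.c_str());
  return 0;
}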
@@ -192,108 +261,57 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
     }
   }

-  auto ops_desc = block.AllOps();
-  int idx = interval[0];
-  while (idx < interval[1]) {
-    auto op_desc = ops_desc.at(idx);
-    auto op = framework::OpRegistry::CreateOp(*op_desc);
-    fused_ops_.push_back(std::move(op));
-    ++idx;
-  }
-  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
-    auto op_desc = ops_desc.at(idx);
-    for (auto& var_name_item : op_desc->Inputs()) {
-      for (auto& var_name : var_name_item.second) {
-        post_op_inputs_.insert(var_name);
-      }
-    }
-    ++idx;
-  }
-  while (idx < static_cast<int>(ops_desc.size()) &&
-         ops_desc.at(idx)->Type() == framework::kFetchOpType) {
-    std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
-    fetches_.insert(fetch_target_name);
-    ++idx;
-  }
-  if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
-      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
-    ng_op_state_ = OpState::FULL;
-  }
+  std::vector<paddle::framework::OpDesc*> ops_desc;
+  for (auto op_desc : p_bdesc->AllOps()) {
+    ops_desc.emplace_back(op_desc);
+  }

   for (auto op_desc : ops_desc) {
     if (op_desc->Type().find("_grad") != std::string::npos) {
-      ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN
-                                                   : OpState::PARTIAL_TRAIN;
+      this->is_test_ = false;
       break;
     }
   }

-  if (ng_op_state_ != OpState::FULL_TRAIN &&
-      ng_op_state_ != OpState::PARTIAL_TRAIN) {
-    ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST
-                                                 : OpState::PARTIAL_TEST;
-  }
-}
-
-void NgraphEngine::GetNgInputShape(
-    std::shared_ptr<framework::OperatorBase> op) {
-  framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
-  op->RuntimeInferShape(scope_, place_, ctx);
-  for (auto& var_name_item : op->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto* var = scope_.FindVar(var_name);
-      if (var && var->IsType<framework::LoDTensor>()) {
-        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-        auto sp = Ddim2Shape(tensor_pd->dims());
-        if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
-            var_in_.end()) {
-          if (var_node_map_->find(var_name) == var_node_map_->end()) {
-            // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
-            auto ng_type = var_type_map_.at(var_name);
-            auto prm =
-                std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
-            (*var_node_map_)[var_name] = prm;
-            (*var_in_node_map_)[var_name] = prm;
-          }
-        }
-      }
-    }
-  }
-}
+  if (interval[0] > 0 &&
+      ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
+      interval[1] < static_cast<int>(ops_desc.size()) &&
+      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
+    this->op_state_ = OpState::FULL;
+  }
+
+  if (this->op_state_ == OpState::FULL) {
+    this->op_state_ =
+        this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
+  } else {
+    this->op_state_ =
+        this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN;
+  }
+
+  int idx = interval[0];
+  while (idx < interval[1]) {
+    this->fused_ops_.emplace_back(
+        framework::OpRegistry::CreateOp(*(ops_desc[idx])));
+    ++idx;
+  }
+  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
+    auto op_desc = ops_desc.at(idx);
+    for (auto& var_name_item : op_desc->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        this->post_op_inputs_.insert(var_name);
+      }
+    }
+    ++idx;
+  }
+
+  BuildNgIO(ops_desc, interval);
 }

-void NgraphEngine::BuildNgIO() {
+void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
+                             const std::vector<int>& interval) {
   std::unordered_set<std::string> inputs;
   std::unordered_set<std::string> outputs;

-  for (auto& op : fused_ops_) {
+  for (int i = interval[0]; i < interval[1]; ++i) {
+    auto op = ops_desc[i];
     for (auto& var_name_item : op->Inputs()) {
       for (auto& var_name : var_name_item.second) {
         inputs.insert(var_name);
...
@@ -302,15 +320,11 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
             std::find(var_in_.begin(), var_in_.end(), var_name) ==
                 var_in_.end()) {
           // fill var_in here to keep lhs and rhs order
-          var_in_.push_back(var_name);
+          this->var_in_.emplace_back(var_name);
         }
       }
     }
-    if (op->Type() != "fill_constant") {
-      GetNgInputShape(op);
-    }
+
     for (auto& var_name_item : op->Outputs()) {
       PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
                         "op %s has more than 1 output - Not handling yet",
...
@@ -322,172 +336,278 @@ void NgraphEngine::BuildNgIO() {
}
// var_out.clear();
for
(
auto
&
op
:
fused_ops_
)
{
for
(
int
i
=
interval
[
0
];
i
<
interval
[
1
];
++
i
)
{
auto
op
=
ops_desc
[
i
];
for
(
auto
&
var_name_item
:
op
->
Outputs
())
{
PADDLE_ENFORCE_LE
(
var_name_item
.
second
.
size
(),
1
,
"op %s has more than 1 output - Not handling yet"
,
op
->
Type
());
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
switch
(
ng_
op_state_
)
{
switch
(
this
->
op_state_
)
{
case
OpState
::
PARTIAL_TEST
:
if
(
post_op_inputs_
.
find
(
var_name
)
!=
post_op_inputs_
.
end
()
||
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
())
{
var_out_
.
push_back
(
var_name
);
find
(
fetch_vars
.
begin
(),
fetch_vars
.
end
(),
var_name
)
!=
fetch_vars
.
end
())
{
this
->
var_out_
.
emplace_back
(
var_name
);
}
break
;
case
OpState
::
FULL_TEST
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
())
{
var_out_
.
push_back
(
var_name
);
if
(
find
(
fetch_vars
.
begin
(),
fetch_vars
.
end
(),
var_name
)
!=
fetch_vars
.
end
())
{
this
->
var_out_
.
emplace_back
(
var_name
);
}
break
;
case
OpState
::
PARTIAL_TRAIN
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
()
||
if
(
find
(
fetch_vars
.
begin
(),
fetch_vars
.
end
(),
var_name
)
!=
fetch_vars
.
end
()
||
post_op_inputs_
.
find
(
var_name
)
!=
post_op_inputs_
.
end
()
||
persistables_
.
find
(
var_name
)
!=
persistables_
.
end
())
{
var_out_
.
push
_back
(
var_name
);
this
->
var_out_
.
emplace
_back
(
var_name
);
}
break
;
case
OpState
::
FULL_TRAIN
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
()
||
if
(
find
(
fetch_vars
.
begin
(),
fetch_vars
.
end
(),
var_name
)
!=
fetch_vars
.
end
()
||
persistables_
.
find
(
var_name
)
!=
persistables_
.
end
())
{
var_out_
.
push
_back
(
var_name
);
this
->
var_out_
.
emplace
_back
(
var_name
);
}
break
;
default:
var_out_
.
push
_back
(
var_name
);
this
->
var_out_
.
emplace
_back
(
var_name
);
}
}
}
}
for
(
size_t
i
=
0
;
i
<
var_in_
.
size
();
++
i
)
{
auto
var_name
=
var_in_
[
i
];
if
(
persistables_
.
find
(
var_name
)
==
persistables_
.
end
())
{
var_in_updates_
.
emplace_back
(
i
);
}
}
}
void
NgraphEngine
::
BuildNgFunction
()
{
void
NgraphEngine
::
GetNgInputShape
()
{
for
(
auto
&
var_name
:
var_in_
)
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
if
(
var
&&
var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
auto
sp
=
Ddim2Shape
(
tensor_pd
->
dims
());
auto
ng_type
=
var_type_map_
[
var_name
];
auto
prm
=
std
::
make_shared
<
ngraph
::
op
::
Parameter
>
(
ng_type
,
sp
,
true
);
(
*
var_node_map_
)[
var_name
]
=
prm
;
(
*
var_in_node_map_
)[
var_name
]
=
prm
;
}
}
}
void
NgraphEngine
::
BuildNgNodes
()
{
for
(
auto
&
op
:
fused_ops_
)
{
for
(
auto
&
var_name_item
:
op
->
Outputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
if
(
var_node_map_
->
find
(
var_name
)
==
var_node_map_
->
end
())
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
if
(
var
&&
var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
auto
&
ddim
=
tensor_pd
->
dims
();
auto
ng_shape
=
Ddim2Shape
(
ddim
);
auto
ng_type
=
var_type_map_
[
var_name
];
auto
prm
=
std
::
make_shared
<
ngraph
::
op
::
Parameter
>
(
ng_type
,
ng_shape
,
true
);
(
*
var_node_map_
)[
var_name
]
=
prm
;
}
}
}
}
}
NgraphBridge
ngb
(
var_node_map_
);
for
(
auto
&
op
:
fused_ops_
)
{
ngb
.
BuildNgNode
(
op
);
}
}
void
NgraphEngine
::
RunInferShape
()
{
for
(
auto
&
op
:
fused_ops_
)
{
framework
::
RuntimeContext
ctx
(
op
->
Inputs
(),
op
->
Outputs
(),
scope_
);
op
->
RuntimeInferShape
(
scope_
,
place_
,
ctx
);
}
}
void
NgraphEngine
::
BuildNgFunction
(
const
std
::
vector
<
int
>&
interval
)
{
Prepare
(
interval
);
RunInferShape
();
GetNgInputShape
();
BuildNgNodes
();
ngraph_function_
=
nullptr
;
ngraph
::
NodeVector
func_outputs
;
ngraph
::
ParameterVector
func_inputs
;
for
(
auto
&
vo
:
var_out_
)
{
func_outputs
.
push
_back
(
var_node_map_
->
at
(
vo
));
func_outputs
.
emplace
_back
(
var_node_map_
->
at
(
vo
));
}
for
(
auto
&
vi
:
var_in_
)
{
std
::
shared_ptr
<
ngraph
::
op
::
Parameter
>
prm
=
std
::
dynamic_pointer_cast
<
ngraph
::
op
::
Parameter
>
(
var_in_node_map_
->
at
(
vi
));
func_inputs
.
push
_back
(
prm
);
func_inputs
.
emplace
_back
(
prm
);
}
ngraph_function_
=
std
::
make_shared
<
ngraph
::
Function
>
(
func_outputs
,
func_inputs
);
}
-void NgraphEngine::GetNgFunction() {
-  bool cache_on = true;
-  if (cache_on) {
-    std::string input_shape_str;
-    for (auto& var_name : var_in_) {
-      auto shape = var_node_map_->at(var_name)->get_shape();
-      for (size_t i = 0; i < shape.size(); ++i) {
-        input_shape_str += std::to_string(shape.at(i));
-      }
-    }
-    func_cache_key_ = input_shape_str + func_cache_key_;
-    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
-      ngraph_function_ = func_cache_.at(func_cache_key_);
-    } else {
-      BuildNgFunction();
-      func_cache_[func_cache_key_] = ngraph_function_;
-    }
-  } else {
-    BuildNgFunction();
-  }
-}
+void NgraphEngine::GetNgFunction(std::string engine_key,
+                                 const std::vector<int>& interval) {
+  bool use_cache = true;
+  if (use_cache) {
+    this->func_cache_key_ = "";
+    for (int i = 0; i < std::min(static_cast<int>(feed_vars.size()), 10);
+         ++i) {
+      auto* var = scope_.FindVar(feed_vars[i]);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto dims = tensor_pd->dims();
+        for (int j = 0; j < dims.size(); ++j) {
+          func_cache_key_ += std::to_string(dims[j]);
+        }
+      }
+    }
+    func_cache_key_ += std::to_string(interval[0]) + "_" +
+                       std::to_string(interval[1]) + engine_key;
+    func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
+    if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
+      if (engine_cache[func_cache_key_].persistables.size() == 0) {
+        engine_cache.clear();
+        t_in_cache_.clear();
+      } else {
+        auto var_name = engine_cache[func_cache_key_].persistables.begin();
+        framework::Variable* var = scope_.FindVar(*var_name);
+        if (var != pre_var_ptr) {
+          engine_cache.clear();
+          t_in_cache_.clear();
+        }
+        pre_var_ptr = var;
+      }
+    }
+    if (engine_cache.find(func_cache_key_) == engine_cache.end()) {
+      BuildNgFunction(interval);
+      engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_;
+      engine_cache[func_cache_key_].persistables = this->persistables_;
+      engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
+      engine_cache[func_cache_key_].var_in = this->var_in_;
+      engine_cache[func_cache_key_].var_out = this->var_out_;
+      engine_cache[func_cache_key_].is_test = this->is_test_;
+    }
+  } else {
+    BuildNgFunction(interval);
+  }
+}
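The new cache key concatenates the first few feed-tensor shapes with the op interval and the engine key, then hashes the whole string, so any change in input shape maps to a different compiled function. A standalone sketch of the same keying scheme (the dims, interval, and engine_key values are invented):

#include <functional>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::vector<int>> feed_dims = {{32, 3, 224, 224}, {32, 1000}};
  std::vector<int> interval = {0, 17};     // fused-op range, hypothetical
  std::string engine_key = "block0_sub3";  // hypothetical

  std::string key;
  for (const auto& dims : feed_dims)
    for (int d : dims) key += std::to_string(d);
  key += std::to_string(interval[0]) + "_" + std::to_string(interval[1]) +
         engine_key;
  // One fixed-width token instead of an unbounded concatenation.
  std::cout << std::hash<std::string>()(key) << "\n";
}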
void NgraphEngine::Run(const framework::Scope& scope,
                       const platform::Place& place) const {
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
-  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
+  std::shared_ptr<ngraph::Function> ng_func;
+  const std::set<std::string>* p_persistables;
+  const std::vector<size_t>* p_var_in_updates;
+  const std::vector<std::string>* p_var_in;
+  const std::vector<std::string>* p_var_out;
+  bool is_test;
+  bool use_cache = true;
+  if (use_cache) {
+    PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(),
+                   "Cannot find cached data to run ngraph function");
+    ng_func = engine_cache[func_cache_key_].ngraph_function;
+    p_persistables = &(engine_cache[func_cache_key_].persistables);
+    p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
+    p_var_in = &(engine_cache[func_cache_key_].var_in);
+    p_var_out = &(engine_cache[func_cache_key_].var_out);
+    is_test = engine_cache[func_cache_key_].is_test;
+  } else {
+    ng_func = ngraph_function_;
+    p_persistables = &this->persistables_;
+    p_var_in_updates = &this->var_in_updates_;
+    p_var_in = &this->var_in_;
+    p_var_out = &this->var_out_;
+    is_test = this->is_test_;
+  }
-  for (size_t i = 0; i < var_in_.size(); ++i) {
-    auto vi = var_in_.at(i);
-    auto sp = var_node_map_->at(vi)->get_shape();
-    std::shared_ptr<ngraph::runtime::Tensor> ti;
-    auto* var = scope.FindVar(vi);
-    if (var && var->IsType<framework::LoDTensor>()) {
-      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
-                     "Ensure ngraph tensor layout align with paddle tensor");
-      auto ng_type = var_type_map_.at(vi);
-      if (ng_type == ngraph::element::f32) {
-        auto pd_arr = tensor_pd->mutable_data<float>(place);
-        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i32) {
-        const int* arr = tensor_pd->data<int>();
-        ti = backend_->create_tensor(ngraph::element::i32, sp,
-                                     const_cast<int*>(arr));
-      } else if (ng_type == ngraph::element::i64) {
-        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
-        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
-      } else if (ng_type == ngraph::element::f64) {
-        auto pd_arr = tensor_pd->mutable_data<double>(place);
-        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
-      } else if (ng_type == ngraph::element::boolean) {
-        auto pd_arr = tensor_pd->mutable_data<bool>(place);
-        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
-      } else {
-        PADDLE_THROW("Data type not handling for var %s", vi);
-      }
-    } else {
-      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
-    }
-    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
-                    ng_op_state_ == OpState::FULL_TEST)
-                       ? true
-                       : false;
-    bool is_persistable =
-        (persistables_.find(vi) != persistables_.end()) ? true : false;
-    if (is_test && is_persistable) {
-      ti->set_stale(false);
-    }
-    t_in.push_back(ti);
-  }
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
+  auto m_parameters = ng_func->get_parameters();
+  auto m_results = ng_func->get_results();
+  if (is_test && use_cache &&
+      t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
+    p_t_in = &(t_in_cache_[func_cache_key_]);
+    for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
+      int index = p_var_in_updates->at(i);
+      auto vi = p_var_in->at(index);
+      auto sp = m_parameters[index]->get_shape();
+      auto ng_type = m_parameters[index]->get_element_type();
+      std::shared_ptr<ngraph::runtime::Tensor> ti;
+      auto* var = scope.FindVar(vi);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
+        (*p_t_in)[index] = ti;
+      } else {
+        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+      }
+    }
+  } else {
+    if (is_test && use_cache) {
+      p_t_in = &(t_in_cache_[func_cache_key_]);
+    } else {
+      p_t_in = &t_in;
+    }
+    for (size_t i = 0; i < p_var_in->size(); ++i) {
+      auto vi = p_var_in->at(i);
+      auto sp = m_parameters[i]->get_shape();
+      auto ng_type = m_parameters[i]->get_element_type();
+      std::shared_ptr<ngraph::runtime::Tensor> ti;
+      auto* var = scope.FindVar(vi);
+      if (var && var->IsType<framework::LoDTensor>()) {
+        auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+        void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+        PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                       "Ensure ngraph tensor layout align with paddle tensor");
+        ti = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+      }
+      bool is_persistable =
+          (p_persistables->find(vi) != p_persistables->end()) ? true : false;
+      if (is_test && is_persistable) {
+        ti->set_stale(false);
+      }
+      (*p_t_in).emplace_back(ti);
+    }
+  }
-  for (size_t i = 0; i < var_out_.size(); ++i) {
-    auto vo = var_out_[i];
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
+  for (size_t i = 0; i < p_var_out->size(); ++i) {
+    auto vo = p_var_out->at(i);
    auto* var = scope.FindVar(vo);
-    std::shared_ptr<ngraph::runtime::Tensor> to;
    if (var && var->IsType<framework::LoDTensor>()) {
+      auto sp = m_results[i]->get_shape();
+      var->GetMutable<framework::LoDTensor>()->Resize(Shape2Ddim(sp));
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
-      auto dd = tensor_pd->dims();
-      ngraph::Shape sp = Ddim2Shape(dd);
-      auto ng_type = var_type_map_.at(vo);
-      if (ng_type == ngraph::element::f32) {
-        auto pd_arr = tensor_pd->mutable_data<float>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i64) {
-        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::i32) {
-        auto pd_arr = tensor_pd->mutable_data<int>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::f64) {
-        auto pd_arr = tensor_pd->mutable_data<double>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else if (ng_type == ngraph::element::boolean) {
-        auto pd_arr = tensor_pd->mutable_data<bool>(place);
-        to = backend_->create_tensor(ng_type, sp, pd_arr);
-      } else {
-        PADDLE_THROW("Data type not handled in for var %s", vo);
-      }
-      t_out.push_back(to);
+      auto ng_type = m_results[i]->get_element_type();
+      void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
+      std::shared_ptr<ngraph::runtime::Tensor> to =
+          backend_->create_tensor(ng_type, sp, pd_arr);
+      t_out.emplace_back(to);
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
    }
  }
-  auto handle = backend_->compile(ngraph_function_);
-  handle->call_with_validate(t_out, t_in);
+  auto handle = backend_->compile(ng_func);
+  handle->call_with_validate(t_out, *p_t_in);
}  // NgraphEngine::Run
}  // namespace operators
}  // namespace paddle
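In the cached path, persistable inputs (weights) are marked not stale so the backend can skip re-copying them between runs, while only the indices recorded in var_in_updates get fresh tensor wrappers. A toy model of that update policy, with a stand-in tensor type (not the ngraph one):

#include <iostream>
#include <memory>
#include <vector>

struct FakeTensor { bool stale = true; };  // stand-in for ngraph::runtime::Tensor

int main() {
  // Cached wrappers from a previous run: index 0 is a weight, index 1 a feed.
  std::vector<std::shared_ptr<FakeTensor>> t_in = {
      std::make_shared<FakeTensor>(), std::make_shared<FakeTensor>()};
  t_in[0]->stale = false;             // weight: backend may reuse device copy
  std::vector<size_t> var_in_updates = {1};

  for (size_t idx : var_in_updates) {
    t_in[idx] = std::make_shared<FakeTensor>();  // rebuild only the feed
  }
  std::cout << "weight stale=" << t_in[0]->stale
            << " feed stale=" << t_in[1]->stale << "\n";
}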
paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "ngraph/ngraph.hpp"
@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */
UNKNOWN
/* Output all for debug purpose */
};
// cache engine repetitives
struct EngineCache {
  std::shared_ptr<ngraph::Function> ngraph_function;
  std::set<std::string> persistables;
  std::vector<std::string> var_in;
  std::vector<std::string> var_out;
  std::vector<size_t> var_in_updates;
  bool is_test = true;
};
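Everything Run() needs besides the compiled function travels with it in this struct, so one cache lookup restores the whole I/O contract. A minimal sketch of the lookup-or-build pattern around such a struct (FakeFunc stands in for ngraph::Function; the key and members are illustrative):

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct FakeFunc {};  // stand-in for the compiled ngraph::Function
struct Entry {
  std::shared_ptr<FakeFunc> fn;
  std::vector<std::string> var_in, var_out;
};

std::unordered_map<std::string, Entry> cache;

const Entry& GetOrBuild(const std::string& key) {
  auto it = cache.find(key);
  if (it == cache.end()) {
    Entry e{std::make_shared<FakeFunc>(), {"x"}, {"y"}};  // "build" once
    it = cache.emplace(key, std::move(e)).first;
  }
  return it->second;
}

int main() {
  GetOrBuild("k1");
  GetOrBuild("k1");  // second call is a pure lookup
  std::cout << "entries: " << cache.size() << "\n";  // prints 1
}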
// perform graph build through bridge and execute computation
class NgraphEngine {
 public:
  explicit NgraphEngine(const framework::Scope& scope,
                        const platform::Place& place,
-                       const std::string& serialized_graph,
-                       const std::vector<int>& interval);
+                       const framework::ExecutionContext& ctx);

  void Run(const framework::Scope& scope, const platform::Place& place) const;

  static void EnableNgraph(const framework::ProgramDesc& program);

+  static const framework::BlockDesc* p_bdesc;
+  static std::vector<std::string> feed_vars, fetch_vars;
+
+  static void FuseNgraphOps(
+      const framework::BlockDesc& prog,
+      std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
 private:
-  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache_;
+  static std::unordered_map<std::string, EngineCache> engine_cache;
+  static std::unordered_map<
+      std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
+      t_in_cache_;
+  static framework::Variable* pre_var_ptr;

  const framework::Scope& scope_;
  const platform::Place& place_;
  std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
-  std::unordered_set<std::string> persistables_;
-  std::unordered_set<std::string> fetches_;
+  std::set<std::string> persistables_;
  std::unordered_set<std::string> post_op_inputs_;
-  OpState ng_op_state_ = OpState::UNKNOWN;
+  OpState op_state_ = OpState::UNKNOWN;
+  bool is_test_{true};
  std::string func_cache_key_;
  // ngraph backend eg. CPU
@@ -66,6 +90,8 @@ class NgraphEngine {
  std::vector<std::string> var_in_;
  // var_name of outputs from fetch in order
  std::vector<std::string> var_out_;
  // non-persistable var_in
  std::vector<size_t> var_in_updates_;
  // map input vars to nodes
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...
@@ -74,20 +100,23 @@ class NgraphEngine {
      var_node_map_;
-  // prepare info for nraph engine
-  void Prepare(const framework::BlockDesc& block,
-               const std::vector<int>& interval);
+  // prepare info for ngraph engine need
+  void Prepare(const std::vector<int>& interval);
+  // get ngraph engine input and output list
+  void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
+                 const std::vector<int>& interval);
  // get ngraph input and define ngraph input parameters
-  void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
+  void GetNgInputShape();
  // Call ngraph bridge to map ops
  void BuildNgNodes();
-  // get the ngraph input and output var list
-  void BuildNgIO();
+  // run paddle RuntimeInferShape to get the tensor shape
+  void RunInferShape();
  // build ngraph function call
-  void BuildNgFunction();
+  void BuildNgFunction(const std::vector<int>& interval);
  // Check cache for ngraph function or otherwise build the function
-  void GetNgFunction();
+  void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
};
}  // namespace operators
}  // namespace paddle

#endif  // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Xs", "A list of inputs.").AsDispensable();
    AddOutput("Ys", "A list of outputs").AsDispensable();
    AddAttr<std::string>("graph", "the graph.");
+    AddAttr<std::string>("engine_key", "the engine hash key.");
    AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
    AddComment("ngraph engine operator.");
  }
paddle/fluid/operators/ngraph/ngraph_engine_op.h
@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& scope = ctx.scope();
    auto place = ctx.GetPlace();
-    std::string serialized_graph = ctx.Attr<std::string>("graph");
-    auto interval = ctx.Attr<std::vector<int>>("interval");
-    NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
+    NgraphEngine ngraph_engine(scope, place, ctx);
    ngraph_engine.Run(scope, place);
  }
};
paddle/fluid/operators/reshape_op.cc
@@ -219,14 +219,6 @@ class ReshapeKernel {
          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
    }
-    if (!in->lod().empty()) {
-      PADDLE_ENFORCE_EQ(
-          out_dims[0], in->dims()[0],
-          "Reshape operator cannot reshape an input sequence batch "
-          "into an output sequence batch that has a different "
-          "number of time steps. Please consider using "
-          "sequence_reshape op.");
-    }
    out->mutable_data(ctx.GetPlace(), in->type());
    framework::TensorCopy(
paddle/fluid/operators/selu_op.h
@@ -15,13 +15,12 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
namespace operators {

-static HOSTDEVICE float real_exp(float x) { return expf(x); }
-static HOSTDEVICE float real_exp(double x) { return exp(x); }

template <typename T>
struct SeluFunctor {
  SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->IsRuntime()) {
-      return;
-    }
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequecceEnumerate operator should not be null.");
@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
    });
    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
        .SetDefault(0);
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+        .SetDefault(true);
    AddComment(R"DOC(
Sequence Enumerate Operator.
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <cub/cub.cuh>  // NOLINT
+#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"

namespace paddle {
@@ -21,9 +22,6 @@ namespace operators {
using LoDTensor = framework::LoDTensor;

-__device__ __forceinline__ float real_exp(float x) { return expf(x); }
-__device__ __forceinline__ double real_exp(double x) { return exp(x); }

template <typename T, int BlockDim>
using BlockReduce = cub::BlockReduce<T, BlockDim>;
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cub/cub.cuh"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h"
...
...
@@ -21,11 +22,6 @@ namespace operators {
using Tensor = framework::Tensor;

-static HOSTDEVICE float real_exp(float x) { return expf(x); }
-static HOSTDEVICE float real_exp(double x) { return exp(x); }
-static HOSTDEVICE float real_log(float x) { return logf(x); }
-static HOSTDEVICE float real_log(double x) { return log(x); }

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
paddle/fluid/operators/slice_op.cu
@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
platform
::
PADDLE_CUDA_NUM_THREADS
;
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
                        const int* out_dims, const int* in_dims,
                        const int* offsets, int64_t n,
                        paddle::platform::float16* d_in) {
  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (out_idx < n) {
    int coords[D] = {0};
    for (int i = D - 1; i >= 0; --i) {
      coords[i] = out_idx % out_dims[i];
      out_idx /= out_dims[i];
      coords[i] += offsets[i];
    }

    int64_t in_idx = 0;
    for (int i = 0; i < D - 1; ++i) {
      in_idx += coords[i] * in_dims[i + 1];
    }
    in_idx += coords[D - 1];

    d_in[in_idx] = d_out[out_idx];
  }
}
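The kernel above undoes a slice by mapping each flattened output index to multi-dimensional coordinates, shifting them by the slice offsets, and re-linearizing into the larger input. The same decomposition can be checked on the host; this sketch (not from the commit) uses full row-major strides for the re-linearization step:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> out_dims = {2, 3};  // sliced gradient shape
  std::vector<int> in_dims = {4, 5};   // original input shape
  std::vector<int> offsets = {1, 2};   // where the slice started

  int64_t out_idx = 4;                 // element (1, 1) of the 2x3 block
  std::vector<int> coords(out_dims.size());
  for (int i = static_cast<int>(out_dims.size()) - 1; i >= 0; --i) {
    coords[i] = static_cast<int>(out_idx % out_dims[i]) + offsets[i];
    out_idx /= out_dims[i];
  }
  int64_t in_idx = 0;
  for (size_t i = 0; i < in_dims.size(); ++i) {
    in_idx = in_idx * in_dims[i] + coords[i];  // row-major linearization
  }
  std::cout << "scatters to input index " << in_idx << "\n";  // (2,3) -> 13
}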
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>
    : public framework::OpKernel<paddle::platform::float16> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());

    auto out_dims = d_out->dims();
    auto in_dims = d_in->dims();
    int rank = out_dims.size();
    std::vector<int> offsets(rank, 0);
    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto starts = ctx.Attr<std::vector<int>>("starts");

    for (size_t i = 0; i < starts.size(); ++i) {
      if (starts[i] < 0) {
        starts[i] += in_dims[axes[i]];
      }
      offsets[axes[i]] = std::max(starts[i], 0);
    }

    math::SetConstant<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>
        set_zero;
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::CUDADeviceContext>();
    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));

    int64_t numel = d_out->numel();
    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
    dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
    auto stream = ctx.cuda_device_context().stream();

    auto out_shape = framework::vectorize2int(out_dims);
    thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
    auto in_shape = framework::vectorize2int(in_dims);
    thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
    thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
    const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
    const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
    const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());

    switch (rank) {
      case 1:
        Padding<1><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
      case 2:
        Padding<2><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
      case 3:
        Padding<3><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
      case 4:
        Padding<4><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
      case 5:
        Padding<5><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
      case 6:
        Padding<6><<<blocks, threads, 0, stream>>>(
            d_out->data<paddle::platform::float16>(), out_dims_ptr,
            in_dims_ptr, offsets_ptr, numel,
            d_in->data<paddle::platform::float16>());
        break;
    }
  }
};

}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
    slice_grad,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
paddle/fluid/operators/sync_batch_norm_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
namespace ops = paddle::operators;

REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
paddle/fluid/operators/sync_batch_norm_op.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T, int BlockDim, framework::DataLayout layout>
__global__ void KeLocalStats(const T* x, int N, int M, int C, T* mean_var) {
  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  for (int k = blockIdx.x; k < C; k += gridDim.x) {
    T x_sum = 0;
    T x2_sum = 0;
    for (int i = threadIdx.x; i < N * M; i += BlockDim) {
      int id = layout == framework::DataLayout::kNCHW
                   ? (i / M) * C * M + k * M + i % M
                   : i * C + k;
      T x_in = x[id];
      x_sum += x_in;
      x2_sum += x_in * x_in;
    }
    __syncthreads();
    T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      mean_var[k] = out / (N * M);
    }
    out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      mean_var[k + C] = out / (N * M);
    }
  }
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    mean_var[2 * C] = static_cast<T>(1.0);
  }
}
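KeLocalStats accumulates per-channel Σx and Σx² and stores E[x] and E[x²] side by side in one buffer; the variance is recovered later as E[x²] − E[x]². The arithmetic, checked on the host:

#include <iostream>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};  // one channel's values
  float sum = 0.f, sq_sum = 0.f;
  for (float v : x) { sum += v; sq_sum += v * v; }
  const float n = static_cast<float>(x.size());
  const float mean = sum / n;               // E[x]
  const float mean_sq = sq_sum / n;         // E[x^2]
  const float var = mean_sq - mean * mean;  // E[x^2] - E[x]^2
  std::cout << "mean=" << mean << " var=" << var << "\n";  // 2.5, 1.25
}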
template <typename T>
__global__ void KeSyncAndMovingStats(T* means, T* variances, T* num_dev,
                                     const int C, const T momentum,
                                     const double epsilon, T* sv_mean_data,
                                     T* sv_inv_var_data, T* moving_means,
                                     T* moving_variances) {
  // sync stats across multi-devices
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < C; i += stride) {
    T mean = means[i] / (*num_dev);
    T var = variances[i] / (*num_dev);
    var = var - mean * mean;

    // sync stats
    sv_mean_data[i] = mean;
    sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon);
    variances[i] = var;

    // moving stats
    moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum);
    moving_variances[i] =
        moving_variances[i] * momentum + var * (1. - momentum);
  }
}
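After dividing the all-reduced statistics by the device count, the kernel folds the fresh batch statistics into the running estimates with the usual exponential moving average, moving = moving * momentum + batch * (1 - momentum). The same update on the host:

#include <iostream>

int main() {
  double moving_mean = 0.50, batch_mean = 0.80;
  const double momentum = 0.9;
  moving_mean = moving_mean * momentum + batch_mean * (1.0 - momentum);
  std::cout << moving_mean << "\n";  // 0.53: slow drift toward the batch stat
}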
template <typename T, framework::DataLayout layout>
static __global__ void KeNormAffine(const T* x, const T* scale, const T* bias,
                                    const T* mean, const T* variance,
                                    const double epsilon, const int C,
                                    const int M, const int num, T* y) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = gid; i < num; i += stride) {
    const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
    y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c];
  }
}
template <typename DeviceContext, typename T>
class SyncBatchNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = framework::StringToDataLayout(layout_str);
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    PADDLE_ENFORCE(
        !use_global_stats,
        "sync_batch_norm doesn't support to set use_global_stats True. ",
        "Please use batch_norm in this case.");

    const auto* x = ctx.Input<Tensor>("X");
    const auto& x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
    int x_numel = x->numel();

    const T* x_d = x->data<T>();
    const T* s_d = ctx.Input<Tensor>("Scale")->data<T>();
    const T* b_d = ctx.Input<Tensor>("Bias")->data<T>();

    auto* y = ctx.Output<Tensor>("Y");
    T* y_d = y->mutable_data<T>(ctx.GetPlace());

    const T* mean_data = nullptr;
    const T* var_data = nullptr;

    auto& dev_ctx = ctx.cuda_device_context();
    auto stream = dev_ctx.stream();
    auto* comm = dev_ctx.nccl_comm();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();

    paddle::memory::AllocationPtr alloc_ptr{nullptr};

    if (is_test) {
      const auto* est_mean = ctx.Input<Tensor>("Mean");
      const auto* est_var = ctx.Input<Tensor>("Variance");
      mean_data = est_mean->data<T>();
      var_data = est_var->data<T>();
    } else {
      auto& allocator =
          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
      // x, x^2, 1, here 1 is used to calc device num
      // device num also can be got from platform::DeviceContextPool
      const int bytes = (C * 2 + 1) * sizeof(T);
      alloc_ptr = allocator.Allocate(bytes);
      T* stats = reinterpret_cast<T*>(alloc_ptr->ptr());

      const int threads = 256;
      int grid = std::min(C, (max_threads + threads - 1) / threads);
      if (layout == framework::DataLayout::kNCHW) {
        KeLocalStats<T, threads, framework::DataLayout::kNCHW>
            <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
      } else {
        KeLocalStats<T, threads, framework::DataLayout::kNHWC>
            <<<grid, threads, 0, stream>>>(x_d, N, H * W * D, C, stats);
      }

      Tensor c_g_st;
      T* c_g_st_d = c_g_st.mutable_data<T>({2 * C + 1}, platform::CPUPlace());
      auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
      memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);

      int dtype = platform::ToNCCLDataType(x->type());
      // In-place operation
      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
          comm, stream));

      // moving mean/variance
      auto* mean_out = ctx.Output<Tensor>("MeanOut");
      auto* variance_out = ctx.Output<Tensor>("VarianceOut");
      T* est_mean_data = mean_out->mutable_data<T>(ctx.GetPlace());
      T* est_var_data = variance_out->mutable_data<T>(ctx.GetPlace());

      auto* saved_mean = ctx.Output<Tensor>("SavedMean");
      auto* saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
      T* sv_mean_data = saved_mean->mutable_data<T>(ctx.GetPlace());
      T* sv_inv_var_data = saved_inv_variance->mutable_data<T>(ctx.GetPlace());

      // Note, Input('Mean')/Input('Variance') share variable with
      // Output('MeanOut')/Output('VarianceOut')
      KeSyncAndMovingStats<T><<<(C + block - 1) / block, block, 0, stream>>>(
          stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data,
          sv_inv_var_data, est_mean_data, est_var_data);

      mean_data = sv_mean_data;
      var_data = stats + C;
    }

    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
    if (layout == framework::DataLayout::kNCHW) {
      KeNormAffine<T, framework::DataLayout::kNCHW>
          <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
                                        epsilon, C, H * W * D, x_numel, y_d);
    } else {
      KeNormAffine<T, framework::DataLayout::kNHWC>
          <<<grid2, block, 0, stream>>>(x_d, s_d, b_d, mean_data, var_data,
                                        epsilon, C, H * W * D, x_numel, y_d);
    }
  }
};
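The forward pass works because sums are additive across devices: each GPU writes its local per-channel E[x], E[x²], and a literal 1 into a (2C+1)-element buffer, ncclAllReduce adds the buffers elementwise, and the trailing slot then holds the device count needed to turn the totals back into means. A two-"device" host simulation of that buffer protocol (assuming every device sees the same number of elements, which the kernel's per-device division by N*M implies):

#include <iostream>
#include <vector>

int main() {
  const int C = 2;
  // Per-device buffers as KeLocalStats leaves them:
  // [E[x] per channel, E[x^2] per channel, 1].
  std::vector<float> dev0 = {1.0f, 3.0f, 2.0f, 10.0f, 1.0f};
  std::vector<float> dev1 = {3.0f, 1.0f, 10.0f, 2.0f, 1.0f};

  // Elementwise sum, i.e. what ncclAllReduce(ncclSum) produces everywhere.
  std::vector<float> g(2 * C + 1);
  for (int i = 0; i < 2 * C + 1; ++i) g[i] = dev0[i] + dev1[i];

  const float num_dev = g[2 * C];  // the reduced 1s count the devices
  for (int c = 0; c < C; ++c) {
    float mean = g[c] / num_dev;                   // average of local means
    float var = g[c + C] / num_dev - mean * mean;  // E[x^2] - E[x]^2
    std::cout << "c" << c << ": mean=" << mean << " var=" << var << "\n";
  }  // c0: mean=2 var=2, c1: mean=2 var=2
}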
template <typename T, const int BlockDim, framework::DataLayout layout>
__global__ void KeBackwardLocalStats(const T* dy, const T* x, const T* means,
                                     int N, int M, int C, T* sum_dy_prod) {
  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  for (int k = blockIdx.x; k < C; k += gridDim.x) {
    T sum1 = 0;
    T sum2 = 0;
    T mean = means[k];
    for (int i = threadIdx.x; i < N * M; i += blockDim.x) {
      int id = layout == framework::DataLayout::kNCHW
                   ? (i / M) * C * M + k * M + i % M
                   : i * C + k;
      T g = dy[id];
      sum1 += g;
      sum2 += g * (x[id] - mean);
    }
    __syncthreads();
    T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      sum_dy_prod[k] = out;
    }
    out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      sum_dy_prod[k + C] = out;
    }
  }
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    sum_dy_prod[2 * C] = static_cast<T>(1.0);
  }
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(
    const T* dy, const T* x, const T* mean, const T* inv_variance,
    const double epsilon, const int N, const int C, const int HxW, T* dscale,
    T* dbias) {
  const int outer_size = C;
  const int inner_size = N * HxW;
  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    T ds_sum = static_cast<T>(0);
    T db_sum = static_cast<T>(0);
    T inv_var_i = inv_variance[i];
    T mean_i = mean[i];
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      const int id = layout == framework::DataLayout::kNCHW
                         ? ((j / HxW) * C + i) * HxW + (j % HxW)
                         : j * outer_size + i;
      ds_sum += dy[id] * (x[id] - mean_i);
      db_sum += dy[id];
    }
    __syncthreads();
    double os = BlockReduce(temp_storage)
                    .Reduce(static_cast<double>(ds_sum), cub::Sum());
    __syncthreads();
    double ob = BlockReduce(temp_storage)
                    .Reduce(static_cast<double>(db_sum), cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      dscale[i] = static_cast<T>(os * inv_var_i);
      dbias[i] = static_cast<T>(ob);
    }
    __syncthreads();
  }
}
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(
    const T* dy, const T* x, const T* beta, const T* mean,
    const T* inv_variance, const T* g_sum_dy, const T* g_sum_dy_prod,
    const T* num_dev, const double epsilon, const int C, const int HxW,
    const int num, T* dx) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  T scale = static_cast<T>(C) / num;
  T dev_num = num_dev[0];
  for (int i = gid; i < num; i += stride) {
    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
    T inv_var = inv_variance[c];
    T s_d = beta[c];
    T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var *
             (inv_var * inv_var);
    T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var;

    dx[i] = dy[i] * s_d * inv_var + gmean * scale +
            gvar * scale * (x[i] - mean[c]);
  }
}
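KeBNBackwardData combines three terms: the direct path dy * gamma * inv_var, plus corrections through the mean and the variance, each scaled by 1/(N*HxW) via the reduced sums of dy and dy*(x - mean). A host-side check of the same expression for one channel in the single-device case (dev_num = 1, so scale reduces to 1/m); the values are invented:

#include <cmath>
#include <iostream>
#include <vector>

int main() {
  // One channel: inputs x and upstream gradients dy.
  std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
  std::vector<double> dy = {0.1, -0.2, 0.3, 0.4};
  const double gamma = 2.0, eps = 1e-5;
  const int m = static_cast<int>(x.size());

  double mean = 0, var = 0, sum_dy = 0, sum_dy_x = 0;
  for (double v : x) mean += v;
  mean /= m;
  for (double v : x) var += (v - mean) * (v - mean);
  var /= m;
  const double inv_var = 1.0 / std::sqrt(var + eps);
  for (int i = 0; i < m; ++i) {
    sum_dy += dy[i];
    sum_dy_x += dy[i] * (x[i] - mean);
  }
  // Same three terms as the kernel's gvar/gmean/direct path.
  const double gvar = -sum_dy_x * gamma * inv_var * inv_var * inv_var;
  const double gmean = -sum_dy * gamma * inv_var;
  for (int i = 0; i < m; ++i) {
    double dx =
        dy[i] * gamma * inv_var + gmean / m + gvar / m * (x[i] - mean);
    std::cout << "dx[" << i << "]=" << dx << "\n";
  }
}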
// Deriving the Gradient for the Backward Pass of Batch Normalization
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T>
class SyncBatchNormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = framework::StringToDataLayout(layout_str);

    const auto* x = ctx.Input<Tensor>("X");
    const auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto* scale = ctx.Input<Tensor>("Scale");

    const auto& x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);

    // init output
    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    d_x->mutable_data<T>(ctx.GetPlace());
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
    }
    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
    PADDLE_ENFORCE_EQ(scale->dims()[0], C);

    std::vector<int> dims;
    std::vector<int> strides;
    if (layout == DataLayout::kNCHW) {
      dims = {N, C, H, W, D};
      strides = {C * H * W * D, H * W * D, W * D, D, 1};
    } else {
      dims = {N, C, H, W, D};
      strides = {H * W * C * D, 1, W * D * C, D * C, C};
    }

    const T* x_d = x->data<T>();
    const T* dy_d = d_y->data<T>();

    auto& dev_ctx = ctx.cuda_device_context();
    auto stream = dev_ctx.stream();
    auto* comm = dev_ctx.nccl_comm();

    const T* saved_mean = ctx.Input<Tensor>("SavedMean")->data<T>();
    const T* saved_inv_var = ctx.Input<Tensor>("SavedVariance")->data<T>();
    auto& allocator =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
    const int bytes = (C * 2 + 1) * sizeof(T);
    auto alloc_ptr = allocator.Allocate(bytes);
    T* stats = reinterpret_cast<T*>(alloc_ptr->ptr());

    const int threads = 256;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    int grid = std::min(C, (max_threads + threads - 1) / threads);
    int x_numel = x->numel();
    int fsize = H * W * D;

    if (layout == framework::DataLayout::kNCHW) {
      KeBackwardLocalStats<T, threads, framework::DataLayout::kNCHW>
          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
                                         stats);
    } else {
      KeBackwardLocalStats<T, threads, framework::DataLayout::kNHWC>
          <<<grid, threads, 0, stream>>>(dy_d, x_d, saved_mean, N, fsize, C,
                                         stats);
    }
    int dtype = platform::ToNCCLDataType(x->type());
    // In-place operation
    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
        stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
        comm, stream));

    const int block = 512;
    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
    if (layout == framework::DataLayout::kNCHW) {
      if (d_scale && d_bias) {
        KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNCHW>
            <<<grid, threads, 0, stream>>>(
                dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
                d_scale->data<T>(), d_bias->data<T>());
      }
      if (d_x) {
        KeBNBackwardData<T, framework::DataLayout::kNCHW>
            <<<grid2, block, 0, stream>>>(
                dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
                stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
                d_x->data<T>());
      }
    } else {
      if (d_scale && d_bias) {
        KeBNBackwardScaleBias<T, threads, framework::DataLayout::kNHWC>
            <<<grid, threads, 0, stream>>>(
                dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
                d_scale->data<T>(), d_bias->data<T>());
      }
      if (d_x) {
        KeBNBackwardData<T, framework::DataLayout::kNHWC>
            <<<grid2, block, 0, stream>>>(
                dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
                stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(),
                d_x->data<T>());
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
    ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    sync_batch_norm_grad,
    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>);
paddle/fluid/platform/device_context.cc
@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool(
  for (auto& p : places) {
    set.insert(p);
  }
  for (auto& p : set) {
    if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN
@@ -317,6 +316,7 @@ CUDADeviceContext::~CUDADeviceContext() {
  eigen_stream_.reset();
  eigen_device_.reset();
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+  PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
}

Place CUDADeviceContext::GetPlace() const { return place_; }
paddle/fluid/platform/device_context.h
@@ -265,6 +265,12 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief Return cuda stream in the device context. */
  cudaStream_t stream() const;

+  /*! \brief Return nccl communicators. */
+  ncclComm_t nccl_comm() const { return nccl_comm_; }
+
+  /*! \brief Set nccl communicators. */
+  void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
+
  template <typename Callback>
  void RecordEvent(cudaEvent_t ev, Callback callback) {
    callback();
@@ -289,6 +295,13 @@ class CUDADeviceContext : public DeviceContext {
  std::unique_ptr<CublasHandleHolder> cublas_handle_;
  std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;

+  // NCCL communicator (single process version) for NCCL collective operations.
+  // NCCL collective operations provides fast collectives over multiple GPUs
+  // both within and across nodes.
+  // But, this collectives is used for collectives over multiple GPUs within
+  // nodes.
+  ncclComm_t nccl_comm_{nullptr};
+
  int compute_capability_;
  int runtime_version_;
  int driver_version_;
paddle/fluid/platform/device_tracer.cc
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/platform/device_tracer.h"
#include <deque>
#include <forward_list>
@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/device_tracer.h"
+#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"

namespace paddle {
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
                                stream_id, correlation_id, bytes});
  }

+  void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                        const Place& place, const std::string& alloc_in,
+                        const std::string& free_in, int64_t thread_id) {
+    if (0 == start_ns || 0 == end_ns) {
+      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
+      return;
+    }
+    thread_local std::forward_list<MemInfoRecord>* local_mem_info_record =
+        nullptr;
+    if (local_mem_info_record == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      mem_info_record_.emplace_front();
+      local_mem_info_record = &mem_info_record_.front();
+    }
+    local_mem_info_record->emplace_front(MemInfoRecord{
+        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
+  }
+
  void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
                            uint64_t end_ns, int64_t device_id,
                            int64_t thread_id, uint32_t correlation_id) {
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_.clear();
    for (auto& tmp : correlations_pairs) tmp.clear();
    for (auto& tmp : cpu_records_) tmp.clear();
+    for (auto& tmp : mem_info_record_) tmp.clear();
    for (auto& tmp : active_kind_records_) tmp.clear();
  }
@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    if (correlations_.empty())
-      for (auto& tmp : correlations_pairs)
-        for (auto& pair : tmp) correlations_[pair.first] = pair.second;
+    if (correlations_.empty()) {
+      for (auto& tmp : correlations_pairs) {
+        for (auto& pair : tmp) correlations_[pair.first] = pair.second;
+      }
+    }
    for (const KernelRecord& r : kernel_records_) {
      auto* event = profile_pb.add_events();
      event->set_type(proto::Event::GPUKernel);
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_device_id(r.device_id);
    }
    VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
    for (auto& tmp : cpu_records_) {
      for (const CPURecord& r : tmp) {
        auto* event = profile_pb.add_events();
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
        event->set_device_id(r.device_id);
      }
    }
    for (auto& tmp : active_kind_records_) {
      for (const ActiveKindRecord& r : tmp) {
        auto* event = profile_pb.add_events();
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
      event->mutable_memcopy()->set_bytes(r.bytes);
    }
    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
+    for (auto& tmp : mem_info_record_) {
+      for (const auto& r : tmp) {
+        auto* event = profile_pb.add_mem_events();
+        event->set_device_id(0);
+        if (platform::is_cpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CPUPlace);
+        } else if (platform::is_gpu_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPlace);
+          event->set_device_id(
+              boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
+        } else if (platform::is_cuda_pinned_place(r.place)) {
+          event->set_place(proto::MemEvent::CUDAPinnedPlace);
+        } else {
+          PADDLE_THROW("The current place is not supported.");
+        }
+        event->set_alloc_in(r.alloc_in);
+        event->set_free_in(r.free_in);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_bytes(r.bytes);
+        event->set_thread_id(r.thread_id);
+      }
+    }
    std::ofstream profile_f;
    profile_f.open(profile_path,
                   std::ios::out | std::ios::trunc | std::ios::binary);
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
  std::forward_list<KernelRecord> kernel_records_;
  std::forward_list<MemRecord> mem_records_;
  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
+  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
  std::forward_list<std::forward_list<std::pair<uint32_t, Event*>>>
      correlations_pairs;
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
  return annotation_stack.back();
}

std::string CurAnnotationName() {
-  if (annotation_stack.empty()) return "";
+  if (annotation_stack.empty()) return "Unknown";
  return annotation_stack.back()->name();
}
paddle/fluid/platform/device_tracer.h
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
@@ -47,6 +48,7 @@ class DeviceTracer {
    int64_t stream_id;
    uint32_t correlation_id;
  };

  struct CPURecord {
    std::string name;
    uint64_t start_ns;
@@ -54,6 +56,7 @@ class DeviceTracer {
    int64_t device_id;
    int64_t thread_id;
  };

  struct MemRecord {
    std::string name;
    uint64_t start_ns;
@@ -63,6 +66,17 @@ class DeviceTracer {
    uint32_t correlation_id;
    uint64_t bytes;
  };

+  struct MemInfoRecord {
+    uint64_t start_ns;
+    uint64_t end_ns;
+    size_t bytes;
+    Place place;
+    int64_t thread_id;
+    std::string alloc_in;
+    std::string free_in;
+  };
+
  struct ActiveKindRecord {
    std::string name;
    uint64_t start_ns;
@@ -71,6 +85,7 @@ class DeviceTracer {
    int64_t thread_id;
    uint32_t correlation_id;
  };

  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
  virtual void Enable() = 0;
@@ -97,6 +112,12 @@ class DeviceTracer {
                             int64_t thread_id, uint32_t correlation_id) = 0;

+  virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
+                                size_t bytes, const Place& place,
+                                const std::string& alloc_in,
+                                const std::string& free_in,
+                                int64_t thread_id) = 0;
+
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
paddle/fluid/platform/event.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
@@ -64,5 +66,36 @@ class Event {
#endif
#endif
};

+class MemEvent {
+ public:
+  MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
+           Place place, int64_t thread_id, const std::string& annotation)
+      : type_(type),
+        start_ns_(start_ns),
+        end_ns_(end_ns),
+        bytes_(bytes),
+        place_(place),
+        thread_id_(thread_id),
+        annotation_(annotation) {}
+
+  const EventType& type() const { return type_; }
+  uint64_t start_ns() const { return start_ns_; }
+  uint64_t end_ns() const { return end_ns_; }
+  size_t bytes() const { return bytes_; }
+  Place place() const { return place_; }
+  int64_t thread_id() const { return thread_id_; }
+  const std::string& annotation() const { return annotation_; }
+
+ private:
+  EventType type_;
+  uint64_t start_ns_ = 0;
+  uint64_t end_ns_ = 0;
+  size_t bytes_;
+  Place place_;
+  int64_t thread_id_;
+  std::string annotation_;
+};

}  // namespace platform
}  // namespace paddle
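MemEvent records are raw push/pop pairs; a consumer can replay them to derive live and peak memory, which is one way such a stream gets used downstream. This replay is an illustration only, not code from the commit, and the struct is trimmed to the fields needed here:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for platform::MemEvent.
struct MemEvent {
  bool is_push;  // kPushRange = allocation, kPopRange = free
  uint64_t when_ns;
  size_t bytes;
};

int main() {
  std::vector<MemEvent> events = {
      {true, 100, 4096}, {true, 150, 1024}, {false, 250, 4096}};
  int64_t live = 0, peak = 0;
  for (const auto& e : events) {
    live += e.is_push ? static_cast<int64_t>(e.bytes)
                      : -static_cast<int64_t>(e.bytes);
    peak = std::max(peak, live);
  }
  std::cout << "live=" << live << " peak=" << peak << "\n";  // 1024, 5120
}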
paddle/fluid/platform/init.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <string.h> // for strdup
#include <algorithm>
#include <memory>
#include <set>
#include <stdexcept>
#include <string>
@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
  places.emplace_back(platform::CPUPlace());
  platform::DeviceContextPool::Init(places);
+  platform::DeviceTemporaryAllocator::Init();
#ifndef PADDLE_WITH_MKLDNN
  platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif
paddle/fluid/platform/nccl_helper.h
@@ -16,9 +16,11 @@
#pragma once
#include <stdio.h>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <typeindex>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h"
@@ -78,6 +80,8 @@ struct NCCLContext {
  cudaStream_t stream() const { return ctx_->stream(); }

+  ncclComm_t comm() const { return comm_; }
+
  int device_id() const {
    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
  }
paddle/fluid/platform/profiler.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <algorithm>
#include <iomanip>
#include <limits>
@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <random>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle {
namespace platform {

-struct EventList;
-
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
std::mutex profiler_mu;
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
-static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
-static thread_local std::shared_ptr<EventList> g_event_list;
-
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(Event);
-  constexpr static size_t kEventAlign = alignof(Event);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  Event* Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-    return &event_blocks.front().back();
-  }
-
-  std::vector<Event> Reduce() {
-    std::vector<Event> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
-
-  void Clear() { event_blocks.clear(); }
-
-  std::forward_list<std::vector<Event>> event_blocks;
-};
+static thread_local std::shared_ptr<EventList<Event>> g_event_list;
+
+static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
+static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
+static std::mutex g_all_mem_event_lists_mutex;
+static thread_local int32_t g_mem_thread_id;
+static uint32_t g_mem_next_thread_id = 0;

inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
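The refactor turns EventList into a template so the same block-allocated, thread-local list can hold either Event or MemEvent records without locking on the hot path; only first-time list registration takes the global mutex. A condensed, self-contained sketch of that pattern (block size and element type are toy values):

#include <forward_list>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <vector>

template <typename T>
struct EventList {
  template <typename... Args>
  T* Record(Args&&... args) {  // lock-free: only this thread touches it
    if (blocks.empty() || blocks.front().size() == kBlock) {
      blocks.emplace_front();
      blocks.front().reserve(kBlock);
    }
    blocks.front().emplace_back(std::forward<Args>(args)...);
    return &blocks.front().back();
  }
  static constexpr size_t kBlock = 4;
  std::forward_list<std::vector<T>> blocks;
};

static std::mutex g_mu;
static std::list<std::shared_ptr<EventList<int>>> g_all;
static thread_local std::shared_ptr<EventList<int>> g_local;

EventList<int>& GetList() {
  if (!g_local) {  // registration is the only locked step
    g_local = std::make_shared<EventList<int>>();
    std::lock_guard<std::mutex> guard(g_mu);
    g_all.emplace_front(g_local);
  }
  return *g_local;
}

int main() {
  for (int i = 0; i < 6; ++i) GetList().Record(i);
  size_t n = 0;
  for (auto& b : GetList().blocks) n += b.size();
  std::cout << "recorded " << n << " events\n";  // recorded 6 events
}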
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
  cpu_ns_ = GetTimeInNsec();
}

const EventType& Event::type() const { return type_; }

double Event::CpuElapsedMs(const Event& e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
}

double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
#else
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif
}

+inline EventList<MemEvent>& GetMemEventList() {
+  if (!g_mem_event_list) {
+    g_mem_event_list = std::make_shared<EventList<MemEvent>>();
+    std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+    g_mem_thread_id = g_mem_next_thread_id++;
+    g_all_mem_event_lists.emplace_front(g_mem_event_list);
+  }
+  return *g_mem_event_list;
+}
+
+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place& place, const std::string& annotation) {
+  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
+                           place, g_mem_thread_id, annotation);
+}
+
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place& place, const std::string& annotation) {
+  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes,
+                           place, g_mem_thread_id, annotation);
+}
+
-inline EventList& GetEventList() {
+inline EventList<Event>& GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
-    g_event_list = std::make_shared<EventList>();
+    g_event_list = std::make_shared<EventList<Event>>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
    RecoreCurThreadId(g_thread_id);
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
  return *g_event_list;
}

void Mark(const std::string& name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
}

Event* PushEvent(const std::string& name) {
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
}

void PopEvent(const std::string& name) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
}

RecordEvent::RecordEvent(const std::string& name)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
  // lock is not needed, the code below is thread-safe
  is_enabled_ = true;
  name_ = name;
  Event* e = PushEvent(name_);
  // Maybe need the same push/pop behavior.
  SetCurAnnotation(e);
}
...
...
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  // lock is not needed, the code below is thread-safe
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                          BlockDepth(), g_thread_id);
...
...
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
  PopEvent(name_);
}

+MemEvenRecorder MemEvenRecorder::recorder;
+
+void MemEvenRecorder::PushMemRecord(const void* ptr, const Place& place,
+                                    size_t size) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto& events = address_memevent_[place];
+  PADDLE_ENFORCE(events.count(ptr) == 0, "");
+  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
+                          new MemEvenRecorder::RecordMemEvent(place, size)));
+}
+
+void MemEvenRecorder::PopMemRecord(const void* ptr, const Place& place) {
+  if (g_state == ProfilerState::kDisabled) return;
+  std::lock_guard<std::mutex> guard(mtx_);
+  auto& events = address_memevent_[place];
+  auto iter = events.find(ptr);
+  // The ptr maybe not in address_memevent
+  if (iter != events.end()) {
+    events.erase(iter);
+  }
+}
+
+void MemEvenRecorder::Flush() {
+  std::lock_guard<std::mutex> guard(mtx_);
+  address_memevent_.clear();
+}
+
+MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place& place,
+                                                size_t bytes)
+    : place_(place),
+      bytes_(bytes),
+      start_ns_(PosixInNsec()),
+      alloc_in_(CurAnnotationName()) {
+  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
+}
+
+MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
+  DeviceTracer* tracer = GetDeviceTracer();
+  end_ns_ = PosixInNsec();
+
+  auto annotation_free = CurAnnotationName();
+  if (tracer) {
+    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
+                             annotation_free, g_mem_thread_id);
+  }
+  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
+}
+
RecordRPCEvent::RecordRPCEvent(const std::string& name) {
  if (FLAGS_enable_rpc_profiler) {
    event_.reset(new platform::RecordEvent(name));
  }
...
...
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock::~RecordBlock() {
  // lock is not needed, the code below is thread-safe
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
...
...
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() {
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
+  MemEvenRecorder::Instance().Flush();
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
+  for (auto it = g_all_mem_event_lists.begin();
+       it != g_all_mem_event_lists.end(); ++it) {
+    (*it)->Clear();
+  }
}

std::vector<std::vector<Event>> GetAllEvents() {
...
...
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
  return result;
}

+std::vector<std::vector<MemEvent>> GetMemEvents() {
+  std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
+  std::vector<std::vector<MemEvent>> result;
+  for (auto& it : g_all_mem_event_lists) {
+    result.emplace_back((*it).Reduce());
+  }
+  return result;
+}
+
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
...
...
@@ -263,8 +319,8 @@ struct EventItem {
};

// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                   const std::string& sorted_domain, const size_t name_width,
                   const size_t data_width, bool merge_thread) {
  // Output header information
  std::cout << "\n------------------------->"
...
...
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
            << std::setw(data_width) << "Ratio." << std::endl;
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
      const EventItem& event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
                << std::setw(data_width) << event_item.total_time;
...
...
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
}

// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events,
                 bool merge_thread,
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
  if (merge_thread && events.size() < 2) return;

  std::string sorted_domain;
  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.ave_time > b.ave_time;
      };
      break;
    case EventSortingKey::kGPUTime:
      sorted_domain = "average time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.gpu_time > b.gpu_time;
      };
      break;
    case EventSortingKey::kCPUTime:
      sorted_domain = "average time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.cpu_time > b.cpu_time;
      };
      break;
...
...
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
      sorted_domain = "event first end time";
  }

  const std::vector<std::vector<Event>>* analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
...
...
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
    }
  }
  // average time
  for (auto& item : event_items) {
    item.ave_time = item.total_time / item.calls;
    item.ratio = item.total_time / total;
  }
...
...
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
                merge_thread);
}

struct MemoryProfierReport {
  size_t alloc_times{0};
  size_t alloc_size{0};
  size_t free_times{0};
  size_t free_size{0};
};

// Print results
void PrintMemProfiler(
    const std::map<Place,
                   std::unordered_map<std::string, MemoryProfierReport>>&
        annotation_report,
    const size_t name_width, const size_t data_width) {
  // Output header information
  std::cout << "\n------------------------->"
            << " Memory Profiling Report "
            << "<-------------------------\n\n";

  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
            << "Alloc Calls" << std::setw(data_width) << "Size(MB)"
            << std::setw(data_width) << "Free Calls" << std::setw(data_width)
            << "Size(MB)" << std::endl;

  for (auto& tmp : annotation_report) {
    for (auto& e : tmp.second) {
      auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
      std::cout << std::setw(name_width) << event_name;
      std::cout << std::setw(data_width) << e.second.alloc_times;
      std::cout << std::setw(data_width)
                << e.second.alloc_size / (1024.0 * 1024.0);
      std::cout << std::setw(data_width) << e.second.free_times;
      std::cout << std::setw(data_width)
                << e.second.free_size / (1024.0 * 1024.0) << std::endl;
    }
  }
  std::cout << std::endl;
}

// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>>& events) {
  if (g_state == ProfilerState::kDisabled) return;
  // place, annotation, alloc times, alloc size
  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
      annotation_report;

  for (auto& tmp : events) {
    for (auto& e : tmp) {
      if (e.type() == EventType::kPushRange) {
        annotation_report[e.place()][e.annotation()].alloc_times += 1;
        annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
      } else if (e.type() == EventType::kPopRange) {
        annotation_report[e.place()][e.annotation()].free_times += 1;
        annotation_report[e.place()][e.annotation()].free_size += e.bytes();
      }
    }
  }
  PrintMemProfiler(annotation_report, 55, 18);
}
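ParseMemEvents above is a nested group-by over (place, annotation), counting allocation and free events and summing their bytes. As a rough Python equivalent of that aggregation (attribute names like e.place, e.annotation, e.kind and e.bytes are illustrative, not Paddle API):

# Rough Python sketch of the ParseMemEvents aggregation (illustrative only).
from collections import defaultdict

def new_report_entry():
    # Mirrors MemoryProfierReport's four counters.
    return {'alloc_times': 0, 'alloc_size': 0, 'free_times': 0, 'free_size': 0}

def parse_mem_events(events):
    # report[place][annotation] -> counters; `events` holds one list per thread.
    report = defaultdict(lambda: defaultdict(new_report_entry))
    for thread_events in events:
        for e in thread_events:
            entry = report[e.place][e.annotation]
            if e.kind == 'push':      # EventType::kPushRange marks an allocation
                entry['alloc_times'] += 1
                entry['alloc_size'] += e.bytes
            elif e.kind == 'pop':     # EventType::kPopRange marks a free
                entry['free_times'] += 1
                entry['free_size'] += e.bytes
    return report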
void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
  SynchronizeAllDevice();
+  MemEvenRecorder::Instance().Flush();

  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled) return;
  // Mark the profiling stop.
  Mark("_stop_profiler_");

  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer->IsEnabled()) {
    tracer->Disable();
    tracer->GenProfile(profile_path);
...
...
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
  std::vector<std::vector<Event>> all_events = GetAllEvents();

  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
+  if (VLOG_IS_ON(5)) {
+    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
+    ParseMemEvents(all_mem_events);
+  }
+
  ResetProfiler();
  g_state = ProfilerState::kDisabled;
  should_send_profile_state = true;
...
...
paddle/fluid/platform/profiler.h
...
...
@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once
#include <forward_list>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
...
...
@@ -34,8 +41,41 @@ enum ProfilerState {
void Mark(const std::string& name);

+void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                  const Place& place);
+void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
+                 const Place& place);
+
+struct MemEvenRecorder {
+ public:
+  void PushMemRecord(const void* ptr, const Place& place, size_t size);
+  void PopMemRecord(const void* ptr, const Place& place);
+  void Flush();
+  static MemEvenRecorder& Instance() { return recorder; }
+
+ private:
+  struct RecordMemEvent {
+    RecordMemEvent(const Place& place, size_t bytes);
+    ~RecordMemEvent();
+
+    Place place_;
+    size_t bytes_;
+    uint64_t start_ns_;
+    uint64_t end_ns_;
+    std::string alloc_in_;
+    std::string free_in_;
+  };
+
+  static MemEvenRecorder recorder;
+  std::map<Place,
+           std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
+      address_memevent_;
+  std::mutex mtx_;
+  MemEvenRecorder() {}
+  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
+};
+
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name);

struct RecordEvent {
...
...
@@ -87,6 +127,41 @@ enum EventSortingKey {
  kGPUTime
};

+template <typename T>
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(T);
+  constexpr static size_t kEventAlign = alignof(T);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  T* Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+    return &event_blocks.front().back();
+  }
+
+  std::vector<T> Reduce() {
+    std::vector<T> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<T>> event_blocks;
+};
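The template above is a simple block allocator for event records: a new fixed-capacity block is opened whenever the front block fills, so recording never moves existing events, and Reduce() flattens the blocks oldest-first. As a rough illustration of the same bookkeeping (plain Python with illustrative names, not Paddle API):

# Python sketch of the EventList block allocation above (illustrative only).
class EventListSketch(object):
    K_MB = 1024 * 1024
    K_EVENT_BLOCK_SIZE = 16 * K_MB

    def __init__(self, event_size, event_align):
        # Round the event size up to its alignment, as kNumBlock does:
        # kEventBlockSize / ((kEventSize + kEventAlign - 1)
        #                    / kEventAlign * kEventAlign)
        padded = (event_size + event_align - 1) // event_align * event_align
        self.num_block = self.K_EVENT_BLOCK_SIZE // padded
        self.event_blocks = []  # newest block first, like std::forward_list

    def record(self, event):
        # Open a fresh block when the front one is full, then append.
        if not self.event_blocks or len(self.event_blocks[0]) == self.num_block:
            self.event_blocks.insert(0, [])
        self.event_blocks[0].append(event)
        return event

    def reduce(self):
        # Flatten so that the oldest block ends up first, then clear.
        result = []
        for block in self.event_blocks:
            result = block + result
        self.event_blocks = []
        return result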
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
...
...
paddle/fluid/platform/profiler.proto
...
...
@@ -34,8 +34,25 @@ message Event {
  optional string detail_info = 9;
}

+message MemEvent {
+  enum Place {
+    CUDAPlace = 0;
+    CPUPlace = 1;
+    CUDAPinnedPlace = 2;
+  }
+  optional uint64 start_ns = 1;
+  optional uint64 end_ns = 2;
+  optional uint64 bytes = 3;
+  optional Place place = 4;
+  optional uint64 thread_id = 5;
+  optional uint32 device_id = 6;
+  optional string alloc_in = 7;
+  optional string free_in = 8;
+}
+
message Profile {
  repeated Event events = 1;
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
+  repeated MemEvent mem_events = 4;
}
\ No newline at end of file
paddle/fluid/pybind/ir.cc
...
...
@@ -18,6 +18,7 @@
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...
...
@@ -54,12 +55,14 @@ void BindGraph(py::module *m) {
"The graph is a Directed Acyclic Single Static Assignment Graph, see "
"`paddle::ir::Graph` for details."
)
.
def
(
py
::
init
<
const
ProgramDesc
&>
())
.
def
(
"clone"
,
&
Graph
::
Clone
)
.
def
(
"has"
,
&
Graph
::
Has
)
.
def
(
"get_int"
,
&
Graph
::
Get
<
int
>
)
.
def
(
"get_float"
,
&
Graph
::
Get
<
float
>
)
.
def
(
"get_double"
,
&
Graph
::
Get
<
double
>
)
.
def
(
"get_string"
,
&
Graph
::
Get
<
std
::
string
>
)
.
def
(
"get_marked_nodes"
,
&
Graph
::
Get
<
std
::
unordered_set
<
const
Node
*>>
)
.
def
(
"get_marked_nodes"
,
&
Graph
::
Get
<
std
::
unordered_set
<
const
Node
*>>
,
return_value_policy
::
reference
)
.
def
(
"set"
,
[](
Graph
&
self
,
const
std
::
string
&
attr_name
,
int
attr
)
{
return
self
.
Set
(
attr_name
,
new
int
(
attr
));
})
.
def
(
"set"
,
...
...
@@ -103,7 +106,8 @@ void BindGraph(py::module *m) {
      .def("retrieve_node", &Graph::RetrieveNode,
           return_value_policy::reference)
      .def("resolve_hazard", &Graph::ResolveHazard)
-     .def("origin_program_desc", &Graph::OriginProgram);
+     .def("origin_program_desc", &Graph::OriginProgram,
+          return_value_policy::reference);
}

void BindNode(py::module *m) {
...
...
paddle/fluid/pybind/pybind.cc
...
...
@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() {
#endif
}

+bool IsCompiledWithNGRAPH() {
+#ifndef PADDLE_WITH_NGRAPH
+  return false;
+#else
+  return true;
+#endif
+}
+
bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE
  return false;
...
...
@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_devices",
        [](bool init_p2p) { framework::InitDevices(init_p2p); });

+  m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
...
...
@@ -1221,6 +1230,21 @@ All parameter, weight, gradient are variables in Paddle.
          it will save GPU memory and may make the execution faster.
          This options is only available in GPU devices.
          Default False)DOC")
+     .def_property(
+         "sync_batch_norm",
+         [](const BuildStrategy &self) { return self.sync_batch_norm_; },
+         [](BuildStrategy &self, bool b) {
+           PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
+           self.sync_batch_norm_ = b;
+         },
+         R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
+               synchronous batch normalization which synchronizes the mean
+               and variance through multi-devices in training phase.
+               Current implementation doesn't support FP16 training and CPU.
+               And only synchronous on one machine, not all machines.
+               Default False)DOC")
      .def_property(
          "memory_optimize",
          [](const BuildStrategy &self) { return self.memory_optimize_; },
...
...
@@ -1242,7 +1266,7 @@ All parameter, weight, gradient are variables in Paddle.
          cannot be updated after being finalized.)DOC");
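For context, the property added above is driven from Python through BuildStrategy. A hedged usage sketch follows; main_program and loss are placeholders for an existing training program, not names from this diff:

# Sketch of enabling synchronous batch norm from Python (fluid API of this era).
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True  # GPU training only; single machine

# compiler.py (changed later in this diff) also forces sequential execution
# when this flag is set.
binary = fluid.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)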
  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &, const std::string &,
                  const std::vector<std::string> &, const std::string &,
                  Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
                  const BuildStrategy &, ir::Graph *>())
    // NOTE: even we return a vec<Scope*>* to Python use reference policy.
...
...
paddle/scripts/paddle_build.sh
...
...
@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
  # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
  if [ "$API_FILE" == "paddle/fluid/API.spec" ]; then
    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-     python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308`
+     python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
+    if [ "${APPROVALS}" == "TRUE" ]; then
+      APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
+    fi
  else
    APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
      python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
...
...
@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
  echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
  if [ "${APPROVALS}" == "FALSE" ]; then
    if [ "$API_FILE" == "paddle/fluid/API.spec" ]; then
-     echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}"
+     echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
    else
      echo "You must have panyx0718 approval for the api change! ${API_FILE}"
    fi
...
...
python/paddle/fluid/__init__.py
...
...
@@ -125,7 +125,7 @@ def __bootstrap__():
        os.environ['OMP_NUM_THREADS'] = str(num_threads)
    sysstr = platform.system()
    read_env_flags = [
-       'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph',
+       'check_nan_inf', 'benchmark', 'eager_delete_scope',
        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
        'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
        'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
...
...
@@ -143,6 +143,9 @@ def __bootstrap__():
    if core.is_compiled_with_mkldnn():
        read_env_flags.append('use_mkldnn')
+    if core.is_compiled_with_ngraph():
+        read_env_flags.append('use_ngraph')
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
        read_env_flags.append('rpc_server_profile_path')
...
...
python/paddle/fluid/compiler.py
...
...
@@ -223,22 +223,27 @@ class CompiledProgram(object):
                tps), "num_trainers == len(end_points)"
            self._build_strategy.trainers_endpoints = tps

+        if self._build_strategy.sync_batch_norm:
+            self._build_strategy.enable_sequential_execution = True
+
        self._persistable_vars = []
-        for block_id in range(self._program_desc.num_blocks()):
-            bdesc = self._program_desc.block(block_id)
-            self._persistable_vars.extend([
-                cpt.to_text(v.name()) for v in bdesc.all_vars()
-                if v.persistable() and v.type() != core.VarDesc.VarType.RAW
-            ])
+        for node in self._graph.nodes():
+            if node.is_var() and node.var() is not None and node.var().persistable() and \
+                    node.var().type() != core.VarDesc.VarType.RAW:
+                self._persistable_vars.append(cpt.to_text(node.name()))

        places = list(map(_place_obj, self._places))
-        return core.ParallelExecutor(
-            places,
-            set(self._persistable_vars),
-            cpt.to_text(self._loss_name) if self._loss_name else six.u(''),
-            scope, self._local_scopes, self._exec_strategy,
-            self._build_strategy, self._graph)
+
+        # ParallelExecutor would broadcast all the parameters during initializing.
+        # The parameters of each process should be in the same ordered for the data-parallelism
+        # distributed training to keep the broadcast correct.
+        self._persistable_vars = list(set(self._persistable_vars))
+        self._persistable_vars.sort()
+
+        return core.ParallelExecutor(
+            places, self._persistable_vars,
+            cpt.to_text(self._loss_name) if self._loss_name else six.u(''),
+            self._scope, self._local_scopes, self._exec_strategy,
+            self._build_strategy, self._graph)

    def _compile_inference(self):
        return core.create_paddle_predictor(self._infer_config)
...
...
python/paddle/fluid/contrib/slim/tests/test_graph.py
...
...
@@ -13,58 +13,92 @@
# limitations under the license.

from __future__ import print_function
+import os
+import six
import unittest
import paddle
import paddle.fluid as fluid
-import six
from paddle.fluid.framework import IrGraph
from paddle.fluid import core

+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CPU_NUM"] = "1"


-def residual_block(num):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      bias_attr=False):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=bias_attr)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = data
-    for _ in six.moves.xrange(num):
-        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
-        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
-        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
-    fc = fluid.layers.fc(input=hidden, size=10)
-    loss = fluid.layers.cross_entropy(input=fc, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
+def conv_block():
+    img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return [img, label], avg_loss


class TestGraph(unittest.TestCase):
-    def test_graph_functions(self):
+    def graph_apis(self, use_cuda=False, for_ci=True):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
-            loss = residual_block(2)
+            feeds, loss = conv_block()
            opt = fluid.optimizer.Adam(learning_rate=0.001)
            opt.minimize(loss)
        graph = IrGraph(core.Graph(main.desc), for_test=False)
        backup_graph = graph.clone()
        self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.memory_optimize = False
+        build_strategy.enable_inplace = False
+        origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+        backup_binary = fluid.CompiledProgram(
+            backup_graph.graph).with_data_parallel(
+                loss_name=loss.name, build_strategy=build_strategy)
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        iters = 5
+        batch_size = 8
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
+
+        def train(binary):
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(binary,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss.name])
+                print('{}: {}'.format('loss', loss_v))
+
+        train(origin_binary)
+        train(backup_binary)

        marked_nodes = set()
        for op in graph.all_op_nodes():
            if op.name().find('conv2d') > -1:
                marked_nodes.add(op)
-        graph.draw('.', 'residual', marked_nodes)
+        if not for_ci:
+            graph.draw('.', 'residual', marked_nodes)
+            backup_marked_nodes = set()
+            for op in backup_graph.all_op_nodes():
+                if op.name().find('conv2d') > -1:
+                    backup_marked_nodes.add(op)
+            backup_graph.draw('.', 'backup', backup_marked_nodes)
        self.assertFalse(graph.has_circle())
        self.assertEqual(graph.graph_num(), 1)
        nodes = graph.topology_sort()
...
...
@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase):
        graph.safe_remove_nodes(marked_nodes)
        self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))

+    def test_graph_apis_cpu(self):
+        self.graph_apis(use_cuda=False, for_ci=True)
+
+    def test_graph_apis_cuda(self):
+        if fluid.core.is_compiled_with_cuda():
+            self.graph_apis(use_cuda=True, for_ci=True)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
...
...
@@ -12,6 +12,7 @@
# see the license for the specific language governing permissions and
# limitations under the license.

+import os
import unittest
import random
import numpy as np
...
...
@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid import core

+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["CPU_NUM"] = "1"


def linear_fc(num):
    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
...
...
@@ -249,7 +253,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
                    marked_nodes.add(op)
            test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)

        quantized_main_program = main_graph.to_program()
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.memory_optimize = False
+        build_strategy.enable_inplace = False
+        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8
...
...
@@ -264,7 +272,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
-                loss_v = exe.run(program=quantized_main_program,
+                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
...
...
python/paddle/fluid/framework.py
...
...
@@ -2002,6 +2002,19 @@ class IrGraph(object):
        self.graph = graph
        self._for_test = for_test

+    def clone(self):
+        """
+        Create a new and duplicated IrGraph.
+
+        Warns:
+            The method only clones the graph structure, not its attributes.
+
+        Returns:
+            IrGraph: A new and duplicated graph.
+        """
+        g = self.graph.clone()
+        return IrGraph(g, self._for_test)
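A minimal usage, matching what test_graph.py later in this diff exercises:

# Structural copy of an IrGraph; attributes are not cloned (see Warns above).
backup_graph = graph.clone()
assert len(graph.all_nodes()) == len(backup_graph.all_nodes())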
    def is_test(self):
        """
        If the graph is used for testing, the function returns true. Otherwise, returns false.
...
...
@@ -2232,10 +2245,10 @@ class IrGraph(object):
        Notes: the `graph` cannot contain a circle.

        Returns:
-            set(IrNode): nodes in topology order.
+            list(IrNode): nodes in topology order.
        """
        ordered_nodes = core.topology_sort(self.graph)
-        return {IrNode(n) for n in ordered_nodes}
+        return [IrNode(n) for n in ordered_nodes]

    def build_adjacency_list(self):
        """
...
...
@@ -2303,7 +2316,7 @@ class IrGraph(object):
"""
Convert the graph into a Program.
Notes
: When the graph includes backward operator nodes, the
WARN
: When the graph includes backward operator nodes, the
conversion process may be failed. Usually, this function is
only used to convert a test graph.
...
...
python/paddle/fluid/imperative/nn.py
...
...
@@ -22,7 +22,8 @@ from . import layers
from ..framework import Variable, OpProtoHolder
from ..param_attr import ParamAttr
from ..initializer import Normal, Constant

-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
+__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']


class Conv2D(layers.Layer):
...
...
@@ -468,3 +469,137 @@ class Embedding(layers.Layer):
            })
        return out

+class GRUUnit(layers.Layer):
+    """
+    **GRU unit layer**
+
+    if origin_mode is True, then the equation of a gru step is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
+
+    The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
+    of the equation above, the :math:`z_t` is split into 3 parts -
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
+    implement a full GRU unit operator for an input, a fully
+    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
+    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
+    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        input (Variable): The fc transformed input value of current step.
+        name_scope (str): See base class.
+        hidden (Variable): The hidden value of gru unit from previous step.
+        size (integer): The input dimension value.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, gru_unit will create ParamAttr as
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
+        activation (string): The activation type for cell (actNode).
+                             Default: 'tanh'
+        gate_activation (string): The activation type for gates (actGate).
+                                  Default: 'sigmoid'
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
+    """
+
+    def __init__(self,
+                 name_scope,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 activation='tanh',
+                 gate_activation='sigmoid',
+                 origin_mode=False,
+                 dtype='float32'):
+        super(GRUUnit, self).__init__(name_scope)
+
+        activation_dict = dict(identity=0, sigmoid=1, tanh=2, relu=3)
+        activation = activation_dict[activation]
+        gate_activation = activation_dict[gate_activation]
+
+        self._dtype = dtype
+        size = size // 3
+        # create weight
+        self._weight = self.create_parameter(
+            attr=param_attr, shape=[size, 3 * size], dtype=dtype)
+        # create bias
+        bias_size = [1, 3 * size]
+        self._bias = self.create_parameter(
+            attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    def forward(self, input, hidden):
+        inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
+        if self._bias:
+            inputs['Bias'] = self._bias
+
+        gate = self._helper.create_variable_for_type_inference(self._dtype)
+        reset_hidden_pre = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        updated_hidden = self._helper.create_variable_for_type_inference(
+            self._dtype)
+        self._helper.append_op(
+            type='gru_unit',
+            inputs=inputs,
+            outputs={
+                'Gate': gate,
+                'ResetHiddenPrev': reset_hidden_pre,
+                'Hidden': updated_hidden,
+            },
+            attrs={
+                'activation': 2,  # tanh
+                'gate_activation': 1,  # sigmoid
+            })
+
+        return updated_hidden, reset_hidden_pre, gate
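The docstring equations above reduce to a few lines of NumPy. Below is a sketch of a single step with origin_mode=False, sigmoid gates and a tanh candidate; all names are illustrative, and the FC transform z = x @ W_fc is assumed to happen outside, as the docstring notes:

# NumPy sketch of one GRU step per the docstring above (illustrative only).
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def gru_step(z, h_prev, w_gates, w_candidate, bias):
    # z: (N, 3D) fc-transformed input; h_prev: (N, D)
    # w_gates: (D, 2D) update/reset weights; w_candidate: (D, D); bias: (1, 3D)
    D = h_prev.shape[1]
    g = z + bias                                      # bias over [xu | xr | xm]
    xu, xr, xm = g[:, :D], g[:, D:2 * D], g[:, 2 * D:]
    u = sigmoid(xu + h_prev.dot(w_gates[:, :D]))      # update gate u_t
    r = sigmoid(xr + h_prev.dot(w_gates[:, D:]))      # reset gate r_t
    m = np.tanh(xm + (r * h_prev).dot(w_candidate))   # candidate m_t
    return (1.0 - u) * h_prev + u * m                 # h_t, origin_mode=False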
python/paddle/fluid/layers/detection.py
...
...
@@ -516,6 +516,8 @@ def yolov3_loss(x,
                class_num,
                ignore_thresh,
                downsample_ratio,
+               gtscore=None,
+               use_label_smooth=True,
                name=None):
    """
    ${comment}
...
...
@@ -534,28 +536,35 @@ def yolov3_loss(x,
        class_num (int): ${class_num_comment}
        ignore_thresh (float): ${ignore_thresh_comment}
        downsample_ratio (int): ${downsample_ratio_comment}
-       name (string): the name of yolov3 loss
+       name (string): the name of yolov3 loss. Default None.
+       gtscore (Variable): mixup score of ground truth boxes, should be in shape
+                           of [N, B]. Default None.
+       use_label_smooth (bool): ${use_label_smooth_comment}

    Returns:
-       Variable: A 1-D tensor with shape [1], the value of yolov3 loss
+       Variable: A 1-D tensor with shape [N], the value of yolov3 loss

    Raises:
        TypeError: Input x of yolov3_loss must be Variable
-       TypeError: Input gtbox of yolov3_loss must be Variable"
-       TypeError: Input gtlabel of yolov3_loss must be Variable"
+       TypeError: Input gtbox of yolov3_loss must be Variable
+       TypeError: Input gtlabel of yolov3_loss must be Variable
+       TypeError: Input gtscore of yolov3_loss must be None or Variable
        TypeError: Attr anchors of yolov3_loss must be list or tuple
        TypeError: Attr class_num of yolov3_loss must be an integer
        TypeError: Attr ignore_thresh of yolov3_loss must be a float number
+       TypeError: Attr use_label_smooth of yolov3_loss must be a bool value

    Examples:
      .. code-block:: python

          x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-         gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
-         gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+         gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
+         gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
+         gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')
          anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
          anchor_mask = [0, 1, 2]
-         loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
+         loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel,
+                                         gtscore=gtscore, anchors=anchors,
                                          anchor_mask=anchor_mask, class_num=80,
                                          ignore_thresh=0.7, downsample_ratio=32)
    """
...
...
@@ -567,6 +576,8 @@ def yolov3_loss(x,
        raise TypeError("Input gtbox of yolov3_loss must be Variable")
    if not isinstance(gtlabel, Variable):
        raise TypeError("Input gtlabel of yolov3_loss must be Variable")
+    if gtscore is not None and not isinstance(gtscore, Variable):
+        raise TypeError("Input gtscore of yolov3_loss must be Variable")
    if not isinstance(anchors, list) and not isinstance(anchors, tuple):
        raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
    if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
...
...
@@ -576,6 +587,9 @@ def yolov3_loss(x,
    if not isinstance(ignore_thresh, float):
        raise TypeError(
            "Attr ignore_thresh of yolov3_loss must be a float number")
+    if not isinstance(use_label_smooth, bool):
+        raise TypeError(
+            "Attr use_label_smooth of yolov3_loss must be a bool value")

    if name is None:
        loss = helper.create_variable_for_type_inference(dtype=x.dtype)
...
...
@@ -586,21 +600,26 @@ def yolov3_loss(x,
    objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
    gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')

+    inputs = {
+        "X": x,
+        "GTBox": gtbox,
+        "GTLabel": gtlabel,
+    }
+    if gtscore:
+        inputs["GTScore"] = gtscore
+
    attrs = {
        "anchors": anchors,
        "anchor_mask": anchor_mask,
        "class_num": class_num,
        "ignore_thresh": ignore_thresh,
        "downsample_ratio": downsample_ratio,
+        "use_label_smooth": use_label_smooth,
    }

    helper.append_op(
        type='yolov3_loss',
-        inputs={
-            "X": x,
-            "GTBox": gtbox,
-            "GTLabel": gtlabel,
-        },
+        inputs=inputs,
        outputs={
            'Loss': loss,
            'ObjectnessMask': objectness_mask,
...
...
python/paddle/fluid/layers/nn.py
...
...
@@ -1432,6 +1432,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
        predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
    """
+    if not soft_label:
+        return cross_entropy2(input, label, ignore_index)
    helper = LayerHelper('cross_entropy', **locals())
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
    helper.append_op(
...
...
@@ -1444,6 +1446,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
    return out


+def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
+    helper = LayerHelper('cross_entropy2', **locals())
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    xshape = helper.create_variable_for_type_inference(dtype=input.dtype)
+    match_x = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='cross_entropy2',
+        inputs={'X': [input], 'Label': [label]},
+        outputs={'Y': [out], 'MatchX': [match_x], 'XShape': [xshape]},
+        attrs={'ignore_index': ignore_index})
+    return out
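The per-row value this op computes can be sketched in NumPy, consistent with the reference implementation in the new test_cross_entropy2_op.py later in this diff; label here is assumed to be a 1-D integer array:

# NumPy sketch of the cross_entropy2 forward (hard labels; illustrative only).
import numpy as np

def cross_entropy2_ref(x, label, ignore_index=-100):
    # x: (N, C) class probabilities; label: (N,) int class ids.
    y = np.zeros(label.shape, dtype=x.dtype)
    match_x = np.zeros(label.shape, dtype=x.dtype)
    for i in range(label.shape[0]):
        if label[i] == ignore_index:
            continue                    # ignored rows contribute zero loss
        match_x[i] = x[i, label[i]]     # probability of the true class
        y[i] = -np.log(match_x[i])
    return y, match_x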
def bpr_loss(input, label, name=None):
    """
    Bayesian Personalized Ranking Loss Operator.
...
...
@@ -2904,11 +2922,17 @@ def batch_norm(input,
        y_i &\\gets \\gamma \\hat{x_i} + \\beta

    Args:
-       input(variable): The input variable which is a LoDTensor.
+       input(variable): The rank of input variable can be 2, 3, 4, 5.
        act(string, Default None): Activation type, linear|relu|prelu|...
-       is_test(bool, Default False): Used for training or training.
-       momentum(float, Default 0.9):
-       epsilon(float, Default 1e-05):
+       is_test (bool, Default False): A flag indicating whether it is in
+           test phase or not.
+       momentum(float, Default 0.9): The value used for the moving_mean and
+           moving_var computation. The updated formula is:
+           :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+           :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+           Default is 0.9.
+       epsilon(float, Default 1e-05): A value added to the denominator for
+           numerical stability. Default is 1e-5.
        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as param_attr. If the Initializer of the param_attr
...
@@ -2966,15 +2990,8 @@ def batch_norm(input,
        shape=param_shape,
        dtype=dtype,
        default_initializer=Constant(1.0))
-    # setting stop_gradient=True to reduce computation
-    if use_global_stats and helper.param_attr.learning_rate == 0.:
-        scale.stop_gradient = True

    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
-    # setting stop_gradient=True to reduce computation
-    if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        bias.stop_gradient = True

    mean = helper.create_parameter(
        attr=ParamAttr(
...
...
python/paddle/fluid/layers/ops.py
...
...
@@ -23,6 +23,7 @@ __activations_noattr__ = [
    'logsigmoid',
    'exp',
    'tanh',
+   'atan',
    'tanh_shrink',
    'softshrink',
    'sqrt',
...
...
@@ -30,6 +31,8 @@ __activations_noattr__ = [
    'ceil',
    'floor',
    'cos',
+   'acos',
+   'asin',
    'sin',
    'round',
    'reciprocal',
...
...
python/paddle/fluid/tests/test_detection.py
...
...
@@ -476,8 +476,17 @@ class TestYoloDetection(unittest.TestCase):
            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
            gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
            gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
-           loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13],
-                                     [0, 1], 10, 0.7, 32)
+           gtscore = layers.data(name='gtscore', shape=[10], dtype='float32')
+           loss = layers.yolov3_loss(
+               x,
+               gtbox,
+               gtlabel, [10, 13, 30, 13], [0, 1],
+               10,
+               0.7,
+               32,
+               gtscore=gtscore,
+               use_label_smooth=False)
            self.assertIsNotNone(loss)

    def test_yolo_box(self):
...
...
python/paddle/fluid/tests/unittests/op_test.py
...
...
@@ -22,6 +22,7 @@ import six
import time
import itertools
import collections
+from collections import defaultdict
import paddle.fluid as fluid
import paddle.fluid.core as core
...
...
@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase):
        outs, _ = self._calc_output(place)
        return outs

+    def _create_var_from_numpy(self, value):
+        if isinstance(value, tuple):
+            data = value[0]
+            lod = value[1]
+            v = fluid.imperative.base.to_variable(value=data)
+            v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
+            return v
+        else:
+            return fluid.imperative.base.to_variable(value)
+
+    def _calc_imperative_output(self, place, parallel=False, no_check_set=None):
+        with fluid.imperative.base.guard(place=place):
+            block = fluid.default_main_program().global_block()
+
+            # prepare input variable
+            inputs = defaultdict(list)
+            for name, np_value in six.iteritems(self.inputs):
+                if not isinstance(np_value, list):
+                    np_value = [np_value]
+
+                for i in range(len(np_value)):
+                    inputs[name].append(
+                        self._create_var_from_numpy(np_value[i]))
+
+            # prepare output variable
+            outputs = defaultdict(list)
+            for name, np_value in six.iteritems(self.outputs):
+                if not isinstance(np_value, list):
+                    np_value = [np_value]
+
+                for i in range(len(np_value)):
+                    value = np_value[i]
+                    if isinstance(value, tuple):
+                        v = block.create_var(
+                            name="%s_out%d" % (name, i),
+                            dtype=value[0].dtype,
+                            type=core.VarDesc.VarType.LOD_TENSOR,
+                            persistable=False,
+                            stop_gradient=False)
+                        v._ivar.value().get_tensor(
+                        ).set_recursive_sequence_lengths(value[1])
+                    else:
+                        v = block.create_var(
+                            name="%s_out%d" % (name, i),
+                            dtype=value.dtype,
+                            type=core.VarDesc.VarType.LOD_TENSOR,
+                            persistable=False,
+                            stop_gradient=False)
+                    outputs[name].append(v)
+
+            block.append_op(
+                type=self.op_type,
+                inputs=inputs,
+                outputs=outputs,
+                attrs=self.attrs)
+            return outputs
+
    def _calc_output(self, place, parallel=False, no_check_set=None):
        program = Program()
        block = program.global_block()
        self._append_ops(block)
...
...
@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase):
                                place,
                                atol,
                                no_check_set=None,
-                               equal_nan=False):
+                               equal_nan=False,
+                               check_imperative=False):
+       if check_imperative:
+           imperative_outs = self._calc_imperative_output(
+               place, no_check_set=no_check_set)
+
        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
            if out_name not in self.outputs:
                continue
...
...
@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase):
                         type(sub_out))
                for item in sub_out:
                    sub_out_name, expect = item[0], item[1]
+                   if check_imperative:
+                       imperative_actual = imperative_outs[sub_out_name][0]
+                       imperative_actual_t = np.array(
+                           imperative_actual._ivar.value().get_tensor())
                    idx = find_actual(sub_out_name, fetch_list)
                    actual = outs[idx]
                    actual_t = np.array(actual)
...
...
@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase):
                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                        "Output (" + sub_out_name + ") has diff at " +
                        str(place))
+                   if check_imperative:
+                       self.assertTrue(
+                           np.allclose(
+                               imperative_actual_t,
+                               expect_t,
+                               atol=atol,
+                               equal_nan=equal_nan),
+                           "Output (" + sub_out_name + ") has diff at " +
+                           str(place) + " in imperative mode")
                    if isinstance(expect, tuple):
                        self.assertListEqual(
                            actual.recursive_sequence_lengths(), expect[1],
                            "Output (" + sub_out_name +
                            ") has different lod at " + str(place))
+                       if check_imperative:
+                           self.assertListEqual(
+                               imperative_actual._ivar.value().get_tensor()
+                               .recursive_sequence_lengths(), expect[1],
+                               "Output (" + out_name +
+                               ") has different lod at " + str(place) +
+                               " in imperative mode")
            else:
+               if check_imperative:
+                   imperative_actual = imperative_outs[out_name][0]
+                   imperative_actual_t = np.array(
+                       imperative_actual._ivar.value().get_tensor())
                idx = find_actual(out_name, fetch_list)
                actual = outs[idx]
                actual_t = np.array(actual)
...
...
@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase):
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
)
+
" in class "
+
self
.
__class__
.
__name__
)
if
check_imperative
:
self
.
assertTrue
(
np
.
allclose
(
imperative_actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
imperative_actual_t
)
+
" in class "
+
self
.
__class__
.
__name__
)
if
isinstance
(
expect
,
tuple
):
self
.
assertListEqual
(
actual
.
recursive_sequence_lengths
(),
expect
[
1
],
"Output ("
+
out_name
+
") has different lod at "
+
str
(
place
))
if
check_imperative
:
self
.
assertListEqual
(
imperative_actual
.
_ivar
.
value
().
get_tensor
()
.
recursive_sequence_lengths
(),
expect
[
1
],
"Output ("
+
out_name
+
") has different lod at "
+
str
(
place
)
+
" in imperative mode"
)
def
_get_places
(
self
):
if
self
.
dtype
==
np
.
float16
:
...
...
@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase):
            places.append(core.CUDAPlace(0))
        return places

-    def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
+    def check_output(self,
+                     atol=1e-5,
+                     no_check_set=None,
+                     equal_nan=False,
+                     check_imperative=False):
        places = self._get_places()
        for place in places:
-            self.check_output_with_place(place, atol, no_check_set, equal_nan)
+            self.check_output_with_place(place, atol, no_check_set, equal_nan,
+                                         check_imperative)

    def check_output_customized(self, checker):
        places = self._get_places()
...
python/paddle/fluid/tests/unittests/test_activation_op.py
...
...
@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAtan(TestActivation):
+    def setUp(self):
+        self.op_type = "atan"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.arctan(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
class TestTanhShrink(TestActivation):
    def setUp(self):
        self.op_type = "tanh_shrink"
...
...
@@ -248,6 +265,23 @@ class TestCos(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAcos(TestActivation):
+    def setUp(self):
+        self.op_type = "acos"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arccos(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
class TestSin(TestActivation):
    def setUp(self):
        self.op_type = "sin"
...
...
@@ -265,6 +299,23 @@ class TestSin(TestActivation):
        self.check_grad(['X'], 'Out', max_relative_error=0.007)


+class TestAsin(TestActivation):
+    def setUp(self):
+        self.op_type = "asin"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.arcsin(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
class TestRound(TestActivation):
    def setUp(self):
        self.op_type = "round"
...
...
@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
create_test_act_fp16_class(TestCeil, grad_check=False)
create_test_act_fp16_class(TestFloor, grad_check=False)
create_test_act_fp16_class(TestCos, grad_atol=0.85)
+create_test_act_fp16_class(TestAcos, grad_atol=0.85)
create_test_act_fp16_class(TestSin)
+create_test_act_fp16_class(TestAsin)
+create_test_act_fp16_class(TestAtan)
create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu)
create_test_act_fp16_class(TestGelu)
...
...
python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import unittest
import numpy as np
import six


class CrossEntropy2OpTestBase(OpTest):
    def initParameters(self):
        return [32, 64], 'float32', -100

    def calc_output(self, logits, label, ignore_index):
        ret = np.zeros(shape=label.shape, dtype=logits.dtype)
        match_x = np.zeros(shape=label.shape, dtype=logits.dtype)
        for idx in six.moves.range(label.shape[0]):
            if label[idx] == ignore_index:
                continue
            match_x[idx] = logits[idx][label[idx]]
            ret[idx] = -np.log(match_x[idx])
        return ret, match_x

    def setUp(self):
        self.shape, self.dtype, self.ignore_index = self.initParameters()
        self.op_type = 'cross_entropy2'
        feature_size = int(self.shape[-1])
        batch_size = int(np.prod(self.shape) / feature_size)
        logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
        label = np.random.random_integers(
            low=0, high=feature_size - 1,
            size=self.shape[0:-1] + [1]).astype('int64')
        outputs, match_x = self.calc_output(
            np.reshape(logits, [batch_size, feature_size]),
            np.reshape(label, [batch_size, 1]), self.ignore_index)
        self.inputs = {'X': logits, 'Label': label}
        self.outputs = {
            'Y': np.reshape(outputs, label.shape),
            'MatchX': np.reshape(match_x, label.shape),
            'XShape': np.zeros(shape=logits.shape, dtype=logits.dtype)
        }
        self.attrs = {'ignore_index': self.ignore_index}

    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
        self.check_grad(
            inputs_to_check=['X'],
            output_names=['Y'],
            no_grad_set=['XShape', 'MatchX', 'Label'])


class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
    def initParameters(self):
        return [32, 64], 'float64', 3


class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
    def initParameters(self):
        return [4, 8, 16, 32], 'float32', -100


class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
    def initParameters(self):
        return [4, 8, 16, 32], 'float32', 3


if __name__ == '__main__':
    unittest.main()
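For orientation (an illustrative aside, not part of the diff): the reference loss in calc_output above is simply the negative log of the logit matched by the label, with ignore_index rows skipped. A minimal numpy check of that definition:

import numpy as np

logits = np.array([[0.2, 0.5, 0.3],
                   [0.1, 0.1, 0.8]])
label = np.array([1, 2])
ignore_index = -100  # no row uses it in this tiny example

matched = logits[np.arange(len(label)), label]                 # 'MatchX'
loss = np.where(label == ignore_index, 0.0, -np.log(matched))  # 'Y'

assert np.allclose(loss, [-np.log(0.5), -np.log(0.8)])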
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
View file @ 74037cc1
...
...
@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-           'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-           'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+           'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+           'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...
...
@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase):
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-           'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-           'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+           'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+           'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
            'lookup_table_grad', 'split_selected_rows', 'send',
            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...
...
@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-           'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-           'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+           'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+           'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...
...
@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-           'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-           'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+           'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
+           'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
            'lookup_table_grad', 'split_selected_rows', 'send',
            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...
...
@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
-           'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-           'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+           'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
+           'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
            'split_selected_rows', 'send', 'sequence_pool_grad',
            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...
...
python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
View file @ 74037cc1
...
...
@@ -31,6 +31,80 @@ def dequantize_max_abs(x, scale, max_range):
    return y


def channel_wise_quantize_max_abs(x, quant_bit=8):
    scales = []
    for i in range(x.shape[0]):
        scales.append(np.max(np.abs(x[i])).astype("float32"))

    y = x.copy()
    max_range = math.pow(2, quant_bit - 1) - 1
    for i, scale in enumerate(scales):
        y[i] = np.round(y[i] / scale * max_range)
    return y, scales


def channel_wise_dequantize_max_abs(x, scales, quant_bits,
                                    activation_scale=None):
    y = x.copy()
    for i in range(x.shape[0]):
        y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
    if activation_scale is not None:
        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
    return y


class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
    def set_args(self):
        self.quant_bits = [8, 8]
        self.data_type = "float32"
        self.activation_scale = 0.7861

    def setUp(self):
        self.set_args()
        self.op_type = "fake_channel_wise_dequantize_max_abs"
        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
                                              self.activation_scale)

        self.inputs = {
            'X': yq,
            'Scales': [("scales0", np.array(scales).astype(self.data_type)),
                       ("scales1",
                        np.array([self.activation_scale]).astype(self.data_type))]
        }
        self.attrs = {'quant_bits': self.quant_bits}
        self.outputs = {'Out': ydq}

    def test_check_output(self):
        self.check_output()


class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
    def set_args(self):
        self.quant_bits = [8]
        self.data_type = "float32"

    def setUp(self):
        self.set_args()
        self.op_type = "fake_channel_wise_dequantize_max_abs"
        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)

        self.inputs = {
            'X': yq,
            'Scales': [("scales0", np.array(scales).astype(self.data_type))]
        }
        self.attrs = {'quant_bits': self.quant_bits}
        self.outputs = {'Out': ydq}

    def test_check_output(self):
        self.check_output()


class TestFakeDequantizeMaxAbsOp(OpTest):
    def set_args(self):
        self.num_bits = 8
...
...
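As a quick aside (illustrative, not from the diff): the quantize/dequantize helper pair above should round-trip to within half a quantization step per channel, i.e. an error of about 0.5 * scale / (2**(bits-1) - 1). A standalone sketch of that bound:

import math
import numpy as np

x = np.random.randn(4, 3, 8, 8).astype("float32")
bits = 8
max_range = math.pow(2, bits - 1) - 1  # 127 for 8 bits

scales = [np.max(np.abs(x[i])) for i in range(x.shape[0])]
yq = np.stack([np.round(x[i] / s * max_range) for i, s in enumerate(scales)])
ydq = np.stack([yq[i] * s / max_range for i, s in enumerate(scales)])

# round() is off by at most 0.5 of a quantized unit per element
assert np.max(np.abs(x - ydq)) <= 0.5 * max(scales) / max_range + 1e-6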
python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
View file @ 74037cc1
...
...
@@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest):
        self.check_output()


class TestFakeChannelWiseQuantizeOp(OpTest):
    def setUp(self):
        self.op_type = "fake_channel_wise_quantize_abs_max"
        self.attrs = {'bit_length': 8}
        self.inputs = {
            'X': np.random.random((4, 3, 64, 64)).astype("float32"),
        }
        scales = []
        for i in range(self.inputs['X'].shape[0]):
            scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
        outputs = self.inputs['X'].copy()
        for i, scale in enumerate(scales):
            outputs[i] = np.round(outputs[i] / scale * (
                (1 << (self.attrs['bit_length'] - 1)) - 1))

        self.outputs = {
            'Out': outputs,
            'OutScales': np.array(scales).astype("float32"),
        }

    def test_check_output(self):
        self.check_output()


class TestFakeQuantizeRangeAbsMaxOp(OpTest):
    def setUp(self):
        self.op_type = "fake_quantize_range_abs_max"
...
...
python/paddle/fluid/tests/unittests/test_gru_op.py
View file @ 74037cc1
...
...
@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
        }

    def test_check_output(self):
-       self.check_output(atol=1e-8)
+       self.check_output(atol=1e-8, check_imperative=True)

    def test_check_grad(self):
        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
...
...
python/paddle/fluid/tests/unittests/test_layers.py
View file @ 74037cc1
...
...
@@ -112,6 +112,47 @@ class TestLayer(LayerTest):
        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
        self.assertTrue(np.allclose(static_ret, static_ret2))

    def test_gru_unit(self):
        lod = [[2, 4, 3]]
        D = 5
        T = sum(lod[0])
        N = len(lod[0])

        input = np.random.rand(T, 3 * D).astype('float32')
        hidden_input = np.random.rand(T, D).astype('float32')
        with self.static_graph():
            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
                input=x, hidden=hidden, size=D * 3)
            static_ret = self.get_static_graph_result(
                feed={'x': input,
                      'hidden': hidden_input},
                fetch_list=[updated_hidden, reset_hidden_pre, gate])

        with self.static_graph():
            x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
            hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
            updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
                input=x, hidden=hidden, size=D * 3)
            gru = nn.GRUUnit('gru', size=D * 3)
            updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
            static_ret2 = self.get_static_graph_result(
                feed={'x': input,
                      'hidden': hidden_input},
                fetch_list=[updated_hidden, reset_hidden_pre, gate])

        with self.dynamic_graph():
            gru = nn.GRUUnit('gru', size=D * 3)
            dy_ret = gru(base.to_variable(input),
                         base.to_variable(hidden_input))

        for i in range(len(static_ret)):
            self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
            self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))


class TestBook(unittest.TestCase):
    def test_fit_a_line(self):
...
...
python/paddle/fluid/tests/unittests/test_slice_op.py
View file @ 74037cc1
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
...
...
@@ -63,5 +64,28 @@ class TestCase2(TestSliceOp):
        self.out = self.input[-3:3, 0:100, :, 2:-1]


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFP16(TestSliceOp):
    def config(self):
        self.dtype = "float16"
        self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
        self.starts = [-3, 0, 2]
        self.ends = [3, 100, -1]
        self.axes = [0, 1, 3]
        self.out = self.input[-3:3, 0:100, :, 2:-1]

    def test_check_output(self):
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            self.check_output_with_place(place, atol=1e-5)

    def test_check_grad_normal(self):
        place = core.CUDAPlace(0)
        if core.is_float16_supported(place):
            self.check_grad_with_place(
                place, ['Input'], 'Out', max_relative_error=0.006)


if __name__ == '__main__':
    unittest.main()
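A side note on the slice semantics exercised above (illustrative, not part of the diff): the op's starts/ends/axes attributes correspond to numpy basic slicing, with unlisted dimensions taken whole. A minimal sketch of that mapping:

import numpy as np

x = np.arange(3 * 4 * 5 * 6).reshape([3, 4, 5, 6])
starts, ends, axes = [-3, 0, 2], [3, 100, -1], [0, 1, 3]

# Build one slice object per dimension; axes not listed stay full.
slices = [slice(None)] * x.ndim
for ax, s, e in zip(axes, starts, ends):
    slices[ax] = slice(s, e)

assert (x[tuple(slices)] == x[-3:3, 0:100, :, 2:-1]).all()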
python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
0 → 100644
View file @ 74037cc1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import os
import six
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler


class TestSyncBatchNormOpTraining(unittest.TestCase):
    def setUp(self):
        #self.dtype = np.float32
        self.dtype = np.float64
        self.N = 32
        self.C = 16
        self.H = 64
        self.W = 32
        self.dshape = [self.N, self.C, self.H, self.W]

    def build_program(self, place, layout, seed, sync_bn=False,
                      only_forward=False):
        main = fluid.Program()
        startup = fluid.Program()
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                data = fluid.layers.data(
                    name='input',
                    shape=self.dshape,
                    dtype=self.dtype,
                    append_batch_size=False)
                conv = fluid.layers.conv2d(
                    input=data,
                    num_filters=32,
                    filter_size=1,
                    param_attr=fluid.ParamAttr(name='conv2d_weight'),
                    bias_attr=False,
                    use_cudnn=False)
                bn = fluid.layers.batch_norm(
                    conv,
                    param_attr=fluid.ParamAttr(name='bn_scale'),
                    bias_attr=fluid.ParamAttr(name='bn_bias'),
                    moving_mean_name='bn_moving_mean',
                    moving_variance_name='bn_moving_variance',
                    data_layout=layout,
                    is_test=only_forward)
                sigmoid = fluid.layers.sigmoid(bn)
                out = fluid.layers.reduce_sum(sigmoid)
                if not sync_bn:
                    out = out / core.get_cuda_device_count()
                if not only_forward:
                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
                    sgd_opt.backward(out)
        return main, startup, [out, conv, bn]

    def compare(self, place, layout, only_forward):
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2

        # Single-GPU, N = 32 per GPU
        main, startup, outs = self.build_program(place, layout, seed, False,
                                                 only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        bn_fetches = exe.run(program=main,
                             feed={'input': data},
                             fetch_list=fetch_names)

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
        main, startup, outs = self.build_program(place, layout, seed, True,
                                                 only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        for nm in fetch_names:
            fv = fluid.framework._get_var(str(nm), program=main)
            fv.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
            outs[0].name if not only_forward else None,
            build_strategy=build_strategy)
        sync_bn_fetches = exe.run(program=comp_prog,
                                  feed={'input': data},
                                  fetch_list=fetch_names)

        for i in six.moves.xrange(1, len(sync_bn_fetches)):
            bn_val = bn_fetches[i]
            sync_bn_val = sync_bn_fetches[i]
            if sync_bn_val.shape != bn_val.shape:
                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
            self.assertTrue(
                np.allclose(bn_val, sync_bn_val, atol=1e-3),
                "Output (" + fetch_names[i] + ") has diff.\n" + "\nBN " +
                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))

    def test_train(self):
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NCHW", "NHWC"]:
                self.compare(place, layout, False)

    def test_infer(self):
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NCHW", "NHWC"]:
                self.compare(place, layout, True)


if __name__ == '__main__':
    unittest.main()
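The test above leans on the identity that synchronized batch norm across device shards must reproduce plain batch norm on the pooled batch; the devices only need to exchange per-shard sums of x and x**2. A small numpy sketch of that identity (illustrative only):

import numpy as np

np.random.seed(0)
full = np.random.randn(32, 16)        # the whole N x C batch
shards = np.split(full, 4)            # four "devices", 8 samples each

mean_full = full.mean(axis=0)
var_full = full.var(axis=0)

# Sync BN: combine per-shard sums instead of per-shard statistics.
n = sum(s.shape[0] for s in shards)
sum_x = sum(s.sum(axis=0) for s in shards)
sum_x2 = sum((s ** 2).sum(axis=0) for s in shards)
mean_sync = sum_x / n
var_sync = sum_x2 / n - mean_sync ** 2

assert np.allclose(mean_full, mean_sync)
assert np.allclose(var_full, var_sync)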
python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
View file @ 74037cc1
...
...
@@ -23,8 +23,8 @@ from op_test import OpTest
from paddle.fluid import core


def l2loss(x, y):
    return 0.5 * (y - x) * (y - x)


def l1loss(x, y):
    return abs(x - y)


def sce(x, label):
...
...
@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2):
    return inter_area / union


-def YOLOv3Loss(x, gtbox, gtlabel, attrs):
+def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
    n, c, h, w = x.shape
    b = gtbox.shape[1]
    anchors = attrs['anchors']
...
...
@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
    mask_num = len(anchor_mask)
    class_num = attrs["class_num"]
    ignore_thresh = attrs['ignore_thresh']
-   downsample = attrs['downsample']
-   input_size = downsample * h
+   downsample_ratio = attrs['downsample_ratio']
+   use_label_smooth = attrs['use_label_smooth']
+   input_size = downsample_ratio * h
    x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
    loss = np.zeros((n)).astype('float32')

+   label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
+   label_neg = 1.0 / class_num if use_label_smooth else 0.0

    pred_box = x[:, :, :, :, :4].copy()
    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h

-   x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
-                                np.ones_like(x[:, :, :, :, 5:]) * 1.0 / class_num)

    mask_anchors = []
    for m in anchor_mask:
        mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
...
...
@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
                ty = gtbox[i, j, 1] * w - gj
                tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
                th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
-               scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3])
+               scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j]
                loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
                loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
-               loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale
-               loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale
+               loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale
+               loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale
-               objness[i, an_idx * h * w + gj * w + gi] = 1.0
+               objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
                for label_idx in range(class_num):
-                   loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
-                                  float(label_idx == gtlabel[i, j]))
+                   loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
+                                  label_pos if label_idx == gtlabel[i, j] else
+                                  label_neg) * gtscore[i, j]

        for j in range(mask_num * h * w):
            if objness[i, j] > 0:
-               loss[i] += sce(pred_obj[i, j], 1.0)
+               loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j]
            elif objness[i, j] == 0:
                loss[i] += sce(pred_obj[i, j], 0.0)
...
...
@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest):
"anchor_mask"
:
self
.
anchor_mask
,
"class_num"
:
self
.
class_num
,
"ignore_thresh"
:
self
.
ignore_thresh
,
"downsample"
:
self
.
downsample
,
"downsample_ratio"
:
self
.
downsample_ratio
,
"use_label_smooth"
:
self
.
use_label_smooth
,
}
self
.
inputs
=
{
...
...
@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest):
            'GTBox': gtbox.astype('float32'),
            'GTLabel': gtlabel.astype('int32'),
        }

-       loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
+       gtscore = np.ones(self.gtbox_shape[:2]).astype('float32')
+       if self.gtscore:
+           gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32')
+           self.inputs['GTScore'] = gtscore
+       loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore,
+                                              self.attrs)
        self.outputs = {
            'Loss': loss,
            'ObjectnessMask': objness,
...
@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest):
def
test_check_output
(
self
):
place
=
core
.
CPUPlace
()
self
.
check_output_with_place
(
place
,
atol
=
1
e-3
)
self
.
check_output_with_place
(
place
,
atol
=
2
e-3
)
def
test_check_grad_ignore_gtbox
(
self
):
place
=
core
.
CPUPlace
()
self
.
check_grad_with_place
(
place
,
[
'X'
],
'Loss'
,
no_grad_set
=
set
([
"GTBox"
,
"GTLabel"
]),
max_relative_error
=
0.3
)
self
.
check_grad_with_place
(
place
,
[
'X'
],
'Loss'
,
max_relative_error
=
0.2
)
def
initTestCase
(
self
):
self
.
anchors
=
[
10
,
13
,
16
,
30
,
33
,
23
,
30
,
61
,
62
,
45
,
59
,
119
,
116
,
90
,
156
,
198
,
373
,
326
]
self
.
anchor_mask
=
[
0
,
1
,
2
]
self
.
class_num
=
5
self
.
ignore_thresh
=
0.7
self
.
downsample_ratio
=
32
self
.
x_shape
=
(
3
,
len
(
self
.
anchor_mask
)
*
(
5
+
self
.
class_num
),
5
,
5
)
self
.
gtbox_shape
=
(
3
,
5
,
4
)
self
.
gtscore
=
True
self
.
use_label_smooth
=
True
class
TestYolov3LossWithoutLabelSmooth
(
TestYolov3LossOp
):
def
initTestCase
(
self
):
self
.
anchors
=
[
10
,
13
,
16
,
30
,
33
,
23
,
30
,
61
,
62
,
45
,
59
,
119
,
116
,
90
,
156
,
198
,
373
,
326
]
self
.
anchor_mask
=
[
0
,
1
,
2
]
self
.
class_num
=
5
self
.
ignore_thresh
=
0.7
self
.
downsample_ratio
=
32
self
.
x_shape
=
(
3
,
len
(
self
.
anchor_mask
)
*
(
5
+
self
.
class_num
),
5
,
5
)
self
.
gtbox_shape
=
(
3
,
5
,
4
)
self
.
gtscore
=
True
self
.
use_label_smooth
=
False
class
TestYolov3LossNoGTScore
(
TestYolov3LossOp
):
def
initTestCase
(
self
):
self
.
anchors
=
[
10
,
13
,
16
,
30
,
33
,
23
]
self
.
anchor_mask
=
[
1
,
2
]
self
.
anchors
=
[
10
,
13
,
16
,
30
,
33
,
23
,
30
,
61
,
62
,
45
,
59
,
119
,
116
,
90
,
156
,
198
,
373
,
326
]
self
.
anchor_mask
=
[
0
,
1
,
2
]
self
.
class_num
=
5
self
.
ignore_thresh
=
0.
5
self
.
downsample
=
32
self
.
ignore_thresh
=
0.
7
self
.
downsample
_ratio
=
32
self
.
x_shape
=
(
3
,
len
(
self
.
anchor_mask
)
*
(
5
+
self
.
class_num
),
5
,
5
)
self
.
gtbox_shape
=
(
3
,
5
,
4
)
self
.
gtscore
=
False
self
.
use_label_smooth
=
True
if
__name__
==
"__main__"
:
...
...
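One aside on the label-smoothing switch introduced above (illustrative, not part of the diff): with smoothing on, the one-hot classification target 1/0 becomes label_pos = 1 - 1/class_num and label_neg = 1/class_num. A tiny sketch (the helper name is hypothetical):

import numpy as np

def smoothed_targets(gt_label, class_num, use_label_smooth=True):
    # Matches label_pos / label_neg in the reference loss above.
    pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
    neg = 1.0 / class_num if use_label_smooth else 0.0
    return np.where(np.arange(class_num) == gt_label, pos, neg)

print(smoothed_targets(2, 5))         # [0.2 0.2 0.8 0.2 0.2]
print(smoothed_targets(2, 5, False))  # [0. 0. 1. 0. 0.]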
tools/diff_api.py
View file @ 74037cc1
...
...
@@ -30,6 +30,6 @@ if error:
    '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
    1. cd ${paddle_path}, compile paddle;
    2. pip install build/python/dist/(build whl package);
-   3. run "python tools/print_signatures.py paddle.fluid,
-paddle.reader > paddle/fluid/API.spec"'''
+   3. run "python tools/print_signatures.py paddle.fluid,paddle.reader > paddle/fluid/API.spec"'''
    )
    sys.exit(1)
tools/print_signatures.py
View file @ 74037cc1
...
...
@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
            all = (args, doc)
        member_dict[cur_name] = all
    except TypeError:  # special for PyBind method
        if cur_name in check_modules_list:
            return
        member_dict[cur_name] = " ".join([
            line.strip() for line in pydoc.render_doc(member).split('\n')
            if "->" in line
...
...
@@ -78,6 +80,7 @@ def visit_all_module(mod):
            visit_member(mod.__name__, instance)


check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
modules = sys.argv[1].split(",")
for m in modules:
    visit_all_module(importlib.import_module(m))
...
...
tools/timeline.py
View file @ 74037cc1
...
...
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
        event['args'] = args
        self._events.append(event)

    def emit_counter(self, category, name, pid, timestamp, counter, value):
        """Emits a record for a single counter.

        Args:
            category: The event category as string
            name: The event name as string
            pid: Identifier of the process generating this event as integer
            timestamp: The timestamps of this event as long integer
            counter: Name of the counter as string
            value: Value of the counter as integer
            tid: Thread id of the allocation as integer
        """
        event = self._create_event('C', category, name, pid, 0, timestamp)
        event['args'] = {counter: value}
        self._events.append(event)

    def format_to_string(self, pretty=False):
        """Formats the chrome trace to a string.
...
...
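For orientation (not from the diff): emit_counter appends a Chrome-trace "counter" record, phase 'C', whose args map the counter name to its value. A sketch of the resulting record, assuming the field layout of the Chrome tracing JSON format:

event = {
    'ph': 'C',           # counter phase
    'cat': 'Memory',     # category
    'name': 'Memory',    # counter series name
    'pid': 3,            # pid allocated for the device
    'ts': 1547000,       # timestamp
    'args': {0: 10240},  # {counter: value}
}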
@@ -117,6 +133,7 @@ class Timeline(object):
        self._profile_dict = profile_dict
        self._pid = 0
        self._devices = dict()
        self._mem_devices = dict()
        self._chrome_trace = _ChromeTraceFormatter()

    def _allocate_pid(self):
...
...
@@ -143,6 +160,47 @@ class Timeline(object):
                    self._devices[(k, event.device_id, "GPUKernel")] = pid
                    self._chrome_trace.emit_pid("%s:gpu:%d" %
                                                (k, event.device_id), pid)
            if not hasattr(profile_pb, "mem_events"):
                continue
            for mevent in profile_pb.mem_events:
                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
                    if (k, mevent.device_id,
                            "CUDAPinnedPlace") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id,
                                           "CUDAPinnedPlace")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cudapinnedplace:%d" %
                            (k, mevent.device_id), pid)
            if (k, 0, "CPU") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "CPU")] = pid
                self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
                                            (k, 0), pid)
            if (k, 0, "GPU") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "GPU")] = pid
                self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
                                            (k, 0), pid)
            if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                self._chrome_trace.emit_pid(
                    "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)

    def _allocate_events(self):
        for k, profile_pb in six.iteritems(self._profile_dict):
...
...
@@ -155,7 +213,7 @@ class Timeline(object):
                args = {'name': event.name}
                if event.memcopy.bytes > 0:
                    args['mem_bytes'] = event.memcopy.bytes
-               if event.detail_info:
+               if hasattr(event, "detail_info") and event.detail_info:
                    args['detail_info'] = event.detail_info
                # TODO(panyx0718): Chrome tracing only handles ms. However, some
                # ops takes micro-seconds. Hence, we keep the ns here.
...
...
@@ -163,9 +221,59 @@ class Timeline(object):
                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
                    event.sub_device_id, 'Op', event.name, args)

    def _allocate_memory_event(self):
        if not hasattr(profiler_pb2, "MemEvent"):
            return
        place_to_str = {
            profiler_pb2.MemEvent.CPUPlace: "CPU",
            profiler_pb2.MemEvent.CUDAPlace: "GPU",
            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
        }
        for k, profile_pb in six.iteritems(self._profile_dict):
            mem_list = []
            end_profiler = 0
            for mevent in profile_pb.mem_events:
                crt_info = dict()
                crt_info['time'] = mevent.start_ns
                crt_info['size'] = mevent.bytes
                if mevent.place in place_to_str:
                    place = place_to_str[mevent.place]
                else:
                    place = "UnDefine"
                crt_info['place'] = place
                pid = self._mem_devices[(k, mevent.device_id, place)]
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                mem_list.append(crt_info)
                crt_info = dict()
                crt_info['place'] = place
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                crt_info['time'] = mevent.end_ns
                crt_info['size'] = -mevent.bytes
                mem_list.append(crt_info)
                end_profiler = max(end_profiler, crt_info['time'])
            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
            i = 0
            total_size = 0
            while i < len(mem_list):
                total_size += mem_list[i]['size']
                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
                        i + 1]['time']:
                    total_size += mem_list[i + 1]['size']
                    i += 1

                self._chrome_trace.emit_counter(
                    "Memory", "Memory", mem_list[i]['pid'],
                    mem_list[i]['time'], 0, total_size)
                i += 1

    def generate_chrome_trace(self):
        self._allocate_pids()
        self._allocate_events()
        self._allocate_memory_event()
        return self._chrome_trace.format_to_string()
...
...
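The counter loop in _allocate_memory_event folds each allocation into +bytes at its start_ns and -bytes at its end_ns, then emits the running total once per distinct timestamp. A standalone sketch of that fold (illustrative only):

events = [(0, 100), (1, 50), (2, -100), (3, -50)]  # (time, signed size)
total, usage = 0, []
for t, size in sorted(events):
    total += size
    usage.append((t, total))
print(usage)  # [(0, 100), (1, 150), (2, 50), (3, 0)]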