diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 59c40a0e5d18b753038f2b9301d1c9494e3901be..c2d04828564e69d7ac965881057f185194aa0475 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -52,8 +52,8 @@ function(op_library TARGET)
         endif()
         if(WITH_MKLDNN)
             string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
-                list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
+                list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
             endif()
         endif()
     else()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 4acccd0899568184735db35f0d949ec0e8b67fff..f50a38842a21c795c979f859e88a9b16c3e54bd8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
@@ -142,10 +142,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
 paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1))
+paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -322,9 +322,10 @@ paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_class
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
+paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.box_clip ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.multiclass_nms ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
@@ -361,6 +362,9 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.Calibrator.__init__ ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.contrib.Calibrator.sample_data ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.Calibrator.save_int8_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
 paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index b118dccd1b3de881b4791bff6cd331726c8e05da..07c2c970d4de3cecf03e4cf80e60e81e7a9595a8 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -10,8 +10,22 @@ function(pass_library TARGET DEST)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
+    set(targetPrefix "")
+
+    # Get optional argument
+    set(extraMacroArgs ${ARGN})
+    list(LENGTH extraMacroArgs numExtraMacroArgs)
+    if(numExtraMacroArgs GREATER 0)
+        list(GET extraMacroArgs 0 targetPrefix)
+    endif()
+
     cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+    if(targetPrefix)
+        cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+    else()
+        cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS})
+    endif()
+
     # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
     if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
         message(STATUS "add pass ${TARGET} ${DEST}")
@@ -51,6 +65,7 @@ pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
+pass_library(identity_scale_op_clean_pass base)
 
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -62,11 +77,11 @@ foreach (index RANGE 3 6)
 endforeach()
 
 if(WITH_MKLDNN)
-    pass_library(mkldnn_placement_pass base)
-    pass_library(depthwise_conv_mkldnn_pass base)
-    pass_library(conv_bias_mkldnn_fuse_pass inference)
-    pass_library(conv_relu_mkldnn_fuse_pass inference)
-    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
+    pass_library(mkldnn_placement_pass base mkldnn)
+    pass_library(depthwise_conv_mkldnn_pass base mkldnn)
+    pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
+    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
 endif()
 
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -86,7 +101,7 @@ cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framewor
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
-    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
-    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
+    cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b738aa159ebfd77f00c9e532fbd94542e2097db
--- /dev/null
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init("identity_scale_op_clean", graph.get());
+
+  // pre_op -> scale_in -> scale_op -> scale_out
+  // ->
+  // pre_op -> scale_out
+  GraphPatternDetector detector;
+  auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op();
+  auto scale_in = detector.mutable_pattern()
+                      ->NewNode("scale_in")
+                      ->assert_is_op_input("scale")
+                      ->AsIntermediate();
+  auto scale_op = detector.mutable_pattern()
+                      ->NewNode("scale_fuse")
+                      ->assert_is_op("scale")
+                      ->assert_op_attr<float>("scale", 1.)
+                      ->assert_op_attr<float>("bias", 0.);
+  auto scale_out = detector.mutable_pattern()
+                       ->NewNode("scale_out")
+                       ->assert_is_op_output("scale");
+
+  pre_op->LinksTo({scale_in});
+  scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
+
+  GraphPatternDetector::handle_t handler = [&](
+      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+    Node* scale_op_var = subgraph.at(scale_op);
+    Node* scale_in_var = subgraph.at(scale_in);
+    Node* scale_out_var = subgraph.at(scale_out);
+    Node* pre_op_var = subgraph.at(pre_op);
+    // Link pre_op directly to scale_out
+    const std::string scale_in_name = scale_in_var->Name();
+    const std::string scale_out_name = scale_out_var->Name();
+    // Remove links in graph
+    GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var});
+    // Modify proto message
+    auto* pre_op_desc = pre_op_var->Op();
+    for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) {
+      auto* arguments = parameter.mutable_arguments();
+      auto it = std::find(arguments->begin(), arguments->end(), scale_in_name);
+      PADDLE_ENFORCE(it != arguments->end());
+      *it = scale_out_name;
+    }
+
+    IR_NODE_LINK_TO(pre_op_var, scale_out_var);
+  };
+
+  detector(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(identity_scale_op_clean_pass,
+              paddle::framework::ir::IdentityScaleOpCleanPass);
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..50a654d82f0e4fb7e8e91c665397716407e6d2a5
--- /dev/null
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class IdentityScaleOpCleanPass : public FusePassBase {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+ private:
+  virtual ~IdentityScaleOpCleanPass() = default;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
similarity index 98%
rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index d4a701e0b173a96d8605dff308fee7007a0ecc0c..5d0b294f6fec5f14dcddb91f8ceffb27fc833d4e 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
 #include <functional>
 #include <string>
 #include <vector>
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
rename to paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
similarity index 99%
rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index a8029e67e659a269f8492cf6e2f1f09040144283..fb3db81347b102cfa264082b36a2e22ea8c22982 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include <functional>
 #include <list>
 #include <map>
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
similarity index 98%
rename from paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
index 61ba097fd8cb55e25bda1947ea97d53308c55bd3..9ef5c298b8cddfec094e9544dc6da9afdcaf0dab 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -15,8 +15,8 @@
 #include <gtest/gtest.h>
 #include <string>
 
-#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
similarity index 97%
rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
index e359a3832ee8d549f8c58d63bc1cc6564ecadede..4f4605398a665e63662a64a3a925c32d48f10952 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
similarity index 98%
rename from paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
index 19248b4dfee1da81d18cd2effac08ba68dde80fb..06d56f6222e4bb9a9969d4ab2d260c97d1ce6c72 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h"
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_proto_maker.h"
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
similarity index 96%
rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index 19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c..7851e8c84bca2e3b05d3b1603eaa4c0ca5909e10 100644
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
similarity index 98%
rename from paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
index 09d0b15f46a7e50afb6aea46383013ce6a6c6118..1783e3322b1df8125f580f09a12aefe64d246c1a 100644
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 
 #include <gtest/gtest.h>
 
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
similarity index 95%
rename from paddle/fluid/framework/ir/mkldnn_placement_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
index 951fcb066ce759ebfec0182e1e9dca887e343170..20e52410ffe3caa86450bc05bf3aabf5a5bce374 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/mkldnn_placement_pass.h
rename to paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index fe3c841186c35ea28c1d44007d91de5b997c1388..7476c199cfd073ec0962fa9a48f24750a6484bb5 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -83,7 +83,6 @@ void IRPassManager::CreatePasses(Argument *argument,
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
     }
 
-    // graph_ = pass->Apply(std::move(graph_));
     pre_pass = pass_name;
 
     passes_.emplace_back(std::move(pass));
@@ -97,8 +96,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
   PADDLE_ENFORCE(graph.get());
   // Apply all the passes
   for (const auto &pass : passes_) {
-    if (pass->Type() == "graph_viz_pass") continue;
-    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
+    if (pass->Type() != "graph_viz_pass") {
+      PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
+    }
     graph = pass->Apply(std::move(graph));
   }
   return std::move(graph);
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index eecab238a88e90399eb70f17caa57633af4e2a69..e92273b4dd94f11e0e90c91fd82dafe42bf158f3 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -318,4 +318,9 @@ NativeConfig AnalysisConfig::ToNativeConfig() const {
   return config;
 }
 
+void AnalysisConfig::SwitchIrDebug(int x) {
+  ir_debug_ = x;
+  Update();
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 6d11b461082d0ed8ba08c9e280bba86737b86e71..002ba90e40e69d565f5a54e374a3f0083b84273f 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -196,7 +196,7 @@ TEST(AnalysisPredictor, memory_optim) {
   AnalysisConfig config(FLAGS_dirname);
   config.DisableGpu();
   config.EnableMemoryOptim(true);
-  config.pass_builder()->TurnOnDebug();
+  config.SwitchIrDebug();
 
   auto native_predictor =
       CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 9d9ed6a39d8324002a8850deae9bb8dd5af7ef9b..47361b3279e14dd65a0e6e7f864e508ef1183045 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -140,9 +140,12 @@ struct AnalysisConfig {
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
 
-  /** Control whther to debug IR graph analysis phase.
+  /** \brief Control whether to debug IR graph analysis phase.
+   *
+   * This will generate DOT files for visualizing the computation graph after
+   * each analysis pass applied.
    */
-  void SwitchIrDebug(int x = true) { ir_debug_ = x; }
+  void SwitchIrDebug(int x = true);
 
   /** Turn on MKLDNN.
    */
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 391932a1ee018c45818457c55fd8f82a22ab7405..aa353f12ca7333713e2d640cce6b2dfbea3c4e26 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -117,6 +117,7 @@ class CpuPassStrategy : public PassStrategy {
         "conv_bn_fuse_pass",             //
         "conv_eltwiseadd_bn_fuse_pass",  //
         "is_test_pass",                  //
+        "identity_scale_op_clean_pass",  //
     });
     use_gpu_ = false;
   }
@@ -155,6 +156,7 @@ class GpuPassStrategy : public PassStrategy {
   GpuPassStrategy() : PassStrategy({}) {
     passes_.assign({
       "infer_clean_graph_pass",                        //
+          "identity_scale_op_clean_pass",              //
           "conv_affine_channel_fuse_pass",             //
           "conv_eltwiseadd_affine_channel_fuse_pass",  //
           "conv_bn_fuse_pass",                         //
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index aa3da397ff67dd06dd750d336a49056baedaaab6..7ecd9e35332843e3a391cdad5ce32220d890abd1 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -128,9 +128,9 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
   "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
 
-# bert, max_len=20
-set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert20")
-download_model_and_data(${BERT_INSTALL_DIR} "bert_model.tar.gz" "bert_data_len20.txt.tar.gz")
+# bert, max_len=20, embedding_dim=128
+set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
+download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL)
 
 # anakin
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index 8be2a6d79b2ede2c149aa523e38c3960ab30acb1..dd953e0dccbb3749bfcc87966453c6976dfefa10 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -142,7 +142,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
-  cfg->pass_builder()->TurnOnDebug();
+  cfg->SwitchIrDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 2db297e2005c6b657259187d6b6b76657d9e4388..2003be82019333ca97b9fa8ef83668825fe5710d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -69,7 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Text_Classification, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg.pass_builder()->TurnOnDebug();
+  cfg.SwitchIrDebug();
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index a7b239731b9a2e876c16d9ff84dfb8ac3df7b82e..c43eaf7f9849ee4a88ed95bdb8b6966da8760435 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
-#cc_binary(visualizer SRCS visualizer.cc DEPS analysis
-#    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
+cc_binary(visualizer SRCS visualizer.cc DEPS analysis
+    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 7c29eac46d5cd8c959b28355d45bed6c6a5d88d6..189db2317d0544014d9c74e0fd5e9ead54925b9c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
 #include <string>
-#include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e78ecc1a12309fe084a4165e5bb0d8bfb1dcf957..e93cd8615e052e4dfc6255549bf7a9b84b7dd657 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -51,6 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("selected_scores",
               "A LoDTensor containing the accumulated scores corresponding to "
               "Output(selected_ids).");
+    AddOutput(
+        "parent_idx",
+        "A Tensor preserving the selected_ids' parent indice in pre_ids.");
 
     // Attributes stored in AttributeMap
     AddAttr<int>("level", "the level of LoDTensor");
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index 1b939e742de06aedf187d25d002d19e0a4fafc9d..f808020cc765585d1633c6c3bf528080a7e83f07 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -41,13 +41,15 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
+    auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
+    PADDLE_ENFORCE_NOT_NULL(parent_idx);
 
     math::BeamSearchFunctor<DeviceContext, T> alg;
     alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
-        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
-        is_accumulated);
+        ids, scores, selected_ids, selected_scores, parent_idx, level,
+        beam_size, end_id, is_accumulated);
   }
 };
 
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index d3a61dc367c3642b8faa9085a470a302712395e5..f6fbe97565c43c306ea885c765c0a665492fa317 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -31,6 +31,8 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
+detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
+detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
 
 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index b99edb5bf05f94e762b377a8882e4c3fcdb5afad..a7bc3e027229884e78721d29428a8ab3f08a6ebc 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -99,5 +99,29 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
   }
 }
 
+template <class T>
+void ClipTiledBoxes(const platform::DeviceContext& ctx,
+                    const framework::Tensor& im_info,
+                    const framework::Tensor& input_boxes,
+                    framework::Tensor* out) {
+  T* out_data = out->mutable_data<T>(ctx.GetPlace());
+  const T* im_info_data = im_info.data<T>();
+  const T* input_boxes_data = input_boxes.data<T>();
+  T zero(0);
+  T im_w = round(im_info_data[1] / im_info_data[2]);
+  T im_h = round(im_info_data[0] / im_info_data[2]);
+  for (int64_t i = 0; i < input_boxes.numel(); ++i) {
+    if (i % 4 == 0) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+    } else if (i % 4 == 1) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+    } else if (i % 4 == 2) {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
+    } else {
+      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3aa766559a530bc31fbb277f2bcd474da776e63b
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_clip_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/box_clip_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of BoxClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of BoxClipOp should not be null.");
+
+    auto input_box_dims = ctx->GetInputDim("Input");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+
+    if (ctx->IsRuntime()) {
+      auto input_box_size = input_box_dims.size();
+      PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4,
+                        "The last dimension of Input must be 4");
+      PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                        "The rank of Input(Input) in BoxClipOp must be 2");
+      PADDLE_ENFORCE_EQ(im_info_dims[1], 3,
+                        "The last dimension of ImInfo must be 3");
+    }
+    ctx->ShareDim("Input", /*->*/ "Output");
+    ctx->ShareLoD("Input", /*->*/ "Output");
+  }
+};
+
+class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(LoDTensor) "
+             "Input is a LoDTensor with shape [..., 4] holds 4 points"
+             "in last dimension in format [xmin, ymin, xmax, ymax]");
+    AddInput("ImInfo",
+             "(Tensor) Information for image reshape is in shape (N, 3), "
+             "in format (height, width, im_scale)");
+    AddOutput("Output",
+              "(LoDTensor) "
+              "Output is a LoDTensor with the same shape as Input"
+              "and it is the result after clip");
+    AddComment(R"DOC(
+This operator clips input boxes to original input images.
+
+For each input box, The formula is given as follows:
+
+       $$xmin = \max(\min(xmin, im_w - 1), 0)$$
+       $$ymin = \max(\min(ymin, im_h - 1), 0)$$     
+       $$xmax = \max(\min(xmax, im_w - 1), 0)$$
+       $$ymax = \max(\min(ymax, im_h - 1), 0)$$
+
+where im_w and im_h are computed from ImInfo, the formula is given as follows:
+
+       $$im_w = \round(width / im_scale)$$
+       $$im_h = \round(height / im_scale)$$ 
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    box_clip, ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b727da5f7b736b6f22407d1dfbca708ed0cf04d9
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/box_clip_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTenso = framework::LoDTensor;
+
+static constexpr int ImInfoSize = 3;
+
+template <typename T, int BlockSize>
+static __global__ void GPUBoxClip(const T *input, const size_t *lod,
+                                  const size_t width, const T *im_info,
+                                  T *output) {
+  T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] /
+                 im_info[blockIdx.x * ImInfoSize + 2]);
+  T im_h = round(im_info[blockIdx.x * ImInfoSize] /
+                 im_info[blockIdx.x * ImInfoSize + 2]);
+  for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width;
+       i += BlockSize) {
+    int idx = lod[blockIdx.x] * width + i;
+    T im_size = (idx % 2 == 0) ? im_w : im_h;
+    output[idx] = max(min(input[idx], im_size - 1), T(0.));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class GPUBoxClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *input = context.Input<LoDTensor>("Input");
+    auto *im_info = context.Input<Tensor>("ImInfo");
+    auto *output = context.Output<LoDTensor>("Output");
+    const int64_t num = input->dims()[0];
+    const int64_t bbox_width = input->numel() / num;
+    auto lod = input->lod();
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto stream = dev_ctx.stream();
+    const size_t batch_size = lod.back().size() - 1;
+    T *output_data = output->mutable_data<T>(dev_ctx.GetPlace());
+    GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
+        input->data<T>(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()),
+        bbox_width, im_info->data<T>(), output_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    box_clip, ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUBoxClipKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..74e1f88f8d8b28e490d170934760bd9bffc807bc
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class BoxClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input_box = context.Input<LoDTensor>("Input");
+    auto* im_info = context.Input<LoDTensor>("ImInfo");
+    auto* output_box = context.Output<LoDTensor>("Output");
+    auto& dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
+    output_box->mutable_data<T>(context.GetPlace());
+    if (input_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto box_lod = input_box->lod().back();
+    int64_t n = static_cast<int64_t>(box_lod.size() - 1);
+    for (int i = 0; i < n; ++i) {
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
+      Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
+      ClipTiledBoxes<T>(dev_ctx, im_info_slice, box_slice, &output_slice);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 06fbb9815c52ea69e3aa9e893512e039853b9514..fdcff62e1fe59b3a2f4925bdff98632f71220abb 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/box_coder_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -32,32 +33,57 @@ class BoxCoderOp : public framework::OperatorWithKernel {
 
     if (ctx->IsRuntime()) {
       PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
-                        "The rank of Input of PriorBoxVar must be 2");
+                        "The rank of Input PriorBox must be 2");
       PADDLE_ENFORCE_EQ(prior_box_dims[1], 4,
                         "The shape of PriorBox is [N, 4]");
       if (ctx->HasInput("PriorBoxVar")) {
         auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-        PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+        PADDLE_ENFORCE(
+            prior_box_var_dims.size() == 1 || prior_box_var_dims.size() == 2,
+            "Input(PriorBoxVar) of BoxCoderOp should be 1 or 2.");
+        if (prior_box_var_dims.size() == 1) {
+          PADDLE_ENFORCE_EQ(
+              prior_box_var_dims[0], 4,
+              "The 1st dimension of Input(PriorBoxVar) should be 4"
+              "when the rank is 1.");
+        } else {
+          PADDLE_ENFORCE_EQ(
+              prior_box_dims, prior_box_var_dims,
+              "The dimension of Input(PriorBoxVar) should be equal to"
+              "the dimension of Input(PriorBox when the rank is 2.)");
+        }
       }
+    }
 
-      auto code_type =
-          GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
-      if (code_type == BoxCodeType::kEncodeCenterSize) {
-        PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                          "The rank of Input of TargetBox must be 2");
-        PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                          "The shape of TargetBox is [M, 4]");
-      } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-        PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
-                          "The rank of Input of TargetBox must be 3");
+    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    int axis = ctx->Attrs().Get<int>("axis");
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                        "The rank of Input TargetBox must be 2");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                        "The shape of TargetBox is [M, 4]");
+      ctx->SetOutputDim(
+          "OutputBox",
+          framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+                        "The rank of Input TargetBox must be 3");
+      if (axis == 0) {
         PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-        PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+      } else if (axis == 1) {
+        PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
+      } else {
+        PADDLE_THROW("axis must be 0 or 1.");
       }
+      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+      ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
+    }
+
+    if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) {
+      ctx->ShareLoD("PriorBox", /*->*/ "OutputBox");
+    } else {
+      ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
     }
-    ctx->SetOutputDim(
-        "OutputBox",
-        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
-    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
   }
 };
 
@@ -100,6 +126,21 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default true) "
                   "whether treat the priorbox as a noramlized box")
         .SetDefault(true);
+    AddAttr<int>("axis",
+                 "(int, default 0)"
+                 "which axis in PriorBox to broadcast for box decode,"
+                 "for example, if axis is 0 and TargetBox has shape"
+                 "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox "
+                 "will broadcast to [N, M, 4] for decoding. It is only valid"
+                 "when code type is decode_center_size")
+        .SetDefault(0)
+        .InEnum({0, 1});
+    AddAttr<std::vector<float>>(
+        "variance",
+        "(vector<float>, default {}),"
+        "variance of prior box with shape [4]. PriorBoxVar and variance can"
+        "not be provided at the same time.")
+        .SetDefault(std::vector<float>{});
     AddOutput("OutputBox",
               "(LoDTensor or Tensor) "
               "When code_type is 'encode_center_size', the output tensor of "
@@ -138,7 +179,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
 and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
 priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
 `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
-encoded/decoded coordinates, width and height.
+encoded/decoded coordinates, width and height. 
+
+During Box Decoding, two modes for broadcast are supported. Say target box has 
+shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior
+box will broadcast to target box along the assigned axis. 
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
index a7af111f63d654319dd1d90d2032956951dfe49e..e078af3eb478a8bebc6a7fc6460d169d803a3c4b 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -9,6 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/box_coder_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -16,11 +19,11 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       const bool normalized, T* output) {
+__global__ void EncodeCenterSizeKernel(
+    const T* prior_box_data, const T* prior_box_var_data,
+    const T* target_box_data, const int row, const int col, const int len,
+    const bool normalized, const T prior_box_var_size, const float* variance,
+    const int var_size, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
@@ -30,11 +33,9 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     T prior_box_height = prior_box_data[col_idx * len + 3] -
                          prior_box_data[col_idx * len + 1] +
                          (normalized == false);
-    T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
+    T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width / 2;
+    T prior_box_center_y =
+        prior_box_data[col_idx * len + 1] + prior_box_height / 2;
 
     T target_box_center_x =
         (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
@@ -55,58 +56,73 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
     output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
     if (prior_box_var_data) {
-      output[idx * len] /= prior_box_var_data[col_idx * len];
-      output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1];
-      output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2];
-      output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3];
+      int prior_var_offset = 0;
+      if (prior_box_var_size == 2) {
+        prior_var_offset = col_idx * len;
+      }
+      output[idx * len] /= prior_box_var_data[prior_var_offset];
+      output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1];
+      output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2];
+      output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3];
+    } else if (var_size == 4) {
+      for (int k = 0; k < 4; ++k) {
+        output[idx * len + k] /= static_cast<T>(variance[k]);
+      }
     }
   }
 }
 
 template <typename T>
-__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       const bool normalized, T* output) {
+__global__ void DecodeCenterSizeKernel(
+    const T* prior_box_data, const T* prior_box_var_data,
+    const T* target_box_data, const int row, const int col, const int len,
+    const bool normalized, const T prior_box_var_size, const float* variance,
+    const int var_size, const int axis, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int prior_box_offset = 0;
   if (idx < row * col) {
     const int col_idx = idx % col;
-    T prior_box_width = prior_box_data[col_idx * len + 2] -
-                        prior_box_data[col_idx * len] + (normalized == false);
-    T prior_box_height = prior_box_data[col_idx * len + 3] -
-                         prior_box_data[col_idx * len + 1] +
+    const int row_idx = idx / col;
+    prior_box_offset = axis == 0 ? col_idx * len : row_idx * len;
+    T prior_box_width = prior_box_data[prior_box_offset + 2] -
+                        prior_box_data[prior_box_offset] +
+                        (normalized == false);
+    T prior_box_height = prior_box_data[prior_box_offset + 3] -
+                         prior_box_data[prior_box_offset + 1] +
                          (normalized == false);
     T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
+        prior_box_data[prior_box_offset] + prior_box_width / 2;
+    T prior_box_center_y =
+        prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
     T target_box_width, target_box_height;
     T target_box_center_x, target_box_center_y;
+    T box_var_x = T(1), box_var_y = T(1);
+    T box_var_w = T(1), box_var_h = T(1);
     if (prior_box_var_data) {
-      target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
-                             target_box_data[idx * len + 2]) *
-                         prior_box_width;
-      target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
-                              target_box_data[idx * len + 3]) *
-                          prior_box_height;
-      target_box_center_x = prior_box_var_data[col_idx * len] *
-                                target_box_data[idx * len] * prior_box_width +
-                            prior_box_center_x;
-      target_box_center_y = prior_box_var_data[col_idx * len + 1] *
-                                target_box_data[idx * len + 1] *
-                                prior_box_height +
-                            prior_box_center_y;
-    } else {
-      target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width;
-      target_box_height =
-          exp(target_box_data[idx * len + 3]) * prior_box_height;
-      target_box_center_x =
-          target_box_data[idx * len] * prior_box_width + prior_box_center_x;
-      target_box_center_y = target_box_data[idx * len + 1] * prior_box_height +
-                            prior_box_center_y;
+      int prior_var_offset = 0;
+      if (prior_box_var_size == 2) {
+        prior_var_offset = axis == 0 ? col_idx * len : row_idx * len;
+      }
+      box_var_x = prior_box_var_data[prior_var_offset];
+      box_var_y = prior_box_var_data[prior_var_offset + 1];
+      box_var_w = prior_box_var_data[prior_var_offset + 2];
+      box_var_h = prior_box_var_data[prior_var_offset + 3];
+    } else if (var_size == 4) {
+      box_var_x = static_cast<T>(variance[0]);
+      box_var_y = static_cast<T>(variance[1]);
+      box_var_w = static_cast<T>(variance[2]);
+      box_var_h = static_cast<T>(variance[3]);
     }
+    target_box_width =
+        exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width;
+    target_box_height =
+        exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height;
+    target_box_center_x =
+        box_var_x * target_box_data[idx * len] * prior_box_width +
+        prior_box_center_x;
+    target_box_center_y =
+        box_var_y * target_box_data[idx * len + 1] * prior_box_height +
+        prior_box_center_y;
 
     output[idx * len] = target_box_center_x - target_box_width / 2;
     output[idx * len + 1] = target_box_center_y - target_box_height / 2;
@@ -127,36 +143,64 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
-
+    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
     const T* prior_box_data = prior_box->data<T>();
     const T* target_box_data = target_box->data<T>();
     const T* prior_box_var_data = nullptr;
-    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+    auto prior_box_var_size = 0;
+    if (prior_box_var) {
+      PADDLE_ENFORCE(variance.empty(),
+                     "Input 'PriorBoxVar' and attribute 'variance' should not"
+                     "be used at the same time.");
+      prior_box_var_data = prior_box_var->data<T>();
+      prior_box_var_size = prior_box_var->dims().size();
+    }
+    if (!(variance.empty())) {
+      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
+                     "Size of attribute 'variance' should be 4");
+    }
 
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                         "Only support 1 level of LoD.");
     }
+    const int var_size = static_cast<int>(variance.size());
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
+    int axis = context.Attr<int>("axis");
+
     auto row = target_box->dims()[0];
     auto col = prior_box->dims()[0];
+    if (code_type == BoxCodeType::kDecodeCenterSize) {
+      col = target_box->dims()[1];
+    }
     auto len = prior_box->dims()[1];
     int block = 512;
     int grid = (row * col + block - 1) / block;
     auto& device_ctx = context.cuda_device_context();
 
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(device_ctx);
+    int bytes = var_size * sizeof(float);
+    auto dev_var = allocator.Allocate(bytes);
+    float* dev_var_data = reinterpret_cast<float*>(dev_var->ptr());
+    auto cplace = platform::CPUPlace();
+    const auto gplace = boost::get<platform::CUDAPlace>(context.GetPlace());
+    memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes,
+                 device_ctx.stream());
+
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
     T* output = output_box->data<T>();
 
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    bool normalized = context.Attr<bool>("box_normalized");
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          normalized, output);
+          normalized, prior_box_var_size, dev_var_data, var_size, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
       DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          normalized, output);
+          normalized, prior_box_var_size, dev_var_data, var_size, axis, output);
     }
   }
 };
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index b2a2bcdce932032a761a1fc064fe622f7629f9bf..a0b1faf7bdc7001eba2d92b4d03fbaf9feb7bcbb 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -34,7 +35,8 @@ class BoxCoderKernel : public framework::OpKernel<T> {
   void EncodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, T* output) const {
+                        const bool normalized,
+                        const std::vector<float> variance, T* output) const {
     int64_t row = target_box->dims()[0];
     int64_t col = prior_box->dims()[0];
     int64_t len = prior_box->dims()[1];
@@ -53,10 +55,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         T prior_box_height = prior_box_data[j * len + 3] -
                              prior_box_data[j * len + 1] +
                              (normalized == false);
-        T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
         T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+            prior_box_data[j * len + 1] + prior_box_height / 2;
 
         T target_box_center_x =
             (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
@@ -78,10 +79,18 @@ class BoxCoderKernel : public framework::OpKernel<T> {
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height));
         if (prior_box_var) {
-          output[offset] /= prior_box_var_data[j * len];
-          output[offset + 1] /= prior_box_var_data[j * len + 1];
-          output[offset + 2] /= prior_box_var_data[j * len + 2];
-          output[offset + 3] /= prior_box_var_data[j * len + 3];
+          int prior_var_offset = 0;
+          if (prior_box_var->dims().size() == 2) {
+            prior_var_offset = j * len;
+          }
+          output[offset] /= prior_box_var_data[prior_var_offset];
+          output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
+          output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
+          output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
+        } else if (!(variance.empty())) {
+          for (int k = 0; k < 4; ++k) {
+            output[offset + k] /= static_cast<T>(variance[k]);
+          }
         }
       }
     }
@@ -89,58 +98,71 @@ class BoxCoderKernel : public framework::OpKernel<T> {
   void DecodeCenterSize(const framework::Tensor* target_box,
                         const framework::Tensor* prior_box,
                         const framework::Tensor* prior_box_var,
-                        const bool normalized, T* output) const {
+                        const bool normalized, const int axis,
+                        const std::vector<float> variance, T* output) const {
     int64_t row = target_box->dims()[0];
-    int64_t col = prior_box->dims()[0];
-    int64_t len = prior_box->dims()[1];
+    int64_t col = target_box->dims()[1];
+    int64_t len = target_box->dims()[2];
 
     auto* target_box_data = target_box->data<T>();
     auto* prior_box_data = prior_box->data<T>();
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
-
+    int prior_box_offset = 0;
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for collapse(2)
 #endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        T prior_box_width = prior_box_data[j * len + 2] -
-                            prior_box_data[j * len] + (normalized == false);
-        T prior_box_height = prior_box_data[j * len + 3] -
-                             prior_box_data[j * len + 1] +
+        if (axis == 0) {
+          prior_box_offset = j * len;
+        } else if (axis == 1) {
+          prior_box_offset = i * len;
+        }
+        T prior_box_width = prior_box_data[prior_box_offset + 2] -
+                            prior_box_data[prior_box_offset] +
+                            (normalized == false);
+        T prior_box_height = prior_box_data[prior_box_offset + 3] -
+                             prior_box_data[prior_box_offset + 1] +
                              (normalized == false);
         T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+            prior_box_data[prior_box_offset] + prior_box_width / 2;
         T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+            prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
 
         T target_box_center_x = 0, target_box_center_y = 0;
         T target_box_width = 0, target_box_height = 0;
+        T box_var_x = T(1), box_var_y = T(1);
+        T box_var_w = T(1), box_var_h = T(1);
         if (prior_box_var) {
-          target_box_center_x = prior_box_var_data[j * len] *
-                                    target_box_data[offset] * prior_box_width +
-                                prior_box_center_x;
-          target_box_center_y = prior_box_var_data[j * len + 1] *
-                                    target_box_data[offset + 1] *
-                                    prior_box_height +
-                                prior_box_center_y;
-          target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                      target_box_data[offset + 2]) *
-                             prior_box_width;
-          target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                       target_box_data[offset + 3]) *
-                              prior_box_height;
-        } else {
-          target_box_center_x =
-              target_box_data[offset] * prior_box_width + prior_box_center_x;
-          target_box_center_y = target_box_data[offset + 1] * prior_box_height +
-                                prior_box_center_y;
-          target_box_width =
-              std::exp(target_box_data[offset + 2]) * prior_box_width;
-          target_box_height =
-              std::exp(target_box_data[offset + 3]) * prior_box_height;
+          int prior_var_offset = 0;
+          if (prior_box_var->dims().size() == 2) {
+            if (axis == 0)
+              prior_var_offset = j * len;
+            else if (axis == 1)
+              prior_var_offset = i * len;
+          }
+          box_var_x = prior_box_var_data[prior_var_offset];
+          box_var_y = prior_box_var_data[prior_var_offset + 1];
+          box_var_w = prior_box_var_data[prior_var_offset + 2];
+          box_var_h = prior_box_var_data[prior_var_offset + 3];
+        } else if (!(variance.empty())) {
+          box_var_x = static_cast<T>(variance[0]);
+          box_var_y = static_cast<T>(variance[1]);
+          box_var_w = static_cast<T>(variance[2]);
+          box_var_h = static_cast<T>(variance[3]);
         }
+        target_box_center_x =
+            box_var_x * target_box_data[offset] * prior_box_width +
+            prior_box_center_x;
+        target_box_center_y =
+            box_var_y * target_box_data[offset + 1] * prior_box_height +
+            prior_box_center_y;
+        target_box_width =
+            std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
+        target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) *
+                            prior_box_height;
 
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
@@ -157,26 +179,40 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
-
+    std::vector<float> variance = context.Attr<std::vector<float>>("variance");
+    const int axis = context.Attr<int>("axis");
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
                         "Only support 1 level of LoD.");
     }
+    if (prior_box_var) {
+      PADDLE_ENFORCE(variance.empty(),
+                     "Input 'PriorBoxVar' and attribute 'variance' should not"
+                     "be used at the same time.");
+    }
+    if (!(variance.empty())) {
+      PADDLE_ENFORCE(static_cast<int>(variance.size()) == 4,
+                     "Size of attribute 'variance' should be 4");
+    }
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
+
     auto row = target_box->dims()[0];
     auto col = prior_box->dims()[0];
+    if (code_type == BoxCodeType::kDecodeCenterSize) {
+      col = target_box->dims()[1];
+    }
     auto len = prior_box->dims()[1];
 
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
 
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    bool normalized = context.Attr<bool>("box_normalized");
     T* output = output_box->data<T>();
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
-                       output);
+                       variance, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized,
-                       output);
+      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized, axis,
+                       variance, output);
     }
   }
 };
diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
similarity index 69%
rename from paddle/fluid/operators/yolov3_loss_op.cc
rename to paddle/fluid/operators/detection/yolov3_loss_op.cc
index 60508f7ab871910c38f1e4aa04c2035075d37df5..2a69ad4b53c26f5e2e0547e75e0d9c6518a8bcba 100644
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -9,7 +9,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/yolov3_loss_op.h"
+#include "paddle/fluid/operators/detection/yolov3_loss_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -29,23 +29,33 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                    "Input(GTLabel) of Yolov3LossOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Loss"),
                    "Output(Loss) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ObjectnessMask"),
+        "Output(ObjectnessMask) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"),
+                   "Output(GTMatchMask) of Yolov3LossOp should not be null.");
 
     auto dim_x = ctx->GetInputDim("X");
     auto dim_gtbox = ctx->GetInputDim("GTBox");
     auto dim_gtlabel = ctx->GetInputDim("GTLabel");
     auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    int anchor_num = anchors.size() / 2;
+    auto anchor_mask = ctx->Attrs().Get<std::vector<int>>("anchor_mask");
+    int mask_num = anchor_mask.size();
     auto class_num = ctx->Attrs().Get<int>("class_num");
+
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
     PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
                       "Input(X) dim[3] and dim[4] should be euqal.");
-    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
-                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
-                      "+ class_num)).");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], mask_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
     PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
                       "Input(GTBox) should be a 3-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
     PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
-                      "Input(GTBox) should be a 2-D tensor");
+                      "Input(GTLabel) should be a 2-D tensor");
     PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
                       "Input(GTBox) and Input(GTLabel) dim[0] should be same");
     PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
@@ -54,11 +64,22 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
                       "Attr(anchors) length should be greater then 0.");
     PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
                       "Attr(anchors) length should be even integer.");
+    for (size_t i = 0; i < anchor_mask.size(); i++) {
+      PADDLE_ENFORCE_LT(
+          anchor_mask[i], anchor_num,
+          "Attr(anchor_mask) should not crossover Attr(anchors).");
+    }
     PADDLE_ENFORCE_GT(class_num, 0,
                       "Attr(class_num) should be an integer greater then 0.");
 
-    std::vector<int64_t> dim_out({1});
+    std::vector<int64_t> dim_out({dim_x[0]});
     ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
+
+    std::vector<int64_t> dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]});
+    ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask));
+
+    std::vector<int64_t> dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]});
+    ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask));
   }
 
  protected:
@@ -73,11 +94,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of YOLO v3 loss operator, "
+             "The input tensor of YOLOv3 loss operator, "
              "This is a 4-D tensor with shape of [N, C, H, W]."
              "H and W should be same, and the second dimention(C) stores"
              "box locations, confidence score and classification one-hot"
-             "key of each anchor box");
+             "keys of each anchor box");
     AddInput("GTBox",
              "The input tensor of ground truth boxes, "
              "This is a 3-D tensor with shape of [N, max_box_num, 5], "
@@ -89,32 +110,39 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("GTLabel",
              "The input tensor of ground truth label, "
              "This is a 2-D tensor with shape of [N, max_box_num], "
-             "and each element shoudl be an integer to indicate the "
+             "and each element should be an integer to indicate the "
              "box class id.");
     AddOutput("Loss",
               "The output yolov3 loss tensor, "
-              "This is a 1-D tensor with shape of [1]");
+              "This is a 1-D tensor with shape of [N]");
+    AddOutput("ObjectnessMask",
+              "This is an intermediate tensor with shape of [N, M, H, W], "
+              "M is the number of anchor masks. This parameter caches the "
+              "mask for calculate objectness loss in gradient kernel.")
+        .AsIntermediate();
+    AddOutput("GTMatchMask",
+              "This is an intermediate tensor with shape of [N, B], "
+              "B is the max box number of GT boxes. This parameter caches "
+              "matched mask index of each GT boxes for gradient calculate.")
+        .AsIntermediate();
 
     AddAttr<int>("class_num", "The number of classes to predict.");
     AddAttr<std::vector<int>>("anchors",
                               "The anchor width and height, "
-                              "it will be parsed pair by pair.");
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<std::vector<int>>("anchor_mask",
+                              "The mask index of anchors used in "
+                              "current YOLOv3 loss calculation.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YOLOv3 loss "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and thrid YOLOv3 loss operators.")
+        .SetDefault(32);
     AddAttr<float>("ignore_thresh",
-                   "The ignore threshold to ignore confidence loss.");
-    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
-        .SetDefault(1.0);
-    AddAttr<float>(
-        "loss_weight_conf_target",
-        "The weight of confidence score loss in locations with target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_conf_notarget",
-                   "The weight of confidence score loss in locations without "
-                   "target object.")
-        .SetDefault(1.0);
-    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
-        .SetDefault(1.0);
+                   "The ignore threshold to ignore confidence loss.")
+        .SetDefault(0.7);
     AddComment(R"DOC(
          This operator generate yolov3 loss by given predict result and ground
          truth boxes.
@@ -147,17 +175,28 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          thresh, the confidence score loss of this anchor box will be ignored.
 
          Therefore, the yolov3 loss consist of three major parts, box location loss,
-         confidence score loss, and classification loss. The MSE loss is used for 
-         box location, and binary cross entropy loss is used for confidence score 
-         loss and classification loss.
+         confidence score loss, and classification loss. The L2 loss is used for 
+         box coordinates (w, h), and sigmoid cross entropy loss is used for box 
+         coordinates (x, y), confidence score loss and classification loss.
+
+         Each groud truth box find a best matching anchor box in all anchors, 
+         prediction of this anchor box will incur all three parts of losses, and
+         prediction of anchor boxes with no GT box matched will only incur objectness
+         loss.
+
+         In order to trade off box coordinate losses between big boxes and small 
+         boxes, box coordinate losses will be mutiplied by scale weight, which is
+         calculated as follow.
+
+         $$
+         weight_{box} = 2.0 - t_w * t_h
+         $$
 
          Final loss will be represented as follow.
 
          $$
-         loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
-              + \loss_weight_{conf_target} * loss_{conf_target}
-              + \loss_weight_{conf_notarget} * loss_{conf_notarget}
-              + \loss_weight_{class} * loss_{class}
+         loss = (loss_{xy} + loss_{wh}) * weight_{box}
+              + loss_{conf} + loss_{class}
          $$
          )DOC");
   }
@@ -196,6 +235,8 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("GTBox", Input("GTBox"));
     op->SetInput("GTLabel", Input("GTLabel"));
     op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
+    op->SetInput("GTMatchMask", Output("GTMatchMask"));
 
     op->SetAttrMap(Attrs());
 
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8407d4e6e8f87a2e8d073c4fbda5691abe1bba68
--- /dev/null
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.h
@@ -0,0 +1,447 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+static inline bool LessEqualZero(T x) {
+  return x < 1e-6;
+}
+
+template <typename T>
+static T SigmoidCrossEntropy(T x, T label) {
+  return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x)));
+}
+
+template <typename T>
+static T L2Loss(T x, T y) {
+  return 0.5 * (y - x) * (y - x);
+}
+
+template <typename T>
+static T SigmoidCrossEntropyGrad(T x, T label) {
+  return 1.0 / (1.0 + std::exp(-x)) - label;
+}
+
+template <typename T>
+static T L2LossGrad(T x, T y) {
+  return x - y;
+}
+
+static int GetMaskIndex(std::vector<int> mask, int val) {
+  for (size_t i = 0; i < mask.size(); i++) {
+    if (mask[i] == val) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+template <typename T>
+struct Box {
+  T x, y, w, h;
+};
+
+template <typename T>
+static inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+static inline Box<T> GetYoloBox(const T* x, std::vector<int> anchors, int i,
+                                int j, int an_idx, int grid_size,
+                                int input_size, int index, int stride) {
+  Box<T> b;
+  b.x = (i + sigmoid<T>(x[index])) / grid_size;
+  b.y = (j + sigmoid<T>(x[index + stride])) / grid_size;
+  b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size;
+  b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size;
+  return b;
+}
+
+template <typename T>
+static inline Box<T> GetGtBox(const T* gt, int batch, int max_boxes, int idx) {
+  Box<T> b;
+  b.x = gt[(batch * max_boxes + idx) * 4];
+  b.y = gt[(batch * max_boxes + idx) * 4 + 1];
+  b.w = gt[(batch * max_boxes + idx) * 4 + 2];
+  b.h = gt[(batch * max_boxes + idx) * 4 + 3];
+  return b;
+}
+
+template <typename T>
+static inline T BoxOverlap(T c1, T w1, T c2, T w2) {
+  T l1 = c1 - w1 / 2.0;
+  T l2 = c2 - w2 / 2.0;
+  T left = l1 > l2 ? l1 : l2;
+  T r1 = c1 + w1 / 2.0;
+  T r2 = c2 + w2 / 2.0;
+  T right = r1 < r2 ? r1 : r2;
+  return right - left;
+}
+
+template <typename T>
+static inline T CalcBoxIoU(Box<T> b1, Box<T> b2) {
+  T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w);
+  T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h);
+  T inter_area = (w < 0 || h < 0) ? 0.0 : w * h;
+  T union_area = b1.w * b1.h + b2.w * b2.h - inter_area;
+  return inter_area / union_area;
+}
+
+static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num,
+                                int an_stride, int stride, int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
+                                std::vector<int> anchors, int an_idx,
+                                int box_idx, int gi, int gj, int grid_size,
+                                int input_size, int stride) {
+  T tx = gt.x * grid_size - gi;
+  T ty = gt.y * grid_size - gj;
+  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
+  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
+
+  T scale = (2.0 - gt.w * gt.h);
+  loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
+  loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
+  loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
+  loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
+}
+
+template <typename T>
+static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
+                                    Box<T> gt, std::vector<int> anchors,
+                                    int an_idx, int box_idx, int gi, int gj,
+                                    int grid_size, int input_size, int stride) {
+  T tx = gt.x * grid_size - gi;
+  T ty = gt.y * grid_size - gj;
+  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
+  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
+
+  T scale = (2.0 - gt.w * gt.h);
+  input_grad[box_idx] =
+      SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
+  input_grad[box_idx + stride] =
+      SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
+  input_grad[box_idx + 2 * stride] =
+      L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
+  input_grad[box_idx + 3 * stride] =
+      L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
+}
+
+template <typename T>
+static inline void CalcLabelLoss(T* loss, const T* input, const int index,
+                                 const int label, const int class_num,
+                                 const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    T pred = input[index + i * stride];
+    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
+  }
+}
+
+template <typename T>
+static inline void CalcLabelLossGrad(T* input_grad, const T loss,
+                                     const T* input, const int index,
+                                     const int label, const int class_num,
+                                     const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    T pred = input[index + i * stride];
+    input_grad[index + i * stride] =
+        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
+  }
+}
+
+template <typename T>
+static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
+                                   const int n, const int an_num, const int h,
+                                   const int w, const int stride,
+                                   const int an_stride) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          T obj = objness[k * w + l];
+          if (obj > 1e-5) {
+            // positive sample: obj = 1
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
+          } else if (obj > -0.5) {
+            // negetive sample: obj = 0
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
+          }
+        }
+      }
+      objness += stride;
+      input += an_stride;
+    }
+  }
+}
+
+template <typename T>
+static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
+                                       const T* input, const T* objness,
+                                       const int n, const int an_num,
+                                       const int h, const int w,
+                                       const int stride, const int an_stride) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          T obj = objness[k * w + l];
+          if (obj > 1e-5) {
+            input_grad[k * w + l] =
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
+          } else if (obj > -0.5) {
+            input_grad[k * w + l] =
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
+          }
+        }
+      }
+      objness += stride;
+      input += an_stride;
+      input_grad += an_stride;
+    }
+  }
+}
+
+template <typename T>
+static void inline GtValid(bool* valid, const T* gtbox, const int n,
+                           const int b) {
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < b; j++) {
+      if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) {
+        valid[j] = false;
+      } else {
+        valid[j] = true;
+      }
+    }
+    valid += b;
+    gtbox += b * 4;
+  }
+}
+
+template <typename T>
+class Yolov3LossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* loss = ctx.Output<Tensor>("Loss");
+    auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
+    auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
+    int class_num = ctx.Attr<int>("class_num");
+    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int an_num = anchors.size() / 2;
+    const int mask_num = anchor_mask.size();
+    const int b = gt_box->dims()[1];
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    const T* input_data = input->data<T>();
+    const T* gt_box_data = gt_box->data<T>();
+    const int* gt_label_data = gt_label->data<int>();
+    T* loss_data = loss->mutable_data<T>({n}, ctx.GetPlace());
+    memset(loss_data, 0, loss->numel() * sizeof(T));
+    T* obj_mask_data =
+        objness_mask->mutable_data<T>({n, mask_num, h, w}, ctx.GetPlace());
+    memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T));
+    int* gt_match_mask_data =
+        gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
+
+    // calc valid gt box mask, avoid calc duplicately in following code
+    Tensor gt_valid_mask;
+    bool* gt_valid_mask_data =
+        gt_valid_mask.mutable_data<bool>({n, b}, ctx.GetPlace());
+    GtValid<T>(gt_valid_mask_data, gt_box_data, n, b);
+
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < mask_num; j++) {
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            // each predict box find a best match gt box, if overlap is bigger
+            // then ignore_thresh, ignore the objectness loss.
+            int box_idx =
+                GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0);
+            Box<T> pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j],
+                                     h, input_size, box_idx, stride);
+            T best_iou = 0;
+            for (int t = 0; t < b; t++) {
+              if (!gt_valid_mask_data[i * b + t]) {
+                continue;
+              }
+              Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+              T iou = CalcBoxIoU(pred, gt);
+              if (iou > best_iou) {
+                best_iou = iou;
+              }
+            }
+
+            // If best IoU is bigger then ignore_thresh,
+            // ignore the objectness loss.
+            if (best_iou > ignore_thresh) {
+              int obj_idx = (i * mask_num + j) * stride + k * w + l;
+              obj_mask_data[obj_idx] = static_cast<T>(-1);
+            }
+            // all losses should be calculated if best IoU
+            // is bigger then truth thresh, but currently,
+            // truth thresh is an unreachable value as 1.0.
+          }
+        }
+      }
+      for (int t = 0; t < b; t++) {
+        if (!gt_valid_mask_data[i * b + t]) {
+          gt_match_mask_data[i * b + t] = -1;
+          continue;
+        }
+        Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+        int gi = static_cast<int>(gt.x * w);
+        int gj = static_cast<int>(gt.y * h);
+        Box<T> gt_shift = gt;
+        gt_shift.x = 0.0;
+        gt_shift.y = 0.0;
+        T best_iou = 0.0;
+        int best_n = 0;
+        // each gt box find a best match anchor box as positive sample,
+        // for positive sample, all losses should be calculated, and for
+        // other samples, only objectness loss is required.
+        for (int an_idx = 0; an_idx < an_num; an_idx++) {
+          Box<T> an_box;
+          an_box.x = 0.0;
+          an_box.y = 0.0;
+          an_box.w = anchors[2 * an_idx] / static_cast<T>(input_size);
+          an_box.h = anchors[2 * an_idx + 1] / static_cast<T>(input_size);
+          float iou = CalcBoxIoU<T>(an_box, gt_shift);
+          if (iou > best_iou) {
+            best_iou = iou;
+            best_n = an_idx;
+          }
+        }
+
+        int mask_idx = GetMaskIndex(anchor_mask, best_n);
+        gt_match_mask_data[i * b + t] = mask_idx;
+        if (mask_idx >= 0) {
+          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                      an_stride, stride, 0);
+          CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
+                                 box_idx, gi, gj, h, input_size, stride);
+
+          int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
+          obj_mask_data[obj_idx] = 1.0;
+
+          int label = gt_label_data[i * b + t];
+          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                        an_stride, stride, 5);
+          CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
+                           class_num, stride);
+        }
+      }
+    }
+
+    CalcObjnessLoss<T>(loss_data, input_data + 4 * stride, obj_mask_data, n,
+                       mask_num, h, w, stride, an_stride);
+  }
+};
+
+template <typename T>
+class Yolov3LossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
+    auto* gt_match_mask = ctx.Input<Tensor>("GTMatchMask");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
+    int class_num = ctx.Attr<int>("class_num");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+
+    const int n = input_grad->dims()[0];
+    const int c = input_grad->dims()[1];
+    const int h = input_grad->dims()[2];
+    const int w = input_grad->dims()[3];
+    const int mask_num = anchor_mask.size();
+    const int b = gt_match_mask->dims()[1];
+    int input_size = downsample_ratio * h;
+
+    const int stride = h * w;
+    const int an_stride = (class_num + 5) * stride;
+
+    const T* input_data = input->data<T>();
+    const T* gt_box_data = gt_box->data<T>();
+    const int* gt_label_data = gt_label->data<int>();
+    const T* loss_grad_data = loss_grad->data<T>();
+    const T* obj_mask_data = objness_mask->data<T>();
+    const int* gt_match_mask_data = gt_match_mask->data<int>();
+    T* input_grad_data =
+        input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
+
+    for (int i = 0; i < n; i++) {
+      for (int t = 0; t < b; t++) {
+        int mask_idx = gt_match_mask_data[i * b + t];
+        if (mask_idx >= 0) {
+          Box<T> gt = GetGtBox(gt_box_data, i, b, t);
+          int gi = static_cast<int>(gt.x * w);
+          int gj = static_cast<int>(gt.y * h);
+
+          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                      an_stride, stride, 0);
+          CalcBoxLocationLossGrad<T>(
+              input_grad_data, loss_grad_data[i], input_data, gt, anchors,
+              anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride);
+
+          int label = gt_label_data[i * b + t];
+          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
+                                        an_stride, stride, 5);
+          CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
+                               label_idx, label, class_num, stride);
+        }
+      }
+    }
+
+    CalcObjnessLossGrad<T>(input_grad_data + 4 * stride, loss_grad_data,
+                           input_data + 4 * stride, obj_mask_data, n, mask_num,
+                           h, w, stride, an_stride);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index d65491267de1ce3495d8b8250cf0cff570dfcc6a..7a6927d3e54b4ece8f17d7a1e7e431ba836edff9 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -114,4 +114,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
+    ops::DropoutGradKernel<plat::CUDADeviceContext, plat::float16>,
     ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/elementwise/elementwise_add_mkldnn_op.cc
rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
rename to paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
index a35ee8a09ed5ddcc4ac465d200b84358fa65b2f3..e9e2a3b1f5c1c00bb2e95b6171ecd09bfe7a0d21 100644
--- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc
@@ -79,17 +79,17 @@ void FusionRepeatedFCReluOpMaker::Make() {
 }
 
 template <typename T>
-static void fc_relu(const T* x, const T* w, const T* b, T* y, int m, int n,
-                    int k) {
+static void fc_relu(const T* x, const T* w, const T* b, T* y,
+                    const jit::matmul_attr_t& attr) {
   auto matmul =
-      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+      jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
   auto addbias_relu =
-      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(n);
-  matmul(x, w, y, m, n, k);
+      jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
+  matmul(x, w, y, &attr);
   T* dst = y;
-  for (int i = 0; i < m; ++i) {
-    addbias_relu(b, dst, dst, n);
-    dst += n;
+  for (int i = 0; i < attr.m; ++i) {
+    addbias_relu(b, dst, dst, attr.n);
+    dst += attr.n;
   }
 }
 
@@ -107,32 +107,33 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel<T> {
 
     auto i_dims = in->dims();
     auto w_dims = weights[0]->dims();
-    int m = i_dims[0];
-    int n = w_dims[1];
-    int k = w_dims[0];
-    relus[0]->Resize({m, n});
+    jit::matmul_attr_t attr;
+    attr.m = i_dims[0];
+    attr.n = w_dims[1];
+    attr.k = w_dims[0];
+    relus[0]->Resize({attr.m, attr.n});
     fc_relu(in->data<T>(), weights[0]->data<T>(), biases[0]->data<T>(),
-            relus[0]->mutable_data<T>(place), m, n, k);
+            relus[0]->mutable_data<T>(place), attr);
 
     for (int i = 1; i < weight_sz - 1; ++i) {
       auto i_dims = relus[i - 1]->dims();
       auto w_dims = weights[i]->dims();
-      int m = i_dims[0];
-      int n = w_dims[1];
-      int k = w_dims[0];
-      relus[i]->Resize({m, n});
+      attr.m = i_dims[0];
+      attr.n = w_dims[1];
+      attr.k = w_dims[0];
+      relus[i]->Resize({attr.m, attr.n});
       fc_relu(relus[i - 1]->data<T>(), weights[i]->data<T>(),
-              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), m, n, k);
+              biases[i]->data<T>(), relus[i]->mutable_data<T>(place), attr);
     }
 
     auto i_dims_last = relus[weight_sz - 2]->dims();
     auto w_dims_last = weights[weight_sz - 1]->dims();
-    m = i_dims_last[0];
-    n = w_dims_last[1];
-    k = w_dims_last[0];
+    attr.m = i_dims_last[0];
+    attr.n = w_dims_last[1];
+    attr.k = w_dims_last[0];
     fc_relu(relus[weight_sz - 2]->data<T>(), weights[weight_sz - 1]->data<T>(),
-            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place), m, n,
-            k);
+            biases[weight_sz - 1]->data<T>(), out->mutable_data<T>(place),
+            attr);
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
index 00dafdead53bbd4614c70875441c565724fca46d..8c8b079633aacb711aa304ec7016c37c6bec61ce 100644
--- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
+++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc
@@ -87,15 +87,18 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
 
     auto x_dims = x->dims();
     auto y_dims = y->dims();
-    int m = x_dims[0];
-    int k = x_dims[1];
-    int n = y_dims[1];
-    int o_numel = m * n;
+    jit::matmul_attr_t attr;
+    attr.m = x_dims[0];
+    attr.k = x_dims[1];
+    attr.n = y_dims[1];
+    int o_numel = attr.m * attr.n;
 
     auto vsquare_x =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(m * k);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
+                                                                       attr.k);
     auto vsquare_y =
-        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(k * n);
+        jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
+                                                                       attr.n);
     auto vsquare_xy =
         jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
     auto vsub =
@@ -103,7 +106,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     auto vscal =
         jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
     auto matmul =
-        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(k);
+        jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
 
     const T* x_data = x->data<T>();
     const T* y_data = y->data<T>();
@@ -112,12 +115,12 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
     T* squared_xy_data = squared_xy->mutable_data<T>(place);
     T* o_data = out->mutable_data<T>(place);
 
-    matmul(x_data, y_data, squared_xy_data, m, n, k);
+    matmul(x_data, y_data, squared_xy_data, &attr);
     vsquare_xy(squared_xy_data, squared_xy_data, o_numel);
 
-    vsquare_x(x_data, squared_x_data, m * k);
-    vsquare_y(y_data, squared_y_data, k * n);
-    matmul(squared_x_data, squared_y_data, o_data, m, n, k);
+    vsquare_x(x_data, squared_x_data, attr.m * attr.k);
+    vsquare_y(y_data, squared_y_data, attr.k * attr.n);
+    matmul(squared_x_data, squared_y_data, o_data, &attr);
 
     vsub(squared_xy_data, o_data, o_data, o_numel);
     vscal(&scalar, o_data, o_data, o_numel);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 9f4aef08cd58e72ce344a640e6564b9e360ce169..490ba9a585ee8fac82a9e1178f506a6d39e5fd1c 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -31,7 +31,7 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
 
     output->mutable_data<T>(ctx.GetPlace());
-
+    if (x->numel() == 0) return;
     GPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -45,14 +45,13 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *x = ctx.Input<Tensor>("X");
 
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
     auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
-
+    if (dO->numel() == 0) return;
     GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
@@ -61,11 +60,14 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
                         ops::GatherOpCUDAKernel<double>,
                         ops::GatherOpCUDAKernel<int64_t>,
-                        ops::GatherOpCUDAKernel<int>);
+                        ops::GatherOpCUDAKernel<int>,
+                        ops::GatherOpCUDAKernel<plat::float16>);
 REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
                         ops::GatherGradOpCUDAKernel<double>,
                         ops::GatherGradOpCUDAKernel<int64_t>,
-                        ops::GatherGradOpCUDAKernel<int>);
+                        ops::GatherGradOpCUDAKernel<int>,
+                        ops::GatherGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 2dd726bebb1bc2e4d83844c0b98df01c390e622f..2e18298cf8e34d5f70369c89b3b3b2a9ced0ce62 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -35,7 +35,7 @@ class GatherOpKernel : public framework::OpKernel<T> {
     auto *output = ctx.Output<Tensor>("Out");
 
     output->mutable_data<T>(ctx.GetPlace());
-
+    if (x->numel() == 0) return;
     CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
@@ -56,7 +56,7 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
-
+    if (dO->numel() == 0) return;
     ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 93dd3f794f6087a3158fee1f262795871f21611a..de91ba6270ac2ed22c8380878c0a0037fb1629c0 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -82,6 +82,18 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
                          "bilinear interpolation and \"nearest\" for nearest "
                          "neighbor interpolation.")
         .SetDefault("bilinear");
+    AddAttr<bool>(
+        "align_corners",
+        "an optinal bool. Defaults to True. "
+        "If True, the centers of 4 corner pixels of the input and output "
+        "tensors are aligned, preserving the values at the corner pixels, "
+        "if Flase, are not aligned")
+        .SetDefault(true);
+    AddAttr<int>("align_mode",
+                 "(int, default \'1\'), optional for bilinear interpolation"
+                 "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , "
+                 "can be \'1\' for src_idx = scale*dst_index .")
+        .SetDefault(1);
     AddComment(R"DOC(
           This operator samples input X to given output shape by using specified
           interpolation method, the interpolation methods can be \"nearest\"
@@ -98,6 +110,64 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
           to perform linear interpolation first in one direction, and then 
           again in the other direction.
 
+          Align_corners and align_mode are optinal parameters,the calculation method 
+          of interpolation can be selected by them.
+          
+          Example:
+
+          For scale:
+          
+            if align_corners = True and out_{size}>1 :
+
+              scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0)
+            
+            else:
+              
+              scale_{factor} = float(in_{size}/out_{size})
+            
+          
+          Nearest neighbor interpolation:
+          
+          if:
+              align_corners = False
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
+              W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+
+          else:
+              align_corners = True
+
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = round(H_{in} * scale_{factor})
+              W_out = round(W_{in} * scale_{factor})
+
+          Bilinear interpolation:
+
+          if:
+              align_corners = False , align_mode = 0
+              
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+              
+              H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+              W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+          else:
+           
+              input : (N,C,H_in,W_in)
+              output: (N,C,H_out,W_out) where:
+
+              H_out = H_{in} * scale_{factor}
+              W_out = W_{in} * scale_{factor}
+
+          
+
           For details of nearest neighbor interpolation, please refer to Wikipedia: 
           https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
 
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index 99ac725f73bf60ab0fb9a467432e8a57c646ef35..b887878ea2291d6c56fec91738784e338606b84f 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -23,7 +23,8 @@ __global__ void KeNearestNeighborInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
     const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
@@ -35,10 +36,14 @@ __global__ void KeNearestNeighborInterpFw(
     int channel_id = out_id_w / out_img_size;
 
     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
 
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
 
     out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
                   in_img_idy * in_img_w + in_img_idx];
@@ -50,7 +55,8 @@ __global__ void KeNearestNeighborInterpBw(
     T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
     const size_t input_w, const T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
@@ -62,10 +68,14 @@ __global__ void KeNearestNeighborInterpBw(
     int channel_id = out_id_w / out_img_size;
 
     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = static_cast<int>(ratio_h * out_img_idy + 0.5);
+    int in_img_idy = (align_corners)
+                         ? static_cast<int>(ratio_h * out_img_idy + 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
 
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = static_cast<int>(ratio_w * out_img_idx + 0.5);
+    int in_img_idx = (align_corners)
+                         ? static_cast<int>(ratio_w * out_img_idx + 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
 
     T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
                     in_img_idy * in_img_w + in_img_idx];
@@ -79,10 +89,12 @@ __global__ void KeBilinearInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
     const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w) {
+    const size_t num_channels, const float ratio_h, const float ratio_w,
+    const bool align_corners, const int align_mode) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (; tid < nthreads; tid += stride) {
     int out_id_h = tid / output_w;
     int out_id_w = tid % output_w;
@@ -91,15 +103,23 @@ __global__ void KeBilinearInterpFw(
     int channel_id = out_id_w / out_img_size;
 
     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
+    int in_img_idy = align_flag
+                         ? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * out_img_idy);
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
     int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy
+                            : ratio_h * out_img_idy - in_img_idy;
     T h2lambda = 1.f - h1lambda;
 
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratio_w * out_img_idx;
+    int in_img_idx = align_flag
+                         ? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
+                         : static_cast<int>(ratio_w * out_img_idx);
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
     int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx
+                            : ratio_w * out_img_idx - in_img_idx;
     T w2lambda = 1.f - w1lambda;
 
     const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
@@ -118,10 +138,12 @@ __global__ void KeBilinearInterpBw(
     T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
     const size_t input_w, const T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratio_w) {
+    const size_t num_channels, const T ratio_h, const T ratio_w,
+    const bool align_corners, const int align_mode) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (; tid < nthreads; tid += stride) {
     int out_id_h = tid / output_w;
     int out_id_w = tid % output_w;
@@ -130,15 +152,22 @@ __global__ void KeBilinearInterpBw(
     int channel_id = out_id_w / out_img_size;
 
     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
+    int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5
+                                : ratio_h * out_img_idy;
+    in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
     int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h1lambda = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - in_img_idy
+                            : ratio_h * out_img_idy - in_img_idy;
+
     T h2lambda = 1.f - h1lambda;
 
     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratio_w * out_img_idx;
+    int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5
+                                : ratio_w * out_img_idx;
+    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
     int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratio_w * out_img_idx - in_img_idx;
+    T w1lambda = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - in_img_idx
+                            : ratio_w * out_img_idx - in_img_idx;
     T w2lambda = 1.f - w1lambda;
 
     T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
@@ -175,6 +204,9 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
       out_w = size_data[1];
     }
 
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
+
     int n = input->dims()[0];
     int c = input->dims()[1];
     int in_h = input->dims()[2];
@@ -188,10 +220,16 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
     int in_chw = c * in_hw;
     int out_chw = c * out_hw;
 
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
     if (in_h == out_h && in_w == out_w) {
       framework::TensorCopy(*input, ctx.GetPlace(), output);
@@ -206,12 +244,12 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
       KeNearestNeighborInterpFw<
           T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-          out_chw, c, ratio_h, ratio_w);
+          out_chw, c, ratio_h, ratio_w, align_corners);
     } else if ("bilinear" == interp_method) {
       KeBilinearInterpFw<
           T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
-          out_chw, c, ratio_h, ratio_w);
+          out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
@@ -234,6 +272,10 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
     auto out_size = ctx.Input<Tensor>("OutSize");
+
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
+
     if (out_size != nullptr) {
       Tensor sizes;
       framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
@@ -252,10 +294,16 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
     int in_chw = c * in_hw;
     int out_chw = c * out_hw;
 
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
     if (in_h == out_h && in_w == out_w) {
       framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
@@ -270,12 +318,12 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
       KeNearestNeighborInterpBw<
           T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
-          out_w, n, out_chw, c, ratio_h, ratio_w);
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners);
     } else if ("bilinear" == interp_method) {
       KeBilinearInterpBw<
           T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
           input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h,
-          out_w, n, out_chw, c, ratio_h, ratio_w);
+          out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode);
     }
   }
 };
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
index 7fdb3e1f5a2ff82284d89dd0759e357978e1d873..c631ad1dd158ce114169602f073d69b2291b5b3b 100644
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -26,14 +26,17 @@ template <typename T>
 static void NearestNeighborInterpolate(const Tensor& input, Tensor* output,
                                        const float ratio_h, const float ratio_w,
                                        const int n, const int c,
-                                       const int out_h, const int out_w) {
+                                       const int out_h, const int out_w,
+                                       const bool align_corners) {
   auto input_t = EigenTensor<T, 4>::From(input);
   auto output_t = EigenTensor<T, 4>::From(*output);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
 
     for (int l = 0; l < out_w; l++) {
-      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
 
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
@@ -48,20 +51,29 @@ template <typename T>
 static void BilinearInterpolation(const Tensor& input, Tensor* output,
                                   const float ratio_h, const float ratio_w,
                                   const int in_h, const int in_w, const int n,
-                                  const int c, const int out_h,
-                                  const int out_w) {
+                                  const int c, const int out_h, const int out_w,
+                                  const bool align_corners,
+                                  const bool align_mode) {
   auto input_t = EigenTensor<T, 4>::From(input);
   auto output_t = EigenTensor<T, 4>::From(*output);
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = static_cast<int>(ratio_h * k);
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float d_n = ratio_h * k - y_n;
+    float d_n =
+        align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
     float d_s = 1.f - d_n;
 
     for (int l = 0; l < out_w; l++) {
-      int x_w = static_cast<int>(ratio_w * l);
+      int x_w = (align_mode == 0 && !align_corners)
+                    ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                    : static_cast<int>(ratio_w * l);
+      x_w = (x_w > 0) ? x_w : 0;
       int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float d_w = ratio_w * l - x_w;
+      float d_w =
+          align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
       float d_e = 1.f - d_w;
 
       for (int i = 0; i < n; i++) {    // loop for batches
@@ -78,19 +90,20 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
 }
 
 template <typename T>
-static void NearestNeighborInterpolateGrad(const Tensor& output_grad,
-                                           Tensor* input_grad,
-                                           const float ratio_h,
-                                           const float ratio_w, const int n,
-                                           const int c, const int out_h,
-                                           const int out_w) {
+static void NearestNeighborInterpolateGrad(
+    const Tensor& output_grad, Tensor* input_grad, const float ratio_h,
+    const float ratio_w, const int n, const int c, const int out_h,
+    const int out_w, const bool align_corners) {
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = static_cast<int>(ratio_h * k + 0.5);
+    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
+                               : static_cast<int>(ratio_h * k);
 
     for (int l = 0; l < out_w; l++) {
-      int in_l = static_cast<int>(ratio_w * l + 0.5);
+      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
+                                 : static_cast<int>(ratio_w * l);
 
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
@@ -106,19 +119,28 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
                                       Tensor* input_grad, const float ratio_h,
                                       const float ratio_w, const int in_h,
                                       const int in_w, const int n, const int c,
-                                      const int out_h, const int out_w) {
+                                      const int out_h, const int out_w,
+                                      const bool align_corners,
+                                      const int align_mode) {
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  bool align_flag = (align_mode == 0 && !align_corners);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = static_cast<int>(ratio_h * k);
+    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
+                         : static_cast<int>(ratio_h * k);
+    y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float d_n = ratio_h * k - y_n;
+    float d_n =
+        align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
     float d_s = 1.f - d_n;
 
     for (int l = 0; l < out_w; l++) {
-      int x_w = static_cast<int>(ratio_w * l);
+      int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
+                           : static_cast<int>(ratio_w * l);
+      x_w = (x_w > 0) ? x_w : 0;
       int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float d_w = ratio_w * l - x_w;
+      float d_w =
+          align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
       float d_e = 1.f - d_w;
 
       for (int i = 0; i < n; i++) {    // loop for batches
@@ -134,7 +156,6 @@ static void BilinearInterpolationGrad(const Tensor& output_grad,
     }
   }
 }
-
 template <typename T>
 class InterpolateKernel : public framework::OpKernel<T> {
  public:
@@ -151,6 +172,8 @@ class InterpolateKernel : public framework::OpKernel<T> {
       out_h = out_size_data[0];
       out_w = out_size_data[1];
     }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
     const int n = input->dims()[0];
     const int c = input->dims()[1];
@@ -168,17 +191,24 @@ class InterpolateKernel : public framework::OpKernel<T> {
       return;
     }
 
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
     if ("bilinear" == interp_method) {
       BilinearInterpolation<T>(*input, output, ratio_h, ratio_w, in_h, in_w, n,
-                               c, out_h, out_w);
+                               c, out_h, out_w, align_corners, align_mode);
     } else if ("nearest" == interp_method) {
       NearestNeighborInterpolate<T>(*input, output, ratio_h, ratio_w, n, c,
-                                    out_h, out_w);
+                                    out_h, out_w, align_corners);
     }
   }
 };
@@ -200,6 +230,8 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
       out_h = out_size_data[0];
       out_w = out_size_data[1];
     }
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
     const int n = input->dims()[0];
     const int c = input->dims()[1];
@@ -217,17 +249,26 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
       return;
     }
 
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+    float ratio_h = 0.f;
+    float ratio_w = 0.f;
+
+    if (out_h > 1) {
+      ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
+                                : static_cast<float>(in_h) / out_h;
+    }
+    if (out_w > 1) {
+      ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
+                                : static_cast<float>(in_w) / out_w;
+    }
 
     if ("bilinear" == interp_method) {
       BilinearInterpolationGrad<T>(*output_grad, input_grad, ratio_h, ratio_w,
-                                   in_h, in_w, n, c, out_h, out_w);
+                                   in_h, in_w, n, c, out_h, out_w,
+                                   align_corners, align_mode);
     } else if ("nearest" == interp_method) {
       NearestNeighborInterpolateGrad<T>(*output_grad, input_grad, ratio_h,
-                                        ratio_w, n, c, out_h, out_w);
+                                        ratio_w, n, c, out_h, out_w,
+                                        align_corners);
     }
   }
 };
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 5c5a61f64093802697eb21452267471129c7fcf3..97ddf223aefcdfaf8a488f93a152336c1ed458f4 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -93,6 +93,7 @@ std::vector<int> TestSizes() {
 template <typename KernelTuples, typename... Args>
 struct BenchFunc {
   // return this function avg time
+  // TODO(TJ): clear cache every time
   double operator()(const typename KernelTuples::func_type tgt, Args... args) {
     for (int i = 0; i < FLAGS_burning; ++i) {
       tgt(args...);
@@ -172,6 +173,9 @@ void BenchXYZNKernel() {
     RandomVec<T>(d, y_data);
     BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
                                                      y.data<T>(), z_data, d);
+    // test inplace
+    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
+                                                     z_data, d);
   }
 }
 
@@ -311,8 +315,9 @@ void BenchMatMulKernel() {
         const T* a_data = a.data<T>();
         const T* b_data = b.data<T>();
         T* c_data = c.mutable_data<T>(PlaceType());
-        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
-                                                           c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
+                                                           c_data, &attr);
       }
     }
   }
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index 2ea8f927e1a13867fa2065841fac05e766735237..efc7eb79d36c5cf9fac4ac40db4e2e28cb242e22 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -9,6 +9,7 @@ function(USE_JITKERNEL_GEN TARGET)
 endfunction()
 
 # use gen jitcode kernel by name
+USE_JITKERNEL_GEN(kMatMul)
 USE_JITKERNEL_GEN(kVMul)
 USE_JITKERNEL_GEN(kVAdd)
 USE_JITKERNEL_GEN(kVSub)
diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc
index dee6c7b9d3ee9756c1b11d10d55fdca341cbee85..5da24c359edd2df93333fe0ca8a18cdc7385aadb 100644
--- a/paddle/fluid/operators/jit/gen/blas.cc
+++ b/paddle/fluid/operators/jit/gen/blas.cc
@@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
     bool UseMe(const int& attr) const override {                             \
-      return platform::MayIUse(platform::avx);                               \
+      return platform::MayIUse(platform::avx) && attr <= 1024;               \
     }                                                                        \
     size_t CodeSize(const int& d) const override {                           \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
index de6b33f467279124d7acd97709516c31706ec4f9..66a97c1be503b0fa983f9a7ec3b61c986774f16b 100644
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -61,6 +61,7 @@ class VXXJitCode : public JitCode {
       base += "_Vec";
     }
     base += (with_relu_ ? "_Relu" : "");
+    base += "_D" + std::to_string(num_);
     return base.c_str();
   }
   void genCode() override;
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae3858eab20aeb80553d8fcec4088a6632c9c17d
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/matmul.h"
+#include <stddef.h>  // offsetof
+#include <vector>
+
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void MatMulJitCode::genCode() {
+  preCode();
+  int block, rest;
+  const auto groups = packed_groups(n_, k_, &block, &rest);
+  PADDLE_ENFORCE_GT(groups.front(), 0);
+
+  const int block_len = sizeof(float) * block;
+  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
+  const int w_reg_idx = x_reg_idx - 1;
+  // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t,
+  // packed_weight)]);
+  mov(reg_ptr_wgt, param_y);
+  size_t z_offset = 0;
+  size_t wgt_offset = 0;
+  for (size_t g = 0; g < groups.size(); ++g) {
+    size_t x_offset = 0;
+    for (int k = 0; k < k_; ++k) {
+      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
+      // clean
+      if (k == 0) {
+        for (int i = 0; i < groups[g]; ++i) {
+          vxorps(zmm_t(i), zmm_t(i), zmm_t(i));
+        }
+      }
+      for (int i = 0; i < groups[g]; ++i) {
+        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
+        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
+        wgt_offset += block_len;
+      }
+      // last one, save
+      if (k == k_ - 1) {
+        for (int i = 0; i < groups[g]; ++i) {
+          // only rest save should be careful
+          if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) {
+            break;
+          }
+          vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i));
+        }
+      }
+      x_offset += sizeof(float);
+    }
+    z_offset += block_len * groups[g];
+  }
+
+  if (rest != 0) {
+    // below should refine with mask
+    int reg_idx = groups.back() - 1;
+    z_offset = (n_ - rest) * sizeof(float);
+    int inner_block = 8;
+    while (rest > 0) {
+      if (rest >= 8) {
+        inner_block = 8;
+        vmovups(ptr[param_z + z_offset], ymm_t(reg_idx));
+        // shift zmm of inner_block, change reg_idx if update
+      } else if (rest >= 4) {
+        inner_block = 4;
+        vmovups(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else if (rest >= 2) {
+        inner_block = 2;
+        vmovq(ptr[param_z + z_offset], xmm_t(reg_idx));
+      } else {
+        inner_block = 1;
+        vmovss(ptr[param_z + z_offset], xmm_t(reg_idx));
+      }
+      z_offset += inner_block * sizeof(float);
+      rest -= inner_block;
+    }
+  }
+
+  postCode();
+}
+
+class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
+ public:
+  bool UseMe(const matmul_attr_t& attr) const override {
+    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
+           attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
+  }
+  size_t CodeSize(const matmul_attr_t& attr) const override {
+    int block = YMM_FLOAT_BLOCK;
+    if (platform::MayIUse(platform::avx512f)) {
+      block = ZMM_FLOAT_BLOCK;
+    }
+    return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
+  }
+  std::unique_ptr<GenBase> CreateJitCode(
+      const matmul_attr_t& attr) const override {
+    PADDLE_ENFORCE_GT(attr.m, 0);
+    PADDLE_ENFORCE_GT(attr.n, 0);
+    PADDLE_ENFORCE_GT(attr.k, 0);
+    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
new file mode 100644
index 0000000000000000000000000000000000000000..626baa8f738bf0395f3c7f1700610d0a9075879b
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>  // for malloc and free
+#include <string>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class MatMulJitCode : public JitCode {
+ public:
+  explicit MatMulJitCode(const matmul_attr_t& attr,
+                         size_t code_size = 256 * 1024,
+                         void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
+    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
+    this->genCode();
+  }
+
+  virtual const char* name() const {
+    std::string base = "MatMulJitCode";
+    base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
+           std::to_string(k_);
+    return base.c_str();
+  }
+  void genCode() override;
+
+ private:
+  int m_, n_, k_;
+
+  reg64_t param_x{abi_param1};
+  reg64_t param_y{abi_param2};
+  reg64_t param_z{abi_param3};
+  reg64_t param_attr{abi_param4};
+  reg64_t reg_tmp{rax};
+
+  reg64_t reg_ptr_wgt{r10};
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 310da0c76f1ab251d788e54f2305f375f3fb4838..3cd5f6554bdc188ce9ea0c0b85c84d032c509600 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -16,6 +16,8 @@
 #include <fstream>
 #include <iostream>
 #include <sstream>
+#include <vector>
+#include "paddle/fluid/platform/cpu_info.h"
 
 DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
 
@@ -38,6 +40,35 @@ void GenBase::dumpCode(const unsigned char* code) const {
   }
 }
 
+std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
+  int block;
+  int max_num_regs;
+  if (platform::MayIUse(platform::avx512f)) {
+    block = ZMM_FLOAT_BLOCK;
+    max_num_regs = 32;
+  } else {
+    block = YMM_FLOAT_BLOCK;
+    max_num_regs = 16;
+  }
+  // one for x, one for y, others for z
+  const int max_used_regs_for_n = max_num_regs - 2;
+  const int aligned_n = n % block == 0 ? n : (n / block + 1) * block;
+  const int num_block = aligned_n / block;
+  const int num_groups = num_block / max_used_regs_for_n;
+  std::vector<int> groups(num_groups, max_used_regs_for_n);
+  int rest_num_regs = num_block % max_used_regs_for_n;
+  if (rest_num_regs != 0) {
+    groups.push_back(rest_num_regs);
+  }
+  if (block_out) {
+    *block_out = block;
+  }
+  if (rest_out) {
+    *rest_out = n % block;
+  }
+  return groups;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index 4af01a437670aa6a07d370ff23ed2abd369f69a3..d808a332472ae86240cb63356cb417123523366a 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -16,6 +16,7 @@
 
 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 DECLARE_bool(dump_jitcode);
@@ -67,6 +68,11 @@ class JitCodeCreator : public GenCreator {
   virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
 };
 
+// unify the method of packed groups
+// output the packed groups which used in weights, the block size and rest size
+std::vector<int> packed_groups(int n, int k, int* block = nullptr,
+                               int* rest = nullptr);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index 4dac2f2460f72c7da63f48c82549b948cc253153..e7292fe2bd8031aa5bbff68e7c2305a238085bf1 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/operators/jit/helper.h"
 #include <algorithm>  // tolower
+#include <numeric>
+#include <string>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -91,6 +93,41 @@ KernelType to_kerneltype(const std::string& act) {
   return kNone;
 }
 
+template <>
+void pack_weights<float>(const float* src, float* dst, int n, int k) {
+  int block, rest;
+  const auto groups = packed_groups(n, k, &block, &rest);
+  std::for_each(groups.begin(), groups.end(), [&](int i) {
+    PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
+  });
+  int sum = std::accumulate(groups.begin(), groups.end(), 0);
+  std::memset(dst, 0, k * sum * block * sizeof(float));
+  PADDLE_ENFORCE_GE(sum * block, n,
+                    "The packed n should be equal to or larger than n");
+
+  const int block_len = sizeof(float) * block;
+  int n_offset = 0;
+
+  for (size_t g = 0; g < groups.size(); ++g) {
+    const float* from = src + n_offset;
+    for (int j = 0; j < k; ++j) {
+      size_t copy_sz = groups[g] * block_len;
+      if (g == groups.size() - 1 && rest != 0) {
+        copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float);
+      }
+      std::memcpy(dst, from + j * n, copy_sz);
+      dst += groups[g] * block;
+    }
+    n_offset += groups[g] * block;
+  }
+}
+
+template <typename T>
+typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
+    const T* src, T* dst, int n, int k) {
+  PADDLE_THROW("Only support pack with float type.");
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index 7bdc45779b7d39d36db0d52ca9361943cdcdef3e..d5773d65940127ea0a9b77ed2760bd371b778f4c 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -118,26 +118,33 @@ typename KernelTuples::func_type Get(
   return GetRefer<KT, KernelTuples>();
 }
 
-template <KernelType KT, typename KernelTuples>
-class KernelFuncsCache {
+template <KernelType KT, typename KernelTuples, typename PlaceType>
+class KernelFuncs {
  public:
-  KernelFuncsCache() = default;
-  static KernelFuncsCache& Instance() {
-    static thread_local KernelFuncsCache<KT, KernelTuples> g_func_cache;
+  KernelFuncs() = default;
+  static KernelFuncs& Cache() {
+    static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
     return g_func_cache;
   }
 
   bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
 
-  typename KernelTuples::func_type At(int key) { return funcs_.at(key); }
-
   void Insert(int key, typename KernelTuples::func_type func) {
     funcs_.emplace(key, func);
   }
 
+  typename KernelTuples::func_type At(int key) {
+    if (Has(key)) {
+      return funcs_.at(key);
+    }
+    auto func = Get<KT, KernelTuples, PlaceType>(key);
+    Insert(key, func);
+    return func;
+  }
+
  private:
   std::unordered_map<int, typename KernelTuples::func_type> funcs_;
-  DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
+  DISABLE_COPY_AND_ASSIGN(KernelFuncs);
 };
 
 const char* to_string(KernelType kt);
@@ -152,17 +159,28 @@ inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
      << (attr.use_peephole ? "True" : "False") << "]";
   return os;
 }
+
 inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
   os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
      << "],act_cand[" << to_string(attr.act_cand) << "]";
   return os;
 }
+
 inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
   os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
      << to_string(attr.type) << "]";
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) {
+  os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]";
+  return os;
+}
+
+// expose the method to pack matmul weight
+template <typename T>
+void pack_weights(const T* src, T* dst, int n, int k);
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index 42a58580f7b1e0832af57398ba9c29882b6cc6fb..4a8f61146a1921fa1d5f6b7e15af40cd45d31a22 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -145,11 +145,19 @@ struct SeqPoolTuples {
   typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
 };
 
+typedef struct matmul_attr_s {
+  int m, n, k;
+  void* packed_weight{nullptr};
+  matmul_attr_s() = default;
+  explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr)
+      : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {}
+} matmul_attr_t;
+
 template <typename T>
 struct MatMulTuples {
   typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int, int);
+  typedef matmul_attr_t attr_type;
+  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc
index 61de38688664f83775c0c4e5aa6f7e06c3602ddb..1e4a8884e78c5d3c1748988f05ecf461a6f0eb94 100644
--- a/paddle/fluid/operators/jit/kernel_key.cc
+++ b/paddle/fluid/operators/jit/kernel_key.cc
@@ -49,6 +49,13 @@ size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
   return (key << pool_type_shift) + static_cast<int>(attr.type);
 }
 
+template <>
+size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
+  size_t key = attr.m;
+  constexpr int shift = 21;
+  return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index 0f42ac158ca7926981df55936cb903d5f4ae4806..0036d1c238b17768c4df61af22a85588990e1815 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -49,49 +49,16 @@ void VTanh(const T* x, T* y, int n) {
 }
 
 void Softmax(const T* x, T* y, int n, int bs) {
-  typename XRNTuples<T>::func_type compute_hmax{nullptr};
-  typename XRNTuples<T>::func_type compute_hsum{nullptr};
-  typename AXYNTuples<T>::func_type compute_vscal{nullptr};
-  typename AXYNTuples<T>::func_type compute_vaddbias{nullptr};
-  typename XYNTuples<T>::func_type compute_vexp{nullptr};
-
-  if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) {
-    compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax);
-  } else {
-    compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) {
-    compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum);
-  } else {
-    compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) {
-    compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n,
-                                                               compute_vscal);
-  } else {
-    compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) {
-    compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert(
-        n, compute_vaddbias);
-  } else {
-    compute_vaddbias =
-        KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n);
-  }
-
-  if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) {
-    compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
-    KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp);
-  } else {
-    compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n);
-  }
+  auto compute_hmax =
+      KernelFuncs<kHMax, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_hsum =
+      KernelFuncs<kHSum, XRNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vscal =
+      KernelFuncs<kVScal, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vaddbias =
+      KernelFuncs<kVAddBias, AXYNTuples<T>, platform::CPUPlace>::Cache().At(n);
+  auto compute_vexp =
+      KernelFuncs<kVExp, XYNTuples<T>, platform::CPUPlace>::Cache().At(n);
 
   for (int i = 0; i < bs; ++i) {
     T scalar;
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 28a37198dae19a57509934ec784746bc23436e7a..4c999131ab116ebe3484355158993558b02cc4b2 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -25,17 +25,19 @@ namespace more {
 namespace mkl {
 
 template <>
-void MatMul<float>(const float* a, const float* b, float* c, int m, int n,
-                   int k) {
-  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.f, a, k, b, n, 0.f, c, n);
+void MatMul<float>(const float* a, const float* b, float* c,
+                   const matmul_attr_t* attr) {
+  platform::dynload::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.f, a, attr->k, b,
+                                 attr->n, 0.f, c, attr->n);
 }
 
 template <>
-void MatMul<double>(const double* a, const double* b, double* c, int m, int n,
-                    int k) {
-  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
-                                 n, k, 1.0, a, k, b, n, 0.0, c, n);
+void MatMul<double>(const double* a, const double* b, double* c,
+                    const matmul_attr_t* attr) {
+  platform::dynload::cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                 attr->m, attr->n, attr->k, 1.0, a, attr->k, b,
+                                 attr->n, 0.0, c, attr->n);
 }
 
 template <>
@@ -127,11 +129,6 @@ void ASum<double>(const double* x, double* res, int n) {
 }
 
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
-template <>
-bool MatMulKernel<float>::UseMe(const int& d) const {
-  return platform::MayIUse(platform::avx);
-}
-
 template <>
 bool VMulKernel<float>::UseMe(const int& d) const {
   return platform::MayIUse(platform::avx512f) && d > 512;
@@ -139,7 +136,7 @@ bool VMulKernel<float>::UseMe(const int& d) const {
 
 template <>
 bool VAddKernel<float>::UseMe(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return platform::MayIUse(platform::avx) && d > 512;
 }
 
 template <>
@@ -177,6 +174,16 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
+  return platform::MayIUse(platform::avx);
+}
+
+template <>
+bool MatMulKernel<double>::UseMe(const matmul_attr_t& attr) const {
+  return true;
+}
+
 template <>
 bool SoftmaxKernel<float>::UseMe(const int& d) const {
   // tuned on avx2
@@ -189,7 +196,6 @@ bool SoftmaxKernel<float>::UseMe(const int& d) const {
     return true;                                         \
   }
 
-AWALYS_USE_ME_WITH_DOUBLE(MatMul);
 AWALYS_USE_ME_WITH_DOUBLE(VMul);
 AWALYS_USE_ME_WITH_DOUBLE(VAdd);
 AWALYS_USE_ME_WITH_DOUBLE(VScal);
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index 6b95b9c872dc12cccaef0b0737edd760447a47d0..8130b87326f1887f232022ab30fa7bf42b0723e7 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -26,7 +26,7 @@ namespace more {
 namespace mkl {
 
 template <typename T>
-void MatMul(const T* a, const T* b, T* c, int m, int n, int k);
+void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr);
 
 template <typename T>
 void VMul(const T* x, const T* y, T* z, int n);
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 5a074db7e0e8ab49dc281e1809edef23e6a25c42..0c4a985f8e8ece0a6169478fa3a9b111f5a6f3b4 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -363,17 +363,19 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
 
 // A(M,K) * B(K,N) = C(M,N)
 template <typename T>
-void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
+void MatMul(const T* A, const T* B, T* C, const matmul_attr_t* attr) {
+  int M = attr->m;
+  int N = attr->n;
+  int K = attr->k;
   for (int m = 0; m < M; ++m) {
     const T* pa = A + m * K;
     T* pc = C + m * N;
     for (int n = 0; n < N; ++n) {
       const T* pb = B + n;
-      T sum = static_cast<T>(0);
-      for (int k = 0; k < K; ++k) {
-        sum += (pa[k] * pb[k * N]);
+      pc[n] = pa[0] * pb[0];
+      for (int k = 1; k < K; ++k) {
+        pc[n] += pa[k] * pb[k * N];
       }
-      *(pc + n) = sum;
     }
   }
 }
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index cc461552898fc68661ce548a520d65215d3572b4..237e588d35cc3b33658a830db34676967818aab6 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
-static double acc = 1e-5;
+DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
 
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
@@ -39,7 +39,7 @@ template <typename T>
 void ExpectEQ(const T* target, const T* refer, int n) {
   if (std::is_floating_point<T>::value) {
     for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], acc);
+      EXPECT_NEAR(target[i], refer[i], FLAGS_acc);
     }
   } else {
     for (int i = 0; i < n; ++i) {
@@ -272,21 +272,23 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
 
 template <typename T>
 struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, int, int, int> {
+                         std::vector<T>,
+                         typename jit::MatMulTuples<T>::attr_type> {
   void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
                   const std::vector<T>& a, const std::vector<T>& b,
-                  const std::vector<T>& cref, int m, int n, int k) {
+                  const std::vector<T>& cref,
+                  const typename jit::MatMulTuples<T>::attr_type& attr) {
     EXPECT_TRUE(tgt != nullptr);
-    EXPECT_EQ(a.size(), static_cast<size_t>(m * k));
-    EXPECT_EQ(b.size(), static_cast<size_t>(k * n));
-    EXPECT_EQ(cref.size(), static_cast<size_t>(m * n));
+    EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
+    EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
+    EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
     std::vector<T> c(cref.size());
     const T* a_data = a.data();
     const T* b_data = b.data();
     const T* cref_data = cref.data();
     T* c_data = c.data();
-    tgt(a_data, b_data, c_data, m, n, k);
-    ExpectEQ<T>(c_data, cref_data, m * n);
+    tgt(a_data, b_data, c_data, &attr);
+    ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
   }
 };
 
@@ -383,8 +385,8 @@ void TestAXYNKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestXRNKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  acc = 1e-4;
+  auto last_acc = FLAGS_acc;
+  FLAGS_acc = 1e-4;
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
     EXPECT_TRUE(ref != nullptr);
@@ -395,7 +397,7 @@ void TestXRNKernel() {
     TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
                                                                       ref_res);
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
@@ -535,9 +537,10 @@ void TestSeqPoolKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
-  auto last_acc = acc;
-  // TODO(intel): this should be acc issue of MKL
-  acc = 1e-3;
+  auto last_acc = FLAGS_acc;
+  // TODO(intel): fix MKL acc issue
+  // https://github.com/PaddlePaddle/Paddle/issues/15447
+  FLAGS_acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
@@ -549,13 +552,14 @@ void TestMatMulKernel() {
         const T* a_data = a.data();
         const T* b_data = b.data();
         T* c_data = c.data();
-        ref(a_data, b_data, c_data, m, n, k);
+        const jit::matmul_attr_t attr{m, n, k};
+        ref(a_data, b_data, c_data, &attr);
         TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
-                     std::vector<T>, std::vector<T>>(k, a, b, c, m, n, k);
+                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
       }
     }
   }
-  acc = last_acc;
+  FLAGS_acc = last_acc;
 }
 
 template <jit::KernelType KT, typename T, typename PlaceType>
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index fd15539f7b6727496988c9b13d0d2551659a420a..0af8b9e69cfe09890f28ef2028baa19319a5c379 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -193,8 +194,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
-                        ops::LookupTableCUDAKernel<double>);
+                        ops::LookupTableCUDAKernel<double>,
+                        ops::LookupTableCUDAKernel<plat::float16>);
 REGISTER_OP_CUDA_KERNEL(lookup_table_grad,
                         ops::LookupTableGradCUDAKernel<float>,
-                        ops::LookupTableGradCUDAKernel<double>);
+                        ops::LookupTableGradCUDAKernel<double>,
+                        ops::LookupTableGradCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
index fb7119273a734feba870fdabade6a4faa1d5e9a3..69971ef7423eff6bc3f8543a491edb6b0bbd00ca 100644
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -29,8 +29,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor *ids,
                   const framework::LoDTensor *scores,
                   framework::LoDTensor *selected_ids,
-                  framework::LoDTensor *selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated) {
+                  framework::LoDTensor *selected_scores,
+                  framework::Tensor *parent_idx, size_t level, size_t beam_size,
+                  int end_id, bool is_accumulated) {
     auto abs_lod = framework::ToAbsOffset(scores->lod());
     auto &high_level = abs_lod[level];
 
@@ -57,11 +58,13 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
         std::vector<int64_t>({static_cast<int>(num_instances), 1}));
     selected_ids->Resize(dims);
     selected_scores->Resize(dims);
+    parent_idx->Resize({static_cast<int64_t>(num_instances)});
 
     auto *selected_ids_data =
         selected_ids->mutable_data<int64_t>(platform::CPUPlace());
     auto *selected_scores_data =
         selected_scores->mutable_data<float>(platform::CPUPlace());
+    auto *parent_idx_data = parent_idx->mutable_data<int>(platform::CPUPlace());
 
     // fill in data
     std::vector<size_t> low_level;
@@ -69,6 +72,7 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
     for (auto &items : selected_items) {
       low_level.push_back(low_offset);
       for (auto &item : items) {
+        parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
         selected_ids_data[low_offset] = item.id;
         selected_scores_data[low_offset] = item.score;
         low_offset++;
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index d94e3023ce537cb9fa456e079c4fa3cf57fb954d..61d021ef627f1ccd90b992c2078a7f3ca879422d 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -157,10 +157,10 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
 }
 
 __device__ __forceinline__ void WriteBack(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    Triple* top_beam_local, const int seq_offset_start,
-    const int seq_offset_end, const int selected_seq_start,
-    const int selected_seq_length) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, Triple* top_beam_local,
+    const int seq_offset_start, const int seq_offset_end,
+    const int selected_seq_start, const int selected_seq_length) {
   const int tid = threadIdx.x;  // use 1 thread only for each sequence
   int global_index = selected_seq_start;
   for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
@@ -171,6 +171,7 @@ __device__ __forceinline__ void WriteBack(
         selected_ids[global_index] =
             static_cast<int64_t>(top_beam_local[local_index].id);
         selected_scores[global_index] = top_beam_local[local_index].score;
+        parent_idx[global_index] = static_cast<int>(global_offset);
         global_index++;
       }
     }
@@ -180,11 +181,11 @@ __device__ __forceinline__ void WriteBack(
 
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __device__ void BeamSearchDetails(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_offset_start, const int seq_offset_end,
-    const int seq_width, int beam_size, int end_id, bool is_accumulated,
-    int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_offset_start,
+    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
+    bool is_accumulated, int num_used_threads) {
   __shared__ Triple top_beam[MaxLength];
 
   int num_items = 0;
@@ -228,15 +229,15 @@ __device__ void BeamSearchDetails(
       selected_offsets[0] = 0;
     }
 
-    WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
-              seq_offset_start, seq_offset_end, selected_seq_start,
-              selected_seq_length);
+    WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets,
+              top_beam_local, seq_offset_start, seq_offset_end,
+              selected_seq_start, selected_seq_length);
   }
 }
 
 template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
 __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
-                                 size_t* selected_offsets,
+                                 int* parent_idx, size_t* selected_offsets,
                                  const int64_t* pre_ids,
                                  const float* pre_scores, const int64_t* ids,
                                  const float* scores, const size_t* seq_offsets,
@@ -250,24 +251,25 @@ __global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
   int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);
 
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 
 template <int MaxLength, int MaxThreadsPerSeq>
 __global__ void BeamSearchKernelSingle(
-    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
-    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
-    const float* scores, const int seq_length, const int seq_width,
-    int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
+    int64_t* selected_ids, float* selected_scores, int* parent_idx,
+    size_t* selected_offsets, const int64_t* pre_ids, const float* pre_scores,
+    const int64_t* ids, const float* scores, const int seq_length,
+    const int seq_width, int beam_size, int end_id, bool is_accumulated,
+    int num_used_threads) {
   const int seq_offset_start = 0;
   const int seq_offset_end = seq_length;
 
   BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
-      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids,
-      scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id,
-      is_accumulated, num_used_threads);
+      selected_ids, selected_scores, parent_idx, selected_offsets, pre_ids,
+      pre_scores, ids, scores, seq_offset_start, seq_offset_end, seq_width,
+      beam_size, end_id, is_accumulated, num_used_threads);
 }
 
 static inline int GetNumUsedThreads(const int max_threads_per_seq,
@@ -300,8 +302,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
                   const framework::LoDTensor* ids,
                   const framework::LoDTensor* scores,
                   framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated) {
+                  framework::LoDTensor* selected_scores,
+                  framework::Tensor* parent_idx, size_t level, size_t beam_size,
+                  int end_id, bool is_accumulated) {
     auto abs_lod = framework::ToAbsOffset(scores->lod());
 
     const int64_t* pre_ids_data = pre_ids->data<int64_t>();
@@ -322,6 +325,8 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
     float* selected_scores_data =
         selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
+    int* parent_idx_data = parent_idx->mutable_data<int>(
+        {static_cast<int64_t>(num_seqs * beam_size)}, context.GetPlace());
 
     framework::LoD selected_lod(2);
     selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
@@ -339,9 +344,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         CUDA_LAUNCH_KERNEL_HELPER(
             BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
                 1, kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, selected_offsets,
-                pre_ids_data, pre_scores_data, ids_data, scores_data,
-                seq_length, static_cast<int>(seq_width),
+                selected_ids_data, selected_scores_data, parent_idx_data,
+                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+                scores_data, seq_length, static_cast<int>(seq_width),
                 static_cast<int>(beam_size), static_cast<int>(end_id),
                 is_accumulated, num_used_threads));
       }
@@ -357,9 +362,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
         CUDA_LAUNCH_KERNEL_HELPER(
             BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
                 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
-                selected_ids_data, selected_scores_data, selected_offsets,
-                pre_ids_data, pre_scores_data, ids_data, scores_data,
-                seq_offsets, static_cast<int>(num_seqs),
+                selected_ids_data, selected_scores_data, parent_idx_data,
+                selected_offsets, pre_ids_data, pre_scores_data, ids_data,
+                scores_data, seq_offsets, static_cast<int>(num_seqs),
                 static_cast<int>(seq_width), static_cast<int>(beam_size),
                 end_id, is_accumulated, num_used_threads));
       }
@@ -379,6 +384,7 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
           {static_cast<int64_t>(selected_lod[1].back()), 1});
       selected_ids->Resize(final_selected_dims);
       selected_scores->Resize(final_selected_dims);
+      parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
     }
   }
 };
diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h
index 3cd17f426c5596582c91f2b3f0cc5ba513e3aa4b..4474e7ea52affed792572d02202ec2577c471e50 100644
--- a/paddle/fluid/operators/math/beam_search.h
+++ b/paddle/fluid/operators/math/beam_search.h
@@ -104,14 +104,12 @@ class BeamSearchFunctor {
    * Return false if all the input tensor is empty, in machine translation task
    * that means no candidates is provided, and the task will stop running.
    */
-  void operator()(const DeviceContext& context,
-                  const framework::LoDTensor* pre_ids,
-                  const framework::LoDTensor* pre_scores,
-                  const framework::LoDTensor* ids,
-                  const framework::LoDTensor* scores,
-                  framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores, size_t level,
-                  size_t beam_size, int end_id, bool is_accumulated);
+  void operator()(
+      const DeviceContext& context, const framework::LoDTensor* pre_ids,
+      const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
+      const framework::LoDTensor* scores, framework::LoDTensor* selected_ids,
+      framework::LoDTensor* selected_scores, framework::Tensor* parent_idx,
+      size_t level, size_t beam_size, int end_id, bool is_accumulated);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
index 1c29ee95f6b109209316e4e8c8f3cda37eac62ae..7ea8eb8b00db328ca13d3d33d751aca4eac66dae 100644
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ b/paddle/fluid/operators/math/beam_search_test.cc
@@ -93,13 +93,14 @@ void TestBeamSearch() {
 
   paddle::framework::LoDTensor selected_ids;
   paddle::framework::LoDTensor selected_scores;
+  paddle::framework::LoDTensor parent_idx;
 
   size_t level = 0;
   size_t beam_size = 2;
   int end_id = 0;
   paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
   beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
-             &selected_scores, level, beam_size, end_id, true);
+             &selected_scores, &parent_idx, level, beam_size, end_id, true);
 
   ASSERT_EQ(selected_ids.lod(), selected_scores.lod());
 
diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h
index cddd0a18db53a7ddf9ca14d5f373180586ef6a31..0ad57c51be79cd3577b43c9af777bff710308fac 100644
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
     return;
   }
   if (relu) {
-    auto compute =
-        jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+    auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
+                                    platform::CPUPlace>::Cache()
+                       .At(N);
     for (int i = 0; i < M; i++) {
       T* dst = Y + i * N;
       compute(B, dst, dst, N);
     }
   } else {
-    auto compute =
-        jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+    auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
+                                    platform::CPUPlace>::Cache()
+                       .At(N);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 1ff9ff684fc8001afb0f768a033b4c5bd1592702..a1cb3f972826a67721b00ce6df0ec48cc34d6e03 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -82,8 +82,9 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
     const int kClassDim = 1;
     // 2D data. Batch x C
     auto compute_softmax =
-        jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>(
-            in_dims[kClassDim]);
+        jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>,
+                         platform::CPUPlace>::Cache()
+            .At(in_dims[kClassDim]);
     compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
   }
 };
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/activation_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/batch_norm_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/concat_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/conv_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/conv_transpose_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
diff --git a/paddle/fluid/operators/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/dequantize_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/fc_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/gaussian_random_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/lrn_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
similarity index 100%
rename from paddle/fluid/operators/mkldnn_activation_op.h
rename to paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/pool_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
diff --git a/paddle/fluid/operators/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/quantize_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/softmax_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/sum_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
similarity index 100%
rename from paddle/fluid/operators/transpose_mkldnn_op.cc
rename to paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
index d6e897ed4666261cdd0bd6565f61abb218d971e5..9f92bc01befb496c103bcd367ae9cfc5c8f402b0 100644
--- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -31,6 +31,8 @@ std::map<std::string,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
+        {"conv2d", NG_OPS::BuildConv2dNode},
+        {"conv2d_grad", NG_OPS::BuildConv2dGradNode},
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
         {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
         {"fill_constant", NG_OPS::BuildFillConstantNode},
@@ -38,6 +40,8 @@ std::map<std::string,
         {"mean_grad", NG_OPS::BuildMeanGradNode},
         {"mul", NG_OPS::BuildMulNode},
         {"mul_grad", NG_OPS::BuildMulGradNode},
+        {"pool2d", NG_OPS::BuildPool2dNode},
+        {"pool2d_grad", NG_OPS::BuildPool2dGradNode},
         {"softmax", NG_OPS::BuildSoftmaxNode},
         {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
         {"scale", NG_OPS::BuildScaleNode},
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
index b6c7c67c13cc54a4ccdfb4e33795cad76d8179c8..a827f7cb5b7200aaa32d6b3e32f5941860709cf3 100644
--- a/paddle/fluid/operators/ngraph/ngraph_ops.h
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -22,10 +22,12 @@ limitations under the License. */
 #pragma once
 
 #include "ops/binary_unnary_op.h"
+#include "ops/conv2d_op.h"
 #include "ops/elementwise_add_op.h"
 #include "ops/fill_constant_op.h"
 #include "ops/mean_op.h"
 #include "ops/mul_op.h"
+#include "ops/pool2d_op.h"
 #include "ops/scale_op.h"
 #include "ops/softmax_op.h"
 #include "ops/top_k_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..46fb2703f51482afa0546f08b8fc7b2c98e281bc
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h
@@ -0,0 +1,235 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+std::shared_ptr<ngraph::Node> GroupedConvolution(
+    const std::shared_ptr<ngraph::Node>& data_batch,
+    const std::shared_ptr<ngraph::Node>& filters, const ngraph::Strides strides,
+    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
+    size_t groups) {
+  auto& data_shape = data_batch->get_shape();
+  auto& filter_shape = filters->get_shape();
+  ngraph::NodeVector ng_slices;
+
+  for (size_t i = 0; i < groups; ++i) {
+    size_t channel_step = filter_shape.at(1);
+    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
+    const std::vector<size_t> upper_bound{data_shape.at(0),
+                                          (i + 1) * channel_step,
+                                          data_shape.at(2), data_shape.at(3)};
+    auto data_slice = std::make_shared<ngraph::op::Slice>(
+        data_batch, lower_bound, upper_bound);
+
+    size_t filter_step = filter_shape.at(0) / groups;
+    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
+    const std::vector<size_t> filter_upper_bound{
+        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
+        filter_shape.at(3)};
+    auto filter_slice = std::make_shared<ngraph::op::Slice>(
+        filters, filter_lower_bound, filter_upper_bound);
+    auto ng_conv = std::make_shared<ngraph::op::Convolution>(
+        data_slice, filter_slice, strides, dilations, paddings, paddings);
+    ng_slices.push_back(ng_conv);
+  }
+
+  size_t concat_axis = 1;
+  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
+}
+
+std::shared_ptr<ngraph::Node> GroupedGradConvolutionFilter(
+    const std::shared_ptr<ngraph::Node>& data_batch,
+    const std::shared_ptr<ngraph::Node>& filters,
+    const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
+    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
+    size_t groups) {
+  auto& data_shape = data_batch->get_shape();
+  auto& filter_shape = filters->get_shape();
+  auto& out_shape = doutput->get_shape();
+  ngraph::NodeVector ng_slices;
+
+  for (size_t i = 0; i < groups; ++i) {
+    size_t channel_step = filter_shape.at(1);
+    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
+    const std::vector<size_t> upper_bound{data_shape.at(0),
+                                          (i + 1) * channel_step,
+                                          data_shape.at(2), data_shape.at(3)};
+    auto data_slice = std::make_shared<ngraph::op::Slice>(
+        data_batch, lower_bound, upper_bound);
+
+    size_t filter_step = data_shape.at(0);
+
+    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
+    const std::vector<size_t> filter_upper_bound{
+        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
+        filter_shape.at(3)};
+    auto filter_slice = std::make_shared<ngraph::op::Slice>(
+        filters, filter_lower_bound, filter_upper_bound);
+
+    const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
+    const std::vector<size_t> oupper_bound{out_shape.at(0),
+                                           (i + 1) * filter_step,
+                                           out_shape.at(2), out_shape.at(3)};
+    auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
+                                                         oupper_bound);
+
+    auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
+        data_slice, filter_slice->get_shape(), out_slice, strides, dilations,
+        paddings, paddings, ngraph::Strides{1, 1});
+
+    ng_slices.push_back(ng_conv);
+  }
+
+  size_t concat_axis = 0;
+  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
+}
+
+std::shared_ptr<ngraph::Node> GroupedGradConvolutionData(
+    const std::shared_ptr<ngraph::Node>& data_batch,
+    const std::shared_ptr<ngraph::Node>& filters,
+    const std::shared_ptr<ngraph::Node>& doutput, const ngraph::Strides strides,
+    const ngraph::Strides dilations, const ngraph::CoordinateDiff& paddings,
+    size_t groups) {
+  auto& data_shape = data_batch->get_shape();
+  auto& filter_shape = filters->get_shape();
+  auto& out_shape = doutput->get_shape();
+  ngraph::NodeVector ng_slices;
+
+  for (size_t i = 0; i < groups; ++i) {
+    size_t channel_step = filter_shape.at(1);
+    const std::vector<size_t> lower_bound{0, i * channel_step, 0, 0};
+    const std::vector<size_t> upper_bound{data_shape.at(0),
+                                          (i + 1) * channel_step,
+                                          data_shape.at(2), data_shape.at(3)};
+    auto data_slice = std::make_shared<ngraph::op::Slice>(
+        data_batch, lower_bound, upper_bound);
+
+    size_t filter_step = data_shape.at(0);
+
+    const std::vector<size_t> filter_lower_bound{i * filter_step, 0, 0, 0};
+    const std::vector<size_t> filter_upper_bound{
+        (i + 1) * filter_step, filter_shape.at(1), filter_shape.at(2),
+        filter_shape.at(3)};
+    auto filter_slice = std::make_shared<ngraph::op::Slice>(
+        filters, filter_lower_bound, filter_upper_bound);
+
+    const std::vector<size_t> olower_bound{0, i * filter_step, 0, 0};
+    const std::vector<size_t> oupper_bound{out_shape.at(0),
+                                           (i + 1) * filter_step,
+                                           out_shape.at(2), out_shape.at(3)};
+    auto out_slice = std::make_shared<ngraph::op::Slice>(doutput, olower_bound,
+                                                         oupper_bound);
+
+    auto ng_conv = std::make_shared<ngraph::op::ConvolutionBackpropData>(
+        data_slice->get_shape(), filter_slice, out_slice, strides, dilations,
+        paddings, paddings, ngraph::Strides{1, 1});
+    ng_slices.push_back(ng_conv);
+  }
+
+  size_t concat_axis = 1;
+  return std::make_shared<ngraph::op::Concat>(ng_slices, concat_axis);
+}
+
+void BuildConv2dNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto filters = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
+  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
+
+  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
+  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
+  std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
+
+  const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
+                                   static_cast<size_t>(strides.at(1))};
+  const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
+                                     static_cast<size_t>(dilations.at(1))};
+  const ngraph::CoordinateDiff ng_paddings{
+      static_cast<std::ptrdiff_t>(paddings.at(0)),
+      static_cast<std::ptrdiff_t>(paddings.at(1))};
+
+  int groups = static_cast<size_t>(op_attrs.Get<int>("groups"));
+  PADDLE_ENFORCE_GE(groups, 1, "conv groups needs be no less than 1");
+
+  std::shared_ptr<ngraph::Node> result;
+  if (groups == 1) {
+    result = std::make_shared<ngraph::op::Convolution>(
+        input, filters, ng_strides, ng_dilations, ng_paddings, ng_paddings);
+  } else {
+    result = GroupedConvolution(input, filters, ng_strides, ng_dilations,
+                                ng_paddings, groups);
+  }
+  paddle::platform::SetOutputNode(op, "Output", result, ngb_node_map);
+}
+
+void BuildConv2dGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto filter = paddle::platform::GetInputNode(op, "Filter", ngb_node_map);
+  auto input = paddle::platform::GetInputNode(op, "Input", ngb_node_map);
+  auto doutput =
+      paddle::platform::GetInputNode(op, "Output@GRAD", ngb_node_map);
+
+  int groups = op_attrs.Get<int>("groups");
+  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
+  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
+  std::vector<int> dilations = op_attrs.Get<std::vector<int>>("dilations");
+
+  const ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
+                                   static_cast<size_t>(strides.at(1))};
+  const ngraph::Strides ng_dilations{static_cast<size_t>(dilations.at(0)),
+                                     static_cast<size_t>(dilations.at(1))};
+  const ngraph::CoordinateDiff ng_paddings{
+      static_cast<std::ptrdiff_t>(paddings.at(0)),
+      static_cast<std::ptrdiff_t>(paddings.at(1))};
+
+  std::shared_ptr<ngraph::Node> dfilter;
+  std::shared_ptr<ngraph::Node> dinput;
+  if (groups == 1) {
+    dfilter = std::make_shared<ngraph::op::ConvolutionBackpropFilters>(
+        input, filter->get_shape(), doutput, ng_strides, ng_dilations,
+        ng_paddings, ng_paddings, ngraph::Strides{1, 1});
+
+    dinput = std::make_shared<ngraph::op::ConvolutionBackpropData>(
+        input->get_shape(), filter, doutput, ng_strides, ng_dilations,
+        ng_paddings, ng_paddings, ngraph::Strides{1, 1});
+
+  } else {
+    dfilter = GroupedGradConvolutionFilter(input, filter, doutput, ng_strides,
+                                           ng_dilations, ng_paddings, groups);
+    dinput = GroupedGradConvolutionData(input, filter, doutput, ng_strides,
+                                        ng_dilations, ng_paddings, groups);
+  }
+
+  paddle::platform::SetOutputNode(op, "Filter@GRAD", dfilter, ngb_node_map);
+  paddle::platform::SetOutputNode(op, "Input@GRAD", dinput, ngb_node_map);
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..836c9d6c185b305d3dd4c9e9d30e23abb0c1431c
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h
@@ -0,0 +1,174 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+void BuildPool2dNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto x_shape = x->get_shape();
+
+  std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
+  std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
+  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
+  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
+                    "Handling 2d pooling only");
+
+  if (op_attrs.Get<bool>("global_pooling")) {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(x_shape.at(i + 2));
+    }
+  }
+
+  ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
+                                 static_cast<size_t>(paddings.at(1))};
+  ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
+                                 static_cast<size_t>(paddings.at(1))};
+  ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
+                               static_cast<size_t>(ksize.at(1))};
+  ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
+                             static_cast<size_t>(strides.at(1))};
+
+  auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) {
+    return (in - k + 2 * p) / s + 1;
+  };
+
+  if (op_attrs.Get<bool>("ceil_mode")) {
+    auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
+    auto dummpy_shape = dummy_out->get_shape();
+    for (size_t i = 0; i < ng_padding_above.size(); ++i) {
+      auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i],
+                                              paddings[i], strides[i]);
+      if (desired_size != dummpy_shape[i + 2]) {
+        ng_padding_above[i] += strides[i];
+      }
+    }
+  }
+
+  bool padding_exclusive = op_attrs.Get<bool>("exclusive");
+  if (pooling_type == "max") {
+    auto pool2d = std::make_shared<ngraph::op::MaxPool>(
+        x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above);
+    paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
+  } else if (pooling_type == "avg") {
+    std::shared_ptr<ngraph::Node> pool2d;
+    if (op_attrs.Get<bool>("adaptive")) {
+      auto ComputeAdaptive = [](size_t in, size_t k) {
+        return std::floor(in / k);
+      };
+      ng_strides[0] = x_shape.size() == 4
+                          ? ComputeAdaptive(x_shape[3], ksize[0])
+                          : ng_strides[0];
+      ng_strides[1] = x_shape.size() == 4
+                          ? ComputeAdaptive(x_shape[3], ksize[0])
+                          : ng_strides[1];
+      pool2d =
+          std::make_shared<ngraph::op::AvgPool>(x, ng_ksize_shape, ng_strides);
+    } else {
+      pool2d = std::make_shared<ngraph::op::AvgPool>(
+          x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above,
+          !padding_exclusive);
+    }
+    paddle::platform::SetOutputNode(op, "Out", pool2d, ngb_node_map);
+  } else {
+    PADDLE_THROW("Support max and avg pooling only");
+  }
+}
+
+void BuildPool2dGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
+  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto x_shape = x->get_shape();
+
+  std::string pooling_type = op_attrs.Get<std::string>("pooling_type");
+  std::vector<int> ksize = op_attrs.Get<std::vector<int>>("ksize");
+  std::vector<int> strides = op_attrs.Get<std::vector<int>>("strides");
+  std::vector<int> paddings = op_attrs.Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE_EQ(x_shape.size() - 2, ksize.size(),
+                    "Handling 2d pooling only");
+
+  if (op_attrs.Get<bool>("global_pooling")) {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(x_shape.at(i + 2));
+    }
+  }
+
+  ngraph::Shape ng_padding_below{static_cast<size_t>(paddings.at(0)),
+                                 static_cast<size_t>(paddings.at(1))};
+  ngraph::Shape ng_padding_above{static_cast<size_t>(paddings.at(0)),
+                                 static_cast<size_t>(paddings.at(1))};
+  ngraph::Shape ng_ksize_shape{static_cast<size_t>(ksize.at(0)),
+                               static_cast<size_t>(ksize.at(1))};
+  ngraph::Strides ng_strides{static_cast<size_t>(strides.at(0)),
+                             static_cast<size_t>(strides.at(1))};
+
+  bool padding_exclusive = op_attrs.Get<bool>("exclusive");
+  if (pooling_type == "max") {
+    auto pool2d_grad = std::make_shared<ngraph::op::MaxPoolBackprop>(
+        x, dout, out, ng_ksize_shape, ng_strides, ng_padding_below,
+        ng_padding_above);
+    paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
+  } else if (pooling_type == "avg") {
+    std::shared_ptr<ngraph::Node> pool2d_grad;
+    if (op_attrs.Get<bool>("adaptive")) {
+      auto ComputeAdaptive = [](size_t in, size_t k) {
+        return std::floor(in / k);
+      };
+      ng_strides[0] = x_shape.size() == 4
+                          ? ComputeAdaptive(x_shape[3], ksize[0])
+                          : ng_strides[0];
+      ng_strides[1] = x_shape.size() == 4
+                          ? ComputeAdaptive(x_shape[3], ksize[0])
+                          : ng_strides[1];
+      pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
+          x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
+          ng_padding_above, !padding_exclusive);
+    } else {
+      pool2d_grad = std::make_shared<ngraph::op::AvgPoolBackprop>(
+          x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below,
+          ng_padding_above, !padding_exclusive);
+    }
+    paddle::platform::SetOutputNode(op, "X@GRAD", pool2d_grad, ngb_node_map);
+  } else {
+    PADDLE_THROW("Support max and avg pooling only");
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 91fdd4309ad71cfb0e57b2fc49d93431afee1d01..eda54f76b898cdf893347d31cadb86dea892a4ce 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -359,6 +359,7 @@ class ReshapeGradInplaceInToOut : public framework::InplaceInToOut {
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>,
@@ -388,16 +389,20 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel);
+                                int64_t, ops::ReshapeKernel, plat::float16,
+                                ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 double, ops::ReshapeGradKernel, int,
                                 ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel, plat::float16,
                                 ops::ReshapeGradKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel);
+                                int64_t, ops::ReshapeKernel, plat::float16,
+                                ops::ReshapeKernel);
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
                                 double, ops::ReshapeGradKernel, int,
                                 ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel, plat::float16,
                                 ops::ReshapeGradKernel);
 #endif
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 789e61b2d332b9391ef45a8ebe58ad0f1a4d2bf0..94995fc99612adb1164e60f1a51747f74eacfb73 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -54,6 +54,9 @@ class SliceOp : public framework::OperatorWithKernel {
       out_dims[axes[i]] = end - start;
     }
     ctx->SetOutputDim("Out", out_dims);
+    if (axes[0] != 0) {
+      ctx->ShareLoD("Input", /*->*/ "Out");
+    }
   }
 
  protected:
diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu
index bf2a9e5b3d22996e688621727cb280dc9aed7859..24d0b2f906a8e0b360c3f477c9290ebe5d57a3ff 100644
--- a/paddle/fluid/operators/stack_op.cu
+++ b/paddle/fluid/operators/stack_op.cu
@@ -17,13 +17,16 @@
 namespace plat = paddle::platform;
 namespace ops = paddle::operators;
 
-REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
-                        ops::StackKernel<plat::CUDADeviceContext, double>,
-                        ops::StackKernel<plat::CUDADeviceContext, int>,
-                        ops::StackKernel<plat::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    stack, ops::StackKernel<plat::CUDADeviceContext, float>,
+    ops::StackKernel<plat::CUDADeviceContext, double>,
+    ops::StackKernel<plat::CUDADeviceContext, int>,
+    ops::StackKernel<plat::CUDADeviceContext, int64_t>,
+    ops::StackKernel<plat::CUDADeviceContext, plat::float16>);
 
-REGISTER_OP_CUDA_KERNEL(stack_grad,
-                        ops::StackGradKernel<plat::CUDADeviceContext, float>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, double>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, int>,
-                        ops::StackGradKernel<plat::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    stack_grad, ops::StackGradKernel<plat::CUDADeviceContext, float>,
+    ops::StackGradKernel<plat::CUDADeviceContext, double>,
+    ops::StackGradKernel<plat::CUDADeviceContext, int>,
+    ops::StackGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::StackGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
index b4025350fa9f3610bde43eee91cd059f3063813f..915774e5f3624f26dbd1451a99d7bf0bf75a72c8 100644
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -15,19 +15,27 @@ limitations under the License. */
 #include "paddle/fluid/operators/transpose_op.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
 REGISTER_OP_CUDA_KERNEL(
     transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     transpose_grad,
     ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
+                             plat::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     transpose2,
     ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     transpose2_grad,
     ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext,
+                             plat::float16>);
diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h
deleted file mode 100644
index 0bb285722ddedf721d98237760ec9868e2134442..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/yolov3_loss_op.h
+++ /dev/null
@@ -1,483 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-using Array5 = Eigen::DSizes<int64_t, 5>;
-
-template <typename T>
-static inline bool isZero(T x) {
-  return fabs(x) < 1e-6;
-}
-
-template <typename T>
-static inline T sigmoid(T x) {
-  return 1.0 / (exp(-1.0 * x) + 1.0);
-}
-
-template <typename T>
-static inline T CalcMaskPointNum(const Tensor& mask) {
-  auto mask_t = EigenVector<int>::Flatten(mask);
-  T count = 0.0;
-  for (int i = 0; i < mask_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      count += 1.0;
-    }
-  }
-  return count;
-}
-
-template <typename T>
-static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y,
-                                const Tensor& mask) {
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  T error_sum = 0.0;
-  T points = 0.0;
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      error_sum += pow(x_t(i) - y_t(i), 2);
-      points += 1;
-    }
-  }
-  return (error_sum / points);
-}
-
-template <typename T>
-static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y,
-                                const Tensor& mask, T mf) {
-  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf;
-    }
-  }
-}
-
-template <typename T>
-static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y,
-                                const Tensor& mask) {
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  T error_sum = 0.0;
-  T points = 0.0;
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      error_sum +=
-          -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i)));
-      points += 1;
-    }
-  }
-  return (error_sum / points);
-}
-
-template <typename T>
-static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x,
-                                       const Tensor& y, const Tensor& mask,
-                                       T mf) {
-  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
-  auto x_t = EigenVector<T>::Flatten(x);
-  auto y_t = EigenVector<T>::Flatten(y);
-  auto mask_t = EigenVector<int>::Flatten(mask);
-
-  for (int i = 0; i < x_t.dimensions()[0]; i++) {
-    if (mask_t(i)) {
-      grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf;
-    }
-  }
-}
-
-template <typename T>
-static void CalcPredResult(const Tensor& input, Tensor* pred_conf,
-                           Tensor* pred_class, Tensor* pred_x, Tensor* pred_y,
-                           Tensor* pred_w, Tensor* pred_h, const int anchor_num,
-                           const int class_num) {
-  const int n = input.dims()[0];
-  const int h = input.dims()[2];
-  const int w = input.dims()[3];
-  const int box_attr_num = 5 + class_num;
-
-  auto input_t = EigenTensor<T, 4>::From(input);
-  auto pred_conf_t = EigenTensor<T, 4>::From(*pred_conf);
-  auto pred_class_t = EigenTensor<T, 5>::From(*pred_class);
-  auto pred_x_t = EigenTensor<T, 4>::From(*pred_x);
-  auto pred_y_t = EigenTensor<T, 4>::From(*pred_y);
-  auto pred_w_t = EigenTensor<T, 4>::From(*pred_w);
-  auto pred_h_t = EigenTensor<T, 4>::From(*pred_h);
-
-  for (int i = 0; i < n; i++) {
-    for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
-      for (int j = 0; j < h; j++) {
-        for (int k = 0; k < w; k++) {
-          pred_x_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx, j, k));
-          pred_y_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k));
-          pred_w_t(i, an_idx, j, k) =
-              input_t(i, box_attr_num * an_idx + 2, j, k);
-          pred_h_t(i, an_idx, j, k) =
-              input_t(i, box_attr_num * an_idx + 3, j, k);
-
-          pred_conf_t(i, an_idx, j, k) =
-              sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k));
-
-          for (int c = 0; c < class_num; c++) {
-            pred_class_t(i, an_idx, j, k, c) =
-                sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k));
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static T CalcBoxIoU(std::vector<T> box1, std::vector<T> box2) {
-  T b1_x1 = box1[0] - box1[2] / 2;
-  T b1_x2 = box1[0] + box1[2] / 2;
-  T b1_y1 = box1[1] - box1[3] / 2;
-  T b1_y2 = box1[1] + box1[3] / 2;
-  T b2_x1 = box2[0] - box2[2] / 2;
-  T b2_x2 = box2[0] + box2[2] / 2;
-  T b2_y1 = box2[1] - box2[3] / 2;
-  T b2_y2 = box2[1] + box2[3] / 2;
-
-  T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1);
-  T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1);
-
-  T inter_rect_x1 = std::max(b1_x1, b2_x1);
-  T inter_rect_y1 = std::max(b1_y1, b2_y1);
-  T inter_rect_x2 = std::min(b1_x2, b2_x2);
-  T inter_rect_y2 = std::min(b1_y2, b2_y2);
-  T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast<T>(0.0)) *
-                 std::max(inter_rect_y2 - inter_rect_y1, static_cast<T>(0.0));
-
-  return inter_area / (b1_area + b2_area - inter_area);
-}
-
-template <typename T>
-static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label,
-                            const float ignore_thresh, std::vector<int> anchors,
-                            const int grid_size, Tensor* obj_mask,
-                            Tensor* noobj_mask, Tensor* tx, Tensor* ty,
-                            Tensor* tw, Tensor* th, Tensor* tconf,
-                            Tensor* tclass) {
-  const int n = gt_box.dims()[0];
-  const int b = gt_box.dims()[1];
-  const int anchor_num = anchors.size() / 2;
-  auto gt_box_t = EigenTensor<T, 3>::From(gt_box);
-  auto gt_label_t = EigenTensor<int, 2>::From(gt_label);
-  auto obj_mask_t = EigenTensor<int, 4>::From(*obj_mask).setConstant(0);
-  auto noobj_mask_t = EigenTensor<int, 4>::From(*noobj_mask).setConstant(1);
-  auto tx_t = EigenTensor<T, 4>::From(*tx).setConstant(0.0);
-  auto ty_t = EigenTensor<T, 4>::From(*ty).setConstant(0.0);
-  auto tw_t = EigenTensor<T, 4>::From(*tw).setConstant(0.0);
-  auto th_t = EigenTensor<T, 4>::From(*th).setConstant(0.0);
-  auto tconf_t = EigenTensor<T, 4>::From(*tconf).setConstant(0.0);
-  auto tclass_t = EigenTensor<T, 5>::From(*tclass).setConstant(0.0);
-
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < b; j++) {
-      if (isZero<T>(gt_box_t(i, j, 0)) && isZero<T>(gt_box_t(i, j, 1)) &&
-          isZero<T>(gt_box_t(i, j, 2)) && isZero<T>(gt_box_t(i, j, 3))) {
-        continue;
-      }
-
-      int cur_label = gt_label_t(i, j);
-      T gx = gt_box_t(i, j, 0) * grid_size;
-      T gy = gt_box_t(i, j, 1) * grid_size;
-      T gw = gt_box_t(i, j, 2) * grid_size;
-      T gh = gt_box_t(i, j, 3) * grid_size;
-      int gi = static_cast<int>(gx);
-      int gj = static_cast<int>(gy);
-
-      T max_iou = static_cast<T>(0);
-      T iou;
-      int best_an_index = -1;
-      std::vector<T> gt_box_shape({0, 0, gw, gh});
-      for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
-        std::vector<T> anchor_shape({0, 0, static_cast<T>(anchors[2 * an_idx]),
-                                     static_cast<T>(anchors[2 * an_idx + 1])});
-        iou = CalcBoxIoU<T>(gt_box_shape, anchor_shape);
-        if (iou > max_iou) {
-          max_iou = iou;
-          best_an_index = an_idx;
-        }
-        if (iou > ignore_thresh) {
-          noobj_mask_t(i, an_idx, gj, gi) = 0;
-        }
-      }
-      obj_mask_t(i, best_an_index, gj, gi) = 1;
-      noobj_mask_t(i, best_an_index, gj, gi) = 0;
-      tx_t(i, best_an_index, gj, gi) = gx - gi;
-      ty_t(i, best_an_index, gj, gi) = gy - gj;
-      tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]);
-      th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]);
-      tclass_t(i, best_an_index, gj, gi, cur_label) = 1;
-      tconf_t(i, best_an_index, gj, gi) = 1;
-    }
-  }
-}
-
-static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand,
-                                    const Tensor& obj_mask) {
-  const int n = obj_mask_expand->dims()[0];
-  const int an_num = obj_mask_expand->dims()[1];
-  const int h = obj_mask_expand->dims()[2];
-  const int w = obj_mask_expand->dims()[3];
-  const int class_num = obj_mask_expand->dims()[4];
-  auto obj_mask_expand_t = EigenTensor<int, 5>::From(*obj_mask_expand);
-  auto obj_mask_t = EigenTensor<int, 4>::From(obj_mask);
-
-  obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1))
-                          .broadcast(Array5(1, 1, 1, 1, class_num));
-}
-
-template <typename T>
-static void AddAllGradToInputGrad(
-    Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y,
-    const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x,
-    const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h,
-    const Tensor& grad_conf_target, const Tensor& grad_conf_notarget,
-    const Tensor& grad_class, const int class_num, const float loss_weight_xy,
-    const float loss_weight_wh, const float loss_weight_conf_target,
-    const float loss_weight_conf_notarget, const float loss_weight_class) {
-  const int n = pred_x.dims()[0];
-  const int an_num = pred_x.dims()[1];
-  const int h = pred_x.dims()[2];
-  const int w = pred_x.dims()[3];
-  const int attr_num = class_num + 5;
-  auto grad_t = EigenTensor<T, 4>::From(*grad).setConstant(0.0);
-  auto pred_x_t = EigenTensor<T, 4>::From(pred_x);
-  auto pred_y_t = EigenTensor<T, 4>::From(pred_y);
-  auto pred_conf_t = EigenTensor<T, 4>::From(pred_conf);
-  auto pred_class_t = EigenTensor<T, 5>::From(pred_class);
-  auto grad_x_t = EigenTensor<T, 4>::From(grad_x);
-  auto grad_y_t = EigenTensor<T, 4>::From(grad_y);
-  auto grad_w_t = EigenTensor<T, 4>::From(grad_w);
-  auto grad_h_t = EigenTensor<T, 4>::From(grad_h);
-  auto grad_conf_target_t = EigenTensor<T, 4>::From(grad_conf_target);
-  auto grad_conf_notarget_t = EigenTensor<T, 4>::From(grad_conf_notarget);
-  auto grad_class_t = EigenTensor<T, 5>::From(grad_class);
-
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < an_num; j++) {
-      for (int k = 0; k < h; k++) {
-        for (int l = 0; l < w; l++) {
-          grad_t(i, j * attr_num, k, l) =
-              grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) *
-              (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy;
-          grad_t(i, j * attr_num + 1, k, l) =
-              grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) *
-              (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy;
-          grad_t(i, j * attr_num + 2, k, l) =
-              grad_w_t(i, j, k, l) * loss * loss_weight_wh;
-          grad_t(i, j * attr_num + 3, k, l) =
-              grad_h_t(i, j, k, l) * loss * loss_weight_wh;
-          grad_t(i, j * attr_num + 4, k, l) =
-              grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
-              (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target;
-          grad_t(i, j * attr_num + 4, k, l) +=
-              grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
-              (1.0 - pred_conf_t(i, j, k, l)) * loss *
-              loss_weight_conf_notarget;
-
-          for (int c = 0; c < class_num; c++) {
-            grad_t(i, j * attr_num + 5 + c, k, l) =
-                grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) *
-                (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class;
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-class Yolov3LossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
-    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
-    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
-    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
-    float loss_weight_conf_notarget =
-        ctx.Attr<float>("loss_weight_conf_notarget");
-    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
-
-    const int n = input->dims()[0];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int an_num = anchors.size() / 2;
-
-    Tensor pred_x, pred_y, pred_w, pred_h;
-    Tensor pred_conf, pred_class;
-    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
-                      &pred_w, &pred_h, an_num, class_num);
-
-    Tensor obj_mask, noobj_mask;
-    Tensor tx, ty, tw, th, tconf, tclass;
-    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
-                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
-
-    Tensor obj_mask_expand;
-    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
-                                      ctx.GetPlace());
-    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
-
-    T loss_x = CalcMSEWithMask<T>(pred_x, tx, obj_mask);
-    T loss_y = CalcMSEWithMask<T>(pred_y, ty, obj_mask);
-    T loss_w = CalcMSEWithMask<T>(pred_w, tw, obj_mask);
-    T loss_h = CalcMSEWithMask<T>(pred_h, th, obj_mask);
-    T loss_conf_target = CalcBCEWithMask<T>(pred_conf, tconf, obj_mask);
-    T loss_conf_notarget = CalcBCEWithMask<T>(pred_conf, tconf, noobj_mask);
-    T loss_class = CalcBCEWithMask<T>(pred_class, tclass, obj_mask_expand);
-
-    auto* loss_data = loss->mutable_data<T>({1}, ctx.GetPlace());
-    loss_data[0] = loss_weight_xy * (loss_x + loss_y) +
-                   loss_weight_wh * (loss_w + loss_h) +
-                   loss_weight_conf_target * loss_conf_target +
-                   loss_weight_conf_notarget * loss_conf_notarget +
-                   loss_weight_class * loss_class;
-  }
-};
-
-template <typename T>
-class Yolov3LossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* gt_box = ctx.Input<Tensor>("GTBox");
-    auto* gt_label = ctx.Input<Tensor>("GTLabel");
-    auto anchors = ctx.Attr<std::vector<int>>("anchors");
-    int class_num = ctx.Attr<int>("class_num");
-    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-    const T loss = output_grad->data<T>()[0];
-    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
-    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
-    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
-    float loss_weight_conf_notarget =
-        ctx.Attr<float>("loss_weight_conf_notarget");
-    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
-
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-    const int an_num = anchors.size() / 2;
-
-    Tensor pred_x, pred_y, pred_w, pred_h;
-    Tensor pred_conf, pred_class;
-    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
-                      &pred_w, &pred_h, an_num, class_num);
-
-    Tensor obj_mask, noobj_mask;
-    Tensor tx, ty, tw, th, tconf, tclass;
-    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
-    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
-                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
-
-    Tensor obj_mask_expand;
-    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
-                                      ctx.GetPlace());
-    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
-
-    Tensor grad_x, grad_y, grad_w, grad_h;
-    Tensor grad_conf_target, grad_conf_notarget, grad_class;
-    grad_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_conf_target.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_conf_notarget.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
-    grad_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
-    T obj_mf = CalcMaskPointNum<int>(obj_mask);
-    T noobj_mf = CalcMaskPointNum<int>(noobj_mask);
-    T obj_expand_mf = CalcMaskPointNum<int>(obj_mask_expand);
-    CalcMSEGradWithMask<T>(&grad_x, pred_x, tx, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_y, pred_y, ty, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_w, pred_w, tw, obj_mask, obj_mf);
-    CalcMSEGradWithMask<T>(&grad_h, pred_h, th, obj_mask, obj_mf);
-    CalcBCEGradWithMask<T>(&grad_conf_target, pred_conf, tconf, obj_mask,
-                           obj_mf);
-    CalcBCEGradWithMask<T>(&grad_conf_notarget, pred_conf, tconf, noobj_mask,
-                           noobj_mf);
-    CalcBCEGradWithMask<T>(&grad_class, pred_class, tclass, obj_mask_expand,
-                           obj_expand_mf);
-
-    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
-    AddAllGradToInputGrad<T>(
-        input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y,
-        grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class,
-        class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target,
-        loss_weight_conf_notarget, loss_weight_class);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 6127ca8a3eacd013dd258a02b9f3cc792b634137..870c57e54011361caae5265201d19f58830a87bc 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -22,6 +22,8 @@ from . import op_frequence
 from .op_frequence import *
 from . import quantize
 from .quantize import *
+from . import int8_inference
+from .int8_inference import *
 from . import reader
 from .reader import *
 from . import slim
@@ -34,6 +36,7 @@ __all__ += decoder.__all__
 __all__ += memory_usage_calc.__all__
 __all__ += op_frequence.__all__
 __all__ += quantize.__all__
+__all__ += int8_inference.__all__
 __all__ += reader.__all__
 __all__ += slim.__all__
 __all__ += utils.__all__
diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a9691dad4494f5eacf427b2806b2393baa57dc1e
--- /dev/null
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -0,0 +1,72 @@
+# Offline INT8 Calibration Tool
+
+PaddlePaddle supports offline INT8 calibration to accelerate the inference speed. In this document, we provide the instructions on how to enable INT8 calibration and show the ResNet-50 and MobileNet-V1 results in accuracy.
+
+## 0. Prerequisite
+You need to install at least PaddlePaddle-1.3 python package `pip install paddlepaddle==1.3`.
+
+## 1. How to generate INT8 model
+You can refer to the unit test in [test_calibration.py](../tests/test_calibration.py). Basically, there are three steps:
+* Construct calibration object.
+
+```python
+calibrator = int8_utility.Calibrator( # Step 1
+    program=infer_program, # required, FP32 program
+    pretrained_model=model_path, # required, FP32 pretrained model
+    algo=algo, # required, calibration algorithm; default is max, the alternative is KL (Kullback–Leibler divergence)
+    exe=exe, # required, executor
+    output=int8_model, # required, INT8 model
+    feed_var_names=feed_dict, # required, feed dict
+    fetch_list=fetch_targets) # required, fetch targets
+```
+
+* Call the calibrator.sample_data() after executor run.
+```python
+_, acc1, _ = exe.run(
+    program,
+    feed={feed_dict[0]: image,
+          feed_dict[1]: label},
+    fetch_list=fetch_targets)
+
+calibrator.sample_data() # Step 2
+```
+
+* Call the calibrator.save_int8_model() after sampling over specified iterations (e.g., iterations = 50)
+```python
+calibrator.save_int8_model() # Step 3
+```
+
+## 2. How to run INT8 model
+You can load INT8 model by load_inference_model [API](https://github.com/PaddlePaddle/Paddle/blob/8b50ad80ff6934512d3959947ac1e71ea3fb9ea3/python/paddle/fluid/io.py#L991) and run INT8 inference similar as [FP32](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/eval.py "FP32").
+
+```python
+[infer_program, feed_dict,
+    fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+```
+
+## 3. Result
+We provide the results of accuracy measurd on [Intel® Xeon® Platinum Gold Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake6148).
+
+| Model  | Dataset  | FP32 Accuracy  | INT8 Accuracy  | Accuracy Diff  |
+| ------------ | ------------ | ------------ | ------------ | ------------ |
+| ResNet-50  | Small  | 72.00%  | 72.00%  |  0.00% |
+| MobileNet-V1  | Small  | 62.00%  | 62.00%  | 0.00%  |
+| ResNet-50  | Full ImageNet Val  |  76.63%  | 76.17%  | 0.46% |
+| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.49%  | 0.29%  |
+
+Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset"). 
+
+Notes:
+* The accuracy measurement requires the model with `label`.
+* The INT8 theoretical speedup is ~1.33X on Intel® Xeon® Skylake Server (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in  [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")).
+
+## 4. How to reproduce the results
+* Small dataset
+```bash
+python python/paddle/fluid/contrib/tests/test_calibration.py
+```
+
+* Full dataset
+```bash
+DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+```
diff --git a/python/paddle/fluid/contrib/int8_inference/__init__.py b/python/paddle/fluid/contrib/int8_inference/__init__.py
index eca2dce114b069bf9b455d77ce670d73b5047fd2..45547201d598c809f7dcf3a1a09103ae5de3e4c6 100644
--- a/python/paddle/fluid/contrib/int8_inference/__init__.py
+++ b/python/paddle/fluid/contrib/int8_inference/__init__.py
@@ -11,3 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import print_function
+
+from . import utility
+from .utility import *
+
+__all__ = utility.__all__
diff --git a/python/paddle/fluid/contrib/int8_inference/utility.py b/python/paddle/fluid/contrib/int8_inference/utility.py
index 40de038f28a83738e6e6cd8c77c0a9916ce68b4f..b35d9f2424ccf093f70e75b13e23f6c5ad59e859 100644
--- a/python/paddle/fluid/contrib/int8_inference/utility.py
+++ b/python/paddle/fluid/contrib/int8_inference/utility.py
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import paddle.fluid.core as core
+
+from paddle.fluid import core
 import numpy as np
 import math
 import os
-import paddle.fluid as fluid
+from paddle.fluid.executor import global_scope
+from paddle.fluid import io
+
+__all__ = ['Calibrator']
 
 
 class Calibrator(object):
@@ -76,8 +80,7 @@ class Calibrator(object):
         '''
         for i in self.sampling_program.list_vars():
             if i.name in self.sampling_vars:
-                np_data = np.array(fluid.global_scope().find_var(i.name)
-                                   .get_tensor())
+                np_data = np.array(global_scope().find_var(i.name).get_tensor())
                 if i.name not in self._sampling_data:
                     self._sampling_data[i.name] = []
                 self._sampling_data[i.name].append(np_data)
@@ -86,9 +89,9 @@ class Calibrator(object):
         '''
         Save the quantized model to the disk.
         '''
-        fluid.io.save_inference_model(self.output, self.feed_var_names,
-                                      self.fetch_list, self.exe,
-                                      self.sampling_program)
+        io.save_inference_model(self.output, self.feed_var_names,
+                                self.fetch_list, self.exe,
+                                self.sampling_program)
 
     def __display_debug(self):
         if self.debug:
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index f07fefe7e097377a845193bb37b6e9aa42708948..424ea245a0f2dff0d437ace386f2e4e0fa6b517d 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -19,15 +19,12 @@ import sys
 import random
 import paddle
 import paddle.fluid as fluid
-import argparse
 import functools
 import contextlib
-import paddle.fluid.profiler as profiler
 from paddle.dataset.common import download
 from PIL import Image, ImageEnhance
 import math
-sys.path.append('..')
-import int8_inference.utility as int8_utility
+import paddle.fluid.contrib.int8_inference.utility as int8_utility
 
 random.seed(0)
 np.random.seed(0)
@@ -43,7 +40,7 @@ img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
 img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
 
 
-# TODO(guomingz): Remove duplicated code from line 45 ~ line 114
+# TODO(guomingz): Remove duplicated code from resize_short, crop_image, process_image, _reader_creator
 def resize_short(img, target_size):
     percent = float(target_size) / min(img.size[0], img.size[1])
     resized_width = int(round(img.size[0] * percent))
@@ -123,16 +120,37 @@ class TestCalibrationForResnet50(unittest.TestCase):
         self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                                self.int8_download)
 
-        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
-        data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d'
-        self.data_cache_folder = self.download_data(data_url, data_md5, "data")
+        data_urls = []
+        data_md5s = []
+        self.data_cache_folder = ''
+        if os.environ.get('DATASET') == 'full':
+            data_urls.append(
+                'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
+            )
+            data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
+            data_urls.append(
+                'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
+            )
+            data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
+            self.data_cache_folder = self.download_data(data_urls, data_md5s,
+                                                        "full_data", False)
+        else:
+            data_urls.append(
+                'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
+            )
+            data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
+            self.data_cache_folder = self.download_data(data_urls, data_md5s,
+                                                        "small_data", False)
 
         # reader/decorator.py requires the relative path to the data folder
         cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
                                                    self.data_cache_folder)
         os.system(cmd)
 
-        self.iterations = 50
+        self.batch_size = 1
+        self.sample_iterations = 50
+        self.infer_iterations = 50000 if os.environ.get(
+            'DATASET') == 'full' else 50
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
@@ -140,20 +158,44 @@ class TestCalibrationForResnet50(unittest.TestCase):
                                                           zip_path)
             os.system(cmd)
 
-    def download_data(self, data_url, data_md5, folder_name):
-        download(data_url, self.int8_download, data_md5)
+    def download_data(self, data_urls, data_md5s, folder_name, is_model=True):
         data_cache_folder = os.path.join(self.cache_folder, folder_name)
-        file_name = data_url.split('/')[-1]
-        zip_path = os.path.join(self.cache_folder, file_name)
+        zip_path = ''
+        if os.environ.get('DATASET') == 'full':
+            file_names = []
+            for i in range(0, len(data_urls)):
+                download(data_urls[i], self.int8_download, data_md5s[i])
+                file_names.append(data_urls[i].split('/')[-1])
+
+            zip_path = os.path.join(self.cache_folder,
+                                    'full_imagenet_val.tar.gz')
+            if not os.path.exists(zip_path):
+                cat_command = 'cat'
+                for file_name in file_names:
+                    cat_command += ' ' + os.path.join(self.cache_folder,
+                                                      file_name)
+                cat_command += ' > ' + zip_path
+                os.system(cat_command)
+
+        if os.environ.get('DATASET') != 'full' or is_model:
+            download(data_urls[0], self.int8_download, data_md5s[0])
+            file_name = data_urls[0].split('/')[-1]
+            zip_path = os.path.join(self.cache_folder, file_name)
+
+        print('Data is downloaded at {0}').format(zip_path)
         self.cache_unzipping(data_cache_folder, zip_path)
         return data_cache_folder
 
-    def download_resnet50_model(self):
+    def download_model(self):
         # resnet50 fp32 data
-        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
-        data_md5 = '4a5194524823d9b76da6e738e1367881'
-        self.model_cache_folder = self.download_data(data_url, data_md5,
+        data_urls = [
+            'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
+        ]
+        data_md5s = ['4a5194524823d9b76da6e738e1367881']
+        self.model_cache_folder = self.download_data(data_urls, data_md5s,
                                                      "resnet50_fp32")
+        self.model = "ResNet-50"
+        self.algo = "direct"
 
     def run_program(self, model_path, generate_int8=False, algo='direct'):
         image_shape = [3, 224, 224]
@@ -169,17 +211,17 @@ class TestCalibrationForResnet50(unittest.TestCase):
         t = fluid.transpiler.InferenceTranspiler()
         t.transpile(infer_program, fluid.CPUPlace())
 
-        val_reader = paddle.batch(val(), batch_size=1)
+        val_reader = paddle.batch(val(), self.batch_size)
+        iterations = self.infer_iterations
 
         if generate_int8:
             int8_model = os.path.join(os.getcwd(), "calibration_out")
+            iterations = self.sample_iterations
 
             if os.path.exists(int8_model):
                 os.system("rm -rf " + int8_model)
                 os.system("mkdir " + int8_model)
 
-            print("Start calibration ...")
-
             calibrator = int8_utility.Calibrator(
                 program=infer_program,
                 pretrained_model=model_path,
@@ -191,6 +233,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
 
         test_info = []
         cnt = 0
+        periods = []
         for batch_id, data in enumerate(val_reader()):
             image = np.array(
                 [x[0].reshape(image_shape) for x in data]).astype("float32")
@@ -202,21 +245,28 @@ class TestCalibrationForResnet50(unittest.TestCase):
                 if op.has_attr("use_mkldnn"):
                     op._set_attr("use_mkldnn", True)
 
+            t1 = time.time()
             _, acc1, _ = exe.run(
                 running_program,
                 feed={feed_dict[0]: image,
                       feed_dict[1]: label},
                 fetch_list=fetch_targets)
+            t2 = time.time()
+            period = t2 - t1
+            periods.append(period)
+
             if generate_int8:
                 calibrator.sample_data()
 
             test_info.append(np.mean(acc1) * len(data))
             cnt += len(data)
 
-            if batch_id != self.iterations - 1:
-                continue
+            if (batch_id + 1) % 100 == 0:
+                print("{0} images,".format(batch_id + 1))
+                sys.stdout.flush()
 
-            break
+            if (batch_id + 1) == iterations:
+                break
 
         if generate_int8:
             calibrator.save_int8_model()
@@ -225,32 +275,49 @@ class TestCalibrationForResnet50(unittest.TestCase):
                 "Calibration is done and the corresponding files are generated at {}".
                 format(os.path.abspath("calibration_out")))
         else:
-            return np.sum(test_info) / cnt
+            throughput = cnt / np.sum(periods)
+            latency = np.average(periods)
+            acc1 = np.sum(test_info) / cnt
+            return (throughput, latency, acc1)
 
     def test_calibration(self):
-        self.download_resnet50_model()
-        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
-        self.run_program(self.model_cache_folder + "/model", True)
-        int8_acc1 = self.run_program("calibration_out")
+        self.download_model()
+        print("Start FP32 inference for {0} on {1} images ...").format(
+            self.model, self.infer_iterations)
+        (fp32_throughput, fp32_latency,
+         fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
+        print("Start INT8 calibration for {0} on {1} images ...").format(
+            self.model, self.sample_iterations)
+        self.run_program(
+            self.model_cache_folder + "/model", True, algo=self.algo)
+        print("Start INT8 inference for {0} on {1} images ...").format(
+            self.model, self.infer_iterations)
+        (int8_throughput, int8_latency,
+         int8_acc1) = self.run_program("calibration_out")
         delta_value = np.abs(fp32_acc1 - int8_acc1)
         self.assertLess(delta_value, 0.01)
+        print(
+            "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
+            format(self.model, self.batch_size, fp32_throughput, fp32_latency,
+                   fp32_acc1))
+        print(
+            "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
+            format(self.model, self.batch_size, int8_throughput, int8_latency,
+                   int8_acc1))
+        sys.stdout.flush()
 
 
 class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
-    def download_mobilenetv1_model(self):
+    def download_model(self):
         # mobilenetv1 fp32 data
-        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
-        data_md5 = '13892b0716d26443a8cdea15b3c6438b'
-        self.model_cache_folder = self.download_data(data_url, data_md5,
+        data_urls = [
+            'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+        ]
+        data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
+        self.model_cache_folder = self.download_data(data_urls, data_md5s,
                                                      "mobilenetv1_fp32")
-
-    def test_calibration(self):
-        self.download_mobilenetv1_model()
-        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
-        self.run_program(self.model_cache_folder + "/model", True, algo='KL')
-        int8_acc1 = self.run_program("calibration_out")
-        delta_value = np.abs(fp32_acc1 - int8_acc1)
-        self.assertLess(delta_value, 0.01)
+        self.model = "MobileNet-V1"
+        self.algo = "KL"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 4f434328e47df4363b304ff55f587018d3157c5e..5be21ff7f7270f6ce950c069f61418c922bcedc5 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -366,17 +366,40 @@ class TruncatedNormalInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
+
+        # to be compatible of fp16 initalizers
+        if var.dtype == VarDesc.VarType.FP16:
+            out_dtype = VarDesc.VarType.FP32
+            out_var = block.create_var(
+                name=unique_name.generate(".".join(
+                    ['truncated_gaussian_random', 'tmp'])),
+                shape=var.shape,
+                dtype=out_dtype,
+                type=VarDesc.VarType.LOD_TENSOR,
+                persistable=False)
+        else:
+            out_dtype = var.dtype
+            out_var = var
+
         op = block._prepend_op(
             type="truncated_gaussian_random",
-            outputs={"Out": var},
+            outputs={"Out": out_var},
             attrs={
                 "shape": var.shape,
-                "dtype": int(var.dtype),
+                "dtype": out_dtype,
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed
             },
             stop_gradient=True)
+
+        if var.dtype == VarDesc.VarType.FP16:
+            block.append_op(
+                type="cast",
+                inputs={"X": out_var},
+                outputs={"Out": var},
+                attrs={"in_dtype": out_var.dtype,
+                       "out_dtype": var.dtype})
         var.op = op
         return op
 
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 9d027ce901b91b31169de3b5468cff8ac9466849..a2abbf36c0267d85c9c97af00c9faabf1187822c 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -22,9 +22,10 @@ import shutil
 import six
 from functools import reduce
 
+from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
-from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
+from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
 from . import core
 
 __all__ = [
@@ -939,6 +940,17 @@ def save_inference_model(dirname,
                                             we save the original program as inference model.",
                 RuntimeWarning)
 
+    # fix the bug that the activation op's output as target will be pruned.
+    # will affect the inference performance.
+    # TODO(Superjomn) add an IR pass to remove 1-scale op.
+    with program_guard(main_program):
+        uniq_target_vars = []
+        for var in target_vars:
+            if isinstance(var, Variable):
+                var1 = layers.scale(var, 1.)
+            uniq_target_vars.append(var1)
+        target_vars = uniq_target_vars
+
     # when a pserver and a trainer running on the same machine, mkdir may conflict
     try:
         os.makedirs(dirname)
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 7cf575d2539ce770f50411048f8ba948809b3c31..c983e2a44b25c5943df5e822e2e363b2557a6ac3 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -49,6 +49,7 @@ __all__ = [
     'box_coder',
     'polygon_box_transform',
     'yolov3_loss',
+    'box_clip',
     'multiclass_nms',
 ]
 
@@ -346,19 +347,107 @@ def box_coder(prior_box,
               target_box,
               code_type="encode_center_size",
               box_normalized=True,
-              name=None):
+              name=None,
+              axis=0):
     """
-    ${comment}
+    **Box Coder Layer**
+
+    Encode/Decode the target bounding box with the priorbox information.
+    
+    The Encoding schema described below:
+
+    .. math::
+
+        ox = (tx - px) / pw / pxv
+
+        oy = (ty - py) / ph / pyv
+
+        ow = \log(\abs(tw / pw)) / pwv 
+
+        oh = \log(\abs(th / ph)) / phv 
+
+    The Decoding schema described below:
+    
+    .. math::
+  
+        ox = (pw * pxv * tx * + px) - tw / 2
+
+        oy = (ph * pyv * ty * + py) - th / 2
+
+        ow = \exp(pwv * tw) * pw + tw / 2
+
+        oh = \exp(phv * th) * ph + th / 2   
+
+    where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, 
+    width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote 
+    the priorbox's (anchor) center coordinates, width and height. `pxv`, 
+    `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, 
+    `ow`, `oh` denote the encoded/decoded coordinates, width and height. 
+
+    During Box Decoding, two modes for broadcast are supported. Say target 
+    box has shape [N, M, 4], and the shape of prior box can be [N, 4] or 
+    [M, 4]. Then prior box will broadcast to target box along the 
+    assigned axis. 
 
     Args:
-        prior_box(${prior_box_type}): ${prior_box_comment}
-        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
-        target_box(${target_box_type}): ${target_box_comment}
-        code_type(${code_type_type}): ${code_type_comment}
-        box_normalized(${box_normalized_type}): ${box_normalized_comment}
+        prior_box(Variable): Box list prior_box is a 2-D Tensor with shape 
+                             [M, 4] holds M boxes, each box is represented as
+                             [xmin, ymin, xmax, ymax], [xmin, ymin] is the 
+                             left top coordinate of the anchor box, if the 
+                             input is image feature map, they are close to 
+                             the origin of the coordinate system. [xmax, ymax]
+                             is the right bottom coordinate of the anchor box.       
+        prior_box_var(Variable|list): prior_box_var supports two types of input. 
+                              One is variable with shape [M, 4] holds M group.
+                              The other one is list consist of 4 elements 
+                              shared by all boxes. 
+        target_box(Variable): This input can be a 2-D LoDTensor with shape 
+                              [N, 4] when code_type is 'encode_center_size'. 
+                              This input also can be a 3-D Tensor with shape 
+                              [N, M, 4] when code_type is 'decode_center_size'. 
+                              Each box is represented as  
+                              [xmin, ymin, xmax, ymax]. This tensor can 
+                              contain LoD information to represent a batch 
+                              of inputs. 
+        code_type(string): The code type used with the target box. It can be
+                           encode_center_size or decode_center_size
+        box_normalized(int): Whether treat the priorbox as a noramlized box.
+                             Set true by default.
+        name(string): The name of box coder.
+        axis(int): Which axis in PriorBox to broadcast for box decode, 
+                   for example, if axis is 0 and TargetBox has shape
+                   [N, M, 4] and PriorBox has shape [M, 4], then PriorBox
+                   will broadcast to [N, M, 4] for decoding. It is only valid
+                   when code type is decode_center_size. Set 0 by default. 
 
     Returns:
-        output_box(${output_box_type}): ${output_box_comment}
+        output_box(Variable): When code_type is 'encode_center_size', the 
+                              output tensor of box_coder_op with shape 
+                              [N, M, 4] representing the result of N target 
+                              boxes encoded with M Prior boxes and variances. 
+                              When code_type is 'decode_center_size', 
+                              N represents the batch size and M represents 
+                              the number of deocded boxes.
+
+    Examples:
+ 
+        .. code-block:: python
+ 
+            prior_box = fluid.layers.data(name='prior_box', 
+                                          shape=[512, 4], 
+                                          dtype='float32',
+                                          append_batch_size=False)
+            target_box = fluid.layers.data(name='target_box',
+                                           shape=[512,81,4],
+                                           dtype='float32',
+                                           append_batch_size=False)
+            output = fluid.layers.box_coder(prior_box=prior_box,
+                                            prior_box_var=[0.1,0.1,0.2,0.2],
+                                            target_box=target_box,
+                                            code_type="decode_center_size",
+                                            box_normalized=False,
+                                            axis=1)
+
     """
     helper = LayerHelper("box_coder", **locals())
 
@@ -369,15 +458,22 @@ def box_coder(prior_box,
         output_box = helper.create_variable(
             name=name, dtype=prior_box.dtype, persistable=False)
 
+    inputs = {"PriorBox": prior_box, "TargetBox": target_box}
+    attrs = {
+        "code_type": code_type,
+        "box_normalized": box_normalized,
+        "axis": axis
+    }
+    if isinstance(prior_box_var, Variable):
+        inputs['PriorBoxVar'] = prior_box_var
+    elif isinstance(prior_box_var, list):
+        attrs['variance'] = prior_box_var
+    else:
+        raise TypeError("Input variance of box_coder must be Variable or lisz")
     helper.append_op(
         type="box_coder",
-        inputs={
-            "PriorBox": prior_box,
-            "PriorBoxVar": prior_box_var,
-            "TargetBox": target_box
-        },
-        attrs={"code_type": code_type,
-               "box_normalized": box_normalized},
+        inputs=inputs,
+        attrs=attrs,
         outputs={"OutputBox": output_box})
     return output_box
 
@@ -413,13 +509,10 @@ def yolov3_loss(x,
                 gtbox,
                 gtlabel,
                 anchors,
+                anchor_mask,
                 class_num,
                 ignore_thresh,
-                loss_weight_xy=None,
-                loss_weight_wh=None,
-                loss_weight_conf_target=None,
-                loss_weight_conf_notarget=None,
-                loss_weight_class=None,
+                downsample_ratio,
                 name=None):
     """
     ${comment}
@@ -431,16 +524,13 @@ def yolov3_loss(x,
                           and x, y, w, h should be relative value of input image.
                           N is the batch number and B is the max box number in 
                           an image.
-        gtlabel (Variable): class id of ground truth boxes, shoud be ins shape
+        gtlabel (Variable): class id of ground truth boxes, shoud be in shape
                             of [N, B].
         anchors (list|tuple): ${anchors_comment}
+        anchor_mask (list|tuple): ${anchor_mask_comment}
         class_num (int): ${class_num_comment}
         ignore_thresh (float): ${ignore_thresh_comment}
-        loss_weight_xy (float|None): ${loss_weight_xy_comment}
-        loss_weight_wh (float|None): ${loss_weight_wh_comment}
-        loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment}
-        loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment}
-        loss_weight_class (float|None): ${loss_weight_class_comment}
+        downsample_ratio (int): ${downsample_ratio_comment}
         name (string): the name of yolov3 loss
 
     Returns:
@@ -460,9 +550,10 @@ def yolov3_loss(x,
         x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
         gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
         gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
-        anchors = [10, 13, 16, 30, 33, 23]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80
-                                        anchors=anchors, ignore_thresh=0.5)
+        anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
+        anchors = [0, 1, 2]
+        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, 
+                                        ignore_thresh=0.5, downsample_ratio=32)
     """
     helper = LayerHelper('yolov3_loss', **locals())
 
@@ -474,6 +565,8 @@ def yolov3_loss(x,
         raise TypeError("Input gtlabel of yolov3_loss must be Variable")
     if not isinstance(anchors, list) and not isinstance(anchors, tuple):
         raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
+    if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
+        raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple")
     if not isinstance(class_num, int):
         raise TypeError("Attr class_num of yolov3_loss must be an integer")
     if not isinstance(ignore_thresh, float):
@@ -486,31 +579,29 @@ def yolov3_loss(x,
         loss = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
 
+    objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
+    gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
+
     attrs = {
         "anchors": anchors,
+        "anchor_mask": anchor_mask,
         "class_num": class_num,
         "ignore_thresh": ignore_thresh,
+        "downsample_ratio": downsample_ratio,
     }
 
-    if loss_weight_xy is not None and isinstance(loss_weight_xy, float):
-        self.attrs['loss_weight_xy'] = loss_weight_xy
-    if loss_weight_wh is not None and isinstance(loss_weight_wh, float):
-        self.attrs['loss_weight_wh'] = loss_weight_wh
-    if loss_weight_conf_target is not None and isinstance(
-            loss_weight_conf_target, float):
-        self.attrs['loss_weight_conf_target'] = loss_weight_conf_target
-    if loss_weight_conf_notarget is not None and isinstance(
-            loss_weight_conf_notarget, float):
-        self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget
-    if loss_weight_class is not None and isinstance(loss_weight_class, float):
-        self.attrs['loss_weight_class'] = loss_weight_class
-
     helper.append_op(
         type='yolov3_loss',
-        inputs={"X": x,
-                "GTBox": gtbox,
-                "GTLabel": gtlabel},
-        outputs={'Loss': loss},
+        inputs={
+            "X": x,
+            "GTBox": gtbox,
+            "GTLabel": gtlabel,
+        },
+        outputs={
+            'Loss': loss,
+            'ObjectnessMask': objectness_mask,
+            'GTMatchMask': gt_match_mask
+        },
         attrs=attrs)
     return loss
 
@@ -1965,6 +2056,54 @@ def generate_proposals(scores,
     return rpn_rois, rpn_roi_probs
 
 
+def box_clip(input, im_info, name=None):
+    """
+    Clip the box into the size given by im_info
+    For each input box, The formula is given as follows:
+        
+    .. code-block:: text
+
+        xmin = max(min(xmin, im_w - 1), 0)
+        ymin = max(min(ymin, im_h - 1), 0) 
+        xmax = max(min(xmax, im_w - 1), 0)
+        ymax = max(min(ymax, im_h - 1), 0)
+    
+    where im_w and im_h are computed from im_info:
+ 
+    .. code-block:: text
+
+        im_h = round(height / scale)
+        im_w = round(weight / scale)
+
+    Args:
+        input(variable): The input box, the last dimension is 4.
+        im_info(variable): The information of image with shape [N, 3] with 
+                            layout (height, width, scale). height and width
+                            is the input size and scale is the ratio of input
+                            size and original size.
+        name (str): The name of this layer. It is optional.
+    
+    Returns:
+        Variable: The cliped tensor variable.
+        
+    Examples:
+        .. code-block:: python
+        
+            boxes = fluid.layers.data(
+                name='data', shape=[8, 4], dtype='float32', lod_level=1)
+            im_info = fluid.layers.data(name='im_info', shape=[3])
+            out = fluid.layers.box_clip(
+                input=boxes, im_info=im_info, inplace=True)
+    """
+
+    helper = LayerHelper("box_clip", **locals())
+    output = helper.create_variable_for_type_inference(dtype=input.dtype)
+    inputs = {"Input": input, "ImInfo": im_info}
+    helper.append_op(type="box_clip", inputs=inputs, outputs={"Output": output})
+
+    return output
+
+
 def multiclass_nms(bboxes,
                    scores,
                    score_threshold,
@@ -2042,9 +2181,11 @@ def multiclass_nms(bboxes,
              (After version 1.3, when no boxes detected, the lod is changed 
              from {0} to {1}) 
 
+
     Examples:
         .. code-block:: python
 
+
             boxes = fluid.layers.data(name='bboxes', shape=[81, 4],
                                       dtype='float32', lod_level=1)
             scores = fluid.layers.data(name='scores', shape=[81],
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index beb5e31211c5f9aa6bddfcb1da7e63d6480e99e1..0e4b5aadc0b0d7e87ea1cfb8e18339fe211e1eef 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -932,7 +932,7 @@ def dynamic_gru(input,
             create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
+            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
             the bias in the update gate, reset gate and candidate calculations.
             If it is set to False, no bias will be applied to the update gate,
             reset gate and candidate calculations. If it is set to None or one
@@ -1073,7 +1073,7 @@ def gru_unit(input,
             create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
+            of GRU.Note that the bias with :math:`(1 \\times 3D)` concatenates
             the bias in the update gate, reset gate and candidate calculations.
             If it is set to False, no bias will be applied to the update gate,
             reset gate and candidate calculations. If it is set to None or one
@@ -3877,7 +3877,8 @@ def beam_search(pre_ids,
                 end_id,
                 level=0,
                 is_accumulated=True,
-                name=None):
+                name=None,
+                return_parent_idx=False):
     """
     Beam search is a classical algorithm for selecting candidate words in a
     machine translation task.
@@ -3933,10 +3934,16 @@ def beam_search(pre_ids,
              accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
+        return_parent_idx(bool): Whether to return an extra Tensor variable 
+                        preserving the selected_ids' parent indice in pre_ids
+                        in output, which can be used to gather cell states at
+                        the next time step.
 
     Returns:
-        Variable: The LodTensor pair containing the selected ids and the \
-            corresponding scores.
+        Variable: The LodTensor tuple containing the selected ids and the \
+            corresponding scores. If :attr:`return_parent_idx` is :attr:`True`, \
+            an extra Tensor variable preserving the selected_ids' parent indice \
+            is included.
 
     Examples:
         .. code-block:: python
@@ -3969,6 +3976,11 @@ def beam_search(pre_ids,
     selected_scores = helper.create_variable_for_type_inference(
         dtype=score_type)
     selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
+    # parent_idx is a tensor used to gather cell states at the next time
+    # step. Though lod in selected_ids can also be used to gather by
+    # sequence_expand, it is not efficient.
+    # gather_op's index input only supports int32 dtype currently
+    parent_idx = helper.create_variable_for_type_inference(dtype="int32")
 
     helper.append_op(
         type='beam_search',
@@ -3976,6 +3988,7 @@ def beam_search(pre_ids,
         outputs={
             'selected_ids': selected_ids,
             'selected_scores': selected_scores,
+            'parent_idx': parent_idx
         },
         attrs={
             # TODO(ChunweiYan) to assure other value support
@@ -3984,8 +3997,10 @@ def beam_search(pre_ids,
             'end_id': end_id,
             'is_accumulated': is_accumulated,
         })
-
-    return selected_ids, selected_scores
+    if return_parent_idx:
+        return selected_ids, selected_scores, parent_idx
+    else:
+        return selected_ids, selected_scores
 
 
 def beam_search_decode(ids, scores, beam_size, end_id, name=None):
@@ -5403,7 +5418,7 @@ def transpose(x, perm, name=None):
     Examples:
         .. code-block:: python
 
-            # use append_batch_size=False to avoid prepending extra
+            # use append_batch_size=False to avoid prepending extra
             # batch size in shape
             x = fluid.layers.data(name='x', shape=[5, 10, 15],
                             dtype='float32', append_batch_size=False)
@@ -5920,7 +5935,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
                                 than :attr:`shape`.
         act (str): The non-linear activation to be applied to the reshaped tensor
                    variable.
-        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
+        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
                        operators. If this flag is set :attr:`True`, reuse input
                        :attr:`x` to reshape, which will change the shape of
                        tensor variable :attr:`x` and might cause errors when
@@ -6581,7 +6596,9 @@ def image_resize(input,
                  scale=None,
                  name=None,
                  resample='BILINEAR',
-                 actual_shape=None):
+                 actual_shape=None,
+                 align_corners=True,
+                 align_mode=1):
     """
     **Resize a Batch of Images**
 
@@ -6594,6 +6611,80 @@ def image_resize(input,
 
         'NEAREST' : Nearest neighbor interpolation
 
+    Nearest neighbor interpolation is to perform nearest neighbor interpolation
+    in both the 3rd dimention(in height direction) and the 4th dimention(in width 
+    direction) on input tensor.
+            
+    Bilinear interpolation is an extension of linear interpolation for 
+    interpolating functions of two variables (e.g. H-direction and 
+    W-direction in this op) on a rectilinear 2D grid. The key idea is 
+    to perform linear interpolation first in one direction, and then 
+    again in the other direction.
+
+    Align_corners and align_mode are optinal parameters,the calculation method 
+    of interpolation can be selected by them.
+
+    Example:
+
+      For scale:
+      
+        if align_corners = True && out_size > 1 :
+
+          scale_factor = (in_size-1.0)/(out_size-1.0)
+        
+        else:
+          
+          scale_factor = float(in_size/out_size)
+        
+      
+      Nearest neighbor interpolation:
+      
+      if:
+          align_corners = False
+
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
+          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+
+      else:
+          align_corners = True
+
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = round(H_{in} * scale_{factor})
+          W_out = round(W_{in} * scale_{factor})
+
+      Bilinear interpolation:
+
+      if:
+          align_corners = False , align_mode = 0
+          
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+          
+          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+      else:
+       
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = H_{in} * scale_{factor}
+          W_out = W_{in} * scale_{factor}
+
+    For details of nearest neighbor interpolation, please refer to Wikipedia: 
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
+
+    For details of bilinear interpolation, please refer to Wikipedia: 
+    https://en.wikipedia.org/wiki/Bilinear_interpolation.
+
+
+
     Args:
         input (Variable): The input tensor of image resize layer,
                           This is a 4-D tensor of the shape
@@ -6623,6 +6714,13 @@ def image_resize(input,
                                 set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
+        align_corners(bool) :  An optional bool, If True, the centers of the 4 corner pixels of the 
+                               input and output tensors are aligned, preserving the values at the 
+                               corner pixels.
+                               Default: True
+        align_mode(int)  :  An optional for bilinear interpolation. can be \'0\' 
+                            for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for 
+                            src_idx = scale*dst_index .
 
     Returns:
         Variable: The output is a 4-D tensor of the shape
@@ -6635,6 +6733,8 @@ def image_resize(input,
                     or 'NEAREST' currently.
         ValueError: One of out_shape and scale must not be None.
         ValueError: out_shape length should be 2.
+        TypeError: align_corners shoule be a bool value
+        ValueError: align_mode can only be '0' or '1'
 
     Examples:
         .. code-block:: python
@@ -6650,6 +6750,12 @@ def image_resize(input,
             "The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
         )
     resample_type = resample_methods[resample]
+
+    if not isinstance(align_corners, bool):
+        raise TypeError("Attr align_corners should be a bool value")
+    if align_mode != 0 and align_mode != 1:
+        raise ValueError("align_mode can only be 0 or 1")
+
     if out_shape is None and scale is None:
         raise ValueError("One of out_shape and scale must not be None.")
     helper = LayerHelper('{}_interp'.format(resample_type), **locals())
@@ -6689,9 +6795,13 @@ def image_resize(input,
         type='{}_interp'.format(resample_type),
         inputs=inputs,
         outputs={"Out": out},
-        attrs={"out_h": out_h,
-               "out_w": out_w,
-               "interp_method": resample_type})
+        attrs={
+            "out_h": out_h,
+            "out_w": out_w,
+            "interp_method": resample_type,
+            "align_corners": align_corners,
+            "align_mode": align_mode
+        })
     return out
 
 
@@ -6700,7 +6810,9 @@ def resize_bilinear(input,
                     out_shape=None,
                     scale=None,
                     name=None,
-                    actual_shape=None):
+                    actual_shape=None,
+                    align_corners=True,
+                    align_mode=1):
     """
     Resize input by performing bilinear interpolation based on given
     output shape which specified by actual_shape, out_shape and scale
@@ -6715,6 +6827,47 @@ def resize_bilinear(input,
     For details of bilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bilinear_interpolation
 
+    Align_corners and align_mode are optinal parameters,the calculation 
+    method of interpolation can be selected by them.
+
+
+    Align_corners and align_mode are optinal parameters,the calculation method 
+    of interpolation can be selected by them.
+
+    Example:
+
+      For scale:
+      
+        if align_corners = True && out_size > 1 :
+
+          scale_factor = (in_size-1.0)/(out_size-1.0)
+        
+        else:
+          
+          scale_factor = float(in_size/out_size)     
+
+    Bilinear interpolation:
+
+      if:
+          align_corners = False , align_mode = 0
+          
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+          
+          H_out = (H_{in}+0.5) * scale_{factor} - 0.5
+          W_out = (W_{in}+0.5) * scale_{factor} - 0.5
+
+
+      else:
+
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = H_{in} * scale_{factor}
+          W_out = W_{in} * scale_{factor}
+
+
+
     Args:
         input(${x_type}): ${x_comment}.
 
@@ -6738,6 +6891,8 @@ def resize_bilinear(input,
                                 set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
+        align_corners(bool): ${align_corners_comment}
+        align_mode(bool): ${align_mode_comment}
 
     Returns:
         ${out_comment}.
@@ -6748,7 +6903,8 @@ def resize_bilinear(input,
             out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
     """
 
-    return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape)
+    return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape,
+                        align_corners, align_mode)
 
 
 @templatedoc(op_type="nearest_interp")
@@ -6756,13 +6912,48 @@ def resize_nearest(input,
                    out_shape=None,
                    scale=None,
                    name=None,
-                   actual_shape=None):
+                   actual_shape=None,
+                   align_corners=True):
     """
     Resize input by performing nearest neighbor interpolation in both the
     3rd dimention(in height direction) and the 4th dimention(in width
     direction) based on given output shape which specified by actual_shape,
     out_shape and scale in priority order.
 
+    Example:
+
+      For scale:
+      
+        if align_corners = True && out_size > 1 :
+
+          scale_factor = (in_size-1.0)/(out_size-1.0)
+        
+        else:
+          
+          scale_factor = float(in_size/out_size)
+        
+      
+      Nearest neighbor interpolation:
+      
+      if:
+          align_corners = False
+
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = \left \lfloor {H_{in} * scale_{}factor}} \right \rfloor
+          W_out = \left \lfloor {W_{in} * scale_{}factor}} \right \rfloor
+
+      else:
+          align_corners = True
+
+          input : (N,C,H_in,W_in)
+          output: (N,C,H_out,W_out) where:
+
+          H_out = round(H_{in} * scale_{factor})
+          W_out = round(W_{in} * scale_{factor})
+
+
     For details of nearest neighbor interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
 
@@ -6789,6 +6980,7 @@ def resize_nearest(input,
                                 set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
+        align_corners(bool): ${align_corners_comment}
 
     Returns:
         ${out_comment}.
@@ -6799,7 +6991,8 @@ def resize_nearest(input,
             out = fluid.layers.resize_nearest(input, out_shape=[12, 12])
     """
 
-    return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape)
+    return image_resize(input, out_shape, scale, name, 'NEAREST', actual_shape,
+                        align_corners)
 
 
 def image_resize_short(input, out_short_len, resample='BILINEAR'):
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 6c18af7283e19bd431c8d543255d900dc89cba09..3dcf9dc06998be9c38a48f18075cbf99f3dccb1a 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -135,7 +135,7 @@ def thresholded_relu(x, threshold=None):
         if val is not None:
             kwargs[name] = val
 
-    _thresholded_relu_(**kwargs)
+    return _thresholded_relu_(**kwargs)
 
 
 thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 8723d9842a94dbfd94cd423eb708a7b9897af985..0d39a139eed87f900b1f59fd0569b6acaec0962b 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -50,6 +50,19 @@ class TestDetection(unittest.TestCase):
             self.assertEqual(out.shape[-1], 6)
         print(str(program))
 
+    def test_box_coder_api(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[4], dtype='float32')
+            y = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
+            bcoder = layers.box_coder(
+                prior_box=x,
+                prior_box_var=[0.1, 0.2, 0.1, 0.2],
+                target_box=y,
+                code_type='encode_center_size')
+            self.assertIsNotNone(bcoder)
+        print(str(program))
+
     def test_detection_api(self):
         program = Program()
         with program_guard(program):
@@ -463,12 +476,23 @@ class TestYoloDetection(unittest.TestCase):
             x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
             gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
             gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
-            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10,
-                                      0.5)
+            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13],
+                                      [0, 1], 10, 0.7, 32)
 
             self.assertIsNotNone(loss)
 
 
+class TestBoxClip(unittest.TestCase):
+    def test_box_clip(self):
+        program = Program()
+        with program_guard(program):
+            input_box = layers.data(
+                name='input_box', shape=[7, 4], dtype='float32', lod_level=1)
+            im_info = layers.data(name='im_info', shape=[3], dtype='float32')
+            out = layers.box_clip(input_box, im_info)
+            self.assertIsNotNone(out)
+
+
 class TestMulticlassNMS(unittest.TestCase):
     def test_multiclass_nms(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 7e693c6a41f71f11fd702e2cfc26aa4a21cd2de7..699181d01da862dca72113e6c11630ae5693e41c 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,15 +1,6 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# The MKLDNN tests are skiped when the MKLDNN flag is OFF
-if(NOT WITH_MKLDNN)
-    foreach(src ${TEST_OPS})
-        if(${src} MATCHES ".*_mkldnn_op$")
-            list(REMOVE_ITEM TEST_OPS ${src})
-        endif()
-    endforeach()
-endif(NOT WITH_MKLDNN)
-
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
     list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
@@ -123,3 +114,7 @@ endif()
 if (WITH_NGRAPH)
     add_subdirectory(ngraph)
 endif()
+
+if (WITH_MKLDNN)
+    add_subdirectory(mkldnn)
+endif()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/__init__.py b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
similarity index 94%
rename from python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
index 611d0dd076b827b0f528f2e3a31182cc4939d1f1..ad94a4b21c347c9a2782437948c20d3b3071c679 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py
@@ -17,9 +17,9 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 from scipy.special import expit
-from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
 
 
 class TestMKLDNNReluDim2(TestRelu):
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
similarity index 92%
rename from python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
index 1286cee8dc1855c1b1695da46ae0b5222c065114..5fce90372d9beda9b04ab68d0a8ac5ef5c124421 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py
@@ -19,9 +19,9 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.framework import grad_var_name
-from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
+from paddle.fluid.tests.unittests.test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
 
 
 class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
similarity index 94%
rename from python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
index 0f2130f9049c7ee294444282e59c654551f76603..1a399740692eab8ccea0c984a1a4f2ac984eb045 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
+from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
 
 
 class TestMKLDNNConcatOp(TestConcatOp):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 5ad376cb08e488e85be6369a91d4e81031e9e9db..100a03cea0f740a615c4a08810d4ad9e8c974d7a 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 import paddle.fluid.core as core
-from op_test import OpTest
-from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
 
 
 def conv2d_forward_refer(input, filter, group, conv_param):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
similarity index 91%
rename from python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
index 438d45b84033b697c3210acc44392b93bf436df0..0542eef80070cbf281ee013c28b7092a2dd17eaa 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
 
 
 class TestMKLDNN(TestConv2dOp):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
similarity index 94%
rename from python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
index deefdd09abe6b9f9ca362654f21850f598337245..9bcdb7b2a975b648471714ab628caf91b6b6f3a9 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+from paddle.fluid.tests.unittests.test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
 
 
 class TestMKLDNN(TestConv2dTransposeOp):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
similarity index 91%
rename from python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
index f0e1265e142b800587599783367eca2203033bf1..080b74502fbe83e97e88a65866e0d9b66b37033e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
+from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
 
 
 class TestMKLDNN(TestConv3dOp):
diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
index 0c5e1abd7c8fb010357998c0ceaebaf21619fda9..9a54f927cbde648bbbb06d043bbc1391ee43c314 100644
--- a/python/paddle/fluid/tests/unittests/test_dequantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 
 
 class TestDeQuantizeOp(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
index d85cc1f856df8eaa73cef318b48a292042488edf..c3a42656b71d09dbc22abf8ce2ddc243b43b422f 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -16,8 +16,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid.core as core
-from op_test import OpTest
-from test_elementwise_add_op import *
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_elementwise_add_op import *
 '''
 Some tests differ from the tests defined in test_elementwise_add_op.py
 because MKLDNN does not support tensors of number of dimensions 3.
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
index 536e9a1c58ec4a8b1b5a7c1d3a5fe737b38d24ab..738715dd70181988028adff1c50be3a52199c312 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
@@ -15,10 +15,10 @@
 from __future__ import print_function
 import unittest
 import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-from test_elementwise_mul_op import *
+from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
 
 
 class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
index 45951a34d6f61a242cb2dc004d6801a6c1c9dd92..84229a5cffbb466ef3c69cd997adacfb21f6aae2 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 
 
 def fully_connected_naive(input, weights, bias_data=None):
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
similarity index 90%
rename from python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
index 9777ec390656d3f6166bf9f5de7bbad8b6bd786d..c18bd77bd3e6de08283f3ac3a31c73453f3c9129 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_gaussian_random_op import TestGaussianRandomOp
+from paddle.fluid.tests.unittests.test_gaussian_random_op import TestGaussianRandomOp
 
 
 class TestMKLDNN(TestGaussianRandomOp):
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
index f6bb2ab7a696c40cb61dd5b38ca702b577fe7ea2..a5e6e116a5f1bc1e051ce3cfdac8cd1e5f3ed90e 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-from test_lrn_op import TestLRNOp
+from paddle.fluid.tests.unittests.test_lrn_op import TestLRNOp
 
 
 class TestLRNMKLDNNOp(TestLRNOp):
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
similarity index 94%
rename from python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
index f4495d0bc8198189962d033ec18b8b67f1f47c84..fca906fecc5fe8d25b9251c886398f8df778043f 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 import paddle.fluid.core as core
-from op_test import OpTest
-from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
 
 
 class TestPool2dMKLDNNInt8_Op(TestPool2D_Op):
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
similarity index 90%
rename from python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
index 7de5fefc148021d4109da2ac9f4b36c93a05a23f..6de43dd46e5d184ec934f2d85e0c87137e9702e0 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
 
 
 def create_test_mkldnn_class(parent):
diff --git a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
index 99607928648be437b7f944f86a0c28b99d1775c4..132f7bd039f7797fb0fc332d6f7b8c242af46535 100644
--- a/python/paddle/fluid/tests/unittests/test_quantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest
 
 
 class TestQuantizeOp(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
similarity index 92%
rename from python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
index 55820f31b81df9f3618d1004f6d21565564efa29..5928047b5171bcf33b024040ce79577b8aa0b53a 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_sum_op import TestSumOp
+from paddle.fluid.tests.unittests.test_sum_op import TestSumOp
 
 
 class TestMKLDNN(TestSumOp):
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
similarity index 95%
rename from python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
rename to python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
index 0c201b9e4f48df94924a248d820ae2cf73367560..4845eefe367f1ad6a2eb6ffd1f9b0598b1b4fbbd 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_transpose_op import TestTransposeOp
+from paddle.fluid.tests.unittests.test_transpose_op import TestTransposeOp
 
 
 class TestTransposeMKLDNN(TestTransposeOp):
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5424e8a6e615820b4a1a5f2ee7e7e87dd0b22af
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_conv2d_op import *
+
+
+class TestNGRAPH(TestConv2dOp):
+    def init_kernel_type(self):
+        super(TestNGRAPH, self).init_kernel_type()
+
+
+class TestNGRAPHWithPad(TestWithPad):
+    def init_kernel_type(self):
+        super(TestNGRAPHWithPad, self).init_kernel_type()
+
+
+class TestNGRAPHWithStride(TestWithStride):
+    def init_kernel_type(self):
+        super(TestNGRAPHWithStride, self).init_kernel_type()
+
+
+class TestNGRAPHWithGroup(TestWithGroup):
+    def init_kernel_type(self):
+        super(TestNGRAPHWithGroup, self).init_kernel_type()
+
+
+class TestNGRAPHWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        super(TestNGRAPHWith1x1, self).init_kernel_type()
+
+
+class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..95e592e8ec036ad231ed57ddbc706683cb7aa153
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from paddle.fluid.tests.unittests.test_pool2d_op import *
+
+
+class TestNGRAPHPool2D_Op(TestPool2D_Op):
+    def init_test_case(self):
+        super(TestNGRAPHPool2D_Op, self).init_test_case()
+
+
+class TestNGRAPHCase1(TestCase1):
+    def init_test_case(self):
+        super(TestNGRAPHCase1, self).init_test_case()
+
+
+class TestNGRAPHCase2(TestCase2):
+    def init_test_case(self):
+        super(TestNGRAPHCase2, self).init_test_case()
+
+
+class TestNGRAPHCase3(TestCase3):
+    def init_pool_type(self):
+        super(TestNGRAPHCase3, self).init_pool_type()
+
+
+class TestNGRAPHCase4(TestCase4):
+    def init_pool_type(self):
+        super(TestNGRAPHCase4, self).init_pool_type()
+
+
+class TestNGRAPHCase5(TestCase5):
+    def init_pool_type(self):
+        super(TestNGRAPHCase5, self).init_pool_type()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index c28dda4b53ce5d394ff11222e5df8d257b4e80da..1d9f4b78f30fefa21c189036c3731e0afe39ea9e 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -38,6 +38,7 @@ class BeamSearchOpTester(unittest.TestCase):
         self._create_pre_ids()
         self.scope.var('selected_ids')
         self.scope.var('selected_scores')
+        self.scope.var('parent_idx')
 
     def test_run(self):
         op = Operator(
@@ -48,12 +49,14 @@ class BeamSearchOpTester(unittest.TestCase):
             scores='scores',
             selected_ids='selected_ids',
             selected_scores='selected_scores',
+            parent_idx='parent_idx',
             level=0,
             beam_size=2,
             end_id=0, )
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         selected_scores = self.scope.find_var("selected_scores").get_tensor()
+        parent_idx = self.scope.find_var("parent_idx").get_tensor()
         self.assertTrue(
             np.allclose(
                 np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
@@ -62,6 +65,8 @@ class BeamSearchOpTester(unittest.TestCase):
                 np.array(selected_scores),
                 np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
         self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
+        self.assertTrue(
+            np.allclose(np.array(parent_idx), np.array([0, 1, 2, 3])))
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index c8a7063dc1cd3e5cc7cd3458b51f5e74981aa75c..f60ed1d79ae5778f751d6101fde386ae3a90c0f7 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -20,7 +20,13 @@ from op_test import OpTest
 import paddle.fluid.core as core
 
 
-def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
+def bilinear_interp_np(input,
+                       out_h,
+                       out_w,
+                       out_size=None,
+                       actual_shape=None,
+                       align_corners=True,
+                       align_mode=0):
     """bilinear interpolation implement in shape [N, C, H, W]"""
     if out_size is not None:
         out_h = out_size[0]
@@ -29,25 +35,45 @@ def bilinear_interp_np(input, out_h, out_w, out_size=None, actual_shape=None):
         out_h = actual_shape[0]
         out_w = actual_shape[1]
     batch_size, channel, in_h, in_w = input.shape
+
+    ratio_h = ratio_w = 0.0
     if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    else:
-        ratio_h = 0.0
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
     if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
-    else:
-        ratio_w = 0.0
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
 
     out = np.zeros((batch_size, channel, out_h, out_w))
+
     for i in range(out_h):
-        h = int(ratio_h * i)
+        if (align_mode == 0 and not align_corners):
+            h = int(ratio_h * (i + 0.5) - 0.5)
+        else:
+            h = int(ratio_h * i)
+
+        h = max(0, h)
         hid = 1 if h < in_h - 1 else 0
-        h1lambda = ratio_h * i - h
+        if (align_mode == 0 and not align_corners):
+            h1lambda = ratio_h * (i + 0.5) - 0.5 - h
+        else:
+            h1lambda = ratio_h * i - h
         h2lambda = 1.0 - h1lambda
         for j in range(out_w):
-            w = int(ratio_w * j)
+            if (align_mode == 0 and not align_corners):
+                w = int(ratio_w * (j + 0.5) - 0.5)
+            else:
+                w = int(ratio_w * j)
+            w = max(0, w)
             wid = 1 if w < in_w - 1 else 0
-            w1lambda = ratio_w * j - w
+            if (align_mode == 0 and not align_corners):
+                w1lambda = ratio_w * (j + 0.5) - 0.5 - w
+            else:
+                w1lambda = ratio_w * j - w
             w2lambda = 1.0 - w1lambda
 
             out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
@@ -66,7 +92,8 @@ class TestBilinearInterpOp(OpTest):
         input_np = np.random.random(self.input_shape).astype("float32")
 
         output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape)
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -75,7 +102,9 @@ class TestBilinearInterpOp(OpTest):
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
         }
         self.outputs = {'Out': output_np}
 
@@ -91,6 +120,8 @@ class TestBilinearInterpOp(OpTest):
         self.out_h = 2
         self.out_w = 2
         self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase1(TestBilinearInterpOp):
@@ -99,6 +130,8 @@ class TestBilinearInterpCase1(TestBilinearInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase2(TestBilinearInterpOp):
@@ -107,6 +140,8 @@ class TestBilinearInterpCase2(TestBilinearInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase3(TestBilinearInterpOp):
@@ -115,6 +150,8 @@ class TestBilinearInterpCase3(TestBilinearInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase4(TestBilinearInterpOp):
@@ -124,6 +161,8 @@ class TestBilinearInterpCase4(TestBilinearInterpOp):
         self.out_h = 1
         self.out_w = 1
         self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase5(TestBilinearInterpOp):
@@ -133,6 +172,8 @@ class TestBilinearInterpCase5(TestBilinearInterpOp):
         self.out_h = 12
         self.out_w = 12
         self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase6(TestBilinearInterpOp):
@@ -142,6 +183,8 @@ class TestBilinearInterpCase6(TestBilinearInterpOp):
         self.out_h = 64
         self.out_w = 128
         self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpActualShape(TestBilinearInterpOp):
@@ -151,6 +194,8 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp):
         self.out_h = 64
         self.out_w = 32
         self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpOpUint8(OpTest):
@@ -162,14 +207,17 @@ class TestBilinearInterpOpUint8(OpTest):
         input_np = np.random.randint(
             low=0, high=256, size=self.input_shape).astype("uint8")
         output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape)
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode
         }
         self.outputs = {'Out': output_np}
 
@@ -181,6 +229,8 @@ class TestBilinearInterpOpUint8(OpTest):
         self.input_shape = [1, 3, 9, 6]
         self.out_h = 10
         self.out_w = 9
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
@@ -189,6 +239,8 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
         self.input_shape = [2, 3, 128, 64]
         self.out_h = 120
         self.out_w = 50
+        self.align_corners = True
+        self.align_mode = 1
 
 
 class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
@@ -198,6 +250,26 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
         self.out_h = 5
         self.out_w = 13
         self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b0598f31dd27e12e5ce329129129b5e0f1caf0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+import copy
+
+
+def box_clip(input_box, im_info, output_box):
+    im_w = round(im_info[1] / im_info[2])
+    im_h = round(im_info[0] / im_info[2])
+    output_box[:, :, 0] = np.maximum(
+        np.minimum(input_box[:, :, 0], im_w - 1), 0)
+    output_box[:, :, 1] = np.maximum(
+        np.minimum(input_box[:, :, 1], im_h - 1), 0)
+    output_box[:, :, 2] = np.maximum(
+        np.minimum(input_box[:, :, 2], im_w - 1), 0)
+    output_box[:, :, 3] = np.maximum(
+        np.minimum(input_box[:, :, 3], im_h - 1), 0)
+
+
+def batch_box_clip(input_boxes, im_info, lod):
+    n = input_boxes.shape[0]
+    m = input_boxes.shape[1]
+    output_boxes = np.zeros((n, m, 4), dtype=np.float32)
+    cur_offset = 0
+    for i in range(len(lod)):
+        box_clip(input_boxes[cur_offset:(cur_offset + lod[i]), :, :],
+                 im_info[i, :],
+                 output_boxes[cur_offset:(cur_offset + lod[i]), :, :])
+        cur_offset += lod[i]
+    return output_boxes
+
+
+class TestBoxClipOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_clip"
+        lod = [[1, 2, 3]]
+        input_boxes = np.random.random((6, 10, 4)) * 5
+        im_info = np.array([[5, 8, 1.], [6, 6, 1.], [7, 5, 1.]])
+        output_boxes = batch_box_clip(input_boxes, im_info, lod[0])
+
+        self.inputs = {
+            'Input': (input_boxes.astype('float32'), lod),
+            'ImInfo': im_info.astype('float32'),
+        }
+        self.outputs = {'Output': output_boxes}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 2511c5c22e012babdeb71a71d3546456ea2ceaf3..6156268bf25ada310a3d22242ecff4b9cdf1759a 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -21,80 +21,80 @@ import math
 from op_test import OpTest
 
 
-def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
-              box_normalized):
-    prior_box_x = (
-        (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
-    prior_box_y = (
-        (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
-    prior_box_width = (
-        (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
-    prior_box_height = (
-        (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
-    prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
-                                          prior_box_var.shape[1])
-    if not box_normalized:
-        prior_box_height = prior_box_height + 1
-        prior_box_width = prior_box_width + 1
-
-    if (code_type == "EncodeCenterSize"):
-        target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
-            target_box.shape[0], 1)
-        target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
-            target_box.shape[0], 1)
-        target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
-            target_box.shape[0], 1)
-        target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
-            target_box.shape[0], 1)
-        if not box_normalized:
-            target_box_height = target_box_height + 1
-            target_box_width = target_box_width + 1
-
-        output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
-                prior_box_var[:,:,0]
-        output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
-                prior_box_var[:,:,1]
-        output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
-                prior_box_var[:,:,2]
-        output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
-                prior_box_var[:,:,3]
-
-    elif (code_type == "DecodeCenterSize"):
-        target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
-                       prior_box_width + prior_box_x
-        target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
-                       prior_box_height + prior_box_y
-        target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
-                           prior_box_width
-        target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
-                            prior_box_height
-
-        output_box[:, :, 0] = target_box_x - target_box_width / 2
-        output_box[:, :, 1] = target_box_y - target_box_height / 2
-        output_box[:, :, 2] = target_box_x + target_box_width / 2
-        output_box[:, :, 3] = target_box_y + target_box_height / 2
-        if not box_normalized:
-            output_box[:, :, 2] = output_box[:, :, 2] - 1
-            output_box[:, :, 3] = output_box[:, :, 3] - 1
-
-
-def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
-                    box_normalized):
-    n = target_box.shape[0]
-    m = prior_box.shape[0]
+def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
+    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_x = pb_w * 0.5 + p_box[:, 0]
+    pb_y = pb_h * 0.5 + p_box[:, 1]
+    shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1)
+
+    pb_w = pb_w.reshape(shape)
+    pb_h = pb_h.reshape(shape)
+    pb_x = pb_x.reshape(shape)
+    pb_y = pb_y.reshape(shape)
+
+    if pb_v.ndim == 2:
+        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
+    if pb_v.ndim == 1:
+        tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x
+        tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y
+        tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w
+        tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h
+    else:
+        tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x
+        tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y
+        tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w
+        tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h
+    output_box[:, :, 0] = tb_x - tb_w / 2
+    output_box[:, :, 1] = tb_y - tb_h / 2
+    output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm)
+    output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm)
+
+
+def box_encoder(t_box, p_box, pb_v, output_box, norm):
+    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_x = pb_w * 0.5 + p_box[:, 0]
+    pb_y = pb_h * 0.5 + p_box[:, 1]
+    shape = (1, p_box.shape[0])
+
+    pb_w = pb_w.reshape(shape)
+    pb_h = pb_h.reshape(shape)
+    pb_x = pb_x.reshape(shape)
+    pb_y = pb_y.reshape(shape)
+
+    if pb_v.ndim == 2:
+        pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1])
+    tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1)
+    tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1)
+    tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm)
+    tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm)
+    if pb_v.ndim == 1:
+        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0]
+        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1]
+        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2]
+        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3]
+    else:
+        output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0]
+        output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1]
+        output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2]
+        output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3]
+
+
+def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0):
+    n = t_box.shape[0]
+    m = p_box.shape[0]
+    if code_type == "DecodeCenterSize":
+        m = t_box.shape[1]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
     cur_offset = 0
     for i in range(len(lod)):
         if (code_type == "EncodeCenterSize"):
-            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :],
-                      prior_box, prior_box_var,
-                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      code_type, box_normalized)
+            box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v,
+                        output_box[cur_offset:(cur_offset + lod[i]), :, :],
+                        norm)
         elif (code_type == "DecodeCenterSize"):
-            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      prior_box, prior_box_var,
-                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
-                      code_type, box_normalized)
+            box_decoder(t_box, p_box, pb_v, output_box, norm, axis)
         cur_offset += lod[i]
     return output_box
 
@@ -106,9 +106,35 @@ class TestBoxCoderOp(OpTest):
     def setUp(self):
         self.op_type = "box_coder"
         lod = [[1, 1, 1, 1, 1]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.random.random((10, 4)).astype('float32')
-        target_box = np.random.random((5, 10, 4)).astype('float32')
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.random.random((81, 4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized)
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithOneRankVar(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
         code_type = "DecodeCenterSize"
         box_normalized = False
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -133,9 +159,9 @@ class TestBoxCoderOpWithoutBoxVar(OpTest):
     def setUp(self):
         self.op_type = "box_coder"
         lod = [[0, 1, 2, 3, 4, 5]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.ones((10, 4)).astype('float32')
-        target_box = np.random.random((5, 10, 4)).astype('float32')
+        prior_box = np.random.random((81, 4)).astype('float32')
+        prior_box_var = np.ones((81, 4)).astype('float32')
+        target_box = np.random.random((20, 81, 4)).astype('float32')
         code_type = "DecodeCenterSize"
         box_normalized = False
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -158,10 +184,10 @@ class TestBoxCoderOpWithLoD(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[4, 8, 8]]
-        prior_box = np.random.random((10, 4)).astype('float32')
-        prior_box_var = np.random.random((10, 4)).astype('float32')
-        target_box = np.random.random((20, 4)).astype('float32')
+        lod = [[10, 20, 20]]
+        prior_box = np.random.random((20, 4)).astype('float32')
+        prior_box_var = np.random.random((20, 4)).astype('float32')
+        target_box = np.random.random((50, 4)).astype('float32')
         code_type = "EncodeCenterSize"
         box_normalized = True
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
@@ -176,5 +202,63 @@ class TestBoxCoderOpWithLoD(OpTest):
         self.outputs = {'OutputBox': output_box}
 
 
+class TestBoxCoderOpWithAxis(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((30, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((30, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        axis = 1
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized, axis)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False,
+            'axis': axis
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithVariance(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[1, 1, 1, 1, 1]]
+        prior_box = np.random.random((30, 4)).astype('float32')
+        prior_box_var = np.random.random((4)).astype('float32')
+        target_box = np.random.random((30, 81, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        axis = 1
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized, axis)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False,
+            'variance': prior_box_var.astype(np.float).flatten(),
+            'axis': axis
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 5877e91f92e642e69265104c6728cd9bd41c41cd..afe990e74ff96dfbca4f335b561f9bbe7d295246 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -268,9 +268,6 @@ class TestImperativePtbRnn(unittest.TestCase):
                 sgd.minimize(dy_loss)
                 for param in ptb_model.parameters():
                     dy_param_updated[param.name] = param._numpy()
-                # print("dy_loss is {}".format(dy_loss._numpy()))
-                # print("last_hidden is {}".format(last_hidden._numpy()))
-                # print("last_cell is {}".format(last_cell._numpy()))
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index def73d7072c8d0c95f5196f4ecf90f2174234ba7..9c9f86330704466c7a8801af6ab0fb2bba23f931 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -83,7 +83,8 @@ class TestBook(unittest.TestCase):
 
         self.assertEqual(feed_var_names, ["x", "y"])
         self.assertEqual(len(fetch_vars), 1)
-        self.assertEqual(str(fetch_vars[0]), str(avg_cost))
+        print("fetch %s" % str(fetch_vars[0]))
+        self.assertTrue("scale" in str(fetch_vars[0]))
         self.assertEqual(expected, actual)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
index 242709425f2d3f190d3c1ed795d30938fb8e23fe..5bb2260ef7a143670dd75fc88769603d1437173d 100644
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
@@ -24,7 +24,8 @@ def nearest_neighbor_interp_np(X,
                                out_h,
                                out_w,
                                out_size=None,
-                               actual_shape=None):
+                               actual_shape=None,
+                               align_corners=True):
     """nearest neighbor interpolation implement in shape [N, C, H, W]"""
     if out_size is not None:
         out_h = out_size[0]
@@ -35,17 +36,31 @@ def nearest_neighbor_interp_np(X,
     n, c, in_h, in_w = X.shape
 
     ratio_h = ratio_w = 0.0
-    if out_h > 1:
-        ratio_h = (in_h - 1.0) / (out_h - 1.0)
-    if out_w > 1:
-        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    if (out_h > 1):
+        if (align_corners):
+            ratio_h = (in_h - 1.0) / (out_h - 1.0)
+        else:
+            ratio_h = 1.0 * in_h / out_h
+    if (out_w > 1):
+        if (align_corners):
+            ratio_w = (in_w - 1.0) / (out_w - 1.0)
+        else:
+            ratio_w = 1.0 * in_w / out_w
 
     out = np.zeros((n, c, out_h, out_w))
-    for i in range(out_h):
-        in_i = int(ratio_h * i + 0.5)
-        for j in range(out_w):
-            in_j = int(ratio_w * j + 0.5)
-            out[:, :, i, j] = X[:, :, in_i, in_j]
+
+    if align_corners:
+        for i in range(out_h):
+            in_i = int(ratio_h * i + 0.5)
+            for j in range(out_w):
+                in_j = int(ratio_w * j + 0.5)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
+    else:
+        for i in range(out_h):
+            in_i = int(ratio_h * i)
+            for j in range(out_w):
+                in_j = int(ratio_w * j)
+                out[:, :, i, j] = X[:, :, in_i, in_j]
 
     return out.astype(X.dtype)
 
@@ -59,7 +74,8 @@ class TestNearestInterpOp(OpTest):
         input_np = np.random.random(self.input_shape).astype("float32")
 
         output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
-                                               self.out_size, self.actual_shape)
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
@@ -68,7 +84,8 @@ class TestNearestInterpOp(OpTest):
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
         }
         self.outputs = {'Out': output_np}
 
@@ -84,6 +101,7 @@ class TestNearestInterpOp(OpTest):
         self.out_h = 2
         self.out_w = 2
         self.out_size = np.array([3, 3]).astype("int32")
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
@@ -92,6 +110,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase2(TestNearestInterpOp):
@@ -100,6 +119,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase3(TestNearestInterpOp):
@@ -108,6 +128,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase4(TestNearestInterpOp):
@@ -117,6 +138,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp):
         self.out_h = 1
         self.out_w = 1
         self.out_size = np.array([2, 2]).astype("int32")
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase5(TestNearestInterpOp):
@@ -126,6 +148,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp):
         self.out_h = 12
         self.out_w = 12
         self.out_size = np.array([11, 11]).astype("int32")
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase6(TestNearestInterpOp):
@@ -135,6 +158,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp):
         self.out_h = 64
         self.out_w = 128
         self.out_size = np.array([65, 129]).astype("int32")
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
@@ -144,6 +168,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
         self.out_h = 64
         self.out_w = 32
         self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
 
 
 class TestNearestInterpOpUint8(OpTest):
@@ -155,14 +180,16 @@ class TestNearestInterpOpUint8(OpTest):
         input_np = np.random.randint(
             low=0, high=256, size=self.input_shape).astype("uint8")
         output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
-                                               self.out_size, self.actual_shape)
+                                               self.out_size, self.actual_shape,
+                                               self.align_corners)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
-            'interp_method': self.interp_method
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners
         }
         self.outputs = {'Out': output_np}
 
@@ -174,6 +201,7 @@ class TestNearestInterpOpUint8(OpTest):
         self.input_shape = [1, 3, 9, 6]
         self.out_h = 10
         self.out_w = 9
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
@@ -182,6 +210,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
         self.input_shape = [2, 3, 128, 64]
         self.out_h = 120
         self.out_w = 50
+        self.align_corners = True
 
 
 class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
@@ -191,6 +220,12 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
         self.out_h = 5
         self.out_w = 13
         self.out_size = np.array([6, 15]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+    def set_align_corners(self):
+        self.align_corners = False
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index 544fe4b4f81909b69a05d9751316e3d3137fdc45..020c1139230a9177c4d7765367359d91839d7d46 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -16,174 +16,179 @@ from __future__ import division
 
 import unittest
 import numpy as np
+from scipy.special import logit
+from scipy.special import expit
 from op_test import OpTest
 
 from paddle.fluid import core
 
 
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-1.0 * x))
+def l2loss(x, y):
+    return 0.5 * (y - x) * (y - x)
 
 
-def mse(x, y, num):
-    return ((y - x)**2).sum() / num
+def sce(x, label):
+    sigmoid_x = expit(x)
+    term1 = label * np.log(sigmoid_x)
+    term2 = (1.0 - label) * np.log(1.0 - sigmoid_x)
+    return -term1 - term2
 
 
-def bce(x, y, mask):
-    x = x.reshape((-1))
-    y = y.reshape((-1))
-    mask = mask.reshape((-1))
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
 
-    error_sum = 0.0
-    count = 0
-    for i in range(x.shape[0]):
-        if mask[i] > 0:
-            error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i])
-            count += 1
-    return error_sum / (-1.0 * count)
 
+def batch_xywh_box_iou(box1, box2):
+    b1_left = box1[:, :, 0] - box1[:, :, 2] / 2
+    b1_right = box1[:, :, 0] + box1[:, :, 2] / 2
+    b1_top = box1[:, :, 1] - box1[:, :, 3] / 2
+    b1_bottom = box1[:, :, 1] + box1[:, :, 3] / 2
 
-def box_iou(box1, box2):
-    b1_x1 = box1[0] - box1[2] / 2
-    b1_x2 = box1[0] + box1[2] / 2
-    b1_y1 = box1[1] - box1[3] / 2
-    b1_y2 = box1[1] + box1[3] / 2
-    b2_x1 = box2[0] - box2[2] / 2
-    b2_x2 = box2[0] + box2[2] / 2
-    b2_y1 = box2[1] - box2[3] / 2
-    b2_y2 = box2[1] + box2[3] / 2
+    b2_left = box2[:, :, 0] - box2[:, :, 2] / 2
+    b2_right = box2[:, :, 0] + box2[:, :, 2] / 2
+    b2_top = box2[:, :, 1] - box2[:, :, 3] / 2
+    b2_bottom = box2[:, :, 1] + box2[:, :, 3] / 2
 
-    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
-    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+    left = np.maximum(b1_left[:, :, np.newaxis], b2_left[:, np.newaxis, :])
+    right = np.minimum(b1_right[:, :, np.newaxis], b2_right[:, np.newaxis, :])
+    top = np.maximum(b1_top[:, :, np.newaxis], b2_top[:, np.newaxis, :])
+    bottom = np.minimum(b1_bottom[:, :, np.newaxis],
+                        b2_bottom[:, np.newaxis, :])
 
-    inter_rect_x1 = max(b1_x1, b2_x1)
-    inter_rect_y1 = max(b1_y1, b2_y1)
-    inter_rect_x2 = min(b1_x2, b2_x2)
-    inter_rect_y2 = min(b1_y2, b2_y2)
-    inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max(
-        inter_rect_y2 - inter_rect_y1, 0)
+    inter_w = np.clip(right - left, 0., 1.)
+    inter_h = np.clip(bottom - top, 0., 1.)
+    inter_area = inter_w * inter_h
 
-    return inter_area / (b1_area + b2_area + inter_area)
+    b1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
+    b2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
+    union = b1_area[:, :, np.newaxis] + b2_area[:, np.newaxis, :] - inter_area
 
+    return inter_area / union
 
-def build_target(gtboxs, gtlabel, attrs, grid_size):
-    n, b, _ = gtboxs.shape
-    ignore_thresh = attrs["ignore_thresh"]
-    anchors = attrs["anchors"]
-    class_num = attrs["class_num"]
-    an_num = len(anchors) // 2
-    obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32')
-    tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
-    tcls = np.zeros(
-        (n, an_num, grid_size, grid_size, class_num)).astype('float32')
 
+def YOLOv3Loss(x, gtbox, gtlabel, attrs):
+    n, c, h, w = x.shape
+    b = gtbox.shape[1]
+    anchors = attrs['anchors']
+    an_num = len(anchors) // 2
+    anchor_mask = attrs['anchor_mask']
+    mask_num = len(anchor_mask)
+    class_num = attrs["class_num"]
+    ignore_thresh = attrs['ignore_thresh']
+    downsample = attrs['downsample']
+    input_size = downsample * h
+    x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+    loss = np.zeros((n)).astype('float32')
+
+    pred_box = x[:, :, :, :, :4].copy()
+    grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
+    grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
+    pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
+    pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
+
+    x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
+                                 np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
+                                 class_num)
+
+    mask_anchors = []
+    for m in anchor_mask:
+        mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in mask_anchors])
+    anchor_w = anchors_s[:, 0:1].reshape((1, mask_num, 1, 1))
+    anchor_h = anchors_s[:, 1:2].reshape((1, mask_num, 1, 1))
+    pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w
+    pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h
+
+    pred_box = pred_box.reshape((n, -1, 4))
+    pred_obj = x[:, :, :, :, 4].reshape((n, -1))
+    objness = np.zeros(pred_box.shape[:2]).astype('float32')
+    ious = batch_xywh_box_iou(pred_box, gtbox)
+    ious_max = np.max(ious, axis=-1)
+    objness = np.where(ious_max > ignore_thresh, -np.ones_like(objness),
+                       objness)
+
+    gtbox_shift = gtbox.copy()
+    gtbox_shift[:, :, 0] = 0
+    gtbox_shift[:, :, 1] = 0
+
+    anchors = [(anchors[2 * i], anchors[2 * i + 1]) for i in range(0, an_num)]
+    anchors_s = np.array(
+        [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors])
+    anchor_boxes = np.concatenate(
+        [np.zeros_like(anchors_s), anchors_s], axis=-1)
+    anchor_boxes = np.tile(anchor_boxes[np.newaxis, :, :], (n, 1, 1))
+    ious = batch_xywh_box_iou(gtbox_shift, anchor_boxes)
+    iou_matches = np.argmax(ious, axis=-1)
+    gt_matches = iou_matches.copy()
     for i in range(n):
         for j in range(b):
-            if gtboxs[i, j, :].sum() == 0:
+            if gtbox[i, j, 2:].sum() == 0:
+                gt_matches[i, j] = -1
                 continue
+            if iou_matches[i, j] not in anchor_mask:
+                gt_matches[i, j] = -1
+                continue
+            an_idx = anchor_mask.index(iou_matches[i, j])
+            gt_matches[i, j] = an_idx
+            gi = int(gtbox[i, j, 0] * w)
+            gj = int(gtbox[i, j, 1] * h)
 
-            gt_label = gtlabel[i, j]
-            gx = gtboxs[i, j, 0] * grid_size
-            gy = gtboxs[i, j, 1] * grid_size
-            gw = gtboxs[i, j, 2] * grid_size
-            gh = gtboxs[i, j, 3] * grid_size
-
-            gi = int(gx)
-            gj = int(gy)
-
-            gtbox = [0, 0, gw, gh]
-            max_iou = 0
-            for k in range(an_num):
-                anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]]
-                iou = box_iou(gtbox, anchor_box)
-                if iou > max_iou:
-                    max_iou = iou
-                    best_an_index = k
-                if iou > ignore_thresh:
-                    noobj_mask[i, best_an_index, gj, gi] = 0
-
-            obj_mask[i, best_an_index, gj, gi] = 1
-            noobj_mask[i, best_an_index, gj, gi] = 0
-            tx[i, best_an_index, gj, gi] = gx - gi
-            ty[i, best_an_index, gj, gi] = gy - gj
-            tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 *
-                                                               best_an_index])
-            th[i, best_an_index, gj, gi] = np.log(
-                gh / anchors[2 * best_an_index + 1])
-            tconf[i, best_an_index, gj, gi] = 1
-            tcls[i, best_an_index, gj, gi, gt_label] = 1
-
-    return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask)
-
-
-def YoloV3Loss(x, gtbox, gtlabel, attrs):
-    n, c, h, w = x.shape
-    an_num = len(attrs['anchors']) // 2
-    class_num = attrs["class_num"]
-    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
-    pred_x = sigmoid(x[:, :, :, :, 0])
-    pred_y = sigmoid(x[:, :, :, :, 1])
-    pred_w = x[:, :, :, :, 2]
-    pred_h = x[:, :, :, :, 3]
-    pred_conf = sigmoid(x[:, :, :, :, 4])
-    pred_cls = sigmoid(x[:, :, :, :, 5:])
-
-    tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target(
-        gtbox, gtlabel, attrs, x.shape[2])
-
-    obj_mask_expand = np.tile(
-        np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num'])))
-    loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum())
-    loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum())
-    loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum())
-    loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum())
-    loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask)
-    loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask,
-                             noobj_mask)
-    loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand,
-                     obj_mask_expand)
-
-    return attrs['loss_weight_xy'] * (loss_x + loss_y) \
-            + attrs['loss_weight_wh'] * (loss_w + loss_h) \
-            + attrs['loss_weight_conf_target'] * loss_conf_target \
-            + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \
-            + attrs['loss_weight_class'] * loss_class
+            tx = gtbox[i, j, 0] * w - gi
+            ty = gtbox[i, j, 1] * w - gj
+            tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
+            th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
+            scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3])
+            loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
+            loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale
+            loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale
+
+            objness[i, an_idx * h * w + gj * w + gi] = 1.0
+
+            for label_idx in range(class_num):
+                loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx],
+                               float(label_idx == gtlabel[i, j]))
+
+        for j in range(mask_num * h * w):
+            if objness[i, j] > 0:
+                loss[i] += sce(pred_obj[i, j], 1.0)
+            elif objness[i, j] == 0:
+                loss[i] += sce(pred_obj[i, j], 0.0)
+
+    return (loss, objness.reshape((n, mask_num, h, w)).astype('float32'), \
+            gt_matches.astype('int32'))
 
 
 class TestYolov3LossOp(OpTest):
     def setUp(self):
-        self.loss_weight_xy = 1.0
-        self.loss_weight_wh = 1.0
-        self.loss_weight_conf_target = 1.0
-        self.loss_weight_conf_notarget = 1.0
-        self.loss_weight_class = 1.0
         self.initTestCase()
         self.op_type = 'yolov3_loss'
-        x = np.random.random(size=self.x_shape).astype('float32')
+        x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32'))
         gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
-        gtlabel = np.random.randint(0, self.class_num,
-                                    self.gtbox_shape[:2]).astype('int32')
+        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
+        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
+        gtbox = gtbox * gtmask[:, :, np.newaxis]
+        gtlabel = gtlabel * gtmask
 
         self.attrs = {
             "anchors": self.anchors,
+            "anchor_mask": self.anchor_mask,
             "class_num": self.class_num,
             "ignore_thresh": self.ignore_thresh,
-            "loss_weight_xy": self.loss_weight_xy,
-            "loss_weight_wh": self.loss_weight_wh,
-            "loss_weight_conf_target": self.loss_weight_conf_target,
-            "loss_weight_conf_notarget": self.loss_weight_conf_notarget,
-            "loss_weight_class": self.loss_weight_class,
+            "downsample": self.downsample,
         }
 
-        self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel}
+        self.inputs = {
+            'X': x,
+            'GTBox': gtbox.astype('float32'),
+            'GTLabel': gtlabel.astype('int32'),
+        }
+        loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
         self.outputs = {
-            'Loss': np.array(
-                [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32')
+            'Loss': loss,
+            'ObjectnessMask': objness,
+            "GTMatchMask": gt_matches
         }
 
     def test_check_output(self):
@@ -196,19 +201,16 @@ class TestYolov3LossOp(OpTest):
             place, ['X'],
             'Loss',
             no_grad_set=set(["GTBox", "GTLabel"]),
-            max_relative_error=0.06)
+            max_relative_error=0.3)
 
     def initTestCase(self):
-        self.anchors = [10, 13, 12, 12]
-        self.class_num = 10
+        self.anchors = [10, 13, 16, 30, 33, 23]
+        self.anchor_mask = [1, 2]
+        self.class_num = 5
         self.ignore_thresh = 0.5
-        self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7)
-        self.gtbox_shape = (5, 10, 4)
-        self.loss_weight_xy = 2.5
-        self.loss_weight_wh = 0.8
-        self.loss_weight_conf_target = 1.5
-        self.loss_weight_conf_notarget = 0.5
-        self.loss_weight_class = 1.2
+        self.downsample = 32
+        self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
+        self.gtbox_shape = (3, 5, 4)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index f33c05ed2f48c2498b98fc486d6ff7471088d77e..82d0d336e523ec48c5ceca3b92ff0963c4499123 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -17,3 +17,4 @@ from __future__ import print_function
 from .program_utils import *
 from .ufind import *
 from .checkport import *
+from .vars_distributed import *
diff --git a/python/paddle/fluid/transpiler/details/vars_distributed.py b/python/paddle/fluid/transpiler/details/vars_distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..05e7f6e3e706376efc8af870a780d96c45642514
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/vars_distributed.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+from paddle.fluid.framework import Variable
+
+
+class VarStruct(object):
+    """
+    record part properties of a Variable in python.
+    """
+
+    def __init__(self, name, shape, dtype, type, lod_level, persistable):
+        self.name = name
+        self.shape = shape
+        self.dtype = dtype
+        self.type = type
+        self.lod_level = lod_level
+        self.persistable = persistable
+
+
+class VarDistributed(object):
+    """
+    a class to record the var distributed on parameter servers.
+    the class will record the relationship between origin var and slice var.
+    the slice var's properties, such as type/shape/offset/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
+            block_id(int|None): the number about the slice var.
+            offset(int|None): if the slice var is sliced, offset is the numel before the var.
+            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
+            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        the two var is equal or not.
+        Returns:
+            bool: equal will return True else False
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def __str__(self):
+        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
+            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
+                   shape=self.origin.shape, dtype=self.origin.dtype)
+
+        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
+                        ".slice({is_slice}).block({block_id}).offset({offset})". \
+            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
+                   shape=self.slice.shape, dtype=self.slice.dtype,
+                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    a gather about VarDistributed with many methods to find distributed vars.
+    through the class, we can get overview about the distributed parameters on parameter servers.
+    this class may centralized and convenient for developer to manage and get variable's distribute.
+    other module can also use this to find variables such io.py.
+    """
+
+    def __init__(self):
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        add distributed var in this.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
+            block_id(int|None): the number about the slice var.
+            offset(int|None): if the slice var is sliced, offset is the numel before the var.
+            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
+            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
+        Returns:
+            None
+        """
+        self.distributed_vars.append(
+            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
+                           vtype, endpoint))
+
+    def get_distributed_var_by_slice(self, var_name):
+        """
+        get distributed var by conditions.
+
+        Args:
+            var_name(str): slice var name, such as "w.traier0.block1"
+        Returns:
+            VarDistributed: distributed var.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.slice.name == var_name:
+                return dist_var
+        return None
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        the two var is equal or not.
+        Returns:
+            bool: equal will return True else False
+        """
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
+        """
+        get distributed var by conditions.
+
+        Args:
+            origin_var_name(str):
+            endpoint(str): the parameter endpoint, such as "127.0.0.1:1001"
+        Returns:
+            VarDistributed: distributed var.
+        """
+        for dist_var in self.distributed_vars:
+            if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
+                return dist_var
+        return None
+
+    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
+        """
+        get distributed vars by conditions.
+
+        Args:
+            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
+            groupby(bool|False): group by origin var or not.
+
+        Returns:
+            list: distributed var list.
+            dict: distributed var map when groupby=True
+        """
+        vtype_vars = []
+        for var in self.distributed_vars:
+            if var.vtype in vtypes:
+                vtype_vars.append(var)
+        if not groupby:
+            return vtype_vars
+
+        params_map = {}
+        for var in vtype_vars:
+            origin_var_name = var.origin.name
+
+            if origin_var_name in params_map.keys():
+                optimizers = params_map.get(origin_var_name)
+            else:
+                optimizers = []
+            optimizers.append(var)
+            params_map[origin_var_name] = optimizers
+        return params_map
+
+    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
+        """
+        get distributed vars by conditions.
+
+        Args:
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
+            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
+
+        Returns:
+            list: distributed var list.
+        """
+        endpoint_vars = []
+        for var in self.distributed_vars:
+            if var.endpoint == endpoint:
+                endpoint_vars.append(var)
+        if not vtype:
+            return endpoint_vars
+
+        vtype_vars = []
+        for var in endpoint_vars:
+            if var.vtype == vtype:
+                vtype_vars.append(var)
+        return vtype_vars
+
+    def overview(self):
+        """
+        get the overview string about all params on all parameter servers.
+
+        Returns:
+            Str: overview string.
+
+        """
+        vars_str = []
+        for var in self.distributed_vars:
+            vars_str.append(str(var))
+        return "\n".join(vars_str)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index e58f34e3750803669149685003ea5858fa775ed7..a3293afbbd7cef8470c808e98ae88a05f2e492f4 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -30,19 +30,23 @@ Steps to transpile pserver:
 5. add listen_and_serv op
 """
 
+import sys
 import math
-import numpy as np
+from functools import reduce
+
 import collections
+import six
 import logging
 
+import numpy as np
+
 from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
-    default_startup_program, Block, \
-    Parameter, Variable, grad_var_name
-from .details import *
+    default_startup_program, Block, Parameter, grad_var_name
+from .details import wait_server_ready, UnionFind, VarStruct, VarsDistributed
+from .details import delete_ops, find_op_by_output_arg
 from ..distribute_lookup_table import find_distributed_lookup_table
-from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
@@ -62,260 +66,6 @@ def log(*args):
         print(args)
 
 
-class VarStruct(object):
-    """
-    record part properties of a Variable in python.
-    """
-
-    def __init__(self, name, shape, dtype, type, lod_level, persistable):
-        self.name = name
-        self.shape = shape
-        self.dtype = dtype
-        self.type = type
-        self.lod_level = lod_level
-        self.persistable = persistable
-
-
-class VarDistributed(object):
-    """
-    a class to record the var distributed on parameter servers.
-    the class will record the relationship between origin var and slice var.
-    the slice var's properties, such as type/shape/offset/endpoint.
-    """
-
-    def __init__(self,
-                 origin_var,
-                 slice_var,
-                 is_slice=None,
-                 block_id=None,
-                 offset=None,
-                 vtype=None,
-                 endpoint=None):
-        """
-        Args:
-            origin_var(Variable|VarStruct): origin var properties
-            slice_var(Variable|VarStruct): slice var properties
-            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
-            block_id(int|None): the number about the slice var.
-            offset(int|None): if the slice var is sliced, offset is the numel before the var.
-            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
-            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
-        """
-
-        if isinstance(origin_var, Variable):
-            self.origin = self.__create_var_struct(origin_var)
-        else:
-            self.origin = origin_var
-
-        if isinstance(slice_var, Variable):
-            self.slice = self.__create_var_struct(slice_var)
-        else:
-            self.slice = slice_var
-
-        if self.equal(self.origin, self.slice):
-            self.is_slice = False
-            self.block_id = 0
-            self.offset = 0
-        else:
-            self.is_slice = True
-            self.block_id = 0
-            self.offset = 0
-
-        if is_slice is not None:
-            self.is_slice = is_slice
-        if block_id is not None:
-            self.block_id = block_id
-        if offset is not None:
-            self.offset = offset
-
-        self.vtype = vtype
-        self.endpoint = endpoint
-
-    @staticmethod
-    def __create_var_struct(var):
-        return VarStruct(var.name, var.shape, var.dtype, var.type,
-                         var.lod_level, var.persistable)
-
-    @staticmethod
-    def equal(var1, var2):
-        """
-        the two var is equal or not.
-        Returns:
-            bool: equal will return True else False
-        """
-        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
-
-        return var1.name == var2.name and \
-               var1.type == var2.type and \
-               var1.shape == var2.shape and \
-               var1.dtype == var2.dtype and \
-               var1.lod_level == var2.lod_level and \
-               var1.persistable == var2.persistable
-
-    def __str__(self):
-        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
-            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
-                   shape=self.origin.shape, dtype=self.origin.dtype)
-
-        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
-                        ".slice({is_slice}).block({block_id}).offset({offset})". \
-            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
-                   shape=self.slice.shape, dtype=self.slice.dtype,
-                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
-
-        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
-            self.vtype, origin_var_str, slice_var_str, self.endpoint)
-
-
-class VarsDistributed(object):
-    """
-    a gather about VarDistributed with many methods to find distributed vars.
-    through the class, we can get overview about the distributed parameters on parameter servers.
-    this class may centralized and convenient for developer to manage and get variable's distribute.
-    other module can also use this to find variables such io.py.
-    """
-
-    def __init__(self):
-        self.distributed_vars = []
-
-    def add_distributed_var(self,
-                            origin_var,
-                            slice_var,
-                            is_slice=None,
-                            block_id=None,
-                            offset=None,
-                            vtype=None,
-                            endpoint=None):
-        """
-        add distributed var in this.
-
-        Args:
-            origin_var(Variable|VarStruct): origin var properties
-            slice_var(Variable|VarStruct): slice var properties
-            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
-            block_id(int|None): the number about the slice var.
-            offset(int|None): if the slice var is sliced, offset is the numel before the var.
-            vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch.
-            endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001"
-        Returns:
-            None
-        """
-        self.distributed_vars.append(
-            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
-                           vtype, endpoint))
-
-    def get_distributed_var_by_slice(self, var_name):
-        """
-        get distributed var by conditions.
-
-        Args:
-            var_name(str): slice var name, such as "w.traier0.block1"
-        Returns:
-            VarDistributed: distributed var.
-        """
-        for dist_var in self.distributed_vars:
-            if dist_var.slice.name == var_name:
-                return dist_var
-        return None
-
-    @staticmethod
-    def equal(var1, var2):
-        """
-        the two var is equal or not.
-        Returns:
-            bool: equal will return True else False
-        """
-        return var1.name == var2.name and \
-               var1.type == var2.type and \
-               var1.shape == var2.shape and \
-               var1.dtype == var2.dtype and \
-               var1.lod_level == var2.lod_level and \
-               var1.persistable == var2.persistable
-
-    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
-        """
-        get distributed var by conditions.
-
-        Args:
-            origin_var_name(str):
-            endpoint(str): the parameter endpoint, such as "127.0.0.1:1001"
-        Returns:
-            VarDistributed: distributed var.
-        """
-        for dist_var in self.distributed_vars:
-            if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint:
-                return dist_var
-        return None
-
-    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
-        """
-        get distributed vars by conditions.
-
-        Args:
-            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
-            groupby(bool|False): group by origin var or not.
-
-        Returns:
-            list: distributed var list.
-            dict: distributed var map when groupby=True
-        """
-        vtype_vars = []
-        for var in self.distributed_vars:
-            if var.vtype in vtypes:
-                vtype_vars.append(var)
-        if not groupby:
-            return vtype_vars
-
-        params_map = {}
-        for var in vtype_vars:
-            origin_var_name = var.origin.name
-
-            if origin_var_name in params_map.keys():
-                optimizers = params_map.get(origin_var_name)
-            else:
-                optimizers = []
-            optimizers.append(var)
-            params_map[origin_var_name] = optimizers
-        return params_map
-
-    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
-        """
-        get distributed vars by conditions.
-
-        Args:
-            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
-            vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch"
-
-        Returns:
-            list: distributed var list.
-        """
-        endpoint_vars = []
-        for var in self.distributed_vars:
-            if var.endpoint == endpoint:
-                endpoint_vars.append(var)
-        if not vtype:
-            return endpoint_vars
-
-        vtype_vars = []
-        for var in endpoint_vars:
-            if var.vtype == vtype:
-                vtype_vars.append(var)
-        return vtype_vars
-
-    def overview(self):
-        """
-        get the overview string about all params on all parameter servers.
-
-        Returns:
-            Str: overview string.
-
-        """
-        vars_str = []
-        for var in self.distributed_vars:
-            vars_str.append(str(var))
-        return "\n".join(vars_str)
-
-
 class VarBlock:
     def __init__(self, varname, offset, size):
         self.varname = varname
diff --git a/python/setup.py.in b/python/setup.py.in
index c947785cbf7517be56c3e43120db65284ab22d10..f93f0cd130e33311bade2b15726c3eff37546214 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -109,6 +109,7 @@ packages=['paddle',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.int8_inference',
           'paddle.fluid.contrib.reader',
           'paddle.fluid.contrib.slim',
           'paddle.fluid.contrib.slim.core',